diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,48013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 34265, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00014592149423610097, + "grad_norm": 3.1947744515847005, + "learning_rate": 1.458576429404901e-07, + "loss": 0.9761, + "step": 5 + }, + { + "epoch": 0.00029184298847220193, + "grad_norm": 2.780818532134856, + "learning_rate": 2.917152858809802e-07, + "loss": 0.9579, + "step": 10 + }, + { + "epoch": 0.00043776448270830295, + "grad_norm": 2.8959010685272393, + "learning_rate": 4.3757292882147023e-07, + "loss": 0.9036, + "step": 15 + }, + { + "epoch": 0.0005836859769444039, + "grad_norm": 2.5371985504370937, + "learning_rate": 5.834305717619604e-07, + "loss": 0.9564, + "step": 20 + }, + { + "epoch": 0.0007296074711805049, + "grad_norm": 2.605626399144433, + "learning_rate": 7.292882147024504e-07, + "loss": 0.87, + "step": 25 + }, + { + "epoch": 0.0008755289654166059, + "grad_norm": 2.3600499623595086, + "learning_rate": 8.751458576429405e-07, + "loss": 0.9499, + "step": 30 + }, + { + "epoch": 0.0010214504596527069, + "grad_norm": 2.3297311105558225, + "learning_rate": 1.0210035005834306e-06, + "loss": 0.9012, + "step": 35 + }, + { + "epoch": 0.0011673719538888077, + "grad_norm": 2.1913988052814823, + "learning_rate": 1.1668611435239208e-06, + "loss": 0.9282, + "step": 40 + }, + { + "epoch": 0.0013132934481249088, + "grad_norm": 2.105605630758833, + "learning_rate": 1.3127187864644107e-06, + "loss": 0.8002, + "step": 45 + }, + { + "epoch": 0.0014592149423610099, + "grad_norm": 1.8307704603554018, + "learning_rate": 1.4585764294049009e-06, + "loss": 0.864, + "step": 50 + }, + { + "epoch": 0.0016051364365971107, + "grad_norm": 1.7749968650938042, + "learning_rate": 1.604434072345391e-06, + "loss": 0.8337, + "step": 55 + }, + { + "epoch": 0.0017510579308332118, + "grad_norm": 1.8264857625369404, + "learning_rate": 1.750291715285881e-06, + "loss": 0.8928, + "step": 60 + }, + { + "epoch": 0.0018969794250693127, + "grad_norm": 1.697528237991233, + "learning_rate": 1.8961493582263713e-06, + "loss": 0.794, + "step": 65 + }, + { + "epoch": 0.0020429009193054137, + "grad_norm": 1.5659385030646167, + "learning_rate": 2.0420070011668612e-06, + "loss": 0.879, + "step": 70 + }, + { + "epoch": 0.002188822413541515, + "grad_norm": 1.7961674617899717, + "learning_rate": 2.187864644107351e-06, + "loss": 0.8283, + "step": 75 + }, + { + "epoch": 0.0023347439077776155, + "grad_norm": 1.6507596906396012, + "learning_rate": 2.3337222870478415e-06, + "loss": 0.8682, + "step": 80 + }, + { + "epoch": 0.0024806654020137165, + "grad_norm": 1.4535579925158708, + "learning_rate": 2.4795799299883315e-06, + "loss": 0.7529, + "step": 85 + }, + { + "epoch": 0.0026265868962498176, + "grad_norm": 1.698106722774744, + "learning_rate": 2.6254375729288214e-06, + "loss": 0.8147, + "step": 90 + }, + { + "epoch": 0.0027725083904859187, + "grad_norm": 1.4680260129152574, + "learning_rate": 2.7712952158693118e-06, + "loss": 0.7276, + "step": 95 + }, + { + "epoch": 0.0029184298847220198, + "grad_norm": 1.542895003410177, + "learning_rate": 2.9171528588098017e-06, + "loss": 0.7615, + "step": 100 + }, + { + "epoch": 0.0030643513789581204, + "grad_norm": 1.429813215058147, + "learning_rate": 3.0630105017502916e-06, + "loss": 0.7247, + "step": 105 + }, + { + "epoch": 0.0032102728731942215, + "grad_norm": 1.5130072783727644, + "learning_rate": 3.208868144690782e-06, + "loss": 0.7495, + "step": 110 + }, + { + "epoch": 0.0033561943674303226, + "grad_norm": 1.569844141360473, + "learning_rate": 3.3547257876312724e-06, + "loss": 0.765, + "step": 115 + }, + { + "epoch": 0.0035021158616664236, + "grad_norm": 1.6175884841912778, + "learning_rate": 3.500583430571762e-06, + "loss": 0.7472, + "step": 120 + }, + { + "epoch": 0.0036480373559025243, + "grad_norm": 1.6904569479496365, + "learning_rate": 3.6464410735122522e-06, + "loss": 0.7803, + "step": 125 + }, + { + "epoch": 0.0037939588501386253, + "grad_norm": 1.6251739015186617, + "learning_rate": 3.7922987164527426e-06, + "loss": 0.7604, + "step": 130 + }, + { + "epoch": 0.003939880344374726, + "grad_norm": 1.5427918519690202, + "learning_rate": 3.938156359393232e-06, + "loss": 0.7573, + "step": 135 + }, + { + "epoch": 0.0040858018386108275, + "grad_norm": 1.5576076657615734, + "learning_rate": 4.0840140023337225e-06, + "loss": 0.7623, + "step": 140 + }, + { + "epoch": 0.004231723332846929, + "grad_norm": 1.3686901744336726, + "learning_rate": 4.229871645274212e-06, + "loss": 0.7286, + "step": 145 + }, + { + "epoch": 0.00437764482708303, + "grad_norm": 1.4216493984338547, + "learning_rate": 4.375729288214702e-06, + "loss": 0.7901, + "step": 150 + }, + { + "epoch": 0.004523566321319131, + "grad_norm": 1.6045464118077426, + "learning_rate": 4.521586931155193e-06, + "loss": 0.7569, + "step": 155 + }, + { + "epoch": 0.004669487815555231, + "grad_norm": 1.8627989370501201, + "learning_rate": 4.667444574095683e-06, + "loss": 0.7453, + "step": 160 + }, + { + "epoch": 0.004815409309791332, + "grad_norm": 1.4743706942055885, + "learning_rate": 4.8133022170361734e-06, + "loss": 0.7583, + "step": 165 + }, + { + "epoch": 0.004961330804027433, + "grad_norm": 1.4542212784423725, + "learning_rate": 4.959159859976663e-06, + "loss": 0.7329, + "step": 170 + }, + { + "epoch": 0.005107252298263534, + "grad_norm": 1.712902676622777, + "learning_rate": 5.105017502917153e-06, + "loss": 0.7729, + "step": 175 + }, + { + "epoch": 0.005253173792499635, + "grad_norm": 1.4755335061451906, + "learning_rate": 5.250875145857643e-06, + "loss": 0.7765, + "step": 180 + }, + { + "epoch": 0.005399095286735736, + "grad_norm": 1.5994272580423068, + "learning_rate": 5.396732788798133e-06, + "loss": 0.7251, + "step": 185 + }, + { + "epoch": 0.005545016780971837, + "grad_norm": 1.6354259469081376, + "learning_rate": 5.5425904317386235e-06, + "loss": 0.7597, + "step": 190 + }, + { + "epoch": 0.0056909382752079385, + "grad_norm": 1.7814990205632233, + "learning_rate": 5.688448074679113e-06, + "loss": 0.7613, + "step": 195 + }, + { + "epoch": 0.0058368597694440395, + "grad_norm": 1.7555434578362554, + "learning_rate": 5.834305717619603e-06, + "loss": 0.7148, + "step": 200 + }, + { + "epoch": 0.00598278126368014, + "grad_norm": 1.6267363052677177, + "learning_rate": 5.980163360560093e-06, + "loss": 0.7144, + "step": 205 + }, + { + "epoch": 0.006128702757916241, + "grad_norm": 1.4669150349400875, + "learning_rate": 6.126021003500583e-06, + "loss": 0.7067, + "step": 210 + }, + { + "epoch": 0.006274624252152342, + "grad_norm": 1.411713064440319, + "learning_rate": 6.271878646441074e-06, + "loss": 0.7083, + "step": 215 + }, + { + "epoch": 0.006420545746388443, + "grad_norm": 1.607398390369769, + "learning_rate": 6.417736289381564e-06, + "loss": 0.7628, + "step": 220 + }, + { + "epoch": 0.006566467240624544, + "grad_norm": 1.6395778660498548, + "learning_rate": 6.563593932322054e-06, + "loss": 0.7746, + "step": 225 + }, + { + "epoch": 0.006712388734860645, + "grad_norm": 1.6890901541211365, + "learning_rate": 6.709451575262545e-06, + "loss": 0.7222, + "step": 230 + }, + { + "epoch": 0.006858310229096746, + "grad_norm": 1.5759953543797294, + "learning_rate": 6.855309218203035e-06, + "loss": 0.7078, + "step": 235 + }, + { + "epoch": 0.007004231723332847, + "grad_norm": 1.5849245570015338, + "learning_rate": 7.001166861143524e-06, + "loss": 0.6711, + "step": 240 + }, + { + "epoch": 0.007150153217568948, + "grad_norm": 1.5803687806173186, + "learning_rate": 7.147024504084014e-06, + "loss": 0.7133, + "step": 245 + }, + { + "epoch": 0.0072960747118050485, + "grad_norm": 1.7032674577448692, + "learning_rate": 7.2928821470245045e-06, + "loss": 0.686, + "step": 250 + }, + { + "epoch": 0.00744199620604115, + "grad_norm": 1.8734431421404527, + "learning_rate": 7.438739789964995e-06, + "loss": 0.7024, + "step": 255 + }, + { + "epoch": 0.007587917700277251, + "grad_norm": 1.8678774528018942, + "learning_rate": 7.584597432905485e-06, + "loss": 0.7022, + "step": 260 + }, + { + "epoch": 0.007733839194513352, + "grad_norm": 1.8130034609400518, + "learning_rate": 7.730455075845975e-06, + "loss": 0.7036, + "step": 265 + }, + { + "epoch": 0.007879760688749453, + "grad_norm": 1.5904331838236196, + "learning_rate": 7.876312718786464e-06, + "loss": 0.7502, + "step": 270 + }, + { + "epoch": 0.008025682182985553, + "grad_norm": 1.530190434975237, + "learning_rate": 8.022170361726955e-06, + "loss": 0.7003, + "step": 275 + }, + { + "epoch": 0.008171603677221655, + "grad_norm": 1.4468723229371472, + "learning_rate": 8.168028004667445e-06, + "loss": 0.6538, + "step": 280 + }, + { + "epoch": 0.008317525171457755, + "grad_norm": 1.5013623893566552, + "learning_rate": 8.313885647607934e-06, + "loss": 0.7231, + "step": 285 + }, + { + "epoch": 0.008463446665693857, + "grad_norm": 1.439329760933021, + "learning_rate": 8.459743290548424e-06, + "loss": 0.7081, + "step": 290 + }, + { + "epoch": 0.008609368159929957, + "grad_norm": 1.5440021877487, + "learning_rate": 8.605600933488915e-06, + "loss": 0.6921, + "step": 295 + }, + { + "epoch": 0.00875528965416606, + "grad_norm": 1.6291279310781581, + "learning_rate": 8.751458576429405e-06, + "loss": 0.694, + "step": 300 + }, + { + "epoch": 0.00890121114840216, + "grad_norm": 1.6431941602450315, + "learning_rate": 8.897316219369896e-06, + "loss": 0.7175, + "step": 305 + }, + { + "epoch": 0.009047132642638261, + "grad_norm": 1.7240714682583684, + "learning_rate": 9.043173862310385e-06, + "loss": 0.7037, + "step": 310 + }, + { + "epoch": 0.009193054136874362, + "grad_norm": 1.5076816635550303, + "learning_rate": 9.189031505250875e-06, + "loss": 0.6908, + "step": 315 + }, + { + "epoch": 0.009338975631110462, + "grad_norm": 1.6421334821435518, + "learning_rate": 9.334889148191366e-06, + "loss": 0.6826, + "step": 320 + }, + { + "epoch": 0.009484897125346564, + "grad_norm": 1.567201180354376, + "learning_rate": 9.480746791131856e-06, + "loss": 0.763, + "step": 325 + }, + { + "epoch": 0.009630818619582664, + "grad_norm": 1.5537223777920866, + "learning_rate": 9.626604434072347e-06, + "loss": 0.6731, + "step": 330 + }, + { + "epoch": 0.009776740113818766, + "grad_norm": 1.5693711908918668, + "learning_rate": 9.772462077012835e-06, + "loss": 0.71, + "step": 335 + }, + { + "epoch": 0.009922661608054866, + "grad_norm": 1.500621710642294, + "learning_rate": 9.918319719953326e-06, + "loss": 0.6813, + "step": 340 + }, + { + "epoch": 0.010068583102290968, + "grad_norm": 1.51244614882415, + "learning_rate": 1.0064177362893815e-05, + "loss": 0.696, + "step": 345 + }, + { + "epoch": 0.010214504596527068, + "grad_norm": 1.6592392474503885, + "learning_rate": 1.0210035005834307e-05, + "loss": 0.723, + "step": 350 + }, + { + "epoch": 0.01036042609076317, + "grad_norm": 1.5357571274774384, + "learning_rate": 1.0355892648774796e-05, + "loss": 0.7266, + "step": 355 + }, + { + "epoch": 0.01050634758499927, + "grad_norm": 1.7075409579058634, + "learning_rate": 1.0501750291715286e-05, + "loss": 0.7177, + "step": 360 + }, + { + "epoch": 0.01065226907923537, + "grad_norm": 1.6769841817579385, + "learning_rate": 1.0647607934655777e-05, + "loss": 0.6869, + "step": 365 + }, + { + "epoch": 0.010798190573471473, + "grad_norm": 1.635129245404418, + "learning_rate": 1.0793465577596266e-05, + "loss": 0.6174, + "step": 370 + }, + { + "epoch": 0.010944112067707573, + "grad_norm": 1.5277961671404743, + "learning_rate": 1.0939323220536758e-05, + "loss": 0.6973, + "step": 375 + }, + { + "epoch": 0.011090033561943675, + "grad_norm": 1.6672105268978017, + "learning_rate": 1.1085180863477247e-05, + "loss": 0.6885, + "step": 380 + }, + { + "epoch": 0.011235955056179775, + "grad_norm": 1.5484556976557604, + "learning_rate": 1.1231038506417737e-05, + "loss": 0.6266, + "step": 385 + }, + { + "epoch": 0.011381876550415877, + "grad_norm": 1.563661138118462, + "learning_rate": 1.1376896149358226e-05, + "loss": 0.6736, + "step": 390 + }, + { + "epoch": 0.011527798044651977, + "grad_norm": 1.3888219387699416, + "learning_rate": 1.1522753792298717e-05, + "loss": 0.6552, + "step": 395 + }, + { + "epoch": 0.011673719538888079, + "grad_norm": 1.4148867756265207, + "learning_rate": 1.1668611435239207e-05, + "loss": 0.6459, + "step": 400 + }, + { + "epoch": 0.01181964103312418, + "grad_norm": 1.4990728642252718, + "learning_rate": 1.1814469078179698e-05, + "loss": 0.7266, + "step": 405 + }, + { + "epoch": 0.01196556252736028, + "grad_norm": 1.5586964890227837, + "learning_rate": 1.1960326721120186e-05, + "loss": 0.6796, + "step": 410 + }, + { + "epoch": 0.012111484021596381, + "grad_norm": 1.4423410905481469, + "learning_rate": 1.2106184364060677e-05, + "loss": 0.6557, + "step": 415 + }, + { + "epoch": 0.012257405515832482, + "grad_norm": 1.5511893531096423, + "learning_rate": 1.2252042007001167e-05, + "loss": 0.6625, + "step": 420 + }, + { + "epoch": 0.012403327010068584, + "grad_norm": 1.5902547361644932, + "learning_rate": 1.2397899649941658e-05, + "loss": 0.7426, + "step": 425 + }, + { + "epoch": 0.012549248504304684, + "grad_norm": 1.5111039547721734, + "learning_rate": 1.2543757292882147e-05, + "loss": 0.6831, + "step": 430 + }, + { + "epoch": 0.012695169998540786, + "grad_norm": 1.591632736563798, + "learning_rate": 1.2689614935822638e-05, + "loss": 0.7043, + "step": 435 + }, + { + "epoch": 0.012841091492776886, + "grad_norm": 1.6357362500630288, + "learning_rate": 1.2835472578763128e-05, + "loss": 0.7664, + "step": 440 + }, + { + "epoch": 0.012987012987012988, + "grad_norm": 1.5945529648895056, + "learning_rate": 1.298133022170362e-05, + "loss": 0.6463, + "step": 445 + }, + { + "epoch": 0.013132934481249088, + "grad_norm": 1.609309211131705, + "learning_rate": 1.3127187864644109e-05, + "loss": 0.6818, + "step": 450 + }, + { + "epoch": 0.013278855975485188, + "grad_norm": 1.5404955167609322, + "learning_rate": 1.3273045507584597e-05, + "loss": 0.6427, + "step": 455 + }, + { + "epoch": 0.01342477746972129, + "grad_norm": 1.5181450307496702, + "learning_rate": 1.341890315052509e-05, + "loss": 0.7149, + "step": 460 + }, + { + "epoch": 0.01357069896395739, + "grad_norm": 1.839697213371066, + "learning_rate": 1.3564760793465577e-05, + "loss": 0.7382, + "step": 465 + }, + { + "epoch": 0.013716620458193492, + "grad_norm": 1.664060733330266, + "learning_rate": 1.371061843640607e-05, + "loss": 0.6614, + "step": 470 + }, + { + "epoch": 0.013862541952429593, + "grad_norm": 1.6757405718327731, + "learning_rate": 1.3856476079346558e-05, + "loss": 0.7493, + "step": 475 + }, + { + "epoch": 0.014008463446665695, + "grad_norm": 1.6644547957700877, + "learning_rate": 1.4002333722287048e-05, + "loss": 0.7036, + "step": 480 + }, + { + "epoch": 0.014154384940901795, + "grad_norm": 1.9023550591981342, + "learning_rate": 1.4148191365227539e-05, + "loss": 0.7264, + "step": 485 + }, + { + "epoch": 0.014300306435137897, + "grad_norm": 1.453455421325802, + "learning_rate": 1.4294049008168028e-05, + "loss": 0.6838, + "step": 490 + }, + { + "epoch": 0.014446227929373997, + "grad_norm": 1.6908321474248393, + "learning_rate": 1.443990665110852e-05, + "loss": 0.724, + "step": 495 + }, + { + "epoch": 0.014592149423610097, + "grad_norm": 1.5833993213625799, + "learning_rate": 1.4585764294049009e-05, + "loss": 0.7204, + "step": 500 + }, + { + "epoch": 0.014738070917846199, + "grad_norm": 2.0269188296839427, + "learning_rate": 1.4731621936989498e-05, + "loss": 0.7267, + "step": 505 + }, + { + "epoch": 0.0148839924120823, + "grad_norm": 1.5472605771373973, + "learning_rate": 1.487747957992999e-05, + "loss": 0.7459, + "step": 510 + }, + { + "epoch": 0.015029913906318401, + "grad_norm": 1.7025139983932893, + "learning_rate": 1.502333722287048e-05, + "loss": 0.6562, + "step": 515 + }, + { + "epoch": 0.015175835400554501, + "grad_norm": 1.8673550437347441, + "learning_rate": 1.516919486581097e-05, + "loss": 0.6948, + "step": 520 + }, + { + "epoch": 0.015321756894790603, + "grad_norm": 1.3565338880388187, + "learning_rate": 1.5315052508751458e-05, + "loss": 0.6973, + "step": 525 + }, + { + "epoch": 0.015467678389026704, + "grad_norm": 1.7590429090976183, + "learning_rate": 1.546091015169195e-05, + "loss": 0.7272, + "step": 530 + }, + { + "epoch": 0.015613599883262804, + "grad_norm": 1.7144170173144773, + "learning_rate": 1.560676779463244e-05, + "loss": 0.7295, + "step": 535 + }, + { + "epoch": 0.015759521377498906, + "grad_norm": 1.5606571713526964, + "learning_rate": 1.575262543757293e-05, + "loss": 0.665, + "step": 540 + }, + { + "epoch": 0.015905442871735006, + "grad_norm": 1.6006824964823971, + "learning_rate": 1.589848308051342e-05, + "loss": 0.6726, + "step": 545 + }, + { + "epoch": 0.016051364365971106, + "grad_norm": 2.094293530293622, + "learning_rate": 1.604434072345391e-05, + "loss": 0.7411, + "step": 550 + }, + { + "epoch": 0.01619728586020721, + "grad_norm": 1.7078000915064542, + "learning_rate": 1.61901983663944e-05, + "loss": 0.7587, + "step": 555 + }, + { + "epoch": 0.01634320735444331, + "grad_norm": 1.4944710379279686, + "learning_rate": 1.633605600933489e-05, + "loss": 0.6323, + "step": 560 + }, + { + "epoch": 0.01648912884867941, + "grad_norm": 1.574611787971994, + "learning_rate": 1.648191365227538e-05, + "loss": 0.6682, + "step": 565 + }, + { + "epoch": 0.01663505034291551, + "grad_norm": 1.5278996951086554, + "learning_rate": 1.662777129521587e-05, + "loss": 0.7071, + "step": 570 + }, + { + "epoch": 0.016780971837151614, + "grad_norm": 1.6822476418518675, + "learning_rate": 1.677362893815636e-05, + "loss": 0.7017, + "step": 575 + }, + { + "epoch": 0.016926893331387714, + "grad_norm": 1.7656120267143682, + "learning_rate": 1.6919486581096848e-05, + "loss": 0.669, + "step": 580 + }, + { + "epoch": 0.017072814825623814, + "grad_norm": 1.8451054979090913, + "learning_rate": 1.7065344224037343e-05, + "loss": 0.689, + "step": 585 + }, + { + "epoch": 0.017218736319859915, + "grad_norm": 1.6107491561470146, + "learning_rate": 1.721120186697783e-05, + "loss": 0.709, + "step": 590 + }, + { + "epoch": 0.017364657814096015, + "grad_norm": 1.7631298036500338, + "learning_rate": 1.7357059509918318e-05, + "loss": 0.752, + "step": 595 + }, + { + "epoch": 0.01751057930833212, + "grad_norm": 1.9098001836709986, + "learning_rate": 1.750291715285881e-05, + "loss": 0.774, + "step": 600 + }, + { + "epoch": 0.01765650080256822, + "grad_norm": 1.6853961941046716, + "learning_rate": 1.76487747957993e-05, + "loss": 0.6939, + "step": 605 + }, + { + "epoch": 0.01780242229680432, + "grad_norm": 1.6804282668264998, + "learning_rate": 1.7794632438739792e-05, + "loss": 0.6952, + "step": 610 + }, + { + "epoch": 0.01794834379104042, + "grad_norm": 1.5399435158399386, + "learning_rate": 1.794049008168028e-05, + "loss": 0.6674, + "step": 615 + }, + { + "epoch": 0.018094265285276523, + "grad_norm": 1.6417423013675476, + "learning_rate": 1.808634772462077e-05, + "loss": 0.6926, + "step": 620 + }, + { + "epoch": 0.018240186779512623, + "grad_norm": 1.7183768786991622, + "learning_rate": 1.8232205367561262e-05, + "loss": 0.729, + "step": 625 + }, + { + "epoch": 0.018386108273748723, + "grad_norm": 1.7341801849493557, + "learning_rate": 1.837806301050175e-05, + "loss": 0.6775, + "step": 630 + }, + { + "epoch": 0.018532029767984824, + "grad_norm": 1.6649539048207478, + "learning_rate": 1.852392065344224e-05, + "loss": 0.7253, + "step": 635 + }, + { + "epoch": 0.018677951262220924, + "grad_norm": 1.7590539603786544, + "learning_rate": 1.8669778296382732e-05, + "loss": 0.6353, + "step": 640 + }, + { + "epoch": 0.018823872756457027, + "grad_norm": 1.6172500057134809, + "learning_rate": 1.881563593932322e-05, + "loss": 0.6806, + "step": 645 + }, + { + "epoch": 0.018969794250693128, + "grad_norm": 1.6803811206114834, + "learning_rate": 1.896149358226371e-05, + "loss": 0.8018, + "step": 650 + }, + { + "epoch": 0.019115715744929228, + "grad_norm": 1.5984355067301463, + "learning_rate": 1.9107351225204202e-05, + "loss": 0.7164, + "step": 655 + }, + { + "epoch": 0.019261637239165328, + "grad_norm": 1.563429573575732, + "learning_rate": 1.9253208868144694e-05, + "loss": 0.652, + "step": 660 + }, + { + "epoch": 0.01940755873340143, + "grad_norm": 1.8095363077898927, + "learning_rate": 1.939906651108518e-05, + "loss": 0.6753, + "step": 665 + }, + { + "epoch": 0.019553480227637532, + "grad_norm": 1.5555149820078507, + "learning_rate": 1.954492415402567e-05, + "loss": 0.7037, + "step": 670 + }, + { + "epoch": 0.019699401721873632, + "grad_norm": 1.7121310281353082, + "learning_rate": 1.9690781796966164e-05, + "loss": 0.6707, + "step": 675 + }, + { + "epoch": 0.019845323216109732, + "grad_norm": 1.4498071021238534, + "learning_rate": 1.9836639439906652e-05, + "loss": 0.7046, + "step": 680 + }, + { + "epoch": 0.019991244710345833, + "grad_norm": 1.4466233892468816, + "learning_rate": 1.9982497082847143e-05, + "loss": 0.6903, + "step": 685 + }, + { + "epoch": 0.020137166204581936, + "grad_norm": 1.6278206523630134, + "learning_rate": 2.012835472578763e-05, + "loss": 0.7052, + "step": 690 + }, + { + "epoch": 0.020283087698818036, + "grad_norm": 1.6981244706447884, + "learning_rate": 2.0274212368728122e-05, + "loss": 0.7527, + "step": 695 + }, + { + "epoch": 0.020429009193054137, + "grad_norm": 1.7031385948067341, + "learning_rate": 2.0420070011668613e-05, + "loss": 0.7486, + "step": 700 + }, + { + "epoch": 0.020574930687290237, + "grad_norm": 1.5818150882172228, + "learning_rate": 2.05659276546091e-05, + "loss": 0.6582, + "step": 705 + }, + { + "epoch": 0.02072085218152634, + "grad_norm": 1.6842467002662005, + "learning_rate": 2.0711785297549592e-05, + "loss": 0.6885, + "step": 710 + }, + { + "epoch": 0.02086677367576244, + "grad_norm": 1.4282165804306994, + "learning_rate": 2.0857642940490083e-05, + "loss": 0.6359, + "step": 715 + }, + { + "epoch": 0.02101269516999854, + "grad_norm": 1.7032330817338395, + "learning_rate": 2.100350058343057e-05, + "loss": 0.7242, + "step": 720 + }, + { + "epoch": 0.02115861666423464, + "grad_norm": 1.538448142519383, + "learning_rate": 2.1149358226371062e-05, + "loss": 0.6382, + "step": 725 + }, + { + "epoch": 0.02130453815847074, + "grad_norm": 1.584463731730771, + "learning_rate": 2.1295215869311554e-05, + "loss": 0.7076, + "step": 730 + }, + { + "epoch": 0.021450459652706845, + "grad_norm": 1.7489371138422, + "learning_rate": 2.1441073512252045e-05, + "loss": 0.6995, + "step": 735 + }, + { + "epoch": 0.021596381146942945, + "grad_norm": 1.6427548661471818, + "learning_rate": 2.1586931155192533e-05, + "loss": 0.7647, + "step": 740 + }, + { + "epoch": 0.021742302641179045, + "grad_norm": 1.7376674014421456, + "learning_rate": 2.173278879813302e-05, + "loss": 0.7063, + "step": 745 + }, + { + "epoch": 0.021888224135415146, + "grad_norm": 1.7308918478302657, + "learning_rate": 2.1878646441073515e-05, + "loss": 0.7039, + "step": 750 + }, + { + "epoch": 0.02203414562965125, + "grad_norm": 1.7009110447382148, + "learning_rate": 2.2024504084014003e-05, + "loss": 0.7357, + "step": 755 + }, + { + "epoch": 0.02218006712388735, + "grad_norm": 1.6845153941030226, + "learning_rate": 2.2170361726954494e-05, + "loss": 0.6467, + "step": 760 + }, + { + "epoch": 0.02232598861812345, + "grad_norm": 1.827793921542039, + "learning_rate": 2.2316219369894982e-05, + "loss": 0.7235, + "step": 765 + }, + { + "epoch": 0.02247191011235955, + "grad_norm": 1.649800671415183, + "learning_rate": 2.2462077012835473e-05, + "loss": 0.7395, + "step": 770 + }, + { + "epoch": 0.02261783160659565, + "grad_norm": 1.6690762339397862, + "learning_rate": 2.2607934655775964e-05, + "loss": 0.7044, + "step": 775 + }, + { + "epoch": 0.022763753100831754, + "grad_norm": 1.611645621469813, + "learning_rate": 2.2753792298716452e-05, + "loss": 0.7094, + "step": 780 + }, + { + "epoch": 0.022909674595067854, + "grad_norm": 1.3988095282507587, + "learning_rate": 2.2899649941656943e-05, + "loss": 0.7096, + "step": 785 + }, + { + "epoch": 0.023055596089303954, + "grad_norm": 1.7174132991172002, + "learning_rate": 2.3045507584597435e-05, + "loss": 0.7229, + "step": 790 + }, + { + "epoch": 0.023201517583540054, + "grad_norm": 1.3661904097330837, + "learning_rate": 2.3191365227537922e-05, + "loss": 0.6368, + "step": 795 + }, + { + "epoch": 0.023347439077776158, + "grad_norm": 1.5937893185090637, + "learning_rate": 2.3337222870478414e-05, + "loss": 0.6955, + "step": 800 + }, + { + "epoch": 0.02349336057201226, + "grad_norm": 1.6873174766585761, + "learning_rate": 2.3483080513418905e-05, + "loss": 0.6511, + "step": 805 + }, + { + "epoch": 0.02363928206624836, + "grad_norm": 1.607069446791644, + "learning_rate": 2.3628938156359396e-05, + "loss": 0.718, + "step": 810 + }, + { + "epoch": 0.02378520356048446, + "grad_norm": 1.7139729288179766, + "learning_rate": 2.3774795799299884e-05, + "loss": 0.6824, + "step": 815 + }, + { + "epoch": 0.02393112505472056, + "grad_norm": 1.4872438626984554, + "learning_rate": 2.392065344224037e-05, + "loss": 0.6344, + "step": 820 + }, + { + "epoch": 0.024077046548956663, + "grad_norm": 1.5654039723021604, + "learning_rate": 2.4066511085180866e-05, + "loss": 0.7409, + "step": 825 + }, + { + "epoch": 0.024222968043192763, + "grad_norm": 1.446173231049819, + "learning_rate": 2.4212368728121354e-05, + "loss": 0.7065, + "step": 830 + }, + { + "epoch": 0.024368889537428863, + "grad_norm": 1.7701228013414976, + "learning_rate": 2.4358226371061845e-05, + "loss": 0.6896, + "step": 835 + }, + { + "epoch": 0.024514811031664963, + "grad_norm": 1.5583655871026196, + "learning_rate": 2.4504084014002333e-05, + "loss": 0.7142, + "step": 840 + }, + { + "epoch": 0.024660732525901067, + "grad_norm": 1.383613673297909, + "learning_rate": 2.4649941656942824e-05, + "loss": 0.6883, + "step": 845 + }, + { + "epoch": 0.024806654020137167, + "grad_norm": 1.6319134030710103, + "learning_rate": 2.4795799299883316e-05, + "loss": 0.7462, + "step": 850 + }, + { + "epoch": 0.024952575514373267, + "grad_norm": 1.5517947834609296, + "learning_rate": 2.4941656942823803e-05, + "loss": 0.6795, + "step": 855 + }, + { + "epoch": 0.025098497008609368, + "grad_norm": 1.6712335021090057, + "learning_rate": 2.5087514585764295e-05, + "loss": 0.69, + "step": 860 + }, + { + "epoch": 0.025244418502845468, + "grad_norm": 1.998899182109342, + "learning_rate": 2.523337222870479e-05, + "loss": 0.747, + "step": 865 + }, + { + "epoch": 0.02539033999708157, + "grad_norm": 1.458426590820653, + "learning_rate": 2.5379229871645277e-05, + "loss": 0.668, + "step": 870 + }, + { + "epoch": 0.02553626149131767, + "grad_norm": 1.6661814738196528, + "learning_rate": 2.5525087514585765e-05, + "loss": 0.8119, + "step": 875 + }, + { + "epoch": 0.025682182985553772, + "grad_norm": 1.8043267177876838, + "learning_rate": 2.5670945157526256e-05, + "loss": 0.7128, + "step": 880 + }, + { + "epoch": 0.025828104479789872, + "grad_norm": 1.595635424906511, + "learning_rate": 2.5816802800466744e-05, + "loss": 0.6743, + "step": 885 + }, + { + "epoch": 0.025974025974025976, + "grad_norm": 1.3210397592732002, + "learning_rate": 2.596266044340724e-05, + "loss": 0.6914, + "step": 890 + }, + { + "epoch": 0.026119947468262076, + "grad_norm": 1.5700248005327584, + "learning_rate": 2.6108518086347726e-05, + "loss": 0.6959, + "step": 895 + }, + { + "epoch": 0.026265868962498176, + "grad_norm": 1.5136291542399603, + "learning_rate": 2.6254375729288217e-05, + "loss": 0.7323, + "step": 900 + }, + { + "epoch": 0.026411790456734276, + "grad_norm": 1.7009729282195039, + "learning_rate": 2.6400233372228705e-05, + "loss": 0.6918, + "step": 905 + }, + { + "epoch": 0.026557711950970377, + "grad_norm": 1.4271883652876505, + "learning_rate": 2.6546091015169193e-05, + "loss": 0.686, + "step": 910 + }, + { + "epoch": 0.02670363344520648, + "grad_norm": 1.6543368167985741, + "learning_rate": 2.6691948658109688e-05, + "loss": 0.7205, + "step": 915 + }, + { + "epoch": 0.02684955493944258, + "grad_norm": 1.4606292966835321, + "learning_rate": 2.683780630105018e-05, + "loss": 0.6939, + "step": 920 + }, + { + "epoch": 0.02699547643367868, + "grad_norm": 1.5086639765454437, + "learning_rate": 2.6983663943990667e-05, + "loss": 0.69, + "step": 925 + }, + { + "epoch": 0.02714139792791478, + "grad_norm": 1.479890690618661, + "learning_rate": 2.7129521586931155e-05, + "loss": 0.6813, + "step": 930 + }, + { + "epoch": 0.027287319422150885, + "grad_norm": 1.8008746804886702, + "learning_rate": 2.7275379229871646e-05, + "loss": 0.6938, + "step": 935 + }, + { + "epoch": 0.027433240916386985, + "grad_norm": 1.6557731270365816, + "learning_rate": 2.742123687281214e-05, + "loss": 0.6724, + "step": 940 + }, + { + "epoch": 0.027579162410623085, + "grad_norm": 1.7436705568477375, + "learning_rate": 2.7567094515752628e-05, + "loss": 0.679, + "step": 945 + }, + { + "epoch": 0.027725083904859185, + "grad_norm": 1.3996057203371555, + "learning_rate": 2.7712952158693116e-05, + "loss": 0.7129, + "step": 950 + }, + { + "epoch": 0.027871005399095285, + "grad_norm": 1.4694127082983113, + "learning_rate": 2.7858809801633607e-05, + "loss": 0.712, + "step": 955 + }, + { + "epoch": 0.02801692689333139, + "grad_norm": 1.6722150708335481, + "learning_rate": 2.8004667444574095e-05, + "loss": 0.7106, + "step": 960 + }, + { + "epoch": 0.02816284838756749, + "grad_norm": 1.3025846309675377, + "learning_rate": 2.815052508751459e-05, + "loss": 0.7091, + "step": 965 + }, + { + "epoch": 0.02830876988180359, + "grad_norm": 1.4139616367230547, + "learning_rate": 2.8296382730455077e-05, + "loss": 0.733, + "step": 970 + }, + { + "epoch": 0.02845469137603969, + "grad_norm": 1.3737571684570908, + "learning_rate": 2.844224037339557e-05, + "loss": 0.6835, + "step": 975 + }, + { + "epoch": 0.028600612870275793, + "grad_norm": 1.6961221498002164, + "learning_rate": 2.8588098016336056e-05, + "loss": 0.7389, + "step": 980 + }, + { + "epoch": 0.028746534364511894, + "grad_norm": 1.4455556810839383, + "learning_rate": 2.8733955659276544e-05, + "loss": 0.7136, + "step": 985 + }, + { + "epoch": 0.028892455858747994, + "grad_norm": 1.545586528378323, + "learning_rate": 2.887981330221704e-05, + "loss": 0.7191, + "step": 990 + }, + { + "epoch": 0.029038377352984094, + "grad_norm": 1.940525043359535, + "learning_rate": 2.902567094515753e-05, + "loss": 0.7679, + "step": 995 + }, + { + "epoch": 0.029184298847220194, + "grad_norm": 1.6239557635261068, + "learning_rate": 2.9171528588098018e-05, + "loss": 0.716, + "step": 1000 + }, + { + "epoch": 0.029330220341456298, + "grad_norm": 1.9001859265232084, + "learning_rate": 2.9317386231038506e-05, + "loss": 0.7008, + "step": 1005 + }, + { + "epoch": 0.029476141835692398, + "grad_norm": 1.6080340776285513, + "learning_rate": 2.9463243873978997e-05, + "loss": 0.6666, + "step": 1010 + }, + { + "epoch": 0.0296220633299285, + "grad_norm": 1.510133794582749, + "learning_rate": 2.960910151691949e-05, + "loss": 0.6858, + "step": 1015 + }, + { + "epoch": 0.0297679848241646, + "grad_norm": 1.377491998083542, + "learning_rate": 2.975495915985998e-05, + "loss": 0.7237, + "step": 1020 + }, + { + "epoch": 0.0299139063184007, + "grad_norm": 1.6423776203754805, + "learning_rate": 2.9900816802800467e-05, + "loss": 0.7219, + "step": 1025 + }, + { + "epoch": 0.030059827812636802, + "grad_norm": 1.89278007028969, + "learning_rate": 3.004667444574096e-05, + "loss": 0.7372, + "step": 1030 + }, + { + "epoch": 0.030205749306872903, + "grad_norm": 1.466322406875981, + "learning_rate": 3.0192532088681446e-05, + "loss": 0.7088, + "step": 1035 + }, + { + "epoch": 0.030351670801109003, + "grad_norm": 1.601207367034692, + "learning_rate": 3.033838973162194e-05, + "loss": 0.7708, + "step": 1040 + }, + { + "epoch": 0.030497592295345103, + "grad_norm": 1.552335498453275, + "learning_rate": 3.048424737456243e-05, + "loss": 0.7334, + "step": 1045 + }, + { + "epoch": 0.030643513789581207, + "grad_norm": 1.5465196791465428, + "learning_rate": 3.0630105017502916e-05, + "loss": 0.7422, + "step": 1050 + }, + { + "epoch": 0.030789435283817307, + "grad_norm": 1.8495863677675473, + "learning_rate": 3.077596266044341e-05, + "loss": 0.7161, + "step": 1055 + }, + { + "epoch": 0.030935356778053407, + "grad_norm": 1.4806086724803613, + "learning_rate": 3.09218203033839e-05, + "loss": 0.6846, + "step": 1060 + }, + { + "epoch": 0.031081278272289507, + "grad_norm": 1.8485293146616193, + "learning_rate": 3.106767794632439e-05, + "loss": 0.7296, + "step": 1065 + }, + { + "epoch": 0.031227199766525607, + "grad_norm": 1.9255223889371524, + "learning_rate": 3.121353558926488e-05, + "loss": 0.7408, + "step": 1070 + }, + { + "epoch": 0.03137312126076171, + "grad_norm": 1.423045709866818, + "learning_rate": 3.135939323220537e-05, + "loss": 0.7017, + "step": 1075 + }, + { + "epoch": 0.03151904275499781, + "grad_norm": 1.7502566892021552, + "learning_rate": 3.150525087514586e-05, + "loss": 0.7621, + "step": 1080 + }, + { + "epoch": 0.031664964249233915, + "grad_norm": 1.5044752857327865, + "learning_rate": 3.165110851808635e-05, + "loss": 0.7685, + "step": 1085 + }, + { + "epoch": 0.03181088574347001, + "grad_norm": 1.706014273236418, + "learning_rate": 3.179696616102684e-05, + "loss": 0.7406, + "step": 1090 + }, + { + "epoch": 0.031956807237706115, + "grad_norm": 1.3184924575175818, + "learning_rate": 3.194282380396733e-05, + "loss": 0.6446, + "step": 1095 + }, + { + "epoch": 0.03210272873194221, + "grad_norm": 1.480355858792458, + "learning_rate": 3.208868144690782e-05, + "loss": 0.6993, + "step": 1100 + }, + { + "epoch": 0.032248650226178316, + "grad_norm": 1.59599280364544, + "learning_rate": 3.2234539089848306e-05, + "loss": 0.7147, + "step": 1105 + }, + { + "epoch": 0.03239457172041442, + "grad_norm": 1.693123809405065, + "learning_rate": 3.23803967327888e-05, + "loss": 0.7396, + "step": 1110 + }, + { + "epoch": 0.032540493214650516, + "grad_norm": 1.2426197431450783, + "learning_rate": 3.252625437572929e-05, + "loss": 0.7069, + "step": 1115 + }, + { + "epoch": 0.03268641470888662, + "grad_norm": 1.4824005566342304, + "learning_rate": 3.267211201866978e-05, + "loss": 0.6977, + "step": 1120 + }, + { + "epoch": 0.03283233620312272, + "grad_norm": 1.4740320021968292, + "learning_rate": 3.281796966161027e-05, + "loss": 0.6351, + "step": 1125 + }, + { + "epoch": 0.03297825769735882, + "grad_norm": 1.5623660740625849, + "learning_rate": 3.296382730455076e-05, + "loss": 0.7545, + "step": 1130 + }, + { + "epoch": 0.033124179191594924, + "grad_norm": 1.4295313989283687, + "learning_rate": 3.3109684947491247e-05, + "loss": 0.729, + "step": 1135 + }, + { + "epoch": 0.03327010068583102, + "grad_norm": 1.4341872631511632, + "learning_rate": 3.325554259043174e-05, + "loss": 0.6981, + "step": 1140 + }, + { + "epoch": 0.033416022180067124, + "grad_norm": 1.5157801689309949, + "learning_rate": 3.340140023337223e-05, + "loss": 0.6526, + "step": 1145 + }, + { + "epoch": 0.03356194367430323, + "grad_norm": 1.358575252506013, + "learning_rate": 3.354725787631272e-05, + "loss": 0.664, + "step": 1150 + }, + { + "epoch": 0.033707865168539325, + "grad_norm": 1.399471604852333, + "learning_rate": 3.369311551925321e-05, + "loss": 0.6395, + "step": 1155 + }, + { + "epoch": 0.03385378666277543, + "grad_norm": 1.6780295418808926, + "learning_rate": 3.3838973162193696e-05, + "loss": 0.6918, + "step": 1160 + }, + { + "epoch": 0.033999708157011525, + "grad_norm": 1.8749579850500733, + "learning_rate": 3.398483080513419e-05, + "loss": 0.7318, + "step": 1165 + }, + { + "epoch": 0.03414562965124763, + "grad_norm": 1.5350108766982589, + "learning_rate": 3.4130688448074685e-05, + "loss": 0.7566, + "step": 1170 + }, + { + "epoch": 0.03429155114548373, + "grad_norm": 1.5655487306424387, + "learning_rate": 3.427654609101517e-05, + "loss": 0.7289, + "step": 1175 + }, + { + "epoch": 0.03443747263971983, + "grad_norm": 1.4571372656445885, + "learning_rate": 3.442240373395566e-05, + "loss": 0.7596, + "step": 1180 + }, + { + "epoch": 0.03458339413395593, + "grad_norm": 1.6794172098342217, + "learning_rate": 3.456826137689615e-05, + "loss": 0.6708, + "step": 1185 + }, + { + "epoch": 0.03472931562819203, + "grad_norm": 1.6805832875498925, + "learning_rate": 3.4714119019836636e-05, + "loss": 0.8062, + "step": 1190 + }, + { + "epoch": 0.034875237122428134, + "grad_norm": 1.3146337017621434, + "learning_rate": 3.4859976662777134e-05, + "loss": 0.7486, + "step": 1195 + }, + { + "epoch": 0.03502115861666424, + "grad_norm": 1.7545223488891168, + "learning_rate": 3.500583430571762e-05, + "loss": 0.6879, + "step": 1200 + }, + { + "epoch": 0.035167080110900334, + "grad_norm": 1.4967220005992836, + "learning_rate": 3.515169194865811e-05, + "loss": 0.6696, + "step": 1205 + }, + { + "epoch": 0.03531300160513644, + "grad_norm": 1.345003682960457, + "learning_rate": 3.52975495915986e-05, + "loss": 0.6719, + "step": 1210 + }, + { + "epoch": 0.035458923099372534, + "grad_norm": 1.3306498551040196, + "learning_rate": 3.5443407234539086e-05, + "loss": 0.6595, + "step": 1215 + }, + { + "epoch": 0.03560484459360864, + "grad_norm": 1.6302230033146294, + "learning_rate": 3.5589264877479584e-05, + "loss": 0.7375, + "step": 1220 + }, + { + "epoch": 0.03575076608784474, + "grad_norm": 1.384508228051698, + "learning_rate": 3.5735122520420075e-05, + "loss": 0.6709, + "step": 1225 + }, + { + "epoch": 0.03589668758208084, + "grad_norm": 1.3008258698355437, + "learning_rate": 3.588098016336056e-05, + "loss": 0.6238, + "step": 1230 + }, + { + "epoch": 0.03604260907631694, + "grad_norm": 1.911916803446253, + "learning_rate": 3.602683780630105e-05, + "loss": 0.7463, + "step": 1235 + }, + { + "epoch": 0.036188530570553046, + "grad_norm": 1.4898103249753876, + "learning_rate": 3.617269544924154e-05, + "loss": 0.7167, + "step": 1240 + }, + { + "epoch": 0.03633445206478914, + "grad_norm": 1.4667267735145646, + "learning_rate": 3.631855309218203e-05, + "loss": 0.7409, + "step": 1245 + }, + { + "epoch": 0.036480373559025246, + "grad_norm": 1.4045462101272668, + "learning_rate": 3.6464410735122524e-05, + "loss": 0.797, + "step": 1250 + }, + { + "epoch": 0.03662629505326134, + "grad_norm": 1.5856985921341709, + "learning_rate": 3.661026837806301e-05, + "loss": 0.7175, + "step": 1255 + }, + { + "epoch": 0.03677221654749745, + "grad_norm": 1.5243823667462688, + "learning_rate": 3.67561260210035e-05, + "loss": 0.698, + "step": 1260 + }, + { + "epoch": 0.03691813804173355, + "grad_norm": 1.6455881879908836, + "learning_rate": 3.690198366394399e-05, + "loss": 0.7294, + "step": 1265 + }, + { + "epoch": 0.03706405953596965, + "grad_norm": 1.5669911601188393, + "learning_rate": 3.704784130688448e-05, + "loss": 0.6779, + "step": 1270 + }, + { + "epoch": 0.03720998103020575, + "grad_norm": 1.5657409691383104, + "learning_rate": 3.719369894982497e-05, + "loss": 0.7173, + "step": 1275 + }, + { + "epoch": 0.03735590252444185, + "grad_norm": 1.4649791712006108, + "learning_rate": 3.7339556592765465e-05, + "loss": 0.6902, + "step": 1280 + }, + { + "epoch": 0.03750182401867795, + "grad_norm": 1.5115106621884336, + "learning_rate": 3.748541423570595e-05, + "loss": 0.796, + "step": 1285 + }, + { + "epoch": 0.037647745512914055, + "grad_norm": 1.5684923982419243, + "learning_rate": 3.763127187864644e-05, + "loss": 0.7142, + "step": 1290 + }, + { + "epoch": 0.03779366700715015, + "grad_norm": 1.5099222776771948, + "learning_rate": 3.777712952158693e-05, + "loss": 0.6823, + "step": 1295 + }, + { + "epoch": 0.037939588501386255, + "grad_norm": 1.4243264396360908, + "learning_rate": 3.792298716452742e-05, + "loss": 0.6916, + "step": 1300 + }, + { + "epoch": 0.03808550999562235, + "grad_norm": 1.3191401279149118, + "learning_rate": 3.8068844807467914e-05, + "loss": 0.6742, + "step": 1305 + }, + { + "epoch": 0.038231431489858456, + "grad_norm": 1.3218496472271208, + "learning_rate": 3.8214702450408405e-05, + "loss": 0.7052, + "step": 1310 + }, + { + "epoch": 0.03837735298409456, + "grad_norm": 1.448794701520074, + "learning_rate": 3.836056009334889e-05, + "loss": 0.7278, + "step": 1315 + }, + { + "epoch": 0.038523274478330656, + "grad_norm": 1.3899813988499192, + "learning_rate": 3.850641773628939e-05, + "loss": 0.7099, + "step": 1320 + }, + { + "epoch": 0.03866919597256676, + "grad_norm": 1.4726248781173996, + "learning_rate": 3.865227537922987e-05, + "loss": 0.581, + "step": 1325 + }, + { + "epoch": 0.03881511746680286, + "grad_norm": 1.2626979658752469, + "learning_rate": 3.879813302217036e-05, + "loss": 0.6695, + "step": 1330 + }, + { + "epoch": 0.03896103896103896, + "grad_norm": 1.435335222367685, + "learning_rate": 3.8943990665110854e-05, + "loss": 0.7263, + "step": 1335 + }, + { + "epoch": 0.039106960455275064, + "grad_norm": 1.3976072965377189, + "learning_rate": 3.908984830805134e-05, + "loss": 0.7122, + "step": 1340 + }, + { + "epoch": 0.03925288194951116, + "grad_norm": 1.3097883164112671, + "learning_rate": 3.923570595099184e-05, + "loss": 0.7605, + "step": 1345 + }, + { + "epoch": 0.039398803443747264, + "grad_norm": 1.5303588178820082, + "learning_rate": 3.938156359393233e-05, + "loss": 0.7838, + "step": 1350 + }, + { + "epoch": 0.03954472493798337, + "grad_norm": 1.6361409986880862, + "learning_rate": 3.952742123687281e-05, + "loss": 0.769, + "step": 1355 + }, + { + "epoch": 0.039690646432219465, + "grad_norm": 1.6507215081937656, + "learning_rate": 3.9673278879813304e-05, + "loss": 0.7495, + "step": 1360 + }, + { + "epoch": 0.03983656792645557, + "grad_norm": 1.4393876063478734, + "learning_rate": 3.9819136522753795e-05, + "loss": 0.6859, + "step": 1365 + }, + { + "epoch": 0.039982489420691665, + "grad_norm": 1.5005476834299698, + "learning_rate": 3.9964994165694286e-05, + "loss": 0.7226, + "step": 1370 + }, + { + "epoch": 0.04012841091492777, + "grad_norm": 1.5310678050070499, + "learning_rate": 4.011085180863478e-05, + "loss": 0.7582, + "step": 1375 + }, + { + "epoch": 0.04027433240916387, + "grad_norm": 1.3212589600926459, + "learning_rate": 4.025670945157526e-05, + "loss": 0.6253, + "step": 1380 + }, + { + "epoch": 0.04042025390339997, + "grad_norm": 1.2668398157226317, + "learning_rate": 4.040256709451575e-05, + "loss": 0.7296, + "step": 1385 + }, + { + "epoch": 0.04056617539763607, + "grad_norm": 1.2292822802173848, + "learning_rate": 4.0548424737456244e-05, + "loss": 0.6372, + "step": 1390 + }, + { + "epoch": 0.04071209689187217, + "grad_norm": 1.482502641264336, + "learning_rate": 4.0694282380396735e-05, + "loss": 0.7039, + "step": 1395 + }, + { + "epoch": 0.04085801838610827, + "grad_norm": 1.3959579669494955, + "learning_rate": 4.0840140023337226e-05, + "loss": 0.6954, + "step": 1400 + }, + { + "epoch": 0.04100393988034438, + "grad_norm": 1.7627540859226858, + "learning_rate": 4.098599766627772e-05, + "loss": 0.6798, + "step": 1405 + }, + { + "epoch": 0.041149861374580474, + "grad_norm": 2.0770882092602108, + "learning_rate": 4.11318553092182e-05, + "loss": 0.7118, + "step": 1410 + }, + { + "epoch": 0.04129578286881658, + "grad_norm": 1.5840491687532077, + "learning_rate": 4.127771295215869e-05, + "loss": 0.716, + "step": 1415 + }, + { + "epoch": 0.04144170436305268, + "grad_norm": 1.4297647544021, + "learning_rate": 4.1423570595099184e-05, + "loss": 0.6965, + "step": 1420 + }, + { + "epoch": 0.04158762585728878, + "grad_norm": 1.398340378534979, + "learning_rate": 4.1569428238039676e-05, + "loss": 0.7058, + "step": 1425 + }, + { + "epoch": 0.04173354735152488, + "grad_norm": 1.3188527713486136, + "learning_rate": 4.171528588098017e-05, + "loss": 0.7317, + "step": 1430 + }, + { + "epoch": 0.04187946884576098, + "grad_norm": 1.5069846562860565, + "learning_rate": 4.186114352392065e-05, + "loss": 0.7435, + "step": 1435 + }, + { + "epoch": 0.04202539033999708, + "grad_norm": 1.277428473829212, + "learning_rate": 4.200700116686114e-05, + "loss": 0.704, + "step": 1440 + }, + { + "epoch": 0.042171311834233186, + "grad_norm": 1.415125736273544, + "learning_rate": 4.215285880980164e-05, + "loss": 0.6985, + "step": 1445 + }, + { + "epoch": 0.04231723332846928, + "grad_norm": 1.4305166349397616, + "learning_rate": 4.2298716452742125e-05, + "loss": 0.6834, + "step": 1450 + }, + { + "epoch": 0.042463154822705386, + "grad_norm": 1.7288035751579278, + "learning_rate": 4.2444574095682616e-05, + "loss": 0.7263, + "step": 1455 + }, + { + "epoch": 0.04260907631694148, + "grad_norm": 1.5886957733523792, + "learning_rate": 4.259043173862311e-05, + "loss": 0.6945, + "step": 1460 + }, + { + "epoch": 0.042754997811177586, + "grad_norm": 1.5777879075523655, + "learning_rate": 4.273628938156359e-05, + "loss": 0.7524, + "step": 1465 + }, + { + "epoch": 0.04290091930541369, + "grad_norm": 1.277107562463112, + "learning_rate": 4.288214702450409e-05, + "loss": 0.7441, + "step": 1470 + }, + { + "epoch": 0.04304684079964979, + "grad_norm": 1.2626646097924976, + "learning_rate": 4.3028004667444574e-05, + "loss": 0.6478, + "step": 1475 + }, + { + "epoch": 0.04319276229388589, + "grad_norm": 1.6447873754918085, + "learning_rate": 4.3173862310385065e-05, + "loss": 0.6455, + "step": 1480 + }, + { + "epoch": 0.04333868378812199, + "grad_norm": 1.3730093102411656, + "learning_rate": 4.3319719953325557e-05, + "loss": 0.6718, + "step": 1485 + }, + { + "epoch": 0.04348460528235809, + "grad_norm": 1.3065791285419022, + "learning_rate": 4.346557759626604e-05, + "loss": 0.6949, + "step": 1490 + }, + { + "epoch": 0.043630526776594195, + "grad_norm": 1.4009097173279175, + "learning_rate": 4.361143523920654e-05, + "loss": 0.7027, + "step": 1495 + }, + { + "epoch": 0.04377644827083029, + "grad_norm": 1.357777526652003, + "learning_rate": 4.375729288214703e-05, + "loss": 0.727, + "step": 1500 + }, + { + "epoch": 0.043922369765066395, + "grad_norm": 1.8183917864134866, + "learning_rate": 4.3903150525087515e-05, + "loss": 0.7275, + "step": 1505 + }, + { + "epoch": 0.0440682912593025, + "grad_norm": 1.4573978251867685, + "learning_rate": 4.4049008168028006e-05, + "loss": 0.8611, + "step": 1510 + }, + { + "epoch": 0.044214212753538595, + "grad_norm": 1.6232079352018782, + "learning_rate": 4.41948658109685e-05, + "loss": 0.7801, + "step": 1515 + }, + { + "epoch": 0.0443601342477747, + "grad_norm": 1.4392573700330675, + "learning_rate": 4.434072345390899e-05, + "loss": 0.7565, + "step": 1520 + }, + { + "epoch": 0.044506055742010796, + "grad_norm": 1.5163832264971888, + "learning_rate": 4.448658109684948e-05, + "loss": 0.6971, + "step": 1525 + }, + { + "epoch": 0.0446519772362469, + "grad_norm": 2.317978025584996, + "learning_rate": 4.4632438739789964e-05, + "loss": 0.764, + "step": 1530 + }, + { + "epoch": 0.044797898730483, + "grad_norm": 1.7525092590216673, + "learning_rate": 4.4778296382730455e-05, + "loss": 0.7569, + "step": 1535 + }, + { + "epoch": 0.0449438202247191, + "grad_norm": 1.5015982832337358, + "learning_rate": 4.4924154025670946e-05, + "loss": 0.6919, + "step": 1540 + }, + { + "epoch": 0.045089741718955204, + "grad_norm": 1.4924861531543965, + "learning_rate": 4.507001166861144e-05, + "loss": 0.7925, + "step": 1545 + }, + { + "epoch": 0.0452356632131913, + "grad_norm": 1.4757138590501715, + "learning_rate": 4.521586931155193e-05, + "loss": 0.7373, + "step": 1550 + }, + { + "epoch": 0.045381584707427404, + "grad_norm": 1.2053883351250492, + "learning_rate": 4.536172695449242e-05, + "loss": 0.6691, + "step": 1555 + }, + { + "epoch": 0.04552750620166351, + "grad_norm": 1.4506980323554002, + "learning_rate": 4.5507584597432904e-05, + "loss": 0.7319, + "step": 1560 + }, + { + "epoch": 0.045673427695899604, + "grad_norm": 1.259230745043163, + "learning_rate": 4.5653442240373396e-05, + "loss": 0.7161, + "step": 1565 + }, + { + "epoch": 0.04581934919013571, + "grad_norm": 1.294327365458817, + "learning_rate": 4.579929988331389e-05, + "loss": 0.74, + "step": 1570 + }, + { + "epoch": 0.045965270684371805, + "grad_norm": 1.3861326100551372, + "learning_rate": 4.594515752625438e-05, + "loss": 0.7444, + "step": 1575 + }, + { + "epoch": 0.04611119217860791, + "grad_norm": 1.258164525124542, + "learning_rate": 4.609101516919487e-05, + "loss": 0.6658, + "step": 1580 + }, + { + "epoch": 0.04625711367284401, + "grad_norm": 1.5429846005126808, + "learning_rate": 4.6236872812135354e-05, + "loss": 0.7618, + "step": 1585 + }, + { + "epoch": 0.04640303516708011, + "grad_norm": 1.2413030915088856, + "learning_rate": 4.6382730455075845e-05, + "loss": 0.6828, + "step": 1590 + }, + { + "epoch": 0.04654895666131621, + "grad_norm": 1.2893065668062218, + "learning_rate": 4.652858809801634e-05, + "loss": 0.7207, + "step": 1595 + }, + { + "epoch": 0.046694878155552316, + "grad_norm": 1.534254191324295, + "learning_rate": 4.667444574095683e-05, + "loss": 0.6597, + "step": 1600 + }, + { + "epoch": 0.04684079964978841, + "grad_norm": 1.325070694110307, + "learning_rate": 4.682030338389732e-05, + "loss": 0.7319, + "step": 1605 + }, + { + "epoch": 0.04698672114402452, + "grad_norm": 1.7421480208019755, + "learning_rate": 4.696616102683781e-05, + "loss": 0.7613, + "step": 1610 + }, + { + "epoch": 0.04713264263826061, + "grad_norm": 1.3878222764467754, + "learning_rate": 4.7112018669778294e-05, + "loss": 0.7144, + "step": 1615 + }, + { + "epoch": 0.04727856413249672, + "grad_norm": 1.437164477838428, + "learning_rate": 4.725787631271879e-05, + "loss": 0.7222, + "step": 1620 + }, + { + "epoch": 0.04742448562673282, + "grad_norm": 1.4216858430458277, + "learning_rate": 4.7403733955659277e-05, + "loss": 0.7008, + "step": 1625 + }, + { + "epoch": 0.04757040712096892, + "grad_norm": 1.4633195224395286, + "learning_rate": 4.754959159859977e-05, + "loss": 0.7669, + "step": 1630 + }, + { + "epoch": 0.04771632861520502, + "grad_norm": 1.433515409194029, + "learning_rate": 4.769544924154026e-05, + "loss": 0.75, + "step": 1635 + }, + { + "epoch": 0.04786225010944112, + "grad_norm": 1.4079939090780536, + "learning_rate": 4.784130688448074e-05, + "loss": 0.6996, + "step": 1640 + }, + { + "epoch": 0.04800817160367722, + "grad_norm": 1.3618814112009943, + "learning_rate": 4.798716452742124e-05, + "loss": 0.7725, + "step": 1645 + }, + { + "epoch": 0.048154093097913325, + "grad_norm": 1.4928210231146142, + "learning_rate": 4.813302217036173e-05, + "loss": 0.6997, + "step": 1650 + }, + { + "epoch": 0.04830001459214942, + "grad_norm": 1.3787922186887145, + "learning_rate": 4.827887981330222e-05, + "loss": 0.729, + "step": 1655 + }, + { + "epoch": 0.048445936086385526, + "grad_norm": 1.4613625687495408, + "learning_rate": 4.842473745624271e-05, + "loss": 0.6909, + "step": 1660 + }, + { + "epoch": 0.04859185758062162, + "grad_norm": 1.3840626726504617, + "learning_rate": 4.85705950991832e-05, + "loss": 0.6814, + "step": 1665 + }, + { + "epoch": 0.048737779074857726, + "grad_norm": 1.2832391941332677, + "learning_rate": 4.871645274212369e-05, + "loss": 0.7335, + "step": 1670 + }, + { + "epoch": 0.04888370056909383, + "grad_norm": 1.5278573819228933, + "learning_rate": 4.886231038506418e-05, + "loss": 0.7602, + "step": 1675 + }, + { + "epoch": 0.049029622063329927, + "grad_norm": 1.3147818058889946, + "learning_rate": 4.9008168028004666e-05, + "loss": 0.759, + "step": 1680 + }, + { + "epoch": 0.04917554355756603, + "grad_norm": 1.4974669874886688, + "learning_rate": 4.915402567094516e-05, + "loss": 0.7281, + "step": 1685 + }, + { + "epoch": 0.049321465051802134, + "grad_norm": 1.325413873664663, + "learning_rate": 4.929988331388565e-05, + "loss": 0.6783, + "step": 1690 + }, + { + "epoch": 0.04946738654603823, + "grad_norm": 1.6775492170661628, + "learning_rate": 4.944574095682614e-05, + "loss": 0.7222, + "step": 1695 + }, + { + "epoch": 0.049613308040274334, + "grad_norm": 1.5164031702499228, + "learning_rate": 4.959159859976663e-05, + "loss": 0.7444, + "step": 1700 + }, + { + "epoch": 0.04975922953451043, + "grad_norm": 1.3883945829821218, + "learning_rate": 4.973745624270712e-05, + "loss": 0.7094, + "step": 1705 + }, + { + "epoch": 0.049905151028746535, + "grad_norm": 1.4141884125777868, + "learning_rate": 4.988331388564761e-05, + "loss": 0.7099, + "step": 1710 + }, + { + "epoch": 0.05005107252298264, + "grad_norm": 1.5489285429991548, + "learning_rate": 4.99999998952091e-05, + "loss": 0.7698, + "step": 1715 + }, + { + "epoch": 0.050196994017218735, + "grad_norm": 1.5009385671849187, + "learning_rate": 4.999999622752747e-05, + "loss": 0.7016, + "step": 1720 + }, + { + "epoch": 0.05034291551145484, + "grad_norm": 1.1100258836488293, + "learning_rate": 4.999998732030149e-05, + "loss": 0.7713, + "step": 1725 + }, + { + "epoch": 0.050488837005690936, + "grad_norm": 1.3623240200429385, + "learning_rate": 4.999997317353324e-05, + "loss": 0.7417, + "step": 1730 + }, + { + "epoch": 0.05063475849992704, + "grad_norm": 1.1959053957175283, + "learning_rate": 4.999995378722602e-05, + "loss": 0.7139, + "step": 1735 + }, + { + "epoch": 0.05078067999416314, + "grad_norm": 1.621560063903723, + "learning_rate": 4.999992916138431e-05, + "loss": 0.7428, + "step": 1740 + }, + { + "epoch": 0.05092660148839924, + "grad_norm": 1.1896886442979866, + "learning_rate": 4.999989929601388e-05, + "loss": 0.7488, + "step": 1745 + }, + { + "epoch": 0.05107252298263534, + "grad_norm": 1.4603550657267539, + "learning_rate": 4.999986419112167e-05, + "loss": 0.7483, + "step": 1750 + }, + { + "epoch": 0.05121844447687144, + "grad_norm": 1.6414486516516842, + "learning_rate": 4.9999823846715864e-05, + "loss": 0.748, + "step": 1755 + }, + { + "epoch": 0.051364365971107544, + "grad_norm": 1.5055996794183943, + "learning_rate": 4.999977826280584e-05, + "loss": 0.7276, + "step": 1760 + }, + { + "epoch": 0.05151028746534365, + "grad_norm": 1.3554159340661034, + "learning_rate": 4.999972743940223e-05, + "loss": 0.7041, + "step": 1765 + }, + { + "epoch": 0.051656208959579744, + "grad_norm": 1.4503814058183095, + "learning_rate": 4.999967137651686e-05, + "loss": 0.7733, + "step": 1770 + }, + { + "epoch": 0.05180213045381585, + "grad_norm": 1.4164752212206861, + "learning_rate": 4.999961007416279e-05, + "loss": 0.7353, + "step": 1775 + }, + { + "epoch": 0.05194805194805195, + "grad_norm": 1.6006939733383188, + "learning_rate": 4.999954353235429e-05, + "loss": 0.7567, + "step": 1780 + }, + { + "epoch": 0.05209397344228805, + "grad_norm": 1.3758489941956624, + "learning_rate": 4.999947175110686e-05, + "loss": 0.7446, + "step": 1785 + }, + { + "epoch": 0.05223989493652415, + "grad_norm": 1.3256297071339445, + "learning_rate": 4.999939473043721e-05, + "loss": 0.8025, + "step": 1790 + }, + { + "epoch": 0.05238581643076025, + "grad_norm": 1.3357548954547407, + "learning_rate": 4.999931247036329e-05, + "loss": 0.7409, + "step": 1795 + }, + { + "epoch": 0.05253173792499635, + "grad_norm": 1.3184267663106126, + "learning_rate": 4.999922497090424e-05, + "loss": 0.6972, + "step": 1800 + }, + { + "epoch": 0.052677659419232456, + "grad_norm": 1.4287090803256959, + "learning_rate": 4.999913223208044e-05, + "loss": 0.7577, + "step": 1805 + }, + { + "epoch": 0.05282358091346855, + "grad_norm": 1.4513227193251272, + "learning_rate": 4.999903425391349e-05, + "loss": 0.7461, + "step": 1810 + }, + { + "epoch": 0.052969502407704656, + "grad_norm": 1.5577580011424679, + "learning_rate": 4.99989310364262e-05, + "loss": 0.7457, + "step": 1815 + }, + { + "epoch": 0.05311542390194075, + "grad_norm": 1.4317513899882508, + "learning_rate": 4.9998822579642616e-05, + "loss": 0.683, + "step": 1820 + }, + { + "epoch": 0.05326134539617686, + "grad_norm": 1.3873535818291367, + "learning_rate": 4.999870888358799e-05, + "loss": 0.7166, + "step": 1825 + }, + { + "epoch": 0.05340726689041296, + "grad_norm": 1.3423429338740624, + "learning_rate": 4.9998589948288786e-05, + "loss": 0.7035, + "step": 1830 + }, + { + "epoch": 0.05355318838464906, + "grad_norm": 1.3402373645243024, + "learning_rate": 4.999846577377272e-05, + "loss": 0.7836, + "step": 1835 + }, + { + "epoch": 0.05369910987888516, + "grad_norm": 1.4676081185696548, + "learning_rate": 4.9998336360068695e-05, + "loss": 0.6916, + "step": 1840 + }, + { + "epoch": 0.05384503137312126, + "grad_norm": 3.2226642715494096, + "learning_rate": 4.999820170720686e-05, + "loss": 0.7324, + "step": 1845 + }, + { + "epoch": 0.05399095286735736, + "grad_norm": 1.5147730323151014, + "learning_rate": 4.999806181521855e-05, + "loss": 0.775, + "step": 1850 + }, + { + "epoch": 0.054136874361593465, + "grad_norm": 1.2677879562387062, + "learning_rate": 4.9997916684136357e-05, + "loss": 0.6843, + "step": 1855 + }, + { + "epoch": 0.05428279585582956, + "grad_norm": 1.4626998503363509, + "learning_rate": 4.9997766313994075e-05, + "loss": 0.7271, + "step": 1860 + }, + { + "epoch": 0.054428717350065665, + "grad_norm": 1.6242490303940638, + "learning_rate": 4.999761070482672e-05, + "loss": 0.7511, + "step": 1865 + }, + { + "epoch": 0.05457463884430177, + "grad_norm": 1.471367070391399, + "learning_rate": 4.999744985667053e-05, + "loss": 0.7383, + "step": 1870 + }, + { + "epoch": 0.054720560338537866, + "grad_norm": 1.2398458084232888, + "learning_rate": 4.9997283769562965e-05, + "loss": 0.6543, + "step": 1875 + }, + { + "epoch": 0.05486648183277397, + "grad_norm": 1.2460398967700295, + "learning_rate": 4.9997112443542686e-05, + "loss": 0.7728, + "step": 1880 + }, + { + "epoch": 0.055012403327010066, + "grad_norm": 1.5515635682991524, + "learning_rate": 4.9996935878649616e-05, + "loss": 0.8184, + "step": 1885 + }, + { + "epoch": 0.05515832482124617, + "grad_norm": 1.4140897260560117, + "learning_rate": 4.999675407492484e-05, + "loss": 0.6774, + "step": 1890 + }, + { + "epoch": 0.055304246315482274, + "grad_norm": 1.2477321550872238, + "learning_rate": 4.9996567032410724e-05, + "loss": 0.6574, + "step": 1895 + }, + { + "epoch": 0.05545016780971837, + "grad_norm": 1.3933862786721032, + "learning_rate": 4.99963747511508e-05, + "loss": 0.7777, + "step": 1900 + }, + { + "epoch": 0.055596089303954474, + "grad_norm": 1.3336333392863404, + "learning_rate": 4.999617723118985e-05, + "loss": 0.7539, + "step": 1905 + }, + { + "epoch": 0.05574201079819057, + "grad_norm": 1.2310526135000395, + "learning_rate": 4.999597447257388e-05, + "loss": 0.7208, + "step": 1910 + }, + { + "epoch": 0.055887932292426674, + "grad_norm": 1.195398380972192, + "learning_rate": 4.9995766475350106e-05, + "loss": 0.6929, + "step": 1915 + }, + { + "epoch": 0.05603385378666278, + "grad_norm": 1.3763010713502086, + "learning_rate": 4.9995553239566956e-05, + "loss": 0.7382, + "step": 1920 + }, + { + "epoch": 0.056179775280898875, + "grad_norm": 1.3502815894725948, + "learning_rate": 4.9995334765274093e-05, + "loss": 0.7508, + "step": 1925 + }, + { + "epoch": 0.05632569677513498, + "grad_norm": 1.2630819320313937, + "learning_rate": 4.9995111052522374e-05, + "loss": 0.7831, + "step": 1930 + }, + { + "epoch": 0.056471618269371075, + "grad_norm": 1.1598061036629406, + "learning_rate": 4.999488210136393e-05, + "loss": 0.7027, + "step": 1935 + }, + { + "epoch": 0.05661753976360718, + "grad_norm": 1.3085030319156963, + "learning_rate": 4.999464791185204e-05, + "loss": 0.6834, + "step": 1940 + }, + { + "epoch": 0.05676346125784328, + "grad_norm": 1.189253421244276, + "learning_rate": 4.9994408484041266e-05, + "loss": 0.6539, + "step": 1945 + }, + { + "epoch": 0.05690938275207938, + "grad_norm": 1.328377197158246, + "learning_rate": 4.999416381798735e-05, + "loss": 0.7957, + "step": 1950 + }, + { + "epoch": 0.05705530424631548, + "grad_norm": 1.3831582959239892, + "learning_rate": 4.999391391374726e-05, + "loss": 0.7051, + "step": 1955 + }, + { + "epoch": 0.05720122574055159, + "grad_norm": 1.2371294210719483, + "learning_rate": 4.999365877137922e-05, + "loss": 0.6856, + "step": 1960 + }, + { + "epoch": 0.05734714723478768, + "grad_norm": 1.1433664875714422, + "learning_rate": 4.9993398390942605e-05, + "loss": 0.6796, + "step": 1965 + }, + { + "epoch": 0.05749306872902379, + "grad_norm": 1.4680255325182232, + "learning_rate": 4.9993132772498075e-05, + "loss": 0.7548, + "step": 1970 + }, + { + "epoch": 0.057638990223259884, + "grad_norm": 1.2455145333553008, + "learning_rate": 4.999286191610749e-05, + "loss": 0.7135, + "step": 1975 + }, + { + "epoch": 0.05778491171749599, + "grad_norm": 1.8252722413199478, + "learning_rate": 4.99925858218339e-05, + "loss": 0.7772, + "step": 1980 + }, + { + "epoch": 0.05793083321173209, + "grad_norm": 1.5337533716975953, + "learning_rate": 4.999230448974161e-05, + "loss": 0.7922, + "step": 1985 + }, + { + "epoch": 0.05807675470596819, + "grad_norm": 1.304275618366775, + "learning_rate": 4.999201791989614e-05, + "loss": 0.698, + "step": 1990 + }, + { + "epoch": 0.05822267620020429, + "grad_norm": 1.480397438136825, + "learning_rate": 4.999172611236422e-05, + "loss": 0.7348, + "step": 1995 + }, + { + "epoch": 0.05836859769444039, + "grad_norm": 1.252769843255006, + "learning_rate": 4.99914290672138e-05, + "loss": 0.7229, + "step": 2000 + }, + { + "epoch": 0.05851451918867649, + "grad_norm": 1.1784037490192194, + "learning_rate": 4.9991126784514056e-05, + "loss": 0.715, + "step": 2005 + }, + { + "epoch": 0.058660440682912596, + "grad_norm": 1.2044815996761107, + "learning_rate": 4.999081926433538e-05, + "loss": 0.7012, + "step": 2010 + }, + { + "epoch": 0.05880636217714869, + "grad_norm": 1.291539137109826, + "learning_rate": 4.999050650674937e-05, + "loss": 0.7418, + "step": 2015 + }, + { + "epoch": 0.058952283671384796, + "grad_norm": 1.320349320649597, + "learning_rate": 4.9990188511828876e-05, + "loss": 0.7824, + "step": 2020 + }, + { + "epoch": 0.05909820516562089, + "grad_norm": 1.2782542334459186, + "learning_rate": 4.998986527964794e-05, + "loss": 0.7197, + "step": 2025 + }, + { + "epoch": 0.059244126659857, + "grad_norm": 1.3127633566021457, + "learning_rate": 4.998953681028184e-05, + "loss": 0.8242, + "step": 2030 + }, + { + "epoch": 0.0593900481540931, + "grad_norm": 1.2456638738937682, + "learning_rate": 4.998920310380706e-05, + "loss": 0.7047, + "step": 2035 + }, + { + "epoch": 0.0595359696483292, + "grad_norm": 1.5294055379073646, + "learning_rate": 4.998886416030131e-05, + "loss": 0.7231, + "step": 2040 + }, + { + "epoch": 0.0596818911425653, + "grad_norm": 1.404503139124834, + "learning_rate": 4.9988519979843515e-05, + "loss": 0.741, + "step": 2045 + }, + { + "epoch": 0.0598278126368014, + "grad_norm": 1.4071879656978263, + "learning_rate": 4.998817056251383e-05, + "loss": 0.7122, + "step": 2050 + }, + { + "epoch": 0.0599737341310375, + "grad_norm": 1.5919737599471258, + "learning_rate": 4.9987815908393624e-05, + "loss": 0.6738, + "step": 2055 + }, + { + "epoch": 0.060119655625273605, + "grad_norm": 1.3510983803352308, + "learning_rate": 4.9987456017565486e-05, + "loss": 0.8096, + "step": 2060 + }, + { + "epoch": 0.0602655771195097, + "grad_norm": 1.38060452056238, + "learning_rate": 4.998709089011321e-05, + "loss": 0.8115, + "step": 2065 + }, + { + "epoch": 0.060411498613745805, + "grad_norm": 1.5194135746506383, + "learning_rate": 4.998672052612185e-05, + "loss": 0.7685, + "step": 2070 + }, + { + "epoch": 0.06055742010798191, + "grad_norm": 1.1273003115312668, + "learning_rate": 4.998634492567762e-05, + "loss": 0.7174, + "step": 2075 + }, + { + "epoch": 0.060703341602218006, + "grad_norm": 1.2092491367776603, + "learning_rate": 4.998596408886801e-05, + "loss": 0.7448, + "step": 2080 + }, + { + "epoch": 0.06084926309645411, + "grad_norm": 1.290181374820797, + "learning_rate": 4.9985578015781685e-05, + "loss": 0.7346, + "step": 2085 + }, + { + "epoch": 0.060995184590690206, + "grad_norm": 1.1652853683145246, + "learning_rate": 4.998518670650857e-05, + "loss": 0.6608, + "step": 2090 + }, + { + "epoch": 0.06114110608492631, + "grad_norm": 1.2051801845636834, + "learning_rate": 4.998479016113978e-05, + "loss": 0.6581, + "step": 2095 + }, + { + "epoch": 0.06128702757916241, + "grad_norm": 1.3292843794218534, + "learning_rate": 4.9984388379767645e-05, + "loss": 0.7597, + "step": 2100 + }, + { + "epoch": 0.06143294907339851, + "grad_norm": 1.1499416491903214, + "learning_rate": 4.9983981362485747e-05, + "loss": 0.7384, + "step": 2105 + }, + { + "epoch": 0.061578870567634614, + "grad_norm": 1.493032918350083, + "learning_rate": 4.998356910938886e-05, + "loss": 0.7258, + "step": 2110 + }, + { + "epoch": 0.06172479206187071, + "grad_norm": 1.1758083858592523, + "learning_rate": 4.998315162057298e-05, + "loss": 0.7267, + "step": 2115 + }, + { + "epoch": 0.061870713556106814, + "grad_norm": 1.1991582459982524, + "learning_rate": 4.998272889613534e-05, + "loss": 0.685, + "step": 2120 + }, + { + "epoch": 0.06201663505034292, + "grad_norm": 1.1807827849653567, + "learning_rate": 4.998230093617437e-05, + "loss": 0.6939, + "step": 2125 + }, + { + "epoch": 0.062162556544579015, + "grad_norm": 1.3403326608048847, + "learning_rate": 4.998186774078972e-05, + "loss": 0.7885, + "step": 2130 + }, + { + "epoch": 0.06230847803881512, + "grad_norm": 1.3819878556600145, + "learning_rate": 4.9981429310082284e-05, + "loss": 0.8333, + "step": 2135 + }, + { + "epoch": 0.062454399533051215, + "grad_norm": 1.1141701273740472, + "learning_rate": 4.9980985644154144e-05, + "loss": 0.7041, + "step": 2140 + }, + { + "epoch": 0.06260032102728733, + "grad_norm": 1.286037742419842, + "learning_rate": 4.998053674310863e-05, + "loss": 0.7495, + "step": 2145 + }, + { + "epoch": 0.06274624252152342, + "grad_norm": 1.3604656301414162, + "learning_rate": 4.998008260705027e-05, + "loss": 0.7477, + "step": 2150 + }, + { + "epoch": 0.06289216401575952, + "grad_norm": 1.2711167229338713, + "learning_rate": 4.997962323608482e-05, + "loss": 0.7133, + "step": 2155 + }, + { + "epoch": 0.06303808550999562, + "grad_norm": 1.2439249243670005, + "learning_rate": 4.997915863031926e-05, + "loss": 0.7814, + "step": 2160 + }, + { + "epoch": 0.06318400700423173, + "grad_norm": 1.0594446077432493, + "learning_rate": 4.997868878986176e-05, + "loss": 0.6933, + "step": 2165 + }, + { + "epoch": 0.06332992849846783, + "grad_norm": 1.4538820689363272, + "learning_rate": 4.997821371482175e-05, + "loss": 0.7247, + "step": 2170 + }, + { + "epoch": 0.06347584999270392, + "grad_norm": 1.5099163474537787, + "learning_rate": 4.997773340530985e-05, + "loss": 0.7228, + "step": 2175 + }, + { + "epoch": 0.06362177148694002, + "grad_norm": 1.5250824525307163, + "learning_rate": 4.9977247861437917e-05, + "loss": 0.7148, + "step": 2180 + }, + { + "epoch": 0.06376769298117613, + "grad_norm": 1.4301435712897916, + "learning_rate": 4.997675708331902e-05, + "loss": 0.7548, + "step": 2185 + }, + { + "epoch": 0.06391361447541223, + "grad_norm": 1.1769706738775563, + "learning_rate": 4.997626107106744e-05, + "loss": 0.7117, + "step": 2190 + }, + { + "epoch": 0.06405953596964833, + "grad_norm": 1.419877004192711, + "learning_rate": 4.9975759824798676e-05, + "loss": 0.7297, + "step": 2195 + }, + { + "epoch": 0.06420545746388442, + "grad_norm": 1.4033595729322852, + "learning_rate": 4.997525334462948e-05, + "loss": 0.6969, + "step": 2200 + }, + { + "epoch": 0.06435137895812053, + "grad_norm": 1.1946787927906284, + "learning_rate": 4.997474163067776e-05, + "loss": 0.7459, + "step": 2205 + }, + { + "epoch": 0.06449730045235663, + "grad_norm": 1.241877789114203, + "learning_rate": 4.99742246830627e-05, + "loss": 0.768, + "step": 2210 + }, + { + "epoch": 0.06464322194659274, + "grad_norm": 1.2228153686102359, + "learning_rate": 4.997370250190468e-05, + "loss": 0.7219, + "step": 2215 + }, + { + "epoch": 0.06478914344082884, + "grad_norm": 1.3195641442753128, + "learning_rate": 4.9973175087325285e-05, + "loss": 0.7507, + "step": 2220 + }, + { + "epoch": 0.06493506493506493, + "grad_norm": 1.4244210629807366, + "learning_rate": 4.997264243944735e-05, + "loss": 0.721, + "step": 2225 + }, + { + "epoch": 0.06508098642930103, + "grad_norm": 1.1960041483445072, + "learning_rate": 4.997210455839491e-05, + "loss": 0.7399, + "step": 2230 + }, + { + "epoch": 0.06522690792353714, + "grad_norm": 1.1991878426224685, + "learning_rate": 4.997156144429321e-05, + "loss": 0.7562, + "step": 2235 + }, + { + "epoch": 0.06537282941777324, + "grad_norm": 1.3054437830587324, + "learning_rate": 4.997101309726873e-05, + "loss": 0.6905, + "step": 2240 + }, + { + "epoch": 0.06551875091200934, + "grad_norm": 1.1787769240409458, + "learning_rate": 4.997045951744917e-05, + "loss": 0.7622, + "step": 2245 + }, + { + "epoch": 0.06566467240624543, + "grad_norm": 1.3229927030556956, + "learning_rate": 4.9969900704963436e-05, + "loss": 0.7085, + "step": 2250 + }, + { + "epoch": 0.06581059390048154, + "grad_norm": 1.1124966798807985, + "learning_rate": 4.9969336659941646e-05, + "loss": 0.6807, + "step": 2255 + }, + { + "epoch": 0.06595651539471764, + "grad_norm": 1.2596715345120049, + "learning_rate": 4.9968767382515164e-05, + "loss": 0.7053, + "step": 2260 + }, + { + "epoch": 0.06610243688895374, + "grad_norm": 1.5016762619635984, + "learning_rate": 4.9968192872816554e-05, + "loss": 0.7123, + "step": 2265 + }, + { + "epoch": 0.06624835838318985, + "grad_norm": 1.4737925975333308, + "learning_rate": 4.996761313097959e-05, + "loss": 0.7365, + "step": 2270 + }, + { + "epoch": 0.06639427987742594, + "grad_norm": 1.186690900860536, + "learning_rate": 4.9967028157139304e-05, + "loss": 0.678, + "step": 2275 + }, + { + "epoch": 0.06654020137166204, + "grad_norm": 1.1589042008228565, + "learning_rate": 4.996643795143189e-05, + "loss": 0.6864, + "step": 2280 + }, + { + "epoch": 0.06668612286589815, + "grad_norm": 1.1064561143791263, + "learning_rate": 4.9965842513994795e-05, + "loss": 0.6957, + "step": 2285 + }, + { + "epoch": 0.06683204436013425, + "grad_norm": 1.2865770484963301, + "learning_rate": 4.9965241844966674e-05, + "loss": 0.706, + "step": 2290 + }, + { + "epoch": 0.06697796585437035, + "grad_norm": 1.1870033260695017, + "learning_rate": 4.9964635944487424e-05, + "loss": 0.7526, + "step": 2295 + }, + { + "epoch": 0.06712388734860646, + "grad_norm": 1.2389259592263213, + "learning_rate": 4.996402481269812e-05, + "loss": 0.6709, + "step": 2300 + }, + { + "epoch": 0.06726980884284255, + "grad_norm": 1.2067755710375718, + "learning_rate": 4.996340844974108e-05, + "loss": 0.7896, + "step": 2305 + }, + { + "epoch": 0.06741573033707865, + "grad_norm": 1.1379793099244961, + "learning_rate": 4.9962786855759843e-05, + "loss": 0.7196, + "step": 2310 + }, + { + "epoch": 0.06756165183131475, + "grad_norm": 1.4337643942611982, + "learning_rate": 4.996216003089914e-05, + "loss": 0.7121, + "step": 2315 + }, + { + "epoch": 0.06770757332555086, + "grad_norm": 1.2297091477283193, + "learning_rate": 4.996152797530497e-05, + "loss": 0.6277, + "step": 2320 + }, + { + "epoch": 0.06785349481978696, + "grad_norm": 1.336303746163312, + "learning_rate": 4.996089068912449e-05, + "loss": 0.6736, + "step": 2325 + }, + { + "epoch": 0.06799941631402305, + "grad_norm": 1.2172510478016814, + "learning_rate": 4.9960248172506115e-05, + "loss": 0.6564, + "step": 2330 + }, + { + "epoch": 0.06814533780825915, + "grad_norm": 1.2510885722792702, + "learning_rate": 4.995960042559947e-05, + "loss": 0.7631, + "step": 2335 + }, + { + "epoch": 0.06829125930249526, + "grad_norm": 1.086301288205196, + "learning_rate": 4.99589474485554e-05, + "loss": 0.7352, + "step": 2340 + }, + { + "epoch": 0.06843718079673136, + "grad_norm": 1.2941420068079943, + "learning_rate": 4.9958289241525945e-05, + "loss": 0.6949, + "step": 2345 + }, + { + "epoch": 0.06858310229096747, + "grad_norm": 1.1265269718678437, + "learning_rate": 4.9957625804664386e-05, + "loss": 0.6864, + "step": 2350 + }, + { + "epoch": 0.06872902378520356, + "grad_norm": 1.091114609828629, + "learning_rate": 4.9956957138125234e-05, + "loss": 0.6432, + "step": 2355 + }, + { + "epoch": 0.06887494527943966, + "grad_norm": 1.521826692370768, + "learning_rate": 4.995628324206419e-05, + "loss": 0.7355, + "step": 2360 + }, + { + "epoch": 0.06902086677367576, + "grad_norm": 1.169906654154956, + "learning_rate": 4.995560411663817e-05, + "loss": 0.6685, + "step": 2365 + }, + { + "epoch": 0.06916678826791187, + "grad_norm": 1.2552300646284833, + "learning_rate": 4.995491976200534e-05, + "loss": 0.6765, + "step": 2370 + }, + { + "epoch": 0.06931270976214797, + "grad_norm": 1.1080565485099827, + "learning_rate": 4.995423017832505e-05, + "loss": 0.7245, + "step": 2375 + }, + { + "epoch": 0.06945863125638406, + "grad_norm": 1.3039222055343613, + "learning_rate": 4.995353536575789e-05, + "loss": 0.7249, + "step": 2380 + }, + { + "epoch": 0.06960455275062016, + "grad_norm": 1.2252843417382429, + "learning_rate": 4.9952835324465663e-05, + "loss": 0.7206, + "step": 2385 + }, + { + "epoch": 0.06975047424485627, + "grad_norm": 1.3286629535986196, + "learning_rate": 4.995213005461139e-05, + "loss": 0.7701, + "step": 2390 + }, + { + "epoch": 0.06989639573909237, + "grad_norm": 1.376678831939567, + "learning_rate": 4.995141955635928e-05, + "loss": 0.6757, + "step": 2395 + }, + { + "epoch": 0.07004231723332847, + "grad_norm": 1.2135980799250596, + "learning_rate": 4.995070382987483e-05, + "loss": 0.7356, + "step": 2400 + }, + { + "epoch": 0.07018823872756456, + "grad_norm": 1.1511071959834587, + "learning_rate": 4.9949982875324666e-05, + "loss": 0.692, + "step": 2405 + }, + { + "epoch": 0.07033416022180067, + "grad_norm": 1.1891927187571163, + "learning_rate": 4.9949256692876704e-05, + "loss": 0.6449, + "step": 2410 + }, + { + "epoch": 0.07048008171603677, + "grad_norm": 1.3449098804464046, + "learning_rate": 4.9948525282700044e-05, + "loss": 0.6946, + "step": 2415 + }, + { + "epoch": 0.07062600321027288, + "grad_norm": 1.467551334116575, + "learning_rate": 4.9947788644965e-05, + "loss": 0.7251, + "step": 2420 + }, + { + "epoch": 0.07077192470450898, + "grad_norm": 1.2865829004410783, + "learning_rate": 4.9947046779843116e-05, + "loss": 0.7013, + "step": 2425 + }, + { + "epoch": 0.07091784619874507, + "grad_norm": 1.495112540509866, + "learning_rate": 4.994629968750715e-05, + "loss": 0.7519, + "step": 2430 + }, + { + "epoch": 0.07106376769298117, + "grad_norm": 1.28058130982145, + "learning_rate": 4.994554736813108e-05, + "loss": 0.7423, + "step": 2435 + }, + { + "epoch": 0.07120968918721728, + "grad_norm": 1.041245082212463, + "learning_rate": 4.9944789821890095e-05, + "loss": 0.7318, + "step": 2440 + }, + { + "epoch": 0.07135561068145338, + "grad_norm": 1.6895277069336783, + "learning_rate": 4.9944027048960594e-05, + "loss": 0.7415, + "step": 2445 + }, + { + "epoch": 0.07150153217568948, + "grad_norm": 1.4092738824681306, + "learning_rate": 4.994325904952022e-05, + "loss": 0.7828, + "step": 2450 + }, + { + "epoch": 0.07164745366992557, + "grad_norm": 1.0498275429598172, + "learning_rate": 4.9942485823747805e-05, + "loss": 0.6679, + "step": 2455 + }, + { + "epoch": 0.07179337516416168, + "grad_norm": 1.2457136035786602, + "learning_rate": 4.994170737182341e-05, + "loss": 0.685, + "step": 2460 + }, + { + "epoch": 0.07193929665839778, + "grad_norm": 1.084800211714994, + "learning_rate": 4.994092369392832e-05, + "loss": 0.6796, + "step": 2465 + }, + { + "epoch": 0.07208521815263388, + "grad_norm": 1.3031546895787143, + "learning_rate": 4.994013479024501e-05, + "loss": 0.717, + "step": 2470 + }, + { + "epoch": 0.07223113964686999, + "grad_norm": 1.1445623682796333, + "learning_rate": 4.993934066095722e-05, + "loss": 0.7178, + "step": 2475 + }, + { + "epoch": 0.07237706114110609, + "grad_norm": 1.7556927538055653, + "learning_rate": 4.9938541306249846e-05, + "loss": 0.7739, + "step": 2480 + }, + { + "epoch": 0.07252298263534218, + "grad_norm": 1.153150601321591, + "learning_rate": 4.993773672630906e-05, + "loss": 0.7342, + "step": 2485 + }, + { + "epoch": 0.07266890412957829, + "grad_norm": 1.157971600723398, + "learning_rate": 4.9936926921322206e-05, + "loss": 0.7251, + "step": 2490 + }, + { + "epoch": 0.07281482562381439, + "grad_norm": 1.2305823435532268, + "learning_rate": 4.993611189147788e-05, + "loss": 0.6486, + "step": 2495 + }, + { + "epoch": 0.07296074711805049, + "grad_norm": 1.157643925802279, + "learning_rate": 4.9935291636965854e-05, + "loss": 0.7137, + "step": 2500 + }, + { + "epoch": 0.0731066686122866, + "grad_norm": 1.0908201149923213, + "learning_rate": 4.993446615797715e-05, + "loss": 0.6464, + "step": 2505 + }, + { + "epoch": 0.07325259010652269, + "grad_norm": 1.319550772585411, + "learning_rate": 4.9933635454704e-05, + "loss": 0.7119, + "step": 2510 + }, + { + "epoch": 0.07339851160075879, + "grad_norm": 1.3133574588174883, + "learning_rate": 4.993279952733984e-05, + "loss": 0.6559, + "step": 2515 + }, + { + "epoch": 0.0735444330949949, + "grad_norm": 1.3731793545515336, + "learning_rate": 4.993195837607935e-05, + "loss": 0.757, + "step": 2520 + }, + { + "epoch": 0.073690354589231, + "grad_norm": 1.3463596177268926, + "learning_rate": 4.993111200111838e-05, + "loss": 0.7369, + "step": 2525 + }, + { + "epoch": 0.0738362760834671, + "grad_norm": 1.1720972772559952, + "learning_rate": 4.993026040265404e-05, + "loss": 0.6913, + "step": 2530 + }, + { + "epoch": 0.07398219757770319, + "grad_norm": 1.2311212852262081, + "learning_rate": 4.992940358088465e-05, + "loss": 0.7237, + "step": 2535 + }, + { + "epoch": 0.0741281190719393, + "grad_norm": 1.2313173688199424, + "learning_rate": 4.992854153600972e-05, + "loss": 0.7099, + "step": 2540 + }, + { + "epoch": 0.0742740405661754, + "grad_norm": 1.116975561940082, + "learning_rate": 4.992767426823e-05, + "loss": 0.7015, + "step": 2545 + }, + { + "epoch": 0.0744199620604115, + "grad_norm": 1.3447765667807197, + "learning_rate": 4.992680177774745e-05, + "loss": 0.6965, + "step": 2550 + }, + { + "epoch": 0.0745658835546476, + "grad_norm": 1.2803959342160747, + "learning_rate": 4.9925924064765245e-05, + "loss": 0.7027, + "step": 2555 + }, + { + "epoch": 0.0747118050488837, + "grad_norm": 1.244134295034872, + "learning_rate": 4.9925041129487776e-05, + "loss": 0.7836, + "step": 2560 + }, + { + "epoch": 0.0748577265431198, + "grad_norm": 1.2429016066981884, + "learning_rate": 4.992415297212065e-05, + "loss": 0.7296, + "step": 2565 + }, + { + "epoch": 0.0750036480373559, + "grad_norm": 1.2265724380564247, + "learning_rate": 4.9923259592870694e-05, + "loss": 0.7165, + "step": 2570 + }, + { + "epoch": 0.075149569531592, + "grad_norm": 1.117185974511804, + "learning_rate": 4.992236099194595e-05, + "loss": 0.6919, + "step": 2575 + }, + { + "epoch": 0.07529549102582811, + "grad_norm": 1.3265955971131624, + "learning_rate": 4.992145716955567e-05, + "loss": 0.6736, + "step": 2580 + }, + { + "epoch": 0.0754414125200642, + "grad_norm": 1.2238152641126043, + "learning_rate": 4.992054812591032e-05, + "loss": 0.6726, + "step": 2585 + }, + { + "epoch": 0.0755873340143003, + "grad_norm": 1.1525077899777592, + "learning_rate": 4.99196338612216e-05, + "loss": 0.7443, + "step": 2590 + }, + { + "epoch": 0.0757332555085364, + "grad_norm": 1.302404501600372, + "learning_rate": 4.991871437570241e-05, + "loss": 0.7528, + "step": 2595 + }, + { + "epoch": 0.07587917700277251, + "grad_norm": 1.102349301530605, + "learning_rate": 4.991778966956687e-05, + "loss": 0.6982, + "step": 2600 + }, + { + "epoch": 0.07602509849700861, + "grad_norm": 1.1613283935385472, + "learning_rate": 4.9916859743030296e-05, + "loss": 0.6683, + "step": 2605 + }, + { + "epoch": 0.0761710199912447, + "grad_norm": 1.2000643942941749, + "learning_rate": 4.991592459630927e-05, + "loss": 0.7065, + "step": 2610 + }, + { + "epoch": 0.07631694148548081, + "grad_norm": 1.2361901542750646, + "learning_rate": 4.991498422962154e-05, + "loss": 0.7013, + "step": 2615 + }, + { + "epoch": 0.07646286297971691, + "grad_norm": 1.027968841207083, + "learning_rate": 4.9914038643186094e-05, + "loss": 0.6505, + "step": 2620 + }, + { + "epoch": 0.07660878447395301, + "grad_norm": 1.214881352411773, + "learning_rate": 4.9913087837223126e-05, + "loss": 0.6923, + "step": 2625 + }, + { + "epoch": 0.07675470596818912, + "grad_norm": 1.0925923759394451, + "learning_rate": 4.991213181195405e-05, + "loss": 0.6846, + "step": 2630 + }, + { + "epoch": 0.07690062746242521, + "grad_norm": 1.502110141792278, + "learning_rate": 4.991117056760149e-05, + "loss": 0.7293, + "step": 2635 + }, + { + "epoch": 0.07704654895666131, + "grad_norm": 1.3626183124314462, + "learning_rate": 4.99102041043893e-05, + "loss": 0.7359, + "step": 2640 + }, + { + "epoch": 0.07719247045089742, + "grad_norm": 1.1885957240536167, + "learning_rate": 4.9909232422542536e-05, + "loss": 0.7286, + "step": 2645 + }, + { + "epoch": 0.07733839194513352, + "grad_norm": 1.3070204347903123, + "learning_rate": 4.9908255522287456e-05, + "loss": 0.6748, + "step": 2650 + }, + { + "epoch": 0.07748431343936962, + "grad_norm": 1.4889256476361912, + "learning_rate": 4.9907273403851576e-05, + "loss": 0.7004, + "step": 2655 + }, + { + "epoch": 0.07763023493360573, + "grad_norm": 1.1707314013743833, + "learning_rate": 4.990628606746358e-05, + "loss": 0.68, + "step": 2660 + }, + { + "epoch": 0.07777615642784182, + "grad_norm": 1.1793157559061662, + "learning_rate": 4.9905293513353404e-05, + "loss": 0.7043, + "step": 2665 + }, + { + "epoch": 0.07792207792207792, + "grad_norm": 1.367234663921608, + "learning_rate": 4.990429574175216e-05, + "loss": 0.7256, + "step": 2670 + }, + { + "epoch": 0.07806799941631402, + "grad_norm": 1.1716092733437509, + "learning_rate": 4.990329275289222e-05, + "loss": 0.6514, + "step": 2675 + }, + { + "epoch": 0.07821392091055013, + "grad_norm": 1.2881490710967423, + "learning_rate": 4.9902284547007136e-05, + "loss": 0.6928, + "step": 2680 + }, + { + "epoch": 0.07835984240478623, + "grad_norm": 1.4308049463344874, + "learning_rate": 4.99012711243317e-05, + "loss": 0.687, + "step": 2685 + }, + { + "epoch": 0.07850576389902232, + "grad_norm": 1.1257191513918912, + "learning_rate": 4.99002524851019e-05, + "loss": 0.7151, + "step": 2690 + }, + { + "epoch": 0.07865168539325842, + "grad_norm": 1.029860229789929, + "learning_rate": 4.9899228629554936e-05, + "loss": 0.7179, + "step": 2695 + }, + { + "epoch": 0.07879760688749453, + "grad_norm": 1.050000619595012, + "learning_rate": 4.989819955792924e-05, + "loss": 0.7189, + "step": 2700 + }, + { + "epoch": 0.07894352838173063, + "grad_norm": 1.2054061729668342, + "learning_rate": 4.9897165270464455e-05, + "loss": 0.6993, + "step": 2705 + }, + { + "epoch": 0.07908944987596674, + "grad_norm": 1.3294811677975067, + "learning_rate": 4.989612576740142e-05, + "loss": 0.6857, + "step": 2710 + }, + { + "epoch": 0.07923537137020283, + "grad_norm": 1.2837275469413965, + "learning_rate": 4.989508104898222e-05, + "loss": 0.8049, + "step": 2715 + }, + { + "epoch": 0.07938129286443893, + "grad_norm": 1.0284261979165217, + "learning_rate": 4.989403111545012e-05, + "loss": 0.6989, + "step": 2720 + }, + { + "epoch": 0.07952721435867503, + "grad_norm": 1.0804210127140008, + "learning_rate": 4.989297596704963e-05, + "loss": 0.7475, + "step": 2725 + }, + { + "epoch": 0.07967313585291114, + "grad_norm": 1.2579467326388063, + "learning_rate": 4.989191560402646e-05, + "loss": 0.7253, + "step": 2730 + }, + { + "epoch": 0.07981905734714724, + "grad_norm": 1.3763527016356465, + "learning_rate": 4.9890850026627535e-05, + "loss": 0.6831, + "step": 2735 + }, + { + "epoch": 0.07996497884138333, + "grad_norm": 1.48212909796467, + "learning_rate": 4.988977923510098e-05, + "loss": 0.6892, + "step": 2740 + }, + { + "epoch": 0.08011090033561943, + "grad_norm": 1.3190649102951475, + "learning_rate": 4.988870322969617e-05, + "loss": 0.6818, + "step": 2745 + }, + { + "epoch": 0.08025682182985554, + "grad_norm": 1.1274317740056343, + "learning_rate": 4.988762201066366e-05, + "loss": 0.6693, + "step": 2750 + }, + { + "epoch": 0.08040274332409164, + "grad_norm": 1.2275943658402562, + "learning_rate": 4.988653557825523e-05, + "loss": 0.7176, + "step": 2755 + }, + { + "epoch": 0.08054866481832774, + "grad_norm": 1.1836597863183846, + "learning_rate": 4.988544393272389e-05, + "loss": 0.7263, + "step": 2760 + }, + { + "epoch": 0.08069458631256383, + "grad_norm": 1.08063105033071, + "learning_rate": 4.988434707432384e-05, + "loss": 0.7262, + "step": 2765 + }, + { + "epoch": 0.08084050780679994, + "grad_norm": 1.1881601832723485, + "learning_rate": 4.98832450033105e-05, + "loss": 0.6736, + "step": 2770 + }, + { + "epoch": 0.08098642930103604, + "grad_norm": 1.3997745095761405, + "learning_rate": 4.988213771994052e-05, + "loss": 0.7748, + "step": 2775 + }, + { + "epoch": 0.08113235079527215, + "grad_norm": 1.1908259079867707, + "learning_rate": 4.988102522447174e-05, + "loss": 0.7158, + "step": 2780 + }, + { + "epoch": 0.08127827228950825, + "grad_norm": 1.3583654691551421, + "learning_rate": 4.987990751716324e-05, + "loss": 0.7132, + "step": 2785 + }, + { + "epoch": 0.08142419378374434, + "grad_norm": 1.4059881030327255, + "learning_rate": 4.987878459827528e-05, + "loss": 0.6947, + "step": 2790 + }, + { + "epoch": 0.08157011527798044, + "grad_norm": 1.2023733663689815, + "learning_rate": 4.987765646806936e-05, + "loss": 0.689, + "step": 2795 + }, + { + "epoch": 0.08171603677221655, + "grad_norm": 1.317230584309162, + "learning_rate": 4.987652312680819e-05, + "loss": 0.7492, + "step": 2800 + }, + { + "epoch": 0.08186195826645265, + "grad_norm": 1.3783632730546544, + "learning_rate": 4.9875384574755694e-05, + "loss": 0.7346, + "step": 2805 + }, + { + "epoch": 0.08200787976068875, + "grad_norm": 1.196338333312184, + "learning_rate": 4.9874240812177e-05, + "loss": 0.7646, + "step": 2810 + }, + { + "epoch": 0.08215380125492484, + "grad_norm": 1.246386257612844, + "learning_rate": 4.987309183933844e-05, + "loss": 0.7509, + "step": 2815 + }, + { + "epoch": 0.08229972274916095, + "grad_norm": 1.2055115580600684, + "learning_rate": 4.98719376565076e-05, + "loss": 0.7155, + "step": 2820 + }, + { + "epoch": 0.08244564424339705, + "grad_norm": 1.1966349790298623, + "learning_rate": 4.987077826395323e-05, + "loss": 0.7391, + "step": 2825 + }, + { + "epoch": 0.08259156573763315, + "grad_norm": 1.0963618941025084, + "learning_rate": 4.986961366194533e-05, + "loss": 0.6591, + "step": 2830 + }, + { + "epoch": 0.08273748723186926, + "grad_norm": 1.192009327828126, + "learning_rate": 4.9868443850755104e-05, + "loss": 0.7139, + "step": 2835 + }, + { + "epoch": 0.08288340872610536, + "grad_norm": 1.1364803193025055, + "learning_rate": 4.9867268830654944e-05, + "loss": 0.6826, + "step": 2840 + }, + { + "epoch": 0.08302933022034145, + "grad_norm": 1.1202359205091355, + "learning_rate": 4.98660886019185e-05, + "loss": 0.755, + "step": 2845 + }, + { + "epoch": 0.08317525171457756, + "grad_norm": 1.1929110366360403, + "learning_rate": 4.986490316482059e-05, + "loss": 0.7325, + "step": 2850 + }, + { + "epoch": 0.08332117320881366, + "grad_norm": 1.2271051062947187, + "learning_rate": 4.9863712519637265e-05, + "loss": 0.6565, + "step": 2855 + }, + { + "epoch": 0.08346709470304976, + "grad_norm": 1.5273667328861529, + "learning_rate": 4.986251666664581e-05, + "loss": 0.7884, + "step": 2860 + }, + { + "epoch": 0.08361301619728587, + "grad_norm": 1.267577656974418, + "learning_rate": 4.9861315606124686e-05, + "loss": 0.6872, + "step": 2865 + }, + { + "epoch": 0.08375893769152196, + "grad_norm": 1.4567896589375045, + "learning_rate": 4.9860109338353584e-05, + "loss": 0.7867, + "step": 2870 + }, + { + "epoch": 0.08390485918575806, + "grad_norm": 1.3045714144836331, + "learning_rate": 4.9858897863613404e-05, + "loss": 0.7437, + "step": 2875 + }, + { + "epoch": 0.08405078067999416, + "grad_norm": 0.9988958220535561, + "learning_rate": 4.985768118218627e-05, + "loss": 0.6783, + "step": 2880 + }, + { + "epoch": 0.08419670217423027, + "grad_norm": 1.1781778104181264, + "learning_rate": 4.9856459294355497e-05, + "loss": 0.6905, + "step": 2885 + }, + { + "epoch": 0.08434262366846637, + "grad_norm": 1.231471787388155, + "learning_rate": 4.985523220040563e-05, + "loss": 0.7351, + "step": 2890 + }, + { + "epoch": 0.08448854516270246, + "grad_norm": 1.268218950110271, + "learning_rate": 4.9853999900622416e-05, + "loss": 0.6934, + "step": 2895 + }, + { + "epoch": 0.08463446665693856, + "grad_norm": 1.113655372665911, + "learning_rate": 4.9852762395292835e-05, + "loss": 0.7237, + "step": 2900 + }, + { + "epoch": 0.08478038815117467, + "grad_norm": 1.2525174924193185, + "learning_rate": 4.985151968470505e-05, + "loss": 0.6752, + "step": 2905 + }, + { + "epoch": 0.08492630964541077, + "grad_norm": 1.335474313666936, + "learning_rate": 4.9850271769148444e-05, + "loss": 0.7254, + "step": 2910 + }, + { + "epoch": 0.08507223113964688, + "grad_norm": 1.3590788327898935, + "learning_rate": 4.984901864891363e-05, + "loss": 0.7297, + "step": 2915 + }, + { + "epoch": 0.08521815263388297, + "grad_norm": 1.3306705278241404, + "learning_rate": 4.984776032429241e-05, + "loss": 0.7401, + "step": 2920 + }, + { + "epoch": 0.08536407412811907, + "grad_norm": 1.2362927879506074, + "learning_rate": 4.984649679557782e-05, + "loss": 0.6944, + "step": 2925 + }, + { + "epoch": 0.08550999562235517, + "grad_norm": 1.1225371760943397, + "learning_rate": 4.984522806306408e-05, + "loss": 0.6683, + "step": 2930 + }, + { + "epoch": 0.08565591711659128, + "grad_norm": 1.0449421965989825, + "learning_rate": 4.984395412704666e-05, + "loss": 0.6312, + "step": 2935 + }, + { + "epoch": 0.08580183861082738, + "grad_norm": 1.0897455771394955, + "learning_rate": 4.98426749878222e-05, + "loss": 0.7312, + "step": 2940 + }, + { + "epoch": 0.08594776010506347, + "grad_norm": 0.9833949712313572, + "learning_rate": 4.984139064568858e-05, + "loss": 0.7011, + "step": 2945 + }, + { + "epoch": 0.08609368159929957, + "grad_norm": 1.0874351286478818, + "learning_rate": 4.984010110094489e-05, + "loss": 0.7516, + "step": 2950 + }, + { + "epoch": 0.08623960309353568, + "grad_norm": 1.2921691113693496, + "learning_rate": 4.9838806353891407e-05, + "loss": 0.7186, + "step": 2955 + }, + { + "epoch": 0.08638552458777178, + "grad_norm": 1.0415040392541335, + "learning_rate": 4.983750640482965e-05, + "loss": 0.6917, + "step": 2960 + }, + { + "epoch": 0.08653144608200788, + "grad_norm": 1.1673917672784808, + "learning_rate": 4.9836201254062334e-05, + "loss": 0.7273, + "step": 2965 + }, + { + "epoch": 0.08667736757624397, + "grad_norm": 1.262873236997477, + "learning_rate": 4.9834890901893386e-05, + "loss": 0.6596, + "step": 2970 + }, + { + "epoch": 0.08682328907048008, + "grad_norm": 1.1768615350196587, + "learning_rate": 4.983357534862795e-05, + "loss": 0.6493, + "step": 2975 + }, + { + "epoch": 0.08696921056471618, + "grad_norm": 1.059654404353001, + "learning_rate": 4.9832254594572366e-05, + "loss": 0.6768, + "step": 2980 + }, + { + "epoch": 0.08711513205895229, + "grad_norm": 1.1256162724457224, + "learning_rate": 4.983092864003421e-05, + "loss": 0.7422, + "step": 2985 + }, + { + "epoch": 0.08726105355318839, + "grad_norm": 1.148474679319773, + "learning_rate": 4.982959748532225e-05, + "loss": 0.7004, + "step": 2990 + }, + { + "epoch": 0.08740697504742448, + "grad_norm": 1.173954561594251, + "learning_rate": 4.9828261130746466e-05, + "loss": 0.7753, + "step": 2995 + }, + { + "epoch": 0.08755289654166058, + "grad_norm": 1.3498005917217282, + "learning_rate": 4.9826919576618066e-05, + "loss": 0.6834, + "step": 3000 + }, + { + "epoch": 0.08769881803589669, + "grad_norm": 1.1462259162480168, + "learning_rate": 4.982557282324943e-05, + "loss": 0.6973, + "step": 3005 + }, + { + "epoch": 0.08784473953013279, + "grad_norm": 1.1072100689840563, + "learning_rate": 4.9824220870954206e-05, + "loss": 0.6913, + "step": 3010 + }, + { + "epoch": 0.0879906610243689, + "grad_norm": 1.2807573783932273, + "learning_rate": 4.9822863720047203e-05, + "loss": 0.6536, + "step": 3015 + }, + { + "epoch": 0.088136582518605, + "grad_norm": 1.0365849324960532, + "learning_rate": 4.982150137084446e-05, + "loss": 0.6788, + "step": 3020 + }, + { + "epoch": 0.08828250401284109, + "grad_norm": 1.1952376905923747, + "learning_rate": 4.982013382366323e-05, + "loss": 0.6548, + "step": 3025 + }, + { + "epoch": 0.08842842550707719, + "grad_norm": 1.3987627048957174, + "learning_rate": 4.981876107882198e-05, + "loss": 0.7139, + "step": 3030 + }, + { + "epoch": 0.0885743470013133, + "grad_norm": 1.0769926083176462, + "learning_rate": 4.9817383136640355e-05, + "loss": 0.6507, + "step": 3035 + }, + { + "epoch": 0.0887202684955494, + "grad_norm": 1.1732014292395665, + "learning_rate": 4.9815999997439256e-05, + "loss": 0.7292, + "step": 3040 + }, + { + "epoch": 0.0888661899897855, + "grad_norm": 1.003817734610896, + "learning_rate": 4.9814611661540774e-05, + "loss": 0.6988, + "step": 3045 + }, + { + "epoch": 0.08901211148402159, + "grad_norm": 2.040218808094716, + "learning_rate": 4.981321812926819e-05, + "loss": 0.685, + "step": 3050 + }, + { + "epoch": 0.0891580329782577, + "grad_norm": 1.1880004978203618, + "learning_rate": 4.981181940094602e-05, + "loss": 0.6618, + "step": 3055 + }, + { + "epoch": 0.0893039544724938, + "grad_norm": 1.1564266807523542, + "learning_rate": 4.9810415476899994e-05, + "loss": 0.7569, + "step": 3060 + }, + { + "epoch": 0.0894498759667299, + "grad_norm": 1.2535946055928098, + "learning_rate": 4.980900635745704e-05, + "loss": 0.7508, + "step": 3065 + }, + { + "epoch": 0.089595797460966, + "grad_norm": 1.1351444943082982, + "learning_rate": 4.980759204294529e-05, + "loss": 0.6699, + "step": 3070 + }, + { + "epoch": 0.0897417189552021, + "grad_norm": 1.3462445407924764, + "learning_rate": 4.98061725336941e-05, + "loss": 0.715, + "step": 3075 + }, + { + "epoch": 0.0898876404494382, + "grad_norm": 1.1012961748924166, + "learning_rate": 4.980474783003403e-05, + "loss": 0.642, + "step": 3080 + }, + { + "epoch": 0.0900335619436743, + "grad_norm": 1.1211860552780468, + "learning_rate": 4.980331793229684e-05, + "loss": 0.6619, + "step": 3085 + }, + { + "epoch": 0.09017948343791041, + "grad_norm": 1.184993006230416, + "learning_rate": 4.980188284081552e-05, + "loss": 0.6635, + "step": 3090 + }, + { + "epoch": 0.09032540493214651, + "grad_norm": 1.1983927883515362, + "learning_rate": 4.980044255592425e-05, + "loss": 0.6764, + "step": 3095 + }, + { + "epoch": 0.0904713264263826, + "grad_norm": 1.3113670074414379, + "learning_rate": 4.979899707795844e-05, + "loss": 0.727, + "step": 3100 + }, + { + "epoch": 0.0906172479206187, + "grad_norm": 1.3952508835169883, + "learning_rate": 4.979754640725467e-05, + "loss": 0.711, + "step": 3105 + }, + { + "epoch": 0.09076316941485481, + "grad_norm": 1.031330087988221, + "learning_rate": 4.979609054415078e-05, + "loss": 0.6518, + "step": 3110 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.4750094137763674, + "learning_rate": 4.979462948898578e-05, + "loss": 0.7427, + "step": 3115 + }, + { + "epoch": 0.09105501240332702, + "grad_norm": 1.3348483940492921, + "learning_rate": 4.979316324209992e-05, + "loss": 0.6531, + "step": 3120 + }, + { + "epoch": 0.0912009338975631, + "grad_norm": 1.0286431946256631, + "learning_rate": 4.979169180383463e-05, + "loss": 0.6378, + "step": 3125 + }, + { + "epoch": 0.09134685539179921, + "grad_norm": 1.0571151209612941, + "learning_rate": 4.9790215174532566e-05, + "loss": 0.7234, + "step": 3130 + }, + { + "epoch": 0.09149277688603531, + "grad_norm": 1.1991209013086663, + "learning_rate": 4.978873335453758e-05, + "loss": 0.6777, + "step": 3135 + }, + { + "epoch": 0.09163869838027142, + "grad_norm": 1.175362245670362, + "learning_rate": 4.978724634419477e-05, + "loss": 0.6914, + "step": 3140 + }, + { + "epoch": 0.09178461987450752, + "grad_norm": 1.196997930322026, + "learning_rate": 4.978575414385038e-05, + "loss": 0.6474, + "step": 3145 + }, + { + "epoch": 0.09193054136874361, + "grad_norm": 1.3384223340418013, + "learning_rate": 4.978425675385191e-05, + "loss": 0.7768, + "step": 3150 + }, + { + "epoch": 0.09207646286297971, + "grad_norm": 1.0384767224639597, + "learning_rate": 4.978275417454806e-05, + "loss": 0.6996, + "step": 3155 + }, + { + "epoch": 0.09222238435721582, + "grad_norm": 1.2381868343202516, + "learning_rate": 4.978124640628873e-05, + "loss": 0.7248, + "step": 3160 + }, + { + "epoch": 0.09236830585145192, + "grad_norm": 1.2102390617323306, + "learning_rate": 4.977973344942503e-05, + "loss": 0.7065, + "step": 3165 + }, + { + "epoch": 0.09251422734568802, + "grad_norm": 1.1861424369383182, + "learning_rate": 4.977821530430928e-05, + "loss": 0.7112, + "step": 3170 + }, + { + "epoch": 0.09266014883992411, + "grad_norm": 1.1126931965592837, + "learning_rate": 4.977669197129502e-05, + "loss": 0.7256, + "step": 3175 + }, + { + "epoch": 0.09280607033416022, + "grad_norm": 1.2141998501071505, + "learning_rate": 4.9775163450736964e-05, + "loss": 0.7438, + "step": 3180 + }, + { + "epoch": 0.09295199182839632, + "grad_norm": 1.0884904273507405, + "learning_rate": 4.977362974299108e-05, + "loss": 0.6883, + "step": 3185 + }, + { + "epoch": 0.09309791332263243, + "grad_norm": 1.16234164762006, + "learning_rate": 4.97720908484145e-05, + "loss": 0.6792, + "step": 3190 + }, + { + "epoch": 0.09324383481686853, + "grad_norm": 1.1112809561411157, + "learning_rate": 4.97705467673656e-05, + "loss": 0.7112, + "step": 3195 + }, + { + "epoch": 0.09338975631110463, + "grad_norm": 1.1256078172996438, + "learning_rate": 4.976899750020394e-05, + "loss": 0.7838, + "step": 3200 + }, + { + "epoch": 0.09353567780534072, + "grad_norm": 0.9768412065363534, + "learning_rate": 4.97674430472903e-05, + "loss": 0.6704, + "step": 3205 + }, + { + "epoch": 0.09368159929957683, + "grad_norm": 1.08551886453828, + "learning_rate": 4.976588340898666e-05, + "loss": 0.7083, + "step": 3210 + }, + { + "epoch": 0.09382752079381293, + "grad_norm": 1.1884734839902384, + "learning_rate": 4.976431858565621e-05, + "loss": 0.6928, + "step": 3215 + }, + { + "epoch": 0.09397344228804903, + "grad_norm": 1.105909237294257, + "learning_rate": 4.976274857766336e-05, + "loss": 0.6955, + "step": 3220 + }, + { + "epoch": 0.09411936378228514, + "grad_norm": 1.0942350340815739, + "learning_rate": 4.97611733853737e-05, + "loss": 0.69, + "step": 3225 + }, + { + "epoch": 0.09426528527652123, + "grad_norm": 1.0963794673649976, + "learning_rate": 4.9759593009154046e-05, + "loss": 0.7406, + "step": 3230 + }, + { + "epoch": 0.09441120677075733, + "grad_norm": 1.150762137302678, + "learning_rate": 4.975800744937243e-05, + "loss": 0.6666, + "step": 3235 + }, + { + "epoch": 0.09455712826499343, + "grad_norm": 1.2624789491552089, + "learning_rate": 4.975641670639806e-05, + "loss": 0.6959, + "step": 3240 + }, + { + "epoch": 0.09470304975922954, + "grad_norm": 1.1590024853640135, + "learning_rate": 4.9754820780601395e-05, + "loss": 0.7952, + "step": 3245 + }, + { + "epoch": 0.09484897125346564, + "grad_norm": 1.427640198590033, + "learning_rate": 4.9753219672354054e-05, + "loss": 0.7542, + "step": 3250 + }, + { + "epoch": 0.09499489274770173, + "grad_norm": 1.1741027034407927, + "learning_rate": 4.97516133820289e-05, + "loss": 0.7209, + "step": 3255 + }, + { + "epoch": 0.09514081424193783, + "grad_norm": 1.0880492775724266, + "learning_rate": 4.9750001909999966e-05, + "loss": 0.6759, + "step": 3260 + }, + { + "epoch": 0.09528673573617394, + "grad_norm": 1.7605510387792094, + "learning_rate": 4.974838525664254e-05, + "loss": 0.7226, + "step": 3265 + }, + { + "epoch": 0.09543265723041004, + "grad_norm": 1.05993250960408, + "learning_rate": 4.974676342233307e-05, + "loss": 0.6388, + "step": 3270 + }, + { + "epoch": 0.09557857872464615, + "grad_norm": 1.1455793222520776, + "learning_rate": 4.9745136407449246e-05, + "loss": 0.7146, + "step": 3275 + }, + { + "epoch": 0.09572450021888224, + "grad_norm": 1.0616651244419713, + "learning_rate": 4.974350421236994e-05, + "loss": 0.6945, + "step": 3280 + }, + { + "epoch": 0.09587042171311834, + "grad_norm": 1.121995227351763, + "learning_rate": 4.974186683747523e-05, + "loss": 0.7067, + "step": 3285 + }, + { + "epoch": 0.09601634320735444, + "grad_norm": 1.2071546013549588, + "learning_rate": 4.974022428314643e-05, + "loss": 0.7406, + "step": 3290 + }, + { + "epoch": 0.09616226470159055, + "grad_norm": 1.1636972409998387, + "learning_rate": 4.9738576549766036e-05, + "loss": 0.6935, + "step": 3295 + }, + { + "epoch": 0.09630818619582665, + "grad_norm": 1.2427381463150373, + "learning_rate": 4.9736923637717734e-05, + "loss": 0.7455, + "step": 3300 + }, + { + "epoch": 0.09645410769006274, + "grad_norm": 1.016242405024273, + "learning_rate": 4.973526554738646e-05, + "loss": 0.6622, + "step": 3305 + }, + { + "epoch": 0.09660002918429884, + "grad_norm": 1.2431076572728077, + "learning_rate": 4.973360227915831e-05, + "loss": 0.6717, + "step": 3310 + }, + { + "epoch": 0.09674595067853495, + "grad_norm": 1.6131792652997232, + "learning_rate": 4.973193383342061e-05, + "loss": 0.7117, + "step": 3315 + }, + { + "epoch": 0.09689187217277105, + "grad_norm": 1.186107378187242, + "learning_rate": 4.9730260210561906e-05, + "loss": 0.7224, + "step": 3320 + }, + { + "epoch": 0.09703779366700716, + "grad_norm": 1.2147870521498203, + "learning_rate": 4.9728581410971914e-05, + "loss": 0.6879, + "step": 3325 + }, + { + "epoch": 0.09718371516124324, + "grad_norm": 1.2312511243072912, + "learning_rate": 4.972689743504159e-05, + "loss": 0.7256, + "step": 3330 + }, + { + "epoch": 0.09732963665547935, + "grad_norm": 1.2516252749369035, + "learning_rate": 4.9725208283163065e-05, + "loss": 0.6975, + "step": 3335 + }, + { + "epoch": 0.09747555814971545, + "grad_norm": 1.1007904395171177, + "learning_rate": 4.97235139557297e-05, + "loss": 0.6565, + "step": 3340 + }, + { + "epoch": 0.09762147964395156, + "grad_norm": 1.03754920253913, + "learning_rate": 4.9721814453136044e-05, + "loss": 0.6863, + "step": 3345 + }, + { + "epoch": 0.09776740113818766, + "grad_norm": 1.0957651348877475, + "learning_rate": 4.9720109775777856e-05, + "loss": 0.6866, + "step": 3350 + }, + { + "epoch": 0.09791332263242375, + "grad_norm": 1.1472195240860321, + "learning_rate": 4.9718399924052114e-05, + "loss": 0.708, + "step": 3355 + }, + { + "epoch": 0.09805924412665985, + "grad_norm": 1.1121867630779652, + "learning_rate": 4.971668489835697e-05, + "loss": 0.6556, + "step": 3360 + }, + { + "epoch": 0.09820516562089596, + "grad_norm": 1.1609523591723236, + "learning_rate": 4.9714964699091815e-05, + "loss": 0.6499, + "step": 3365 + }, + { + "epoch": 0.09835108711513206, + "grad_norm": 1.1403769749316155, + "learning_rate": 4.971323932665724e-05, + "loss": 0.6795, + "step": 3370 + }, + { + "epoch": 0.09849700860936816, + "grad_norm": 1.218840248722583, + "learning_rate": 4.9711508781454994e-05, + "loss": 0.6973, + "step": 3375 + }, + { + "epoch": 0.09864293010360427, + "grad_norm": 0.9992988573750679, + "learning_rate": 4.970977306388809e-05, + "loss": 0.7881, + "step": 3380 + }, + { + "epoch": 0.09878885159784036, + "grad_norm": 1.0798197309985502, + "learning_rate": 4.9708032174360736e-05, + "loss": 0.6932, + "step": 3385 + }, + { + "epoch": 0.09893477309207646, + "grad_norm": 1.0567826946360994, + "learning_rate": 4.97062861132783e-05, + "loss": 0.6397, + "step": 3390 + }, + { + "epoch": 0.09908069458631256, + "grad_norm": 1.1183540851594458, + "learning_rate": 4.97045348810474e-05, + "loss": 0.7244, + "step": 3395 + }, + { + "epoch": 0.09922661608054867, + "grad_norm": 1.08644138005523, + "learning_rate": 4.970277847807585e-05, + "loss": 0.6688, + "step": 3400 + }, + { + "epoch": 0.09937253757478477, + "grad_norm": 1.0183267086715422, + "learning_rate": 4.970101690477265e-05, + "loss": 0.7521, + "step": 3405 + }, + { + "epoch": 0.09951845906902086, + "grad_norm": 0.9532633375357952, + "learning_rate": 4.9699250161548025e-05, + "loss": 0.6653, + "step": 3410 + }, + { + "epoch": 0.09966438056325697, + "grad_norm": 1.1128782232322667, + "learning_rate": 4.969747824881339e-05, + "loss": 0.7332, + "step": 3415 + }, + { + "epoch": 0.09981030205749307, + "grad_norm": 1.2277887767057005, + "learning_rate": 4.969570116698136e-05, + "loss": 0.7424, + "step": 3420 + }, + { + "epoch": 0.09995622355172917, + "grad_norm": 1.3449492003729393, + "learning_rate": 4.969391891646577e-05, + "loss": 0.688, + "step": 3425 + }, + { + "epoch": 0.10010214504596528, + "grad_norm": 1.0547876220999304, + "learning_rate": 4.969213149768165e-05, + "loss": 0.7012, + "step": 3430 + }, + { + "epoch": 0.10024806654020137, + "grad_norm": 1.1135509924007378, + "learning_rate": 4.9690338911045224e-05, + "loss": 0.6858, + "step": 3435 + }, + { + "epoch": 0.10039398803443747, + "grad_norm": 1.2930177751139922, + "learning_rate": 4.968854115697395e-05, + "loss": 0.7177, + "step": 3440 + }, + { + "epoch": 0.10053990952867357, + "grad_norm": 0.9801091259944152, + "learning_rate": 4.968673823588644e-05, + "loss": 0.68, + "step": 3445 + }, + { + "epoch": 0.10068583102290968, + "grad_norm": 1.3230915844116853, + "learning_rate": 4.968493014820257e-05, + "loss": 0.6961, + "step": 3450 + }, + { + "epoch": 0.10083175251714578, + "grad_norm": 1.121684088453338, + "learning_rate": 4.9683116894343365e-05, + "loss": 0.6681, + "step": 3455 + }, + { + "epoch": 0.10097767401138187, + "grad_norm": 1.3146857840377224, + "learning_rate": 4.968129847473109e-05, + "loss": 0.6483, + "step": 3460 + }, + { + "epoch": 0.10112359550561797, + "grad_norm": 1.1950769138034345, + "learning_rate": 4.967947488978918e-05, + "loss": 0.6859, + "step": 3465 + }, + { + "epoch": 0.10126951699985408, + "grad_norm": 2.6954405873542187, + "learning_rate": 4.967764613994231e-05, + "loss": 0.7587, + "step": 3470 + }, + { + "epoch": 0.10141543849409018, + "grad_norm": 1.1494002331768778, + "learning_rate": 4.9675812225616315e-05, + "loss": 0.7168, + "step": 3475 + }, + { + "epoch": 0.10156135998832629, + "grad_norm": 1.1387928894097017, + "learning_rate": 4.967397314723827e-05, + "loss": 0.7162, + "step": 3480 + }, + { + "epoch": 0.10170728148256238, + "grad_norm": 1.0665249328499524, + "learning_rate": 4.967212890523645e-05, + "loss": 0.6751, + "step": 3485 + }, + { + "epoch": 0.10185320297679848, + "grad_norm": 1.3485398869041203, + "learning_rate": 4.9670279500040304e-05, + "loss": 0.7123, + "step": 3490 + }, + { + "epoch": 0.10199912447103458, + "grad_norm": 1.3247771365679535, + "learning_rate": 4.9668424932080496e-05, + "loss": 0.7611, + "step": 3495 + }, + { + "epoch": 0.10214504596527069, + "grad_norm": 1.3875140355929463, + "learning_rate": 4.9666565201788924e-05, + "loss": 0.6743, + "step": 3500 + }, + { + "epoch": 0.10229096745950679, + "grad_norm": 1.118033850514331, + "learning_rate": 4.966470030959863e-05, + "loss": 0.6804, + "step": 3505 + }, + { + "epoch": 0.10243688895374288, + "grad_norm": 1.1170995094232437, + "learning_rate": 4.966283025594391e-05, + "loss": 0.6768, + "step": 3510 + }, + { + "epoch": 0.10258281044797898, + "grad_norm": 1.1493628430712297, + "learning_rate": 4.9660955041260237e-05, + "loss": 0.6983, + "step": 3515 + }, + { + "epoch": 0.10272873194221509, + "grad_norm": 1.4000711593961446, + "learning_rate": 4.9659074665984286e-05, + "loss": 0.6511, + "step": 3520 + }, + { + "epoch": 0.10287465343645119, + "grad_norm": 1.3070196294106666, + "learning_rate": 4.965718913055393e-05, + "loss": 0.7775, + "step": 3525 + }, + { + "epoch": 0.1030205749306873, + "grad_norm": 1.2717808332885354, + "learning_rate": 4.965529843540827e-05, + "loss": 0.7129, + "step": 3530 + }, + { + "epoch": 0.10316649642492338, + "grad_norm": 1.1457671612092146, + "learning_rate": 4.9653402580987576e-05, + "loss": 0.72, + "step": 3535 + }, + { + "epoch": 0.10331241791915949, + "grad_norm": 1.2061272507395866, + "learning_rate": 4.965150156773333e-05, + "loss": 0.6714, + "step": 3540 + }, + { + "epoch": 0.10345833941339559, + "grad_norm": 1.309567326916106, + "learning_rate": 4.9649595396088225e-05, + "loss": 0.6464, + "step": 3545 + }, + { + "epoch": 0.1036042609076317, + "grad_norm": 1.1702096394824586, + "learning_rate": 4.964768406649616e-05, + "loss": 0.7203, + "step": 3550 + }, + { + "epoch": 0.1037501824018678, + "grad_norm": 1.2502357271888245, + "learning_rate": 4.9645767579402196e-05, + "loss": 0.6603, + "step": 3555 + }, + { + "epoch": 0.1038961038961039, + "grad_norm": 1.151807755146837, + "learning_rate": 4.9643845935252645e-05, + "loss": 0.717, + "step": 3560 + }, + { + "epoch": 0.10404202539033999, + "grad_norm": 1.1128499028564218, + "learning_rate": 4.964191913449499e-05, + "loss": 0.7244, + "step": 3565 + }, + { + "epoch": 0.1041879468845761, + "grad_norm": 1.0151509912176624, + "learning_rate": 4.963998717757793e-05, + "loss": 0.7432, + "step": 3570 + }, + { + "epoch": 0.1043338683788122, + "grad_norm": 1.1681965560359133, + "learning_rate": 4.9638050064951345e-05, + "loss": 0.6638, + "step": 3575 + }, + { + "epoch": 0.1044797898730483, + "grad_norm": 1.0094258276922727, + "learning_rate": 4.963610779706634e-05, + "loss": 0.6754, + "step": 3580 + }, + { + "epoch": 0.10462571136728441, + "grad_norm": 1.0151724027887894, + "learning_rate": 4.9634160374375204e-05, + "loss": 0.6727, + "step": 3585 + }, + { + "epoch": 0.1047716328615205, + "grad_norm": 1.305935248131152, + "learning_rate": 4.963220779733142e-05, + "loss": 0.7117, + "step": 3590 + }, + { + "epoch": 0.1049175543557566, + "grad_norm": 1.0699537996460562, + "learning_rate": 4.963025006638969e-05, + "loss": 0.6609, + "step": 3595 + }, + { + "epoch": 0.1050634758499927, + "grad_norm": 1.1105590591702637, + "learning_rate": 4.962828718200592e-05, + "loss": 0.6672, + "step": 3600 + }, + { + "epoch": 0.10520939734422881, + "grad_norm": 1.1904371044392517, + "learning_rate": 4.962631914463719e-05, + "loss": 0.7133, + "step": 3605 + }, + { + "epoch": 0.10535531883846491, + "grad_norm": 1.152210070696163, + "learning_rate": 4.96243459547418e-05, + "loss": 0.715, + "step": 3610 + }, + { + "epoch": 0.105501240332701, + "grad_norm": 1.137920585360193, + "learning_rate": 4.962236761277924e-05, + "loss": 0.7249, + "step": 3615 + }, + { + "epoch": 0.1056471618269371, + "grad_norm": 1.0733114833174349, + "learning_rate": 4.9620384119210204e-05, + "loss": 0.6783, + "step": 3620 + }, + { + "epoch": 0.10579308332117321, + "grad_norm": 1.2624089700840981, + "learning_rate": 4.961839547449659e-05, + "loss": 0.7054, + "step": 3625 + }, + { + "epoch": 0.10593900481540931, + "grad_norm": 1.1024741720966582, + "learning_rate": 4.961640167910149e-05, + "loss": 0.6817, + "step": 3630 + }, + { + "epoch": 0.10608492630964542, + "grad_norm": 1.055684212905771, + "learning_rate": 4.961440273348919e-05, + "loss": 0.7008, + "step": 3635 + }, + { + "epoch": 0.1062308478038815, + "grad_norm": 1.1231448411310259, + "learning_rate": 4.961239863812519e-05, + "loss": 0.7423, + "step": 3640 + }, + { + "epoch": 0.10637676929811761, + "grad_norm": 1.263122923150071, + "learning_rate": 4.9610389393476184e-05, + "loss": 0.653, + "step": 3645 + }, + { + "epoch": 0.10652269079235371, + "grad_norm": 1.1323829386152715, + "learning_rate": 4.960837500001005e-05, + "loss": 0.6384, + "step": 3650 + }, + { + "epoch": 0.10666861228658982, + "grad_norm": 1.1650779744183986, + "learning_rate": 4.9606355458195876e-05, + "loss": 0.6542, + "step": 3655 + }, + { + "epoch": 0.10681453378082592, + "grad_norm": 1.234097765982052, + "learning_rate": 4.9604330768503964e-05, + "loss": 0.6946, + "step": 3660 + }, + { + "epoch": 0.10696045527506201, + "grad_norm": 1.2408927795713185, + "learning_rate": 4.9602300931405795e-05, + "loss": 0.6617, + "step": 3665 + }, + { + "epoch": 0.10710637676929811, + "grad_norm": 1.0662935084016967, + "learning_rate": 4.9600265947374053e-05, + "loss": 0.6373, + "step": 3670 + }, + { + "epoch": 0.10725229826353422, + "grad_norm": 1.1469927538453333, + "learning_rate": 4.9598225816882624e-05, + "loss": 0.6683, + "step": 3675 + }, + { + "epoch": 0.10739821975777032, + "grad_norm": 1.186915343133445, + "learning_rate": 4.959618054040659e-05, + "loss": 0.7256, + "step": 3680 + }, + { + "epoch": 0.10754414125200643, + "grad_norm": 1.000177644978394, + "learning_rate": 4.959413011842223e-05, + "loss": 0.6498, + "step": 3685 + }, + { + "epoch": 0.10769006274624252, + "grad_norm": 1.1539678091223273, + "learning_rate": 4.9592074551407015e-05, + "loss": 0.6186, + "step": 3690 + }, + { + "epoch": 0.10783598424047862, + "grad_norm": 1.2031960672132378, + "learning_rate": 4.959001383983964e-05, + "loss": 0.6727, + "step": 3695 + }, + { + "epoch": 0.10798190573471472, + "grad_norm": 1.1203093307457614, + "learning_rate": 4.9587947984199974e-05, + "loss": 0.7084, + "step": 3700 + }, + { + "epoch": 0.10812782722895083, + "grad_norm": 1.0930047803753644, + "learning_rate": 4.958587698496908e-05, + "loss": 0.6911, + "step": 3705 + }, + { + "epoch": 0.10827374872318693, + "grad_norm": 1.0804426531539648, + "learning_rate": 4.958380084262924e-05, + "loss": 0.7264, + "step": 3710 + }, + { + "epoch": 0.10841967021742302, + "grad_norm": 1.0327209110706148, + "learning_rate": 4.958171955766392e-05, + "loss": 0.6717, + "step": 3715 + }, + { + "epoch": 0.10856559171165912, + "grad_norm": 1.1456593956829084, + "learning_rate": 4.957963313055778e-05, + "loss": 0.6963, + "step": 3720 + }, + { + "epoch": 0.10871151320589523, + "grad_norm": 1.0654436206228783, + "learning_rate": 4.95775415617967e-05, + "loss": 0.6323, + "step": 3725 + }, + { + "epoch": 0.10885743470013133, + "grad_norm": 1.139456632225083, + "learning_rate": 4.9575444851867717e-05, + "loss": 0.6703, + "step": 3730 + }, + { + "epoch": 0.10900335619436743, + "grad_norm": 1.0315056259890136, + "learning_rate": 4.9573343001259104e-05, + "loss": 0.6562, + "step": 3735 + }, + { + "epoch": 0.10914927768860354, + "grad_norm": 0.9934468425215857, + "learning_rate": 4.957123601046032e-05, + "loss": 0.7148, + "step": 3740 + }, + { + "epoch": 0.10929519918283963, + "grad_norm": 1.1066247601584256, + "learning_rate": 4.9569123879962e-05, + "loss": 0.7387, + "step": 3745 + }, + { + "epoch": 0.10944112067707573, + "grad_norm": 0.9969143972124256, + "learning_rate": 4.9567006610256005e-05, + "loss": 0.6504, + "step": 3750 + }, + { + "epoch": 0.10958704217131184, + "grad_norm": 1.1041909453708623, + "learning_rate": 4.956488420183539e-05, + "loss": 0.6679, + "step": 3755 + }, + { + "epoch": 0.10973296366554794, + "grad_norm": 1.2074690639340238, + "learning_rate": 4.956275665519438e-05, + "loss": 0.6779, + "step": 3760 + }, + { + "epoch": 0.10987888515978404, + "grad_norm": 0.999561955003958, + "learning_rate": 4.9560623970828416e-05, + "loss": 0.6711, + "step": 3765 + }, + { + "epoch": 0.11002480665402013, + "grad_norm": 1.0014789293999864, + "learning_rate": 4.9558486149234146e-05, + "loss": 0.6856, + "step": 3770 + }, + { + "epoch": 0.11017072814825624, + "grad_norm": 1.0498638907167868, + "learning_rate": 4.955634319090939e-05, + "loss": 0.6988, + "step": 3775 + }, + { + "epoch": 0.11031664964249234, + "grad_norm": 1.1802100008229806, + "learning_rate": 4.9554195096353176e-05, + "loss": 0.6323, + "step": 3780 + }, + { + "epoch": 0.11046257113672844, + "grad_norm": 1.3345812142761189, + "learning_rate": 4.9552041866065745e-05, + "loss": 0.6842, + "step": 3785 + }, + { + "epoch": 0.11060849263096455, + "grad_norm": 1.2694364016288235, + "learning_rate": 4.9549883500548494e-05, + "loss": 0.7214, + "step": 3790 + }, + { + "epoch": 0.11075441412520064, + "grad_norm": 1.1593843434128928, + "learning_rate": 4.9547720000304046e-05, + "loss": 0.7009, + "step": 3795 + }, + { + "epoch": 0.11090033561943674, + "grad_norm": 1.0358096270572228, + "learning_rate": 4.954555136583622e-05, + "loss": 0.6624, + "step": 3800 + }, + { + "epoch": 0.11104625711367284, + "grad_norm": 1.4308647177244223, + "learning_rate": 4.954337759765002e-05, + "loss": 0.6679, + "step": 3805 + }, + { + "epoch": 0.11119217860790895, + "grad_norm": 1.0952065711390835, + "learning_rate": 4.9541198696251644e-05, + "loss": 0.7018, + "step": 3810 + }, + { + "epoch": 0.11133810010214505, + "grad_norm": 1.3218450322554676, + "learning_rate": 4.9539014662148484e-05, + "loss": 0.7201, + "step": 3815 + }, + { + "epoch": 0.11148402159638114, + "grad_norm": 1.2463715817740397, + "learning_rate": 4.9536825495849155e-05, + "loss": 0.718, + "step": 3820 + }, + { + "epoch": 0.11162994309061725, + "grad_norm": 1.102259866702118, + "learning_rate": 4.9534631197863423e-05, + "loss": 0.6718, + "step": 3825 + }, + { + "epoch": 0.11177586458485335, + "grad_norm": 1.2021265414902833, + "learning_rate": 4.9532431768702286e-05, + "loss": 0.7545, + "step": 3830 + }, + { + "epoch": 0.11192178607908945, + "grad_norm": 1.347624790502832, + "learning_rate": 4.953022720887791e-05, + "loss": 0.7185, + "step": 3835 + }, + { + "epoch": 0.11206770757332556, + "grad_norm": 1.165874375525849, + "learning_rate": 4.952801751890368e-05, + "loss": 0.6511, + "step": 3840 + }, + { + "epoch": 0.11221362906756165, + "grad_norm": 1.205242395696266, + "learning_rate": 4.952580269929414e-05, + "loss": 0.6618, + "step": 3845 + }, + { + "epoch": 0.11235955056179775, + "grad_norm": 0.9887761301249266, + "learning_rate": 4.952358275056509e-05, + "loss": 0.6744, + "step": 3850 + }, + { + "epoch": 0.11250547205603385, + "grad_norm": 1.1099126410011242, + "learning_rate": 4.952135767323347e-05, + "loss": 0.677, + "step": 3855 + }, + { + "epoch": 0.11265139355026996, + "grad_norm": 1.1898642626689542, + "learning_rate": 4.951912746781741e-05, + "loss": 0.679, + "step": 3860 + }, + { + "epoch": 0.11279731504450606, + "grad_norm": 1.0778655871867437, + "learning_rate": 4.951689213483627e-05, + "loss": 0.6526, + "step": 3865 + }, + { + "epoch": 0.11294323653874215, + "grad_norm": 1.0782087896187873, + "learning_rate": 4.9514651674810605e-05, + "loss": 0.6702, + "step": 3870 + }, + { + "epoch": 0.11308915803297825, + "grad_norm": 1.1134393749112423, + "learning_rate": 4.9512406088262134e-05, + "loss": 0.6703, + "step": 3875 + }, + { + "epoch": 0.11323507952721436, + "grad_norm": 1.3534667536585756, + "learning_rate": 4.9510155375713765e-05, + "loss": 0.6475, + "step": 3880 + }, + { + "epoch": 0.11338100102145046, + "grad_norm": 1.3521566330146864, + "learning_rate": 4.9507899537689655e-05, + "loss": 0.7709, + "step": 3885 + }, + { + "epoch": 0.11352692251568657, + "grad_norm": 0.960034124184033, + "learning_rate": 4.9505638574715094e-05, + "loss": 0.6727, + "step": 3890 + }, + { + "epoch": 0.11367284400992266, + "grad_norm": 1.1115533964068696, + "learning_rate": 4.95033724873166e-05, + "loss": 0.6992, + "step": 3895 + }, + { + "epoch": 0.11381876550415876, + "grad_norm": 1.2702673940872604, + "learning_rate": 4.950110127602186e-05, + "loss": 0.6363, + "step": 3900 + }, + { + "epoch": 0.11396468699839486, + "grad_norm": 1.1744532141248218, + "learning_rate": 4.949882494135979e-05, + "loss": 0.6128, + "step": 3905 + }, + { + "epoch": 0.11411060849263097, + "grad_norm": 1.1534039386334998, + "learning_rate": 4.9496543483860457e-05, + "loss": 0.6734, + "step": 3910 + }, + { + "epoch": 0.11425652998686707, + "grad_norm": 1.2909904316329845, + "learning_rate": 4.9494256904055156e-05, + "loss": 0.7373, + "step": 3915 + }, + { + "epoch": 0.11440245148110317, + "grad_norm": 1.082559793202668, + "learning_rate": 4.949196520247634e-05, + "loss": 0.663, + "step": 3920 + }, + { + "epoch": 0.11454837297533926, + "grad_norm": 1.037947317039563, + "learning_rate": 4.948966837965769e-05, + "loss": 0.676, + "step": 3925 + }, + { + "epoch": 0.11469429446957537, + "grad_norm": 1.0357868161932815, + "learning_rate": 4.948736643613406e-05, + "loss": 0.6627, + "step": 3930 + }, + { + "epoch": 0.11484021596381147, + "grad_norm": 1.1225526894162952, + "learning_rate": 4.948505937244151e-05, + "loss": 0.6769, + "step": 3935 + }, + { + "epoch": 0.11498613745804757, + "grad_norm": 1.0133660858244744, + "learning_rate": 4.948274718911727e-05, + "loss": 0.7131, + "step": 3940 + }, + { + "epoch": 0.11513205895228368, + "grad_norm": 1.0866598577665991, + "learning_rate": 4.948042988669978e-05, + "loss": 0.7543, + "step": 3945 + }, + { + "epoch": 0.11527798044651977, + "grad_norm": 0.9942202007589459, + "learning_rate": 4.947810746572867e-05, + "loss": 0.7076, + "step": 3950 + }, + { + "epoch": 0.11542390194075587, + "grad_norm": 1.0969714442260865, + "learning_rate": 4.9475779926744745e-05, + "loss": 0.6907, + "step": 3955 + }, + { + "epoch": 0.11556982343499198, + "grad_norm": 1.3370496962122116, + "learning_rate": 4.947344727029003e-05, + "loss": 0.6899, + "step": 3960 + }, + { + "epoch": 0.11571574492922808, + "grad_norm": 1.0718127570063432, + "learning_rate": 4.947110949690773e-05, + "loss": 0.6848, + "step": 3965 + }, + { + "epoch": 0.11586166642346418, + "grad_norm": 1.0313075927975328, + "learning_rate": 4.946876660714223e-05, + "loss": 0.7205, + "step": 3970 + }, + { + "epoch": 0.11600758791770027, + "grad_norm": 1.0133343716034173, + "learning_rate": 4.946641860153912e-05, + "loss": 0.6747, + "step": 3975 + }, + { + "epoch": 0.11615350941193638, + "grad_norm": 1.0503887675078798, + "learning_rate": 4.946406548064517e-05, + "loss": 0.6747, + "step": 3980 + }, + { + "epoch": 0.11629943090617248, + "grad_norm": 1.203700752827074, + "learning_rate": 4.9461707245008366e-05, + "loss": 0.7548, + "step": 3985 + }, + { + "epoch": 0.11644535240040858, + "grad_norm": 1.0132059358440235, + "learning_rate": 4.9459343895177846e-05, + "loss": 0.6902, + "step": 3990 + }, + { + "epoch": 0.11659127389464469, + "grad_norm": 1.3602021524431593, + "learning_rate": 4.945697543170397e-05, + "loss": 0.7233, + "step": 3995 + }, + { + "epoch": 0.11673719538888078, + "grad_norm": 1.4124815233747543, + "learning_rate": 4.9454601855138275e-05, + "loss": 0.7342, + "step": 4000 + }, + { + "epoch": 0.11688311688311688, + "grad_norm": 1.0415423509236525, + "learning_rate": 4.9452223166033505e-05, + "loss": 0.7283, + "step": 4005 + }, + { + "epoch": 0.11702903837735298, + "grad_norm": 1.0177430690987501, + "learning_rate": 4.9449839364943565e-05, + "loss": 0.6869, + "step": 4010 + }, + { + "epoch": 0.11717495987158909, + "grad_norm": 1.1911709596838373, + "learning_rate": 4.944745045242359e-05, + "loss": 0.7244, + "step": 4015 + }, + { + "epoch": 0.11732088136582519, + "grad_norm": 1.5140239272691038, + "learning_rate": 4.944505642902985e-05, + "loss": 0.708, + "step": 4020 + }, + { + "epoch": 0.11746680286006128, + "grad_norm": 1.245772020034725, + "learning_rate": 4.944265729531987e-05, + "loss": 0.7375, + "step": 4025 + }, + { + "epoch": 0.11761272435429738, + "grad_norm": 1.0706146586105905, + "learning_rate": 4.9440253051852314e-05, + "loss": 0.6, + "step": 4030 + }, + { + "epoch": 0.11775864584853349, + "grad_norm": 1.1510707015059383, + "learning_rate": 4.943784369918707e-05, + "loss": 0.7206, + "step": 4035 + }, + { + "epoch": 0.11790456734276959, + "grad_norm": 1.2125701370710924, + "learning_rate": 4.9435429237885175e-05, + "loss": 0.6427, + "step": 4040 + }, + { + "epoch": 0.1180504888370057, + "grad_norm": 1.0945514690903801, + "learning_rate": 4.9433009668508906e-05, + "loss": 0.7198, + "step": 4045 + }, + { + "epoch": 0.11819641033124179, + "grad_norm": 0.9882674443895904, + "learning_rate": 4.9430584991621705e-05, + "loss": 0.6413, + "step": 4050 + }, + { + "epoch": 0.11834233182547789, + "grad_norm": 1.175455587370083, + "learning_rate": 4.9428155207788186e-05, + "loss": 0.7123, + "step": 4055 + }, + { + "epoch": 0.118488253319714, + "grad_norm": 1.126090153112986, + "learning_rate": 4.942572031757418e-05, + "loss": 0.6487, + "step": 4060 + }, + { + "epoch": 0.1186341748139501, + "grad_norm": 1.1899483259446826, + "learning_rate": 4.9423280321546694e-05, + "loss": 0.6808, + "step": 4065 + }, + { + "epoch": 0.1187800963081862, + "grad_norm": 1.2708256085640324, + "learning_rate": 4.942083522027393e-05, + "loss": 0.7331, + "step": 4070 + }, + { + "epoch": 0.11892601780242229, + "grad_norm": 1.1621919290063742, + "learning_rate": 4.941838501432528e-05, + "loss": 0.706, + "step": 4075 + }, + { + "epoch": 0.1190719392966584, + "grad_norm": 1.1698227735759303, + "learning_rate": 4.9415929704271305e-05, + "loss": 0.6504, + "step": 4080 + }, + { + "epoch": 0.1192178607908945, + "grad_norm": 1.087854250147867, + "learning_rate": 4.941346929068379e-05, + "loss": 0.6949, + "step": 4085 + }, + { + "epoch": 0.1193637822851306, + "grad_norm": 1.1253673663926613, + "learning_rate": 4.9411003774135665e-05, + "loss": 0.6163, + "step": 4090 + }, + { + "epoch": 0.1195097037793667, + "grad_norm": 0.9752195636234635, + "learning_rate": 4.940853315520108e-05, + "loss": 0.6488, + "step": 4095 + }, + { + "epoch": 0.1196556252736028, + "grad_norm": 1.130358705459771, + "learning_rate": 4.940605743445538e-05, + "loss": 0.6918, + "step": 4100 + }, + { + "epoch": 0.1198015467678389, + "grad_norm": 1.044950772445061, + "learning_rate": 4.940357661247507e-05, + "loss": 0.6876, + "step": 4105 + }, + { + "epoch": 0.119947468262075, + "grad_norm": 1.1423681046813763, + "learning_rate": 4.9401090689837845e-05, + "loss": 0.6599, + "step": 4110 + }, + { + "epoch": 0.1200933897563111, + "grad_norm": 1.1078600357014607, + "learning_rate": 4.939859966712262e-05, + "loss": 0.7092, + "step": 4115 + }, + { + "epoch": 0.12023931125054721, + "grad_norm": 1.0821247812326489, + "learning_rate": 4.939610354490947e-05, + "loss": 0.6713, + "step": 4120 + }, + { + "epoch": 0.12038523274478331, + "grad_norm": 1.189240848690589, + "learning_rate": 4.939360232377965e-05, + "loss": 0.7582, + "step": 4125 + }, + { + "epoch": 0.1205311542390194, + "grad_norm": 1.102766516656728, + "learning_rate": 4.939109600431564e-05, + "loss": 0.7039, + "step": 4130 + }, + { + "epoch": 0.1206770757332555, + "grad_norm": 1.2338185259530063, + "learning_rate": 4.938858458710105e-05, + "loss": 0.6249, + "step": 4135 + }, + { + "epoch": 0.12082299722749161, + "grad_norm": 1.3950450279512345, + "learning_rate": 4.938606807272075e-05, + "loss": 0.7288, + "step": 4140 + }, + { + "epoch": 0.12096891872172771, + "grad_norm": 1.5536625376003828, + "learning_rate": 4.938354646176072e-05, + "loss": 0.6954, + "step": 4145 + }, + { + "epoch": 0.12111484021596382, + "grad_norm": 1.1243355087911477, + "learning_rate": 4.9381019754808195e-05, + "loss": 0.6975, + "step": 4150 + }, + { + "epoch": 0.12126076171019991, + "grad_norm": 1.1423456295198111, + "learning_rate": 4.9378487952451543e-05, + "loss": 0.7113, + "step": 4155 + }, + { + "epoch": 0.12140668320443601, + "grad_norm": 1.020352268854177, + "learning_rate": 4.9375951055280354e-05, + "loss": 0.7548, + "step": 4160 + }, + { + "epoch": 0.12155260469867211, + "grad_norm": 1.0902738623073274, + "learning_rate": 4.937340906388538e-05, + "loss": 0.6624, + "step": 4165 + }, + { + "epoch": 0.12169852619290822, + "grad_norm": 1.2323026010364944, + "learning_rate": 4.93708619788586e-05, + "loss": 0.705, + "step": 4170 + }, + { + "epoch": 0.12184444768714432, + "grad_norm": 1.0858094879022224, + "learning_rate": 4.936830980079312e-05, + "loss": 0.6603, + "step": 4175 + }, + { + "epoch": 0.12199036918138041, + "grad_norm": 1.4823841077723303, + "learning_rate": 4.9365752530283265e-05, + "loss": 0.718, + "step": 4180 + }, + { + "epoch": 0.12213629067561652, + "grad_norm": 1.1247872681375826, + "learning_rate": 4.936319016792456e-05, + "loss": 0.702, + "step": 4185 + }, + { + "epoch": 0.12228221216985262, + "grad_norm": 0.947658429483614, + "learning_rate": 4.936062271431369e-05, + "loss": 0.6895, + "step": 4190 + }, + { + "epoch": 0.12242813366408872, + "grad_norm": 1.0737204863217322, + "learning_rate": 4.935805017004853e-05, + "loss": 0.6789, + "step": 4195 + }, + { + "epoch": 0.12257405515832483, + "grad_norm": 0.9740477450508347, + "learning_rate": 4.935547253572815e-05, + "loss": 0.6973, + "step": 4200 + }, + { + "epoch": 0.12271997665256092, + "grad_norm": 1.3669559040356531, + "learning_rate": 4.93528898119528e-05, + "loss": 0.7341, + "step": 4205 + }, + { + "epoch": 0.12286589814679702, + "grad_norm": 1.1197413087704782, + "learning_rate": 4.935030199932392e-05, + "loss": 0.6459, + "step": 4210 + }, + { + "epoch": 0.12301181964103312, + "grad_norm": 1.1820391059611846, + "learning_rate": 4.9347709098444114e-05, + "loss": 0.7717, + "step": 4215 + }, + { + "epoch": 0.12315774113526923, + "grad_norm": 0.8971069426873689, + "learning_rate": 4.934511110991721e-05, + "loss": 0.6174, + "step": 4220 + }, + { + "epoch": 0.12330366262950533, + "grad_norm": 1.0219446113715438, + "learning_rate": 4.934250803434818e-05, + "loss": 0.6937, + "step": 4225 + }, + { + "epoch": 0.12344958412374142, + "grad_norm": 1.4596743787177244, + "learning_rate": 4.933989987234321e-05, + "loss": 0.7195, + "step": 4230 + }, + { + "epoch": 0.12359550561797752, + "grad_norm": 1.008207225955857, + "learning_rate": 4.9337286624509655e-05, + "loss": 0.6482, + "step": 4235 + }, + { + "epoch": 0.12374142711221363, + "grad_norm": 1.1801377144179224, + "learning_rate": 4.9334668291456054e-05, + "loss": 0.711, + "step": 4240 + }, + { + "epoch": 0.12388734860644973, + "grad_norm": 1.257243421328094, + "learning_rate": 4.933204487379214e-05, + "loss": 0.7059, + "step": 4245 + }, + { + "epoch": 0.12403327010068584, + "grad_norm": 1.1182641851236583, + "learning_rate": 4.932941637212883e-05, + "loss": 0.7219, + "step": 4250 + }, + { + "epoch": 0.12417919159492193, + "grad_norm": 1.1027275067917797, + "learning_rate": 4.9326782787078215e-05, + "loss": 0.6704, + "step": 4255 + }, + { + "epoch": 0.12432511308915803, + "grad_norm": 1.2503708162830354, + "learning_rate": 4.932414411925356e-05, + "loss": 0.715, + "step": 4260 + }, + { + "epoch": 0.12447103458339413, + "grad_norm": 1.0708093798217997, + "learning_rate": 4.9321500369269345e-05, + "loss": 0.7156, + "step": 4265 + }, + { + "epoch": 0.12461695607763024, + "grad_norm": 1.0249800173155617, + "learning_rate": 4.9318851537741224e-05, + "loss": 0.6608, + "step": 4270 + }, + { + "epoch": 0.12476287757186634, + "grad_norm": 1.2562160401716829, + "learning_rate": 4.931619762528601e-05, + "loss": 0.6431, + "step": 4275 + }, + { + "epoch": 0.12490879906610243, + "grad_norm": 1.0807148903111947, + "learning_rate": 4.931353863252172e-05, + "loss": 0.7285, + "step": 4280 + }, + { + "epoch": 0.12505472056033853, + "grad_norm": 1.1314341038735056, + "learning_rate": 4.9310874560067554e-05, + "loss": 0.6278, + "step": 4285 + }, + { + "epoch": 0.12520064205457465, + "grad_norm": 0.9660677941704441, + "learning_rate": 4.930820540854389e-05, + "loss": 0.6952, + "step": 4290 + }, + { + "epoch": 0.12534656354881074, + "grad_norm": 1.325618375733907, + "learning_rate": 4.930553117857228e-05, + "loss": 0.7188, + "step": 4295 + }, + { + "epoch": 0.12549248504304683, + "grad_norm": 1.0740669808255412, + "learning_rate": 4.930285187077549e-05, + "loss": 0.7184, + "step": 4300 + }, + { + "epoch": 0.12563840653728295, + "grad_norm": 1.0665592702074642, + "learning_rate": 4.930016748577743e-05, + "loss": 0.7001, + "step": 4305 + }, + { + "epoch": 0.12578432803151904, + "grad_norm": 0.9641585920422673, + "learning_rate": 4.929747802420321e-05, + "loss": 0.6355, + "step": 4310 + }, + { + "epoch": 0.12593024952575516, + "grad_norm": 1.1397503515141967, + "learning_rate": 4.929478348667913e-05, + "loss": 0.665, + "step": 4315 + }, + { + "epoch": 0.12607617101999125, + "grad_norm": 1.0569617435941585, + "learning_rate": 4.929208387383265e-05, + "loss": 0.6023, + "step": 4320 + }, + { + "epoch": 0.12622209251422734, + "grad_norm": 1.1146521228633575, + "learning_rate": 4.9289379186292434e-05, + "loss": 0.7216, + "step": 4325 + }, + { + "epoch": 0.12636801400846345, + "grad_norm": 1.1676304605724792, + "learning_rate": 4.9286669424688326e-05, + "loss": 0.6546, + "step": 4330 + }, + { + "epoch": 0.12651393550269954, + "grad_norm": 1.0647112575028943, + "learning_rate": 4.928395458965134e-05, + "loss": 0.68, + "step": 4335 + }, + { + "epoch": 0.12665985699693566, + "grad_norm": 1.1318939328332238, + "learning_rate": 4.928123468181366e-05, + "loss": 0.6718, + "step": 4340 + }, + { + "epoch": 0.12680577849117175, + "grad_norm": 1.137918290181405, + "learning_rate": 4.927850970180869e-05, + "loss": 0.6484, + "step": 4345 + }, + { + "epoch": 0.12695169998540784, + "grad_norm": 1.106880749643652, + "learning_rate": 4.927577965027097e-05, + "loss": 0.6762, + "step": 4350 + }, + { + "epoch": 0.12709762147964396, + "grad_norm": 1.255135741197893, + "learning_rate": 4.927304452783627e-05, + "loss": 0.7049, + "step": 4355 + }, + { + "epoch": 0.12724354297388005, + "grad_norm": 0.9412812453220591, + "learning_rate": 4.9270304335141494e-05, + "loss": 0.7122, + "step": 4360 + }, + { + "epoch": 0.12738946446811616, + "grad_norm": 1.0080959791537925, + "learning_rate": 4.926755907282475e-05, + "loss": 0.6345, + "step": 4365 + }, + { + "epoch": 0.12753538596235225, + "grad_norm": 1.1128014160594106, + "learning_rate": 4.926480874152534e-05, + "loss": 0.6976, + "step": 4370 + }, + { + "epoch": 0.12768130745658834, + "grad_norm": 1.3479512069390571, + "learning_rate": 4.9262053341883716e-05, + "loss": 0.7235, + "step": 4375 + }, + { + "epoch": 0.12782722895082446, + "grad_norm": 1.4031641810510902, + "learning_rate": 4.925929287454151e-05, + "loss": 0.7379, + "step": 4380 + }, + { + "epoch": 0.12797315044506055, + "grad_norm": 1.1046431647470536, + "learning_rate": 4.9256527340141576e-05, + "loss": 0.7056, + "step": 4385 + }, + { + "epoch": 0.12811907193929667, + "grad_norm": 0.9880921574790832, + "learning_rate": 4.92537567393279e-05, + "loss": 0.7059, + "step": 4390 + }, + { + "epoch": 0.12826499343353276, + "grad_norm": 1.0059221521204251, + "learning_rate": 4.925098107274569e-05, + "loss": 0.6627, + "step": 4395 + }, + { + "epoch": 0.12841091492776885, + "grad_norm": 1.230884368448577, + "learning_rate": 4.924820034104129e-05, + "loss": 0.746, + "step": 4400 + }, + { + "epoch": 0.12855683642200497, + "grad_norm": 1.1311689317839877, + "learning_rate": 4.924541454486226e-05, + "loss": 0.6738, + "step": 4405 + }, + { + "epoch": 0.12870275791624106, + "grad_norm": 1.081562653473379, + "learning_rate": 4.92426236848573e-05, + "loss": 0.7304, + "step": 4410 + }, + { + "epoch": 0.12884867941047717, + "grad_norm": 1.108022164741601, + "learning_rate": 4.923982776167634e-05, + "loss": 0.6601, + "step": 4415 + }, + { + "epoch": 0.12899460090471326, + "grad_norm": 1.1913322601354124, + "learning_rate": 4.923702677597047e-05, + "loss": 0.6678, + "step": 4420 + }, + { + "epoch": 0.12914052239894935, + "grad_norm": 1.180137582174622, + "learning_rate": 4.923422072839192e-05, + "loss": 0.7028, + "step": 4425 + }, + { + "epoch": 0.12928644389318547, + "grad_norm": 1.062312363516143, + "learning_rate": 4.9231409619594154e-05, + "loss": 0.7371, + "step": 4430 + }, + { + "epoch": 0.12943236538742156, + "grad_norm": 1.094084924334837, + "learning_rate": 4.9228593450231783e-05, + "loss": 0.6962, + "step": 4435 + }, + { + "epoch": 0.12957828688165768, + "grad_norm": 1.1019862792373496, + "learning_rate": 4.9225772220960615e-05, + "loss": 0.6869, + "step": 4440 + }, + { + "epoch": 0.12972420837589377, + "grad_norm": 1.11205304567185, + "learning_rate": 4.92229459324376e-05, + "loss": 0.6253, + "step": 4445 + }, + { + "epoch": 0.12987012987012986, + "grad_norm": 1.1615422757029696, + "learning_rate": 4.922011458532093e-05, + "loss": 0.7022, + "step": 4450 + }, + { + "epoch": 0.13001605136436598, + "grad_norm": 1.2643020115074577, + "learning_rate": 4.9217278180269896e-05, + "loss": 0.7211, + "step": 4455 + }, + { + "epoch": 0.13016197285860207, + "grad_norm": 1.0660846661940044, + "learning_rate": 4.921443671794504e-05, + "loss": 0.7106, + "step": 4460 + }, + { + "epoch": 0.13030789435283818, + "grad_norm": 1.015264723691173, + "learning_rate": 4.9211590199008034e-05, + "loss": 0.6745, + "step": 4465 + }, + { + "epoch": 0.13045381584707427, + "grad_norm": 1.2598946183709452, + "learning_rate": 4.9208738624121746e-05, + "loss": 0.714, + "step": 4470 + }, + { + "epoch": 0.13059973734131036, + "grad_norm": 1.1393077380814258, + "learning_rate": 4.9205881993950226e-05, + "loss": 0.6199, + "step": 4475 + }, + { + "epoch": 0.13074565883554648, + "grad_norm": 1.2068626428688078, + "learning_rate": 4.920302030915868e-05, + "loss": 0.7118, + "step": 4480 + }, + { + "epoch": 0.13089158032978257, + "grad_norm": 1.1276464526411303, + "learning_rate": 4.920015357041352e-05, + "loss": 0.6904, + "step": 4485 + }, + { + "epoch": 0.1310375018240187, + "grad_norm": 1.0378150446365662, + "learning_rate": 4.9197281778382294e-05, + "loss": 0.6299, + "step": 4490 + }, + { + "epoch": 0.13118342331825478, + "grad_norm": 1.1522706788153476, + "learning_rate": 4.9194404933733785e-05, + "loss": 0.6915, + "step": 4495 + }, + { + "epoch": 0.13132934481249087, + "grad_norm": 1.5649634900383422, + "learning_rate": 4.9191523037137896e-05, + "loss": 0.7416, + "step": 4500 + }, + { + "epoch": 0.13147526630672698, + "grad_norm": 0.9639275009163263, + "learning_rate": 4.918863608926575e-05, + "loss": 0.658, + "step": 4505 + }, + { + "epoch": 0.13162118780096307, + "grad_norm": 1.0803790708853265, + "learning_rate": 4.918574409078961e-05, + "loss": 0.7385, + "step": 4510 + }, + { + "epoch": 0.1317671092951992, + "grad_norm": 1.0431205640469672, + "learning_rate": 4.9182847042382935e-05, + "loss": 0.6494, + "step": 4515 + }, + { + "epoch": 0.13191303078943528, + "grad_norm": 1.2574540054935648, + "learning_rate": 4.917994494472036e-05, + "loss": 0.7031, + "step": 4520 + }, + { + "epoch": 0.13205895228367137, + "grad_norm": 1.2716559360632516, + "learning_rate": 4.91770377984777e-05, + "loss": 0.7143, + "step": 4525 + }, + { + "epoch": 0.1322048737779075, + "grad_norm": 1.2012888935894919, + "learning_rate": 4.917412560433192e-05, + "loss": 0.7083, + "step": 4530 + }, + { + "epoch": 0.13235079527214358, + "grad_norm": 1.1803619899103541, + "learning_rate": 4.9171208362961204e-05, + "loss": 0.699, + "step": 4535 + }, + { + "epoch": 0.1324967167663797, + "grad_norm": 1.0113819215671593, + "learning_rate": 4.916828607504486e-05, + "loss": 0.6507, + "step": 4540 + }, + { + "epoch": 0.13264263826061579, + "grad_norm": 1.24736339433036, + "learning_rate": 4.916535874126341e-05, + "loss": 0.6628, + "step": 4545 + }, + { + "epoch": 0.13278855975485188, + "grad_norm": 1.3041346693606257, + "learning_rate": 4.9162426362298536e-05, + "loss": 0.6616, + "step": 4550 + }, + { + "epoch": 0.132934481249088, + "grad_norm": 0.9813318283956239, + "learning_rate": 4.91594889388331e-05, + "loss": 0.6752, + "step": 4555 + }, + { + "epoch": 0.13308040274332408, + "grad_norm": 1.2641451451617216, + "learning_rate": 4.915654647155114e-05, + "loss": 0.6096, + "step": 4560 + }, + { + "epoch": 0.1332263242375602, + "grad_norm": 3.9537948455186704, + "learning_rate": 4.915359896113785e-05, + "loss": 0.5926, + "step": 4565 + }, + { + "epoch": 0.1333722457317963, + "grad_norm": 1.1490951671689527, + "learning_rate": 4.9150646408279634e-05, + "loss": 0.6715, + "step": 4570 + }, + { + "epoch": 0.1335181672260324, + "grad_norm": 1.5253736779489158, + "learning_rate": 4.914768881366403e-05, + "loss": 0.7057, + "step": 4575 + }, + { + "epoch": 0.1336640887202685, + "grad_norm": 0.9836469966393219, + "learning_rate": 4.9144726177979764e-05, + "loss": 0.6935, + "step": 4580 + }, + { + "epoch": 0.1338100102145046, + "grad_norm": 1.2595981973617496, + "learning_rate": 4.914175850191677e-05, + "loss": 0.6646, + "step": 4585 + }, + { + "epoch": 0.1339559317087407, + "grad_norm": 1.0990961231019638, + "learning_rate": 4.9138785786166105e-05, + "loss": 0.7039, + "step": 4590 + }, + { + "epoch": 0.1341018532029768, + "grad_norm": 1.066233568866453, + "learning_rate": 4.913580803142002e-05, + "loss": 0.7323, + "step": 4595 + }, + { + "epoch": 0.1342477746972129, + "grad_norm": 1.021845294965165, + "learning_rate": 4.913282523837195e-05, + "loss": 0.7186, + "step": 4600 + }, + { + "epoch": 0.134393696191449, + "grad_norm": 1.0648379183111054, + "learning_rate": 4.912983740771649e-05, + "loss": 0.6623, + "step": 4605 + }, + { + "epoch": 0.1345396176856851, + "grad_norm": 1.119953961319831, + "learning_rate": 4.912684454014942e-05, + "loss": 0.6414, + "step": 4610 + }, + { + "epoch": 0.1346855391799212, + "grad_norm": 1.070167813054844, + "learning_rate": 4.912384663636768e-05, + "loss": 0.6872, + "step": 4615 + }, + { + "epoch": 0.1348314606741573, + "grad_norm": 0.9937951025153453, + "learning_rate": 4.9120843697069366e-05, + "loss": 0.6578, + "step": 4620 + }, + { + "epoch": 0.13497738216839342, + "grad_norm": 1.150098588548831, + "learning_rate": 4.9117835722953817e-05, + "loss": 0.6992, + "step": 4625 + }, + { + "epoch": 0.1351233036626295, + "grad_norm": 0.9620579969205204, + "learning_rate": 4.911482271472145e-05, + "loss": 0.6356, + "step": 4630 + }, + { + "epoch": 0.1352692251568656, + "grad_norm": 0.9772741486828223, + "learning_rate": 4.911180467307392e-05, + "loss": 0.6842, + "step": 4635 + }, + { + "epoch": 0.13541514665110171, + "grad_norm": 1.0244177126704055, + "learning_rate": 4.910878159871403e-05, + "loss": 0.6888, + "step": 4640 + }, + { + "epoch": 0.1355610681453378, + "grad_norm": 1.032799435304836, + "learning_rate": 4.9105753492345766e-05, + "loss": 0.7135, + "step": 4645 + }, + { + "epoch": 0.13570698963957392, + "grad_norm": 1.070007781509982, + "learning_rate": 4.910272035467427e-05, + "loss": 0.7026, + "step": 4650 + }, + { + "epoch": 0.13585291113381, + "grad_norm": 1.271958715442886, + "learning_rate": 4.909968218640588e-05, + "loss": 0.6879, + "step": 4655 + }, + { + "epoch": 0.1359988326280461, + "grad_norm": 0.9339405685144769, + "learning_rate": 4.909663898824807e-05, + "loss": 0.6276, + "step": 4660 + }, + { + "epoch": 0.13614475412228222, + "grad_norm": 1.0887017892321462, + "learning_rate": 4.909359076090952e-05, + "loss": 0.7068, + "step": 4665 + }, + { + "epoch": 0.1362906756165183, + "grad_norm": 1.1379308712176395, + "learning_rate": 4.909053750510005e-05, + "loss": 0.6944, + "step": 4670 + }, + { + "epoch": 0.13643659711075443, + "grad_norm": 1.1635600427634953, + "learning_rate": 4.9087479221530696e-05, + "loss": 0.7056, + "step": 4675 + }, + { + "epoch": 0.13658251860499052, + "grad_norm": 1.1933753832724734, + "learning_rate": 4.908441591091362e-05, + "loss": 0.6832, + "step": 4680 + }, + { + "epoch": 0.1367284400992266, + "grad_norm": 1.065601768601014, + "learning_rate": 4.908134757396216e-05, + "loss": 0.6863, + "step": 4685 + }, + { + "epoch": 0.13687436159346272, + "grad_norm": 1.3104649809246165, + "learning_rate": 4.907827421139085e-05, + "loss": 0.7004, + "step": 4690 + }, + { + "epoch": 0.1370202830876988, + "grad_norm": 1.2607542895960762, + "learning_rate": 4.907519582391538e-05, + "loss": 0.6578, + "step": 4695 + }, + { + "epoch": 0.13716620458193493, + "grad_norm": 1.3972807316000297, + "learning_rate": 4.90721124122526e-05, + "loss": 0.7043, + "step": 4700 + }, + { + "epoch": 0.13731212607617102, + "grad_norm": 1.345064497649985, + "learning_rate": 4.9069023977120555e-05, + "loss": 0.665, + "step": 4705 + }, + { + "epoch": 0.1374580475704071, + "grad_norm": 1.168170781570877, + "learning_rate": 4.906593051923843e-05, + "loss": 0.6474, + "step": 4710 + }, + { + "epoch": 0.13760396906464323, + "grad_norm": 1.2861586385641843, + "learning_rate": 4.90628320393266e-05, + "loss": 0.6904, + "step": 4715 + }, + { + "epoch": 0.13774989055887932, + "grad_norm": 1.128370005126937, + "learning_rate": 4.9059728538106605e-05, + "loss": 0.65, + "step": 4720 + }, + { + "epoch": 0.13789581205311544, + "grad_norm": 1.196672964789622, + "learning_rate": 4.905662001630116e-05, + "loss": 0.7126, + "step": 4725 + }, + { + "epoch": 0.13804173354735153, + "grad_norm": 1.2021832701329431, + "learning_rate": 4.905350647463412e-05, + "loss": 0.6709, + "step": 4730 + }, + { + "epoch": 0.13818765504158761, + "grad_norm": 0.9824054962661549, + "learning_rate": 4.905038791383056e-05, + "loss": 0.6711, + "step": 4735 + }, + { + "epoch": 0.13833357653582373, + "grad_norm": 0.9710205271656333, + "learning_rate": 4.904726433461668e-05, + "loss": 0.6784, + "step": 4740 + }, + { + "epoch": 0.13847949803005982, + "grad_norm": 1.3081974111966406, + "learning_rate": 4.904413573771987e-05, + "loss": 0.6411, + "step": 4745 + }, + { + "epoch": 0.13862541952429594, + "grad_norm": 1.2014268435934012, + "learning_rate": 4.904100212386867e-05, + "loss": 0.6922, + "step": 4750 + }, + { + "epoch": 0.13877134101853203, + "grad_norm": 1.149626759325989, + "learning_rate": 4.9037863493792804e-05, + "loss": 0.6724, + "step": 4755 + }, + { + "epoch": 0.13891726251276812, + "grad_norm": 1.0969383172941878, + "learning_rate": 4.9034719848223176e-05, + "loss": 0.5979, + "step": 4760 + }, + { + "epoch": 0.13906318400700424, + "grad_norm": 1.3775594643994746, + "learning_rate": 4.9031571187891824e-05, + "loss": 0.7088, + "step": 4765 + }, + { + "epoch": 0.13920910550124033, + "grad_norm": 1.0982138184363692, + "learning_rate": 4.902841751353198e-05, + "loss": 0.586, + "step": 4770 + }, + { + "epoch": 0.13935502699547644, + "grad_norm": 1.287348970584067, + "learning_rate": 4.902525882587804e-05, + "loss": 0.6989, + "step": 4775 + }, + { + "epoch": 0.13950094848971253, + "grad_norm": 1.158895061664004, + "learning_rate": 4.9022095125665556e-05, + "loss": 0.76, + "step": 4780 + }, + { + "epoch": 0.13964686998394862, + "grad_norm": 1.0175963904056657, + "learning_rate": 4.9018926413631264e-05, + "loss": 0.671, + "step": 4785 + }, + { + "epoch": 0.13979279147818474, + "grad_norm": 1.0738061674913413, + "learning_rate": 4.901575269051304e-05, + "loss": 0.6443, + "step": 4790 + }, + { + "epoch": 0.13993871297242083, + "grad_norm": 1.2007880769927555, + "learning_rate": 4.901257395704996e-05, + "loss": 0.6912, + "step": 4795 + }, + { + "epoch": 0.14008463446665695, + "grad_norm": 1.1057567414366558, + "learning_rate": 4.900939021398225e-05, + "loss": 0.6735, + "step": 4800 + }, + { + "epoch": 0.14023055596089304, + "grad_norm": 0.9618347954465882, + "learning_rate": 4.9006201462051304e-05, + "loss": 0.6853, + "step": 4805 + }, + { + "epoch": 0.14037647745512913, + "grad_norm": 1.146186579488446, + "learning_rate": 4.900300770199968e-05, + "loss": 0.6854, + "step": 4810 + }, + { + "epoch": 0.14052239894936525, + "grad_norm": 1.2796031039217017, + "learning_rate": 4.8999808934571105e-05, + "loss": 0.5789, + "step": 4815 + }, + { + "epoch": 0.14066832044360134, + "grad_norm": 0.9250417270724687, + "learning_rate": 4.899660516051047e-05, + "loss": 0.7052, + "step": 4820 + }, + { + "epoch": 0.14081424193783745, + "grad_norm": 1.1235962312884695, + "learning_rate": 4.899339638056383e-05, + "loss": 0.7218, + "step": 4825 + }, + { + "epoch": 0.14096016343207354, + "grad_norm": 1.3728019715463347, + "learning_rate": 4.899018259547842e-05, + "loss": 0.6882, + "step": 4830 + }, + { + "epoch": 0.14110608492630963, + "grad_norm": 1.2141369863368308, + "learning_rate": 4.898696380600263e-05, + "loss": 0.6143, + "step": 4835 + }, + { + "epoch": 0.14125200642054575, + "grad_norm": 1.0633781600946668, + "learning_rate": 4.8983740012886006e-05, + "loss": 0.6758, + "step": 4840 + }, + { + "epoch": 0.14139792791478184, + "grad_norm": 1.1798611618248072, + "learning_rate": 4.898051121687927e-05, + "loss": 0.6439, + "step": 4845 + }, + { + "epoch": 0.14154384940901796, + "grad_norm": 1.182610569361954, + "learning_rate": 4.897727741873431e-05, + "loss": 0.5814, + "step": 4850 + }, + { + "epoch": 0.14168977090325405, + "grad_norm": 1.1453313508660983, + "learning_rate": 4.897403861920417e-05, + "loss": 0.6654, + "step": 4855 + }, + { + "epoch": 0.14183569239749014, + "grad_norm": 1.2003229203678043, + "learning_rate": 4.897079481904308e-05, + "loss": 0.7414, + "step": 4860 + }, + { + "epoch": 0.14198161389172625, + "grad_norm": 0.9859090190617696, + "learning_rate": 4.896754601900641e-05, + "loss": 0.6386, + "step": 4865 + }, + { + "epoch": 0.14212753538596234, + "grad_norm": 1.2551714302410668, + "learning_rate": 4.89642922198507e-05, + "loss": 0.682, + "step": 4870 + }, + { + "epoch": 0.14227345688019846, + "grad_norm": 1.0549960492077843, + "learning_rate": 4.896103342233366e-05, + "loss": 0.6495, + "step": 4875 + }, + { + "epoch": 0.14241937837443455, + "grad_norm": 1.163727737321146, + "learning_rate": 4.8957769627214165e-05, + "loss": 0.6879, + "step": 4880 + }, + { + "epoch": 0.14256529986867064, + "grad_norm": 1.1715697763094626, + "learning_rate": 4.895450083525225e-05, + "loss": 0.6547, + "step": 4885 + }, + { + "epoch": 0.14271122136290676, + "grad_norm": 1.2938395441259627, + "learning_rate": 4.895122704720912e-05, + "loss": 0.788, + "step": 4890 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.9947646306332976, + "learning_rate": 4.894794826384713e-05, + "loss": 0.6386, + "step": 4895 + }, + { + "epoch": 0.14300306435137897, + "grad_norm": 1.1393631171633438, + "learning_rate": 4.89446644859298e-05, + "loss": 0.7044, + "step": 4900 + }, + { + "epoch": 0.14314898584561506, + "grad_norm": 1.0325043364535382, + "learning_rate": 4.894137571422183e-05, + "loss": 0.6716, + "step": 4905 + }, + { + "epoch": 0.14329490733985115, + "grad_norm": 1.0884063442719651, + "learning_rate": 4.8938081949489076e-05, + "loss": 0.7032, + "step": 4910 + }, + { + "epoch": 0.14344082883408726, + "grad_norm": 1.0482144588280986, + "learning_rate": 4.893478319249854e-05, + "loss": 0.6732, + "step": 4915 + }, + { + "epoch": 0.14358675032832335, + "grad_norm": 1.1021930031427787, + "learning_rate": 4.8931479444018405e-05, + "loss": 0.6692, + "step": 4920 + }, + { + "epoch": 0.14373267182255947, + "grad_norm": 0.9880131115060304, + "learning_rate": 4.8928170704818e-05, + "loss": 0.6224, + "step": 4925 + }, + { + "epoch": 0.14387859331679556, + "grad_norm": 1.132437443518024, + "learning_rate": 4.8924856975667846e-05, + "loss": 0.6935, + "step": 4930 + }, + { + "epoch": 0.14402451481103168, + "grad_norm": 1.1129437648839622, + "learning_rate": 4.8921538257339593e-05, + "loss": 0.6585, + "step": 4935 + }, + { + "epoch": 0.14417043630526777, + "grad_norm": 0.9815954620433064, + "learning_rate": 4.891821455060607e-05, + "loss": 0.7098, + "step": 4940 + }, + { + "epoch": 0.14431635779950386, + "grad_norm": 1.059684768469752, + "learning_rate": 4.8914885856241267e-05, + "loss": 0.6469, + "step": 4945 + }, + { + "epoch": 0.14446227929373998, + "grad_norm": 1.0538157876388539, + "learning_rate": 4.8911552175020334e-05, + "loss": 0.691, + "step": 4950 + }, + { + "epoch": 0.14460820078797607, + "grad_norm": 1.03665936643074, + "learning_rate": 4.8908213507719574e-05, + "loss": 0.6919, + "step": 4955 + }, + { + "epoch": 0.14475412228221218, + "grad_norm": 1.1438242333865547, + "learning_rate": 4.890486985511646e-05, + "loss": 0.6259, + "step": 4960 + }, + { + "epoch": 0.14490004377644827, + "grad_norm": 1.299769319417847, + "learning_rate": 4.8901521217989624e-05, + "loss": 0.705, + "step": 4965 + }, + { + "epoch": 0.14504596527068436, + "grad_norm": 1.0054967292808565, + "learning_rate": 4.889816759711887e-05, + "loss": 0.7142, + "step": 4970 + }, + { + "epoch": 0.14519188676492048, + "grad_norm": 1.1293320400197682, + "learning_rate": 4.889480899328513e-05, + "loss": 0.6492, + "step": 4975 + }, + { + "epoch": 0.14533780825915657, + "grad_norm": 1.0636561751639835, + "learning_rate": 4.889144540727054e-05, + "loss": 0.6319, + "step": 4980 + }, + { + "epoch": 0.1454837297533927, + "grad_norm": 1.0355254899477946, + "learning_rate": 4.888807683985836e-05, + "loss": 0.6272, + "step": 4985 + }, + { + "epoch": 0.14562965124762878, + "grad_norm": 0.8114873826693616, + "learning_rate": 4.8884703291833016e-05, + "loss": 0.6872, + "step": 4990 + }, + { + "epoch": 0.14577557274186487, + "grad_norm": 1.1570324957219593, + "learning_rate": 4.888132476398012e-05, + "loss": 0.7216, + "step": 4995 + }, + { + "epoch": 0.14592149423610098, + "grad_norm": 1.0605098990256272, + "learning_rate": 4.8877941257086426e-05, + "loss": 0.6517, + "step": 5000 + }, + { + "epoch": 0.14606741573033707, + "grad_norm": 1.0447378828772196, + "learning_rate": 4.887455277193983e-05, + "loss": 0.685, + "step": 5005 + }, + { + "epoch": 0.1462133372245732, + "grad_norm": 1.1086047539944173, + "learning_rate": 4.887115930932941e-05, + "loss": 0.6356, + "step": 5010 + }, + { + "epoch": 0.14635925871880928, + "grad_norm": 1.1572395217436067, + "learning_rate": 4.8867760870045407e-05, + "loss": 0.6728, + "step": 5015 + }, + { + "epoch": 0.14650518021304537, + "grad_norm": 1.161251114302908, + "learning_rate": 4.886435745487919e-05, + "loss": 0.6273, + "step": 5020 + }, + { + "epoch": 0.1466511017072815, + "grad_norm": 1.1784241428774225, + "learning_rate": 4.8860949064623334e-05, + "loss": 0.6599, + "step": 5025 + }, + { + "epoch": 0.14679702320151758, + "grad_norm": 1.1036218345992077, + "learning_rate": 4.885753570007153e-05, + "loss": 0.6713, + "step": 5030 + }, + { + "epoch": 0.1469429446957537, + "grad_norm": 1.2354273027950193, + "learning_rate": 4.8854117362018646e-05, + "loss": 0.6909, + "step": 5035 + }, + { + "epoch": 0.1470888661899898, + "grad_norm": 0.9618966191704894, + "learning_rate": 4.8850694051260706e-05, + "loss": 0.5868, + "step": 5040 + }, + { + "epoch": 0.14723478768422588, + "grad_norm": 1.1227826210418184, + "learning_rate": 4.8847265768594904e-05, + "loss": 0.7109, + "step": 5045 + }, + { + "epoch": 0.147380709178462, + "grad_norm": 1.0305454471962345, + "learning_rate": 4.884383251481956e-05, + "loss": 0.7081, + "step": 5050 + }, + { + "epoch": 0.14752663067269808, + "grad_norm": 0.9091481542770193, + "learning_rate": 4.8840394290734184e-05, + "loss": 0.6882, + "step": 5055 + }, + { + "epoch": 0.1476725521669342, + "grad_norm": 1.2964403866327527, + "learning_rate": 4.8836951097139425e-05, + "loss": 0.6731, + "step": 5060 + }, + { + "epoch": 0.1478184736611703, + "grad_norm": 1.294351175807002, + "learning_rate": 4.88335029348371e-05, + "loss": 0.7685, + "step": 5065 + }, + { + "epoch": 0.14796439515540638, + "grad_norm": 1.0056899358987934, + "learning_rate": 4.8830049804630165e-05, + "loss": 0.6663, + "step": 5070 + }, + { + "epoch": 0.1481103166496425, + "grad_norm": 1.2437689987091818, + "learning_rate": 4.8826591707322763e-05, + "loss": 0.6802, + "step": 5075 + }, + { + "epoch": 0.1482562381438786, + "grad_norm": 1.5587137635670796, + "learning_rate": 4.8823128643720164e-05, + "loss": 0.6358, + "step": 5080 + }, + { + "epoch": 0.1484021596381147, + "grad_norm": 1.2639662143413182, + "learning_rate": 4.881966061462882e-05, + "loss": 0.6509, + "step": 5085 + }, + { + "epoch": 0.1485480811323508, + "grad_norm": 1.277369191959652, + "learning_rate": 4.881618762085631e-05, + "loss": 0.698, + "step": 5090 + }, + { + "epoch": 0.14869400262658689, + "grad_norm": 1.0087963830058446, + "learning_rate": 4.88127096632114e-05, + "loss": 0.6757, + "step": 5095 + }, + { + "epoch": 0.148839924120823, + "grad_norm": 1.0282290704085635, + "learning_rate": 4.8809226742504e-05, + "loss": 0.7144, + "step": 5100 + }, + { + "epoch": 0.1489858456150591, + "grad_norm": 1.1488234619012705, + "learning_rate": 4.8805738859545145e-05, + "loss": 0.6593, + "step": 5105 + }, + { + "epoch": 0.1491317671092952, + "grad_norm": 1.1336513346242925, + "learning_rate": 4.880224601514709e-05, + "loss": 0.6366, + "step": 5110 + }, + { + "epoch": 0.1492776886035313, + "grad_norm": 1.2423066341851585, + "learning_rate": 4.879874821012318e-05, + "loss": 0.6618, + "step": 5115 + }, + { + "epoch": 0.1494236100977674, + "grad_norm": 1.0125245719137783, + "learning_rate": 4.879524544528797e-05, + "loss": 0.6662, + "step": 5120 + }, + { + "epoch": 0.1495695315920035, + "grad_norm": 1.0921171765322315, + "learning_rate": 4.879173772145712e-05, + "loss": 0.7018, + "step": 5125 + }, + { + "epoch": 0.1497154530862396, + "grad_norm": 1.0515883225222553, + "learning_rate": 4.878822503944748e-05, + "loss": 0.6242, + "step": 5130 + }, + { + "epoch": 0.14986137458047571, + "grad_norm": 1.077422894481393, + "learning_rate": 4.8784707400077046e-05, + "loss": 0.6864, + "step": 5135 + }, + { + "epoch": 0.1500072960747118, + "grad_norm": 1.0463109059324265, + "learning_rate": 4.878118480416496e-05, + "loss": 0.7291, + "step": 5140 + }, + { + "epoch": 0.1501532175689479, + "grad_norm": 1.1686528462841, + "learning_rate": 4.877765725253153e-05, + "loss": 0.6392, + "step": 5145 + }, + { + "epoch": 0.150299139063184, + "grad_norm": 1.1830796683410716, + "learning_rate": 4.877412474599822e-05, + "loss": 0.7308, + "step": 5150 + }, + { + "epoch": 0.1504450605574201, + "grad_norm": 1.0811186690939443, + "learning_rate": 4.8770587285387614e-05, + "loss": 0.6596, + "step": 5155 + }, + { + "epoch": 0.15059098205165622, + "grad_norm": 1.1366921499180116, + "learning_rate": 4.876704487152349e-05, + "loss": 0.6531, + "step": 5160 + }, + { + "epoch": 0.1507369035458923, + "grad_norm": 0.9852321412385228, + "learning_rate": 4.876349750523078e-05, + "loss": 0.6501, + "step": 5165 + }, + { + "epoch": 0.1508828250401284, + "grad_norm": 1.0925438342031004, + "learning_rate": 4.8759945187335536e-05, + "loss": 0.7011, + "step": 5170 + }, + { + "epoch": 0.15102874653436452, + "grad_norm": 1.1415468833364038, + "learning_rate": 4.875638791866499e-05, + "loss": 0.6853, + "step": 5175 + }, + { + "epoch": 0.1511746680286006, + "grad_norm": 1.1206948253508857, + "learning_rate": 4.875282570004752e-05, + "loss": 0.6391, + "step": 5180 + }, + { + "epoch": 0.15132058952283672, + "grad_norm": 1.179685788984038, + "learning_rate": 4.8749258532312634e-05, + "loss": 0.7173, + "step": 5185 + }, + { + "epoch": 0.1514665110170728, + "grad_norm": 1.1748425401073708, + "learning_rate": 4.874568641629105e-05, + "loss": 0.6443, + "step": 5190 + }, + { + "epoch": 0.1516124325113089, + "grad_norm": 0.9436392998085367, + "learning_rate": 4.8742109352814566e-05, + "loss": 0.6405, + "step": 5195 + }, + { + "epoch": 0.15175835400554502, + "grad_norm": 1.1083561088494678, + "learning_rate": 4.873852734271619e-05, + "loss": 0.6832, + "step": 5200 + }, + { + "epoch": 0.1519042754997811, + "grad_norm": 1.1630036048375099, + "learning_rate": 4.8734940386830056e-05, + "loss": 0.6366, + "step": 5205 + }, + { + "epoch": 0.15205019699401723, + "grad_norm": 1.2124648252002201, + "learning_rate": 4.873134848599146e-05, + "loss": 0.6641, + "step": 5210 + }, + { + "epoch": 0.15219611848825332, + "grad_norm": 1.0871915424942498, + "learning_rate": 4.872775164103683e-05, + "loss": 0.631, + "step": 5215 + }, + { + "epoch": 0.1523420399824894, + "grad_norm": 1.209576301952038, + "learning_rate": 4.872414985280375e-05, + "loss": 0.71, + "step": 5220 + }, + { + "epoch": 0.15248796147672553, + "grad_norm": 1.1416729248026865, + "learning_rate": 4.872054312213099e-05, + "loss": 0.6771, + "step": 5225 + }, + { + "epoch": 0.15263388297096162, + "grad_norm": 1.0748770372325782, + "learning_rate": 4.8716931449858425e-05, + "loss": 0.6954, + "step": 5230 + }, + { + "epoch": 0.15277980446519773, + "grad_norm": 1.0393095883770218, + "learning_rate": 4.871331483682712e-05, + "loss": 0.6499, + "step": 5235 + }, + { + "epoch": 0.15292572595943382, + "grad_norm": 1.2908996352090831, + "learning_rate": 4.870969328387925e-05, + "loss": 0.6323, + "step": 5240 + }, + { + "epoch": 0.1530716474536699, + "grad_norm": 1.2845316508047322, + "learning_rate": 4.870606679185816e-05, + "loss": 0.6603, + "step": 5245 + }, + { + "epoch": 0.15321756894790603, + "grad_norm": 1.2062113371983205, + "learning_rate": 4.870243536160837e-05, + "loss": 0.7101, + "step": 5250 + }, + { + "epoch": 0.15336349044214212, + "grad_norm": 1.1831901751159792, + "learning_rate": 4.869879899397551e-05, + "loss": 0.6826, + "step": 5255 + }, + { + "epoch": 0.15350941193637824, + "grad_norm": 1.1160114001769645, + "learning_rate": 4.8695157689806376e-05, + "loss": 0.6613, + "step": 5260 + }, + { + "epoch": 0.15365533343061433, + "grad_norm": 1.1258650726921502, + "learning_rate": 4.869151144994891e-05, + "loss": 0.6861, + "step": 5265 + }, + { + "epoch": 0.15380125492485042, + "grad_norm": 0.9280687045307197, + "learning_rate": 4.8687860275252215e-05, + "loss": 0.6183, + "step": 5270 + }, + { + "epoch": 0.15394717641908653, + "grad_norm": 0.9064983444339252, + "learning_rate": 4.8684204166566544e-05, + "loss": 0.5915, + "step": 5275 + }, + { + "epoch": 0.15409309791332262, + "grad_norm": 1.0988152905414788, + "learning_rate": 4.868054312474327e-05, + "loss": 0.6409, + "step": 5280 + }, + { + "epoch": 0.15423901940755874, + "grad_norm": 1.1223331190394121, + "learning_rate": 4.867687715063495e-05, + "loss": 0.6541, + "step": 5285 + }, + { + "epoch": 0.15438494090179483, + "grad_norm": 1.008787848856111, + "learning_rate": 4.867320624509526e-05, + "loss": 0.6431, + "step": 5290 + }, + { + "epoch": 0.15453086239603095, + "grad_norm": 1.2080754350677765, + "learning_rate": 4.866953040897905e-05, + "loss": 0.6512, + "step": 5295 + }, + { + "epoch": 0.15467678389026704, + "grad_norm": 1.07179898974179, + "learning_rate": 4.8665849643142317e-05, + "loss": 0.6637, + "step": 5300 + }, + { + "epoch": 0.15482270538450313, + "grad_norm": 1.1328167409438716, + "learning_rate": 4.866216394844217e-05, + "loss": 0.714, + "step": 5305 + }, + { + "epoch": 0.15496862687873925, + "grad_norm": 0.9425023546525281, + "learning_rate": 4.865847332573691e-05, + "loss": 0.6911, + "step": 5310 + }, + { + "epoch": 0.15511454837297534, + "grad_norm": 1.0125632535244773, + "learning_rate": 4.8654777775885965e-05, + "loss": 0.6659, + "step": 5315 + }, + { + "epoch": 0.15526046986721145, + "grad_norm": 1.19689588460288, + "learning_rate": 4.8651077299749906e-05, + "loss": 0.6954, + "step": 5320 + }, + { + "epoch": 0.15540639136144754, + "grad_norm": 1.0873149093220642, + "learning_rate": 4.864737189819046e-05, + "loss": 0.6731, + "step": 5325 + }, + { + "epoch": 0.15555231285568363, + "grad_norm": 1.0397964420372123, + "learning_rate": 4.864366157207052e-05, + "loss": 0.7068, + "step": 5330 + }, + { + "epoch": 0.15569823434991975, + "grad_norm": 1.148051255288061, + "learning_rate": 4.863994632225407e-05, + "loss": 0.629, + "step": 5335 + }, + { + "epoch": 0.15584415584415584, + "grad_norm": 0.9748830282118187, + "learning_rate": 4.86362261496063e-05, + "loss": 0.7136, + "step": 5340 + }, + { + "epoch": 0.15599007733839196, + "grad_norm": 1.04018221273993, + "learning_rate": 4.863250105499351e-05, + "loss": 0.7435, + "step": 5345 + }, + { + "epoch": 0.15613599883262805, + "grad_norm": 1.0753301489715088, + "learning_rate": 4.8628771039283164e-05, + "loss": 0.691, + "step": 5350 + }, + { + "epoch": 0.15628192032686414, + "grad_norm": 0.9987315263637163, + "learning_rate": 4.862503610334386e-05, + "loss": 0.6576, + "step": 5355 + }, + { + "epoch": 0.15642784182110026, + "grad_norm": 1.110055366325937, + "learning_rate": 4.8621296248045356e-05, + "loss": 0.6804, + "step": 5360 + }, + { + "epoch": 0.15657376331533635, + "grad_norm": 1.0318457910467402, + "learning_rate": 4.861755147425854e-05, + "loss": 0.6495, + "step": 5365 + }, + { + "epoch": 0.15671968480957246, + "grad_norm": 1.3122231548784538, + "learning_rate": 4.861380178285545e-05, + "loss": 0.7177, + "step": 5370 + }, + { + "epoch": 0.15686560630380855, + "grad_norm": 1.086020519330073, + "learning_rate": 4.8610047174709286e-05, + "loss": 0.6827, + "step": 5375 + }, + { + "epoch": 0.15701152779804464, + "grad_norm": 1.1448770673103832, + "learning_rate": 4.860628765069436e-05, + "loss": 0.7152, + "step": 5380 + }, + { + "epoch": 0.15715744929228076, + "grad_norm": 1.3072081786400245, + "learning_rate": 4.860252321168616e-05, + "loss": 0.7188, + "step": 5385 + }, + { + "epoch": 0.15730337078651685, + "grad_norm": 1.199245824404477, + "learning_rate": 4.859875385856131e-05, + "loss": 0.6852, + "step": 5390 + }, + { + "epoch": 0.15744929228075297, + "grad_norm": 0.9864940431883602, + "learning_rate": 4.8594979592197556e-05, + "loss": 0.627, + "step": 5395 + }, + { + "epoch": 0.15759521377498906, + "grad_norm": 1.3693859584373105, + "learning_rate": 4.859120041347381e-05, + "loss": 0.6847, + "step": 5400 + }, + { + "epoch": 0.15774113526922515, + "grad_norm": 1.1882215755116146, + "learning_rate": 4.8587416323270143e-05, + "loss": 0.6731, + "step": 5405 + }, + { + "epoch": 0.15788705676346126, + "grad_norm": 1.1036861819893564, + "learning_rate": 4.858362732246774e-05, + "loss": 0.6396, + "step": 5410 + }, + { + "epoch": 0.15803297825769735, + "grad_norm": 0.939339364635614, + "learning_rate": 4.857983341194894e-05, + "loss": 0.6229, + "step": 5415 + }, + { + "epoch": 0.15817889975193347, + "grad_norm": 1.091683234533646, + "learning_rate": 4.857603459259722e-05, + "loss": 0.6843, + "step": 5420 + }, + { + "epoch": 0.15832482124616956, + "grad_norm": 1.182683675277485, + "learning_rate": 4.857223086529721e-05, + "loss": 0.678, + "step": 5425 + }, + { + "epoch": 0.15847074274040565, + "grad_norm": 1.0125636639076858, + "learning_rate": 4.856842223093469e-05, + "loss": 0.6581, + "step": 5430 + }, + { + "epoch": 0.15861666423464177, + "grad_norm": 1.1544716654081766, + "learning_rate": 4.856460869039656e-05, + "loss": 0.6591, + "step": 5435 + }, + { + "epoch": 0.15876258572887786, + "grad_norm": 1.0557542586450097, + "learning_rate": 4.856079024457087e-05, + "loss": 0.7035, + "step": 5440 + }, + { + "epoch": 0.15890850722311398, + "grad_norm": 1.068501828389954, + "learning_rate": 4.8556966894346835e-05, + "loss": 0.6997, + "step": 5445 + }, + { + "epoch": 0.15905442871735007, + "grad_norm": 1.2898699377901977, + "learning_rate": 4.855313864061478e-05, + "loss": 0.7242, + "step": 5450 + }, + { + "epoch": 0.15920035021158616, + "grad_norm": 0.9952796665767981, + "learning_rate": 4.854930548426617e-05, + "loss": 0.6669, + "step": 5455 + }, + { + "epoch": 0.15934627170582227, + "grad_norm": 1.0317824180744029, + "learning_rate": 4.854546742619366e-05, + "loss": 0.6626, + "step": 5460 + }, + { + "epoch": 0.15949219320005836, + "grad_norm": 1.2959033115840224, + "learning_rate": 4.854162446729099e-05, + "loss": 0.6843, + "step": 5465 + }, + { + "epoch": 0.15963811469429448, + "grad_norm": 0.9682518497042534, + "learning_rate": 4.8537776608453076e-05, + "loss": 0.6969, + "step": 5470 + }, + { + "epoch": 0.15978403618853057, + "grad_norm": 1.152364469758684, + "learning_rate": 4.853392385057596e-05, + "loss": 0.7765, + "step": 5475 + }, + { + "epoch": 0.15992995768276666, + "grad_norm": 1.2953412871187777, + "learning_rate": 4.8530066194556826e-05, + "loss": 0.7051, + "step": 5480 + }, + { + "epoch": 0.16007587917700278, + "grad_norm": 1.13569127606471, + "learning_rate": 4.852620364129401e-05, + "loss": 0.614, + "step": 5485 + }, + { + "epoch": 0.16022180067123887, + "grad_norm": 1.089300887087154, + "learning_rate": 4.852233619168697e-05, + "loss": 0.6372, + "step": 5490 + }, + { + "epoch": 0.16036772216547499, + "grad_norm": 1.0636244357901254, + "learning_rate": 4.851846384663631e-05, + "loss": 0.6333, + "step": 5495 + }, + { + "epoch": 0.16051364365971107, + "grad_norm": 1.1966686080424564, + "learning_rate": 4.85145866070438e-05, + "loss": 0.6535, + "step": 5500 + }, + { + "epoch": 0.16065956515394716, + "grad_norm": 1.2361775717794952, + "learning_rate": 4.851070447381231e-05, + "loss": 0.6928, + "step": 5505 + }, + { + "epoch": 0.16080548664818328, + "grad_norm": 1.0366677003504112, + "learning_rate": 4.850681744784586e-05, + "loss": 0.6936, + "step": 5510 + }, + { + "epoch": 0.16095140814241937, + "grad_norm": 0.9784890372392392, + "learning_rate": 4.8502925530049645e-05, + "loss": 0.6742, + "step": 5515 + }, + { + "epoch": 0.1610973296366555, + "grad_norm": 1.1623953035476973, + "learning_rate": 4.849902872132994e-05, + "loss": 0.6962, + "step": 5520 + }, + { + "epoch": 0.16124325113089158, + "grad_norm": 0.9920771397838775, + "learning_rate": 4.84951270225942e-05, + "loss": 0.6755, + "step": 5525 + }, + { + "epoch": 0.16138917262512767, + "grad_norm": 1.0360150273961153, + "learning_rate": 4.849122043475101e-05, + "loss": 0.646, + "step": 5530 + }, + { + "epoch": 0.1615350941193638, + "grad_norm": 1.0842899779982063, + "learning_rate": 4.8487308958710095e-05, + "loss": 0.7225, + "step": 5535 + }, + { + "epoch": 0.16168101561359988, + "grad_norm": 0.9934102350266147, + "learning_rate": 4.848339259538232e-05, + "loss": 0.6545, + "step": 5540 + }, + { + "epoch": 0.161826937107836, + "grad_norm": 1.0007913908525419, + "learning_rate": 4.8479471345679665e-05, + "loss": 0.6836, + "step": 5545 + }, + { + "epoch": 0.16197285860207208, + "grad_norm": 0.9987884105604277, + "learning_rate": 4.847554521051529e-05, + "loss": 0.6396, + "step": 5550 + }, + { + "epoch": 0.16211878009630817, + "grad_norm": 1.1616065848634984, + "learning_rate": 4.8471614190803445e-05, + "loss": 0.6765, + "step": 5555 + }, + { + "epoch": 0.1622647015905443, + "grad_norm": 1.0277702503069528, + "learning_rate": 4.846767828745956e-05, + "loss": 0.6814, + "step": 5560 + }, + { + "epoch": 0.16241062308478038, + "grad_norm": 0.940727128258496, + "learning_rate": 4.846373750140017e-05, + "loss": 0.6683, + "step": 5565 + }, + { + "epoch": 0.1625565445790165, + "grad_norm": 1.1316840728421842, + "learning_rate": 4.8459791833542974e-05, + "loss": 0.7343, + "step": 5570 + }, + { + "epoch": 0.1627024660732526, + "grad_norm": 1.1362685968542987, + "learning_rate": 4.845584128480678e-05, + "loss": 0.7035, + "step": 5575 + }, + { + "epoch": 0.16284838756748868, + "grad_norm": 1.1619185968850532, + "learning_rate": 4.845188585611156e-05, + "loss": 0.6272, + "step": 5580 + }, + { + "epoch": 0.1629943090617248, + "grad_norm": 1.1493295018543637, + "learning_rate": 4.84479255483784e-05, + "loss": 0.6721, + "step": 5585 + }, + { + "epoch": 0.16314023055596089, + "grad_norm": 1.2637264689619623, + "learning_rate": 4.844396036252954e-05, + "loss": 0.6259, + "step": 5590 + }, + { + "epoch": 0.163286152050197, + "grad_norm": 1.0338309243815138, + "learning_rate": 4.843999029948834e-05, + "loss": 0.6724, + "step": 5595 + }, + { + "epoch": 0.1634320735444331, + "grad_norm": 1.142890497919786, + "learning_rate": 4.84360153601793e-05, + "loss": 0.711, + "step": 5600 + }, + { + "epoch": 0.16357799503866918, + "grad_norm": 1.199686987301231, + "learning_rate": 4.843203554552808e-05, + "loss": 0.7436, + "step": 5605 + }, + { + "epoch": 0.1637239165329053, + "grad_norm": 1.1146011661956983, + "learning_rate": 4.8428050856461435e-05, + "loss": 0.6477, + "step": 5610 + }, + { + "epoch": 0.1638698380271414, + "grad_norm": 1.012823412350095, + "learning_rate": 4.842406129390728e-05, + "loss": 0.6844, + "step": 5615 + }, + { + "epoch": 0.1640157595213775, + "grad_norm": 1.0118054412475699, + "learning_rate": 4.842006685879466e-05, + "loss": 0.6253, + "step": 5620 + }, + { + "epoch": 0.1641616810156136, + "grad_norm": 1.2776973921047927, + "learning_rate": 4.841606755205374e-05, + "loss": 0.6913, + "step": 5625 + }, + { + "epoch": 0.1643076025098497, + "grad_norm": 1.2615868176843938, + "learning_rate": 4.841206337461585e-05, + "loss": 0.7349, + "step": 5630 + }, + { + "epoch": 0.1644535240040858, + "grad_norm": 1.413966864551977, + "learning_rate": 4.840805432741343e-05, + "loss": 0.6433, + "step": 5635 + }, + { + "epoch": 0.1645994454983219, + "grad_norm": 1.0752591122123394, + "learning_rate": 4.840404041138007e-05, + "loss": 0.6604, + "step": 5640 + }, + { + "epoch": 0.164745366992558, + "grad_norm": 1.2220504694822736, + "learning_rate": 4.840002162745048e-05, + "loss": 0.7091, + "step": 5645 + }, + { + "epoch": 0.1648912884867941, + "grad_norm": 1.00067850003655, + "learning_rate": 4.8395997976560494e-05, + "loss": 0.6487, + "step": 5650 + }, + { + "epoch": 0.16503720998103022, + "grad_norm": 1.227101205694228, + "learning_rate": 4.839196945964713e-05, + "loss": 0.7506, + "step": 5655 + }, + { + "epoch": 0.1651831314752663, + "grad_norm": 0.8271325522279511, + "learning_rate": 4.838793607764847e-05, + "loss": 0.6034, + "step": 5660 + }, + { + "epoch": 0.1653290529695024, + "grad_norm": 0.9097144350949397, + "learning_rate": 4.838389783150377e-05, + "loss": 0.6275, + "step": 5665 + }, + { + "epoch": 0.16547497446373852, + "grad_norm": 1.023868334317864, + "learning_rate": 4.837985472215343e-05, + "loss": 0.737, + "step": 5670 + }, + { + "epoch": 0.1656208959579746, + "grad_norm": 1.0397327730501735, + "learning_rate": 4.8375806750538935e-05, + "loss": 0.6662, + "step": 5675 + }, + { + "epoch": 0.16576681745221072, + "grad_norm": 1.1020131697539324, + "learning_rate": 4.837175391760295e-05, + "loss": 0.7612, + "step": 5680 + }, + { + "epoch": 0.1659127389464468, + "grad_norm": 1.0589878823799486, + "learning_rate": 4.8367696224289246e-05, + "loss": 0.7373, + "step": 5685 + }, + { + "epoch": 0.1660586604406829, + "grad_norm": 1.1197679951174682, + "learning_rate": 4.836363367154273e-05, + "loss": 0.7379, + "step": 5690 + }, + { + "epoch": 0.16620458193491902, + "grad_norm": 1.114287211537706, + "learning_rate": 4.835956626030946e-05, + "loss": 0.6572, + "step": 5695 + }, + { + "epoch": 0.1663505034291551, + "grad_norm": 0.936589720333887, + "learning_rate": 4.835549399153659e-05, + "loss": 0.6742, + "step": 5700 + }, + { + "epoch": 0.16649642492339123, + "grad_norm": 1.044362062233453, + "learning_rate": 4.835141686617242e-05, + "loss": 0.6395, + "step": 5705 + }, + { + "epoch": 0.16664234641762732, + "grad_norm": 1.309826716022715, + "learning_rate": 4.8347334885166415e-05, + "loss": 0.7038, + "step": 5710 + }, + { + "epoch": 0.1667882679118634, + "grad_norm": 1.0851768951951963, + "learning_rate": 4.83432480494691e-05, + "loss": 0.6787, + "step": 5715 + }, + { + "epoch": 0.16693418940609953, + "grad_norm": 1.3255765713766328, + "learning_rate": 4.833915636003221e-05, + "loss": 0.6378, + "step": 5720 + }, + { + "epoch": 0.16708011090033562, + "grad_norm": 1.1504464039235212, + "learning_rate": 4.8335059817808534e-05, + "loss": 0.6257, + "step": 5725 + }, + { + "epoch": 0.16722603239457173, + "grad_norm": 1.210658498690085, + "learning_rate": 4.833095842375205e-05, + "loss": 0.6785, + "step": 5730 + }, + { + "epoch": 0.16737195388880782, + "grad_norm": 1.1507376808743373, + "learning_rate": 4.832685217881785e-05, + "loss": 0.6526, + "step": 5735 + }, + { + "epoch": 0.1675178753830439, + "grad_norm": 0.935819823610171, + "learning_rate": 4.832274108396214e-05, + "loss": 0.6681, + "step": 5740 + }, + { + "epoch": 0.16766379687728003, + "grad_norm": 0.9730334815352761, + "learning_rate": 4.831862514014226e-05, + "loss": 0.7321, + "step": 5745 + }, + { + "epoch": 0.16780971837151612, + "grad_norm": 0.9993958934888955, + "learning_rate": 4.8314504348316696e-05, + "loss": 0.6565, + "step": 5750 + }, + { + "epoch": 0.16795563986575224, + "grad_norm": 1.001239674928972, + "learning_rate": 4.8310378709445046e-05, + "loss": 0.6337, + "step": 5755 + }, + { + "epoch": 0.16810156135998833, + "grad_norm": 1.0611798977997413, + "learning_rate": 4.830624822448804e-05, + "loss": 0.6625, + "step": 5760 + }, + { + "epoch": 0.16824748285422442, + "grad_norm": 0.9888993272834491, + "learning_rate": 4.830211289440753e-05, + "loss": 0.6557, + "step": 5765 + }, + { + "epoch": 0.16839340434846053, + "grad_norm": 2.152173819012769, + "learning_rate": 4.829797272016653e-05, + "loss": 0.766, + "step": 5770 + }, + { + "epoch": 0.16853932584269662, + "grad_norm": 1.0701881200332837, + "learning_rate": 4.829382770272914e-05, + "loss": 0.6659, + "step": 5775 + }, + { + "epoch": 0.16868524733693274, + "grad_norm": 0.9238367398869555, + "learning_rate": 4.82896778430606e-05, + "loss": 0.6124, + "step": 5780 + }, + { + "epoch": 0.16883116883116883, + "grad_norm": 1.0253541688718055, + "learning_rate": 4.82855231421273e-05, + "loss": 0.6254, + "step": 5785 + }, + { + "epoch": 0.16897709032540492, + "grad_norm": 0.9757344795619404, + "learning_rate": 4.828136360089672e-05, + "loss": 0.6405, + "step": 5790 + }, + { + "epoch": 0.16912301181964104, + "grad_norm": 1.0157182161251295, + "learning_rate": 4.8277199220337505e-05, + "loss": 0.5949, + "step": 5795 + }, + { + "epoch": 0.16926893331387713, + "grad_norm": 1.1023884601325222, + "learning_rate": 4.827303000141939e-05, + "loss": 0.7219, + "step": 5800 + }, + { + "epoch": 0.16941485480811325, + "grad_norm": 1.1919971111854084, + "learning_rate": 4.826885594511326e-05, + "loss": 0.7427, + "step": 5805 + }, + { + "epoch": 0.16956077630234934, + "grad_norm": 1.2008317899810013, + "learning_rate": 4.8264677052391145e-05, + "loss": 0.7178, + "step": 5810 + }, + { + "epoch": 0.16970669779658543, + "grad_norm": 1.060616247228624, + "learning_rate": 4.8260493324226144e-05, + "loss": 0.6464, + "step": 5815 + }, + { + "epoch": 0.16985261929082154, + "grad_norm": 1.170001997881897, + "learning_rate": 4.825630476159255e-05, + "loss": 0.7141, + "step": 5820 + }, + { + "epoch": 0.16999854078505763, + "grad_norm": 1.263408537229351, + "learning_rate": 4.8252111365465716e-05, + "loss": 0.7057, + "step": 5825 + }, + { + "epoch": 0.17014446227929375, + "grad_norm": 1.267697984164991, + "learning_rate": 4.824791313682218e-05, + "loss": 0.6737, + "step": 5830 + }, + { + "epoch": 0.17029038377352984, + "grad_norm": 1.1313684764504524, + "learning_rate": 4.824371007663955e-05, + "loss": 0.6446, + "step": 5835 + }, + { + "epoch": 0.17043630526776593, + "grad_norm": 0.8904872063757217, + "learning_rate": 4.823950218589661e-05, + "loss": 0.5936, + "step": 5840 + }, + { + "epoch": 0.17058222676200205, + "grad_norm": 0.9177114011028235, + "learning_rate": 4.823528946557324e-05, + "loss": 0.6051, + "step": 5845 + }, + { + "epoch": 0.17072814825623814, + "grad_norm": 1.0465279849310067, + "learning_rate": 4.8231071916650437e-05, + "loss": 0.6362, + "step": 5850 + }, + { + "epoch": 0.17087406975047426, + "grad_norm": 1.0484570449255428, + "learning_rate": 4.822684954011036e-05, + "loss": 0.6922, + "step": 5855 + }, + { + "epoch": 0.17101999124471035, + "grad_norm": 1.0630360493698148, + "learning_rate": 4.822262233693625e-05, + "loss": 0.6545, + "step": 5860 + }, + { + "epoch": 0.17116591273894644, + "grad_norm": 1.064448379920905, + "learning_rate": 4.82183903081125e-05, + "loss": 0.6681, + "step": 5865 + }, + { + "epoch": 0.17131183423318255, + "grad_norm": 1.0179475441838266, + "learning_rate": 4.821415345462462e-05, + "loss": 0.6805, + "step": 5870 + }, + { + "epoch": 0.17145775572741864, + "grad_norm": 1.0529543354757331, + "learning_rate": 4.8209911777459224e-05, + "loss": 0.7163, + "step": 5875 + }, + { + "epoch": 0.17160367722165476, + "grad_norm": 1.2444090941640662, + "learning_rate": 4.820566527760408e-05, + "loss": 0.6994, + "step": 5880 + }, + { + "epoch": 0.17174959871589085, + "grad_norm": 0.9796656943074434, + "learning_rate": 4.820141395604806e-05, + "loss": 0.6353, + "step": 5885 + }, + { + "epoch": 0.17189552021012694, + "grad_norm": 1.0947312013035475, + "learning_rate": 4.819715781378115e-05, + "loss": 0.753, + "step": 5890 + }, + { + "epoch": 0.17204144170436306, + "grad_norm": 1.0291982170118557, + "learning_rate": 4.8192896851794505e-05, + "loss": 0.6487, + "step": 5895 + }, + { + "epoch": 0.17218736319859915, + "grad_norm": 1.0234653008674273, + "learning_rate": 4.818863107108034e-05, + "loss": 0.7204, + "step": 5900 + }, + { + "epoch": 0.17233328469283526, + "grad_norm": 1.1212524385769005, + "learning_rate": 4.818436047263204e-05, + "loss": 0.657, + "step": 5905 + }, + { + "epoch": 0.17247920618707135, + "grad_norm": 1.0782562264447584, + "learning_rate": 4.818008505744408e-05, + "loss": 0.681, + "step": 5910 + }, + { + "epoch": 0.17262512768130744, + "grad_norm": 1.0428112906770304, + "learning_rate": 4.817580482651208e-05, + "loss": 0.6951, + "step": 5915 + }, + { + "epoch": 0.17277104917554356, + "grad_norm": 0.972462681192514, + "learning_rate": 4.8171519780832765e-05, + "loss": 0.6707, + "step": 5920 + }, + { + "epoch": 0.17291697066977965, + "grad_norm": 1.0255109633322963, + "learning_rate": 4.816722992140399e-05, + "loss": 0.62, + "step": 5925 + }, + { + "epoch": 0.17306289216401577, + "grad_norm": 1.0797119296181434, + "learning_rate": 4.816293524922474e-05, + "loss": 0.7043, + "step": 5930 + }, + { + "epoch": 0.17320881365825186, + "grad_norm": 1.1114160469440066, + "learning_rate": 4.815863576529509e-05, + "loss": 0.6796, + "step": 5935 + }, + { + "epoch": 0.17335473515248795, + "grad_norm": 1.0707451870361893, + "learning_rate": 4.815433147061627e-05, + "loss": 0.7113, + "step": 5940 + }, + { + "epoch": 0.17350065664672407, + "grad_norm": 0.9441907324371049, + "learning_rate": 4.8150022366190603e-05, + "loss": 0.6214, + "step": 5945 + }, + { + "epoch": 0.17364657814096016, + "grad_norm": 1.003492656610968, + "learning_rate": 4.8145708453021556e-05, + "loss": 0.7142, + "step": 5950 + }, + { + "epoch": 0.17379249963519627, + "grad_norm": 0.995657731749954, + "learning_rate": 4.814138973211369e-05, + "loss": 0.6679, + "step": 5955 + }, + { + "epoch": 0.17393842112943236, + "grad_norm": 1.1481344668759932, + "learning_rate": 4.813706620447272e-05, + "loss": 0.6584, + "step": 5960 + }, + { + "epoch": 0.17408434262366845, + "grad_norm": 1.0689489093974827, + "learning_rate": 4.813273787110545e-05, + "loss": 0.6767, + "step": 5965 + }, + { + "epoch": 0.17423026411790457, + "grad_norm": 0.9954588182114794, + "learning_rate": 4.812840473301981e-05, + "loss": 0.6818, + "step": 5970 + }, + { + "epoch": 0.17437618561214066, + "grad_norm": 0.9873849933885248, + "learning_rate": 4.8124066791224856e-05, + "loss": 0.6692, + "step": 5975 + }, + { + "epoch": 0.17452210710637678, + "grad_norm": 1.2391376806185903, + "learning_rate": 4.811972404673076e-05, + "loss": 0.7145, + "step": 5980 + }, + { + "epoch": 0.17466802860061287, + "grad_norm": 1.1920169049206775, + "learning_rate": 4.8115376500548805e-05, + "loss": 0.6718, + "step": 5985 + }, + { + "epoch": 0.17481395009484896, + "grad_norm": 1.041370507070442, + "learning_rate": 4.8111024153691404e-05, + "loss": 0.6688, + "step": 5990 + }, + { + "epoch": 0.17495987158908508, + "grad_norm": 0.9506914333904248, + "learning_rate": 4.810666700717208e-05, + "loss": 0.6129, + "step": 5995 + }, + { + "epoch": 0.17510579308332117, + "grad_norm": 1.1161032087763265, + "learning_rate": 4.810230506200548e-05, + "loss": 0.66, + "step": 6000 + }, + { + "epoch": 0.17525171457755728, + "grad_norm": 1.1309355315614864, + "learning_rate": 4.8097938319207344e-05, + "loss": 0.6208, + "step": 6005 + }, + { + "epoch": 0.17539763607179337, + "grad_norm": 1.123185824147339, + "learning_rate": 4.809356677979459e-05, + "loss": 0.6356, + "step": 6010 + }, + { + "epoch": 0.1755435575660295, + "grad_norm": 1.132026453055045, + "learning_rate": 4.808919044478518e-05, + "loss": 0.664, + "step": 6015 + }, + { + "epoch": 0.17568947906026558, + "grad_norm": 1.036896144774975, + "learning_rate": 4.808480931519823e-05, + "loss": 0.6217, + "step": 6020 + }, + { + "epoch": 0.17583540055450167, + "grad_norm": 1.027792930328618, + "learning_rate": 4.8080423392053974e-05, + "loss": 0.6529, + "step": 6025 + }, + { + "epoch": 0.1759813220487378, + "grad_norm": 1.1116890228310785, + "learning_rate": 4.807603267637376e-05, + "loss": 0.7438, + "step": 6030 + }, + { + "epoch": 0.17612724354297388, + "grad_norm": 1.0100042795081638, + "learning_rate": 4.807163716918004e-05, + "loss": 0.6307, + "step": 6035 + }, + { + "epoch": 0.17627316503721, + "grad_norm": 1.1178401914737577, + "learning_rate": 4.806723687149639e-05, + "loss": 0.5722, + "step": 6040 + }, + { + "epoch": 0.17641908653144608, + "grad_norm": 1.0737063318935556, + "learning_rate": 4.80628317843475e-05, + "loss": 0.6561, + "step": 6045 + }, + { + "epoch": 0.17656500802568217, + "grad_norm": 0.9639959347348943, + "learning_rate": 4.805842190875919e-05, + "loss": 0.6287, + "step": 6050 + }, + { + "epoch": 0.1767109295199183, + "grad_norm": 1.1674550348764956, + "learning_rate": 4.8054007245758366e-05, + "loss": 0.7137, + "step": 6055 + }, + { + "epoch": 0.17685685101415438, + "grad_norm": 1.1415061379870395, + "learning_rate": 4.804958779637307e-05, + "loss": 0.6984, + "step": 6060 + }, + { + "epoch": 0.1770027725083905, + "grad_norm": 0.9811302103482721, + "learning_rate": 4.8045163561632464e-05, + "loss": 0.6271, + "step": 6065 + }, + { + "epoch": 0.1771486940026266, + "grad_norm": 1.0717390539277747, + "learning_rate": 4.80407345425668e-05, + "loss": 0.6716, + "step": 6070 + }, + { + "epoch": 0.17729461549686268, + "grad_norm": 1.0128661408453679, + "learning_rate": 4.803630074020746e-05, + "loss": 0.638, + "step": 6075 + }, + { + "epoch": 0.1774405369910988, + "grad_norm": 1.0359011777591365, + "learning_rate": 4.803186215558694e-05, + "loss": 0.7067, + "step": 6080 + }, + { + "epoch": 0.17758645848533489, + "grad_norm": 0.9434861313338309, + "learning_rate": 4.8027418789738856e-05, + "loss": 0.6349, + "step": 6085 + }, + { + "epoch": 0.177732379979571, + "grad_norm": 0.9817959766461949, + "learning_rate": 4.802297064369792e-05, + "loss": 0.5733, + "step": 6090 + }, + { + "epoch": 0.1778783014738071, + "grad_norm": 1.1010829244932019, + "learning_rate": 4.8018517718499954e-05, + "loss": 0.6447, + "step": 6095 + }, + { + "epoch": 0.17802422296804318, + "grad_norm": 1.1612357818858425, + "learning_rate": 4.801406001518193e-05, + "loss": 0.7169, + "step": 6100 + }, + { + "epoch": 0.1781701444622793, + "grad_norm": 1.0588180113576187, + "learning_rate": 4.800959753478188e-05, + "loss": 0.6918, + "step": 6105 + }, + { + "epoch": 0.1783160659565154, + "grad_norm": 1.0819446219492115, + "learning_rate": 4.800513027833899e-05, + "loss": 0.7051, + "step": 6110 + }, + { + "epoch": 0.1784619874507515, + "grad_norm": 1.26935986814738, + "learning_rate": 4.800065824689355e-05, + "loss": 0.6974, + "step": 6115 + }, + { + "epoch": 0.1786079089449876, + "grad_norm": 0.9414455874726568, + "learning_rate": 4.7996181441486956e-05, + "loss": 0.6378, + "step": 6120 + }, + { + "epoch": 0.1787538304392237, + "grad_norm": 1.0219113583134098, + "learning_rate": 4.79916998631617e-05, + "loss": 0.6913, + "step": 6125 + }, + { + "epoch": 0.1788997519334598, + "grad_norm": 1.077174276236011, + "learning_rate": 4.798721351296143e-05, + "loss": 0.6507, + "step": 6130 + }, + { + "epoch": 0.1790456734276959, + "grad_norm": 1.5031551601664102, + "learning_rate": 4.7982722391930836e-05, + "loss": 0.6695, + "step": 6135 + }, + { + "epoch": 0.179191594921932, + "grad_norm": 1.2128193036952357, + "learning_rate": 4.797822650111578e-05, + "loss": 0.6958, + "step": 6140 + }, + { + "epoch": 0.1793375164161681, + "grad_norm": 0.9556463067340181, + "learning_rate": 4.7973725841563235e-05, + "loss": 0.582, + "step": 6145 + }, + { + "epoch": 0.1794834379104042, + "grad_norm": 0.8949170976311094, + "learning_rate": 4.796922041432123e-05, + "loss": 0.6042, + "step": 6150 + }, + { + "epoch": 0.1796293594046403, + "grad_norm": 0.8307943038662808, + "learning_rate": 4.7964710220438955e-05, + "loss": 0.5589, + "step": 6155 + }, + { + "epoch": 0.1797752808988764, + "grad_norm": 1.2085501426829277, + "learning_rate": 4.796019526096669e-05, + "loss": 0.6419, + "step": 6160 + }, + { + "epoch": 0.17992120239311252, + "grad_norm": 1.203525025146659, + "learning_rate": 4.795567553695583e-05, + "loss": 0.6876, + "step": 6165 + }, + { + "epoch": 0.1800671238873486, + "grad_norm": 1.137892597316037, + "learning_rate": 4.795115104945887e-05, + "loss": 0.6512, + "step": 6170 + }, + { + "epoch": 0.1802130453815847, + "grad_norm": 0.9297481779726161, + "learning_rate": 4.7946621799529425e-05, + "loss": 0.5781, + "step": 6175 + }, + { + "epoch": 0.18035896687582081, + "grad_norm": 0.9683969973650844, + "learning_rate": 4.794208778822222e-05, + "loss": 0.7322, + "step": 6180 + }, + { + "epoch": 0.1805048883700569, + "grad_norm": 1.1950088806934311, + "learning_rate": 4.793754901659308e-05, + "loss": 0.6243, + "step": 6185 + }, + { + "epoch": 0.18065080986429302, + "grad_norm": 1.069789473235599, + "learning_rate": 4.7933005485698935e-05, + "loss": 0.6426, + "step": 6190 + }, + { + "epoch": 0.1807967313585291, + "grad_norm": 0.9221254606544639, + "learning_rate": 4.792845719659785e-05, + "loss": 0.6564, + "step": 6195 + }, + { + "epoch": 0.1809426528527652, + "grad_norm": 1.0405067806342225, + "learning_rate": 4.792390415034896e-05, + "loss": 0.6926, + "step": 6200 + }, + { + "epoch": 0.18108857434700132, + "grad_norm": 0.9865959892063116, + "learning_rate": 4.791934634801254e-05, + "loss": 0.6577, + "step": 6205 + }, + { + "epoch": 0.1812344958412374, + "grad_norm": 1.1541761691466992, + "learning_rate": 4.791478379064995e-05, + "loss": 0.6403, + "step": 6210 + }, + { + "epoch": 0.18138041733547353, + "grad_norm": 1.0592593559496497, + "learning_rate": 4.791021647932368e-05, + "loss": 0.6836, + "step": 6215 + }, + { + "epoch": 0.18152633882970962, + "grad_norm": 1.054394693247683, + "learning_rate": 4.79056444150973e-05, + "loss": 0.6195, + "step": 6220 + }, + { + "epoch": 0.1816722603239457, + "grad_norm": 1.092947735961128, + "learning_rate": 4.79010675990355e-05, + "loss": 0.6957, + "step": 6225 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.9145313085964767, + "learning_rate": 4.789648603220409e-05, + "loss": 0.6493, + "step": 6230 + }, + { + "epoch": 0.1819641033124179, + "grad_norm": 1.1543108191792981, + "learning_rate": 4.789189971566996e-05, + "loss": 0.707, + "step": 6235 + }, + { + "epoch": 0.18211002480665403, + "grad_norm": 0.9973475345644737, + "learning_rate": 4.788730865050113e-05, + "loss": 0.6055, + "step": 6240 + }, + { + "epoch": 0.18225594630089012, + "grad_norm": 0.9964083712353639, + "learning_rate": 4.788271283776671e-05, + "loss": 0.6313, + "step": 6245 + }, + { + "epoch": 0.1824018677951262, + "grad_norm": 1.1053482173325482, + "learning_rate": 4.787811227853693e-05, + "loss": 0.6362, + "step": 6250 + }, + { + "epoch": 0.18254778928936233, + "grad_norm": 1.0657924676749686, + "learning_rate": 4.78735069738831e-05, + "loss": 0.6592, + "step": 6255 + }, + { + "epoch": 0.18269371078359842, + "grad_norm": 1.3867094774485877, + "learning_rate": 4.786889692487766e-05, + "loss": 0.7252, + "step": 6260 + }, + { + "epoch": 0.18283963227783454, + "grad_norm": 0.8749883791255879, + "learning_rate": 4.7864282132594144e-05, + "loss": 0.6059, + "step": 6265 + }, + { + "epoch": 0.18298555377207062, + "grad_norm": 1.1091481714620315, + "learning_rate": 4.7859662598107184e-05, + "loss": 0.6656, + "step": 6270 + }, + { + "epoch": 0.18313147526630671, + "grad_norm": 1.2251151464167234, + "learning_rate": 4.785503832249255e-05, + "loss": 0.6715, + "step": 6275 + }, + { + "epoch": 0.18327739676054283, + "grad_norm": 1.3348698757094466, + "learning_rate": 4.785040930682706e-05, + "loss": 0.7539, + "step": 6280 + }, + { + "epoch": 0.18342331825477892, + "grad_norm": 1.1331883741282, + "learning_rate": 4.784577555218871e-05, + "loss": 0.7123, + "step": 6285 + }, + { + "epoch": 0.18356923974901504, + "grad_norm": 1.0607702341484464, + "learning_rate": 4.7841137059656496e-05, + "loss": 0.6456, + "step": 6290 + }, + { + "epoch": 0.18371516124325113, + "grad_norm": 0.9179007088115045, + "learning_rate": 4.783649383031063e-05, + "loss": 0.6533, + "step": 6295 + }, + { + "epoch": 0.18386108273748722, + "grad_norm": 0.928421946967105, + "learning_rate": 4.783184586523235e-05, + "loss": 0.6611, + "step": 6300 + }, + { + "epoch": 0.18400700423172334, + "grad_norm": 0.9657871704618354, + "learning_rate": 4.782719316550402e-05, + "loss": 0.6562, + "step": 6305 + }, + { + "epoch": 0.18415292572595943, + "grad_norm": 1.0250506981466307, + "learning_rate": 4.7822535732209114e-05, + "loss": 0.6743, + "step": 6310 + }, + { + "epoch": 0.18429884722019554, + "grad_norm": 1.2547229483808064, + "learning_rate": 4.781787356643221e-05, + "loss": 0.6738, + "step": 6315 + }, + { + "epoch": 0.18444476871443163, + "grad_norm": 1.053602139242692, + "learning_rate": 4.781320666925897e-05, + "loss": 0.6908, + "step": 6320 + }, + { + "epoch": 0.18459069020866772, + "grad_norm": 1.1389212011012986, + "learning_rate": 4.780853504177618e-05, + "loss": 0.7353, + "step": 6325 + }, + { + "epoch": 0.18473661170290384, + "grad_norm": 0.9306343862163826, + "learning_rate": 4.780385868507169e-05, + "loss": 0.578, + "step": 6330 + }, + { + "epoch": 0.18488253319713993, + "grad_norm": 1.3112971974964946, + "learning_rate": 4.77991776002345e-05, + "loss": 0.7199, + "step": 6335 + }, + { + "epoch": 0.18502845469137605, + "grad_norm": 0.9185205421190875, + "learning_rate": 4.779449178835469e-05, + "loss": 0.643, + "step": 6340 + }, + { + "epoch": 0.18517437618561214, + "grad_norm": 0.9408501696544516, + "learning_rate": 4.778980125052342e-05, + "loss": 0.6195, + "step": 6345 + }, + { + "epoch": 0.18532029767984823, + "grad_norm": 1.1435914700391108, + "learning_rate": 4.778510598783298e-05, + "loss": 0.7102, + "step": 6350 + }, + { + "epoch": 0.18546621917408435, + "grad_norm": 1.0323140847819996, + "learning_rate": 4.778040600137675e-05, + "loss": 0.7061, + "step": 6355 + }, + { + "epoch": 0.18561214066832044, + "grad_norm": 0.8508506708388412, + "learning_rate": 4.7775701292249216e-05, + "loss": 0.6398, + "step": 6360 + }, + { + "epoch": 0.18575806216255655, + "grad_norm": 1.0913335807064852, + "learning_rate": 4.777099186154596e-05, + "loss": 0.6632, + "step": 6365 + }, + { + "epoch": 0.18590398365679264, + "grad_norm": 1.1264675963207216, + "learning_rate": 4.776627771036363e-05, + "loss": 0.6836, + "step": 6370 + }, + { + "epoch": 0.18604990515102873, + "grad_norm": 1.0901891942435533, + "learning_rate": 4.776155883980004e-05, + "loss": 0.6542, + "step": 6375 + }, + { + "epoch": 0.18619582664526485, + "grad_norm": 0.8510195056285907, + "learning_rate": 4.775683525095405e-05, + "loss": 0.6037, + "step": 6380 + }, + { + "epoch": 0.18634174813950094, + "grad_norm": 1.0702993445210793, + "learning_rate": 4.775210694492563e-05, + "loss": 0.6518, + "step": 6385 + }, + { + "epoch": 0.18648766963373706, + "grad_norm": 1.0103717806419281, + "learning_rate": 4.774737392281587e-05, + "loss": 0.6251, + "step": 6390 + }, + { + "epoch": 0.18663359112797315, + "grad_norm": 1.0444080874384256, + "learning_rate": 4.774263618572693e-05, + "loss": 0.6471, + "step": 6395 + }, + { + "epoch": 0.18677951262220926, + "grad_norm": 1.077373469858686, + "learning_rate": 4.7737893734762094e-05, + "loss": 0.642, + "step": 6400 + }, + { + "epoch": 0.18692543411644535, + "grad_norm": 1.2061842981349742, + "learning_rate": 4.773314657102573e-05, + "loss": 0.6482, + "step": 6405 + }, + { + "epoch": 0.18707135561068144, + "grad_norm": 1.3391927614225956, + "learning_rate": 4.772839469562328e-05, + "loss": 0.6393, + "step": 6410 + }, + { + "epoch": 0.18721727710491756, + "grad_norm": 1.0302767115260794, + "learning_rate": 4.7723638109661326e-05, + "loss": 0.6479, + "step": 6415 + }, + { + "epoch": 0.18736319859915365, + "grad_norm": 1.0821221185157603, + "learning_rate": 4.771887681424753e-05, + "loss": 0.6549, + "step": 6420 + }, + { + "epoch": 0.18750912009338977, + "grad_norm": 1.1196781487509881, + "learning_rate": 4.771411081049065e-05, + "loss": 0.6575, + "step": 6425 + }, + { + "epoch": 0.18765504158762586, + "grad_norm": 1.096012458357204, + "learning_rate": 4.770934009950052e-05, + "loss": 0.7954, + "step": 6430 + }, + { + "epoch": 0.18780096308186195, + "grad_norm": 1.054387235681322, + "learning_rate": 4.770456468238811e-05, + "loss": 0.6618, + "step": 6435 + }, + { + "epoch": 0.18794688457609807, + "grad_norm": 1.1934403162734706, + "learning_rate": 4.769978456026545e-05, + "loss": 0.6832, + "step": 6440 + }, + { + "epoch": 0.18809280607033416, + "grad_norm": 1.108957855217466, + "learning_rate": 4.769499973424569e-05, + "loss": 0.7001, + "step": 6445 + }, + { + "epoch": 0.18823872756457027, + "grad_norm": 1.121477539021858, + "learning_rate": 4.7690210205443064e-05, + "loss": 0.5854, + "step": 6450 + }, + { + "epoch": 0.18838464905880636, + "grad_norm": 0.963012984332486, + "learning_rate": 4.76854159749729e-05, + "loss": 0.6395, + "step": 6455 + }, + { + "epoch": 0.18853057055304245, + "grad_norm": 0.9717360416316129, + "learning_rate": 4.768061704395164e-05, + "loss": 0.6392, + "step": 6460 + }, + { + "epoch": 0.18867649204727857, + "grad_norm": 1.0446803016216542, + "learning_rate": 4.7675813413496776e-05, + "loss": 0.6067, + "step": 6465 + }, + { + "epoch": 0.18882241354151466, + "grad_norm": 1.1289546660224659, + "learning_rate": 4.767100508472694e-05, + "loss": 0.6597, + "step": 6470 + }, + { + "epoch": 0.18896833503575078, + "grad_norm": 1.0813879804760762, + "learning_rate": 4.7666192058761846e-05, + "loss": 0.5891, + "step": 6475 + }, + { + "epoch": 0.18911425652998687, + "grad_norm": 0.9874927657928418, + "learning_rate": 4.7661374336722295e-05, + "loss": 0.6059, + "step": 6480 + }, + { + "epoch": 0.18926017802422296, + "grad_norm": 1.1574181698748829, + "learning_rate": 4.765655191973017e-05, + "loss": 0.6947, + "step": 6485 + }, + { + "epoch": 0.18940609951845908, + "grad_norm": 0.9236695678307092, + "learning_rate": 4.765172480890847e-05, + "loss": 0.6225, + "step": 6490 + }, + { + "epoch": 0.18955202101269517, + "grad_norm": 1.1691949116014009, + "learning_rate": 4.764689300538129e-05, + "loss": 0.6598, + "step": 6495 + }, + { + "epoch": 0.18969794250693128, + "grad_norm": 1.0012166008101588, + "learning_rate": 4.764205651027378e-05, + "loss": 0.659, + "step": 6500 + }, + { + "epoch": 0.18984386400116737, + "grad_norm": 1.096608112673433, + "learning_rate": 4.763721532471223e-05, + "loss": 0.693, + "step": 6505 + }, + { + "epoch": 0.18998978549540346, + "grad_norm": 1.0921466671057476, + "learning_rate": 4.7632369449823985e-05, + "loss": 0.6536, + "step": 6510 + }, + { + "epoch": 0.19013570698963958, + "grad_norm": 1.0037785754982855, + "learning_rate": 4.762751888673752e-05, + "loss": 0.6204, + "step": 6515 + }, + { + "epoch": 0.19028162848387567, + "grad_norm": 1.2000223857746146, + "learning_rate": 4.762266363658236e-05, + "loss": 0.6262, + "step": 6520 + }, + { + "epoch": 0.1904275499781118, + "grad_norm": 0.9308168238781539, + "learning_rate": 4.761780370048914e-05, + "loss": 0.6649, + "step": 6525 + }, + { + "epoch": 0.19057347147234788, + "grad_norm": 0.9283035334728622, + "learning_rate": 4.761293907958959e-05, + "loss": 0.653, + "step": 6530 + }, + { + "epoch": 0.19071939296658397, + "grad_norm": 1.1500026947608422, + "learning_rate": 4.760806977501654e-05, + "loss": 0.6856, + "step": 6535 + }, + { + "epoch": 0.19086531446082008, + "grad_norm": 0.9758282295107473, + "learning_rate": 4.760319578790387e-05, + "loss": 0.6762, + "step": 6540 + }, + { + "epoch": 0.19101123595505617, + "grad_norm": 1.0090983716089659, + "learning_rate": 4.759831711938661e-05, + "loss": 0.6334, + "step": 6545 + }, + { + "epoch": 0.1911571574492923, + "grad_norm": 1.0738045462103987, + "learning_rate": 4.759343377060083e-05, + "loss": 0.6486, + "step": 6550 + }, + { + "epoch": 0.19130307894352838, + "grad_norm": 1.116726838877331, + "learning_rate": 4.758854574268372e-05, + "loss": 0.7294, + "step": 6555 + }, + { + "epoch": 0.19144900043776447, + "grad_norm": 1.1269600840472502, + "learning_rate": 4.758365303677354e-05, + "loss": 0.6475, + "step": 6560 + }, + { + "epoch": 0.1915949219320006, + "grad_norm": 1.1690961820421073, + "learning_rate": 4.757875565400966e-05, + "loss": 0.7045, + "step": 6565 + }, + { + "epoch": 0.19174084342623668, + "grad_norm": 0.9105339143251346, + "learning_rate": 4.7573853595532514e-05, + "loss": 0.6553, + "step": 6570 + }, + { + "epoch": 0.1918867649204728, + "grad_norm": 1.0393979860287295, + "learning_rate": 4.756894686248364e-05, + "loss": 0.6044, + "step": 6575 + }, + { + "epoch": 0.1920326864147089, + "grad_norm": 1.0910726865267946, + "learning_rate": 4.7564035456005665e-05, + "loss": 0.6641, + "step": 6580 + }, + { + "epoch": 0.19217860790894498, + "grad_norm": 1.0476567579804363, + "learning_rate": 4.7559119377242314e-05, + "loss": 0.678, + "step": 6585 + }, + { + "epoch": 0.1923245294031811, + "grad_norm": 1.3435523023442613, + "learning_rate": 4.755419862733836e-05, + "loss": 0.5655, + "step": 6590 + }, + { + "epoch": 0.19247045089741718, + "grad_norm": 1.1618468904465096, + "learning_rate": 4.754927320743971e-05, + "loss": 0.7504, + "step": 6595 + }, + { + "epoch": 0.1926163723916533, + "grad_norm": 1.069578954625427, + "learning_rate": 4.7544343118693333e-05, + "loss": 0.6604, + "step": 6600 + }, + { + "epoch": 0.1927622938858894, + "grad_norm": 1.1058686229663288, + "learning_rate": 4.753940836224731e-05, + "loss": 0.6901, + "step": 6605 + }, + { + "epoch": 0.19290821538012548, + "grad_norm": 1.0441784519624024, + "learning_rate": 4.7534468939250776e-05, + "loss": 0.6507, + "step": 6610 + }, + { + "epoch": 0.1930541368743616, + "grad_norm": 1.3172366548273295, + "learning_rate": 4.752952485085396e-05, + "loss": 0.7757, + "step": 6615 + }, + { + "epoch": 0.1932000583685977, + "grad_norm": 1.0919544046884762, + "learning_rate": 4.7524576098208195e-05, + "loss": 0.6227, + "step": 6620 + }, + { + "epoch": 0.1933459798628338, + "grad_norm": 1.011298643045648, + "learning_rate": 4.7519622682465896e-05, + "loss": 0.6433, + "step": 6625 + }, + { + "epoch": 0.1934919013570699, + "grad_norm": 1.103939144420848, + "learning_rate": 4.751466460478055e-05, + "loss": 0.6967, + "step": 6630 + }, + { + "epoch": 0.19363782285130599, + "grad_norm": 1.0199218675373252, + "learning_rate": 4.750970186630674e-05, + "loss": 0.6224, + "step": 6635 + }, + { + "epoch": 0.1937837443455421, + "grad_norm": 0.9267471049802014, + "learning_rate": 4.750473446820014e-05, + "loss": 0.6635, + "step": 6640 + }, + { + "epoch": 0.1939296658397782, + "grad_norm": 1.046856200619719, + "learning_rate": 4.749976241161749e-05, + "loss": 0.6996, + "step": 6645 + }, + { + "epoch": 0.1940755873340143, + "grad_norm": 1.1076283860982237, + "learning_rate": 4.7494785697716635e-05, + "loss": 0.6673, + "step": 6650 + }, + { + "epoch": 0.1942215088282504, + "grad_norm": 1.0289516170087116, + "learning_rate": 4.748980432765648e-05, + "loss": 0.6571, + "step": 6655 + }, + { + "epoch": 0.1943674303224865, + "grad_norm": 1.383148541260663, + "learning_rate": 4.7484818302597044e-05, + "loss": 0.7467, + "step": 6660 + }, + { + "epoch": 0.1945133518167226, + "grad_norm": 0.9981041575891282, + "learning_rate": 4.747982762369942e-05, + "loss": 0.6326, + "step": 6665 + }, + { + "epoch": 0.1946592733109587, + "grad_norm": 1.015899829286423, + "learning_rate": 4.747483229212577e-05, + "loss": 0.6858, + "step": 6670 + }, + { + "epoch": 0.19480519480519481, + "grad_norm": 1.0888391003400781, + "learning_rate": 4.746983230903936e-05, + "loss": 0.6497, + "step": 6675 + }, + { + "epoch": 0.1949511162994309, + "grad_norm": 1.0169088233194108, + "learning_rate": 4.746482767560452e-05, + "loss": 0.6399, + "step": 6680 + }, + { + "epoch": 0.195097037793667, + "grad_norm": 1.2240384293424118, + "learning_rate": 4.7459818392986674e-05, + "loss": 0.6296, + "step": 6685 + }, + { + "epoch": 0.1952429592879031, + "grad_norm": 0.9764077789927863, + "learning_rate": 4.745480446235233e-05, + "loss": 0.6501, + "step": 6690 + }, + { + "epoch": 0.1953888807821392, + "grad_norm": 1.031592025792826, + "learning_rate": 4.7449785884869075e-05, + "loss": 0.6349, + "step": 6695 + }, + { + "epoch": 0.19553480227637532, + "grad_norm": 0.9722213567725236, + "learning_rate": 4.744476266170558e-05, + "loss": 0.6906, + "step": 6700 + }, + { + "epoch": 0.1956807237706114, + "grad_norm": 0.9781752872530398, + "learning_rate": 4.7439734794031594e-05, + "loss": 0.6642, + "step": 6705 + }, + { + "epoch": 0.1958266452648475, + "grad_norm": 0.9368228142924034, + "learning_rate": 4.743470228301795e-05, + "loss": 0.6062, + "step": 6710 + }, + { + "epoch": 0.19597256675908362, + "grad_norm": 1.120891321215899, + "learning_rate": 4.7429665129836566e-05, + "loss": 0.68, + "step": 6715 + }, + { + "epoch": 0.1961184882533197, + "grad_norm": 0.9755363423344646, + "learning_rate": 4.742462333566043e-05, + "loss": 0.6728, + "step": 6720 + }, + { + "epoch": 0.19626440974755582, + "grad_norm": 0.9037010963358724, + "learning_rate": 4.741957690166362e-05, + "loss": 0.6124, + "step": 6725 + }, + { + "epoch": 0.1964103312417919, + "grad_norm": 0.935710690771161, + "learning_rate": 4.74145258290213e-05, + "loss": 0.6664, + "step": 6730 + }, + { + "epoch": 0.196556252736028, + "grad_norm": 0.9508973949324396, + "learning_rate": 4.7409470118909696e-05, + "loss": 0.6252, + "step": 6735 + }, + { + "epoch": 0.19670217423026412, + "grad_norm": 1.1397935191116033, + "learning_rate": 4.740440977250613e-05, + "loss": 0.5876, + "step": 6740 + }, + { + "epoch": 0.1968480957245002, + "grad_norm": 1.0865488555557214, + "learning_rate": 4.7399344790989e-05, + "loss": 0.6663, + "step": 6745 + }, + { + "epoch": 0.19699401721873633, + "grad_norm": 0.9425440292226442, + "learning_rate": 4.7394275175537786e-05, + "loss": 0.6417, + "step": 6750 + }, + { + "epoch": 0.19713993871297242, + "grad_norm": 1.3444134711085947, + "learning_rate": 4.738920092733302e-05, + "loss": 0.6242, + "step": 6755 + }, + { + "epoch": 0.19728586020720854, + "grad_norm": 1.1784758621807747, + "learning_rate": 4.738412204755636e-05, + "loss": 0.6504, + "step": 6760 + }, + { + "epoch": 0.19743178170144463, + "grad_norm": 1.0366145966340217, + "learning_rate": 4.7379038537390504e-05, + "loss": 0.6234, + "step": 6765 + }, + { + "epoch": 0.19757770319568072, + "grad_norm": 1.3248750088146903, + "learning_rate": 4.737395039801926e-05, + "loss": 0.7027, + "step": 6770 + }, + { + "epoch": 0.19772362468991683, + "grad_norm": 1.3379530734272604, + "learning_rate": 4.736885763062747e-05, + "loss": 0.6045, + "step": 6775 + }, + { + "epoch": 0.19786954618415292, + "grad_norm": 0.9822153261271002, + "learning_rate": 4.736376023640111e-05, + "loss": 0.5932, + "step": 6780 + }, + { + "epoch": 0.19801546767838904, + "grad_norm": 1.041987249695403, + "learning_rate": 4.735865821652717e-05, + "loss": 0.6688, + "step": 6785 + }, + { + "epoch": 0.19816138917262513, + "grad_norm": 1.033502153320255, + "learning_rate": 4.7353551572193774e-05, + "loss": 0.6069, + "step": 6790 + }, + { + "epoch": 0.19830731066686122, + "grad_norm": 1.2771916490689248, + "learning_rate": 4.734844030459009e-05, + "loss": 0.6547, + "step": 6795 + }, + { + "epoch": 0.19845323216109734, + "grad_norm": 1.1844232423303866, + "learning_rate": 4.734332441490638e-05, + "loss": 0.6676, + "step": 6800 + }, + { + "epoch": 0.19859915365533343, + "grad_norm": 1.115957162946092, + "learning_rate": 4.733820390433397e-05, + "loss": 0.7077, + "step": 6805 + }, + { + "epoch": 0.19874507514956954, + "grad_norm": 1.0337584772295996, + "learning_rate": 4.733307877406526e-05, + "loss": 0.6067, + "step": 6810 + }, + { + "epoch": 0.19889099664380563, + "grad_norm": 1.2126984061692443, + "learning_rate": 4.732794902529375e-05, + "loss": 0.7241, + "step": 6815 + }, + { + "epoch": 0.19903691813804172, + "grad_norm": 1.0196398264549338, + "learning_rate": 4.732281465921398e-05, + "loss": 0.6566, + "step": 6820 + }, + { + "epoch": 0.19918283963227784, + "grad_norm": 1.027466485874258, + "learning_rate": 4.731767567702159e-05, + "loss": 0.7184, + "step": 6825 + }, + { + "epoch": 0.19932876112651393, + "grad_norm": 1.0345727192662957, + "learning_rate": 4.731253207991329e-05, + "loss": 0.6255, + "step": 6830 + }, + { + "epoch": 0.19947468262075005, + "grad_norm": 0.9710338198528217, + "learning_rate": 4.730738386908685e-05, + "loss": 0.7239, + "step": 6835 + }, + { + "epoch": 0.19962060411498614, + "grad_norm": 1.0840738938520926, + "learning_rate": 4.7302231045741155e-05, + "loss": 0.6311, + "step": 6840 + }, + { + "epoch": 0.19976652560922223, + "grad_norm": 1.159372544076218, + "learning_rate": 4.72970736110761e-05, + "loss": 0.7027, + "step": 6845 + }, + { + "epoch": 0.19991244710345835, + "grad_norm": 1.0581160310886049, + "learning_rate": 4.729191156629272e-05, + "loss": 0.6527, + "step": 6850 + }, + { + "epoch": 0.20005836859769444, + "grad_norm": 1.042686197366578, + "learning_rate": 4.728674491259308e-05, + "loss": 0.6426, + "step": 6855 + }, + { + "epoch": 0.20020429009193055, + "grad_norm": 0.9538219784456546, + "learning_rate": 4.728157365118033e-05, + "loss": 0.5846, + "step": 6860 + }, + { + "epoch": 0.20035021158616664, + "grad_norm": 1.1410763823129053, + "learning_rate": 4.7276397783258704e-05, + "loss": 0.6514, + "step": 6865 + }, + { + "epoch": 0.20049613308040273, + "grad_norm": 0.8994330418932087, + "learning_rate": 4.72712173100335e-05, + "loss": 0.638, + "step": 6870 + }, + { + "epoch": 0.20064205457463885, + "grad_norm": 0.991636648260618, + "learning_rate": 4.7266032232711066e-05, + "loss": 0.648, + "step": 6875 + }, + { + "epoch": 0.20078797606887494, + "grad_norm": 1.0711945313425744, + "learning_rate": 4.726084255249887e-05, + "loss": 0.6388, + "step": 6880 + }, + { + "epoch": 0.20093389756311106, + "grad_norm": 1.0736267645967408, + "learning_rate": 4.725564827060541e-05, + "loss": 0.624, + "step": 6885 + }, + { + "epoch": 0.20107981905734715, + "grad_norm": 1.1152268552481455, + "learning_rate": 4.725044938824028e-05, + "loss": 0.6922, + "step": 6890 + }, + { + "epoch": 0.20122574055158324, + "grad_norm": 1.070326805658138, + "learning_rate": 4.7245245906614135e-05, + "loss": 0.6443, + "step": 6895 + }, + { + "epoch": 0.20137166204581936, + "grad_norm": 1.212511864940288, + "learning_rate": 4.724003782693871e-05, + "loss": 0.633, + "step": 6900 + }, + { + "epoch": 0.20151758354005544, + "grad_norm": 1.0059937543670032, + "learning_rate": 4.723482515042679e-05, + "loss": 0.6335, + "step": 6905 + }, + { + "epoch": 0.20166350503429156, + "grad_norm": 1.038829969386479, + "learning_rate": 4.722960787829225e-05, + "loss": 0.6223, + "step": 6910 + }, + { + "epoch": 0.20180942652852765, + "grad_norm": 1.0160669391493726, + "learning_rate": 4.7224386011750024e-05, + "loss": 0.6163, + "step": 6915 + }, + { + "epoch": 0.20195534802276374, + "grad_norm": 1.039408920648934, + "learning_rate": 4.7219159552016133e-05, + "loss": 0.7062, + "step": 6920 + }, + { + "epoch": 0.20210126951699986, + "grad_norm": 0.9427194393697952, + "learning_rate": 4.721392850030765e-05, + "loss": 0.6896, + "step": 6925 + }, + { + "epoch": 0.20224719101123595, + "grad_norm": 1.1679738392459498, + "learning_rate": 4.720869285784272e-05, + "loss": 0.6946, + "step": 6930 + }, + { + "epoch": 0.20239311250547207, + "grad_norm": 0.9391318151626903, + "learning_rate": 4.720345262584056e-05, + "loss": 0.6924, + "step": 6935 + }, + { + "epoch": 0.20253903399970816, + "grad_norm": 0.9588551078489763, + "learning_rate": 4.719820780552146e-05, + "loss": 0.6628, + "step": 6940 + }, + { + "epoch": 0.20268495549394425, + "grad_norm": 0.8749889989565968, + "learning_rate": 4.719295839810678e-05, + "loss": 0.6018, + "step": 6945 + }, + { + "epoch": 0.20283087698818036, + "grad_norm": 0.9398262452419113, + "learning_rate": 4.718770440481892e-05, + "loss": 0.669, + "step": 6950 + }, + { + "epoch": 0.20297679848241645, + "grad_norm": 1.1493432072613248, + "learning_rate": 4.71824458268814e-05, + "loss": 0.6922, + "step": 6955 + }, + { + "epoch": 0.20312271997665257, + "grad_norm": 0.9226450378933662, + "learning_rate": 4.717718266551876e-05, + "loss": 0.65, + "step": 6960 + }, + { + "epoch": 0.20326864147088866, + "grad_norm": 0.9894497141437163, + "learning_rate": 4.717191492195662e-05, + "loss": 0.7219, + "step": 6965 + }, + { + "epoch": 0.20341456296512475, + "grad_norm": 1.1702694357766992, + "learning_rate": 4.7166642597421694e-05, + "loss": 0.6847, + "step": 6970 + }, + { + "epoch": 0.20356048445936087, + "grad_norm": 1.0133427838030136, + "learning_rate": 4.716136569314173e-05, + "loss": 0.6474, + "step": 6975 + }, + { + "epoch": 0.20370640595359696, + "grad_norm": 1.1317470783411034, + "learning_rate": 4.715608421034554e-05, + "loss": 0.6391, + "step": 6980 + }, + { + "epoch": 0.20385232744783308, + "grad_norm": 1.3146596701037883, + "learning_rate": 4.715079815026304e-05, + "loss": 0.711, + "step": 6985 + }, + { + "epoch": 0.20399824894206917, + "grad_norm": 1.047542170006923, + "learning_rate": 4.714550751412517e-05, + "loss": 0.6728, + "step": 6990 + }, + { + "epoch": 0.20414417043630526, + "grad_norm": 1.1227977061211136, + "learning_rate": 4.714021230316396e-05, + "loss": 0.6782, + "step": 6995 + }, + { + "epoch": 0.20429009193054137, + "grad_norm": 0.8566714275650146, + "learning_rate": 4.7134912518612505e-05, + "loss": 0.6473, + "step": 7000 + }, + { + "epoch": 0.20443601342477746, + "grad_norm": 1.1670620875905955, + "learning_rate": 4.7129608161704944e-05, + "loss": 0.6618, + "step": 7005 + }, + { + "epoch": 0.20458193491901358, + "grad_norm": 1.169494280289647, + "learning_rate": 4.7124299233676505e-05, + "loss": 0.7274, + "step": 7010 + }, + { + "epoch": 0.20472785641324967, + "grad_norm": 0.9880420599646522, + "learning_rate": 4.7118985735763473e-05, + "loss": 0.6801, + "step": 7015 + }, + { + "epoch": 0.20487377790748576, + "grad_norm": 1.0748091866208749, + "learning_rate": 4.71136676692032e-05, + "loss": 0.637, + "step": 7020 + }, + { + "epoch": 0.20501969940172188, + "grad_norm": 1.0359670441881943, + "learning_rate": 4.710834503523408e-05, + "loss": 0.6444, + "step": 7025 + }, + { + "epoch": 0.20516562089595797, + "grad_norm": 1.0576428276181502, + "learning_rate": 4.7103017835095596e-05, + "loss": 0.6065, + "step": 7030 + }, + { + "epoch": 0.20531154239019409, + "grad_norm": 1.050881756544358, + "learning_rate": 4.709768607002828e-05, + "loss": 0.6332, + "step": 7035 + }, + { + "epoch": 0.20545746388443017, + "grad_norm": 1.0588481059191026, + "learning_rate": 4.709234974127376e-05, + "loss": 0.6563, + "step": 7040 + }, + { + "epoch": 0.20560338537866626, + "grad_norm": 1.0876354510122306, + "learning_rate": 4.708700885007466e-05, + "loss": 0.6265, + "step": 7045 + }, + { + "epoch": 0.20574930687290238, + "grad_norm": 0.9494208042655983, + "learning_rate": 4.7081663397674744e-05, + "loss": 0.6197, + "step": 7050 + }, + { + "epoch": 0.20589522836713847, + "grad_norm": 1.0264984709933391, + "learning_rate": 4.7076313385318775e-05, + "loss": 0.6334, + "step": 7055 + }, + { + "epoch": 0.2060411498613746, + "grad_norm": 1.087782420699243, + "learning_rate": 4.707095881425261e-05, + "loss": 0.6535, + "step": 7060 + }, + { + "epoch": 0.20618707135561068, + "grad_norm": 1.1475040738201219, + "learning_rate": 4.706559968572317e-05, + "loss": 0.6668, + "step": 7065 + }, + { + "epoch": 0.20633299284984677, + "grad_norm": 1.0117981738040611, + "learning_rate": 4.706023600097842e-05, + "loss": 0.666, + "step": 7070 + }, + { + "epoch": 0.2064789143440829, + "grad_norm": 0.9885571687448587, + "learning_rate": 4.705486776126738e-05, + "loss": 0.6268, + "step": 7075 + }, + { + "epoch": 0.20662483583831898, + "grad_norm": 1.323306997951047, + "learning_rate": 4.704949496784017e-05, + "loss": 0.6735, + "step": 7080 + }, + { + "epoch": 0.2067707573325551, + "grad_norm": 1.074861145612746, + "learning_rate": 4.7044117621947933e-05, + "loss": 0.6723, + "step": 7085 + }, + { + "epoch": 0.20691667882679118, + "grad_norm": 1.0981060487750376, + "learning_rate": 4.703873572484288e-05, + "loss": 0.6272, + "step": 7090 + }, + { + "epoch": 0.20706260032102727, + "grad_norm": 0.9059518698618659, + "learning_rate": 4.70333492777783e-05, + "loss": 0.673, + "step": 7095 + }, + { + "epoch": 0.2072085218152634, + "grad_norm": 1.1167295448443404, + "learning_rate": 4.7027958282008516e-05, + "loss": 0.643, + "step": 7100 + }, + { + "epoch": 0.20735444330949948, + "grad_norm": 1.1798667190916945, + "learning_rate": 4.702256273878892e-05, + "loss": 0.6635, + "step": 7105 + }, + { + "epoch": 0.2075003648037356, + "grad_norm": 1.1848115007393785, + "learning_rate": 4.701716264937598e-05, + "loss": 0.6926, + "step": 7110 + }, + { + "epoch": 0.2076462862979717, + "grad_norm": 0.9503304612994011, + "learning_rate": 4.70117580150272e-05, + "loss": 0.6511, + "step": 7115 + }, + { + "epoch": 0.2077922077922078, + "grad_norm": 1.0693049133378056, + "learning_rate": 4.7006348837001145e-05, + "loss": 0.6802, + "step": 7120 + }, + { + "epoch": 0.2079381292864439, + "grad_norm": 0.9821430157861724, + "learning_rate": 4.700093511655745e-05, + "loss": 0.6213, + "step": 7125 + }, + { + "epoch": 0.20808405078067999, + "grad_norm": 1.1652465809060788, + "learning_rate": 4.6995516854956796e-05, + "loss": 0.6685, + "step": 7130 + }, + { + "epoch": 0.2082299722749161, + "grad_norm": 1.0341241783327564, + "learning_rate": 4.6990094053460924e-05, + "loss": 0.6053, + "step": 7135 + }, + { + "epoch": 0.2083758937691522, + "grad_norm": 1.1164556157591872, + "learning_rate": 4.698466671333264e-05, + "loss": 0.707, + "step": 7140 + }, + { + "epoch": 0.2085218152633883, + "grad_norm": 1.2647222430466085, + "learning_rate": 4.69792348358358e-05, + "loss": 0.6888, + "step": 7145 + }, + { + "epoch": 0.2086677367576244, + "grad_norm": 1.0699456711868018, + "learning_rate": 4.697379842223532e-05, + "loss": 0.6718, + "step": 7150 + }, + { + "epoch": 0.2088136582518605, + "grad_norm": 1.0503008123369013, + "learning_rate": 4.6968357473797174e-05, + "loss": 0.6629, + "step": 7155 + }, + { + "epoch": 0.2089595797460966, + "grad_norm": 1.0033675388866816, + "learning_rate": 4.696291199178838e-05, + "loss": 0.6922, + "step": 7160 + }, + { + "epoch": 0.2091055012403327, + "grad_norm": 1.0364023713431059, + "learning_rate": 4.695746197747702e-05, + "loss": 0.6557, + "step": 7165 + }, + { + "epoch": 0.20925142273456881, + "grad_norm": 1.0745685251413466, + "learning_rate": 4.695200743213224e-05, + "loss": 0.6962, + "step": 7170 + }, + { + "epoch": 0.2093973442288049, + "grad_norm": 1.0447779279640634, + "learning_rate": 4.6946548357024224e-05, + "loss": 0.65, + "step": 7175 + }, + { + "epoch": 0.209543265723041, + "grad_norm": 1.0173300390857254, + "learning_rate": 4.6941084753424226e-05, + "loss": 0.658, + "step": 7180 + }, + { + "epoch": 0.2096891872172771, + "grad_norm": 1.139286188217955, + "learning_rate": 4.693561662260454e-05, + "loss": 0.6968, + "step": 7185 + }, + { + "epoch": 0.2098351087115132, + "grad_norm": 1.0910424147480646, + "learning_rate": 4.693014396583854e-05, + "loss": 0.6471, + "step": 7190 + }, + { + "epoch": 0.20998103020574932, + "grad_norm": 1.0281749584360789, + "learning_rate": 4.692466678440061e-05, + "loss": 0.6242, + "step": 7195 + }, + { + "epoch": 0.2101269516999854, + "grad_norm": 0.9279945676590206, + "learning_rate": 4.691918507956624e-05, + "loss": 0.6891, + "step": 7200 + }, + { + "epoch": 0.2102728731942215, + "grad_norm": 1.0985422939099756, + "learning_rate": 4.691369885261193e-05, + "loss": 0.6215, + "step": 7205 + }, + { + "epoch": 0.21041879468845762, + "grad_norm": 1.0573035147819643, + "learning_rate": 4.690820810481525e-05, + "loss": 0.6838, + "step": 7210 + }, + { + "epoch": 0.2105647161826937, + "grad_norm": 0.9201826896755473, + "learning_rate": 4.690271283745484e-05, + "loss": 0.598, + "step": 7215 + }, + { + "epoch": 0.21071063767692982, + "grad_norm": 1.029101847571502, + "learning_rate": 4.689721305181037e-05, + "loss": 0.6384, + "step": 7220 + }, + { + "epoch": 0.2108565591711659, + "grad_norm": 0.9370642598255721, + "learning_rate": 4.689170874916255e-05, + "loss": 0.6642, + "step": 7225 + }, + { + "epoch": 0.211002480665402, + "grad_norm": 0.9698462818170229, + "learning_rate": 4.688619993079318e-05, + "loss": 0.6149, + "step": 7230 + }, + { + "epoch": 0.21114840215963812, + "grad_norm": 1.0293263506971218, + "learning_rate": 4.688068659798508e-05, + "loss": 0.6537, + "step": 7235 + }, + { + "epoch": 0.2112943236538742, + "grad_norm": 0.9575458990738401, + "learning_rate": 4.687516875202213e-05, + "loss": 0.6693, + "step": 7240 + }, + { + "epoch": 0.21144024514811033, + "grad_norm": 1.24600445598782, + "learning_rate": 4.6869646394189274e-05, + "loss": 0.6387, + "step": 7245 + }, + { + "epoch": 0.21158616664234642, + "grad_norm": 0.9455277141074121, + "learning_rate": 4.6864119525772496e-05, + "loss": 0.6646, + "step": 7250 + }, + { + "epoch": 0.2117320881365825, + "grad_norm": 0.9425085597144862, + "learning_rate": 4.685858814805883e-05, + "loss": 0.6479, + "step": 7255 + }, + { + "epoch": 0.21187800963081863, + "grad_norm": 1.0950647941401213, + "learning_rate": 4.685305226233635e-05, + "loss": 0.7046, + "step": 7260 + }, + { + "epoch": 0.21202393112505472, + "grad_norm": 1.069777006581796, + "learning_rate": 4.68475118698942e-05, + "loss": 0.676, + "step": 7265 + }, + { + "epoch": 0.21216985261929083, + "grad_norm": 1.013498824900132, + "learning_rate": 4.684196697202256e-05, + "loss": 0.7036, + "step": 7270 + }, + { + "epoch": 0.21231577411352692, + "grad_norm": 1.021137698202152, + "learning_rate": 4.683641757001266e-05, + "loss": 0.7009, + "step": 7275 + }, + { + "epoch": 0.212461695607763, + "grad_norm": 1.0145919802847734, + "learning_rate": 4.683086366515679e-05, + "loss": 0.7158, + "step": 7280 + }, + { + "epoch": 0.21260761710199913, + "grad_norm": 1.0189317326825382, + "learning_rate": 4.682530525874828e-05, + "loss": 0.6372, + "step": 7285 + }, + { + "epoch": 0.21275353859623522, + "grad_norm": 0.9927991164962339, + "learning_rate": 4.68197423520815e-05, + "loss": 0.6601, + "step": 7290 + }, + { + "epoch": 0.21289946009047134, + "grad_norm": 1.06097897855751, + "learning_rate": 4.6814174946451886e-05, + "loss": 0.6187, + "step": 7295 + }, + { + "epoch": 0.21304538158470743, + "grad_norm": 1.1754294225562802, + "learning_rate": 4.680860304315591e-05, + "loss": 0.6299, + "step": 7300 + }, + { + "epoch": 0.21319130307894352, + "grad_norm": 1.0015192944093279, + "learning_rate": 4.680302664349109e-05, + "loss": 0.6567, + "step": 7305 + }, + { + "epoch": 0.21333722457317963, + "grad_norm": 1.1631806279649357, + "learning_rate": 4.6797445748755994e-05, + "loss": 0.6114, + "step": 7310 + }, + { + "epoch": 0.21348314606741572, + "grad_norm": 0.9092795358625102, + "learning_rate": 4.679186036025025e-05, + "loss": 0.6191, + "step": 7315 + }, + { + "epoch": 0.21362906756165184, + "grad_norm": 1.1668756863865415, + "learning_rate": 4.67862704792745e-05, + "loss": 0.6784, + "step": 7320 + }, + { + "epoch": 0.21377498905588793, + "grad_norm": 0.8366312586188658, + "learning_rate": 4.678067610713046e-05, + "loss": 0.6352, + "step": 7325 + }, + { + "epoch": 0.21392091055012402, + "grad_norm": 0.9769138828210789, + "learning_rate": 4.6775077245120895e-05, + "loss": 0.6489, + "step": 7330 + }, + { + "epoch": 0.21406683204436014, + "grad_norm": 1.0710505378380875, + "learning_rate": 4.67694738945496e-05, + "loss": 0.6602, + "step": 7335 + }, + { + "epoch": 0.21421275353859623, + "grad_norm": 0.9565802930587681, + "learning_rate": 4.676386605672141e-05, + "loss": 0.6517, + "step": 7340 + }, + { + "epoch": 0.21435867503283235, + "grad_norm": 1.082336256883279, + "learning_rate": 4.6758253732942225e-05, + "loss": 0.7029, + "step": 7345 + }, + { + "epoch": 0.21450459652706844, + "grad_norm": 1.1233606213372693, + "learning_rate": 4.675263692451897e-05, + "loss": 0.6742, + "step": 7350 + }, + { + "epoch": 0.21465051802130453, + "grad_norm": 0.9118210230912315, + "learning_rate": 4.674701563275963e-05, + "loss": 0.6802, + "step": 7355 + }, + { + "epoch": 0.21479643951554064, + "grad_norm": 0.9407238998155552, + "learning_rate": 4.674138985897323e-05, + "loss": 0.6252, + "step": 7360 + }, + { + "epoch": 0.21494236100977673, + "grad_norm": 1.0670195978737167, + "learning_rate": 4.673575960446983e-05, + "loss": 0.6757, + "step": 7365 + }, + { + "epoch": 0.21508828250401285, + "grad_norm": 0.9390137390057487, + "learning_rate": 4.673012487056054e-05, + "loss": 0.5942, + "step": 7370 + }, + { + "epoch": 0.21523420399824894, + "grad_norm": 0.9688860667326249, + "learning_rate": 4.672448565855753e-05, + "loss": 0.6816, + "step": 7375 + }, + { + "epoch": 0.21538012549248503, + "grad_norm": 1.1335265174271185, + "learning_rate": 4.6718841969773974e-05, + "loss": 0.6708, + "step": 7380 + }, + { + "epoch": 0.21552604698672115, + "grad_norm": 0.8345896703669826, + "learning_rate": 4.671319380552412e-05, + "loss": 0.6382, + "step": 7385 + }, + { + "epoch": 0.21567196848095724, + "grad_norm": 0.8908583081029197, + "learning_rate": 4.6707541167123245e-05, + "loss": 0.6445, + "step": 7390 + }, + { + "epoch": 0.21581788997519336, + "grad_norm": 0.9177576348937473, + "learning_rate": 4.670188405588768e-05, + "loss": 0.683, + "step": 7395 + }, + { + "epoch": 0.21596381146942945, + "grad_norm": 1.1292408023159637, + "learning_rate": 4.669622247313478e-05, + "loss": 0.6793, + "step": 7400 + }, + { + "epoch": 0.21610973296366554, + "grad_norm": 1.093415973367532, + "learning_rate": 4.669055642018294e-05, + "loss": 0.6948, + "step": 7405 + }, + { + "epoch": 0.21625565445790165, + "grad_norm": 1.0503185924329987, + "learning_rate": 4.6684885898351634e-05, + "loss": 0.6216, + "step": 7410 + }, + { + "epoch": 0.21640157595213774, + "grad_norm": 0.9387566942021225, + "learning_rate": 4.667921090896133e-05, + "loss": 0.6928, + "step": 7415 + }, + { + "epoch": 0.21654749744637386, + "grad_norm": 0.9598127539859136, + "learning_rate": 4.6673531453333556e-05, + "loss": 0.6502, + "step": 7420 + }, + { + "epoch": 0.21669341894060995, + "grad_norm": 1.2961967017194649, + "learning_rate": 4.666784753279089e-05, + "loss": 0.6072, + "step": 7425 + }, + { + "epoch": 0.21683934043484604, + "grad_norm": 1.038007380448776, + "learning_rate": 4.666215914865692e-05, + "loss": 0.6937, + "step": 7430 + }, + { + "epoch": 0.21698526192908216, + "grad_norm": 0.8714575714025019, + "learning_rate": 4.665646630225631e-05, + "loss": 0.6275, + "step": 7435 + }, + { + "epoch": 0.21713118342331825, + "grad_norm": 1.0941428131078197, + "learning_rate": 4.6650768994914745e-05, + "loss": 0.6476, + "step": 7440 + }, + { + "epoch": 0.21727710491755436, + "grad_norm": 1.2795917803982988, + "learning_rate": 4.6645067227958936e-05, + "loss": 0.6831, + "step": 7445 + }, + { + "epoch": 0.21742302641179045, + "grad_norm": 1.1606057579949522, + "learning_rate": 4.6639361002716664e-05, + "loss": 0.6281, + "step": 7450 + }, + { + "epoch": 0.21756894790602654, + "grad_norm": 1.0364768107140152, + "learning_rate": 4.6633650320516715e-05, + "loss": 0.6391, + "step": 7455 + }, + { + "epoch": 0.21771486940026266, + "grad_norm": 1.0330098649566157, + "learning_rate": 4.662793518268893e-05, + "loss": 0.5885, + "step": 7460 + }, + { + "epoch": 0.21786079089449875, + "grad_norm": 1.1419313769732622, + "learning_rate": 4.6622215590564195e-05, + "loss": 0.6259, + "step": 7465 + }, + { + "epoch": 0.21800671238873487, + "grad_norm": 1.0983366181806211, + "learning_rate": 4.661649154547441e-05, + "loss": 0.6039, + "step": 7470 + }, + { + "epoch": 0.21815263388297096, + "grad_norm": 0.9894326338066628, + "learning_rate": 4.6610763048752536e-05, + "loss": 0.6497, + "step": 7475 + }, + { + "epoch": 0.21829855537720708, + "grad_norm": 0.9945505031730846, + "learning_rate": 4.660503010173257e-05, + "loss": 0.6227, + "step": 7480 + }, + { + "epoch": 0.21844447687144317, + "grad_norm": 1.0310615789054935, + "learning_rate": 4.659929270574951e-05, + "loss": 0.6831, + "step": 7485 + }, + { + "epoch": 0.21859039836567926, + "grad_norm": 1.0852740994889694, + "learning_rate": 4.659355086213943e-05, + "loss": 0.6238, + "step": 7490 + }, + { + "epoch": 0.21873631985991537, + "grad_norm": 1.0804409621228588, + "learning_rate": 4.658780457223943e-05, + "loss": 0.6445, + "step": 7495 + }, + { + "epoch": 0.21888224135415146, + "grad_norm": 1.2114396661840598, + "learning_rate": 4.658205383738763e-05, + "loss": 0.7118, + "step": 7500 + }, + { + "epoch": 0.21902816284838758, + "grad_norm": 1.136607885035878, + "learning_rate": 4.6576298658923204e-05, + "loss": 0.6585, + "step": 7505 + }, + { + "epoch": 0.21917408434262367, + "grad_norm": 0.9593866783911533, + "learning_rate": 4.6570539038186364e-05, + "loss": 0.6273, + "step": 7510 + }, + { + "epoch": 0.21932000583685976, + "grad_norm": 1.2315478632914256, + "learning_rate": 4.656477497651831e-05, + "loss": 0.6671, + "step": 7515 + }, + { + "epoch": 0.21946592733109588, + "grad_norm": 1.0103810708300325, + "learning_rate": 4.6559006475261346e-05, + "loss": 0.6908, + "step": 7520 + }, + { + "epoch": 0.21961184882533197, + "grad_norm": 1.0459683956658206, + "learning_rate": 4.655323353575876e-05, + "loss": 0.6262, + "step": 7525 + }, + { + "epoch": 0.21975777031956809, + "grad_norm": 1.0178379259190358, + "learning_rate": 4.65474561593549e-05, + "loss": 0.685, + "step": 7530 + }, + { + "epoch": 0.21990369181380418, + "grad_norm": 0.9983536592704769, + "learning_rate": 4.654167434739512e-05, + "loss": 0.6595, + "step": 7535 + }, + { + "epoch": 0.22004961330804026, + "grad_norm": 1.0708320003311147, + "learning_rate": 4.653588810122583e-05, + "loss": 0.7093, + "step": 7540 + }, + { + "epoch": 0.22019553480227638, + "grad_norm": 1.3927245501382268, + "learning_rate": 4.653009742219446e-05, + "loss": 0.6493, + "step": 7545 + }, + { + "epoch": 0.22034145629651247, + "grad_norm": 1.0596785589390862, + "learning_rate": 4.65243023116495e-05, + "loss": 0.6508, + "step": 7550 + }, + { + "epoch": 0.2204873777907486, + "grad_norm": 0.9104080685412203, + "learning_rate": 4.651850277094042e-05, + "loss": 0.5673, + "step": 7555 + }, + { + "epoch": 0.22063329928498468, + "grad_norm": 1.124816621662688, + "learning_rate": 4.6512698801417764e-05, + "loss": 0.6861, + "step": 7560 + }, + { + "epoch": 0.22077922077922077, + "grad_norm": 0.9569717256213983, + "learning_rate": 4.65068904044331e-05, + "loss": 0.6211, + "step": 7565 + }, + { + "epoch": 0.2209251422734569, + "grad_norm": 0.9835127678564364, + "learning_rate": 4.650107758133902e-05, + "loss": 0.6003, + "step": 7570 + }, + { + "epoch": 0.22107106376769298, + "grad_norm": 0.9500234760155296, + "learning_rate": 4.649526033348914e-05, + "loss": 0.6293, + "step": 7575 + }, + { + "epoch": 0.2212169852619291, + "grad_norm": 1.1420086690552729, + "learning_rate": 4.648943866223813e-05, + "loss": 0.6669, + "step": 7580 + }, + { + "epoch": 0.22136290675616518, + "grad_norm": 1.105151708378207, + "learning_rate": 4.648361256894165e-05, + "loss": 0.6936, + "step": 7585 + }, + { + "epoch": 0.22150882825040127, + "grad_norm": 1.4111895099639133, + "learning_rate": 4.6477782054956434e-05, + "loss": 0.6792, + "step": 7590 + }, + { + "epoch": 0.2216547497446374, + "grad_norm": 0.9960524177161135, + "learning_rate": 4.647194712164023e-05, + "loss": 0.6881, + "step": 7595 + }, + { + "epoch": 0.22180067123887348, + "grad_norm": 1.0727881848830199, + "learning_rate": 4.6466107770351805e-05, + "loss": 0.6765, + "step": 7600 + }, + { + "epoch": 0.2219465927331096, + "grad_norm": 0.9493967287522872, + "learning_rate": 4.6460264002450955e-05, + "loss": 0.6388, + "step": 7605 + }, + { + "epoch": 0.2220925142273457, + "grad_norm": 1.0857764231493143, + "learning_rate": 4.645441581929851e-05, + "loss": 0.6019, + "step": 7610 + }, + { + "epoch": 0.22223843572158178, + "grad_norm": 1.1562511495727308, + "learning_rate": 4.6448563222256336e-05, + "loss": 0.6299, + "step": 7615 + }, + { + "epoch": 0.2223843572158179, + "grad_norm": 0.904999610580651, + "learning_rate": 4.644270621268732e-05, + "loss": 0.6241, + "step": 7620 + }, + { + "epoch": 0.22253027871005399, + "grad_norm": 1.0347466023762502, + "learning_rate": 4.6436844791955364e-05, + "loss": 0.5937, + "step": 7625 + }, + { + "epoch": 0.2226762002042901, + "grad_norm": 1.0193732613943782, + "learning_rate": 4.6430978961425434e-05, + "loss": 0.6303, + "step": 7630 + }, + { + "epoch": 0.2228221216985262, + "grad_norm": 0.9735957266428596, + "learning_rate": 4.642510872246347e-05, + "loss": 0.597, + "step": 7635 + }, + { + "epoch": 0.22296804319276228, + "grad_norm": 1.1488660874717775, + "learning_rate": 4.641923407643648e-05, + "loss": 0.5677, + "step": 7640 + }, + { + "epoch": 0.2231139646869984, + "grad_norm": 0.9736238729284256, + "learning_rate": 4.641335502471249e-05, + "loss": 0.6511, + "step": 7645 + }, + { + "epoch": 0.2232598861812345, + "grad_norm": 1.127096262411377, + "learning_rate": 4.6407471568660536e-05, + "loss": 0.7143, + "step": 7650 + }, + { + "epoch": 0.2234058076754706, + "grad_norm": 1.2861637996060618, + "learning_rate": 4.6401583709650695e-05, + "loss": 0.6601, + "step": 7655 + }, + { + "epoch": 0.2235517291697067, + "grad_norm": 1.1626155280170258, + "learning_rate": 4.639569144905407e-05, + "loss": 0.5874, + "step": 7660 + }, + { + "epoch": 0.2236976506639428, + "grad_norm": 1.1450179666918712, + "learning_rate": 4.638979478824279e-05, + "loss": 0.6883, + "step": 7665 + }, + { + "epoch": 0.2238435721581789, + "grad_norm": 1.0276036788467968, + "learning_rate": 4.6383893728589975e-05, + "loss": 0.6587, + "step": 7670 + }, + { + "epoch": 0.223989493652415, + "grad_norm": 0.893358429157451, + "learning_rate": 4.637798827146982e-05, + "loss": 0.634, + "step": 7675 + }, + { + "epoch": 0.2241354151466511, + "grad_norm": 1.2468015038346163, + "learning_rate": 4.637207841825751e-05, + "loss": 0.6122, + "step": 7680 + }, + { + "epoch": 0.2242813366408872, + "grad_norm": 1.0105129161254078, + "learning_rate": 4.6366164170329284e-05, + "loss": 0.6172, + "step": 7685 + }, + { + "epoch": 0.2244272581351233, + "grad_norm": 0.9546960535958109, + "learning_rate": 4.6360245529062356e-05, + "loss": 0.6712, + "step": 7690 + }, + { + "epoch": 0.2245731796293594, + "grad_norm": 0.9246250406019902, + "learning_rate": 4.635432249583502e-05, + "loss": 0.6112, + "step": 7695 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.961446412475409, + "learning_rate": 4.634839507202654e-05, + "loss": 0.6771, + "step": 7700 + }, + { + "epoch": 0.22486502261783162, + "grad_norm": 1.1360914160925013, + "learning_rate": 4.634246325901725e-05, + "loss": 0.6564, + "step": 7705 + }, + { + "epoch": 0.2250109441120677, + "grad_norm": 1.0333366042083656, + "learning_rate": 4.633652705818847e-05, + "loss": 0.6196, + "step": 7710 + }, + { + "epoch": 0.2251568656063038, + "grad_norm": 1.1391768680085324, + "learning_rate": 4.633058647092256e-05, + "loss": 0.623, + "step": 7715 + }, + { + "epoch": 0.22530278710053991, + "grad_norm": 1.0248908811314938, + "learning_rate": 4.632464149860289e-05, + "loss": 0.6221, + "step": 7720 + }, + { + "epoch": 0.225448708594776, + "grad_norm": 0.9108781939269617, + "learning_rate": 4.631869214261386e-05, + "loss": 0.661, + "step": 7725 + }, + { + "epoch": 0.22559463008901212, + "grad_norm": 1.0285466781606194, + "learning_rate": 4.631273840434091e-05, + "loss": 0.6735, + "step": 7730 + }, + { + "epoch": 0.2257405515832482, + "grad_norm": 1.2330159732465003, + "learning_rate": 4.6306780285170434e-05, + "loss": 0.6234, + "step": 7735 + }, + { + "epoch": 0.2258864730774843, + "grad_norm": 1.1110321228740665, + "learning_rate": 4.630081778648994e-05, + "loss": 0.6139, + "step": 7740 + }, + { + "epoch": 0.22603239457172042, + "grad_norm": 1.0018857334264872, + "learning_rate": 4.629485090968788e-05, + "loss": 0.7078, + "step": 7745 + }, + { + "epoch": 0.2261783160659565, + "grad_norm": 1.1962417503635852, + "learning_rate": 4.6288879656153746e-05, + "loss": 0.6401, + "step": 7750 + }, + { + "epoch": 0.22632423756019263, + "grad_norm": 1.5167026681710511, + "learning_rate": 4.6282904027278074e-05, + "loss": 0.703, + "step": 7755 + }, + { + "epoch": 0.22647015905442872, + "grad_norm": 1.158930895185318, + "learning_rate": 4.62769240244524e-05, + "loss": 0.6592, + "step": 7760 + }, + { + "epoch": 0.2266160805486648, + "grad_norm": 0.9992968381378073, + "learning_rate": 4.627093964906927e-05, + "loss": 0.6375, + "step": 7765 + }, + { + "epoch": 0.22676200204290092, + "grad_norm": 0.8744079986899453, + "learning_rate": 4.6264950902522253e-05, + "loss": 0.6297, + "step": 7770 + }, + { + "epoch": 0.226907923537137, + "grad_norm": 1.0743209051150564, + "learning_rate": 4.625895778620597e-05, + "loss": 0.6679, + "step": 7775 + }, + { + "epoch": 0.22705384503137313, + "grad_norm": 0.9659038976755095, + "learning_rate": 4.6252960301515984e-05, + "loss": 0.5621, + "step": 7780 + }, + { + "epoch": 0.22719976652560922, + "grad_norm": 1.2384217917337363, + "learning_rate": 4.624695844984895e-05, + "loss": 0.6708, + "step": 7785 + }, + { + "epoch": 0.2273456880198453, + "grad_norm": 1.0826045097940296, + "learning_rate": 4.624095223260253e-05, + "loss": 0.6473, + "step": 7790 + }, + { + "epoch": 0.22749160951408143, + "grad_norm": 0.9188450931386862, + "learning_rate": 4.623494165117535e-05, + "loss": 0.6237, + "step": 7795 + }, + { + "epoch": 0.22763753100831752, + "grad_norm": 0.9859928298860685, + "learning_rate": 4.622892670696708e-05, + "loss": 0.6464, + "step": 7800 + }, + { + "epoch": 0.22778345250255363, + "grad_norm": 1.1669358062041992, + "learning_rate": 4.622290740137845e-05, + "loss": 0.6264, + "step": 7805 + }, + { + "epoch": 0.22792937399678972, + "grad_norm": 1.011989914026944, + "learning_rate": 4.6216883735811146e-05, + "loss": 0.6276, + "step": 7810 + }, + { + "epoch": 0.22807529549102581, + "grad_norm": 0.9732925583985184, + "learning_rate": 4.6210855711667885e-05, + "loss": 0.6405, + "step": 7815 + }, + { + "epoch": 0.22822121698526193, + "grad_norm": 0.9784160267226092, + "learning_rate": 4.620482333035241e-05, + "loss": 0.6491, + "step": 7820 + }, + { + "epoch": 0.22836713847949802, + "grad_norm": 0.950981513343793, + "learning_rate": 4.619878659326948e-05, + "loss": 0.6567, + "step": 7825 + }, + { + "epoch": 0.22851305997373414, + "grad_norm": 0.9936534018963062, + "learning_rate": 4.6192745501824865e-05, + "loss": 0.6688, + "step": 7830 + }, + { + "epoch": 0.22865898146797023, + "grad_norm": 0.9694163741131531, + "learning_rate": 4.618670005742533e-05, + "loss": 0.5973, + "step": 7835 + }, + { + "epoch": 0.22880490296220635, + "grad_norm": 1.2899515244708895, + "learning_rate": 4.618065026147868e-05, + "loss": 0.6725, + "step": 7840 + }, + { + "epoch": 0.22895082445644244, + "grad_norm": 1.005719952855351, + "learning_rate": 4.617459611539372e-05, + "loss": 0.6579, + "step": 7845 + }, + { + "epoch": 0.22909674595067853, + "grad_norm": 0.8829964980674914, + "learning_rate": 4.616853762058028e-05, + "loss": 0.6145, + "step": 7850 + }, + { + "epoch": 0.22924266744491464, + "grad_norm": 0.8873918242522172, + "learning_rate": 4.6162474778449184e-05, + "loss": 0.5489, + "step": 7855 + }, + { + "epoch": 0.22938858893915073, + "grad_norm": 1.0776994625416034, + "learning_rate": 4.6156407590412285e-05, + "loss": 0.6575, + "step": 7860 + }, + { + "epoch": 0.22953451043338685, + "grad_norm": 1.1758551600791651, + "learning_rate": 4.615033605788242e-05, + "loss": 0.6575, + "step": 7865 + }, + { + "epoch": 0.22968043192762294, + "grad_norm": 0.8689326529679396, + "learning_rate": 4.614426018227348e-05, + "loss": 0.651, + "step": 7870 + }, + { + "epoch": 0.22982635342185903, + "grad_norm": 1.0663020340156029, + "learning_rate": 4.613817996500036e-05, + "loss": 0.6443, + "step": 7875 + }, + { + "epoch": 0.22997227491609515, + "grad_norm": 0.7975763582854271, + "learning_rate": 4.613209540747891e-05, + "loss": 0.6341, + "step": 7880 + }, + { + "epoch": 0.23011819641033124, + "grad_norm": 1.0200147171388074, + "learning_rate": 4.612600651112607e-05, + "loss": 0.6805, + "step": 7885 + }, + { + "epoch": 0.23026411790456736, + "grad_norm": 0.9997976272014104, + "learning_rate": 4.611991327735973e-05, + "loss": 0.6419, + "step": 7890 + }, + { + "epoch": 0.23041003939880345, + "grad_norm": 1.0124966414078505, + "learning_rate": 4.611381570759883e-05, + "loss": 0.6435, + "step": 7895 + }, + { + "epoch": 0.23055596089303954, + "grad_norm": 1.0148534133246132, + "learning_rate": 4.61077138032633e-05, + "loss": 0.6597, + "step": 7900 + }, + { + "epoch": 0.23070188238727565, + "grad_norm": 1.0487671208762825, + "learning_rate": 4.610160756577406e-05, + "loss": 0.6318, + "step": 7905 + }, + { + "epoch": 0.23084780388151174, + "grad_norm": 0.9922302979776743, + "learning_rate": 4.6095496996553095e-05, + "loss": 0.6464, + "step": 7910 + }, + { + "epoch": 0.23099372537574786, + "grad_norm": 1.0756828244911314, + "learning_rate": 4.6089382097023345e-05, + "loss": 0.6208, + "step": 7915 + }, + { + "epoch": 0.23113964686998395, + "grad_norm": 0.9858037716854757, + "learning_rate": 4.608326286860878e-05, + "loss": 0.6216, + "step": 7920 + }, + { + "epoch": 0.23128556836422004, + "grad_norm": 1.0829510209891522, + "learning_rate": 4.607713931273439e-05, + "loss": 0.6361, + "step": 7925 + }, + { + "epoch": 0.23143148985845616, + "grad_norm": 0.9828157474884484, + "learning_rate": 4.607101143082614e-05, + "loss": 0.6119, + "step": 7930 + }, + { + "epoch": 0.23157741135269225, + "grad_norm": 0.9218204062158182, + "learning_rate": 4.606487922431104e-05, + "loss": 0.6352, + "step": 7935 + }, + { + "epoch": 0.23172333284692836, + "grad_norm": 1.2627713029350476, + "learning_rate": 4.6058742694617075e-05, + "loss": 0.6829, + "step": 7940 + }, + { + "epoch": 0.23186925434116445, + "grad_norm": 1.0605732958080172, + "learning_rate": 4.605260184317325e-05, + "loss": 0.6669, + "step": 7945 + }, + { + "epoch": 0.23201517583540054, + "grad_norm": 1.2039780403227063, + "learning_rate": 4.604645667140959e-05, + "loss": 0.6668, + "step": 7950 + }, + { + "epoch": 0.23216109732963666, + "grad_norm": 1.3794917993873899, + "learning_rate": 4.6040307180757095e-05, + "loss": 0.6919, + "step": 7955 + }, + { + "epoch": 0.23230701882387275, + "grad_norm": 1.1482924914290773, + "learning_rate": 4.603415337264781e-05, + "loss": 0.6708, + "step": 7960 + }, + { + "epoch": 0.23245294031810887, + "grad_norm": 0.9893061667380306, + "learning_rate": 4.602799524851476e-05, + "loss": 0.664, + "step": 7965 + }, + { + "epoch": 0.23259886181234496, + "grad_norm": 1.285670934417748, + "learning_rate": 4.602183280979195e-05, + "loss": 0.6462, + "step": 7970 + }, + { + "epoch": 0.23274478330658105, + "grad_norm": 1.1094735932361646, + "learning_rate": 4.6015666057914456e-05, + "loss": 0.6972, + "step": 7975 + }, + { + "epoch": 0.23289070480081717, + "grad_norm": 1.03649916776004, + "learning_rate": 4.60094949943183e-05, + "loss": 0.6695, + "step": 7980 + }, + { + "epoch": 0.23303662629505326, + "grad_norm": 0.9357764020518337, + "learning_rate": 4.600331962044055e-05, + "loss": 0.5931, + "step": 7985 + }, + { + "epoch": 0.23318254778928937, + "grad_norm": 1.0140902828062521, + "learning_rate": 4.599713993771923e-05, + "loss": 0.5776, + "step": 7990 + }, + { + "epoch": 0.23332846928352546, + "grad_norm": 1.0311708165876745, + "learning_rate": 4.5990955947593415e-05, + "loss": 0.6237, + "step": 7995 + }, + { + "epoch": 0.23347439077776155, + "grad_norm": 0.8908995237987619, + "learning_rate": 4.598476765150315e-05, + "loss": 0.6277, + "step": 8000 + }, + { + "epoch": 0.23362031227199767, + "grad_norm": 1.2632321959770048, + "learning_rate": 4.59785750508895e-05, + "loss": 0.6476, + "step": 8005 + }, + { + "epoch": 0.23376623376623376, + "grad_norm": 1.0704734377102212, + "learning_rate": 4.5972378147194526e-05, + "loss": 0.679, + "step": 8010 + }, + { + "epoch": 0.23391215526046988, + "grad_norm": 1.0413044935624627, + "learning_rate": 4.59661769418613e-05, + "loss": 0.6957, + "step": 8015 + }, + { + "epoch": 0.23405807675470597, + "grad_norm": 1.001972236043797, + "learning_rate": 4.595997143633389e-05, + "loss": 0.6207, + "step": 8020 + }, + { + "epoch": 0.23420399824894206, + "grad_norm": 1.0369600631181088, + "learning_rate": 4.5953761632057354e-05, + "loss": 0.6807, + "step": 8025 + }, + { + "epoch": 0.23434991974317818, + "grad_norm": 0.9484125250865714, + "learning_rate": 4.594754753047776e-05, + "loss": 0.6598, + "step": 8030 + }, + { + "epoch": 0.23449584123741427, + "grad_norm": 0.9575622002482312, + "learning_rate": 4.594132913304219e-05, + "loss": 0.6517, + "step": 8035 + }, + { + "epoch": 0.23464176273165038, + "grad_norm": 1.1696590967036824, + "learning_rate": 4.5935106441198696e-05, + "loss": 0.6378, + "step": 8040 + }, + { + "epoch": 0.23478768422588647, + "grad_norm": 1.0122955899702997, + "learning_rate": 4.5928879456396364e-05, + "loss": 0.6531, + "step": 8045 + }, + { + "epoch": 0.23493360572012256, + "grad_norm": 1.0435555441240427, + "learning_rate": 4.592264818008526e-05, + "loss": 0.6529, + "step": 8050 + }, + { + "epoch": 0.23507952721435868, + "grad_norm": 0.8714261880951645, + "learning_rate": 4.591641261371644e-05, + "loss": 0.6241, + "step": 8055 + }, + { + "epoch": 0.23522544870859477, + "grad_norm": 0.9694657008764006, + "learning_rate": 4.5910172758742e-05, + "loss": 0.6399, + "step": 8060 + }, + { + "epoch": 0.2353713702028309, + "grad_norm": 1.0901896256951364, + "learning_rate": 4.590392861661497e-05, + "loss": 0.5976, + "step": 8065 + }, + { + "epoch": 0.23551729169706698, + "grad_norm": 1.1780420339147422, + "learning_rate": 4.589768018878945e-05, + "loss": 0.6908, + "step": 8070 + }, + { + "epoch": 0.23566321319130307, + "grad_norm": 1.1293836913029043, + "learning_rate": 4.589142747672048e-05, + "loss": 0.6715, + "step": 8075 + }, + { + "epoch": 0.23580913468553918, + "grad_norm": 0.9637251883730631, + "learning_rate": 4.588517048186414e-05, + "loss": 0.6194, + "step": 8080 + }, + { + "epoch": 0.23595505617977527, + "grad_norm": 1.055958609880301, + "learning_rate": 4.587890920567746e-05, + "loss": 0.6513, + "step": 8085 + }, + { + "epoch": 0.2361009776740114, + "grad_norm": 1.1356198807463125, + "learning_rate": 4.5872643649618525e-05, + "loss": 0.6765, + "step": 8090 + }, + { + "epoch": 0.23624689916824748, + "grad_norm": 1.1309373299141614, + "learning_rate": 4.5866373815146376e-05, + "loss": 0.6277, + "step": 8095 + }, + { + "epoch": 0.23639282066248357, + "grad_norm": 1.1037383743425087, + "learning_rate": 4.5860099703721046e-05, + "loss": 0.6472, + "step": 8100 + }, + { + "epoch": 0.2365387421567197, + "grad_norm": 1.0501082677608518, + "learning_rate": 4.58538213168036e-05, + "loss": 0.6222, + "step": 8105 + }, + { + "epoch": 0.23668466365095578, + "grad_norm": 1.0830137055276021, + "learning_rate": 4.584753865585607e-05, + "loss": 0.6814, + "step": 8110 + }, + { + "epoch": 0.2368305851451919, + "grad_norm": 0.9762513270228858, + "learning_rate": 4.584125172234148e-05, + "loss": 0.5637, + "step": 8115 + }, + { + "epoch": 0.236976506639428, + "grad_norm": 0.9170030123885389, + "learning_rate": 4.5834960517723886e-05, + "loss": 0.6327, + "step": 8120 + }, + { + "epoch": 0.23712242813366408, + "grad_norm": 1.346858358083094, + "learning_rate": 4.582866504346829e-05, + "loss": 0.6949, + "step": 8125 + }, + { + "epoch": 0.2372683496279002, + "grad_norm": 1.0356137902428821, + "learning_rate": 4.5822365301040715e-05, + "loss": 0.6582, + "step": 8130 + }, + { + "epoch": 0.23741427112213628, + "grad_norm": 0.9622530458907691, + "learning_rate": 4.581606129190818e-05, + "loss": 0.6768, + "step": 8135 + }, + { + "epoch": 0.2375601926163724, + "grad_norm": 1.0684809591415723, + "learning_rate": 4.5809753017538686e-05, + "loss": 0.6712, + "step": 8140 + }, + { + "epoch": 0.2377061141106085, + "grad_norm": 1.0635318856228781, + "learning_rate": 4.580344047940123e-05, + "loss": 0.5895, + "step": 8145 + }, + { + "epoch": 0.23785203560484458, + "grad_norm": 1.0955118622977071, + "learning_rate": 4.5797123678965814e-05, + "loss": 0.6034, + "step": 8150 + }, + { + "epoch": 0.2379979570990807, + "grad_norm": 1.2686453968599383, + "learning_rate": 4.579080261770342e-05, + "loss": 0.6942, + "step": 8155 + }, + { + "epoch": 0.2381438785933168, + "grad_norm": 1.2471436362989532, + "learning_rate": 4.578447729708602e-05, + "loss": 0.6804, + "step": 8160 + }, + { + "epoch": 0.2382898000875529, + "grad_norm": 1.045783213201761, + "learning_rate": 4.577814771858658e-05, + "loss": 0.6716, + "step": 8165 + }, + { + "epoch": 0.238435721581789, + "grad_norm": 1.1585868358342493, + "learning_rate": 4.5771813883679065e-05, + "loss": 0.7187, + "step": 8170 + }, + { + "epoch": 0.23858164307602509, + "grad_norm": 1.130478461335575, + "learning_rate": 4.576547579383844e-05, + "loss": 0.619, + "step": 8175 + }, + { + "epoch": 0.2387275645702612, + "grad_norm": 1.0630748840472055, + "learning_rate": 4.575913345054062e-05, + "loss": 0.6305, + "step": 8180 + }, + { + "epoch": 0.2388734860644973, + "grad_norm": 1.0176127054434814, + "learning_rate": 4.575278685526257e-05, + "loss": 0.6032, + "step": 8185 + }, + { + "epoch": 0.2390194075587334, + "grad_norm": 1.1287092283985667, + "learning_rate": 4.574643600948219e-05, + "loss": 0.6703, + "step": 8190 + }, + { + "epoch": 0.2391653290529695, + "grad_norm": 0.9424134334247461, + "learning_rate": 4.57400809146784e-05, + "loss": 0.6561, + "step": 8195 + }, + { + "epoch": 0.2393112505472056, + "grad_norm": 1.200322925664898, + "learning_rate": 4.573372157233109e-05, + "loss": 0.7545, + "step": 8200 + }, + { + "epoch": 0.2394571720414417, + "grad_norm": 1.228633216218335, + "learning_rate": 4.5727357983921184e-05, + "loss": 0.6702, + "step": 8205 + }, + { + "epoch": 0.2396030935356778, + "grad_norm": 0.9490043695159041, + "learning_rate": 4.572099015093053e-05, + "loss": 0.6463, + "step": 8210 + }, + { + "epoch": 0.23974901502991391, + "grad_norm": 1.1769671690484456, + "learning_rate": 4.571461807484202e-05, + "loss": 0.6321, + "step": 8215 + }, + { + "epoch": 0.23989493652415, + "grad_norm": 1.0427798594595943, + "learning_rate": 4.570824175713949e-05, + "loss": 0.6399, + "step": 8220 + }, + { + "epoch": 0.24004085801838612, + "grad_norm": 1.0942267393901677, + "learning_rate": 4.57018611993078e-05, + "loss": 0.606, + "step": 8225 + }, + { + "epoch": 0.2401867795126222, + "grad_norm": 1.1724021908143896, + "learning_rate": 4.5695476402832774e-05, + "loss": 0.6503, + "step": 8230 + }, + { + "epoch": 0.2403327010068583, + "grad_norm": 1.0117924318821647, + "learning_rate": 4.568908736920123e-05, + "loss": 0.5978, + "step": 8235 + }, + { + "epoch": 0.24047862250109442, + "grad_norm": 0.9965386070949268, + "learning_rate": 4.568269409990098e-05, + "loss": 0.6359, + "step": 8240 + }, + { + "epoch": 0.2406245439953305, + "grad_norm": 0.9885244588560603, + "learning_rate": 4.5676296596420825e-05, + "loss": 0.6078, + "step": 8245 + }, + { + "epoch": 0.24077046548956663, + "grad_norm": 1.2099127854577707, + "learning_rate": 4.566989486025053e-05, + "loss": 0.6809, + "step": 8250 + }, + { + "epoch": 0.24091638698380272, + "grad_norm": 0.9826882431929175, + "learning_rate": 4.566348889288085e-05, + "loss": 0.588, + "step": 8255 + }, + { + "epoch": 0.2410623084780388, + "grad_norm": 0.9460549204407736, + "learning_rate": 4.565707869580355e-05, + "loss": 0.6337, + "step": 8260 + }, + { + "epoch": 0.24120822997227492, + "grad_norm": 1.0415505101044353, + "learning_rate": 4.565066427051136e-05, + "loss": 0.6895, + "step": 8265 + }, + { + "epoch": 0.241354151466511, + "grad_norm": 1.0425238728016206, + "learning_rate": 4.5644245618498e-05, + "loss": 0.6206, + "step": 8270 + }, + { + "epoch": 0.24150007296074713, + "grad_norm": 1.0464667302913129, + "learning_rate": 4.563782274125817e-05, + "loss": 0.7063, + "step": 8275 + }, + { + "epoch": 0.24164599445498322, + "grad_norm": 1.1866509120432345, + "learning_rate": 4.563139564028756e-05, + "loss": 0.696, + "step": 8280 + }, + { + "epoch": 0.2417919159492193, + "grad_norm": 1.0836941728350327, + "learning_rate": 4.562496431708283e-05, + "loss": 0.6541, + "step": 8285 + }, + { + "epoch": 0.24193783744345543, + "grad_norm": 1.1270774332039217, + "learning_rate": 4.561852877314165e-05, + "loss": 0.6531, + "step": 8290 + }, + { + "epoch": 0.24208375893769152, + "grad_norm": 1.012840678344838, + "learning_rate": 4.5612089009962645e-05, + "loss": 0.6219, + "step": 8295 + }, + { + "epoch": 0.24222968043192764, + "grad_norm": 1.2387280499643039, + "learning_rate": 4.560564502904544e-05, + "loss": 0.6194, + "step": 8300 + }, + { + "epoch": 0.24237560192616373, + "grad_norm": 1.1672074661401954, + "learning_rate": 4.559919683189063e-05, + "loss": 0.6951, + "step": 8305 + }, + { + "epoch": 0.24252152342039981, + "grad_norm": 1.0688373696208793, + "learning_rate": 4.559274441999981e-05, + "loss": 0.6083, + "step": 8310 + }, + { + "epoch": 0.24266744491463593, + "grad_norm": 1.0240525212158664, + "learning_rate": 4.558628779487553e-05, + "loss": 0.6694, + "step": 8315 + }, + { + "epoch": 0.24281336640887202, + "grad_norm": 0.885445710509539, + "learning_rate": 4.5579826958021346e-05, + "loss": 0.583, + "step": 8320 + }, + { + "epoch": 0.24295928790310814, + "grad_norm": 0.9226945774382187, + "learning_rate": 4.557336191094178e-05, + "loss": 0.658, + "step": 8325 + }, + { + "epoch": 0.24310520939734423, + "grad_norm": 1.2038544331400904, + "learning_rate": 4.5566892655142346e-05, + "loss": 0.597, + "step": 8330 + }, + { + "epoch": 0.24325113089158032, + "grad_norm": 1.0821248713422733, + "learning_rate": 4.556041919212952e-05, + "loss": 0.666, + "step": 8335 + }, + { + "epoch": 0.24339705238581644, + "grad_norm": 1.056849069876137, + "learning_rate": 4.555394152341078e-05, + "loss": 0.6495, + "step": 8340 + }, + { + "epoch": 0.24354297388005253, + "grad_norm": 0.8936922841811944, + "learning_rate": 4.554745965049455e-05, + "loss": 0.6318, + "step": 8345 + }, + { + "epoch": 0.24368889537428864, + "grad_norm": 0.8634715950232043, + "learning_rate": 4.554097357489029e-05, + "loss": 0.657, + "step": 8350 + }, + { + "epoch": 0.24383481686852473, + "grad_norm": 1.2203497179324476, + "learning_rate": 4.5534483298108376e-05, + "loss": 0.6704, + "step": 8355 + }, + { + "epoch": 0.24398073836276082, + "grad_norm": 0.8483026686485925, + "learning_rate": 4.55279888216602e-05, + "loss": 0.5976, + "step": 8360 + }, + { + "epoch": 0.24412665985699694, + "grad_norm": 1.1346864152443763, + "learning_rate": 4.552149014705813e-05, + "loss": 0.704, + "step": 8365 + }, + { + "epoch": 0.24427258135123303, + "grad_norm": 1.0198316606837534, + "learning_rate": 4.551498727581548e-05, + "loss": 0.6574, + "step": 8370 + }, + { + "epoch": 0.24441850284546915, + "grad_norm": 1.0515082285160497, + "learning_rate": 4.550848020944659e-05, + "loss": 0.6658, + "step": 8375 + }, + { + "epoch": 0.24456442433970524, + "grad_norm": 1.133774436600005, + "learning_rate": 4.550196894946674e-05, + "loss": 0.653, + "step": 8380 + }, + { + "epoch": 0.24471034583394133, + "grad_norm": 0.9605194852435424, + "learning_rate": 4.5495453497392204e-05, + "loss": 0.5541, + "step": 8385 + }, + { + "epoch": 0.24485626732817745, + "grad_norm": 1.1635317660708095, + "learning_rate": 4.5488933854740216e-05, + "loss": 0.6443, + "step": 8390 + }, + { + "epoch": 0.24500218882241354, + "grad_norm": 0.989708829256966, + "learning_rate": 4.548241002302901e-05, + "loss": 0.6283, + "step": 8395 + }, + { + "epoch": 0.24514811031664965, + "grad_norm": 0.8773266072878004, + "learning_rate": 4.547588200377777e-05, + "loss": 0.6002, + "step": 8400 + }, + { + "epoch": 0.24529403181088574, + "grad_norm": 0.8716366177549859, + "learning_rate": 4.5469349798506676e-05, + "loss": 0.5974, + "step": 8405 + }, + { + "epoch": 0.24543995330512183, + "grad_norm": 1.152374746893429, + "learning_rate": 4.546281340873688e-05, + "loss": 0.6654, + "step": 8410 + }, + { + "epoch": 0.24558587479935795, + "grad_norm": 1.1378550464077823, + "learning_rate": 4.545627283599048e-05, + "loss": 0.6309, + "step": 8415 + }, + { + "epoch": 0.24573179629359404, + "grad_norm": 1.0919652209746074, + "learning_rate": 4.5449728081790595e-05, + "loss": 0.6122, + "step": 8420 + }, + { + "epoch": 0.24587771778783016, + "grad_norm": 0.972255598277924, + "learning_rate": 4.544317914766128e-05, + "loss": 0.6211, + "step": 8425 + }, + { + "epoch": 0.24602363928206625, + "grad_norm": 0.9853205829633739, + "learning_rate": 4.543662603512758e-05, + "loss": 0.6416, + "step": 8430 + }, + { + "epoch": 0.24616956077630234, + "grad_norm": 1.056632555575449, + "learning_rate": 4.54300687457155e-05, + "loss": 0.6182, + "step": 8435 + }, + { + "epoch": 0.24631548227053846, + "grad_norm": 0.904331251783767, + "learning_rate": 4.5423507280952035e-05, + "loss": 0.6268, + "step": 8440 + }, + { + "epoch": 0.24646140376477454, + "grad_norm": 1.0785131583947387, + "learning_rate": 4.5416941642365156e-05, + "loss": 0.7279, + "step": 8445 + }, + { + "epoch": 0.24660732525901066, + "grad_norm": 0.9408671732821459, + "learning_rate": 4.541037183148378e-05, + "loss": 0.5732, + "step": 8450 + }, + { + "epoch": 0.24675324675324675, + "grad_norm": 0.9558067477128036, + "learning_rate": 4.540379784983782e-05, + "loss": 0.6305, + "step": 8455 + }, + { + "epoch": 0.24689916824748284, + "grad_norm": 1.365267892482908, + "learning_rate": 4.5397219698958135e-05, + "loss": 0.634, + "step": 8460 + }, + { + "epoch": 0.24704508974171896, + "grad_norm": 1.078231203840172, + "learning_rate": 4.5390637380376584e-05, + "loss": 0.6261, + "step": 8465 + }, + { + "epoch": 0.24719101123595505, + "grad_norm": 0.9099464200016814, + "learning_rate": 4.538405089562598e-05, + "loss": 0.6064, + "step": 8470 + }, + { + "epoch": 0.24733693273019117, + "grad_norm": 1.0196934506706474, + "learning_rate": 4.537746024624011e-05, + "loss": 0.6695, + "step": 8475 + }, + { + "epoch": 0.24748285422442726, + "grad_norm": 1.0435196695261786, + "learning_rate": 4.537086543375373e-05, + "loss": 0.5878, + "step": 8480 + }, + { + "epoch": 0.24762877571866335, + "grad_norm": 0.9955620064745526, + "learning_rate": 4.536426645970255e-05, + "loss": 0.5973, + "step": 8485 + }, + { + "epoch": 0.24777469721289946, + "grad_norm": 1.017454961241555, + "learning_rate": 4.535766332562329e-05, + "loss": 0.5482, + "step": 8490 + }, + { + "epoch": 0.24792061870713555, + "grad_norm": 0.9880543818092671, + "learning_rate": 4.535105603305359e-05, + "loss": 0.6253, + "step": 8495 + }, + { + "epoch": 0.24806654020137167, + "grad_norm": 1.090615192104329, + "learning_rate": 4.53444445835321e-05, + "loss": 0.6516, + "step": 8500 + }, + { + "epoch": 0.24821246169560776, + "grad_norm": 0.9290383012488898, + "learning_rate": 4.533782897859841e-05, + "loss": 0.6623, + "step": 8505 + }, + { + "epoch": 0.24835838318984385, + "grad_norm": 0.9009167803126679, + "learning_rate": 4.533120921979309e-05, + "loss": 0.618, + "step": 8510 + }, + { + "epoch": 0.24850430468407997, + "grad_norm": 0.9315590673598744, + "learning_rate": 4.532458530865767e-05, + "loss": 0.6065, + "step": 8515 + }, + { + "epoch": 0.24865022617831606, + "grad_norm": 1.1045170609903816, + "learning_rate": 4.531795724673466e-05, + "loss": 0.6344, + "step": 8520 + }, + { + "epoch": 0.24879614767255218, + "grad_norm": 1.1037701955497576, + "learning_rate": 4.5311325035567506e-05, + "loss": 0.6532, + "step": 8525 + }, + { + "epoch": 0.24894206916678827, + "grad_norm": 1.3239404377087938, + "learning_rate": 4.530468867670068e-05, + "loss": 0.6647, + "step": 8530 + }, + { + "epoch": 0.24908799066102436, + "grad_norm": 0.9195634618115939, + "learning_rate": 4.529804817167955e-05, + "loss": 0.6354, + "step": 8535 + }, + { + "epoch": 0.24923391215526047, + "grad_norm": 1.0231926947010603, + "learning_rate": 4.529140352205049e-05, + "loss": 0.6499, + "step": 8540 + }, + { + "epoch": 0.24937983364949656, + "grad_norm": 1.1107839505169632, + "learning_rate": 4.528475472936083e-05, + "loss": 0.662, + "step": 8545 + }, + { + "epoch": 0.24952575514373268, + "grad_norm": 1.1748706160940787, + "learning_rate": 4.5278101795158887e-05, + "loss": 0.6913, + "step": 8550 + }, + { + "epoch": 0.24967167663796877, + "grad_norm": 1.0095853667048336, + "learning_rate": 4.527144472099389e-05, + "loss": 0.6568, + "step": 8555 + }, + { + "epoch": 0.24981759813220486, + "grad_norm": 0.9869538252845689, + "learning_rate": 4.526478350841607e-05, + "loss": 0.6437, + "step": 8560 + }, + { + "epoch": 0.24996351962644098, + "grad_norm": 1.0706587362468276, + "learning_rate": 4.5258118158976634e-05, + "loss": 0.6694, + "step": 8565 + }, + { + "epoch": 0.25010944112067707, + "grad_norm": 1.0556725259557305, + "learning_rate": 4.5251448674227706e-05, + "loss": 0.7093, + "step": 8570 + }, + { + "epoch": 0.25025536261491316, + "grad_norm": 1.0651615949570774, + "learning_rate": 4.5244775055722424e-05, + "loss": 0.6834, + "step": 8575 + }, + { + "epoch": 0.2504012841091493, + "grad_norm": 1.016408579623856, + "learning_rate": 4.523809730501485e-05, + "loss": 0.624, + "step": 8580 + }, + { + "epoch": 0.2505472056033854, + "grad_norm": 1.108958059462406, + "learning_rate": 4.523141542366003e-05, + "loss": 0.666, + "step": 8585 + }, + { + "epoch": 0.2506931270976215, + "grad_norm": 0.9893682032067783, + "learning_rate": 4.522472941321396e-05, + "loss": 0.65, + "step": 8590 + }, + { + "epoch": 0.25083904859185757, + "grad_norm": 1.0517656203180392, + "learning_rate": 4.5218039275233605e-05, + "loss": 0.6924, + "step": 8595 + }, + { + "epoch": 0.25098497008609366, + "grad_norm": 1.106810116163582, + "learning_rate": 4.521134501127689e-05, + "loss": 0.6378, + "step": 8600 + }, + { + "epoch": 0.2511308915803298, + "grad_norm": 1.0653247405328319, + "learning_rate": 4.52046466229027e-05, + "loss": 0.6794, + "step": 8605 + }, + { + "epoch": 0.2512768130745659, + "grad_norm": 0.935325001010799, + "learning_rate": 4.519794411167087e-05, + "loss": 0.602, + "step": 8610 + }, + { + "epoch": 0.251422734568802, + "grad_norm": 0.948232298837928, + "learning_rate": 4.5191237479142215e-05, + "loss": 0.6187, + "step": 8615 + }, + { + "epoch": 0.2515686560630381, + "grad_norm": 1.1143308928301126, + "learning_rate": 4.51845267268785e-05, + "loss": 0.6335, + "step": 8620 + }, + { + "epoch": 0.25171457755727417, + "grad_norm": 1.1211944331520276, + "learning_rate": 4.517781185644244e-05, + "loss": 0.6318, + "step": 8625 + }, + { + "epoch": 0.2518604990515103, + "grad_norm": 1.0660203848166905, + "learning_rate": 4.517109286939773e-05, + "loss": 0.607, + "step": 8630 + }, + { + "epoch": 0.2520064205457464, + "grad_norm": 1.044750099957479, + "learning_rate": 4.5164369767309e-05, + "loss": 0.6132, + "step": 8635 + }, + { + "epoch": 0.2521523420399825, + "grad_norm": 0.8598749404929849, + "learning_rate": 4.5157642551741864e-05, + "loss": 0.5784, + "step": 8640 + }, + { + "epoch": 0.2522982635342186, + "grad_norm": 0.9280651180986257, + "learning_rate": 4.515091122426287e-05, + "loss": 0.633, + "step": 8645 + }, + { + "epoch": 0.25244418502845467, + "grad_norm": 1.0437175231155282, + "learning_rate": 4.514417578643953e-05, + "loss": 0.6762, + "step": 8650 + }, + { + "epoch": 0.2525901065226908, + "grad_norm": 1.0976874522613829, + "learning_rate": 4.513743623984032e-05, + "loss": 0.622, + "step": 8655 + }, + { + "epoch": 0.2527360280169269, + "grad_norm": 1.1713320919190777, + "learning_rate": 4.513069258603467e-05, + "loss": 0.6391, + "step": 8660 + }, + { + "epoch": 0.252881949511163, + "grad_norm": 1.1693868341335154, + "learning_rate": 4.512394482659297e-05, + "loss": 0.6066, + "step": 8665 + }, + { + "epoch": 0.2530278710053991, + "grad_norm": 0.8934313973242964, + "learning_rate": 4.511719296308656e-05, + "loss": 0.5705, + "step": 8670 + }, + { + "epoch": 0.2531737924996352, + "grad_norm": 1.005388539933998, + "learning_rate": 4.511043699708773e-05, + "loss": 0.6552, + "step": 8675 + }, + { + "epoch": 0.2533197139938713, + "grad_norm": 1.0494716156555224, + "learning_rate": 4.510367693016974e-05, + "loss": 0.6032, + "step": 8680 + }, + { + "epoch": 0.2534656354881074, + "grad_norm": 1.2078200903618985, + "learning_rate": 4.50969127639068e-05, + "loss": 0.6438, + "step": 8685 + }, + { + "epoch": 0.2536115569823435, + "grad_norm": 0.9663618367115041, + "learning_rate": 4.5090144499874065e-05, + "loss": 0.6985, + "step": 8690 + }, + { + "epoch": 0.2537574784765796, + "grad_norm": 1.0777409111031242, + "learning_rate": 4.508337213964766e-05, + "loss": 0.6609, + "step": 8695 + }, + { + "epoch": 0.2539033999708157, + "grad_norm": 1.0048933064083798, + "learning_rate": 4.507659568480464e-05, + "loss": 0.6396, + "step": 8700 + }, + { + "epoch": 0.2540493214650518, + "grad_norm": 0.9707644540521843, + "learning_rate": 4.506981513692305e-05, + "loss": 0.6317, + "step": 8705 + }, + { + "epoch": 0.2541952429592879, + "grad_norm": 1.1263204027773712, + "learning_rate": 4.5063030497581855e-05, + "loss": 0.7023, + "step": 8710 + }, + { + "epoch": 0.254341164453524, + "grad_norm": 1.2280235298093964, + "learning_rate": 4.505624176836099e-05, + "loss": 0.6149, + "step": 8715 + }, + { + "epoch": 0.2544870859477601, + "grad_norm": 0.9889017375908826, + "learning_rate": 4.504944895084132e-05, + "loss": 0.6286, + "step": 8720 + }, + { + "epoch": 0.2546330074419962, + "grad_norm": 1.1044853248915083, + "learning_rate": 4.504265204660472e-05, + "loss": 0.6741, + "step": 8725 + }, + { + "epoch": 0.25477892893623233, + "grad_norm": 1.167529989993183, + "learning_rate": 4.503585105723392e-05, + "loss": 0.6503, + "step": 8730 + }, + { + "epoch": 0.2549248504304684, + "grad_norm": 0.8834601678822187, + "learning_rate": 4.502904598431271e-05, + "loss": 0.5466, + "step": 8735 + }, + { + "epoch": 0.2550707719247045, + "grad_norm": 1.1465013234736792, + "learning_rate": 4.502223682942575e-05, + "loss": 0.6927, + "step": 8740 + }, + { + "epoch": 0.2552166934189406, + "grad_norm": 0.9242748984545789, + "learning_rate": 4.5015423594158687e-05, + "loss": 0.6378, + "step": 8745 + }, + { + "epoch": 0.2553626149131767, + "grad_norm": 0.9314522882806904, + "learning_rate": 4.50086062800981e-05, + "loss": 0.6409, + "step": 8750 + }, + { + "epoch": 0.25550853640741283, + "grad_norm": 1.1333642925365388, + "learning_rate": 4.500178488883154e-05, + "loss": 0.5847, + "step": 8755 + }, + { + "epoch": 0.2556544579016489, + "grad_norm": 0.8985439645057681, + "learning_rate": 4.499495942194749e-05, + "loss": 0.6874, + "step": 8760 + }, + { + "epoch": 0.255800379395885, + "grad_norm": 1.1175351223172236, + "learning_rate": 4.498812988103539e-05, + "loss": 0.701, + "step": 8765 + }, + { + "epoch": 0.2559463008901211, + "grad_norm": 0.9886598131585127, + "learning_rate": 4.4981296267685625e-05, + "loss": 0.6798, + "step": 8770 + }, + { + "epoch": 0.2560922223843572, + "grad_norm": 1.0816684480585688, + "learning_rate": 4.4974458583489535e-05, + "loss": 0.7554, + "step": 8775 + }, + { + "epoch": 0.25623814387859334, + "grad_norm": 1.1262262489071266, + "learning_rate": 4.496761683003938e-05, + "loss": 0.6731, + "step": 8780 + }, + { + "epoch": 0.25638406537282943, + "grad_norm": 1.0711936800059774, + "learning_rate": 4.496077100892842e-05, + "loss": 0.6485, + "step": 8785 + }, + { + "epoch": 0.2565299868670655, + "grad_norm": 1.0638662267386898, + "learning_rate": 4.4953921121750816e-05, + "loss": 0.6413, + "step": 8790 + }, + { + "epoch": 0.2566759083613016, + "grad_norm": 1.027071226153336, + "learning_rate": 4.494706717010169e-05, + "loss": 0.6422, + "step": 8795 + }, + { + "epoch": 0.2568218298555377, + "grad_norm": 1.1281795437000612, + "learning_rate": 4.494020915557713e-05, + "loss": 0.7148, + "step": 8800 + }, + { + "epoch": 0.25696775134977384, + "grad_norm": 1.091748509726268, + "learning_rate": 4.493334707977413e-05, + "loss": 0.6536, + "step": 8805 + }, + { + "epoch": 0.25711367284400993, + "grad_norm": 1.0596611242723968, + "learning_rate": 4.492648094429067e-05, + "loss": 0.6559, + "step": 8810 + }, + { + "epoch": 0.257259594338246, + "grad_norm": 1.0317673128159683, + "learning_rate": 4.491961075072564e-05, + "loss": 0.7173, + "step": 8815 + }, + { + "epoch": 0.2574055158324821, + "grad_norm": 1.196954737433136, + "learning_rate": 4.491273650067891e-05, + "loss": 0.6859, + "step": 8820 + }, + { + "epoch": 0.2575514373267182, + "grad_norm": 1.045284749576078, + "learning_rate": 4.4905858195751284e-05, + "loss": 0.6137, + "step": 8825 + }, + { + "epoch": 0.25769735882095435, + "grad_norm": 0.9940565864241572, + "learning_rate": 4.4898975837544474e-05, + "loss": 0.6506, + "step": 8830 + }, + { + "epoch": 0.25784328031519044, + "grad_norm": 1.0359292098971435, + "learning_rate": 4.489208942766119e-05, + "loss": 0.6134, + "step": 8835 + }, + { + "epoch": 0.2579892018094265, + "grad_norm": 1.2482575873253083, + "learning_rate": 4.488519896770505e-05, + "loss": 0.6277, + "step": 8840 + }, + { + "epoch": 0.2581351233036626, + "grad_norm": 0.9570472557612177, + "learning_rate": 4.4878304459280635e-05, + "loss": 0.5576, + "step": 8845 + }, + { + "epoch": 0.2582810447978987, + "grad_norm": 0.9820151190363825, + "learning_rate": 4.487140590399345e-05, + "loss": 0.6412, + "step": 8850 + }, + { + "epoch": 0.25842696629213485, + "grad_norm": 1.5260257101818073, + "learning_rate": 4.4864503303449965e-05, + "loss": 0.7492, + "step": 8855 + }, + { + "epoch": 0.25857288778637094, + "grad_norm": 0.9709510842242115, + "learning_rate": 4.485759665925756e-05, + "loss": 0.5788, + "step": 8860 + }, + { + "epoch": 0.25871880928060703, + "grad_norm": 1.030335770730899, + "learning_rate": 4.48506859730246e-05, + "loss": 0.6275, + "step": 8865 + }, + { + "epoch": 0.2588647307748431, + "grad_norm": 1.148293871146626, + "learning_rate": 4.484377124636035e-05, + "loss": 0.6522, + "step": 8870 + }, + { + "epoch": 0.2590106522690792, + "grad_norm": 1.0315312134836274, + "learning_rate": 4.4836852480875044e-05, + "loss": 0.665, + "step": 8875 + }, + { + "epoch": 0.25915657376331536, + "grad_norm": 0.8996174653397238, + "learning_rate": 4.482992967817984e-05, + "loss": 0.5888, + "step": 8880 + }, + { + "epoch": 0.25930249525755145, + "grad_norm": 1.0282357250944139, + "learning_rate": 4.482300283988684e-05, + "loss": 0.6569, + "step": 8885 + }, + { + "epoch": 0.25944841675178754, + "grad_norm": 0.9655238959303517, + "learning_rate": 4.481607196760909e-05, + "loss": 0.6178, + "step": 8890 + }, + { + "epoch": 0.2595943382460236, + "grad_norm": 0.863262081048551, + "learning_rate": 4.4809137062960574e-05, + "loss": 0.6088, + "step": 8895 + }, + { + "epoch": 0.2597402597402597, + "grad_norm": 0.8698525165931709, + "learning_rate": 4.4802198127556213e-05, + "loss": 0.6184, + "step": 8900 + }, + { + "epoch": 0.25988618123449586, + "grad_norm": 1.0250804577905368, + "learning_rate": 4.479525516301187e-05, + "loss": 0.6335, + "step": 8905 + }, + { + "epoch": 0.26003210272873195, + "grad_norm": 1.3822614195453675, + "learning_rate": 4.4788308170944335e-05, + "loss": 0.6248, + "step": 8910 + }, + { + "epoch": 0.26017802422296804, + "grad_norm": 0.9860823565374516, + "learning_rate": 4.478135715297136e-05, + "loss": 0.6489, + "step": 8915 + }, + { + "epoch": 0.26032394571720413, + "grad_norm": 1.058019067413886, + "learning_rate": 4.4774402110711606e-05, + "loss": 0.6251, + "step": 8920 + }, + { + "epoch": 0.2604698672114402, + "grad_norm": 1.00906502323677, + "learning_rate": 4.47674430457847e-05, + "loss": 0.6301, + "step": 8925 + }, + { + "epoch": 0.26061578870567637, + "grad_norm": 1.2686657373904193, + "learning_rate": 4.476047995981117e-05, + "loss": 0.7124, + "step": 8930 + }, + { + "epoch": 0.26076171019991246, + "grad_norm": 0.9058661015445746, + "learning_rate": 4.475351285441251e-05, + "loss": 0.6162, + "step": 8935 + }, + { + "epoch": 0.26090763169414855, + "grad_norm": 1.0833744894301809, + "learning_rate": 4.474654173121115e-05, + "loss": 0.622, + "step": 8940 + }, + { + "epoch": 0.26105355318838463, + "grad_norm": 0.9971828059390756, + "learning_rate": 4.4739566591830426e-05, + "loss": 0.6169, + "step": 8945 + }, + { + "epoch": 0.2611994746826207, + "grad_norm": 1.258582025535961, + "learning_rate": 4.4732587437894645e-05, + "loss": 0.644, + "step": 8950 + }, + { + "epoch": 0.26134539617685687, + "grad_norm": 1.0642359177210923, + "learning_rate": 4.472560427102903e-05, + "loss": 0.6496, + "step": 8955 + }, + { + "epoch": 0.26149131767109296, + "grad_norm": 1.0471021165508552, + "learning_rate": 4.471861709285974e-05, + "loss": 0.6554, + "step": 8960 + }, + { + "epoch": 0.26163723916532905, + "grad_norm": 0.9981258819363995, + "learning_rate": 4.471162590501388e-05, + "loss": 0.6282, + "step": 8965 + }, + { + "epoch": 0.26178316065956514, + "grad_norm": 1.3139455062385663, + "learning_rate": 4.4704630709119464e-05, + "loss": 0.6421, + "step": 8970 + }, + { + "epoch": 0.26192908215380123, + "grad_norm": 1.2672334590660292, + "learning_rate": 4.4697631506805456e-05, + "loss": 0.6542, + "step": 8975 + }, + { + "epoch": 0.2620750036480374, + "grad_norm": 0.8780000848733512, + "learning_rate": 4.4690628299701756e-05, + "loss": 0.5674, + "step": 8980 + }, + { + "epoch": 0.26222092514227346, + "grad_norm": 1.106894253749715, + "learning_rate": 4.4683621089439196e-05, + "loss": 0.7116, + "step": 8985 + }, + { + "epoch": 0.26236684663650955, + "grad_norm": 0.94579533978562, + "learning_rate": 4.467660987764952e-05, + "loss": 0.638, + "step": 8990 + }, + { + "epoch": 0.26251276813074564, + "grad_norm": 0.9336607293029315, + "learning_rate": 4.466959466596543e-05, + "loss": 0.5988, + "step": 8995 + }, + { + "epoch": 0.26265868962498173, + "grad_norm": 1.106859738574095, + "learning_rate": 4.466257545602056e-05, + "loss": 0.6259, + "step": 9000 + }, + { + "epoch": 0.2628046111192179, + "grad_norm": 1.1534969708972123, + "learning_rate": 4.465555224944944e-05, + "loss": 0.5885, + "step": 9005 + }, + { + "epoch": 0.26295053261345397, + "grad_norm": 1.1529115198321225, + "learning_rate": 4.4648525047887576e-05, + "loss": 0.6453, + "step": 9010 + }, + { + "epoch": 0.26309645410769006, + "grad_norm": 1.0293377078051025, + "learning_rate": 4.464149385297137e-05, + "loss": 0.5986, + "step": 9015 + }, + { + "epoch": 0.26324237560192615, + "grad_norm": 0.8846142765905193, + "learning_rate": 4.463445866633818e-05, + "loss": 0.6949, + "step": 9020 + }, + { + "epoch": 0.26338829709616224, + "grad_norm": 1.049061138368147, + "learning_rate": 4.462741948962627e-05, + "loss": 0.6353, + "step": 9025 + }, + { + "epoch": 0.2635342185903984, + "grad_norm": 0.9257236183525315, + "learning_rate": 4.462037632447484e-05, + "loss": 0.6453, + "step": 9030 + }, + { + "epoch": 0.2636801400846345, + "grad_norm": 0.9340035789822353, + "learning_rate": 4.461332917252403e-05, + "loss": 0.6532, + "step": 9035 + }, + { + "epoch": 0.26382606157887056, + "grad_norm": 0.9498286322382832, + "learning_rate": 4.46062780354149e-05, + "loss": 0.6407, + "step": 9040 + }, + { + "epoch": 0.26397198307310665, + "grad_norm": 1.1612496284411775, + "learning_rate": 4.459922291478944e-05, + "loss": 0.707, + "step": 9045 + }, + { + "epoch": 0.26411790456734274, + "grad_norm": 1.0934823183156754, + "learning_rate": 4.4592163812290565e-05, + "loss": 0.7081, + "step": 9050 + }, + { + "epoch": 0.2642638260615789, + "grad_norm": 1.1152636045238529, + "learning_rate": 4.4585100729562116e-05, + "loss": 0.6228, + "step": 9055 + }, + { + "epoch": 0.264409747555815, + "grad_norm": 0.9944470213777442, + "learning_rate": 4.457803366824888e-05, + "loss": 0.5851, + "step": 9060 + }, + { + "epoch": 0.26455566905005107, + "grad_norm": 0.9611301361871022, + "learning_rate": 4.457096262999653e-05, + "loss": 0.6424, + "step": 9065 + }, + { + "epoch": 0.26470159054428716, + "grad_norm": 0.9282054393101163, + "learning_rate": 4.456388761645171e-05, + "loss": 0.6507, + "step": 9070 + }, + { + "epoch": 0.26484751203852325, + "grad_norm": 0.9407764753115578, + "learning_rate": 4.4556808629261944e-05, + "loss": 0.6002, + "step": 9075 + }, + { + "epoch": 0.2649934335327594, + "grad_norm": 1.1649189255342702, + "learning_rate": 4.454972567007573e-05, + "loss": 0.6486, + "step": 9080 + }, + { + "epoch": 0.2651393550269955, + "grad_norm": 1.1724980079511826, + "learning_rate": 4.4542638740542454e-05, + "loss": 0.6327, + "step": 9085 + }, + { + "epoch": 0.26528527652123157, + "grad_norm": 1.0462629703387767, + "learning_rate": 4.4535547842312456e-05, + "loss": 0.6257, + "step": 9090 + }, + { + "epoch": 0.26543119801546766, + "grad_norm": 1.0117152089921373, + "learning_rate": 4.452845297703697e-05, + "loss": 0.6589, + "step": 9095 + }, + { + "epoch": 0.26557711950970375, + "grad_norm": 1.1398775730241093, + "learning_rate": 4.4521354146368165e-05, + "loss": 0.6717, + "step": 9100 + }, + { + "epoch": 0.2657230410039399, + "grad_norm": 1.0412595582253366, + "learning_rate": 4.4514251351959145e-05, + "loss": 0.6614, + "step": 9105 + }, + { + "epoch": 0.265868962498176, + "grad_norm": 1.0767540727890639, + "learning_rate": 4.4507144595463925e-05, + "loss": 0.6545, + "step": 9110 + }, + { + "epoch": 0.2660148839924121, + "grad_norm": 1.0196518532788041, + "learning_rate": 4.4500033878537444e-05, + "loss": 0.6301, + "step": 9115 + }, + { + "epoch": 0.26616080548664817, + "grad_norm": 1.0851434005566267, + "learning_rate": 4.449291920283558e-05, + "loss": 0.6374, + "step": 9120 + }, + { + "epoch": 0.2663067269808843, + "grad_norm": 1.0650008141540834, + "learning_rate": 4.44858005700151e-05, + "loss": 0.7091, + "step": 9125 + }, + { + "epoch": 0.2664526484751204, + "grad_norm": 0.9157099178884863, + "learning_rate": 4.447867798173371e-05, + "loss": 0.6179, + "step": 9130 + }, + { + "epoch": 0.2665985699693565, + "grad_norm": 1.2341099504920638, + "learning_rate": 4.447155143965005e-05, + "loss": 0.7344, + "step": 9135 + }, + { + "epoch": 0.2667444914635926, + "grad_norm": 1.0026383364226366, + "learning_rate": 4.446442094542366e-05, + "loss": 0.61, + "step": 9140 + }, + { + "epoch": 0.26689041295782867, + "grad_norm": 1.0532006414265724, + "learning_rate": 4.445728650071501e-05, + "loss": 0.6424, + "step": 9145 + }, + { + "epoch": 0.2670363344520648, + "grad_norm": 0.9517323938666045, + "learning_rate": 4.44501481071855e-05, + "loss": 0.5986, + "step": 9150 + }, + { + "epoch": 0.2671822559463009, + "grad_norm": 0.9046201282895397, + "learning_rate": 4.444300576649742e-05, + "loss": 0.6449, + "step": 9155 + }, + { + "epoch": 0.267328177440537, + "grad_norm": 0.8882552940513714, + "learning_rate": 4.4435859480314004e-05, + "loss": 0.6053, + "step": 9160 + }, + { + "epoch": 0.2674740989347731, + "grad_norm": 1.0321453914897791, + "learning_rate": 4.442870925029941e-05, + "loss": 0.7211, + "step": 9165 + }, + { + "epoch": 0.2676200204290092, + "grad_norm": 0.9371634087659004, + "learning_rate": 4.442155507811868e-05, + "loss": 0.5801, + "step": 9170 + }, + { + "epoch": 0.2677659419232453, + "grad_norm": 0.9158064321777787, + "learning_rate": 4.441439696543782e-05, + "loss": 0.6015, + "step": 9175 + }, + { + "epoch": 0.2679118634174814, + "grad_norm": 1.026516340182458, + "learning_rate": 4.440723491392372e-05, + "loss": 0.6103, + "step": 9180 + }, + { + "epoch": 0.2680577849117175, + "grad_norm": 0.9583772134122046, + "learning_rate": 4.440006892524419e-05, + "loss": 0.7085, + "step": 9185 + }, + { + "epoch": 0.2682037064059536, + "grad_norm": 0.9447783990267581, + "learning_rate": 4.439289900106797e-05, + "loss": 0.6681, + "step": 9190 + }, + { + "epoch": 0.2683496279001897, + "grad_norm": 1.0381487807905978, + "learning_rate": 4.4385725143064725e-05, + "loss": 0.5999, + "step": 9195 + }, + { + "epoch": 0.2684955493944258, + "grad_norm": 0.9480348043040826, + "learning_rate": 4.437854735290499e-05, + "loss": 0.6019, + "step": 9200 + }, + { + "epoch": 0.2686414708886619, + "grad_norm": 1.140769265493787, + "learning_rate": 4.437136563226028e-05, + "loss": 0.6559, + "step": 9205 + }, + { + "epoch": 0.268787392382898, + "grad_norm": 1.115671240264836, + "learning_rate": 4.436417998280297e-05, + "loss": 0.6619, + "step": 9210 + }, + { + "epoch": 0.2689333138771341, + "grad_norm": 1.075428668338084, + "learning_rate": 4.4356990406206395e-05, + "loss": 0.6501, + "step": 9215 + }, + { + "epoch": 0.2690792353713702, + "grad_norm": 1.3603889989071605, + "learning_rate": 4.4349796904144756e-05, + "loss": 0.681, + "step": 9220 + }, + { + "epoch": 0.26922515686560633, + "grad_norm": 1.2073389673461323, + "learning_rate": 4.434259947829321e-05, + "loss": 0.6722, + "step": 9225 + }, + { + "epoch": 0.2693710783598424, + "grad_norm": 1.1162252568589999, + "learning_rate": 4.433539813032781e-05, + "loss": 0.6038, + "step": 9230 + }, + { + "epoch": 0.2695169998540785, + "grad_norm": 1.039021375198927, + "learning_rate": 4.432819286192553e-05, + "loss": 0.6192, + "step": 9235 + }, + { + "epoch": 0.2696629213483146, + "grad_norm": 1.2151238941194762, + "learning_rate": 4.4320983674764245e-05, + "loss": 0.6778, + "step": 9240 + }, + { + "epoch": 0.2698088428425507, + "grad_norm": 1.1930561614733945, + "learning_rate": 4.431377057052274e-05, + "loss": 0.5987, + "step": 9245 + }, + { + "epoch": 0.26995476433678683, + "grad_norm": 0.9057296505516568, + "learning_rate": 4.4306553550880744e-05, + "loss": 0.596, + "step": 9250 + }, + { + "epoch": 0.2701006858310229, + "grad_norm": 1.0758695906453382, + "learning_rate": 4.4299332617518845e-05, + "loss": 0.6365, + "step": 9255 + }, + { + "epoch": 0.270246607325259, + "grad_norm": 1.0142675707376867, + "learning_rate": 4.42921077721186e-05, + "loss": 0.6528, + "step": 9260 + }, + { + "epoch": 0.2703925288194951, + "grad_norm": 1.1300311779497318, + "learning_rate": 4.428487901636243e-05, + "loss": 0.6846, + "step": 9265 + }, + { + "epoch": 0.2705384503137312, + "grad_norm": 1.1637159505865442, + "learning_rate": 4.4277646351933696e-05, + "loss": 0.5712, + "step": 9270 + }, + { + "epoch": 0.27068437180796734, + "grad_norm": 1.3591688453547368, + "learning_rate": 4.4270409780516656e-05, + "loss": 0.5792, + "step": 9275 + }, + { + "epoch": 0.27083029330220343, + "grad_norm": 0.9664209164435206, + "learning_rate": 4.426316930379648e-05, + "loss": 0.6053, + "step": 9280 + }, + { + "epoch": 0.2709762147964395, + "grad_norm": 1.0373865749406095, + "learning_rate": 4.425592492345925e-05, + "loss": 0.6667, + "step": 9285 + }, + { + "epoch": 0.2711221362906756, + "grad_norm": 0.9648188488889569, + "learning_rate": 4.4248676641191943e-05, + "loss": 0.6204, + "step": 9290 + }, + { + "epoch": 0.2712680577849117, + "grad_norm": 1.0927805250882077, + "learning_rate": 4.424142445868248e-05, + "loss": 0.6631, + "step": 9295 + }, + { + "epoch": 0.27141397927914784, + "grad_norm": 0.969006039490248, + "learning_rate": 4.4234168377619665e-05, + "loss": 0.6294, + "step": 9300 + }, + { + "epoch": 0.27155990077338393, + "grad_norm": 0.8923776068719823, + "learning_rate": 4.422690839969318e-05, + "loss": 0.6003, + "step": 9305 + }, + { + "epoch": 0.27170582226762, + "grad_norm": 0.9805808319216262, + "learning_rate": 4.421964452659368e-05, + "loss": 0.6365, + "step": 9310 + }, + { + "epoch": 0.2718517437618561, + "grad_norm": 1.116916243497408, + "learning_rate": 4.4212376760012683e-05, + "loss": 0.6713, + "step": 9315 + }, + { + "epoch": 0.2719976652560922, + "grad_norm": 1.1596441871577454, + "learning_rate": 4.420510510164263e-05, + "loss": 0.6769, + "step": 9320 + }, + { + "epoch": 0.27214358675032835, + "grad_norm": 0.9336167716258548, + "learning_rate": 4.419782955317685e-05, + "loss": 0.6157, + "step": 9325 + }, + { + "epoch": 0.27228950824456444, + "grad_norm": 1.0730903395786142, + "learning_rate": 4.4190550116309584e-05, + "loss": 0.6552, + "step": 9330 + }, + { + "epoch": 0.2724354297388005, + "grad_norm": 0.9622106854368614, + "learning_rate": 4.418326679273601e-05, + "loss": 0.6408, + "step": 9335 + }, + { + "epoch": 0.2725813512330366, + "grad_norm": 0.9128000562317355, + "learning_rate": 4.417597958415218e-05, + "loss": 0.6605, + "step": 9340 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 20.84925563166816, + "learning_rate": 4.416868849225504e-05, + "loss": 0.6775, + "step": 9345 + }, + { + "epoch": 0.27287319422150885, + "grad_norm": 0.8810440240742944, + "learning_rate": 4.416139351874246e-05, + "loss": 0.6168, + "step": 9350 + }, + { + "epoch": 0.27301911571574494, + "grad_norm": 0.9387956436694727, + "learning_rate": 4.415409466531323e-05, + "loss": 0.6026, + "step": 9355 + }, + { + "epoch": 0.27316503720998103, + "grad_norm": 1.0288075506259988, + "learning_rate": 4.4146791933667e-05, + "loss": 0.7043, + "step": 9360 + }, + { + "epoch": 0.2733109587042171, + "grad_norm": 1.0673780004829119, + "learning_rate": 4.413948532550437e-05, + "loss": 0.6474, + "step": 9365 + }, + { + "epoch": 0.2734568801984532, + "grad_norm": 1.0532994663723354, + "learning_rate": 4.4132174842526806e-05, + "loss": 0.6418, + "step": 9370 + }, + { + "epoch": 0.27360280169268936, + "grad_norm": 1.1156220457324426, + "learning_rate": 4.4124860486436695e-05, + "loss": 0.6407, + "step": 9375 + }, + { + "epoch": 0.27374872318692545, + "grad_norm": 1.0251531039399187, + "learning_rate": 4.411754225893731e-05, + "loss": 0.6034, + "step": 9380 + }, + { + "epoch": 0.27389464468116154, + "grad_norm": 0.9608927108302535, + "learning_rate": 4.4110220161732855e-05, + "loss": 0.5993, + "step": 9385 + }, + { + "epoch": 0.2740405661753976, + "grad_norm": 1.1877678378869183, + "learning_rate": 4.410289419652841e-05, + "loss": 0.6675, + "step": 9390 + }, + { + "epoch": 0.2741864876696337, + "grad_norm": 1.2854077719513781, + "learning_rate": 4.409556436502997e-05, + "loss": 0.7148, + "step": 9395 + }, + { + "epoch": 0.27433240916386986, + "grad_norm": 1.25513412571786, + "learning_rate": 4.408823066894441e-05, + "loss": 0.6142, + "step": 9400 + }, + { + "epoch": 0.27447833065810595, + "grad_norm": 1.0300988635514428, + "learning_rate": 4.408089310997952e-05, + "loss": 0.6067, + "step": 9405 + }, + { + "epoch": 0.27462425215234204, + "grad_norm": 0.9999223548561794, + "learning_rate": 4.407355168984399e-05, + "loss": 0.6807, + "step": 9410 + }, + { + "epoch": 0.27477017364657813, + "grad_norm": 1.1459692121118263, + "learning_rate": 4.4066206410247416e-05, + "loss": 0.6552, + "step": 9415 + }, + { + "epoch": 0.2749160951408142, + "grad_norm": 0.991276255895814, + "learning_rate": 4.405885727290026e-05, + "loss": 0.6314, + "step": 9420 + }, + { + "epoch": 0.27506201663505037, + "grad_norm": 0.9834130128941496, + "learning_rate": 4.4051504279513935e-05, + "loss": 0.6115, + "step": 9425 + }, + { + "epoch": 0.27520793812928646, + "grad_norm": 1.0521378607421596, + "learning_rate": 4.4044147431800696e-05, + "loss": 0.6918, + "step": 9430 + }, + { + "epoch": 0.27535385962352255, + "grad_norm": 1.247796341541082, + "learning_rate": 4.4036786731473747e-05, + "loss": 0.6333, + "step": 9435 + }, + { + "epoch": 0.27549978111775864, + "grad_norm": 1.1345550226866046, + "learning_rate": 4.402942218024716e-05, + "loss": 0.6332, + "step": 9440 + }, + { + "epoch": 0.2756457026119947, + "grad_norm": 1.0609247401101654, + "learning_rate": 4.4022053779835883e-05, + "loss": 0.5988, + "step": 9445 + }, + { + "epoch": 0.27579162410623087, + "grad_norm": 1.0309338328314486, + "learning_rate": 4.401468153195581e-05, + "loss": 0.6929, + "step": 9450 + }, + { + "epoch": 0.27593754560046696, + "grad_norm": 1.34035568672471, + "learning_rate": 4.4007305438323716e-05, + "loss": 0.7262, + "step": 9455 + }, + { + "epoch": 0.27608346709470305, + "grad_norm": 0.9962609990804928, + "learning_rate": 4.399992550065723e-05, + "loss": 0.6697, + "step": 9460 + }, + { + "epoch": 0.27622938858893914, + "grad_norm": 1.0155749732647312, + "learning_rate": 4.399254172067493e-05, + "loss": 0.6543, + "step": 9465 + }, + { + "epoch": 0.27637531008317523, + "grad_norm": 0.8872903575437754, + "learning_rate": 4.398515410009626e-05, + "loss": 0.6388, + "step": 9470 + }, + { + "epoch": 0.2765212315774114, + "grad_norm": 0.9356294193846711, + "learning_rate": 4.397776264064157e-05, + "loss": 0.6202, + "step": 9475 + }, + { + "epoch": 0.27666715307164746, + "grad_norm": 0.966011345876543, + "learning_rate": 4.397036734403209e-05, + "loss": 0.5627, + "step": 9480 + }, + { + "epoch": 0.27681307456588355, + "grad_norm": 0.794485955161537, + "learning_rate": 4.396296821198997e-05, + "loss": 0.5567, + "step": 9485 + }, + { + "epoch": 0.27695899606011964, + "grad_norm": 1.2233095189996241, + "learning_rate": 4.3955565246238225e-05, + "loss": 0.5855, + "step": 9490 + }, + { + "epoch": 0.27710491755435573, + "grad_norm": 0.9510694777237836, + "learning_rate": 4.394815844850077e-05, + "loss": 0.6041, + "step": 9495 + }, + { + "epoch": 0.2772508390485919, + "grad_norm": 1.0953573357454538, + "learning_rate": 4.394074782050242e-05, + "loss": 0.6975, + "step": 9500 + }, + { + "epoch": 0.27739676054282797, + "grad_norm": 0.9674381844392873, + "learning_rate": 4.3933333363968884e-05, + "loss": 0.6114, + "step": 9505 + }, + { + "epoch": 0.27754268203706406, + "grad_norm": 2.1746813973067844, + "learning_rate": 4.3925915080626756e-05, + "loss": 0.6002, + "step": 9510 + }, + { + "epoch": 0.27768860353130015, + "grad_norm": 0.8947633376924069, + "learning_rate": 4.3918492972203507e-05, + "loss": 0.568, + "step": 9515 + }, + { + "epoch": 0.27783452502553624, + "grad_norm": 0.9385015533082346, + "learning_rate": 4.391106704042753e-05, + "loss": 0.5574, + "step": 9520 + }, + { + "epoch": 0.2779804465197724, + "grad_norm": 0.940384338699598, + "learning_rate": 4.390363728702809e-05, + "loss": 0.5786, + "step": 9525 + }, + { + "epoch": 0.2781263680140085, + "grad_norm": 1.105896961869733, + "learning_rate": 4.3896203713735335e-05, + "loss": 0.6334, + "step": 9530 + }, + { + "epoch": 0.27827228950824456, + "grad_norm": 1.1600073440010827, + "learning_rate": 4.3888766322280315e-05, + "loss": 0.6404, + "step": 9535 + }, + { + "epoch": 0.27841821100248065, + "grad_norm": 0.9794154358006812, + "learning_rate": 4.388132511439497e-05, + "loss": 0.6914, + "step": 9540 + }, + { + "epoch": 0.27856413249671674, + "grad_norm": 1.4066206240283308, + "learning_rate": 4.387388009181212e-05, + "loss": 0.6718, + "step": 9545 + }, + { + "epoch": 0.2787100539909529, + "grad_norm": 0.9899398666962982, + "learning_rate": 4.386643125626548e-05, + "loss": 0.6164, + "step": 9550 + }, + { + "epoch": 0.278855975485189, + "grad_norm": 1.0108367918445247, + "learning_rate": 4.3858978609489646e-05, + "loss": 0.7144, + "step": 9555 + }, + { + "epoch": 0.27900189697942507, + "grad_norm": 1.1668019237590037, + "learning_rate": 4.3851522153220114e-05, + "loss": 0.6831, + "step": 9560 + }, + { + "epoch": 0.27914781847366116, + "grad_norm": 1.1421775344752243, + "learning_rate": 4.384406188919325e-05, + "loss": 0.6287, + "step": 9565 + }, + { + "epoch": 0.27929373996789725, + "grad_norm": 0.8948808720896755, + "learning_rate": 4.3836597819146324e-05, + "loss": 0.6615, + "step": 9570 + }, + { + "epoch": 0.2794396614621334, + "grad_norm": 1.1626018511343774, + "learning_rate": 4.3829129944817476e-05, + "loss": 0.6796, + "step": 9575 + }, + { + "epoch": 0.2795855829563695, + "grad_norm": 1.0008460850257275, + "learning_rate": 4.3821658267945747e-05, + "loss": 0.6513, + "step": 9580 + }, + { + "epoch": 0.2797315044506056, + "grad_norm": 1.1390733826075226, + "learning_rate": 4.381418279027105e-05, + "loss": 0.6401, + "step": 9585 + }, + { + "epoch": 0.27987742594484166, + "grad_norm": 1.1303844071107665, + "learning_rate": 4.38067035135342e-05, + "loss": 0.6096, + "step": 9590 + }, + { + "epoch": 0.28002334743907775, + "grad_norm": 1.0165469434512835, + "learning_rate": 4.379922043947688e-05, + "loss": 0.5765, + "step": 9595 + }, + { + "epoch": 0.2801692689333139, + "grad_norm": 1.0844107475307103, + "learning_rate": 4.379173356984165e-05, + "loss": 0.5534, + "step": 9600 + }, + { + "epoch": 0.28031519042755, + "grad_norm": 0.911743408262357, + "learning_rate": 4.378424290637199e-05, + "loss": 0.6006, + "step": 9605 + }, + { + "epoch": 0.2804611119217861, + "grad_norm": 1.016591910148524, + "learning_rate": 4.377674845081224e-05, + "loss": 0.6795, + "step": 9610 + }, + { + "epoch": 0.28060703341602217, + "grad_norm": 0.9716943752328762, + "learning_rate": 4.37692502049076e-05, + "loss": 0.6223, + "step": 9615 + }, + { + "epoch": 0.28075295491025826, + "grad_norm": 1.0479342340199176, + "learning_rate": 4.37617481704042e-05, + "loss": 0.6601, + "step": 9620 + }, + { + "epoch": 0.2808988764044944, + "grad_norm": 0.9210887513347649, + "learning_rate": 4.375424234904902e-05, + "loss": 0.5854, + "step": 9625 + }, + { + "epoch": 0.2810447978987305, + "grad_norm": 0.9585073815288241, + "learning_rate": 4.374673274258993e-05, + "loss": 0.6723, + "step": 9630 + }, + { + "epoch": 0.2811907193929666, + "grad_norm": 1.1027813582192172, + "learning_rate": 4.3739219352775685e-05, + "loss": 0.6795, + "step": 9635 + }, + { + "epoch": 0.28133664088720267, + "grad_norm": 0.8025597456106637, + "learning_rate": 4.373170218135592e-05, + "loss": 0.619, + "step": 9640 + }, + { + "epoch": 0.28148256238143876, + "grad_norm": 0.9707168305896594, + "learning_rate": 4.3724181230081144e-05, + "loss": 0.6139, + "step": 9645 + }, + { + "epoch": 0.2816284838756749, + "grad_norm": 1.4606884832562046, + "learning_rate": 4.371665650070275e-05, + "loss": 0.7026, + "step": 9650 + }, + { + "epoch": 0.281774405369911, + "grad_norm": 0.9409385044568542, + "learning_rate": 4.3709127994973017e-05, + "loss": 0.6136, + "step": 9655 + }, + { + "epoch": 0.2819203268641471, + "grad_norm": 1.1872235379032545, + "learning_rate": 4.370159571464509e-05, + "loss": 0.6534, + "step": 9660 + }, + { + "epoch": 0.2820662483583832, + "grad_norm": 1.0462182943011342, + "learning_rate": 4.3694059661473006e-05, + "loss": 0.6413, + "step": 9665 + }, + { + "epoch": 0.28221216985261927, + "grad_norm": 1.1015599076451104, + "learning_rate": 4.368651983721169e-05, + "loss": 0.6475, + "step": 9670 + }, + { + "epoch": 0.2823580913468554, + "grad_norm": 1.0058822146180264, + "learning_rate": 4.36789762436169e-05, + "loss": 0.6229, + "step": 9675 + }, + { + "epoch": 0.2825040128410915, + "grad_norm": 1.1300018511742729, + "learning_rate": 4.3671428882445335e-05, + "loss": 0.6585, + "step": 9680 + }, + { + "epoch": 0.2826499343353276, + "grad_norm": 1.247901797910968, + "learning_rate": 4.3663877755454514e-05, + "loss": 0.651, + "step": 9685 + }, + { + "epoch": 0.2827958558295637, + "grad_norm": 0.9637139935804208, + "learning_rate": 4.365632286440287e-05, + "loss": 0.5837, + "step": 9690 + }, + { + "epoch": 0.28294177732379977, + "grad_norm": 1.204244325118993, + "learning_rate": 4.36487642110497e-05, + "loss": 0.67, + "step": 9695 + }, + { + "epoch": 0.2830876988180359, + "grad_norm": 1.0817756537220737, + "learning_rate": 4.3641201797155176e-05, + "loss": 0.6488, + "step": 9700 + }, + { + "epoch": 0.283233620312272, + "grad_norm": 0.9795316655113857, + "learning_rate": 4.3633635624480344e-05, + "loss": 0.5998, + "step": 9705 + }, + { + "epoch": 0.2833795418065081, + "grad_norm": 1.0444383856671975, + "learning_rate": 4.362606569478715e-05, + "loss": 0.5875, + "step": 9710 + }, + { + "epoch": 0.2835254633007442, + "grad_norm": 1.0873743232596855, + "learning_rate": 4.361849200983835e-05, + "loss": 0.6857, + "step": 9715 + }, + { + "epoch": 0.2836713847949803, + "grad_norm": 0.950539944845474, + "learning_rate": 4.361091457139765e-05, + "loss": 0.6709, + "step": 9720 + }, + { + "epoch": 0.2838173062892164, + "grad_norm": 1.0180070499760425, + "learning_rate": 4.3603333381229594e-05, + "loss": 0.6299, + "step": 9725 + }, + { + "epoch": 0.2839632277834525, + "grad_norm": 1.0707600314051786, + "learning_rate": 4.35957484410996e-05, + "loss": 0.5681, + "step": 9730 + }, + { + "epoch": 0.2841091492776886, + "grad_norm": 0.9714989321799462, + "learning_rate": 4.3588159752773974e-05, + "loss": 0.6275, + "step": 9735 + }, + { + "epoch": 0.2842550707719247, + "grad_norm": 1.0654212860029892, + "learning_rate": 4.358056731801986e-05, + "loss": 0.661, + "step": 9740 + }, + { + "epoch": 0.2844009922661608, + "grad_norm": 1.002122570213685, + "learning_rate": 4.357297113860532e-05, + "loss": 0.5713, + "step": 9745 + }, + { + "epoch": 0.2845469137603969, + "grad_norm": 0.9270802455919358, + "learning_rate": 4.3565371216299264e-05, + "loss": 0.6234, + "step": 9750 + }, + { + "epoch": 0.284692835254633, + "grad_norm": 1.093674120690185, + "learning_rate": 4.355776755287146e-05, + "loss": 0.589, + "step": 9755 + }, + { + "epoch": 0.2848387567488691, + "grad_norm": 1.044829503458416, + "learning_rate": 4.355016015009257e-05, + "loss": 0.7154, + "step": 9760 + }, + { + "epoch": 0.2849846782431052, + "grad_norm": 1.1066046950139847, + "learning_rate": 4.3542549009734136e-05, + "loss": 0.7046, + "step": 9765 + }, + { + "epoch": 0.2851305997373413, + "grad_norm": 1.062783413036839, + "learning_rate": 4.353493413356853e-05, + "loss": 0.6056, + "step": 9770 + }, + { + "epoch": 0.28527652123157743, + "grad_norm": 1.3500126679976774, + "learning_rate": 4.352731552336905e-05, + "loss": 0.6969, + "step": 9775 + }, + { + "epoch": 0.2854224427258135, + "grad_norm": 1.040460287107519, + "learning_rate": 4.351969318090979e-05, + "loss": 0.6014, + "step": 9780 + }, + { + "epoch": 0.2855683642200496, + "grad_norm": 0.9308769001662154, + "learning_rate": 4.351206710796578e-05, + "loss": 0.5696, + "step": 9785 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.0272684037108237, + "learning_rate": 4.3504437306312895e-05, + "loss": 0.5701, + "step": 9790 + }, + { + "epoch": 0.2858602072085218, + "grad_norm": 1.1346822901872113, + "learning_rate": 4.349680377772786e-05, + "loss": 0.6213, + "step": 9795 + }, + { + "epoch": 0.28600612870275793, + "grad_norm": 1.035502910710529, + "learning_rate": 4.3489166523988314e-05, + "loss": 0.6203, + "step": 9800 + }, + { + "epoch": 0.286152050196994, + "grad_norm": 1.1199922438013243, + "learning_rate": 4.34815255468727e-05, + "loss": 0.6182, + "step": 9805 + }, + { + "epoch": 0.2862979716912301, + "grad_norm": 0.8558041401700277, + "learning_rate": 4.347388084816039e-05, + "loss": 0.6022, + "step": 9810 + }, + { + "epoch": 0.2864438931854662, + "grad_norm": 0.925740740465212, + "learning_rate": 4.346623242963158e-05, + "loss": 0.631, + "step": 9815 + }, + { + "epoch": 0.2865898146797023, + "grad_norm": 1.1047848031013747, + "learning_rate": 4.3458580293067345e-05, + "loss": 0.6051, + "step": 9820 + }, + { + "epoch": 0.28673573617393844, + "grad_norm": 1.1444653627692365, + "learning_rate": 4.3450924440249637e-05, + "loss": 0.6535, + "step": 9825 + }, + { + "epoch": 0.28688165766817453, + "grad_norm": 1.0742557687030576, + "learning_rate": 4.3443264872961255e-05, + "loss": 0.6239, + "step": 9830 + }, + { + "epoch": 0.2870275791624106, + "grad_norm": 0.9713479646957056, + "learning_rate": 4.343560159298588e-05, + "loss": 0.6541, + "step": 9835 + }, + { + "epoch": 0.2871735006566467, + "grad_norm": 1.0861694514831983, + "learning_rate": 4.342793460210806e-05, + "loss": 0.6214, + "step": 9840 + }, + { + "epoch": 0.28731942215088285, + "grad_norm": 1.0432355044644355, + "learning_rate": 4.3420263902113176e-05, + "loss": 0.6898, + "step": 9845 + }, + { + "epoch": 0.28746534364511894, + "grad_norm": 1.166679167665881, + "learning_rate": 4.34125894947875e-05, + "loss": 0.6332, + "step": 9850 + }, + { + "epoch": 0.28761126513935503, + "grad_norm": 1.68116103506683, + "learning_rate": 4.340491138191817e-05, + "loss": 0.7144, + "step": 9855 + }, + { + "epoch": 0.2877571866335911, + "grad_norm": 0.980564752548983, + "learning_rate": 4.3397229565293165e-05, + "loss": 0.6922, + "step": 9860 + }, + { + "epoch": 0.2879031081278272, + "grad_norm": 0.930910514750492, + "learning_rate": 4.3389544046701354e-05, + "loss": 0.6594, + "step": 9865 + }, + { + "epoch": 0.28804902962206336, + "grad_norm": 0.9475673383234228, + "learning_rate": 4.3381854827932435e-05, + "loss": 0.5478, + "step": 9870 + }, + { + "epoch": 0.28819495111629945, + "grad_norm": 1.0363261615363515, + "learning_rate": 4.3374161910777003e-05, + "loss": 0.5618, + "step": 9875 + }, + { + "epoch": 0.28834087261053554, + "grad_norm": 1.0473784030084592, + "learning_rate": 4.336646529702649e-05, + "loss": 0.6984, + "step": 9880 + }, + { + "epoch": 0.2884867941047716, + "grad_norm": 1.240370729375362, + "learning_rate": 4.3358764988473196e-05, + "loss": 0.6158, + "step": 9885 + }, + { + "epoch": 0.2886327155990077, + "grad_norm": 1.1672795191268774, + "learning_rate": 4.335106098691029e-05, + "loss": 0.7262, + "step": 9890 + }, + { + "epoch": 0.28877863709324386, + "grad_norm": 1.029083997695849, + "learning_rate": 4.3343353294131785e-05, + "loss": 0.6104, + "step": 9895 + }, + { + "epoch": 0.28892455858747995, + "grad_norm": 1.0161911817919154, + "learning_rate": 4.333564191193256e-05, + "loss": 0.6007, + "step": 9900 + }, + { + "epoch": 0.28907048008171604, + "grad_norm": 0.9981588214667476, + "learning_rate": 4.332792684210835e-05, + "loss": 0.6406, + "step": 9905 + }, + { + "epoch": 0.28921640157595213, + "grad_norm": 0.9717252770177476, + "learning_rate": 4.332020808645577e-05, + "loss": 0.608, + "step": 9910 + }, + { + "epoch": 0.2893623230701882, + "grad_norm": 1.1660459974683668, + "learning_rate": 4.331248564677226e-05, + "loss": 0.6685, + "step": 9915 + }, + { + "epoch": 0.28950824456442437, + "grad_norm": 0.8983558702272, + "learning_rate": 4.330475952485614e-05, + "loss": 0.6366, + "step": 9920 + }, + { + "epoch": 0.28965416605866046, + "grad_norm": 0.9507257548877919, + "learning_rate": 4.329702972250658e-05, + "loss": 0.6451, + "step": 9925 + }, + { + "epoch": 0.28980008755289655, + "grad_norm": 1.083196117585931, + "learning_rate": 4.328929624152362e-05, + "loss": 0.5941, + "step": 9930 + }, + { + "epoch": 0.28994600904713264, + "grad_norm": 0.9248926369171733, + "learning_rate": 4.328155908370813e-05, + "loss": 0.6035, + "step": 9935 + }, + { + "epoch": 0.2900919305413687, + "grad_norm": 1.0511133690881107, + "learning_rate": 4.327381825086186e-05, + "loss": 0.6358, + "step": 9940 + }, + { + "epoch": 0.29023785203560487, + "grad_norm": 0.927003854340498, + "learning_rate": 4.3266073744787406e-05, + "loss": 0.6097, + "step": 9945 + }, + { + "epoch": 0.29038377352984096, + "grad_norm": 0.9717675203678348, + "learning_rate": 4.3258325567288215e-05, + "loss": 0.5977, + "step": 9950 + }, + { + "epoch": 0.29052969502407705, + "grad_norm": 0.9250312722624365, + "learning_rate": 4.325057372016861e-05, + "loss": 0.6068, + "step": 9955 + }, + { + "epoch": 0.29067561651831314, + "grad_norm": 1.1098752507120653, + "learning_rate": 4.324281820523373e-05, + "loss": 0.6617, + "step": 9960 + }, + { + "epoch": 0.29082153801254923, + "grad_norm": 1.0808518096613506, + "learning_rate": 4.3235059024289615e-05, + "loss": 0.6192, + "step": 9965 + }, + { + "epoch": 0.2909674595067854, + "grad_norm": 1.0580047157919588, + "learning_rate": 4.3227296179143126e-05, + "loss": 0.6085, + "step": 9970 + }, + { + "epoch": 0.29111338100102147, + "grad_norm": 0.9393415328644182, + "learning_rate": 4.321952967160198e-05, + "loss": 0.6067, + "step": 9975 + }, + { + "epoch": 0.29125930249525755, + "grad_norm": 1.0528534113984533, + "learning_rate": 4.321175950347477e-05, + "loss": 0.6098, + "step": 9980 + }, + { + "epoch": 0.29140522398949364, + "grad_norm": 1.1579308027948558, + "learning_rate": 4.32039856765709e-05, + "loss": 0.6611, + "step": 9985 + }, + { + "epoch": 0.29155114548372973, + "grad_norm": 0.957011185388438, + "learning_rate": 4.319620819270067e-05, + "loss": 0.6418, + "step": 9990 + }, + { + "epoch": 0.2916970669779659, + "grad_norm": 1.0536886335974478, + "learning_rate": 4.318842705367521e-05, + "loss": 0.5636, + "step": 9995 + }, + { + "epoch": 0.29184298847220197, + "grad_norm": 1.0145603355228385, + "learning_rate": 4.3180642261306506e-05, + "loss": 0.6145, + "step": 10000 + }, + { + "epoch": 0.29198890996643806, + "grad_norm": 0.9321637361889581, + "learning_rate": 4.317285381740738e-05, + "loss": 0.6233, + "step": 10005 + }, + { + "epoch": 0.29213483146067415, + "grad_norm": 0.9838231596547119, + "learning_rate": 4.316506172379153e-05, + "loss": 0.6283, + "step": 10010 + }, + { + "epoch": 0.29228075295491024, + "grad_norm": 1.0149236769482028, + "learning_rate": 4.315726598227347e-05, + "loss": 0.6284, + "step": 10015 + }, + { + "epoch": 0.2924266744491464, + "grad_norm": 0.9228175888109796, + "learning_rate": 4.3149466594668606e-05, + "loss": 0.5833, + "step": 10020 + }, + { + "epoch": 0.2925725959433825, + "grad_norm": 1.2540863532382087, + "learning_rate": 4.3141663562793165e-05, + "loss": 0.6084, + "step": 10025 + }, + { + "epoch": 0.29271851743761856, + "grad_norm": 1.1280488310772208, + "learning_rate": 4.313385688846423e-05, + "loss": 0.6259, + "step": 10030 + }, + { + "epoch": 0.29286443893185465, + "grad_norm": 0.9539739015453806, + "learning_rate": 4.3126046573499716e-05, + "loss": 0.6301, + "step": 10035 + }, + { + "epoch": 0.29301036042609074, + "grad_norm": 1.2548090990064173, + "learning_rate": 4.311823261971843e-05, + "loss": 0.6483, + "step": 10040 + }, + { + "epoch": 0.2931562819203269, + "grad_norm": 1.1174417456436043, + "learning_rate": 4.3110415028939966e-05, + "loss": 0.6417, + "step": 10045 + }, + { + "epoch": 0.293302203414563, + "grad_norm": 0.8869065438456617, + "learning_rate": 4.310259380298482e-05, + "loss": 0.662, + "step": 10050 + }, + { + "epoch": 0.29344812490879907, + "grad_norm": 1.1824128584437887, + "learning_rate": 4.309476894367428e-05, + "loss": 0.6157, + "step": 10055 + }, + { + "epoch": 0.29359404640303516, + "grad_norm": 1.1472900066551646, + "learning_rate": 4.3086940452830554e-05, + "loss": 0.6242, + "step": 10060 + }, + { + "epoch": 0.29373996789727125, + "grad_norm": 1.0760054068103626, + "learning_rate": 4.3079108332276626e-05, + "loss": 0.6066, + "step": 10065 + }, + { + "epoch": 0.2938858893915074, + "grad_norm": 0.9376591745026733, + "learning_rate": 4.307127258383635e-05, + "loss": 0.6123, + "step": 10070 + }, + { + "epoch": 0.2940318108857435, + "grad_norm": 1.0417039857327592, + "learning_rate": 4.3063433209334424e-05, + "loss": 0.6729, + "step": 10075 + }, + { + "epoch": 0.2941777323799796, + "grad_norm": 1.08621513057713, + "learning_rate": 4.30555902105964e-05, + "loss": 0.6065, + "step": 10080 + }, + { + "epoch": 0.29432365387421566, + "grad_norm": 1.0581195725664356, + "learning_rate": 4.304774358944868e-05, + "loss": 0.7031, + "step": 10085 + }, + { + "epoch": 0.29446957536845175, + "grad_norm": 0.9561695129752983, + "learning_rate": 4.303989334771847e-05, + "loss": 0.6385, + "step": 10090 + }, + { + "epoch": 0.2946154968626879, + "grad_norm": 1.1846734382622395, + "learning_rate": 4.3032039487233874e-05, + "loss": 0.7111, + "step": 10095 + }, + { + "epoch": 0.294761418356924, + "grad_norm": 1.2588346196190965, + "learning_rate": 4.302418200982378e-05, + "loss": 0.675, + "step": 10100 + }, + { + "epoch": 0.2949073398511601, + "grad_norm": 1.1225204658783883, + "learning_rate": 4.301632091731796e-05, + "loss": 0.6334, + "step": 10105 + }, + { + "epoch": 0.29505326134539617, + "grad_norm": 1.0495946245311563, + "learning_rate": 4.300845621154703e-05, + "loss": 0.6137, + "step": 10110 + }, + { + "epoch": 0.29519918283963226, + "grad_norm": 1.0144801099071106, + "learning_rate": 4.300058789434242e-05, + "loss": 0.611, + "step": 10115 + }, + { + "epoch": 0.2953451043338684, + "grad_norm": 0.9492646281438598, + "learning_rate": 4.2992715967536415e-05, + "loss": 0.6006, + "step": 10120 + }, + { + "epoch": 0.2954910258281045, + "grad_norm": 1.039909505418342, + "learning_rate": 4.298484043296214e-05, + "loss": 0.6506, + "step": 10125 + }, + { + "epoch": 0.2956369473223406, + "grad_norm": 0.9382999988781197, + "learning_rate": 4.297696129245357e-05, + "loss": 0.6067, + "step": 10130 + }, + { + "epoch": 0.29578286881657667, + "grad_norm": 0.8427903373144261, + "learning_rate": 4.2969078547845496e-05, + "loss": 0.614, + "step": 10135 + }, + { + "epoch": 0.29592879031081276, + "grad_norm": 1.1997101368112018, + "learning_rate": 4.2961192200973565e-05, + "loss": 0.6175, + "step": 10140 + }, + { + "epoch": 0.2960747118050489, + "grad_norm": 1.0675060821333409, + "learning_rate": 4.295330225367428e-05, + "loss": 0.6323, + "step": 10145 + }, + { + "epoch": 0.296220633299285, + "grad_norm": 1.2598320735122508, + "learning_rate": 4.294540870778493e-05, + "loss": 0.6441, + "step": 10150 + }, + { + "epoch": 0.2963665547935211, + "grad_norm": 1.1203985738708426, + "learning_rate": 4.29375115651437e-05, + "loss": 0.7003, + "step": 10155 + }, + { + "epoch": 0.2965124762877572, + "grad_norm": 1.0751261921213477, + "learning_rate": 4.292961082758958e-05, + "loss": 0.6283, + "step": 10160 + }, + { + "epoch": 0.29665839778199327, + "grad_norm": 1.0305019782884048, + "learning_rate": 4.29217064969624e-05, + "loss": 0.6098, + "step": 10165 + }, + { + "epoch": 0.2968043192762294, + "grad_norm": 1.0394666710884315, + "learning_rate": 4.2913798575102835e-05, + "loss": 0.6087, + "step": 10170 + }, + { + "epoch": 0.2969502407704655, + "grad_norm": 1.1419209069357503, + "learning_rate": 4.290588706385238e-05, + "loss": 0.6373, + "step": 10175 + }, + { + "epoch": 0.2970961622647016, + "grad_norm": 0.9017028778278497, + "learning_rate": 4.289797196505341e-05, + "loss": 0.5877, + "step": 10180 + }, + { + "epoch": 0.2972420837589377, + "grad_norm": 0.9460849873541182, + "learning_rate": 4.289005328054908e-05, + "loss": 0.5985, + "step": 10185 + }, + { + "epoch": 0.29738800525317377, + "grad_norm": 1.2996981812683828, + "learning_rate": 4.288213101218339e-05, + "loss": 0.6697, + "step": 10190 + }, + { + "epoch": 0.2975339267474099, + "grad_norm": 1.3591789291783238, + "learning_rate": 4.287420516180122e-05, + "loss": 0.6539, + "step": 10195 + }, + { + "epoch": 0.297679848241646, + "grad_norm": 0.9869883294475226, + "learning_rate": 4.286627573124823e-05, + "loss": 0.6674, + "step": 10200 + }, + { + "epoch": 0.2978257697358821, + "grad_norm": 1.0633924641034176, + "learning_rate": 4.285834272237094e-05, + "loss": 0.6973, + "step": 10205 + }, + { + "epoch": 0.2979716912301182, + "grad_norm": 1.0806751091156153, + "learning_rate": 4.285040613701671e-05, + "loss": 0.6813, + "step": 10210 + }, + { + "epoch": 0.2981176127243543, + "grad_norm": 1.2665292301136417, + "learning_rate": 4.28424659770337e-05, + "loss": 0.638, + "step": 10215 + }, + { + "epoch": 0.2982635342185904, + "grad_norm": 0.9375090116684476, + "learning_rate": 4.283452224427096e-05, + "loss": 0.5913, + "step": 10220 + }, + { + "epoch": 0.2984094557128265, + "grad_norm": 0.9225897619759121, + "learning_rate": 4.282657494057829e-05, + "loss": 0.5837, + "step": 10225 + }, + { + "epoch": 0.2985553772070626, + "grad_norm": 1.113410029140933, + "learning_rate": 4.281862406780642e-05, + "loss": 0.6331, + "step": 10230 + }, + { + "epoch": 0.2987012987012987, + "grad_norm": 0.9690772171491423, + "learning_rate": 4.2810669627806816e-05, + "loss": 0.6192, + "step": 10235 + }, + { + "epoch": 0.2988472201955348, + "grad_norm": 0.9564890214667199, + "learning_rate": 4.280271162243184e-05, + "loss": 0.612, + "step": 10240 + }, + { + "epoch": 0.2989931416897709, + "grad_norm": 0.9493370878109928, + "learning_rate": 4.279475005353466e-05, + "loss": 0.6438, + "step": 10245 + }, + { + "epoch": 0.299139063184007, + "grad_norm": 1.1689399438433885, + "learning_rate": 4.2786784922969266e-05, + "loss": 0.6227, + "step": 10250 + }, + { + "epoch": 0.2992849846782431, + "grad_norm": 1.074599957155284, + "learning_rate": 4.27788162325905e-05, + "loss": 0.6673, + "step": 10255 + }, + { + "epoch": 0.2994309061724792, + "grad_norm": 1.131822976058886, + "learning_rate": 4.2770843984254004e-05, + "loss": 0.69, + "step": 10260 + }, + { + "epoch": 0.2995768276667153, + "grad_norm": 0.9818669758049734, + "learning_rate": 4.276286817981628e-05, + "loss": 0.613, + "step": 10265 + }, + { + "epoch": 0.29972274916095143, + "grad_norm": 1.049951587286779, + "learning_rate": 4.275488882113465e-05, + "loss": 0.5821, + "step": 10270 + }, + { + "epoch": 0.2998686706551875, + "grad_norm": 0.8586504788102471, + "learning_rate": 4.274690591006723e-05, + "loss": 0.5918, + "step": 10275 + }, + { + "epoch": 0.3000145921494236, + "grad_norm": 0.9193938930162942, + "learning_rate": 4.273891944847302e-05, + "loss": 0.5885, + "step": 10280 + }, + { + "epoch": 0.3001605136436597, + "grad_norm": 1.1161642067206445, + "learning_rate": 4.273092943821179e-05, + "loss": 0.6991, + "step": 10285 + }, + { + "epoch": 0.3003064351378958, + "grad_norm": 1.1258610428926281, + "learning_rate": 4.272293588114418e-05, + "loss": 0.6382, + "step": 10290 + }, + { + "epoch": 0.30045235663213193, + "grad_norm": 0.9402908905474704, + "learning_rate": 4.271493877913163e-05, + "loss": 0.6128, + "step": 10295 + }, + { + "epoch": 0.300598278126368, + "grad_norm": 1.0321535989101016, + "learning_rate": 4.270693813403643e-05, + "loss": 0.6417, + "step": 10300 + }, + { + "epoch": 0.3007441996206041, + "grad_norm": 1.0208349594054953, + "learning_rate": 4.269893394772166e-05, + "loss": 0.6325, + "step": 10305 + }, + { + "epoch": 0.3008901211148402, + "grad_norm": 1.1903529750006026, + "learning_rate": 4.269092622205125e-05, + "loss": 0.6456, + "step": 10310 + }, + { + "epoch": 0.3010360426090763, + "grad_norm": 0.8049629739529954, + "learning_rate": 4.268291495888995e-05, + "loss": 0.584, + "step": 10315 + }, + { + "epoch": 0.30118196410331244, + "grad_norm": 1.3438261508082656, + "learning_rate": 4.267490016010334e-05, + "loss": 0.6373, + "step": 10320 + }, + { + "epoch": 0.30132788559754853, + "grad_norm": 1.020809353844001, + "learning_rate": 4.2666881827557814e-05, + "loss": 0.6617, + "step": 10325 + }, + { + "epoch": 0.3014738070917846, + "grad_norm": 1.0982308795495834, + "learning_rate": 4.2658859963120575e-05, + "loss": 0.6369, + "step": 10330 + }, + { + "epoch": 0.3016197285860207, + "grad_norm": 0.941527729279539, + "learning_rate": 4.265083456865968e-05, + "loss": 0.6944, + "step": 10335 + }, + { + "epoch": 0.3017656500802568, + "grad_norm": 0.9426140706935435, + "learning_rate": 4.2642805646043973e-05, + "loss": 0.6319, + "step": 10340 + }, + { + "epoch": 0.30191157157449294, + "grad_norm": 1.0254375989601374, + "learning_rate": 4.263477319714317e-05, + "loss": 0.6737, + "step": 10345 + }, + { + "epoch": 0.30205749306872903, + "grad_norm": 0.9279709604109861, + "learning_rate": 4.262673722382775e-05, + "loss": 0.6231, + "step": 10350 + }, + { + "epoch": 0.3022034145629651, + "grad_norm": 1.1858764494611205, + "learning_rate": 4.2618697727969054e-05, + "loss": 0.6182, + "step": 10355 + }, + { + "epoch": 0.3023493360572012, + "grad_norm": 0.9713788051436115, + "learning_rate": 4.261065471143922e-05, + "loss": 0.6282, + "step": 10360 + }, + { + "epoch": 0.3024952575514373, + "grad_norm": 1.0979719956230793, + "learning_rate": 4.260260817611121e-05, + "loss": 0.5908, + "step": 10365 + }, + { + "epoch": 0.30264117904567345, + "grad_norm": 1.3096098724003238, + "learning_rate": 4.259455812385883e-05, + "loss": 0.6846, + "step": 10370 + }, + { + "epoch": 0.30278710053990954, + "grad_norm": 1.0476064464434878, + "learning_rate": 4.258650455655666e-05, + "loss": 0.6528, + "step": 10375 + }, + { + "epoch": 0.3029330220341456, + "grad_norm": 0.9798439797794413, + "learning_rate": 4.257844747608015e-05, + "loss": 0.5728, + "step": 10380 + }, + { + "epoch": 0.3030789435283817, + "grad_norm": 1.1895595078410353, + "learning_rate": 4.257038688430552e-05, + "loss": 0.6049, + "step": 10385 + }, + { + "epoch": 0.3032248650226178, + "grad_norm": 1.1103587792395997, + "learning_rate": 4.2562322783109835e-05, + "loss": 0.6094, + "step": 10390 + }, + { + "epoch": 0.30337078651685395, + "grad_norm": 0.9348793693097873, + "learning_rate": 4.255425517437098e-05, + "loss": 0.5397, + "step": 10395 + }, + { + "epoch": 0.30351670801109004, + "grad_norm": 1.0060834487029522, + "learning_rate": 4.2546184059967634e-05, + "loss": 0.596, + "step": 10400 + }, + { + "epoch": 0.30366262950532613, + "grad_norm": 0.8997301478964824, + "learning_rate": 4.253810944177932e-05, + "loss": 0.6551, + "step": 10405 + }, + { + "epoch": 0.3038085509995622, + "grad_norm": 1.0814856771207757, + "learning_rate": 4.253003132168636e-05, + "loss": 0.6845, + "step": 10410 + }, + { + "epoch": 0.3039544724937983, + "grad_norm": 1.0549408476391093, + "learning_rate": 4.252194970156989e-05, + "loss": 0.6582, + "step": 10415 + }, + { + "epoch": 0.30410039398803446, + "grad_norm": 1.1040764999147177, + "learning_rate": 4.2513864583311876e-05, + "loss": 0.6354, + "step": 10420 + }, + { + "epoch": 0.30424631548227055, + "grad_norm": 0.9637786458674548, + "learning_rate": 4.250577596879507e-05, + "loss": 0.6583, + "step": 10425 + }, + { + "epoch": 0.30439223697650664, + "grad_norm": 1.0484402766038825, + "learning_rate": 4.249768385990309e-05, + "loss": 0.5982, + "step": 10430 + }, + { + "epoch": 0.3045381584707427, + "grad_norm": 0.9464641488333255, + "learning_rate": 4.248958825852029e-05, + "loss": 0.6251, + "step": 10435 + }, + { + "epoch": 0.3046840799649788, + "grad_norm": 1.1148626531290056, + "learning_rate": 4.2481489166531915e-05, + "loss": 0.6138, + "step": 10440 + }, + { + "epoch": 0.30483000145921496, + "grad_norm": 0.9975829799111149, + "learning_rate": 4.2473386585823986e-05, + "loss": 0.6749, + "step": 10445 + }, + { + "epoch": 0.30497592295345105, + "grad_norm": 1.3235769403332804, + "learning_rate": 4.246528051828333e-05, + "loss": 0.721, + "step": 10450 + }, + { + "epoch": 0.30512184444768714, + "grad_norm": 1.1303143472315673, + "learning_rate": 4.2457170965797614e-05, + "loss": 0.6744, + "step": 10455 + }, + { + "epoch": 0.30526776594192323, + "grad_norm": 1.1619974982806487, + "learning_rate": 4.244905793025528e-05, + "loss": 0.6553, + "step": 10460 + }, + { + "epoch": 0.3054136874361593, + "grad_norm": 1.0191806419220486, + "learning_rate": 4.24409414135456e-05, + "loss": 0.6569, + "step": 10465 + }, + { + "epoch": 0.30555960893039547, + "grad_norm": 0.9681256557453185, + "learning_rate": 4.243282141755866e-05, + "loss": 0.6268, + "step": 10470 + }, + { + "epoch": 0.30570553042463156, + "grad_norm": 1.2333501019281996, + "learning_rate": 4.242469794418536e-05, + "loss": 0.6088, + "step": 10475 + }, + { + "epoch": 0.30585145191886765, + "grad_norm": 1.061462196546728, + "learning_rate": 4.241657099531741e-05, + "loss": 0.6111, + "step": 10480 + }, + { + "epoch": 0.30599737341310373, + "grad_norm": 1.0573594054231026, + "learning_rate": 4.24084405728473e-05, + "loss": 0.6086, + "step": 10485 + }, + { + "epoch": 0.3061432949073398, + "grad_norm": 1.0199023301421437, + "learning_rate": 4.2400306678668355e-05, + "loss": 0.5654, + "step": 10490 + }, + { + "epoch": 0.30628921640157597, + "grad_norm": 1.0326110428813924, + "learning_rate": 4.239216931467472e-05, + "loss": 0.6163, + "step": 10495 + }, + { + "epoch": 0.30643513789581206, + "grad_norm": 1.1178488965029092, + "learning_rate": 4.2384028482761315e-05, + "loss": 0.6546, + "step": 10500 + }, + { + "epoch": 0.30658105939004815, + "grad_norm": 0.8714919913236092, + "learning_rate": 4.237588418482389e-05, + "loss": 0.6007, + "step": 10505 + }, + { + "epoch": 0.30672698088428424, + "grad_norm": 1.1928375713498323, + "learning_rate": 4.2367736422759005e-05, + "loss": 0.681, + "step": 10510 + }, + { + "epoch": 0.30687290237852033, + "grad_norm": 1.0159384696385847, + "learning_rate": 4.2359585198464006e-05, + "loss": 0.5611, + "step": 10515 + }, + { + "epoch": 0.3070188238727565, + "grad_norm": 1.0785841915099939, + "learning_rate": 4.235143051383706e-05, + "loss": 0.5966, + "step": 10520 + }, + { + "epoch": 0.30716474536699256, + "grad_norm": 1.0469491083340396, + "learning_rate": 4.2343272370777155e-05, + "loss": 0.61, + "step": 10525 + }, + { + "epoch": 0.30731066686122865, + "grad_norm": 0.9353888289185007, + "learning_rate": 4.233511077118404e-05, + "loss": 0.6322, + "step": 10530 + }, + { + "epoch": 0.30745658835546474, + "grad_norm": 1.0130973165669086, + "learning_rate": 4.232694571695832e-05, + "loss": 0.6719, + "step": 10535 + }, + { + "epoch": 0.30760250984970083, + "grad_norm": 1.3664932792745657, + "learning_rate": 4.2318777210001364e-05, + "loss": 0.6945, + "step": 10540 + }, + { + "epoch": 0.307748431343937, + "grad_norm": 0.9783476864455728, + "learning_rate": 4.231060525221537e-05, + "loss": 0.5983, + "step": 10545 + }, + { + "epoch": 0.30789435283817307, + "grad_norm": 1.1209529751495166, + "learning_rate": 4.230242984550333e-05, + "loss": 0.6078, + "step": 10550 + }, + { + "epoch": 0.30804027433240916, + "grad_norm": 1.0096799691066385, + "learning_rate": 4.229425099176903e-05, + "loss": 0.5979, + "step": 10555 + }, + { + "epoch": 0.30818619582664525, + "grad_norm": 1.069972240463686, + "learning_rate": 4.228606869291708e-05, + "loss": 0.6059, + "step": 10560 + }, + { + "epoch": 0.30833211732088134, + "grad_norm": 0.9923507863451684, + "learning_rate": 4.227788295085288e-05, + "loss": 0.605, + "step": 10565 + }, + { + "epoch": 0.3084780388151175, + "grad_norm": 1.0936931170147761, + "learning_rate": 4.2269693767482635e-05, + "loss": 0.6062, + "step": 10570 + }, + { + "epoch": 0.3086239603093536, + "grad_norm": 1.0228773314444557, + "learning_rate": 4.226150114471334e-05, + "loss": 0.6837, + "step": 10575 + }, + { + "epoch": 0.30876988180358966, + "grad_norm": 0.8764492522245093, + "learning_rate": 4.225330508445281e-05, + "loss": 0.599, + "step": 10580 + }, + { + "epoch": 0.30891580329782575, + "grad_norm": 1.0172661540894417, + "learning_rate": 4.224510558860964e-05, + "loss": 0.6441, + "step": 10585 + }, + { + "epoch": 0.3090617247920619, + "grad_norm": 1.078365886822906, + "learning_rate": 4.2236902659093244e-05, + "loss": 0.5695, + "step": 10590 + }, + { + "epoch": 0.309207646286298, + "grad_norm": 1.0555723066920377, + "learning_rate": 4.222869629781383e-05, + "loss": 0.585, + "step": 10595 + }, + { + "epoch": 0.3093535677805341, + "grad_norm": 1.1878394546788085, + "learning_rate": 4.222048650668239e-05, + "loss": 0.6247, + "step": 10600 + }, + { + "epoch": 0.30949948927477017, + "grad_norm": 0.9529532759971112, + "learning_rate": 4.2212273287610744e-05, + "loss": 0.6345, + "step": 10605 + }, + { + "epoch": 0.30964541076900626, + "grad_norm": 1.2587038968001414, + "learning_rate": 4.220405664251148e-05, + "loss": 0.6873, + "step": 10610 + }, + { + "epoch": 0.3097913322632424, + "grad_norm": 1.0210864913772684, + "learning_rate": 4.2195836573298006e-05, + "loss": 0.7019, + "step": 10615 + }, + { + "epoch": 0.3099372537574785, + "grad_norm": 1.0136017567723619, + "learning_rate": 4.218761308188451e-05, + "loss": 0.5834, + "step": 10620 + }, + { + "epoch": 0.3100831752517146, + "grad_norm": 1.1353931397937516, + "learning_rate": 4.217938617018599e-05, + "loss": 0.6398, + "step": 10625 + }, + { + "epoch": 0.31022909674595067, + "grad_norm": 1.0139825340526678, + "learning_rate": 4.217115584011824e-05, + "loss": 0.6074, + "step": 10630 + }, + { + "epoch": 0.31037501824018676, + "grad_norm": 1.1376670674778286, + "learning_rate": 4.216292209359784e-05, + "loss": 0.6844, + "step": 10635 + }, + { + "epoch": 0.3105209397344229, + "grad_norm": 0.9654074776744501, + "learning_rate": 4.215468493254217e-05, + "loss": 0.5866, + "step": 10640 + }, + { + "epoch": 0.310666861228659, + "grad_norm": 1.0227474480457441, + "learning_rate": 4.214644435886942e-05, + "loss": 0.5637, + "step": 10645 + }, + { + "epoch": 0.3108127827228951, + "grad_norm": 1.0341845935844518, + "learning_rate": 4.213820037449854e-05, + "loss": 0.6125, + "step": 10650 + }, + { + "epoch": 0.3109587042171312, + "grad_norm": 1.0871959171133545, + "learning_rate": 4.212995298134932e-05, + "loss": 0.6707, + "step": 10655 + }, + { + "epoch": 0.31110462571136727, + "grad_norm": 1.0314432776746578, + "learning_rate": 4.212170218134229e-05, + "loss": 0.607, + "step": 10660 + }, + { + "epoch": 0.3112505472056034, + "grad_norm": 0.8306397051529498, + "learning_rate": 4.211344797639884e-05, + "loss": 0.6039, + "step": 10665 + }, + { + "epoch": 0.3113964686998395, + "grad_norm": 1.110747509191792, + "learning_rate": 4.210519036844109e-05, + "loss": 0.5711, + "step": 10670 + }, + { + "epoch": 0.3115423901940756, + "grad_norm": 1.0494420892508958, + "learning_rate": 4.209692935939198e-05, + "loss": 0.6827, + "step": 10675 + }, + { + "epoch": 0.3116883116883117, + "grad_norm": 1.0138262915884315, + "learning_rate": 4.2088664951175246e-05, + "loss": 0.6282, + "step": 10680 + }, + { + "epoch": 0.31183423318254777, + "grad_norm": 0.8593114601658431, + "learning_rate": 4.208039714571541e-05, + "loss": 0.5704, + "step": 10685 + }, + { + "epoch": 0.3119801546767839, + "grad_norm": 1.2149061859123544, + "learning_rate": 4.207212594493778e-05, + "loss": 0.6452, + "step": 10690 + }, + { + "epoch": 0.31212607617102, + "grad_norm": 0.9980836237159034, + "learning_rate": 4.206385135076847e-05, + "loss": 0.5795, + "step": 10695 + }, + { + "epoch": 0.3122719976652561, + "grad_norm": 0.9631069460264696, + "learning_rate": 4.205557336513435e-05, + "loss": 0.5393, + "step": 10700 + }, + { + "epoch": 0.3124179191594922, + "grad_norm": 1.1007460261290956, + "learning_rate": 4.204729198996314e-05, + "loss": 0.6376, + "step": 10705 + }, + { + "epoch": 0.3125638406537283, + "grad_norm": 1.2187241010617924, + "learning_rate": 4.203900722718328e-05, + "loss": 0.6176, + "step": 10710 + }, + { + "epoch": 0.3127097621479644, + "grad_norm": 1.126356224322464, + "learning_rate": 4.203071907872405e-05, + "loss": 0.6695, + "step": 10715 + }, + { + "epoch": 0.3128556836422005, + "grad_norm": 1.0258710602760182, + "learning_rate": 4.2022427546515497e-05, + "loss": 0.5803, + "step": 10720 + }, + { + "epoch": 0.3130016051364366, + "grad_norm": 1.0666554581542338, + "learning_rate": 4.2014132632488456e-05, + "loss": 0.6525, + "step": 10725 + }, + { + "epoch": 0.3131475266306727, + "grad_norm": 0.922861217567363, + "learning_rate": 4.200583433857455e-05, + "loss": 0.5665, + "step": 10730 + }, + { + "epoch": 0.3132934481249088, + "grad_norm": 1.1050478683539562, + "learning_rate": 4.19975326667062e-05, + "loss": 0.6062, + "step": 10735 + }, + { + "epoch": 0.3134393696191449, + "grad_norm": 1.036686179087357, + "learning_rate": 4.1989227618816604e-05, + "loss": 0.5828, + "step": 10740 + }, + { + "epoch": 0.313585291113381, + "grad_norm": 1.2464291074809783, + "learning_rate": 4.198091919683973e-05, + "loss": 0.6058, + "step": 10745 + }, + { + "epoch": 0.3137312126076171, + "grad_norm": 1.0411816435634602, + "learning_rate": 4.197260740271038e-05, + "loss": 0.6415, + "step": 10750 + }, + { + "epoch": 0.3138771341018532, + "grad_norm": 1.17981679213735, + "learning_rate": 4.196429223836408e-05, + "loss": 0.6391, + "step": 10755 + }, + { + "epoch": 0.3140230555960893, + "grad_norm": 1.1889506574903397, + "learning_rate": 4.19559737057372e-05, + "loss": 0.6569, + "step": 10760 + }, + { + "epoch": 0.31416897709032543, + "grad_norm": 1.183577120267219, + "learning_rate": 4.194765180676684e-05, + "loss": 0.6648, + "step": 10765 + }, + { + "epoch": 0.3143148985845615, + "grad_norm": 1.1719465246178082, + "learning_rate": 4.1939326543390926e-05, + "loss": 0.624, + "step": 10770 + }, + { + "epoch": 0.3144608200787976, + "grad_norm": 0.9473059984670127, + "learning_rate": 4.193099791754815e-05, + "loss": 0.6816, + "step": 10775 + }, + { + "epoch": 0.3146067415730337, + "grad_norm": 1.0582699688118096, + "learning_rate": 4.192266593117797e-05, + "loss": 0.6011, + "step": 10780 + }, + { + "epoch": 0.3147526630672698, + "grad_norm": 0.8657390430632141, + "learning_rate": 4.191433058622067e-05, + "loss": 0.6248, + "step": 10785 + }, + { + "epoch": 0.31489858456150593, + "grad_norm": 1.1959672626993205, + "learning_rate": 4.190599188461727e-05, + "loss": 0.7468, + "step": 10790 + }, + { + "epoch": 0.315044506055742, + "grad_norm": 1.1954092627551398, + "learning_rate": 4.189764982830961e-05, + "loss": 0.6675, + "step": 10795 + }, + { + "epoch": 0.3151904275499781, + "grad_norm": 1.1013725880470744, + "learning_rate": 4.188930441924029e-05, + "loss": 0.6017, + "step": 10800 + }, + { + "epoch": 0.3153363490442142, + "grad_norm": 1.2449000053096426, + "learning_rate": 4.188095565935268e-05, + "loss": 0.6477, + "step": 10805 + }, + { + "epoch": 0.3154822705384503, + "grad_norm": 0.9925361941041598, + "learning_rate": 4.1872603550590956e-05, + "loss": 0.5667, + "step": 10810 + }, + { + "epoch": 0.31562819203268644, + "grad_norm": 8.94427111548512, + "learning_rate": 4.1864248094900056e-05, + "loss": 0.6022, + "step": 10815 + }, + { + "epoch": 0.31577411352692253, + "grad_norm": 20.613875296370193, + "learning_rate": 4.185588929422572e-05, + "loss": 0.6696, + "step": 10820 + }, + { + "epoch": 0.3159200350211586, + "grad_norm": 1.0461280342824664, + "learning_rate": 4.1847527150514423e-05, + "loss": 0.5919, + "step": 10825 + }, + { + "epoch": 0.3160659565153947, + "grad_norm": 0.9761229171053537, + "learning_rate": 4.183916166571348e-05, + "loss": 0.637, + "step": 10830 + }, + { + "epoch": 0.3162118780096308, + "grad_norm": 1.0637595291027524, + "learning_rate": 4.183079284177093e-05, + "loss": 0.6499, + "step": 10835 + }, + { + "epoch": 0.31635779950386694, + "grad_norm": 1.034824977281423, + "learning_rate": 4.182242068063561e-05, + "loss": 0.604, + "step": 10840 + }, + { + "epoch": 0.31650372099810303, + "grad_norm": 1.0281952931191731, + "learning_rate": 4.1814045184257137e-05, + "loss": 0.6107, + "step": 10845 + }, + { + "epoch": 0.3166496424923391, + "grad_norm": 1.2192013642131252, + "learning_rate": 4.180566635458591e-05, + "loss": 0.667, + "step": 10850 + }, + { + "epoch": 0.3167955639865752, + "grad_norm": 1.4924884820422448, + "learning_rate": 4.179728419357307e-05, + "loss": 0.695, + "step": 10855 + }, + { + "epoch": 0.3169414854808113, + "grad_norm": 1.046807972073833, + "learning_rate": 4.178889870317059e-05, + "loss": 0.6295, + "step": 10860 + }, + { + "epoch": 0.31708740697504745, + "grad_norm": 1.0922531445192847, + "learning_rate": 4.178050988533118e-05, + "loss": 0.6483, + "step": 10865 + }, + { + "epoch": 0.31723332846928354, + "grad_norm": 1.0942237791905836, + "learning_rate": 4.177211774200832e-05, + "loss": 0.5993, + "step": 10870 + }, + { + "epoch": 0.3173792499635196, + "grad_norm": 0.966554865748181, + "learning_rate": 4.176372227515629e-05, + "loss": 0.579, + "step": 10875 + }, + { + "epoch": 0.3175251714577557, + "grad_norm": 1.1416827676907393, + "learning_rate": 4.175532348673012e-05, + "loss": 0.6455, + "step": 10880 + }, + { + "epoch": 0.3176710929519918, + "grad_norm": 1.0338401559576693, + "learning_rate": 4.1746921378685645e-05, + "loss": 0.6303, + "step": 10885 + }, + { + "epoch": 0.31781701444622795, + "grad_norm": 0.9358905890110428, + "learning_rate": 4.1738515952979437e-05, + "loss": 0.5801, + "step": 10890 + }, + { + "epoch": 0.31796293594046404, + "grad_norm": 0.9884018544105023, + "learning_rate": 4.173010721156885e-05, + "loss": 0.5735, + "step": 10895 + }, + { + "epoch": 0.31810885743470013, + "grad_norm": 1.115863129104097, + "learning_rate": 4.172169515641203e-05, + "loss": 0.6714, + "step": 10900 + }, + { + "epoch": 0.3182547789289362, + "grad_norm": 0.9213878315711291, + "learning_rate": 4.1713279789467885e-05, + "loss": 0.6328, + "step": 10905 + }, + { + "epoch": 0.3184007004231723, + "grad_norm": 1.142383580418868, + "learning_rate": 4.170486111269607e-05, + "loss": 0.6116, + "step": 10910 + }, + { + "epoch": 0.31854662191740846, + "grad_norm": 0.9637397484995928, + "learning_rate": 4.1696439128057046e-05, + "loss": 0.5982, + "step": 10915 + }, + { + "epoch": 0.31869254341164455, + "grad_norm": 1.0529787774530486, + "learning_rate": 4.168801383751203e-05, + "loss": 0.6153, + "step": 10920 + }, + { + "epoch": 0.31883846490588064, + "grad_norm": 1.026921087027283, + "learning_rate": 4.167958524302301e-05, + "loss": 0.6557, + "step": 10925 + }, + { + "epoch": 0.3189843864001167, + "grad_norm": 2.8855164738541, + "learning_rate": 4.167115334655273e-05, + "loss": 0.6603, + "step": 10930 + }, + { + "epoch": 0.3191303078943528, + "grad_norm": 1.127725152009719, + "learning_rate": 4.166271815006472e-05, + "loss": 0.6743, + "step": 10935 + }, + { + "epoch": 0.31927622938858896, + "grad_norm": 1.1322587046957713, + "learning_rate": 4.1654279655523285e-05, + "loss": 0.6542, + "step": 10940 + }, + { + "epoch": 0.31942215088282505, + "grad_norm": 1.0132380480317145, + "learning_rate": 4.164583786489346e-05, + "loss": 0.5809, + "step": 10945 + }, + { + "epoch": 0.31956807237706114, + "grad_norm": 1.0402728433038912, + "learning_rate": 4.16373927801411e-05, + "loss": 0.6701, + "step": 10950 + }, + { + "epoch": 0.31971399387129723, + "grad_norm": 1.2710436361080821, + "learning_rate": 4.162894440323278e-05, + "loss": 0.6675, + "step": 10955 + }, + { + "epoch": 0.3198599153655333, + "grad_norm": 0.9650906233915678, + "learning_rate": 4.162049273613588e-05, + "loss": 0.6025, + "step": 10960 + }, + { + "epoch": 0.32000583685976947, + "grad_norm": 0.9641418941329717, + "learning_rate": 4.161203778081852e-05, + "loss": 0.6092, + "step": 10965 + }, + { + "epoch": 0.32015175835400556, + "grad_norm": 0.8953927753027289, + "learning_rate": 4.160357953924959e-05, + "loss": 0.6123, + "step": 10970 + }, + { + "epoch": 0.32029767984824165, + "grad_norm": 0.9199657888722718, + "learning_rate": 4.159511801339876e-05, + "loss": 0.5749, + "step": 10975 + }, + { + "epoch": 0.32044360134247774, + "grad_norm": 1.0249280125653601, + "learning_rate": 4.1586653205236447e-05, + "loss": 0.6084, + "step": 10980 + }, + { + "epoch": 0.3205895228367138, + "grad_norm": 1.1417663925975452, + "learning_rate": 4.1578185116733846e-05, + "loss": 0.6533, + "step": 10985 + }, + { + "epoch": 0.32073544433094997, + "grad_norm": 1.0871716549112553, + "learning_rate": 4.15697137498629e-05, + "loss": 0.5614, + "step": 10990 + }, + { + "epoch": 0.32088136582518606, + "grad_norm": 1.1016528591290073, + "learning_rate": 4.156123910659632e-05, + "loss": 0.6273, + "step": 10995 + }, + { + "epoch": 0.32102728731942215, + "grad_norm": 1.0290611996249612, + "learning_rate": 4.1552761188907605e-05, + "loss": 0.6139, + "step": 11000 + }, + { + "epoch": 0.32117320881365824, + "grad_norm": 1.0931694809931243, + "learning_rate": 4.154427999877099e-05, + "loss": 0.6528, + "step": 11005 + }, + { + "epoch": 0.32131913030789433, + "grad_norm": 0.9548319298644551, + "learning_rate": 4.153579553816147e-05, + "loss": 0.641, + "step": 11010 + }, + { + "epoch": 0.3214650518021305, + "grad_norm": 0.9841313854224992, + "learning_rate": 4.152730780905482e-05, + "loss": 0.7052, + "step": 11015 + }, + { + "epoch": 0.32161097329636656, + "grad_norm": 0.988289395598077, + "learning_rate": 4.1518816813427556e-05, + "loss": 0.6301, + "step": 11020 + }, + { + "epoch": 0.32175689479060265, + "grad_norm": 0.9396116658389815, + "learning_rate": 4.151032255325697e-05, + "loss": 0.5954, + "step": 11025 + }, + { + "epoch": 0.32190281628483874, + "grad_norm": 1.0568483133635853, + "learning_rate": 4.150182503052111e-05, + "loss": 0.6344, + "step": 11030 + }, + { + "epoch": 0.32204873777907483, + "grad_norm": 1.0601604768196267, + "learning_rate": 4.149332424719879e-05, + "loss": 0.6798, + "step": 11035 + }, + { + "epoch": 0.322194659273311, + "grad_norm": 1.1254572699623864, + "learning_rate": 4.148482020526956e-05, + "loss": 0.6162, + "step": 11040 + }, + { + "epoch": 0.32234058076754707, + "grad_norm": 1.2528959522564855, + "learning_rate": 4.147631290671376e-05, + "loss": 0.5619, + "step": 11045 + }, + { + "epoch": 0.32248650226178316, + "grad_norm": 1.260123007927408, + "learning_rate": 4.1467802353512455e-05, + "loss": 0.6595, + "step": 11050 + }, + { + "epoch": 0.32263242375601925, + "grad_norm": 0.9933220625274068, + "learning_rate": 4.1459288547647504e-05, + "loss": 0.6269, + "step": 11055 + }, + { + "epoch": 0.32277834525025534, + "grad_norm": 0.9715078817219612, + "learning_rate": 4.14507714911015e-05, + "loss": 0.6045, + "step": 11060 + }, + { + "epoch": 0.3229242667444915, + "grad_norm": 0.9812043210205588, + "learning_rate": 4.144225118585779e-05, + "loss": 0.5581, + "step": 11065 + }, + { + "epoch": 0.3230701882387276, + "grad_norm": 1.0286662370463173, + "learning_rate": 4.143372763390051e-05, + "loss": 0.6451, + "step": 11070 + }, + { + "epoch": 0.32321610973296366, + "grad_norm": 1.1639407130847483, + "learning_rate": 4.1425200837214505e-05, + "loss": 0.6655, + "step": 11075 + }, + { + "epoch": 0.32336203122719975, + "grad_norm": 1.0132817647094212, + "learning_rate": 4.141667079778541e-05, + "loss": 0.6587, + "step": 11080 + }, + { + "epoch": 0.32350795272143584, + "grad_norm": 1.2678494324028673, + "learning_rate": 4.1408137517599596e-05, + "loss": 0.5957, + "step": 11085 + }, + { + "epoch": 0.323653874215672, + "grad_norm": 1.112567411200346, + "learning_rate": 4.13996009986442e-05, + "loss": 0.6728, + "step": 11090 + }, + { + "epoch": 0.3237997957099081, + "grad_norm": 1.069826485767921, + "learning_rate": 4.139106124290712e-05, + "loss": 0.6838, + "step": 11095 + }, + { + "epoch": 0.32394571720414417, + "grad_norm": 1.0727866184639705, + "learning_rate": 4.1382518252376985e-05, + "loss": 0.5858, + "step": 11100 + }, + { + "epoch": 0.32409163869838026, + "grad_norm": 0.8936483939993378, + "learning_rate": 4.137397202904319e-05, + "loss": 0.6067, + "step": 11105 + }, + { + "epoch": 0.32423756019261635, + "grad_norm": 1.1346353130779891, + "learning_rate": 4.136542257489589e-05, + "loss": 0.6161, + "step": 11110 + }, + { + "epoch": 0.3243834816868525, + "grad_norm": 1.0960577024248674, + "learning_rate": 4.1356869891925984e-05, + "loss": 0.6804, + "step": 11115 + }, + { + "epoch": 0.3245294031810886, + "grad_norm": 0.9792482318117541, + "learning_rate": 4.134831398212511e-05, + "loss": 0.6196, + "step": 11120 + }, + { + "epoch": 0.3246753246753247, + "grad_norm": 1.036312950440058, + "learning_rate": 4.133975484748569e-05, + "loss": 0.6321, + "step": 11125 + }, + { + "epoch": 0.32482124616956076, + "grad_norm": 0.9612634765544875, + "learning_rate": 4.1331192490000875e-05, + "loss": 0.5852, + "step": 11130 + }, + { + "epoch": 0.32496716766379685, + "grad_norm": 1.01009473889226, + "learning_rate": 4.132262691166456e-05, + "loss": 0.6316, + "step": 11135 + }, + { + "epoch": 0.325113089158033, + "grad_norm": 1.0315024708222542, + "learning_rate": 4.131405811447141e-05, + "loss": 0.6103, + "step": 11140 + }, + { + "epoch": 0.3252590106522691, + "grad_norm": 0.9133158035057377, + "learning_rate": 4.130548610041682e-05, + "loss": 0.677, + "step": 11145 + }, + { + "epoch": 0.3254049321465052, + "grad_norm": 0.8593012497677728, + "learning_rate": 4.1296910871496955e-05, + "loss": 0.5773, + "step": 11150 + }, + { + "epoch": 0.32555085364074127, + "grad_norm": 1.040992806162476, + "learning_rate": 4.1288332429708715e-05, + "loss": 0.6235, + "step": 11155 + }, + { + "epoch": 0.32569677513497736, + "grad_norm": 1.0317014632904054, + "learning_rate": 4.127975077704973e-05, + "loss": 0.6291, + "step": 11160 + }, + { + "epoch": 0.3258426966292135, + "grad_norm": 1.6706118109451882, + "learning_rate": 4.127116591551843e-05, + "loss": 0.7069, + "step": 11165 + }, + { + "epoch": 0.3259886181234496, + "grad_norm": 1.020123354206265, + "learning_rate": 4.126257784711394e-05, + "loss": 0.6371, + "step": 11170 + }, + { + "epoch": 0.3261345396176857, + "grad_norm": 0.9066459696650431, + "learning_rate": 4.125398657383616e-05, + "loss": 0.6216, + "step": 11175 + }, + { + "epoch": 0.32628046111192177, + "grad_norm": 1.0130280003722434, + "learning_rate": 4.124539209768573e-05, + "loss": 0.6074, + "step": 11180 + }, + { + "epoch": 0.32642638260615786, + "grad_norm": 0.9384077668714056, + "learning_rate": 4.1236794420664014e-05, + "loss": 0.6406, + "step": 11185 + }, + { + "epoch": 0.326572304100394, + "grad_norm": 0.9555525609756401, + "learning_rate": 4.122819354477317e-05, + "loss": 0.6486, + "step": 11190 + }, + { + "epoch": 0.3267182255946301, + "grad_norm": 1.1022165736744427, + "learning_rate": 4.121958947201606e-05, + "loss": 0.607, + "step": 11195 + }, + { + "epoch": 0.3268641470888662, + "grad_norm": 0.9738858125506661, + "learning_rate": 4.12109822043963e-05, + "loss": 0.5715, + "step": 11200 + }, + { + "epoch": 0.3270100685831023, + "grad_norm": 1.1979891364281465, + "learning_rate": 4.120237174391826e-05, + "loss": 0.6247, + "step": 11205 + }, + { + "epoch": 0.32715599007733837, + "grad_norm": 1.2370376908493559, + "learning_rate": 4.1193758092587034e-05, + "loss": 0.6777, + "step": 11210 + }, + { + "epoch": 0.3273019115715745, + "grad_norm": 0.9018058068412678, + "learning_rate": 4.118514125240849e-05, + "loss": 0.5635, + "step": 11215 + }, + { + "epoch": 0.3274478330658106, + "grad_norm": 1.0579673015690656, + "learning_rate": 4.117652122538921e-05, + "loss": 0.6732, + "step": 11220 + }, + { + "epoch": 0.3275937545600467, + "grad_norm": 1.1723372942662171, + "learning_rate": 4.116789801353652e-05, + "loss": 0.6172, + "step": 11225 + }, + { + "epoch": 0.3277396760542828, + "grad_norm": 1.0673296369873437, + "learning_rate": 4.115927161885851e-05, + "loss": 0.6362, + "step": 11230 + }, + { + "epoch": 0.32788559754851887, + "grad_norm": 0.9483166837365734, + "learning_rate": 4.1150642043363994e-05, + "loss": 0.6025, + "step": 11235 + }, + { + "epoch": 0.328031519042755, + "grad_norm": 0.8741627602015403, + "learning_rate": 4.114200928906252e-05, + "loss": 0.6055, + "step": 11240 + }, + { + "epoch": 0.3281774405369911, + "grad_norm": 1.1334322761907387, + "learning_rate": 4.1133373357964405e-05, + "loss": 0.679, + "step": 11245 + }, + { + "epoch": 0.3283233620312272, + "grad_norm": 0.984608212113143, + "learning_rate": 4.1124734252080674e-05, + "loss": 0.5891, + "step": 11250 + }, + { + "epoch": 0.3284692835254633, + "grad_norm": 1.0136537559423136, + "learning_rate": 4.1116091973423104e-05, + "loss": 0.624, + "step": 11255 + }, + { + "epoch": 0.3286152050196994, + "grad_norm": 0.8956438853083151, + "learning_rate": 4.1107446524004205e-05, + "loss": 0.598, + "step": 11260 + }, + { + "epoch": 0.3287611265139355, + "grad_norm": 1.0021347414901831, + "learning_rate": 4.109879790583725e-05, + "loss": 0.5737, + "step": 11265 + }, + { + "epoch": 0.3289070480081716, + "grad_norm": 1.139457046426254, + "learning_rate": 4.1090146120936214e-05, + "loss": 0.6488, + "step": 11270 + }, + { + "epoch": 0.3290529695024077, + "grad_norm": 1.025917268212767, + "learning_rate": 4.1081491171315834e-05, + "loss": 0.6672, + "step": 11275 + }, + { + "epoch": 0.3291988909966438, + "grad_norm": 1.12371622829611, + "learning_rate": 4.107283305899158e-05, + "loss": 0.696, + "step": 11280 + }, + { + "epoch": 0.3293448124908799, + "grad_norm": 1.0537778864026242, + "learning_rate": 4.1064171785979646e-05, + "loss": 0.5684, + "step": 11285 + }, + { + "epoch": 0.329490733985116, + "grad_norm": 1.0199855695199556, + "learning_rate": 4.1055507354296974e-05, + "loss": 0.6387, + "step": 11290 + }, + { + "epoch": 0.3296366554793521, + "grad_norm": 0.9583958003578545, + "learning_rate": 4.104683976596124e-05, + "loss": 0.6216, + "step": 11295 + }, + { + "epoch": 0.3297825769735882, + "grad_norm": 0.9497042714701236, + "learning_rate": 4.103816902299087e-05, + "loss": 0.5393, + "step": 11300 + }, + { + "epoch": 0.3299284984678243, + "grad_norm": 1.1500956032313587, + "learning_rate": 4.102949512740498e-05, + "loss": 0.6515, + "step": 11305 + }, + { + "epoch": 0.33007441996206044, + "grad_norm": 1.0790323256220478, + "learning_rate": 4.102081808122346e-05, + "loss": 0.6245, + "step": 11310 + }, + { + "epoch": 0.33022034145629653, + "grad_norm": 0.9668724223886171, + "learning_rate": 4.1012137886466926e-05, + "loss": 0.5914, + "step": 11315 + }, + { + "epoch": 0.3303662629505326, + "grad_norm": 1.206666398509121, + "learning_rate": 4.100345454515673e-05, + "loss": 0.687, + "step": 11320 + }, + { + "epoch": 0.3305121844447687, + "grad_norm": 1.119715151656299, + "learning_rate": 4.0994768059314934e-05, + "loss": 0.6153, + "step": 11325 + }, + { + "epoch": 0.3306581059390048, + "grad_norm": 0.9107223002161154, + "learning_rate": 4.098607843096435e-05, + "loss": 0.5969, + "step": 11330 + }, + { + "epoch": 0.33080402743324094, + "grad_norm": 1.1094921257211765, + "learning_rate": 4.097738566212854e-05, + "loss": 0.6428, + "step": 11335 + }, + { + "epoch": 0.33094994892747703, + "grad_norm": 1.1329701633719147, + "learning_rate": 4.096868975483176e-05, + "loss": 0.6269, + "step": 11340 + }, + { + "epoch": 0.3310958704217131, + "grad_norm": 0.9741960681615446, + "learning_rate": 4.095999071109901e-05, + "loss": 0.5984, + "step": 11345 + }, + { + "epoch": 0.3312417919159492, + "grad_norm": 1.2207424965425768, + "learning_rate": 4.095128853295604e-05, + "loss": 0.6507, + "step": 11350 + }, + { + "epoch": 0.3313877134101853, + "grad_norm": 1.0327260774376408, + "learning_rate": 4.094258322242931e-05, + "loss": 0.5989, + "step": 11355 + }, + { + "epoch": 0.33153363490442145, + "grad_norm": 1.2866155800385293, + "learning_rate": 4.0933874781546004e-05, + "loss": 0.646, + "step": 11360 + }, + { + "epoch": 0.33167955639865754, + "grad_norm": 0.9436024293875237, + "learning_rate": 4.092516321233406e-05, + "loss": 0.5883, + "step": 11365 + }, + { + "epoch": 0.3318254778928936, + "grad_norm": 0.9986688957699347, + "learning_rate": 4.0916448516822125e-05, + "loss": 0.6994, + "step": 11370 + }, + { + "epoch": 0.3319713993871297, + "grad_norm": 1.1694862800543997, + "learning_rate": 4.090773069703957e-05, + "loss": 0.5735, + "step": 11375 + }, + { + "epoch": 0.3321173208813658, + "grad_norm": 0.9386583051525885, + "learning_rate": 4.089900975501651e-05, + "loss": 0.6672, + "step": 11380 + }, + { + "epoch": 0.33226324237560195, + "grad_norm": 1.076219490657191, + "learning_rate": 4.089028569278378e-05, + "loss": 0.5972, + "step": 11385 + }, + { + "epoch": 0.33240916386983804, + "grad_norm": 0.9554744577351556, + "learning_rate": 4.088155851237294e-05, + "loss": 0.6338, + "step": 11390 + }, + { + "epoch": 0.33255508536407413, + "grad_norm": 1.0174314800802398, + "learning_rate": 4.0872828215816266e-05, + "loss": 0.6115, + "step": 11395 + }, + { + "epoch": 0.3327010068583102, + "grad_norm": 0.9218432861621283, + "learning_rate": 4.086409480514679e-05, + "loss": 0.5522, + "step": 11400 + }, + { + "epoch": 0.3328469283525463, + "grad_norm": 1.0458586652940354, + "learning_rate": 4.085535828239823e-05, + "loss": 0.6031, + "step": 11405 + }, + { + "epoch": 0.33299284984678246, + "grad_norm": 1.0612795754860267, + "learning_rate": 4.084661864960507e-05, + "loss": 0.6412, + "step": 11410 + }, + { + "epoch": 0.33313877134101855, + "grad_norm": 0.9647009468365729, + "learning_rate": 4.083787590880249e-05, + "loss": 0.591, + "step": 11415 + }, + { + "epoch": 0.33328469283525464, + "grad_norm": 1.0154476708360052, + "learning_rate": 4.082913006202637e-05, + "loss": 0.6261, + "step": 11420 + }, + { + "epoch": 0.3334306143294907, + "grad_norm": 1.1342561582274007, + "learning_rate": 4.082038111131339e-05, + "loss": 0.594, + "step": 11425 + }, + { + "epoch": 0.3335765358237268, + "grad_norm": 1.0843847008166378, + "learning_rate": 4.081162905870089e-05, + "loss": 0.594, + "step": 11430 + }, + { + "epoch": 0.33372245731796296, + "grad_norm": 0.8950674521866501, + "learning_rate": 4.080287390622693e-05, + "loss": 0.585, + "step": 11435 + }, + { + "epoch": 0.33386837881219905, + "grad_norm": 1.0938790435332206, + "learning_rate": 4.079411565593033e-05, + "loss": 0.6792, + "step": 11440 + }, + { + "epoch": 0.33401430030643514, + "grad_norm": 1.1204408051939787, + "learning_rate": 4.0785354309850614e-05, + "loss": 0.6735, + "step": 11445 + }, + { + "epoch": 0.33416022180067123, + "grad_norm": 1.117585344081694, + "learning_rate": 4.0776589870028017e-05, + "loss": 0.6867, + "step": 11450 + }, + { + "epoch": 0.3343061432949073, + "grad_norm": 1.1241996933499665, + "learning_rate": 4.076782233850349e-05, + "loss": 0.6537, + "step": 11455 + }, + { + "epoch": 0.33445206478914347, + "grad_norm": 1.2439817880282968, + "learning_rate": 4.0759051717318755e-05, + "loss": 0.5821, + "step": 11460 + }, + { + "epoch": 0.33459798628337956, + "grad_norm": 1.089365830702333, + "learning_rate": 4.075027800851617e-05, + "loss": 0.6324, + "step": 11465 + }, + { + "epoch": 0.33474390777761565, + "grad_norm": 1.0072398694470346, + "learning_rate": 4.0741501214138886e-05, + "loss": 0.6171, + "step": 11470 + }, + { + "epoch": 0.33488982927185174, + "grad_norm": 1.0880742122794438, + "learning_rate": 4.073272133623073e-05, + "loss": 0.6128, + "step": 11475 + }, + { + "epoch": 0.3350357507660878, + "grad_norm": 1.144030467809617, + "learning_rate": 4.072393837683628e-05, + "loss": 0.6133, + "step": 11480 + }, + { + "epoch": 0.33518167226032397, + "grad_norm": 1.1489113111514178, + "learning_rate": 4.071515233800079e-05, + "loss": 0.6896, + "step": 11485 + }, + { + "epoch": 0.33532759375456006, + "grad_norm": 1.0171107054003874, + "learning_rate": 4.0706363221770254e-05, + "loss": 0.6311, + "step": 11490 + }, + { + "epoch": 0.33547351524879615, + "grad_norm": 0.9438188284955568, + "learning_rate": 4.06975710301914e-05, + "loss": 0.5863, + "step": 11495 + }, + { + "epoch": 0.33561943674303224, + "grad_norm": 0.9609928362104976, + "learning_rate": 4.0688775765311636e-05, + "loss": 0.5922, + "step": 11500 + }, + { + "epoch": 0.33576535823726833, + "grad_norm": 0.8851583716387176, + "learning_rate": 4.067997742917911e-05, + "loss": 0.6575, + "step": 11505 + }, + { + "epoch": 0.3359112797315045, + "grad_norm": 1.2135837909130345, + "learning_rate": 4.067117602384269e-05, + "loss": 0.6412, + "step": 11510 + }, + { + "epoch": 0.33605720122574056, + "grad_norm": 0.9932059901168055, + "learning_rate": 4.0662371551351926e-05, + "loss": 0.5701, + "step": 11515 + }, + { + "epoch": 0.33620312271997665, + "grad_norm": 1.2438525231146114, + "learning_rate": 4.0653564013757124e-05, + "loss": 0.6326, + "step": 11520 + }, + { + "epoch": 0.33634904421421274, + "grad_norm": 0.9721708801783453, + "learning_rate": 4.064475341310926e-05, + "loss": 0.6224, + "step": 11525 + }, + { + "epoch": 0.33649496570844883, + "grad_norm": 0.9635945941202059, + "learning_rate": 4.063593975146008e-05, + "loss": 0.6199, + "step": 11530 + }, + { + "epoch": 0.336640887202685, + "grad_norm": 0.9477525384098505, + "learning_rate": 4.062712303086198e-05, + "loss": 0.6349, + "step": 11535 + }, + { + "epoch": 0.33678680869692107, + "grad_norm": 0.9500872000313519, + "learning_rate": 4.061830325336811e-05, + "loss": 0.6004, + "step": 11540 + }, + { + "epoch": 0.33693273019115716, + "grad_norm": 1.079264998691053, + "learning_rate": 4.0609480421032326e-05, + "loss": 0.6225, + "step": 11545 + }, + { + "epoch": 0.33707865168539325, + "grad_norm": 1.1255850604057762, + "learning_rate": 4.0600654535909174e-05, + "loss": 0.6407, + "step": 11550 + }, + { + "epoch": 0.33722457317962934, + "grad_norm": 1.0196165755708377, + "learning_rate": 4.0591825600053944e-05, + "loss": 0.5924, + "step": 11555 + }, + { + "epoch": 0.3373704946738655, + "grad_norm": 0.978065943675194, + "learning_rate": 4.05829936155226e-05, + "loss": 0.6415, + "step": 11560 + }, + { + "epoch": 0.3375164161681016, + "grad_norm": 1.0630273684517053, + "learning_rate": 4.057415858437186e-05, + "loss": 0.6218, + "step": 11565 + }, + { + "epoch": 0.33766233766233766, + "grad_norm": 0.9754306458450281, + "learning_rate": 4.056532050865909e-05, + "loss": 0.6261, + "step": 11570 + }, + { + "epoch": 0.33780825915657375, + "grad_norm": 1.0120772896129, + "learning_rate": 4.0556479390442425e-05, + "loss": 0.5415, + "step": 11575 + }, + { + "epoch": 0.33795418065080984, + "grad_norm": 1.030702413983348, + "learning_rate": 4.0547635231780686e-05, + "loss": 0.6605, + "step": 11580 + }, + { + "epoch": 0.338100102145046, + "grad_norm": 1.0705045927934609, + "learning_rate": 4.05387880347334e-05, + "loss": 0.6228, + "step": 11585 + }, + { + "epoch": 0.3382460236392821, + "grad_norm": 0.8869581727263077, + "learning_rate": 4.0529937801360796e-05, + "loss": 0.5799, + "step": 11590 + }, + { + "epoch": 0.33839194513351817, + "grad_norm": 0.9585061856170709, + "learning_rate": 4.0521084533723805e-05, + "loss": 0.6266, + "step": 11595 + }, + { + "epoch": 0.33853786662775426, + "grad_norm": 13.614486155253502, + "learning_rate": 4.051222823388409e-05, + "loss": 0.6292, + "step": 11600 + }, + { + "epoch": 0.33868378812199035, + "grad_norm": 1.0186451921533686, + "learning_rate": 4.0503368903904015e-05, + "loss": 0.6519, + "step": 11605 + }, + { + "epoch": 0.3388297096162265, + "grad_norm": 0.9520120308383339, + "learning_rate": 4.0494506545846625e-05, + "loss": 0.5591, + "step": 11610 + }, + { + "epoch": 0.3389756311104626, + "grad_norm": 1.094136677831175, + "learning_rate": 4.0485641161775686e-05, + "loss": 0.5687, + "step": 11615 + }, + { + "epoch": 0.3391215526046987, + "grad_norm": 0.9167299249673083, + "learning_rate": 4.047677275375567e-05, + "loss": 0.6232, + "step": 11620 + }, + { + "epoch": 0.33926747409893476, + "grad_norm": 1.0155349389072916, + "learning_rate": 4.046790132385177e-05, + "loss": 0.6473, + "step": 11625 + }, + { + "epoch": 0.33941339559317085, + "grad_norm": 1.204353672645426, + "learning_rate": 4.045902687412983e-05, + "loss": 0.6325, + "step": 11630 + }, + { + "epoch": 0.339559317087407, + "grad_norm": 1.0352811936427893, + "learning_rate": 4.0450149406656454e-05, + "loss": 0.5752, + "step": 11635 + }, + { + "epoch": 0.3397052385816431, + "grad_norm": 1.2036224500518966, + "learning_rate": 4.044126892349892e-05, + "loss": 0.5869, + "step": 11640 + }, + { + "epoch": 0.3398511600758792, + "grad_norm": 0.9049889018835772, + "learning_rate": 4.043238542672522e-05, + "loss": 0.7039, + "step": 11645 + }, + { + "epoch": 0.33999708157011527, + "grad_norm": 1.0746934526866798, + "learning_rate": 4.042349891840403e-05, + "loss": 0.6011, + "step": 11650 + }, + { + "epoch": 0.34014300306435136, + "grad_norm": 1.2791431374070314, + "learning_rate": 4.041460940060475e-05, + "loss": 0.575, + "step": 11655 + }, + { + "epoch": 0.3402889245585875, + "grad_norm": 0.956046019274133, + "learning_rate": 4.0405716875397474e-05, + "loss": 0.5684, + "step": 11660 + }, + { + "epoch": 0.3404348460528236, + "grad_norm": 0.8957865227576939, + "learning_rate": 4.039682134485299e-05, + "loss": 0.6254, + "step": 11665 + }, + { + "epoch": 0.3405807675470597, + "grad_norm": 1.3255957146955113, + "learning_rate": 4.038792281104277e-05, + "loss": 0.6362, + "step": 11670 + }, + { + "epoch": 0.34072668904129577, + "grad_norm": 0.9825886998592437, + "learning_rate": 4.037902127603902e-05, + "loss": 0.6321, + "step": 11675 + }, + { + "epoch": 0.34087261053553186, + "grad_norm": 1.022042790376303, + "learning_rate": 4.0370116741914626e-05, + "loss": 0.6232, + "step": 11680 + }, + { + "epoch": 0.341018532029768, + "grad_norm": 1.0367874143276963, + "learning_rate": 4.036120921074319e-05, + "loss": 0.5685, + "step": 11685 + }, + { + "epoch": 0.3411644535240041, + "grad_norm": 0.9625893045756854, + "learning_rate": 4.035229868459897e-05, + "loss": 0.5705, + "step": 11690 + }, + { + "epoch": 0.3413103750182402, + "grad_norm": 1.1416292137182558, + "learning_rate": 4.034338516555696e-05, + "loss": 0.631, + "step": 11695 + }, + { + "epoch": 0.3414562965124763, + "grad_norm": 0.9392387893446615, + "learning_rate": 4.033446865569285e-05, + "loss": 0.5502, + "step": 11700 + }, + { + "epoch": 0.34160221800671237, + "grad_norm": 1.1494205510912594, + "learning_rate": 4.0325549157083e-05, + "loss": 0.6027, + "step": 11705 + }, + { + "epoch": 0.3417481395009485, + "grad_norm": 0.9954427588894809, + "learning_rate": 4.031662667180449e-05, + "loss": 0.6416, + "step": 11710 + }, + { + "epoch": 0.3418940609951846, + "grad_norm": 0.9478626331065628, + "learning_rate": 4.030770120193509e-05, + "loss": 0.5836, + "step": 11715 + }, + { + "epoch": 0.3420399824894207, + "grad_norm": 1.0318209514457515, + "learning_rate": 4.029877274955325e-05, + "loss": 0.6742, + "step": 11720 + }, + { + "epoch": 0.3421859039836568, + "grad_norm": 0.9361076460990716, + "learning_rate": 4.028984131673814e-05, + "loss": 0.5534, + "step": 11725 + }, + { + "epoch": 0.34233182547789287, + "grad_norm": 1.182780341461957, + "learning_rate": 4.02809069055696e-05, + "loss": 0.6184, + "step": 11730 + }, + { + "epoch": 0.342477746972129, + "grad_norm": 1.1217329252882884, + "learning_rate": 4.027196951812819e-05, + "loss": 0.6618, + "step": 11735 + }, + { + "epoch": 0.3426236684663651, + "grad_norm": 1.041416129745986, + "learning_rate": 4.026302915649513e-05, + "loss": 0.6615, + "step": 11740 + }, + { + "epoch": 0.3427695899606012, + "grad_norm": 0.9929462762558451, + "learning_rate": 4.0254085822752365e-05, + "loss": 0.5981, + "step": 11745 + }, + { + "epoch": 0.3429155114548373, + "grad_norm": 1.1719165717497415, + "learning_rate": 4.02451395189825e-05, + "loss": 0.6287, + "step": 11750 + }, + { + "epoch": 0.3430614329490734, + "grad_norm": 1.0985426542323578, + "learning_rate": 4.023619024726887e-05, + "loss": 0.6175, + "step": 11755 + }, + { + "epoch": 0.3432073544433095, + "grad_norm": 1.0344749049824091, + "learning_rate": 4.022723800969547e-05, + "loss": 0.6391, + "step": 11760 + }, + { + "epoch": 0.3433532759375456, + "grad_norm": 1.2512350611789893, + "learning_rate": 4.0218282808346986e-05, + "loss": 0.6675, + "step": 11765 + }, + { + "epoch": 0.3434991974317817, + "grad_norm": 1.016037781223652, + "learning_rate": 4.020932464530882e-05, + "loss": 0.5805, + "step": 11770 + }, + { + "epoch": 0.3436451189260178, + "grad_norm": 1.142195942102537, + "learning_rate": 4.020036352266704e-05, + "loss": 0.644, + "step": 11775 + }, + { + "epoch": 0.3437910404202539, + "grad_norm": 1.412425677520266, + "learning_rate": 4.019139944250841e-05, + "loss": 0.6912, + "step": 11780 + }, + { + "epoch": 0.34393696191449, + "grad_norm": 1.2608008988458768, + "learning_rate": 4.018243240692038e-05, + "loss": 0.6782, + "step": 11785 + }, + { + "epoch": 0.3440828834087261, + "grad_norm": 1.066099336377157, + "learning_rate": 4.017346241799111e-05, + "loss": 0.6353, + "step": 11790 + }, + { + "epoch": 0.3442288049029622, + "grad_norm": 0.9886717340988644, + "learning_rate": 4.01644894778094e-05, + "loss": 0.622, + "step": 11795 + }, + { + "epoch": 0.3443747263971983, + "grad_norm": 1.0569965976704123, + "learning_rate": 4.015551358846479e-05, + "loss": 0.6027, + "step": 11800 + }, + { + "epoch": 0.3445206478914344, + "grad_norm": 1.071537093921575, + "learning_rate": 4.0146534752047485e-05, + "loss": 0.617, + "step": 11805 + }, + { + "epoch": 0.34466656938567053, + "grad_norm": 1.3423382903000434, + "learning_rate": 4.013755297064835e-05, + "loss": 0.624, + "step": 11810 + }, + { + "epoch": 0.3448124908799066, + "grad_norm": 1.0202858938297927, + "learning_rate": 4.012856824635899e-05, + "loss": 0.6005, + "step": 11815 + }, + { + "epoch": 0.3449584123741427, + "grad_norm": 1.0306726778606794, + "learning_rate": 4.011958058127165e-05, + "loss": 0.6073, + "step": 11820 + }, + { + "epoch": 0.3451043338683788, + "grad_norm": 1.1076206505861514, + "learning_rate": 4.0110589977479265e-05, + "loss": 0.6005, + "step": 11825 + }, + { + "epoch": 0.3452502553626149, + "grad_norm": 0.992354674622993, + "learning_rate": 4.01015964370755e-05, + "loss": 0.6576, + "step": 11830 + }, + { + "epoch": 0.34539617685685103, + "grad_norm": 0.9359407467285924, + "learning_rate": 4.0092599962154625e-05, + "loss": 0.5697, + "step": 11835 + }, + { + "epoch": 0.3455420983510871, + "grad_norm": 1.0197674730096449, + "learning_rate": 4.008360055481167e-05, + "loss": 0.6101, + "step": 11840 + }, + { + "epoch": 0.3456880198453232, + "grad_norm": 1.202661499911798, + "learning_rate": 4.007459821714231e-05, + "loss": 0.6433, + "step": 11845 + }, + { + "epoch": 0.3458339413395593, + "grad_norm": 0.9701731955478315, + "learning_rate": 4.006559295124289e-05, + "loss": 0.6774, + "step": 11850 + }, + { + "epoch": 0.3459798628337954, + "grad_norm": 1.0998235098898552, + "learning_rate": 4.005658475921047e-05, + "loss": 0.601, + "step": 11855 + }, + { + "epoch": 0.34612578432803154, + "grad_norm": 0.8687070239156239, + "learning_rate": 4.004757364314277e-05, + "loss": 0.5828, + "step": 11860 + }, + { + "epoch": 0.34627170582226763, + "grad_norm": 1.0973373423020554, + "learning_rate": 4.00385596051382e-05, + "loss": 0.6614, + "step": 11865 + }, + { + "epoch": 0.3464176273165037, + "grad_norm": 1.0186541566260618, + "learning_rate": 4.002954264729585e-05, + "loss": 0.5837, + "step": 11870 + }, + { + "epoch": 0.3465635488107398, + "grad_norm": 1.047456859805427, + "learning_rate": 4.002052277171548e-05, + "loss": 0.6187, + "step": 11875 + }, + { + "epoch": 0.3467094703049759, + "grad_norm": 1.0559355868145486, + "learning_rate": 4.001149998049754e-05, + "loss": 0.611, + "step": 11880 + }, + { + "epoch": 0.34685539179921204, + "grad_norm": 1.012127312246685, + "learning_rate": 4.000247427574316e-05, + "loss": 0.5921, + "step": 11885 + }, + { + "epoch": 0.34700131329344813, + "grad_norm": 1.212715477570161, + "learning_rate": 3.999344565955412e-05, + "loss": 0.6416, + "step": 11890 + }, + { + "epoch": 0.3471472347876842, + "grad_norm": 1.219259746588776, + "learning_rate": 3.9984414134032936e-05, + "loss": 0.617, + "step": 11895 + }, + { + "epoch": 0.3472931562819203, + "grad_norm": 1.0408859393241783, + "learning_rate": 3.9975379701282754e-05, + "loss": 0.5939, + "step": 11900 + }, + { + "epoch": 0.3474390777761564, + "grad_norm": 1.1164331437651034, + "learning_rate": 3.99663423634074e-05, + "loss": 0.6066, + "step": 11905 + }, + { + "epoch": 0.34758499927039255, + "grad_norm": 1.0687503314288125, + "learning_rate": 3.99573021225114e-05, + "loss": 0.584, + "step": 11910 + }, + { + "epoch": 0.34773092076462864, + "grad_norm": 1.0727786430714359, + "learning_rate": 3.9948258980699934e-05, + "loss": 0.6311, + "step": 11915 + }, + { + "epoch": 0.3478768422588647, + "grad_norm": 1.2509984583005835, + "learning_rate": 3.993921294007888e-05, + "loss": 0.6784, + "step": 11920 + }, + { + "epoch": 0.3480227637531008, + "grad_norm": 1.1234430072469523, + "learning_rate": 3.9930164002754755e-05, + "loss": 0.6479, + "step": 11925 + }, + { + "epoch": 0.3481686852473369, + "grad_norm": 0.9946724937840159, + "learning_rate": 3.992111217083479e-05, + "loss": 0.6574, + "step": 11930 + }, + { + "epoch": 0.34831460674157305, + "grad_norm": 1.0423653695316986, + "learning_rate": 3.991205744642687e-05, + "loss": 0.7139, + "step": 11935 + }, + { + "epoch": 0.34846052823580914, + "grad_norm": 1.044499716132989, + "learning_rate": 3.990299983163955e-05, + "loss": 0.6085, + "step": 11940 + }, + { + "epoch": 0.34860644973004523, + "grad_norm": 1.0596993552345453, + "learning_rate": 3.989393932858208e-05, + "loss": 0.5797, + "step": 11945 + }, + { + "epoch": 0.3487523712242813, + "grad_norm": 0.9288701138259828, + "learning_rate": 3.9884875939364356e-05, + "loss": 0.5617, + "step": 11950 + }, + { + "epoch": 0.3488982927185174, + "grad_norm": 0.9501583776531269, + "learning_rate": 3.987580966609696e-05, + "loss": 0.5739, + "step": 11955 + }, + { + "epoch": 0.34904421421275356, + "grad_norm": 0.9489743282834809, + "learning_rate": 3.9866740510891146e-05, + "loss": 0.6315, + "step": 11960 + }, + { + "epoch": 0.34919013570698965, + "grad_norm": 1.11705639204195, + "learning_rate": 3.985766847585883e-05, + "loss": 0.6338, + "step": 11965 + }, + { + "epoch": 0.34933605720122574, + "grad_norm": 0.9908553784863717, + "learning_rate": 3.98485935631126e-05, + "loss": 0.5678, + "step": 11970 + }, + { + "epoch": 0.3494819786954618, + "grad_norm": 1.2087208819872626, + "learning_rate": 3.9839515774765734e-05, + "loss": 0.7002, + "step": 11975 + }, + { + "epoch": 0.3496279001896979, + "grad_norm": 1.0111445846843672, + "learning_rate": 3.983043511293216e-05, + "loss": 0.6007, + "step": 11980 + }, + { + "epoch": 0.34977382168393406, + "grad_norm": 0.8873050546606204, + "learning_rate": 3.982135157972647e-05, + "loss": 0.634, + "step": 11985 + }, + { + "epoch": 0.34991974317817015, + "grad_norm": 1.088884100690103, + "learning_rate": 3.981226517726394e-05, + "loss": 0.6144, + "step": 11990 + }, + { + "epoch": 0.35006566467240624, + "grad_norm": 1.0440919603189667, + "learning_rate": 3.980317590766051e-05, + "loss": 0.6564, + "step": 11995 + }, + { + "epoch": 0.35021158616664233, + "grad_norm": 0.9068609447339323, + "learning_rate": 3.979408377303279e-05, + "loss": 0.5537, + "step": 12000 + }, + { + "epoch": 0.3503575076608784, + "grad_norm": 1.1693639016047885, + "learning_rate": 3.978498877549805e-05, + "loss": 0.5993, + "step": 12005 + }, + { + "epoch": 0.35050342915511457, + "grad_norm": 1.0621269135223315, + "learning_rate": 3.977589091717423e-05, + "loss": 0.5957, + "step": 12010 + }, + { + "epoch": 0.35064935064935066, + "grad_norm": 0.9255826927281953, + "learning_rate": 3.9766790200179926e-05, + "loss": 0.5885, + "step": 12015 + }, + { + "epoch": 0.35079527214358674, + "grad_norm": 1.172959074159462, + "learning_rate": 3.975768662663442e-05, + "loss": 0.607, + "step": 12020 + }, + { + "epoch": 0.35094119363782283, + "grad_norm": 1.1405879773489567, + "learning_rate": 3.974858019865766e-05, + "loss": 0.6829, + "step": 12025 + }, + { + "epoch": 0.351087115132059, + "grad_norm": 0.9571356477421576, + "learning_rate": 3.973947091837023e-05, + "loss": 0.6318, + "step": 12030 + }, + { + "epoch": 0.35123303662629507, + "grad_norm": 1.0538091204882005, + "learning_rate": 3.97303587878934e-05, + "loss": 0.6895, + "step": 12035 + }, + { + "epoch": 0.35137895812053116, + "grad_norm": 0.9383875321038203, + "learning_rate": 3.972124380934911e-05, + "loss": 0.5984, + "step": 12040 + }, + { + "epoch": 0.35152487961476725, + "grad_norm": 1.0600920043661486, + "learning_rate": 3.971212598485993e-05, + "loss": 0.6829, + "step": 12045 + }, + { + "epoch": 0.35167080110900334, + "grad_norm": 4.835728397734678, + "learning_rate": 3.9703005316549144e-05, + "loss": 0.6366, + "step": 12050 + }, + { + "epoch": 0.3518167226032395, + "grad_norm": 1.0425580132273389, + "learning_rate": 3.9693881806540663e-05, + "loss": 0.6193, + "step": 12055 + }, + { + "epoch": 0.3519626440974756, + "grad_norm": 1.1221501876056423, + "learning_rate": 3.968475545695904e-05, + "loss": 0.663, + "step": 12060 + }, + { + "epoch": 0.35210856559171166, + "grad_norm": 1.3164184007608646, + "learning_rate": 3.967562626992955e-05, + "loss": 0.5855, + "step": 12065 + }, + { + "epoch": 0.35225448708594775, + "grad_norm": 1.0839853136591844, + "learning_rate": 3.966649424757808e-05, + "loss": 0.6053, + "step": 12070 + }, + { + "epoch": 0.35240040858018384, + "grad_norm": 1.1444259043679468, + "learning_rate": 3.9657359392031194e-05, + "loss": 0.6678, + "step": 12075 + }, + { + "epoch": 0.35254633007442, + "grad_norm": 1.0408842539417962, + "learning_rate": 3.964822170541611e-05, + "loss": 0.5983, + "step": 12080 + }, + { + "epoch": 0.3526922515686561, + "grad_norm": 1.2503432594686106, + "learning_rate": 3.963908118986071e-05, + "loss": 0.6024, + "step": 12085 + }, + { + "epoch": 0.35283817306289217, + "grad_norm": 0.8876723098489058, + "learning_rate": 3.962993784749354e-05, + "loss": 0.6067, + "step": 12090 + }, + { + "epoch": 0.35298409455712826, + "grad_norm": 0.8594298888912166, + "learning_rate": 3.9620791680443786e-05, + "loss": 0.6063, + "step": 12095 + }, + { + "epoch": 0.35313001605136435, + "grad_norm": 1.1372547791451348, + "learning_rate": 3.961164269084132e-05, + "loss": 0.6218, + "step": 12100 + }, + { + "epoch": 0.3532759375456005, + "grad_norm": 1.1459439724212972, + "learning_rate": 3.9602490880816645e-05, + "loss": 0.691, + "step": 12105 + }, + { + "epoch": 0.3534218590398366, + "grad_norm": 1.1228696081698626, + "learning_rate": 3.959333625250093e-05, + "loss": 0.6424, + "step": 12110 + }, + { + "epoch": 0.3535677805340727, + "grad_norm": 1.1043214863169466, + "learning_rate": 3.958417880802601e-05, + "loss": 0.6485, + "step": 12115 + }, + { + "epoch": 0.35371370202830876, + "grad_norm": 1.1234498375565625, + "learning_rate": 3.957501854952434e-05, + "loss": 0.6113, + "step": 12120 + }, + { + "epoch": 0.35385962352254485, + "grad_norm": 1.219363438942327, + "learning_rate": 3.9565855479129096e-05, + "loss": 0.5836, + "step": 12125 + }, + { + "epoch": 0.354005545016781, + "grad_norm": 1.005811850355647, + "learning_rate": 3.955668959897405e-05, + "loss": 0.6167, + "step": 12130 + }, + { + "epoch": 0.3541514665110171, + "grad_norm": 0.932909719896271, + "learning_rate": 3.9547520911193655e-05, + "loss": 0.6075, + "step": 12135 + }, + { + "epoch": 0.3542973880052532, + "grad_norm": 1.1001747951757632, + "learning_rate": 3.9538349417923e-05, + "loss": 0.6051, + "step": 12140 + }, + { + "epoch": 0.35444330949948927, + "grad_norm": 0.948816669053959, + "learning_rate": 3.952917512129785e-05, + "loss": 0.6289, + "step": 12145 + }, + { + "epoch": 0.35458923099372536, + "grad_norm": 0.9499283574304829, + "learning_rate": 3.9519998023454595e-05, + "loss": 0.6412, + "step": 12150 + }, + { + "epoch": 0.3547351524879615, + "grad_norm": 1.251550171986757, + "learning_rate": 3.951081812653032e-05, + "loss": 0.6189, + "step": 12155 + }, + { + "epoch": 0.3548810739821976, + "grad_norm": 1.1282563332718842, + "learning_rate": 3.950163543266271e-05, + "loss": 0.6395, + "step": 12160 + }, + { + "epoch": 0.3550269954764337, + "grad_norm": 1.0819370419567056, + "learning_rate": 3.949244994399014e-05, + "loss": 0.635, + "step": 12165 + }, + { + "epoch": 0.35517291697066977, + "grad_norm": 1.0778565547247143, + "learning_rate": 3.948326166265162e-05, + "loss": 0.656, + "step": 12170 + }, + { + "epoch": 0.35531883846490586, + "grad_norm": 1.0059555884553493, + "learning_rate": 3.947407059078681e-05, + "loss": 0.6421, + "step": 12175 + }, + { + "epoch": 0.355464759959142, + "grad_norm": 0.955652087152547, + "learning_rate": 3.9464876730536023e-05, + "loss": 0.6316, + "step": 12180 + }, + { + "epoch": 0.3556106814533781, + "grad_norm": 1.0743625569302586, + "learning_rate": 3.945568008404023e-05, + "loss": 0.596, + "step": 12185 + }, + { + "epoch": 0.3557566029476142, + "grad_norm": 1.356186403434128, + "learning_rate": 3.9446480653441034e-05, + "loss": 0.6279, + "step": 12190 + }, + { + "epoch": 0.3559025244418503, + "grad_norm": 0.9841138898777383, + "learning_rate": 3.94372784408807e-05, + "loss": 0.5922, + "step": 12195 + }, + { + "epoch": 0.35604844593608637, + "grad_norm": 0.9898787268814568, + "learning_rate": 3.9428073448502125e-05, + "loss": 0.5846, + "step": 12200 + }, + { + "epoch": 0.3561943674303225, + "grad_norm": 1.2283082897526931, + "learning_rate": 3.941886567844887e-05, + "loss": 0.6322, + "step": 12205 + }, + { + "epoch": 0.3563402889245586, + "grad_norm": 0.8672762403587563, + "learning_rate": 3.9409655132865134e-05, + "loss": 0.6118, + "step": 12210 + }, + { + "epoch": 0.3564862104187947, + "grad_norm": 1.0170053194543032, + "learning_rate": 3.940044181389576e-05, + "loss": 0.6611, + "step": 12215 + }, + { + "epoch": 0.3566321319130308, + "grad_norm": 1.3992662603842216, + "learning_rate": 3.939122572368626e-05, + "loss": 0.6259, + "step": 12220 + }, + { + "epoch": 0.35677805340726687, + "grad_norm": 0.931400118462024, + "learning_rate": 3.938200686438276e-05, + "loss": 0.5905, + "step": 12225 + }, + { + "epoch": 0.356923974901503, + "grad_norm": 1.0077482460544411, + "learning_rate": 3.937278523813204e-05, + "loss": 0.6177, + "step": 12230 + }, + { + "epoch": 0.3570698963957391, + "grad_norm": 0.9732156132169247, + "learning_rate": 3.936356084708153e-05, + "loss": 0.6197, + "step": 12235 + }, + { + "epoch": 0.3572158178899752, + "grad_norm": 1.1821515070926205, + "learning_rate": 3.93543336933793e-05, + "loss": 0.5949, + "step": 12240 + }, + { + "epoch": 0.3573617393842113, + "grad_norm": 0.8798611610294117, + "learning_rate": 3.934510377917407e-05, + "loss": 0.6146, + "step": 12245 + }, + { + "epoch": 0.3575076608784474, + "grad_norm": 1.1159368694368452, + "learning_rate": 3.9335871106615195e-05, + "loss": 0.6392, + "step": 12250 + }, + { + "epoch": 0.3576535823726835, + "grad_norm": 1.1735250087724647, + "learning_rate": 3.9326635677852685e-05, + "loss": 0.5661, + "step": 12255 + }, + { + "epoch": 0.3577995038669196, + "grad_norm": 1.0239488702992352, + "learning_rate": 3.931739749503717e-05, + "loss": 0.6409, + "step": 12260 + }, + { + "epoch": 0.3579454253611557, + "grad_norm": 1.0047671993049745, + "learning_rate": 3.930815656031993e-05, + "loss": 0.6143, + "step": 12265 + }, + { + "epoch": 0.3580913468553918, + "grad_norm": 1.0219145610569702, + "learning_rate": 3.92989128758529e-05, + "loss": 0.5726, + "step": 12270 + }, + { + "epoch": 0.3582372683496279, + "grad_norm": 1.026163154691809, + "learning_rate": 3.928966644378864e-05, + "loss": 0.6568, + "step": 12275 + }, + { + "epoch": 0.358383189843864, + "grad_norm": 1.0479287348082684, + "learning_rate": 3.9280417266280356e-05, + "loss": 0.5927, + "step": 12280 + }, + { + "epoch": 0.3585291113381001, + "grad_norm": 0.9236183679134596, + "learning_rate": 3.927116534548189e-05, + "loss": 0.6215, + "step": 12285 + }, + { + "epoch": 0.3586750328323362, + "grad_norm": 0.9600579308213374, + "learning_rate": 3.926191068354773e-05, + "loss": 0.5791, + "step": 12290 + }, + { + "epoch": 0.3588209543265723, + "grad_norm": 0.9653476417998901, + "learning_rate": 3.925265328263299e-05, + "loss": 0.6087, + "step": 12295 + }, + { + "epoch": 0.3589668758208084, + "grad_norm": 0.9437025841774455, + "learning_rate": 3.9243393144893434e-05, + "loss": 0.6025, + "step": 12300 + }, + { + "epoch": 0.35911279731504453, + "grad_norm": 0.941321908566054, + "learning_rate": 3.923413027248545e-05, + "loss": 0.5963, + "step": 12305 + }, + { + "epoch": 0.3592587188092806, + "grad_norm": 1.076612889604512, + "learning_rate": 3.922486466756607e-05, + "loss": 0.6977, + "step": 12310 + }, + { + "epoch": 0.3594046403035167, + "grad_norm": 1.0821463482985947, + "learning_rate": 3.9215596332292984e-05, + "loss": 0.599, + "step": 12315 + }, + { + "epoch": 0.3595505617977528, + "grad_norm": 1.0601812976146276, + "learning_rate": 3.920632526882447e-05, + "loss": 0.6622, + "step": 12320 + }, + { + "epoch": 0.3596964832919889, + "grad_norm": 1.0766512517970406, + "learning_rate": 3.919705147931949e-05, + "loss": 0.5967, + "step": 12325 + }, + { + "epoch": 0.35984240478622503, + "grad_norm": 1.0052417408289198, + "learning_rate": 3.91877749659376e-05, + "loss": 0.6004, + "step": 12330 + }, + { + "epoch": 0.3599883262804611, + "grad_norm": 1.2221946812596904, + "learning_rate": 3.9178495730839025e-05, + "loss": 0.6456, + "step": 12335 + }, + { + "epoch": 0.3601342477746972, + "grad_norm": 1.079130279648457, + "learning_rate": 3.9169213776184604e-05, + "loss": 0.6441, + "step": 12340 + }, + { + "epoch": 0.3602801692689333, + "grad_norm": 0.9419757338439992, + "learning_rate": 3.91599291041358e-05, + "loss": 0.6623, + "step": 12345 + }, + { + "epoch": 0.3604260907631694, + "grad_norm": 1.0769944355614132, + "learning_rate": 3.915064171685474e-05, + "loss": 0.609, + "step": 12350 + }, + { + "epoch": 0.36057201225740554, + "grad_norm": 1.1742438348013977, + "learning_rate": 3.9141351616504166e-05, + "loss": 0.6361, + "step": 12355 + }, + { + "epoch": 0.36071793375164163, + "grad_norm": 1.1344928126963667, + "learning_rate": 3.913205880524744e-05, + "loss": 0.6066, + "step": 12360 + }, + { + "epoch": 0.3608638552458777, + "grad_norm": 0.8598928058434672, + "learning_rate": 3.912276328524857e-05, + "loss": 0.6158, + "step": 12365 + }, + { + "epoch": 0.3610097767401138, + "grad_norm": 0.878949872083786, + "learning_rate": 3.911346505867219e-05, + "loss": 0.5126, + "step": 12370 + }, + { + "epoch": 0.3611556982343499, + "grad_norm": 1.0196358038958668, + "learning_rate": 3.910416412768358e-05, + "loss": 0.5805, + "step": 12375 + }, + { + "epoch": 0.36130161972858604, + "grad_norm": 0.9966378808576365, + "learning_rate": 3.909486049444861e-05, + "loss": 0.6235, + "step": 12380 + }, + { + "epoch": 0.36144754122282213, + "grad_norm": 0.9980804698093425, + "learning_rate": 3.9085554161133834e-05, + "loss": 0.641, + "step": 12385 + }, + { + "epoch": 0.3615934627170582, + "grad_norm": 1.0416197783131367, + "learning_rate": 3.9076245129906374e-05, + "loss": 0.6057, + "step": 12390 + }, + { + "epoch": 0.3617393842112943, + "grad_norm": 1.0936552446637446, + "learning_rate": 3.9066933402934034e-05, + "loss": 0.6714, + "step": 12395 + }, + { + "epoch": 0.3618853057055304, + "grad_norm": 1.0771502397942445, + "learning_rate": 3.905761898238522e-05, + "loss": 0.6039, + "step": 12400 + }, + { + "epoch": 0.36203122719976655, + "grad_norm": 1.1268914571276445, + "learning_rate": 3.904830187042897e-05, + "loss": 0.5972, + "step": 12405 + }, + { + "epoch": 0.36217714869400264, + "grad_norm": 1.1882576064225108, + "learning_rate": 3.9038982069234936e-05, + "loss": 0.607, + "step": 12410 + }, + { + "epoch": 0.3623230701882387, + "grad_norm": 1.0251124486738417, + "learning_rate": 3.902965958097342e-05, + "loss": 0.5594, + "step": 12415 + }, + { + "epoch": 0.3624689916824748, + "grad_norm": 1.389894960278433, + "learning_rate": 3.902033440781533e-05, + "loss": 0.6343, + "step": 12420 + }, + { + "epoch": 0.3626149131767109, + "grad_norm": 1.0507809040936216, + "learning_rate": 3.901100655193221e-05, + "loss": 0.631, + "step": 12425 + }, + { + "epoch": 0.36276083467094705, + "grad_norm": 1.0251105360431334, + "learning_rate": 3.9001676015496235e-05, + "loss": 0.6455, + "step": 12430 + }, + { + "epoch": 0.36290675616518314, + "grad_norm": 1.0333728094239572, + "learning_rate": 3.899234280068018e-05, + "loss": 0.662, + "step": 12435 + }, + { + "epoch": 0.36305267765941923, + "grad_norm": 1.0839391075800229, + "learning_rate": 3.898300690965745e-05, + "loss": 0.5593, + "step": 12440 + }, + { + "epoch": 0.3631985991536553, + "grad_norm": 1.2468282912684354, + "learning_rate": 3.8973668344602116e-05, + "loss": 0.6087, + "step": 12445 + }, + { + "epoch": 0.3633445206478914, + "grad_norm": 0.9949617451404282, + "learning_rate": 3.8964327107688804e-05, + "loss": 0.6435, + "step": 12450 + }, + { + "epoch": 0.36349044214212756, + "grad_norm": 1.1795341630793073, + "learning_rate": 3.8954983201092814e-05, + "loss": 0.6108, + "step": 12455 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.041792239753175, + "learning_rate": 3.894563662699004e-05, + "loss": 0.6231, + "step": 12460 + }, + { + "epoch": 0.36378228513059974, + "grad_norm": 1.1061810632519775, + "learning_rate": 3.893628738755701e-05, + "loss": 0.6529, + "step": 12465 + }, + { + "epoch": 0.3639282066248358, + "grad_norm": 0.8645596604878601, + "learning_rate": 3.892693548497086e-05, + "loss": 0.6119, + "step": 12470 + }, + { + "epoch": 0.3640741281190719, + "grad_norm": 1.1858786637033374, + "learning_rate": 3.891758092140938e-05, + "loss": 0.5881, + "step": 12475 + }, + { + "epoch": 0.36422004961330806, + "grad_norm": 1.163342889683199, + "learning_rate": 3.8908223699050936e-05, + "loss": 0.6064, + "step": 12480 + }, + { + "epoch": 0.36436597110754415, + "grad_norm": 0.9931086008485451, + "learning_rate": 3.8898863820074525e-05, + "loss": 0.6562, + "step": 12485 + }, + { + "epoch": 0.36451189260178024, + "grad_norm": 1.047634168827138, + "learning_rate": 3.888950128665978e-05, + "loss": 0.6066, + "step": 12490 + }, + { + "epoch": 0.36465781409601633, + "grad_norm": 1.0349906291528372, + "learning_rate": 3.8880136100986947e-05, + "loss": 0.6756, + "step": 12495 + }, + { + "epoch": 0.3648037355902524, + "grad_norm": 0.9990501090781592, + "learning_rate": 3.887076826523687e-05, + "loss": 0.6031, + "step": 12500 + }, + { + "epoch": 0.36494965708448857, + "grad_norm": 0.9856391914854163, + "learning_rate": 3.886139778159104e-05, + "loss": 0.5714, + "step": 12505 + }, + { + "epoch": 0.36509557857872466, + "grad_norm": 1.0282245342282552, + "learning_rate": 3.8852024652231537e-05, + "loss": 0.6121, + "step": 12510 + }, + { + "epoch": 0.36524150007296075, + "grad_norm": 1.0137111171321143, + "learning_rate": 3.884264887934108e-05, + "loss": 0.6234, + "step": 12515 + }, + { + "epoch": 0.36538742156719684, + "grad_norm": 1.1053439189415284, + "learning_rate": 3.883327046510298e-05, + "loss": 0.6796, + "step": 12520 + }, + { + "epoch": 0.3655333430614329, + "grad_norm": 1.090195817061205, + "learning_rate": 3.882388941170119e-05, + "loss": 0.659, + "step": 12525 + }, + { + "epoch": 0.36567926455566907, + "grad_norm": 0.961115465503495, + "learning_rate": 3.881450572132025e-05, + "loss": 0.6249, + "step": 12530 + }, + { + "epoch": 0.36582518604990516, + "grad_norm": 1.2178822857034812, + "learning_rate": 3.880511939614533e-05, + "loss": 0.661, + "step": 12535 + }, + { + "epoch": 0.36597110754414125, + "grad_norm": 1.3523029561268745, + "learning_rate": 3.8795730438362224e-05, + "loss": 0.5938, + "step": 12540 + }, + { + "epoch": 0.36611702903837734, + "grad_norm": 0.9179444846417907, + "learning_rate": 3.878633885015731e-05, + "loss": 0.5721, + "step": 12545 + }, + { + "epoch": 0.36626295053261343, + "grad_norm": 1.0304136184299528, + "learning_rate": 3.877694463371761e-05, + "loss": 0.6008, + "step": 12550 + }, + { + "epoch": 0.3664088720268496, + "grad_norm": 1.1308417072170858, + "learning_rate": 3.876754779123073e-05, + "loss": 0.6391, + "step": 12555 + }, + { + "epoch": 0.36655479352108566, + "grad_norm": 1.0366168139855234, + "learning_rate": 3.875814832488491e-05, + "loss": 0.5866, + "step": 12560 + }, + { + "epoch": 0.36670071501532175, + "grad_norm": 0.9091783498818459, + "learning_rate": 3.874874623686898e-05, + "loss": 0.6333, + "step": 12565 + }, + { + "epoch": 0.36684663650955784, + "grad_norm": 1.0585857295806147, + "learning_rate": 3.87393415293724e-05, + "loss": 0.6515, + "step": 12570 + }, + { + "epoch": 0.36699255800379393, + "grad_norm": 1.0582641329813207, + "learning_rate": 3.8729934204585226e-05, + "loss": 0.5822, + "step": 12575 + }, + { + "epoch": 0.3671384794980301, + "grad_norm": 1.045085837100344, + "learning_rate": 3.872052426469814e-05, + "loss": 0.643, + "step": 12580 + }, + { + "epoch": 0.36728440099226617, + "grad_norm": 1.019812038817716, + "learning_rate": 3.8711111711902406e-05, + "loss": 0.6066, + "step": 12585 + }, + { + "epoch": 0.36743032248650226, + "grad_norm": 1.011380568256649, + "learning_rate": 3.870169654838994e-05, + "loss": 0.5816, + "step": 12590 + }, + { + "epoch": 0.36757624398073835, + "grad_norm": 1.0616611536982334, + "learning_rate": 3.86922787763532e-05, + "loss": 0.6318, + "step": 12595 + }, + { + "epoch": 0.36772216547497444, + "grad_norm": 1.1184250893439995, + "learning_rate": 3.868285839798532e-05, + "loss": 0.6662, + "step": 12600 + }, + { + "epoch": 0.3678680869692106, + "grad_norm": 1.885339962299369, + "learning_rate": 3.8673435415480005e-05, + "loss": 0.5327, + "step": 12605 + }, + { + "epoch": 0.3680140084634467, + "grad_norm": 0.8908082129284852, + "learning_rate": 3.8664009831031566e-05, + "loss": 0.5671, + "step": 12610 + }, + { + "epoch": 0.36815992995768276, + "grad_norm": 1.045861962298631, + "learning_rate": 3.865458164683493e-05, + "loss": 0.6534, + "step": 12615 + }, + { + "epoch": 0.36830585145191885, + "grad_norm": 0.9957578815874936, + "learning_rate": 3.864515086508562e-05, + "loss": 0.6275, + "step": 12620 + }, + { + "epoch": 0.36845177294615494, + "grad_norm": 1.0942732330491125, + "learning_rate": 3.863571748797979e-05, + "loss": 0.6436, + "step": 12625 + }, + { + "epoch": 0.3685976944403911, + "grad_norm": 0.950652580489795, + "learning_rate": 3.862628151771415e-05, + "loss": 0.6246, + "step": 12630 + }, + { + "epoch": 0.3687436159346272, + "grad_norm": 0.9616192886459279, + "learning_rate": 3.861684295648606e-05, + "loss": 0.6019, + "step": 12635 + }, + { + "epoch": 0.36888953742886327, + "grad_norm": 0.9931400635283733, + "learning_rate": 3.860740180649345e-05, + "loss": 0.6009, + "step": 12640 + }, + { + "epoch": 0.36903545892309936, + "grad_norm": 0.9639892742723815, + "learning_rate": 3.85979580699349e-05, + "loss": 0.5688, + "step": 12645 + }, + { + "epoch": 0.36918138041733545, + "grad_norm": 0.9729417778348433, + "learning_rate": 3.858851174900952e-05, + "loss": 0.6117, + "step": 12650 + }, + { + "epoch": 0.3693273019115716, + "grad_norm": 0.9107797937901947, + "learning_rate": 3.857906284591709e-05, + "loss": 0.6185, + "step": 12655 + }, + { + "epoch": 0.3694732234058077, + "grad_norm": 0.9722002897827574, + "learning_rate": 3.856961136285795e-05, + "loss": 0.6317, + "step": 12660 + }, + { + "epoch": 0.3696191449000438, + "grad_norm": 1.0326197333647278, + "learning_rate": 3.856015730203306e-05, + "loss": 0.6638, + "step": 12665 + }, + { + "epoch": 0.36976506639427986, + "grad_norm": 1.1396947701012703, + "learning_rate": 3.855070066564398e-05, + "loss": 0.604, + "step": 12670 + }, + { + "epoch": 0.36991098788851595, + "grad_norm": 1.045316932712372, + "learning_rate": 3.854124145589285e-05, + "loss": 0.5985, + "step": 12675 + }, + { + "epoch": 0.3700569093827521, + "grad_norm": 0.9618470098764271, + "learning_rate": 3.8531779674982434e-05, + "loss": 0.6567, + "step": 12680 + }, + { + "epoch": 0.3702028308769882, + "grad_norm": 1.00610176739214, + "learning_rate": 3.852231532511609e-05, + "loss": 0.5776, + "step": 12685 + }, + { + "epoch": 0.3703487523712243, + "grad_norm": 0.9658205113334071, + "learning_rate": 3.8512848408497746e-05, + "loss": 0.616, + "step": 12690 + }, + { + "epoch": 0.37049467386546037, + "grad_norm": 1.3336789365602073, + "learning_rate": 3.850337892733198e-05, + "loss": 0.66, + "step": 12695 + }, + { + "epoch": 0.37064059535969646, + "grad_norm": 1.101886589801939, + "learning_rate": 3.8493906883823915e-05, + "loss": 0.5811, + "step": 12700 + }, + { + "epoch": 0.3707865168539326, + "grad_norm": 0.9750669598075934, + "learning_rate": 3.84844322801793e-05, + "loss": 0.5374, + "step": 12705 + }, + { + "epoch": 0.3709324383481687, + "grad_norm": 0.9709111582543624, + "learning_rate": 3.847495511860447e-05, + "loss": 0.567, + "step": 12710 + }, + { + "epoch": 0.3710783598424048, + "grad_norm": 1.3180067061823268, + "learning_rate": 3.846547540130637e-05, + "loss": 0.632, + "step": 12715 + }, + { + "epoch": 0.37122428133664087, + "grad_norm": 0.9667394666544465, + "learning_rate": 3.845599313049253e-05, + "loss": 0.6046, + "step": 12720 + }, + { + "epoch": 0.37137020283087696, + "grad_norm": 0.9751186960856844, + "learning_rate": 3.844650830837106e-05, + "loss": 0.6077, + "step": 12725 + }, + { + "epoch": 0.3715161243251131, + "grad_norm": 1.1397828739197962, + "learning_rate": 3.843702093715068e-05, + "loss": 0.6012, + "step": 12730 + }, + { + "epoch": 0.3716620458193492, + "grad_norm": 1.8619660240201024, + "learning_rate": 3.8427531019040714e-05, + "loss": 0.7072, + "step": 12735 + }, + { + "epoch": 0.3718079673135853, + "grad_norm": 0.9742030449831929, + "learning_rate": 3.841803855625105e-05, + "loss": 0.6346, + "step": 12740 + }, + { + "epoch": 0.3719538888078214, + "grad_norm": 3.271679807241188, + "learning_rate": 3.8408543550992196e-05, + "loss": 0.6249, + "step": 12745 + }, + { + "epoch": 0.37209981030205747, + "grad_norm": 0.9656604098482288, + "learning_rate": 3.839904600547524e-05, + "loss": 0.6225, + "step": 12750 + }, + { + "epoch": 0.3722457317962936, + "grad_norm": 0.9504054394895505, + "learning_rate": 3.8389545921911865e-05, + "loss": 0.6205, + "step": 12755 + }, + { + "epoch": 0.3723916532905297, + "grad_norm": 0.9905034245827565, + "learning_rate": 3.838004330251434e-05, + "loss": 0.67, + "step": 12760 + }, + { + "epoch": 0.3725375747847658, + "grad_norm": 1.3376309404289846, + "learning_rate": 3.837053814949553e-05, + "loss": 0.6285, + "step": 12765 + }, + { + "epoch": 0.3726834962790019, + "grad_norm": 1.0004165256576445, + "learning_rate": 3.8361030465068884e-05, + "loss": 0.6298, + "step": 12770 + }, + { + "epoch": 0.372829417773238, + "grad_norm": 1.04397337729708, + "learning_rate": 3.835152025144844e-05, + "loss": 0.5732, + "step": 12775 + }, + { + "epoch": 0.3729753392674741, + "grad_norm": 1.0128129125340108, + "learning_rate": 3.834200751084883e-05, + "loss": 0.5578, + "step": 12780 + }, + { + "epoch": 0.3731212607617102, + "grad_norm": 1.1896644231464701, + "learning_rate": 3.833249224548529e-05, + "loss": 0.659, + "step": 12785 + }, + { + "epoch": 0.3732671822559463, + "grad_norm": 1.215454849919349, + "learning_rate": 3.83229744575736e-05, + "loss": 0.5604, + "step": 12790 + }, + { + "epoch": 0.3734131037501824, + "grad_norm": 1.0591844084790443, + "learning_rate": 3.831345414933018e-05, + "loss": 0.6322, + "step": 12795 + }, + { + "epoch": 0.37355902524441853, + "grad_norm": 1.0887968637760528, + "learning_rate": 3.830393132297199e-05, + "loss": 0.5991, + "step": 12800 + }, + { + "epoch": 0.3737049467386546, + "grad_norm": 1.0022259438497505, + "learning_rate": 3.829440598071662e-05, + "loss": 0.6065, + "step": 12805 + }, + { + "epoch": 0.3738508682328907, + "grad_norm": 1.0217075380911824, + "learning_rate": 3.828487812478221e-05, + "loss": 0.5708, + "step": 12810 + }, + { + "epoch": 0.3739967897271268, + "grad_norm": 1.115545274492303, + "learning_rate": 3.827534775738749e-05, + "loss": 0.58, + "step": 12815 + }, + { + "epoch": 0.3741427112213629, + "grad_norm": 0.9595017086464345, + "learning_rate": 3.826581488075181e-05, + "loss": 0.6343, + "step": 12820 + }, + { + "epoch": 0.37428863271559903, + "grad_norm": 0.9549765546841054, + "learning_rate": 3.825627949709505e-05, + "loss": 0.5749, + "step": 12825 + }, + { + "epoch": 0.3744345542098351, + "grad_norm": 1.0330177422368476, + "learning_rate": 3.8246741608637726e-05, + "loss": 0.6064, + "step": 12830 + }, + { + "epoch": 0.3745804757040712, + "grad_norm": 0.9829673160402712, + "learning_rate": 3.823720121760091e-05, + "loss": 0.5857, + "step": 12835 + }, + { + "epoch": 0.3747263971983073, + "grad_norm": 1.0281768797957225, + "learning_rate": 3.822765832620624e-05, + "loss": 0.5732, + "step": 12840 + }, + { + "epoch": 0.3748723186925434, + "grad_norm": 0.9382774485290768, + "learning_rate": 3.821811293667598e-05, + "loss": 0.5803, + "step": 12845 + }, + { + "epoch": 0.37501824018677954, + "grad_norm": 0.9635125686831632, + "learning_rate": 3.820856505123293e-05, + "loss": 0.5368, + "step": 12850 + }, + { + "epoch": 0.37516416168101563, + "grad_norm": 1.1575095067689325, + "learning_rate": 3.819901467210052e-05, + "loss": 0.6313, + "step": 12855 + }, + { + "epoch": 0.3753100831752517, + "grad_norm": 1.117952276145879, + "learning_rate": 3.8189461801502715e-05, + "loss": 0.626, + "step": 12860 + }, + { + "epoch": 0.3754560046694878, + "grad_norm": 0.9977116637210961, + "learning_rate": 3.817990644166408e-05, + "loss": 0.5977, + "step": 12865 + }, + { + "epoch": 0.3756019261637239, + "grad_norm": 1.0321914683689113, + "learning_rate": 3.8170348594809765e-05, + "loss": 0.6216, + "step": 12870 + }, + { + "epoch": 0.37574784765796004, + "grad_norm": 0.9462924306626755, + "learning_rate": 3.8160788263165496e-05, + "loss": 0.6183, + "step": 12875 + }, + { + "epoch": 0.37589376915219613, + "grad_norm": 1.1799633139763062, + "learning_rate": 3.815122544895757e-05, + "loss": 0.66, + "step": 12880 + }, + { + "epoch": 0.3760396906464322, + "grad_norm": 1.001258722806958, + "learning_rate": 3.8141660154412865e-05, + "loss": 0.5556, + "step": 12885 + }, + { + "epoch": 0.3761856121406683, + "grad_norm": 0.8791707175069545, + "learning_rate": 3.8132092381758846e-05, + "loss": 0.5547, + "step": 12890 + }, + { + "epoch": 0.3763315336349044, + "grad_norm": 1.1322911472454005, + "learning_rate": 3.812252213322353e-05, + "loss": 0.6101, + "step": 12895 + }, + { + "epoch": 0.37647745512914055, + "grad_norm": 0.9407462937266452, + "learning_rate": 3.811294941103555e-05, + "loss": 0.5881, + "step": 12900 + }, + { + "epoch": 0.37662337662337664, + "grad_norm": 0.9709251372175559, + "learning_rate": 3.810337421742408e-05, + "loss": 0.6158, + "step": 12905 + }, + { + "epoch": 0.3767692981176127, + "grad_norm": 0.9977510911272339, + "learning_rate": 3.8093796554618885e-05, + "loss": 0.573, + "step": 12910 + }, + { + "epoch": 0.3769152196118488, + "grad_norm": 1.099417182577981, + "learning_rate": 3.8084216424850314e-05, + "loss": 0.5798, + "step": 12915 + }, + { + "epoch": 0.3770611411060849, + "grad_norm": 1.3389044345815033, + "learning_rate": 3.807463383034926e-05, + "loss": 0.5727, + "step": 12920 + }, + { + "epoch": 0.37720706260032105, + "grad_norm": 1.0067748495421789, + "learning_rate": 3.806504877334722e-05, + "loss": 0.6679, + "step": 12925 + }, + { + "epoch": 0.37735298409455714, + "grad_norm": 0.9949862510734755, + "learning_rate": 3.805546125607625e-05, + "loss": 0.6035, + "step": 12930 + }, + { + "epoch": 0.37749890558879323, + "grad_norm": 1.146551034871961, + "learning_rate": 3.804587128076898e-05, + "loss": 0.6039, + "step": 12935 + }, + { + "epoch": 0.3776448270830293, + "grad_norm": 1.1597662231488113, + "learning_rate": 3.803627884965863e-05, + "loss": 0.6059, + "step": 12940 + }, + { + "epoch": 0.3777907485772654, + "grad_norm": 0.968271936861751, + "learning_rate": 3.802668396497896e-05, + "loss": 0.6395, + "step": 12945 + }, + { + "epoch": 0.37793667007150156, + "grad_norm": 1.072996036898714, + "learning_rate": 3.801708662896433e-05, + "loss": 0.6738, + "step": 12950 + }, + { + "epoch": 0.37808259156573765, + "grad_norm": 0.9541908255595443, + "learning_rate": 3.8007486843849635e-05, + "loss": 0.5561, + "step": 12955 + }, + { + "epoch": 0.37822851305997374, + "grad_norm": 1.075452140074578, + "learning_rate": 3.79978846118704e-05, + "loss": 0.6096, + "step": 12960 + }, + { + "epoch": 0.3783744345542098, + "grad_norm": 1.0158755784704179, + "learning_rate": 3.7988279935262656e-05, + "loss": 0.5778, + "step": 12965 + }, + { + "epoch": 0.3785203560484459, + "grad_norm": 0.9122407264209598, + "learning_rate": 3.7978672816263045e-05, + "loss": 0.6296, + "step": 12970 + }, + { + "epoch": 0.37866627754268206, + "grad_norm": 0.9971090820513184, + "learning_rate": 3.796906325710876e-05, + "loss": 0.606, + "step": 12975 + }, + { + "epoch": 0.37881219903691815, + "grad_norm": 0.9734110774785772, + "learning_rate": 3.7959451260037564e-05, + "loss": 0.5813, + "step": 12980 + }, + { + "epoch": 0.37895812053115424, + "grad_norm": 0.9459167277518017, + "learning_rate": 3.79498368272878e-05, + "loss": 0.6637, + "step": 12985 + }, + { + "epoch": 0.37910404202539033, + "grad_norm": 0.842421739030566, + "learning_rate": 3.794021996109836e-05, + "loss": 0.6037, + "step": 12990 + }, + { + "epoch": 0.3792499635196264, + "grad_norm": 0.9501889940935216, + "learning_rate": 3.793060066370871e-05, + "loss": 0.533, + "step": 12995 + }, + { + "epoch": 0.37939588501386257, + "grad_norm": 1.1191525290837185, + "learning_rate": 3.7920978937358885e-05, + "loss": 0.6292, + "step": 13000 + }, + { + "epoch": 0.37954180650809866, + "grad_norm": 1.045038239294286, + "learning_rate": 3.791135478428948e-05, + "loss": 0.6813, + "step": 13005 + }, + { + "epoch": 0.37968772800233475, + "grad_norm": 1.0565577569987286, + "learning_rate": 3.7901728206741674e-05, + "loss": 0.5849, + "step": 13010 + }, + { + "epoch": 0.37983364949657084, + "grad_norm": 1.1456704421709585, + "learning_rate": 3.789209920695717e-05, + "loss": 0.6029, + "step": 13015 + }, + { + "epoch": 0.3799795709908069, + "grad_norm": 1.0875918783375462, + "learning_rate": 3.7882467787178284e-05, + "loss": 0.6736, + "step": 13020 + }, + { + "epoch": 0.38012549248504307, + "grad_norm": 0.9808879171610269, + "learning_rate": 3.7872833949647856e-05, + "loss": 0.6292, + "step": 13025 + }, + { + "epoch": 0.38027141397927916, + "grad_norm": 0.9348055817099865, + "learning_rate": 3.7863197696609314e-05, + "loss": 0.5647, + "step": 13030 + }, + { + "epoch": 0.38041733547351525, + "grad_norm": 0.9701751132538474, + "learning_rate": 3.785355903030664e-05, + "loss": 0.6907, + "step": 13035 + }, + { + "epoch": 0.38056325696775134, + "grad_norm": 1.1600988361420812, + "learning_rate": 3.784391795298437e-05, + "loss": 0.619, + "step": 13040 + }, + { + "epoch": 0.38070917846198743, + "grad_norm": 1.0131561001501868, + "learning_rate": 3.783427446688762e-05, + "loss": 0.6703, + "step": 13045 + }, + { + "epoch": 0.3808550999562236, + "grad_norm": 1.1421043897462002, + "learning_rate": 3.782462857426203e-05, + "loss": 0.645, + "step": 13050 + }, + { + "epoch": 0.38100102145045966, + "grad_norm": 1.221099821669162, + "learning_rate": 3.7814980277353866e-05, + "loss": 0.6311, + "step": 13055 + }, + { + "epoch": 0.38114694294469575, + "grad_norm": 0.9774869555552376, + "learning_rate": 3.780532957840989e-05, + "loss": 0.5176, + "step": 13060 + }, + { + "epoch": 0.38129286443893184, + "grad_norm": 1.209502800218115, + "learning_rate": 3.7795676479677444e-05, + "loss": 0.6076, + "step": 13065 + }, + { + "epoch": 0.38143878593316793, + "grad_norm": 1.1527283419804677, + "learning_rate": 3.778602098340445e-05, + "loss": 0.6368, + "step": 13070 + }, + { + "epoch": 0.3815847074274041, + "grad_norm": 1.0349343271925626, + "learning_rate": 3.777636309183934e-05, + "loss": 0.5743, + "step": 13075 + }, + { + "epoch": 0.38173062892164017, + "grad_norm": 0.9815000757187583, + "learning_rate": 3.776670280723117e-05, + "loss": 0.5751, + "step": 13080 + }, + { + "epoch": 0.38187655041587626, + "grad_norm": 1.1831942925259558, + "learning_rate": 3.7757040131829505e-05, + "loss": 0.6293, + "step": 13085 + }, + { + "epoch": 0.38202247191011235, + "grad_norm": 0.8095494336732907, + "learning_rate": 3.774737506788447e-05, + "loss": 0.5888, + "step": 13090 + }, + { + "epoch": 0.38216839340434844, + "grad_norm": 0.919947988340151, + "learning_rate": 3.773770761764676e-05, + "loss": 0.5536, + "step": 13095 + }, + { + "epoch": 0.3823143148985846, + "grad_norm": 1.1331767343574564, + "learning_rate": 3.772803778336762e-05, + "loss": 0.5538, + "step": 13100 + }, + { + "epoch": 0.3824602363928207, + "grad_norm": 1.051309810678946, + "learning_rate": 3.771836556729887e-05, + "loss": 0.6258, + "step": 13105 + }, + { + "epoch": 0.38260615788705676, + "grad_norm": 1.0200278746090872, + "learning_rate": 3.770869097169284e-05, + "loss": 0.6071, + "step": 13110 + }, + { + "epoch": 0.38275207938129285, + "grad_norm": 0.8729450912026568, + "learning_rate": 3.769901399880245e-05, + "loss": 0.6236, + "step": 13115 + }, + { + "epoch": 0.38289800087552894, + "grad_norm": 1.2662868661938753, + "learning_rate": 3.768933465088116e-05, + "loss": 0.6407, + "step": 13120 + }, + { + "epoch": 0.3830439223697651, + "grad_norm": 1.059206088634198, + "learning_rate": 3.7679652930183014e-05, + "loss": 0.5934, + "step": 13125 + }, + { + "epoch": 0.3831898438640012, + "grad_norm": 0.9353408834517517, + "learning_rate": 3.766996883896254e-05, + "loss": 0.5889, + "step": 13130 + }, + { + "epoch": 0.38333576535823727, + "grad_norm": 1.1972273175383707, + "learning_rate": 3.766028237947489e-05, + "loss": 0.6274, + "step": 13135 + }, + { + "epoch": 0.38348168685247336, + "grad_norm": 1.0268512914602315, + "learning_rate": 3.765059355397572e-05, + "loss": 0.6316, + "step": 13140 + }, + { + "epoch": 0.38362760834670945, + "grad_norm": 1.0691451064709403, + "learning_rate": 3.764090236472127e-05, + "loss": 0.5622, + "step": 13145 + }, + { + "epoch": 0.3837735298409456, + "grad_norm": 0.9874043835814377, + "learning_rate": 3.7631208813968286e-05, + "loss": 0.568, + "step": 13150 + }, + { + "epoch": 0.3839194513351817, + "grad_norm": 1.0102520648377, + "learning_rate": 3.7621512903974126e-05, + "loss": 0.6261, + "step": 13155 + }, + { + "epoch": 0.3840653728294178, + "grad_norm": 1.0651339557817885, + "learning_rate": 3.761181463699664e-05, + "loss": 0.6368, + "step": 13160 + }, + { + "epoch": 0.38421129432365386, + "grad_norm": 1.1024669067981412, + "learning_rate": 3.760211401529427e-05, + "loss": 0.6402, + "step": 13165 + }, + { + "epoch": 0.38435721581788995, + "grad_norm": 0.9111714021220461, + "learning_rate": 3.759241104112597e-05, + "loss": 0.5538, + "step": 13170 + }, + { + "epoch": 0.3845031373121261, + "grad_norm": 0.8618220241855702, + "learning_rate": 3.758270571675127e-05, + "loss": 0.6663, + "step": 13175 + }, + { + "epoch": 0.3846490588063622, + "grad_norm": 1.0005318912144738, + "learning_rate": 3.757299804443022e-05, + "loss": 0.6432, + "step": 13180 + }, + { + "epoch": 0.3847949803005983, + "grad_norm": 1.023621737111874, + "learning_rate": 3.756328802642345e-05, + "loss": 0.5995, + "step": 13185 + }, + { + "epoch": 0.38494090179483437, + "grad_norm": 1.0275594622875548, + "learning_rate": 3.755357566499211e-05, + "loss": 0.5664, + "step": 13190 + }, + { + "epoch": 0.38508682328907046, + "grad_norm": 1.074611049274864, + "learning_rate": 3.754386096239792e-05, + "loss": 0.5693, + "step": 13195 + }, + { + "epoch": 0.3852327447833066, + "grad_norm": 1.046615633320191, + "learning_rate": 3.753414392090311e-05, + "loss": 0.6139, + "step": 13200 + }, + { + "epoch": 0.3853786662775427, + "grad_norm": 1.003972254791306, + "learning_rate": 3.7524424542770474e-05, + "loss": 0.6088, + "step": 13205 + }, + { + "epoch": 0.3855245877717788, + "grad_norm": 0.9510938384382496, + "learning_rate": 3.751470283026336e-05, + "loss": 0.609, + "step": 13210 + }, + { + "epoch": 0.38567050926601487, + "grad_norm": 1.1241891382002371, + "learning_rate": 3.750497878564566e-05, + "loss": 0.6409, + "step": 13215 + }, + { + "epoch": 0.38581643076025096, + "grad_norm": 1.213772879456788, + "learning_rate": 3.749525241118178e-05, + "loss": 0.6054, + "step": 13220 + }, + { + "epoch": 0.3859623522544871, + "grad_norm": 1.0136068535027045, + "learning_rate": 3.7485523709136703e-05, + "loss": 0.617, + "step": 13225 + }, + { + "epoch": 0.3861082737487232, + "grad_norm": 1.1194260148829809, + "learning_rate": 3.747579268177592e-05, + "loss": 0.6222, + "step": 13230 + }, + { + "epoch": 0.3862541952429593, + "grad_norm": 1.226612637197211, + "learning_rate": 3.7466059331365514e-05, + "loss": 0.6664, + "step": 13235 + }, + { + "epoch": 0.3864001167371954, + "grad_norm": 1.0707023686878625, + "learning_rate": 3.745632366017205e-05, + "loss": 0.5931, + "step": 13240 + }, + { + "epoch": 0.38654603823143147, + "grad_norm": 1.0337206648806554, + "learning_rate": 3.744658567046267e-05, + "loss": 0.5973, + "step": 13245 + }, + { + "epoch": 0.3866919597256676, + "grad_norm": 1.0135370598491595, + "learning_rate": 3.743684536450505e-05, + "loss": 0.6459, + "step": 13250 + }, + { + "epoch": 0.3868378812199037, + "grad_norm": 0.9559325574631224, + "learning_rate": 3.7427102744567383e-05, + "loss": 0.5423, + "step": 13255 + }, + { + "epoch": 0.3869838027141398, + "grad_norm": 1.0825917244953278, + "learning_rate": 3.7417357812918446e-05, + "loss": 0.6121, + "step": 13260 + }, + { + "epoch": 0.3871297242083759, + "grad_norm": 1.2495067830388449, + "learning_rate": 3.740761057182751e-05, + "loss": 0.6613, + "step": 13265 + }, + { + "epoch": 0.38727564570261197, + "grad_norm": 0.953704618028202, + "learning_rate": 3.7397861023564426e-05, + "loss": 0.6447, + "step": 13270 + }, + { + "epoch": 0.3874215671968481, + "grad_norm": 1.1364379587081346, + "learning_rate": 3.7388109170399536e-05, + "loss": 0.6375, + "step": 13275 + }, + { + "epoch": 0.3875674886910842, + "grad_norm": 1.022040496706585, + "learning_rate": 3.7378355014603734e-05, + "loss": 0.634, + "step": 13280 + }, + { + "epoch": 0.3877134101853203, + "grad_norm": 1.0323467662871728, + "learning_rate": 3.7368598558448484e-05, + "loss": 0.5829, + "step": 13285 + }, + { + "epoch": 0.3878593316795564, + "grad_norm": 1.1454618317949399, + "learning_rate": 3.735883980420574e-05, + "loss": 0.5941, + "step": 13290 + }, + { + "epoch": 0.3880052531737925, + "grad_norm": 0.9639361559463864, + "learning_rate": 3.7349078754148023e-05, + "loss": 0.5669, + "step": 13295 + }, + { + "epoch": 0.3881511746680286, + "grad_norm": 1.0453915296987675, + "learning_rate": 3.7339315410548356e-05, + "loss": 0.6212, + "step": 13300 + }, + { + "epoch": 0.3882970961622647, + "grad_norm": 1.2666039441616714, + "learning_rate": 3.732954977568034e-05, + "loss": 0.5824, + "step": 13305 + }, + { + "epoch": 0.3884430176565008, + "grad_norm": 0.9494146600552643, + "learning_rate": 3.731978185181807e-05, + "loss": 0.5716, + "step": 13310 + }, + { + "epoch": 0.3885889391507369, + "grad_norm": 1.0670522568509824, + "learning_rate": 3.73100116412362e-05, + "loss": 0.5944, + "step": 13315 + }, + { + "epoch": 0.388734860644973, + "grad_norm": 1.1133037954436609, + "learning_rate": 3.730023914620989e-05, + "loss": 0.6541, + "step": 13320 + }, + { + "epoch": 0.3888807821392091, + "grad_norm": 1.3443072366828142, + "learning_rate": 3.729046436901486e-05, + "loss": 0.6463, + "step": 13325 + }, + { + "epoch": 0.3890267036334452, + "grad_norm": 1.0503781254569338, + "learning_rate": 3.7280687311927345e-05, + "loss": 0.5526, + "step": 13330 + }, + { + "epoch": 0.3891726251276813, + "grad_norm": 1.0326084446996835, + "learning_rate": 3.727090797722412e-05, + "loss": 0.6362, + "step": 13335 + }, + { + "epoch": 0.3893185466219174, + "grad_norm": 1.1288020408681696, + "learning_rate": 3.726112636718249e-05, + "loss": 0.6272, + "step": 13340 + }, + { + "epoch": 0.3894644681161535, + "grad_norm": 1.0641342870016819, + "learning_rate": 3.725134248408027e-05, + "loss": 0.6002, + "step": 13345 + }, + { + "epoch": 0.38961038961038963, + "grad_norm": 1.1762173072742437, + "learning_rate": 3.724155633019583e-05, + "loss": 0.6268, + "step": 13350 + }, + { + "epoch": 0.3897563111046257, + "grad_norm": 1.023280310855562, + "learning_rate": 3.723176790780806e-05, + "loss": 0.5585, + "step": 13355 + }, + { + "epoch": 0.3899022325988618, + "grad_norm": 0.8527399562964203, + "learning_rate": 3.7221977219196366e-05, + "loss": 0.5913, + "step": 13360 + }, + { + "epoch": 0.3900481540930979, + "grad_norm": 0.9705276841256729, + "learning_rate": 3.721218426664071e-05, + "loss": 0.5933, + "step": 13365 + }, + { + "epoch": 0.390194075587334, + "grad_norm": 0.9427855587160271, + "learning_rate": 3.7202389052421546e-05, + "loss": 0.5633, + "step": 13370 + }, + { + "epoch": 0.39033999708157013, + "grad_norm": 1.0948552019642581, + "learning_rate": 3.71925915788199e-05, + "loss": 0.6108, + "step": 13375 + }, + { + "epoch": 0.3904859185758062, + "grad_norm": 0.964746014766271, + "learning_rate": 3.7182791848117266e-05, + "loss": 0.5766, + "step": 13380 + }, + { + "epoch": 0.3906318400700423, + "grad_norm": 1.2018675643885772, + "learning_rate": 3.71729898625957e-05, + "loss": 0.6777, + "step": 13385 + }, + { + "epoch": 0.3907777615642784, + "grad_norm": 1.1017440196022457, + "learning_rate": 3.71631856245378e-05, + "loss": 0.6375, + "step": 13390 + }, + { + "epoch": 0.3909236830585145, + "grad_norm": 1.1664211869880705, + "learning_rate": 3.7153379136226633e-05, + "loss": 0.6443, + "step": 13395 + }, + { + "epoch": 0.39106960455275064, + "grad_norm": 1.1285230780410467, + "learning_rate": 3.714357039994585e-05, + "loss": 0.6808, + "step": 13400 + }, + { + "epoch": 0.39121552604698673, + "grad_norm": 0.9703363072407105, + "learning_rate": 3.713375941797958e-05, + "loss": 0.5831, + "step": 13405 + }, + { + "epoch": 0.3913614475412228, + "grad_norm": 0.8893430594707917, + "learning_rate": 3.7123946192612505e-05, + "loss": 0.5768, + "step": 13410 + }, + { + "epoch": 0.3915073690354589, + "grad_norm": 0.8641656756119463, + "learning_rate": 3.711413072612982e-05, + "loss": 0.5865, + "step": 13415 + }, + { + "epoch": 0.391653290529695, + "grad_norm": 0.9433621627765796, + "learning_rate": 3.710431302081723e-05, + "loss": 0.674, + "step": 13420 + }, + { + "epoch": 0.39179921202393114, + "grad_norm": 1.066283511052582, + "learning_rate": 3.709449307896098e-05, + "loss": 0.5443, + "step": 13425 + }, + { + "epoch": 0.39194513351816723, + "grad_norm": 0.9551737816832169, + "learning_rate": 3.708467090284781e-05, + "loss": 0.5382, + "step": 13430 + }, + { + "epoch": 0.3920910550124033, + "grad_norm": 1.0502458196011253, + "learning_rate": 3.7074846494765015e-05, + "loss": 0.676, + "step": 13435 + }, + { + "epoch": 0.3922369765066394, + "grad_norm": 1.0125396367129404, + "learning_rate": 3.706501985700039e-05, + "loss": 0.6583, + "step": 13440 + }, + { + "epoch": 0.3923828980008755, + "grad_norm": 0.9747712792982066, + "learning_rate": 3.7055190991842245e-05, + "loss": 0.5672, + "step": 13445 + }, + { + "epoch": 0.39252881949511165, + "grad_norm": 1.1151222065630897, + "learning_rate": 3.704535990157941e-05, + "loss": 0.5651, + "step": 13450 + }, + { + "epoch": 0.39267474098934774, + "grad_norm": 1.0300698781354214, + "learning_rate": 3.703552658850126e-05, + "loss": 0.5683, + "step": 13455 + }, + { + "epoch": 0.3928206624835838, + "grad_norm": 1.1914322222012972, + "learning_rate": 3.7025691054897644e-05, + "loss": 0.6332, + "step": 13460 + }, + { + "epoch": 0.3929665839778199, + "grad_norm": 0.9662409733925251, + "learning_rate": 3.701585330305895e-05, + "loss": 0.576, + "step": 13465 + }, + { + "epoch": 0.393112505472056, + "grad_norm": 1.1915595893841633, + "learning_rate": 3.700601333527609e-05, + "loss": 0.6191, + "step": 13470 + }, + { + "epoch": 0.39325842696629215, + "grad_norm": 1.1273482770823606, + "learning_rate": 3.6996171153840486e-05, + "loss": 0.5853, + "step": 13475 + }, + { + "epoch": 0.39340434846052824, + "grad_norm": 1.0481133573851806, + "learning_rate": 3.698632676104407e-05, + "loss": 0.5901, + "step": 13480 + }, + { + "epoch": 0.39355026995476433, + "grad_norm": 1.025377981232465, + "learning_rate": 3.69764801591793e-05, + "loss": 0.5787, + "step": 13485 + }, + { + "epoch": 0.3936961914490004, + "grad_norm": 1.089141628678364, + "learning_rate": 3.696663135053912e-05, + "loss": 0.5804, + "step": 13490 + }, + { + "epoch": 0.39384211294323657, + "grad_norm": 0.9607370785168048, + "learning_rate": 3.695678033741704e-05, + "loss": 0.6499, + "step": 13495 + }, + { + "epoch": 0.39398803443747266, + "grad_norm": 1.0180811514253796, + "learning_rate": 3.694692712210702e-05, + "loss": 0.578, + "step": 13500 + }, + { + "epoch": 0.39413395593170875, + "grad_norm": 0.973042457466423, + "learning_rate": 3.6937071706903594e-05, + "loss": 0.6045, + "step": 13505 + }, + { + "epoch": 0.39427987742594484, + "grad_norm": 0.9688845182368967, + "learning_rate": 3.692721409410176e-05, + "loss": 0.6066, + "step": 13510 + }, + { + "epoch": 0.3944257989201809, + "grad_norm": 0.9883454293206793, + "learning_rate": 3.691735428599705e-05, + "loss": 0.6406, + "step": 13515 + }, + { + "epoch": 0.39457172041441707, + "grad_norm": 0.829441584996368, + "learning_rate": 3.690749228488552e-05, + "loss": 0.5494, + "step": 13520 + }, + { + "epoch": 0.39471764190865316, + "grad_norm": 1.0275314433702603, + "learning_rate": 3.689762809306371e-05, + "loss": 0.6194, + "step": 13525 + }, + { + "epoch": 0.39486356340288925, + "grad_norm": 1.031530453016531, + "learning_rate": 3.6887761712828664e-05, + "loss": 0.5895, + "step": 13530 + }, + { + "epoch": 0.39500948489712534, + "grad_norm": 0.881401894301783, + "learning_rate": 3.687789314647798e-05, + "loss": 0.5452, + "step": 13535 + }, + { + "epoch": 0.39515540639136143, + "grad_norm": 1.115580208343848, + "learning_rate": 3.686802239630973e-05, + "loss": 0.5874, + "step": 13540 + }, + { + "epoch": 0.3953013278855976, + "grad_norm": 0.9796559102548387, + "learning_rate": 3.68581494646225e-05, + "loss": 0.5827, + "step": 13545 + }, + { + "epoch": 0.39544724937983367, + "grad_norm": 1.0259949206107621, + "learning_rate": 3.684827435371538e-05, + "loss": 0.6274, + "step": 13550 + }, + { + "epoch": 0.39559317087406975, + "grad_norm": 1.0634990670213518, + "learning_rate": 3.6838397065887984e-05, + "loss": 0.5929, + "step": 13555 + }, + { + "epoch": 0.39573909236830584, + "grad_norm": 1.0055012700306751, + "learning_rate": 3.682851760344043e-05, + "loss": 0.6009, + "step": 13560 + }, + { + "epoch": 0.39588501386254193, + "grad_norm": 0.894091094982307, + "learning_rate": 3.681863596867331e-05, + "loss": 0.5433, + "step": 13565 + }, + { + "epoch": 0.3960309353567781, + "grad_norm": 0.9535542740351096, + "learning_rate": 3.680875216388777e-05, + "loss": 0.5813, + "step": 13570 + }, + { + "epoch": 0.39617685685101417, + "grad_norm": 0.9829963185440888, + "learning_rate": 3.6798866191385425e-05, + "loss": 0.6063, + "step": 13575 + }, + { + "epoch": 0.39632277834525026, + "grad_norm": 1.104239828591839, + "learning_rate": 3.678897805346842e-05, + "loss": 0.6735, + "step": 13580 + }, + { + "epoch": 0.39646869983948635, + "grad_norm": 1.2819688282238808, + "learning_rate": 3.6779087752439383e-05, + "loss": 0.6639, + "step": 13585 + }, + { + "epoch": 0.39661462133372244, + "grad_norm": 0.9217824514390351, + "learning_rate": 3.6769195290601466e-05, + "loss": 0.5527, + "step": 13590 + }, + { + "epoch": 0.3967605428279586, + "grad_norm": 1.0587990042490922, + "learning_rate": 3.67593006702583e-05, + "loss": 0.602, + "step": 13595 + }, + { + "epoch": 0.3969064643221947, + "grad_norm": 1.0898791746846404, + "learning_rate": 3.6749403893714037e-05, + "loss": 0.6227, + "step": 13600 + }, + { + "epoch": 0.39705238581643076, + "grad_norm": 1.0320325658931797, + "learning_rate": 3.673950496327333e-05, + "loss": 0.6285, + "step": 13605 + }, + { + "epoch": 0.39719830731066685, + "grad_norm": 0.984491998091008, + "learning_rate": 3.672960388124133e-05, + "loss": 0.6877, + "step": 13610 + }, + { + "epoch": 0.39734422880490294, + "grad_norm": 0.8440649500665628, + "learning_rate": 3.671970064992369e-05, + "loss": 0.59, + "step": 13615 + }, + { + "epoch": 0.3974901502991391, + "grad_norm": 0.9771649580000522, + "learning_rate": 3.670979527162655e-05, + "loss": 0.6197, + "step": 13620 + }, + { + "epoch": 0.3976360717933752, + "grad_norm": 0.9541157363617031, + "learning_rate": 3.669988774865658e-05, + "loss": 0.5667, + "step": 13625 + }, + { + "epoch": 0.39778199328761127, + "grad_norm": 0.9310572266318097, + "learning_rate": 3.668997808332092e-05, + "loss": 0.5754, + "step": 13630 + }, + { + "epoch": 0.39792791478184736, + "grad_norm": 1.111370438445708, + "learning_rate": 3.668006627792722e-05, + "loss": 0.6323, + "step": 13635 + }, + { + "epoch": 0.39807383627608345, + "grad_norm": 1.3663612148113984, + "learning_rate": 3.667015233478364e-05, + "loss": 0.6658, + "step": 13640 + }, + { + "epoch": 0.3982197577703196, + "grad_norm": 1.0311973801675742, + "learning_rate": 3.666023625619881e-05, + "loss": 0.6008, + "step": 13645 + }, + { + "epoch": 0.3983656792645557, + "grad_norm": 1.3123705487764714, + "learning_rate": 3.6650318044481894e-05, + "loss": 0.6126, + "step": 13650 + }, + { + "epoch": 0.3985116007587918, + "grad_norm": 1.2384923137253803, + "learning_rate": 3.664039770194251e-05, + "loss": 0.6247, + "step": 13655 + }, + { + "epoch": 0.39865752225302786, + "grad_norm": 0.8722670468896063, + "learning_rate": 3.663047523089081e-05, + "loss": 0.6215, + "step": 13660 + }, + { + "epoch": 0.39880344374726395, + "grad_norm": 0.9853884733120933, + "learning_rate": 3.662055063363744e-05, + "loss": 0.6233, + "step": 13665 + }, + { + "epoch": 0.3989493652415001, + "grad_norm": 1.154867516607522, + "learning_rate": 3.6610623912493504e-05, + "loss": 0.6154, + "step": 13670 + }, + { + "epoch": 0.3990952867357362, + "grad_norm": 1.204579376537676, + "learning_rate": 3.660069506977063e-05, + "loss": 0.6879, + "step": 13675 + }, + { + "epoch": 0.3992412082299723, + "grad_norm": 1.145256755755369, + "learning_rate": 3.6590764107780935e-05, + "loss": 0.5878, + "step": 13680 + }, + { + "epoch": 0.39938712972420837, + "grad_norm": 0.988170369728385, + "learning_rate": 3.658083102883703e-05, + "loss": 0.5949, + "step": 13685 + }, + { + "epoch": 0.39953305121844446, + "grad_norm": 1.1473880230341127, + "learning_rate": 3.6570895835252026e-05, + "loss": 0.6305, + "step": 13690 + }, + { + "epoch": 0.3996789727126806, + "grad_norm": 1.1801438201474737, + "learning_rate": 3.656095852933951e-05, + "loss": 0.5602, + "step": 13695 + }, + { + "epoch": 0.3998248942069167, + "grad_norm": 1.0924793581340302, + "learning_rate": 3.6551019113413566e-05, + "loss": 0.6096, + "step": 13700 + }, + { + "epoch": 0.3999708157011528, + "grad_norm": 1.0027412796223487, + "learning_rate": 3.654107758978878e-05, + "loss": 0.6331, + "step": 13705 + }, + { + "epoch": 0.40011673719538887, + "grad_norm": 0.994242794278382, + "learning_rate": 3.6531133960780226e-05, + "loss": 0.6325, + "step": 13710 + }, + { + "epoch": 0.40026265868962496, + "grad_norm": 1.1191020572553931, + "learning_rate": 3.652118822870344e-05, + "loss": 0.5738, + "step": 13715 + }, + { + "epoch": 0.4004085801838611, + "grad_norm": 0.9296404937289181, + "learning_rate": 3.6511240395874495e-05, + "loss": 0.6328, + "step": 13720 + }, + { + "epoch": 0.4005545016780972, + "grad_norm": 1.0750205733649723, + "learning_rate": 3.6501290464609926e-05, + "loss": 0.5833, + "step": 13725 + }, + { + "epoch": 0.4007004231723333, + "grad_norm": 0.9192040161905642, + "learning_rate": 3.6491338437226743e-05, + "loss": 0.6081, + "step": 13730 + }, + { + "epoch": 0.4008463446665694, + "grad_norm": 1.1610054487960284, + "learning_rate": 3.6481384316042496e-05, + "loss": 0.6424, + "step": 13735 + }, + { + "epoch": 0.40099226616080547, + "grad_norm": 0.9795576044721311, + "learning_rate": 3.647142810337516e-05, + "loss": 0.5698, + "step": 13740 + }, + { + "epoch": 0.4011381876550416, + "grad_norm": 0.934323468674151, + "learning_rate": 3.646146980154323e-05, + "loss": 0.5997, + "step": 13745 + }, + { + "epoch": 0.4012841091492777, + "grad_norm": 0.9576380261424753, + "learning_rate": 3.6451509412865684e-05, + "loss": 0.5465, + "step": 13750 + }, + { + "epoch": 0.4014300306435138, + "grad_norm": 1.0898459220001486, + "learning_rate": 3.644154693966199e-05, + "loss": 0.5873, + "step": 13755 + }, + { + "epoch": 0.4015759521377499, + "grad_norm": 0.9479643965150694, + "learning_rate": 3.643158238425209e-05, + "loss": 0.5547, + "step": 13760 + }, + { + "epoch": 0.40172187363198597, + "grad_norm": 0.9619721874819919, + "learning_rate": 3.6421615748956417e-05, + "loss": 0.5495, + "step": 13765 + }, + { + "epoch": 0.4018677951262221, + "grad_norm": 1.125480959699391, + "learning_rate": 3.6411647036095886e-05, + "loss": 0.5903, + "step": 13770 + }, + { + "epoch": 0.4020137166204582, + "grad_norm": 0.8793812411167892, + "learning_rate": 3.6401676247991916e-05, + "loss": 0.5591, + "step": 13775 + }, + { + "epoch": 0.4021596381146943, + "grad_norm": 1.126724265902178, + "learning_rate": 3.639170338696637e-05, + "loss": 0.5857, + "step": 13780 + }, + { + "epoch": 0.4023055596089304, + "grad_norm": 1.0181733665073054, + "learning_rate": 3.638172845534161e-05, + "loss": 0.5652, + "step": 13785 + }, + { + "epoch": 0.4024514811031665, + "grad_norm": 0.895455295310825, + "learning_rate": 3.63717514554405e-05, + "loss": 0.5866, + "step": 13790 + }, + { + "epoch": 0.4025974025974026, + "grad_norm": 0.9914312052974177, + "learning_rate": 3.636177238958638e-05, + "loss": 0.5689, + "step": 13795 + }, + { + "epoch": 0.4027433240916387, + "grad_norm": 1.0567095521583312, + "learning_rate": 3.635179126010303e-05, + "loss": 0.5529, + "step": 13800 + }, + { + "epoch": 0.4028892455858748, + "grad_norm": 0.9378974422761764, + "learning_rate": 3.634180806931478e-05, + "loss": 0.6413, + "step": 13805 + }, + { + "epoch": 0.4030351670801109, + "grad_norm": 0.9975682218349748, + "learning_rate": 3.633182281954638e-05, + "loss": 0.6203, + "step": 13810 + }, + { + "epoch": 0.403181088574347, + "grad_norm": 0.9271969131038335, + "learning_rate": 3.6321835513123084e-05, + "loss": 0.6169, + "step": 13815 + }, + { + "epoch": 0.4033270100685831, + "grad_norm": 1.0514958083973167, + "learning_rate": 3.631184615237062e-05, + "loss": 0.5748, + "step": 13820 + }, + { + "epoch": 0.4034729315628192, + "grad_norm": 0.9259155342573695, + "learning_rate": 3.630185473961521e-05, + "loss": 0.6272, + "step": 13825 + }, + { + "epoch": 0.4036188530570553, + "grad_norm": 1.3429468789369587, + "learning_rate": 3.629186127718353e-05, + "loss": 0.5878, + "step": 13830 + }, + { + "epoch": 0.4037647745512914, + "grad_norm": 0.9380880442445885, + "learning_rate": 3.628186576740274e-05, + "loss": 0.5697, + "step": 13835 + }, + { + "epoch": 0.4039106960455275, + "grad_norm": 0.8733286759127851, + "learning_rate": 3.6271868212600506e-05, + "loss": 0.5756, + "step": 13840 + }, + { + "epoch": 0.40405661753976363, + "grad_norm": 0.8677100655425329, + "learning_rate": 3.626186861510492e-05, + "loss": 0.5867, + "step": 13845 + }, + { + "epoch": 0.4042025390339997, + "grad_norm": 0.9315842074886852, + "learning_rate": 3.625186697724458e-05, + "loss": 0.5962, + "step": 13850 + }, + { + "epoch": 0.4043484605282358, + "grad_norm": 0.92928993964647, + "learning_rate": 3.624186330134856e-05, + "loss": 0.6267, + "step": 13855 + }, + { + "epoch": 0.4044943820224719, + "grad_norm": 1.0136256621626847, + "learning_rate": 3.62318575897464e-05, + "loss": 0.5846, + "step": 13860 + }, + { + "epoch": 0.404640303516708, + "grad_norm": 1.0088796088920677, + "learning_rate": 3.622184984476812e-05, + "loss": 0.6267, + "step": 13865 + }, + { + "epoch": 0.40478622501094413, + "grad_norm": 0.902065045847803, + "learning_rate": 3.62118400687442e-05, + "loss": 0.5337, + "step": 13870 + }, + { + "epoch": 0.4049321465051802, + "grad_norm": 0.9705934211972295, + "learning_rate": 3.6201828264005614e-05, + "loss": 0.6258, + "step": 13875 + }, + { + "epoch": 0.4050780679994163, + "grad_norm": 1.0283730795958412, + "learning_rate": 3.61918144328838e-05, + "loss": 0.6136, + "step": 13880 + }, + { + "epoch": 0.4052239894936524, + "grad_norm": 0.9397230067689413, + "learning_rate": 3.6181798577710644e-05, + "loss": 0.5573, + "step": 13885 + }, + { + "epoch": 0.4053699109878885, + "grad_norm": 1.6922130444161771, + "learning_rate": 3.617178070081856e-05, + "loss": 0.6131, + "step": 13890 + }, + { + "epoch": 0.40551583248212464, + "grad_norm": 1.0096660556690698, + "learning_rate": 3.616176080454036e-05, + "loss": 0.6192, + "step": 13895 + }, + { + "epoch": 0.40566175397636073, + "grad_norm": 1.0451144359005133, + "learning_rate": 3.6151738891209394e-05, + "loss": 0.6493, + "step": 13900 + }, + { + "epoch": 0.4058076754705968, + "grad_norm": 1.0319053984768454, + "learning_rate": 3.614171496315944e-05, + "loss": 0.5912, + "step": 13905 + }, + { + "epoch": 0.4059535969648329, + "grad_norm": 0.8879732590190528, + "learning_rate": 3.6131689022724764e-05, + "loss": 0.6598, + "step": 13910 + }, + { + "epoch": 0.406099518459069, + "grad_norm": 0.9775436158322994, + "learning_rate": 3.6121661072240076e-05, + "loss": 0.573, + "step": 13915 + }, + { + "epoch": 0.40624543995330514, + "grad_norm": 1.083073518810078, + "learning_rate": 3.611163111404059e-05, + "loss": 0.5962, + "step": 13920 + }, + { + "epoch": 0.40639136144754123, + "grad_norm": 1.0182637229742864, + "learning_rate": 3.610159915046195e-05, + "loss": 0.5993, + "step": 13925 + }, + { + "epoch": 0.4065372829417773, + "grad_norm": 1.1678979149989746, + "learning_rate": 3.6091565183840305e-05, + "loss": 0.5442, + "step": 13930 + }, + { + "epoch": 0.4066832044360134, + "grad_norm": 1.1390569990514423, + "learning_rate": 3.608152921651224e-05, + "loss": 0.6341, + "step": 13935 + }, + { + "epoch": 0.4068291259302495, + "grad_norm": 1.0747654773090607, + "learning_rate": 3.607149125081483e-05, + "loss": 0.6095, + "step": 13940 + }, + { + "epoch": 0.40697504742448565, + "grad_norm": 1.0630998548921433, + "learning_rate": 3.606145128908558e-05, + "loss": 0.5796, + "step": 13945 + }, + { + "epoch": 0.40712096891872174, + "grad_norm": 1.023127529672732, + "learning_rate": 3.6051409333662503e-05, + "loss": 0.5564, + "step": 13950 + }, + { + "epoch": 0.4072668904129578, + "grad_norm": 0.9636646909808338, + "learning_rate": 3.6041365386884055e-05, + "loss": 0.6323, + "step": 13955 + }, + { + "epoch": 0.4074128119071939, + "grad_norm": 1.0851052170394584, + "learning_rate": 3.603131945108915e-05, + "loss": 0.6588, + "step": 13960 + }, + { + "epoch": 0.40755873340143, + "grad_norm": 1.0394377436243798, + "learning_rate": 3.602127152861717e-05, + "loss": 0.6954, + "step": 13965 + }, + { + "epoch": 0.40770465489566615, + "grad_norm": 1.195528184638518, + "learning_rate": 3.601122162180796e-05, + "loss": 0.6763, + "step": 13970 + }, + { + "epoch": 0.40785057638990224, + "grad_norm": 0.9889296116803785, + "learning_rate": 3.600116973300185e-05, + "loss": 0.6158, + "step": 13975 + }, + { + "epoch": 0.40799649788413833, + "grad_norm": 1.2071272670170186, + "learning_rate": 3.599111586453957e-05, + "loss": 0.6344, + "step": 13980 + }, + { + "epoch": 0.4081424193783744, + "grad_norm": 1.0953133997781315, + "learning_rate": 3.598106001876238e-05, + "loss": 0.5506, + "step": 13985 + }, + { + "epoch": 0.4082883408726105, + "grad_norm": 1.1085463464816405, + "learning_rate": 3.597100219801197e-05, + "loss": 0.6358, + "step": 13990 + }, + { + "epoch": 0.40843426236684666, + "grad_norm": 0.8556361731608682, + "learning_rate": 3.5960942404630497e-05, + "loss": 0.5673, + "step": 13995 + }, + { + "epoch": 0.40858018386108275, + "grad_norm": 1.1799374011645758, + "learning_rate": 3.595088064096055e-05, + "loss": 0.6093, + "step": 14000 + }, + { + "epoch": 0.40872610535531884, + "grad_norm": 1.0177815942960753, + "learning_rate": 3.594081690934521e-05, + "loss": 0.5833, + "step": 14005 + }, + { + "epoch": 0.4088720268495549, + "grad_norm": 1.1309553338843712, + "learning_rate": 3.593075121212801e-05, + "loss": 0.6023, + "step": 14010 + }, + { + "epoch": 0.409017948343791, + "grad_norm": 0.9500253907375841, + "learning_rate": 3.592068355165292e-05, + "loss": 0.5598, + "step": 14015 + }, + { + "epoch": 0.40916386983802716, + "grad_norm": 0.9973913241208755, + "learning_rate": 3.59106139302644e-05, + "loss": 0.5489, + "step": 14020 + }, + { + "epoch": 0.40930979133226325, + "grad_norm": 1.0308698393981637, + "learning_rate": 3.590054235030734e-05, + "loss": 0.5778, + "step": 14025 + }, + { + "epoch": 0.40945571282649934, + "grad_norm": 1.0117583811466544, + "learning_rate": 3.589046881412711e-05, + "loss": 0.5796, + "step": 14030 + }, + { + "epoch": 0.40960163432073543, + "grad_norm": 1.00842219251295, + "learning_rate": 3.58803933240695e-05, + "loss": 0.6385, + "step": 14035 + }, + { + "epoch": 0.4097475558149715, + "grad_norm": 1.1053934821537055, + "learning_rate": 3.587031588248079e-05, + "loss": 0.6243, + "step": 14040 + }, + { + "epoch": 0.40989347730920767, + "grad_norm": 1.188126522612844, + "learning_rate": 3.5860236491707686e-05, + "loss": 0.6117, + "step": 14045 + }, + { + "epoch": 0.41003939880344376, + "grad_norm": 1.0794860725213764, + "learning_rate": 3.585015515409738e-05, + "loss": 0.6048, + "step": 14050 + }, + { + "epoch": 0.41018532029767985, + "grad_norm": 0.8496648965780355, + "learning_rate": 3.5840071871997485e-05, + "loss": 0.5895, + "step": 14055 + }, + { + "epoch": 0.41033124179191593, + "grad_norm": 0.9699657408285617, + "learning_rate": 3.58299866477561e-05, + "loss": 0.6194, + "step": 14060 + }, + { + "epoch": 0.410477163286152, + "grad_norm": 1.4890033837865122, + "learning_rate": 3.581989948372175e-05, + "loss": 0.6413, + "step": 14065 + }, + { + "epoch": 0.41062308478038817, + "grad_norm": 0.9635370470138371, + "learning_rate": 3.5809810382243395e-05, + "loss": 0.6343, + "step": 14070 + }, + { + "epoch": 0.41076900627462426, + "grad_norm": 0.9846049198950082, + "learning_rate": 3.5799719345670514e-05, + "loss": 0.5887, + "step": 14075 + }, + { + "epoch": 0.41091492776886035, + "grad_norm": 1.1979670651968295, + "learning_rate": 3.578962637635295e-05, + "loss": 0.6117, + "step": 14080 + }, + { + "epoch": 0.41106084926309644, + "grad_norm": 1.0409560722513993, + "learning_rate": 3.5779531476641076e-05, + "loss": 0.5827, + "step": 14085 + }, + { + "epoch": 0.41120677075733253, + "grad_norm": 1.1542669307522804, + "learning_rate": 3.576943464888566e-05, + "loss": 0.5933, + "step": 14090 + }, + { + "epoch": 0.4113526922515687, + "grad_norm": 1.1638165516107504, + "learning_rate": 3.575933589543794e-05, + "loss": 0.6201, + "step": 14095 + }, + { + "epoch": 0.41149861374580476, + "grad_norm": 1.0427308156340584, + "learning_rate": 3.57492352186496e-05, + "loss": 0.5648, + "step": 14100 + }, + { + "epoch": 0.41164453524004085, + "grad_norm": 1.0174082723047517, + "learning_rate": 3.573913262087276e-05, + "loss": 0.5764, + "step": 14105 + }, + { + "epoch": 0.41179045673427694, + "grad_norm": 1.063599536068323, + "learning_rate": 3.572902810446002e-05, + "loss": 0.5934, + "step": 14110 + }, + { + "epoch": 0.41193637822851303, + "grad_norm": 1.103158390089056, + "learning_rate": 3.571892167176438e-05, + "loss": 0.6199, + "step": 14115 + }, + { + "epoch": 0.4120822997227492, + "grad_norm": 1.0748310281525493, + "learning_rate": 3.570881332513933e-05, + "loss": 0.5754, + "step": 14120 + }, + { + "epoch": 0.41222822121698527, + "grad_norm": 1.1293010474353045, + "learning_rate": 3.569870306693879e-05, + "loss": 0.6522, + "step": 14125 + }, + { + "epoch": 0.41237414271122136, + "grad_norm": 0.9065958344337489, + "learning_rate": 3.568859089951711e-05, + "loss": 0.5756, + "step": 14130 + }, + { + "epoch": 0.41252006420545745, + "grad_norm": 1.078713814030312, + "learning_rate": 3.567847682522911e-05, + "loss": 0.6294, + "step": 14135 + }, + { + "epoch": 0.41266598569969354, + "grad_norm": 1.0503517984580948, + "learning_rate": 3.5668360846430025e-05, + "loss": 0.5431, + "step": 14140 + }, + { + "epoch": 0.4128119071939297, + "grad_norm": 1.05780503030239, + "learning_rate": 3.5658242965475555e-05, + "loss": 0.5895, + "step": 14145 + }, + { + "epoch": 0.4129578286881658, + "grad_norm": 1.0798976387133552, + "learning_rate": 3.564812318472185e-05, + "loss": 0.6087, + "step": 14150 + }, + { + "epoch": 0.41310375018240186, + "grad_norm": 1.0301023869953623, + "learning_rate": 3.563800150652547e-05, + "loss": 0.5844, + "step": 14155 + }, + { + "epoch": 0.41324967167663795, + "grad_norm": 0.9661610362243159, + "learning_rate": 3.562787793324346e-05, + "loss": 0.5759, + "step": 14160 + }, + { + "epoch": 0.41339559317087404, + "grad_norm": 1.2766466026450818, + "learning_rate": 3.561775246723326e-05, + "loss": 0.6638, + "step": 14165 + }, + { + "epoch": 0.4135415146651102, + "grad_norm": 0.9713674514443876, + "learning_rate": 3.560762511085279e-05, + "loss": 0.6106, + "step": 14170 + }, + { + "epoch": 0.4136874361593463, + "grad_norm": 1.1063337114226663, + "learning_rate": 3.559749586646039e-05, + "loss": 0.5737, + "step": 14175 + }, + { + "epoch": 0.41383335765358237, + "grad_norm": 1.015296495620961, + "learning_rate": 3.5587364736414834e-05, + "loss": 0.6151, + "step": 14180 + }, + { + "epoch": 0.41397927914781846, + "grad_norm": 1.097725490542968, + "learning_rate": 3.557723172307536e-05, + "loss": 0.6344, + "step": 14185 + }, + { + "epoch": 0.41412520064205455, + "grad_norm": 1.060477601245144, + "learning_rate": 3.556709682880162e-05, + "loss": 0.5914, + "step": 14190 + }, + { + "epoch": 0.4142711221362907, + "grad_norm": 0.9852390758343812, + "learning_rate": 3.555696005595371e-05, + "loss": 0.6329, + "step": 14195 + }, + { + "epoch": 0.4144170436305268, + "grad_norm": 0.907624557008725, + "learning_rate": 3.5546821406892186e-05, + "loss": 0.527, + "step": 14200 + }, + { + "epoch": 0.41456296512476287, + "grad_norm": 1.0041630200590868, + "learning_rate": 3.5536680883978e-05, + "loss": 0.586, + "step": 14205 + }, + { + "epoch": 0.41470888661899896, + "grad_norm": 1.0940688591390892, + "learning_rate": 3.552653848957257e-05, + "loss": 0.6573, + "step": 14210 + }, + { + "epoch": 0.41485480811323505, + "grad_norm": 1.1690224777847, + "learning_rate": 3.551639422603774e-05, + "loss": 0.5826, + "step": 14215 + }, + { + "epoch": 0.4150007296074712, + "grad_norm": 0.9807015531500162, + "learning_rate": 3.55062480957358e-05, + "loss": 0.6108, + "step": 14220 + }, + { + "epoch": 0.4151466511017073, + "grad_norm": 1.0355150883902318, + "learning_rate": 3.5496100101029456e-05, + "loss": 0.53, + "step": 14225 + }, + { + "epoch": 0.4152925725959434, + "grad_norm": 1.1220035134010093, + "learning_rate": 3.548595024428187e-05, + "loss": 0.6176, + "step": 14230 + }, + { + "epoch": 0.41543849409017947, + "grad_norm": 1.0633359663432378, + "learning_rate": 3.54757985278566e-05, + "loss": 0.616, + "step": 14235 + }, + { + "epoch": 0.4155844155844156, + "grad_norm": 1.0282625079134138, + "learning_rate": 3.5465644954117693e-05, + "loss": 0.6616, + "step": 14240 + }, + { + "epoch": 0.4157303370786517, + "grad_norm": 0.9918514711690857, + "learning_rate": 3.545548952542959e-05, + "loss": 0.5928, + "step": 14245 + }, + { + "epoch": 0.4158762585728878, + "grad_norm": 1.1002923665765318, + "learning_rate": 3.544533224415716e-05, + "loss": 0.6624, + "step": 14250 + }, + { + "epoch": 0.4160221800671239, + "grad_norm": 1.06023785845152, + "learning_rate": 3.5435173112665725e-05, + "loss": 0.5557, + "step": 14255 + }, + { + "epoch": 0.41616810156135997, + "grad_norm": 1.0960143983061892, + "learning_rate": 3.542501213332102e-05, + "loss": 0.6125, + "step": 14260 + }, + { + "epoch": 0.4163140230555961, + "grad_norm": 1.0249244061679479, + "learning_rate": 3.5414849308489236e-05, + "loss": 0.6244, + "step": 14265 + }, + { + "epoch": 0.4164599445498322, + "grad_norm": 1.0323971363105802, + "learning_rate": 3.540468464053696e-05, + "loss": 0.6001, + "step": 14270 + }, + { + "epoch": 0.4166058660440683, + "grad_norm": 0.9687706572251554, + "learning_rate": 3.5394518131831236e-05, + "loss": 0.5599, + "step": 14275 + }, + { + "epoch": 0.4167517875383044, + "grad_norm": 0.9175495914932794, + "learning_rate": 3.538434978473952e-05, + "loss": 0.5871, + "step": 14280 + }, + { + "epoch": 0.4168977090325405, + "grad_norm": 0.9022083755802284, + "learning_rate": 3.5374179601629695e-05, + "loss": 0.6408, + "step": 14285 + }, + { + "epoch": 0.4170436305267766, + "grad_norm": 1.0055822122706368, + "learning_rate": 3.53640075848701e-05, + "loss": 0.6534, + "step": 14290 + }, + { + "epoch": 0.4171895520210127, + "grad_norm": 1.0522571062849835, + "learning_rate": 3.535383373682945e-05, + "loss": 0.5886, + "step": 14295 + }, + { + "epoch": 0.4173354735152488, + "grad_norm": 1.107140653670357, + "learning_rate": 3.534365805987694e-05, + "loss": 0.6334, + "step": 14300 + }, + { + "epoch": 0.4174813950094849, + "grad_norm": 0.921656305829327, + "learning_rate": 3.5333480556382145e-05, + "loss": 0.5223, + "step": 14305 + }, + { + "epoch": 0.417627316503721, + "grad_norm": 1.056612875777694, + "learning_rate": 3.532330122871511e-05, + "loss": 0.6025, + "step": 14310 + }, + { + "epoch": 0.4177732379979571, + "grad_norm": 1.0299752674288745, + "learning_rate": 3.531312007924626e-05, + "loss": 0.6141, + "step": 14315 + }, + { + "epoch": 0.4179191594921932, + "grad_norm": 1.104480192557672, + "learning_rate": 3.530293711034648e-05, + "loss": 0.6381, + "step": 14320 + }, + { + "epoch": 0.4180650809864293, + "grad_norm": 0.9994627633885773, + "learning_rate": 3.529275232438706e-05, + "loss": 0.5619, + "step": 14325 + }, + { + "epoch": 0.4182110024806654, + "grad_norm": 0.8397527690825002, + "learning_rate": 3.528256572373972e-05, + "loss": 0.5883, + "step": 14330 + }, + { + "epoch": 0.4183569239749015, + "grad_norm": 1.3507100766803446, + "learning_rate": 3.5272377310776587e-05, + "loss": 0.6187, + "step": 14335 + }, + { + "epoch": 0.41850284546913763, + "grad_norm": 1.1339780521382532, + "learning_rate": 3.526218708787024e-05, + "loss": 0.6476, + "step": 14340 + }, + { + "epoch": 0.4186487669633737, + "grad_norm": 1.075417424478344, + "learning_rate": 3.525199505739366e-05, + "loss": 0.6113, + "step": 14345 + }, + { + "epoch": 0.4187946884576098, + "grad_norm": 0.9655707341614594, + "learning_rate": 3.524180122172025e-05, + "loss": 0.6142, + "step": 14350 + }, + { + "epoch": 0.4189406099518459, + "grad_norm": 1.0030324258943075, + "learning_rate": 3.523160558322383e-05, + "loss": 0.6348, + "step": 14355 + }, + { + "epoch": 0.419086531446082, + "grad_norm": 1.124070642831555, + "learning_rate": 3.522140814427864e-05, + "loss": 0.5046, + "step": 14360 + }, + { + "epoch": 0.41923245294031813, + "grad_norm": 1.22871033614499, + "learning_rate": 3.5211208907259356e-05, + "loss": 0.6182, + "step": 14365 + }, + { + "epoch": 0.4193783744345542, + "grad_norm": 0.9811542020866325, + "learning_rate": 3.5201007874541064e-05, + "loss": 0.5788, + "step": 14370 + }, + { + "epoch": 0.4195242959287903, + "grad_norm": 1.1851259092855806, + "learning_rate": 3.519080504849925e-05, + "loss": 0.5824, + "step": 14375 + }, + { + "epoch": 0.4196702174230264, + "grad_norm": 1.0740830122313665, + "learning_rate": 3.5180600431509856e-05, + "loss": 0.5719, + "step": 14380 + }, + { + "epoch": 0.4198161389172625, + "grad_norm": 0.8662558134425952, + "learning_rate": 3.517039402594919e-05, + "loss": 0.5678, + "step": 14385 + }, + { + "epoch": 0.41996206041149864, + "grad_norm": 1.1409192121898903, + "learning_rate": 3.5160185834194016e-05, + "loss": 0.6169, + "step": 14390 + }, + { + "epoch": 0.42010798190573473, + "grad_norm": 0.960539445868417, + "learning_rate": 3.5149975858621516e-05, + "loss": 0.5259, + "step": 14395 + }, + { + "epoch": 0.4202539033999708, + "grad_norm": 0.9213290129950416, + "learning_rate": 3.513976410160924e-05, + "loss": 0.586, + "step": 14400 + }, + { + "epoch": 0.4203998248942069, + "grad_norm": 1.282168223289508, + "learning_rate": 3.5129550565535215e-05, + "loss": 0.6768, + "step": 14405 + }, + { + "epoch": 0.420545746388443, + "grad_norm": 0.9233702665083272, + "learning_rate": 3.5119335252777844e-05, + "loss": 0.619, + "step": 14410 + }, + { + "epoch": 0.42069166788267914, + "grad_norm": 1.095082350014174, + "learning_rate": 3.510911816571595e-05, + "loss": 0.6691, + "step": 14415 + }, + { + "epoch": 0.42083758937691523, + "grad_norm": 0.9325288171921042, + "learning_rate": 3.509889930672879e-05, + "loss": 0.5886, + "step": 14420 + }, + { + "epoch": 0.4209835108711513, + "grad_norm": 0.8672598065256684, + "learning_rate": 3.5088678678196e-05, + "loss": 0.5769, + "step": 14425 + }, + { + "epoch": 0.4211294323653874, + "grad_norm": 1.1388268128079726, + "learning_rate": 3.507845628249765e-05, + "loss": 0.6277, + "step": 14430 + }, + { + "epoch": 0.4212753538596235, + "grad_norm": 0.8462210710162821, + "learning_rate": 3.50682321220142e-05, + "loss": 0.572, + "step": 14435 + }, + { + "epoch": 0.42142127535385965, + "grad_norm": 1.0520908565130593, + "learning_rate": 3.505800619912656e-05, + "loss": 0.6226, + "step": 14440 + }, + { + "epoch": 0.42156719684809574, + "grad_norm": 0.847584599913274, + "learning_rate": 3.504777851621602e-05, + "loss": 0.5856, + "step": 14445 + }, + { + "epoch": 0.4217131183423318, + "grad_norm": 1.059548211609444, + "learning_rate": 3.503754907566427e-05, + "loss": 0.6076, + "step": 14450 + }, + { + "epoch": 0.4218590398365679, + "grad_norm": 1.2293196163149804, + "learning_rate": 3.502731787985345e-05, + "loss": 0.69, + "step": 14455 + }, + { + "epoch": 0.422004961330804, + "grad_norm": 1.072054277110249, + "learning_rate": 3.501708493116608e-05, + "loss": 0.5986, + "step": 14460 + }, + { + "epoch": 0.42215088282504015, + "grad_norm": 0.9702137632458881, + "learning_rate": 3.500685023198508e-05, + "loss": 0.5334, + "step": 14465 + }, + { + "epoch": 0.42229680431927624, + "grad_norm": 0.8771242097362763, + "learning_rate": 3.49966137846938e-05, + "loss": 0.5643, + "step": 14470 + }, + { + "epoch": 0.42244272581351233, + "grad_norm": 0.946588377018901, + "learning_rate": 3.498637559167599e-05, + "loss": 0.5689, + "step": 14475 + }, + { + "epoch": 0.4225886473077484, + "grad_norm": 1.1403335673419035, + "learning_rate": 3.49761356553158e-05, + "loss": 0.5718, + "step": 14480 + }, + { + "epoch": 0.4227345688019845, + "grad_norm": 1.1402872598041047, + "learning_rate": 3.496589397799779e-05, + "loss": 0.5776, + "step": 14485 + }, + { + "epoch": 0.42288049029622066, + "grad_norm": 1.0738909364394391, + "learning_rate": 3.495565056210693e-05, + "loss": 0.6311, + "step": 14490 + }, + { + "epoch": 0.42302641179045675, + "grad_norm": 0.9592686697343905, + "learning_rate": 3.4945405410028605e-05, + "loss": 0.6275, + "step": 14495 + }, + { + "epoch": 0.42317233328469284, + "grad_norm": 0.8961623619200217, + "learning_rate": 3.493515852414855e-05, + "loss": 0.604, + "step": 14500 + }, + { + "epoch": 0.4233182547789289, + "grad_norm": 0.9660838086441447, + "learning_rate": 3.492490990685298e-05, + "loss": 0.5785, + "step": 14505 + }, + { + "epoch": 0.423464176273165, + "grad_norm": 1.177025120962537, + "learning_rate": 3.491465956052846e-05, + "loss": 0.6541, + "step": 14510 + }, + { + "epoch": 0.42361009776740116, + "grad_norm": 0.9137641572032487, + "learning_rate": 3.4904407487561987e-05, + "loss": 0.5901, + "step": 14515 + }, + { + "epoch": 0.42375601926163725, + "grad_norm": 0.965328698268098, + "learning_rate": 3.4894153690340926e-05, + "loss": 0.6251, + "step": 14520 + }, + { + "epoch": 0.42390194075587334, + "grad_norm": 1.0448285479062196, + "learning_rate": 3.488389817125309e-05, + "loss": 0.6183, + "step": 14525 + }, + { + "epoch": 0.42404786225010943, + "grad_norm": 1.0833674364203354, + "learning_rate": 3.487364093268666e-05, + "loss": 0.553, + "step": 14530 + }, + { + "epoch": 0.4241937837443455, + "grad_norm": 1.1287078322651714, + "learning_rate": 3.486338197703021e-05, + "loss": 0.6124, + "step": 14535 + }, + { + "epoch": 0.42433970523858167, + "grad_norm": 1.0564906310867284, + "learning_rate": 3.485312130667275e-05, + "loss": 0.5833, + "step": 14540 + }, + { + "epoch": 0.42448562673281776, + "grad_norm": 1.0656181074987345, + "learning_rate": 3.4842858924003654e-05, + "loss": 0.6534, + "step": 14545 + }, + { + "epoch": 0.42463154822705385, + "grad_norm": 1.048484757568297, + "learning_rate": 3.4832594831412724e-05, + "loss": 0.5974, + "step": 14550 + }, + { + "epoch": 0.42477746972128994, + "grad_norm": 0.9794724082955425, + "learning_rate": 3.482232903129013e-05, + "loss": 0.6469, + "step": 14555 + }, + { + "epoch": 0.424923391215526, + "grad_norm": 1.0589184614388112, + "learning_rate": 3.481206152602647e-05, + "loss": 0.6166, + "step": 14560 + }, + { + "epoch": 0.42506931270976217, + "grad_norm": 0.964708633393379, + "learning_rate": 3.480179231801272e-05, + "loss": 0.5767, + "step": 14565 + }, + { + "epoch": 0.42521523420399826, + "grad_norm": 0.9952948771382572, + "learning_rate": 3.4791521409640256e-05, + "loss": 0.5994, + "step": 14570 + }, + { + "epoch": 0.42536115569823435, + "grad_norm": 0.8714708249182996, + "learning_rate": 3.478124880330084e-05, + "loss": 0.5989, + "step": 14575 + }, + { + "epoch": 0.42550707719247044, + "grad_norm": 1.2000549971076717, + "learning_rate": 3.477097450138666e-05, + "loss": 0.6266, + "step": 14580 + }, + { + "epoch": 0.42565299868670653, + "grad_norm": 1.054526289381861, + "learning_rate": 3.476069850629026e-05, + "loss": 0.6354, + "step": 14585 + }, + { + "epoch": 0.4257989201809427, + "grad_norm": 1.1340311171921678, + "learning_rate": 3.475042082040461e-05, + "loss": 0.6224, + "step": 14590 + }, + { + "epoch": 0.42594484167517876, + "grad_norm": 1.2247781164857576, + "learning_rate": 3.4740141446123046e-05, + "loss": 0.6242, + "step": 14595 + }, + { + "epoch": 0.42609076316941485, + "grad_norm": 1.2902071767432588, + "learning_rate": 3.472986038583934e-05, + "loss": 0.5977, + "step": 14600 + }, + { + "epoch": 0.42623668466365094, + "grad_norm": 0.9240076849811838, + "learning_rate": 3.471957764194761e-05, + "loss": 0.5447, + "step": 14605 + }, + { + "epoch": 0.42638260615788703, + "grad_norm": 0.9232150807016487, + "learning_rate": 3.4709293216842383e-05, + "loss": 0.5634, + "step": 14610 + }, + { + "epoch": 0.4265285276521232, + "grad_norm": 1.44191113136554, + "learning_rate": 3.469900711291858e-05, + "loss": 0.5829, + "step": 14615 + }, + { + "epoch": 0.42667444914635927, + "grad_norm": 1.0753292628866873, + "learning_rate": 3.4688719332571526e-05, + "loss": 0.6083, + "step": 14620 + }, + { + "epoch": 0.42682037064059536, + "grad_norm": 1.0144987080420538, + "learning_rate": 3.4678429878196905e-05, + "loss": 0.698, + "step": 14625 + }, + { + "epoch": 0.42696629213483145, + "grad_norm": 1.0989524795214847, + "learning_rate": 3.4668138752190815e-05, + "loss": 0.6716, + "step": 14630 + }, + { + "epoch": 0.42711221362906754, + "grad_norm": 1.1709035506008318, + "learning_rate": 3.4657845956949745e-05, + "loss": 0.6411, + "step": 14635 + }, + { + "epoch": 0.4272581351233037, + "grad_norm": 1.1138648921056695, + "learning_rate": 3.464755149487056e-05, + "loss": 0.5775, + "step": 14640 + }, + { + "epoch": 0.4274040566175398, + "grad_norm": 1.1812056106658904, + "learning_rate": 3.463725536835051e-05, + "loss": 0.5672, + "step": 14645 + }, + { + "epoch": 0.42754997811177586, + "grad_norm": 1.2241758595708385, + "learning_rate": 3.4626957579787244e-05, + "loss": 0.6744, + "step": 14650 + }, + { + "epoch": 0.42769589960601195, + "grad_norm": 0.9480877642606028, + "learning_rate": 3.4616658131578805e-05, + "loss": 0.6246, + "step": 14655 + }, + { + "epoch": 0.42784182110024804, + "grad_norm": 1.0003426543032912, + "learning_rate": 3.46063570261236e-05, + "loss": 0.6176, + "step": 14660 + }, + { + "epoch": 0.4279877425944842, + "grad_norm": 0.9356697760347451, + "learning_rate": 3.459605426582043e-05, + "loss": 0.5599, + "step": 14665 + }, + { + "epoch": 0.4281336640887203, + "grad_norm": 1.1163592928833672, + "learning_rate": 3.45857498530685e-05, + "loss": 0.5844, + "step": 14670 + }, + { + "epoch": 0.42827958558295637, + "grad_norm": 1.0829333685346898, + "learning_rate": 3.457544379026738e-05, + "loss": 0.6072, + "step": 14675 + }, + { + "epoch": 0.42842550707719246, + "grad_norm": 1.082003619700801, + "learning_rate": 3.456513607981703e-05, + "loss": 0.607, + "step": 14680 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.9186345387077305, + "learning_rate": 3.455482672411778e-05, + "loss": 0.6095, + "step": 14685 + }, + { + "epoch": 0.4287173500656647, + "grad_norm": 0.961724746355123, + "learning_rate": 3.454451572557036e-05, + "loss": 0.528, + "step": 14690 + }, + { + "epoch": 0.4288632715599008, + "grad_norm": 0.9231988902429026, + "learning_rate": 3.45342030865759e-05, + "loss": 0.58, + "step": 14695 + }, + { + "epoch": 0.4290091930541369, + "grad_norm": 1.067185859881822, + "learning_rate": 3.452388880953587e-05, + "loss": 0.6824, + "step": 14700 + }, + { + "epoch": 0.42915511454837296, + "grad_norm": 0.9610803245485077, + "learning_rate": 3.451357289685214e-05, + "loss": 0.5487, + "step": 14705 + }, + { + "epoch": 0.42930103604260905, + "grad_norm": 1.1642097834681657, + "learning_rate": 3.450325535092698e-05, + "loss": 0.6231, + "step": 14710 + }, + { + "epoch": 0.4294469575368452, + "grad_norm": 1.031502523325777, + "learning_rate": 3.449293617416301e-05, + "loss": 0.6245, + "step": 14715 + }, + { + "epoch": 0.4295928790310813, + "grad_norm": 1.2699125631894317, + "learning_rate": 3.448261536896324e-05, + "loss": 0.6418, + "step": 14720 + }, + { + "epoch": 0.4297388005253174, + "grad_norm": 0.9215147245324534, + "learning_rate": 3.4472292937731064e-05, + "loss": 0.6082, + "step": 14725 + }, + { + "epoch": 0.42988472201955347, + "grad_norm": 1.134100042377912, + "learning_rate": 3.4461968882870277e-05, + "loss": 0.582, + "step": 14730 + }, + { + "epoch": 0.43003064351378956, + "grad_norm": 1.0550186252394929, + "learning_rate": 3.4451643206784986e-05, + "loss": 0.5555, + "step": 14735 + }, + { + "epoch": 0.4301765650080257, + "grad_norm": 0.9470831240385473, + "learning_rate": 3.4441315911879756e-05, + "loss": 0.5782, + "step": 14740 + }, + { + "epoch": 0.4303224865022618, + "grad_norm": 0.8935949165620442, + "learning_rate": 3.443098700055947e-05, + "loss": 0.5637, + "step": 14745 + }, + { + "epoch": 0.4304684079964979, + "grad_norm": 0.9960770842223797, + "learning_rate": 3.442065647522941e-05, + "loss": 0.6152, + "step": 14750 + }, + { + "epoch": 0.43061432949073397, + "grad_norm": 0.8686976779723603, + "learning_rate": 3.441032433829523e-05, + "loss": 0.6471, + "step": 14755 + }, + { + "epoch": 0.43076025098497006, + "grad_norm": 1.2142185733703368, + "learning_rate": 3.439999059216297e-05, + "loss": 0.6019, + "step": 14760 + }, + { + "epoch": 0.4309061724792062, + "grad_norm": 0.9941572809716894, + "learning_rate": 3.438965523923903e-05, + "loss": 0.5698, + "step": 14765 + }, + { + "epoch": 0.4310520939734423, + "grad_norm": 1.1751590566908852, + "learning_rate": 3.437931828193019e-05, + "loss": 0.6197, + "step": 14770 + }, + { + "epoch": 0.4311980154676784, + "grad_norm": 1.1017336752785598, + "learning_rate": 3.436897972264361e-05, + "loss": 0.5696, + "step": 14775 + }, + { + "epoch": 0.4313439369619145, + "grad_norm": 0.9761712375978427, + "learning_rate": 3.4358639563786803e-05, + "loss": 0.5918, + "step": 14780 + }, + { + "epoch": 0.43148985845615057, + "grad_norm": 0.9627952198204527, + "learning_rate": 3.434829780776769e-05, + "loss": 0.5748, + "step": 14785 + }, + { + "epoch": 0.4316357799503867, + "grad_norm": 1.0358676092014028, + "learning_rate": 3.433795445699451e-05, + "loss": 0.5875, + "step": 14790 + }, + { + "epoch": 0.4317817014446228, + "grad_norm": 1.2272333712935142, + "learning_rate": 3.432760951387593e-05, + "loss": 0.6019, + "step": 14795 + }, + { + "epoch": 0.4319276229388589, + "grad_norm": 1.1675025999424626, + "learning_rate": 3.4317262980820955e-05, + "loss": 0.6488, + "step": 14800 + }, + { + "epoch": 0.432073544433095, + "grad_norm": 1.2682792520499573, + "learning_rate": 3.430691486023897e-05, + "loss": 0.5887, + "step": 14805 + }, + { + "epoch": 0.43221946592733107, + "grad_norm": 1.050471902364545, + "learning_rate": 3.4296565154539735e-05, + "loss": 0.5445, + "step": 14810 + }, + { + "epoch": 0.4323653874215672, + "grad_norm": 1.0074986947221647, + "learning_rate": 3.428621386613336e-05, + "loss": 0.6325, + "step": 14815 + }, + { + "epoch": 0.4325113089158033, + "grad_norm": 1.0463143726485051, + "learning_rate": 3.427586099743034e-05, + "loss": 0.6451, + "step": 14820 + }, + { + "epoch": 0.4326572304100394, + "grad_norm": 1.171815048880382, + "learning_rate": 3.426550655084154e-05, + "loss": 0.6042, + "step": 14825 + }, + { + "epoch": 0.4328031519042755, + "grad_norm": 0.8735331935608498, + "learning_rate": 3.4255150528778186e-05, + "loss": 0.6063, + "step": 14830 + }, + { + "epoch": 0.4329490733985116, + "grad_norm": 1.1073257796844498, + "learning_rate": 3.424479293365186e-05, + "loss": 0.6162, + "step": 14835 + }, + { + "epoch": 0.4330949948927477, + "grad_norm": 1.0463564294646084, + "learning_rate": 3.423443376787452e-05, + "loss": 0.664, + "step": 14840 + }, + { + "epoch": 0.4332409163869838, + "grad_norm": 0.9710156166509877, + "learning_rate": 3.422407303385851e-05, + "loss": 0.6058, + "step": 14845 + }, + { + "epoch": 0.4333868378812199, + "grad_norm": 1.1115312949177574, + "learning_rate": 3.421371073401651e-05, + "loss": 0.6342, + "step": 14850 + }, + { + "epoch": 0.433532759375456, + "grad_norm": 0.9612200796972009, + "learning_rate": 3.420334687076157e-05, + "loss": 0.5573, + "step": 14855 + }, + { + "epoch": 0.4336786808696921, + "grad_norm": 0.8982438896090172, + "learning_rate": 3.419298144650712e-05, + "loss": 0.6045, + "step": 14860 + }, + { + "epoch": 0.4338246023639282, + "grad_norm": 1.27416370256648, + "learning_rate": 3.418261446366693e-05, + "loss": 0.6287, + "step": 14865 + }, + { + "epoch": 0.4339705238581643, + "grad_norm": 1.1327345581771062, + "learning_rate": 3.417224592465516e-05, + "loss": 0.6665, + "step": 14870 + }, + { + "epoch": 0.4341164453524004, + "grad_norm": 1.2935850454568887, + "learning_rate": 3.4161875831886305e-05, + "loss": 0.6055, + "step": 14875 + }, + { + "epoch": 0.4342623668466365, + "grad_norm": 1.0063549724489365, + "learning_rate": 3.415150418777524e-05, + "loss": 0.5448, + "step": 14880 + }, + { + "epoch": 0.4344082883408726, + "grad_norm": 0.9395070160447475, + "learning_rate": 3.41411309947372e-05, + "loss": 0.5964, + "step": 14885 + }, + { + "epoch": 0.43455420983510873, + "grad_norm": 1.0128619887098098, + "learning_rate": 3.4130756255187766e-05, + "loss": 0.5398, + "step": 14890 + }, + { + "epoch": 0.4347001313293448, + "grad_norm": 1.1392782819000082, + "learning_rate": 3.41203799715429e-05, + "loss": 0.598, + "step": 14895 + }, + { + "epoch": 0.4348460528235809, + "grad_norm": 1.0167659566556178, + "learning_rate": 3.411000214621891e-05, + "loss": 0.6286, + "step": 14900 + }, + { + "epoch": 0.434991974317817, + "grad_norm": 0.9279072705353846, + "learning_rate": 3.4099622781632464e-05, + "loss": 0.5561, + "step": 14905 + }, + { + "epoch": 0.4351378958120531, + "grad_norm": 0.9722983712918133, + "learning_rate": 3.4089241880200593e-05, + "loss": 0.5854, + "step": 14910 + }, + { + "epoch": 0.43528381730628923, + "grad_norm": 1.047881147011593, + "learning_rate": 3.407885944434068e-05, + "loss": 0.6314, + "step": 14915 + }, + { + "epoch": 0.4354297388005253, + "grad_norm": 0.9253653074681496, + "learning_rate": 3.406847547647049e-05, + "loss": 0.6362, + "step": 14920 + }, + { + "epoch": 0.4355756602947614, + "grad_norm": 1.0614789017596018, + "learning_rate": 3.4058089979008095e-05, + "loss": 0.585, + "step": 14925 + }, + { + "epoch": 0.4357215817889975, + "grad_norm": 1.0619885524342625, + "learning_rate": 3.404770295437197e-05, + "loss": 0.6204, + "step": 14930 + }, + { + "epoch": 0.4358675032832336, + "grad_norm": 0.9489630508453383, + "learning_rate": 3.403731440498092e-05, + "loss": 0.513, + "step": 14935 + }, + { + "epoch": 0.43601342477746974, + "grad_norm": 0.9737341109412176, + "learning_rate": 3.402692433325412e-05, + "loss": 0.6417, + "step": 14940 + }, + { + "epoch": 0.43615934627170583, + "grad_norm": 1.0447307011395082, + "learning_rate": 3.401653274161107e-05, + "loss": 0.5915, + "step": 14945 + }, + { + "epoch": 0.4363052677659419, + "grad_norm": 0.9676710840373857, + "learning_rate": 3.4006139632471675e-05, + "loss": 0.5659, + "step": 14950 + }, + { + "epoch": 0.436451189260178, + "grad_norm": 0.9253596332734276, + "learning_rate": 3.3995745008256146e-05, + "loss": 0.5598, + "step": 14955 + }, + { + "epoch": 0.43659711075441415, + "grad_norm": 1.0179512650412093, + "learning_rate": 3.398534887138508e-05, + "loss": 0.5717, + "step": 14960 + }, + { + "epoch": 0.43674303224865024, + "grad_norm": 1.041646236743334, + "learning_rate": 3.39749512242794e-05, + "loss": 0.5489, + "step": 14965 + }, + { + "epoch": 0.43688895374288633, + "grad_norm": 0.9942978016835857, + "learning_rate": 3.39645520693604e-05, + "loss": 0.5674, + "step": 14970 + }, + { + "epoch": 0.4370348752371224, + "grad_norm": 1.0022983503636906, + "learning_rate": 3.395415140904971e-05, + "loss": 0.567, + "step": 14975 + }, + { + "epoch": 0.4371807967313585, + "grad_norm": 1.0642492384656053, + "learning_rate": 3.394374924576932e-05, + "loss": 0.6156, + "step": 14980 + }, + { + "epoch": 0.43732671822559466, + "grad_norm": 0.9379956570349489, + "learning_rate": 3.3933345581941564e-05, + "loss": 0.6172, + "step": 14985 + }, + { + "epoch": 0.43747263971983075, + "grad_norm": 0.966830581931971, + "learning_rate": 3.392294041998914e-05, + "loss": 0.6503, + "step": 14990 + }, + { + "epoch": 0.43761856121406684, + "grad_norm": 0.9487043663364297, + "learning_rate": 3.391253376233508e-05, + "loss": 0.6219, + "step": 14995 + }, + { + "epoch": 0.4377644827083029, + "grad_norm": 1.3401538829977264, + "learning_rate": 3.390212561140277e-05, + "loss": 0.6602, + "step": 15000 + }, + { + "epoch": 0.437910404202539, + "grad_norm": 1.082827187229185, + "learning_rate": 3.389171596961594e-05, + "loss": 0.6024, + "step": 15005 + }, + { + "epoch": 0.43805632569677516, + "grad_norm": 1.2588159317785443, + "learning_rate": 3.388130483939867e-05, + "loss": 0.6019, + "step": 15010 + }, + { + "epoch": 0.43820224719101125, + "grad_norm": 0.8167574099841444, + "learning_rate": 3.387089222317538e-05, + "loss": 0.5474, + "step": 15015 + }, + { + "epoch": 0.43834816868524734, + "grad_norm": 1.0582371762697391, + "learning_rate": 3.386047812337085e-05, + "loss": 0.5479, + "step": 15020 + }, + { + "epoch": 0.43849409017948343, + "grad_norm": 1.2014907263197525, + "learning_rate": 3.385006254241019e-05, + "loss": 0.6313, + "step": 15025 + }, + { + "epoch": 0.4386400116737195, + "grad_norm": 0.922362904638179, + "learning_rate": 3.3839645482718876e-05, + "loss": 0.5868, + "step": 15030 + }, + { + "epoch": 0.43878593316795567, + "grad_norm": 0.9909309871937789, + "learning_rate": 3.382922694672272e-05, + "loss": 0.5811, + "step": 15035 + }, + { + "epoch": 0.43893185466219176, + "grad_norm": 0.8915043010921688, + "learning_rate": 3.381880693684784e-05, + "loss": 0.5946, + "step": 15040 + }, + { + "epoch": 0.43907777615642785, + "grad_norm": 1.014321636298506, + "learning_rate": 3.380838545552075e-05, + "loss": 0.5328, + "step": 15045 + }, + { + "epoch": 0.43922369765066394, + "grad_norm": 0.9783062774969534, + "learning_rate": 3.3797962505168295e-05, + "loss": 0.6209, + "step": 15050 + }, + { + "epoch": 0.4393696191449, + "grad_norm": 1.0502065097108184, + "learning_rate": 3.378753808821764e-05, + "loss": 0.5312, + "step": 15055 + }, + { + "epoch": 0.43951554063913617, + "grad_norm": 0.9095734806730934, + "learning_rate": 3.3777112207096315e-05, + "loss": 0.5806, + "step": 15060 + }, + { + "epoch": 0.43966146213337226, + "grad_norm": 1.1093471595482236, + "learning_rate": 3.376668486423216e-05, + "loss": 0.6112, + "step": 15065 + }, + { + "epoch": 0.43980738362760835, + "grad_norm": 1.1927504053347295, + "learning_rate": 3.375625606205342e-05, + "loss": 0.6168, + "step": 15070 + }, + { + "epoch": 0.43995330512184444, + "grad_norm": 1.007807674891563, + "learning_rate": 3.374582580298859e-05, + "loss": 0.6941, + "step": 15075 + }, + { + "epoch": 0.44009922661608053, + "grad_norm": 1.1135702503202363, + "learning_rate": 3.3735394089466574e-05, + "loss": 0.596, + "step": 15080 + }, + { + "epoch": 0.4402451481103167, + "grad_norm": 0.8416660886898341, + "learning_rate": 3.372496092391659e-05, + "loss": 0.6069, + "step": 15085 + }, + { + "epoch": 0.44039106960455277, + "grad_norm": 0.9626430180071739, + "learning_rate": 3.3714526308768177e-05, + "loss": 0.6245, + "step": 15090 + }, + { + "epoch": 0.44053699109878885, + "grad_norm": 0.9345578557755848, + "learning_rate": 3.3704090246451264e-05, + "loss": 0.596, + "step": 15095 + }, + { + "epoch": 0.44068291259302494, + "grad_norm": 1.1071101407204926, + "learning_rate": 3.3693652739396054e-05, + "loss": 0.6044, + "step": 15100 + }, + { + "epoch": 0.44082883408726103, + "grad_norm": 1.0470767070671643, + "learning_rate": 3.368321379003313e-05, + "loss": 0.5579, + "step": 15105 + }, + { + "epoch": 0.4409747555814972, + "grad_norm": 1.096572528531247, + "learning_rate": 3.367277340079339e-05, + "loss": 0.5821, + "step": 15110 + }, + { + "epoch": 0.44112067707573327, + "grad_norm": 0.9463021632800804, + "learning_rate": 3.366233157410808e-05, + "loss": 0.5732, + "step": 15115 + }, + { + "epoch": 0.44126659856996936, + "grad_norm": 0.9886869477006601, + "learning_rate": 3.3651888312408766e-05, + "loss": 0.6171, + "step": 15120 + }, + { + "epoch": 0.44141252006420545, + "grad_norm": 0.9817643653603855, + "learning_rate": 3.3641443618127366e-05, + "loss": 0.6247, + "step": 15125 + }, + { + "epoch": 0.44155844155844154, + "grad_norm": 1.042498855092766, + "learning_rate": 3.363099749369612e-05, + "loss": 0.586, + "step": 15130 + }, + { + "epoch": 0.4417043630526777, + "grad_norm": 0.9696650778195526, + "learning_rate": 3.362054994154758e-05, + "loss": 0.6047, + "step": 15135 + }, + { + "epoch": 0.4418502845469138, + "grad_norm": 1.0350340472103092, + "learning_rate": 3.361010096411469e-05, + "loss": 0.574, + "step": 15140 + }, + { + "epoch": 0.44199620604114986, + "grad_norm": 1.0501103392445714, + "learning_rate": 3.359965056383068e-05, + "loss": 0.5968, + "step": 15145 + }, + { + "epoch": 0.44214212753538595, + "grad_norm": 0.9489985685780461, + "learning_rate": 3.3589198743129104e-05, + "loss": 0.5949, + "step": 15150 + }, + { + "epoch": 0.44228804902962204, + "grad_norm": 0.9935727131012589, + "learning_rate": 3.357874550444388e-05, + "loss": 0.6468, + "step": 15155 + }, + { + "epoch": 0.4424339705238582, + "grad_norm": 1.1224827913390254, + "learning_rate": 3.356829085020922e-05, + "loss": 0.6021, + "step": 15160 + }, + { + "epoch": 0.4425798920180943, + "grad_norm": 1.112257596305318, + "learning_rate": 3.355783478285971e-05, + "loss": 0.6193, + "step": 15165 + }, + { + "epoch": 0.44272581351233037, + "grad_norm": 1.0067536116452678, + "learning_rate": 3.354737730483023e-05, + "loss": 0.6309, + "step": 15170 + }, + { + "epoch": 0.44287173500656646, + "grad_norm": 1.036330841820235, + "learning_rate": 3.353691841855599e-05, + "loss": 0.5378, + "step": 15175 + }, + { + "epoch": 0.44301765650080255, + "grad_norm": 0.9705157807120757, + "learning_rate": 3.352645812647256e-05, + "loss": 0.6129, + "step": 15180 + }, + { + "epoch": 0.4431635779950387, + "grad_norm": 0.9550643259820844, + "learning_rate": 3.351599643101579e-05, + "loss": 0.5603, + "step": 15185 + }, + { + "epoch": 0.4433094994892748, + "grad_norm": 0.938996838157557, + "learning_rate": 3.3505533334621894e-05, + "loss": 0.5646, + "step": 15190 + }, + { + "epoch": 0.4434554209835109, + "grad_norm": 0.9569242456275914, + "learning_rate": 3.349506883972739e-05, + "loss": 0.565, + "step": 15195 + }, + { + "epoch": 0.44360134247774696, + "grad_norm": 1.024650959083687, + "learning_rate": 3.348460294876915e-05, + "loss": 0.5885, + "step": 15200 + }, + { + "epoch": 0.44374726397198305, + "grad_norm": 0.982234310306115, + "learning_rate": 3.347413566418434e-05, + "loss": 0.5601, + "step": 15205 + }, + { + "epoch": 0.4438931854662192, + "grad_norm": 0.9407601324461912, + "learning_rate": 3.3463666988410454e-05, + "loss": 0.5952, + "step": 15210 + }, + { + "epoch": 0.4440391069604553, + "grad_norm": 1.061224155639477, + "learning_rate": 3.345319692388533e-05, + "loss": 0.6029, + "step": 15215 + }, + { + "epoch": 0.4441850284546914, + "grad_norm": 1.0250934349925327, + "learning_rate": 3.344272547304712e-05, + "loss": 0.6459, + "step": 15220 + }, + { + "epoch": 0.44433094994892747, + "grad_norm": 0.9277423655168385, + "learning_rate": 3.3432252638334285e-05, + "loss": 0.5841, + "step": 15225 + }, + { + "epoch": 0.44447687144316356, + "grad_norm": 1.0519419008985296, + "learning_rate": 3.3421778422185635e-05, + "loss": 0.5901, + "step": 15230 + }, + { + "epoch": 0.4446227929373997, + "grad_norm": 1.091299003832944, + "learning_rate": 3.3411302827040275e-05, + "loss": 0.6318, + "step": 15235 + }, + { + "epoch": 0.4447687144316358, + "grad_norm": 1.1553049277039817, + "learning_rate": 3.340082585533765e-05, + "loss": 0.6027, + "step": 15240 + }, + { + "epoch": 0.4449146359258719, + "grad_norm": 0.9515248979775799, + "learning_rate": 3.339034750951751e-05, + "loss": 0.5822, + "step": 15245 + }, + { + "epoch": 0.44506055742010797, + "grad_norm": 0.8411647701571835, + "learning_rate": 3.3379867792019945e-05, + "loss": 0.6079, + "step": 15250 + }, + { + "epoch": 0.44520647891434406, + "grad_norm": 1.0222166659217409, + "learning_rate": 3.3369386705285346e-05, + "loss": 0.6314, + "step": 15255 + }, + { + "epoch": 0.4453524004085802, + "grad_norm": 1.0955096604615973, + "learning_rate": 3.3358904251754436e-05, + "loss": 0.5889, + "step": 15260 + }, + { + "epoch": 0.4454983219028163, + "grad_norm": 0.9660014397522572, + "learning_rate": 3.3348420433868235e-05, + "loss": 0.5217, + "step": 15265 + }, + { + "epoch": 0.4456442433970524, + "grad_norm": 1.0649112114660237, + "learning_rate": 3.3337935254068116e-05, + "loss": 0.642, + "step": 15270 + }, + { + "epoch": 0.4457901648912885, + "grad_norm": 1.0646489450179117, + "learning_rate": 3.332744871479573e-05, + "loss": 0.64, + "step": 15275 + }, + { + "epoch": 0.44593608638552457, + "grad_norm": 1.0210810091645277, + "learning_rate": 3.331696081849308e-05, + "loss": 0.5859, + "step": 15280 + }, + { + "epoch": 0.4460820078797607, + "grad_norm": 1.1013563955223056, + "learning_rate": 3.3306471567602454e-05, + "loss": 0.5398, + "step": 15285 + }, + { + "epoch": 0.4462279293739968, + "grad_norm": 0.9582345179102, + "learning_rate": 3.329598096456648e-05, + "loss": 0.5985, + "step": 15290 + }, + { + "epoch": 0.4463738508682329, + "grad_norm": 1.4801209189499522, + "learning_rate": 3.32854890118281e-05, + "loss": 0.6338, + "step": 15295 + }, + { + "epoch": 0.446519772362469, + "grad_norm": 1.0595813889613248, + "learning_rate": 3.327499571183054e-05, + "loss": 0.6071, + "step": 15300 + }, + { + "epoch": 0.44666569385670507, + "grad_norm": 0.9701932680779322, + "learning_rate": 3.3264501067017365e-05, + "loss": 0.5698, + "step": 15305 + }, + { + "epoch": 0.4468116153509412, + "grad_norm": 0.9691293912955153, + "learning_rate": 3.325400507983245e-05, + "loss": 0.6545, + "step": 15310 + }, + { + "epoch": 0.4469575368451773, + "grad_norm": 1.3766902005369608, + "learning_rate": 3.3243507752719996e-05, + "loss": 0.5278, + "step": 15315 + }, + { + "epoch": 0.4471034583394134, + "grad_norm": 1.0940274184668297, + "learning_rate": 3.32330090881245e-05, + "loss": 0.5736, + "step": 15320 + }, + { + "epoch": 0.4472493798336495, + "grad_norm": 0.9003880547172795, + "learning_rate": 3.3222509088490745e-05, + "loss": 0.6181, + "step": 15325 + }, + { + "epoch": 0.4473953013278856, + "grad_norm": 1.221717558449494, + "learning_rate": 3.3212007756263886e-05, + "loss": 0.5919, + "step": 15330 + }, + { + "epoch": 0.4475412228221217, + "grad_norm": 0.978352758371311, + "learning_rate": 3.3201505093889324e-05, + "loss": 0.6969, + "step": 15335 + }, + { + "epoch": 0.4476871443163578, + "grad_norm": 1.0808389494346902, + "learning_rate": 3.3191001103812816e-05, + "loss": 0.594, + "step": 15340 + }, + { + "epoch": 0.4478330658105939, + "grad_norm": 0.968828869070613, + "learning_rate": 3.318049578848042e-05, + "loss": 0.5706, + "step": 15345 + }, + { + "epoch": 0.44797898730483, + "grad_norm": 1.4936361212984097, + "learning_rate": 3.3169989150338474e-05, + "loss": 0.6418, + "step": 15350 + }, + { + "epoch": 0.4481249087990661, + "grad_norm": 1.1277461384053353, + "learning_rate": 3.315948119183367e-05, + "loss": 0.5557, + "step": 15355 + }, + { + "epoch": 0.4482708302933022, + "grad_norm": 1.0174018532199776, + "learning_rate": 3.314897191541297e-05, + "loss": 0.5855, + "step": 15360 + }, + { + "epoch": 0.4484167517875383, + "grad_norm": 1.0746846227499856, + "learning_rate": 3.313846132352365e-05, + "loss": 0.6264, + "step": 15365 + }, + { + "epoch": 0.4485626732817744, + "grad_norm": 1.0494340328477192, + "learning_rate": 3.3127949418613304e-05, + "loss": 0.5639, + "step": 15370 + }, + { + "epoch": 0.4487085947760105, + "grad_norm": 0.9646150372309438, + "learning_rate": 3.311743620312982e-05, + "loss": 0.6229, + "step": 15375 + }, + { + "epoch": 0.4488545162702466, + "grad_norm": 1.0103380082583522, + "learning_rate": 3.3106921679521405e-05, + "loss": 0.5708, + "step": 15380 + }, + { + "epoch": 0.44900043776448273, + "grad_norm": 0.9699920865509774, + "learning_rate": 3.309640585023655e-05, + "loss": 0.6199, + "step": 15385 + }, + { + "epoch": 0.4491463592587188, + "grad_norm": 0.9405598917719894, + "learning_rate": 3.308588871772408e-05, + "loss": 0.5522, + "step": 15390 + }, + { + "epoch": 0.4492922807529549, + "grad_norm": 1.1340629498309085, + "learning_rate": 3.307537028443309e-05, + "loss": 0.5464, + "step": 15395 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 1.263708817457306, + "learning_rate": 3.3064850552812996e-05, + "loss": 0.6057, + "step": 15400 + }, + { + "epoch": 0.4495841237414271, + "grad_norm": 0.9614440944512271, + "learning_rate": 3.305432952531352e-05, + "loss": 0.5802, + "step": 15405 + }, + { + "epoch": 0.44973004523566323, + "grad_norm": 1.0618567661042626, + "learning_rate": 3.304380720438468e-05, + "loss": 0.6304, + "step": 15410 + }, + { + "epoch": 0.4498759667298993, + "grad_norm": 1.4074500590171208, + "learning_rate": 3.3033283592476786e-05, + "loss": 0.646, + "step": 15415 + }, + { + "epoch": 0.4500218882241354, + "grad_norm": 1.069658930890198, + "learning_rate": 3.302275869204047e-05, + "loss": 0.577, + "step": 15420 + }, + { + "epoch": 0.4501678097183715, + "grad_norm": 1.2068179667633927, + "learning_rate": 3.301223250552664e-05, + "loss": 0.637, + "step": 15425 + }, + { + "epoch": 0.4503137312126076, + "grad_norm": 1.1090883088641412, + "learning_rate": 3.300170503538653e-05, + "loss": 0.6254, + "step": 15430 + }, + { + "epoch": 0.45045965270684374, + "grad_norm": 0.9145535769414993, + "learning_rate": 3.299117628407163e-05, + "loss": 0.6333, + "step": 15435 + }, + { + "epoch": 0.45060557420107983, + "grad_norm": 1.0470490795968856, + "learning_rate": 3.2980646254033787e-05, + "loss": 0.6173, + "step": 15440 + }, + { + "epoch": 0.4507514956953159, + "grad_norm": 0.9414471742592865, + "learning_rate": 3.29701149477251e-05, + "loss": 0.576, + "step": 15445 + }, + { + "epoch": 0.450897417189552, + "grad_norm": 1.027438451466081, + "learning_rate": 3.295958236759799e-05, + "loss": 0.6058, + "step": 15450 + }, + { + "epoch": 0.4510433386837881, + "grad_norm": 1.3833992470231662, + "learning_rate": 3.294904851610514e-05, + "loss": 0.6049, + "step": 15455 + }, + { + "epoch": 0.45118926017802424, + "grad_norm": 1.2449868103003836, + "learning_rate": 3.2938513395699585e-05, + "loss": 0.5943, + "step": 15460 + }, + { + "epoch": 0.45133518167226033, + "grad_norm": 0.9851991930409953, + "learning_rate": 3.292797700883461e-05, + "loss": 0.5943, + "step": 15465 + }, + { + "epoch": 0.4514811031664964, + "grad_norm": 1.0407686194741652, + "learning_rate": 3.291743935796381e-05, + "loss": 0.5492, + "step": 15470 + }, + { + "epoch": 0.4516270246607325, + "grad_norm": 0.9530060947491329, + "learning_rate": 3.290690044554108e-05, + "loss": 0.6019, + "step": 15475 + }, + { + "epoch": 0.4517729461549686, + "grad_norm": 0.8823918455499635, + "learning_rate": 3.2896360274020596e-05, + "loss": 0.5407, + "step": 15480 + }, + { + "epoch": 0.45191886764920475, + "grad_norm": 0.9354243929459383, + "learning_rate": 3.288581884585683e-05, + "loss": 0.5088, + "step": 15485 + }, + { + "epoch": 0.45206478914344084, + "grad_norm": 0.8766881555579561, + "learning_rate": 3.287527616350456e-05, + "loss": 0.5206, + "step": 15490 + }, + { + "epoch": 0.4522107106376769, + "grad_norm": 1.0957480159060131, + "learning_rate": 3.286473222941884e-05, + "loss": 0.5418, + "step": 15495 + }, + { + "epoch": 0.452356632131913, + "grad_norm": 1.07198053955402, + "learning_rate": 3.2854187046055024e-05, + "loss": 0.6047, + "step": 15500 + }, + { + "epoch": 0.4525025536261491, + "grad_norm": 0.9212762341843745, + "learning_rate": 3.284364061586876e-05, + "loss": 0.5956, + "step": 15505 + }, + { + "epoch": 0.45264847512038525, + "grad_norm": 0.9743527744295987, + "learning_rate": 3.2833092941315975e-05, + "loss": 0.6023, + "step": 15510 + }, + { + "epoch": 0.45279439661462134, + "grad_norm": 1.0438093504371246, + "learning_rate": 3.282254402485289e-05, + "loss": 0.6041, + "step": 15515 + }, + { + "epoch": 0.45294031810885743, + "grad_norm": 1.2693671668816704, + "learning_rate": 3.2811993868936024e-05, + "loss": 0.5524, + "step": 15520 + }, + { + "epoch": 0.4530862396030935, + "grad_norm": 0.989367182498016, + "learning_rate": 3.280144247602217e-05, + "loss": 0.5957, + "step": 15525 + }, + { + "epoch": 0.4532321610973296, + "grad_norm": 0.8592103927843617, + "learning_rate": 3.2790889848568416e-05, + "loss": 0.5775, + "step": 15530 + }, + { + "epoch": 0.45337808259156576, + "grad_norm": 1.0883419221677124, + "learning_rate": 3.2780335989032164e-05, + "loss": 0.6629, + "step": 15535 + }, + { + "epoch": 0.45352400408580185, + "grad_norm": 1.0326728842703878, + "learning_rate": 3.276978089987104e-05, + "loss": 0.5678, + "step": 15540 + }, + { + "epoch": 0.45366992558003794, + "grad_norm": 0.9331684749223301, + "learning_rate": 3.275922458354302e-05, + "loss": 0.6306, + "step": 15545 + }, + { + "epoch": 0.453815847074274, + "grad_norm": 1.0652826440564773, + "learning_rate": 3.2748667042506326e-05, + "loss": 0.5582, + "step": 15550 + }, + { + "epoch": 0.4539617685685101, + "grad_norm": 1.2697183729862582, + "learning_rate": 3.2738108279219484e-05, + "loss": 0.6246, + "step": 15555 + }, + { + "epoch": 0.45410769006274626, + "grad_norm": 0.9561355340400615, + "learning_rate": 3.27275482961413e-05, + "loss": 0.5685, + "step": 15560 + }, + { + "epoch": 0.45425361155698235, + "grad_norm": 1.1375440726329225, + "learning_rate": 3.2716987095730854e-05, + "loss": 0.624, + "step": 15565 + }, + { + "epoch": 0.45439953305121844, + "grad_norm": 1.251949342269883, + "learning_rate": 3.270642468044753e-05, + "loss": 0.6375, + "step": 15570 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.9445923297152842, + "learning_rate": 3.269586105275098e-05, + "loss": 0.5817, + "step": 15575 + }, + { + "epoch": 0.4546913760396906, + "grad_norm": 1.1316410560885413, + "learning_rate": 3.268529621510115e-05, + "loss": 0.6748, + "step": 15580 + }, + { + "epoch": 0.45483729753392677, + "grad_norm": 0.9441436662422065, + "learning_rate": 3.2674730169958234e-05, + "loss": 0.5859, + "step": 15585 + }, + { + "epoch": 0.45498321902816286, + "grad_norm": 0.9069016560298085, + "learning_rate": 3.2664162919782765e-05, + "loss": 0.5871, + "step": 15590 + }, + { + "epoch": 0.45512914052239895, + "grad_norm": 1.00714936401814, + "learning_rate": 3.26535944670355e-05, + "loss": 0.6045, + "step": 15595 + }, + { + "epoch": 0.45527506201663503, + "grad_norm": 0.9963012408117092, + "learning_rate": 3.264302481417751e-05, + "loss": 0.5278, + "step": 15600 + }, + { + "epoch": 0.4554209835108711, + "grad_norm": 0.9763836435771481, + "learning_rate": 3.263245396367015e-05, + "loss": 0.5498, + "step": 15605 + }, + { + "epoch": 0.45556690500510727, + "grad_norm": 1.283773050445861, + "learning_rate": 3.2621881917975014e-05, + "loss": 0.6138, + "step": 15610 + }, + { + "epoch": 0.45571282649934336, + "grad_norm": 1.1166999859464244, + "learning_rate": 3.261130867955403e-05, + "loss": 0.6111, + "step": 15615 + }, + { + "epoch": 0.45585874799357945, + "grad_norm": 0.9286310253959584, + "learning_rate": 3.2600734250869333e-05, + "loss": 0.5983, + "step": 15620 + }, + { + "epoch": 0.45600466948781554, + "grad_norm": 1.2083062434129777, + "learning_rate": 3.259015863438341e-05, + "loss": 0.596, + "step": 15625 + }, + { + "epoch": 0.45615059098205163, + "grad_norm": 1.6283205043882343, + "learning_rate": 3.257958183255899e-05, + "loss": 0.5995, + "step": 15630 + }, + { + "epoch": 0.4562965124762878, + "grad_norm": 0.9109724092675541, + "learning_rate": 3.2569003847859056e-05, + "loss": 0.5983, + "step": 15635 + }, + { + "epoch": 0.45644243397052386, + "grad_norm": 1.1087699477704822, + "learning_rate": 3.255842468274691e-05, + "loss": 0.5714, + "step": 15640 + }, + { + "epoch": 0.45658835546475995, + "grad_norm": 1.0506735351840697, + "learning_rate": 3.254784433968611e-05, + "loss": 0.574, + "step": 15645 + }, + { + "epoch": 0.45673427695899604, + "grad_norm": 1.2098919805362127, + "learning_rate": 3.2537262821140465e-05, + "loss": 0.5639, + "step": 15650 + }, + { + "epoch": 0.45688019845323213, + "grad_norm": 1.0338719695370537, + "learning_rate": 3.25266801295741e-05, + "loss": 0.6475, + "step": 15655 + }, + { + "epoch": 0.4570261199474683, + "grad_norm": 1.1459744175517204, + "learning_rate": 3.251609626745137e-05, + "loss": 0.5843, + "step": 15660 + }, + { + "epoch": 0.45717204144170437, + "grad_norm": 1.2885086908276957, + "learning_rate": 3.250551123723696e-05, + "loss": 0.6055, + "step": 15665 + }, + { + "epoch": 0.45731796293594046, + "grad_norm": 1.0478937924211262, + "learning_rate": 3.2494925041395755e-05, + "loss": 0.6184, + "step": 15670 + }, + { + "epoch": 0.45746388443017655, + "grad_norm": 1.1573852810061225, + "learning_rate": 3.2484337682392976e-05, + "loss": 0.6093, + "step": 15675 + }, + { + "epoch": 0.4576098059244127, + "grad_norm": 0.8407046206846887, + "learning_rate": 3.247374916269407e-05, + "loss": 0.5653, + "step": 15680 + }, + { + "epoch": 0.4577557274186488, + "grad_norm": 1.2137561234384373, + "learning_rate": 3.246315948476479e-05, + "loss": 0.5505, + "step": 15685 + }, + { + "epoch": 0.4579016489128849, + "grad_norm": 0.9239642804148287, + "learning_rate": 3.245256865107111e-05, + "loss": 0.5703, + "step": 15690 + }, + { + "epoch": 0.45804757040712096, + "grad_norm": 1.1111061635795683, + "learning_rate": 3.2441976664079326e-05, + "loss": 0.5917, + "step": 15695 + }, + { + "epoch": 0.45819349190135705, + "grad_norm": 0.9292923152123604, + "learning_rate": 3.243138352625598e-05, + "loss": 0.5305, + "step": 15700 + }, + { + "epoch": 0.4583394133955932, + "grad_norm": 1.1360188916673053, + "learning_rate": 3.242078924006787e-05, + "loss": 0.5992, + "step": 15705 + }, + { + "epoch": 0.4584853348898293, + "grad_norm": 1.1695538741765779, + "learning_rate": 3.241019380798209e-05, + "loss": 0.6202, + "step": 15710 + }, + { + "epoch": 0.4586312563840654, + "grad_norm": 0.9597372734198713, + "learning_rate": 3.2399597232465976e-05, + "loss": 0.5477, + "step": 15715 + }, + { + "epoch": 0.45877717787830147, + "grad_norm": 0.9525963551637229, + "learning_rate": 3.238899951598713e-05, + "loss": 0.6136, + "step": 15720 + }, + { + "epoch": 0.45892309937253756, + "grad_norm": 1.093767354710571, + "learning_rate": 3.237840066101344e-05, + "loss": 0.6348, + "step": 15725 + }, + { + "epoch": 0.4590690208667737, + "grad_norm": 1.014376923467261, + "learning_rate": 3.2367800670013035e-05, + "loss": 0.6575, + "step": 15730 + }, + { + "epoch": 0.4592149423610098, + "grad_norm": 0.963490668162711, + "learning_rate": 3.235719954545434e-05, + "loss": 0.6087, + "step": 15735 + }, + { + "epoch": 0.4593608638552459, + "grad_norm": 0.9777063123112228, + "learning_rate": 3.2346597289806005e-05, + "loss": 0.5823, + "step": 15740 + }, + { + "epoch": 0.45950678534948197, + "grad_norm": 0.9306426212604956, + "learning_rate": 3.233599390553698e-05, + "loss": 0.5817, + "step": 15745 + }, + { + "epoch": 0.45965270684371806, + "grad_norm": 0.9934307294628575, + "learning_rate": 3.2325389395116445e-05, + "loss": 0.6817, + "step": 15750 + }, + { + "epoch": 0.4597986283379542, + "grad_norm": 1.0248496278926615, + "learning_rate": 3.2314783761013876e-05, + "loss": 0.543, + "step": 15755 + }, + { + "epoch": 0.4599445498321903, + "grad_norm": 0.9492459367095261, + "learning_rate": 3.2304177005698985e-05, + "loss": 0.6236, + "step": 15760 + }, + { + "epoch": 0.4600904713264264, + "grad_norm": 1.023610604436513, + "learning_rate": 3.229356913164174e-05, + "loss": 0.5156, + "step": 15765 + }, + { + "epoch": 0.4602363928206625, + "grad_norm": 1.047680341406216, + "learning_rate": 3.2282960141312404e-05, + "loss": 0.6315, + "step": 15770 + }, + { + "epoch": 0.46038231431489857, + "grad_norm": 0.997127441253729, + "learning_rate": 3.2272350037181456e-05, + "loss": 0.5839, + "step": 15775 + }, + { + "epoch": 0.4605282358091347, + "grad_norm": 1.0417916231889532, + "learning_rate": 3.226173882171968e-05, + "loss": 0.6547, + "step": 15780 + }, + { + "epoch": 0.4606741573033708, + "grad_norm": 1.0631821333318783, + "learning_rate": 3.225112649739808e-05, + "loss": 0.5847, + "step": 15785 + }, + { + "epoch": 0.4608200787976069, + "grad_norm": 0.9716857898144506, + "learning_rate": 3.224051306668795e-05, + "loss": 0.5793, + "step": 15790 + }, + { + "epoch": 0.460966000291843, + "grad_norm": 1.0047720592026088, + "learning_rate": 3.22298985320608e-05, + "loss": 0.5995, + "step": 15795 + }, + { + "epoch": 0.46111192178607907, + "grad_norm": 1.3832403520722822, + "learning_rate": 3.2219282895988445e-05, + "loss": 0.588, + "step": 15800 + }, + { + "epoch": 0.4612578432803152, + "grad_norm": 1.019242116775801, + "learning_rate": 3.220866616094293e-05, + "loss": 0.5614, + "step": 15805 + }, + { + "epoch": 0.4614037647745513, + "grad_norm": 0.9282579570792205, + "learning_rate": 3.219804832939655e-05, + "loss": 0.526, + "step": 15810 + }, + { + "epoch": 0.4615496862687874, + "grad_norm": 1.0687882475730643, + "learning_rate": 3.2187429403821876e-05, + "loss": 0.6267, + "step": 15815 + }, + { + "epoch": 0.4616956077630235, + "grad_norm": 0.9478941843039194, + "learning_rate": 3.2176809386691714e-05, + "loss": 0.5995, + "step": 15820 + }, + { + "epoch": 0.4618415292572596, + "grad_norm": 0.9662875553531348, + "learning_rate": 3.216618828047914e-05, + "loss": 0.5921, + "step": 15825 + }, + { + "epoch": 0.4619874507514957, + "grad_norm": 1.0516865340645782, + "learning_rate": 3.215556608765748e-05, + "loss": 0.5966, + "step": 15830 + }, + { + "epoch": 0.4621333722457318, + "grad_norm": 1.1336114712008754, + "learning_rate": 3.21449428107003e-05, + "loss": 0.5943, + "step": 15835 + }, + { + "epoch": 0.4622792937399679, + "grad_norm": 1.1637276547351914, + "learning_rate": 3.2134318452081444e-05, + "loss": 0.6183, + "step": 15840 + }, + { + "epoch": 0.462425215234204, + "grad_norm": 1.0266984522162912, + "learning_rate": 3.2123693014274965e-05, + "loss": 0.5574, + "step": 15845 + }, + { + "epoch": 0.4625711367284401, + "grad_norm": 0.8845639906006659, + "learning_rate": 3.2113066499755215e-05, + "loss": 0.5606, + "step": 15850 + }, + { + "epoch": 0.4627170582226762, + "grad_norm": 0.9073966151433892, + "learning_rate": 3.210243891099678e-05, + "loss": 0.5569, + "step": 15855 + }, + { + "epoch": 0.4628629797169123, + "grad_norm": 1.0662052997774707, + "learning_rate": 3.209181025047449e-05, + "loss": 0.5629, + "step": 15860 + }, + { + "epoch": 0.4630089012111484, + "grad_norm": 1.01163438062521, + "learning_rate": 3.208118052066342e-05, + "loss": 0.5393, + "step": 15865 + }, + { + "epoch": 0.4631548227053845, + "grad_norm": 1.1041645687777277, + "learning_rate": 3.20705497240389e-05, + "loss": 0.6139, + "step": 15870 + }, + { + "epoch": 0.4633007441996206, + "grad_norm": 0.9121251771958931, + "learning_rate": 3.205991786307652e-05, + "loss": 0.6044, + "step": 15875 + }, + { + "epoch": 0.46344666569385673, + "grad_norm": 0.9355383760583984, + "learning_rate": 3.204928494025209e-05, + "loss": 0.5854, + "step": 15880 + }, + { + "epoch": 0.4635925871880928, + "grad_norm": 1.10289375735494, + "learning_rate": 3.20386509580417e-05, + "loss": 0.6014, + "step": 15885 + }, + { + "epoch": 0.4637385086823289, + "grad_norm": 1.046126257150109, + "learning_rate": 3.202801591892167e-05, + "loss": 0.6139, + "step": 15890 + }, + { + "epoch": 0.463884430176565, + "grad_norm": 0.9749659423553443, + "learning_rate": 3.201737982536857e-05, + "loss": 0.5912, + "step": 15895 + }, + { + "epoch": 0.4640303516708011, + "grad_norm": 1.0591973016379579, + "learning_rate": 3.20067426798592e-05, + "loss": 0.5544, + "step": 15900 + }, + { + "epoch": 0.46417627316503723, + "grad_norm": 1.1682535035086854, + "learning_rate": 3.1996104484870615e-05, + "loss": 0.4743, + "step": 15905 + }, + { + "epoch": 0.4643221946592733, + "grad_norm": 1.1598578416372252, + "learning_rate": 3.198546524288014e-05, + "loss": 0.5987, + "step": 15910 + }, + { + "epoch": 0.4644681161535094, + "grad_norm": 0.9575683339289085, + "learning_rate": 3.197482495636531e-05, + "loss": 0.5513, + "step": 15915 + }, + { + "epoch": 0.4646140376477455, + "grad_norm": 1.1147247113284615, + "learning_rate": 3.196418362780391e-05, + "loss": 0.5959, + "step": 15920 + }, + { + "epoch": 0.4647599591419816, + "grad_norm": 1.02720540148014, + "learning_rate": 3.195354125967397e-05, + "loss": 0.6222, + "step": 15925 + }, + { + "epoch": 0.46490588063621774, + "grad_norm": 0.9927925502777519, + "learning_rate": 3.1942897854453776e-05, + "loss": 0.5505, + "step": 15930 + }, + { + "epoch": 0.46505180213045383, + "grad_norm": 1.232419275419981, + "learning_rate": 3.1932253414621846e-05, + "loss": 0.5836, + "step": 15935 + }, + { + "epoch": 0.4651977236246899, + "grad_norm": 1.198392516016938, + "learning_rate": 3.192160794265691e-05, + "loss": 0.5601, + "step": 15940 + }, + { + "epoch": 0.465343645118926, + "grad_norm": 0.9290787294382147, + "learning_rate": 3.1910961441038e-05, + "loss": 0.5783, + "step": 15945 + }, + { + "epoch": 0.4654895666131621, + "grad_norm": 0.9673793305861905, + "learning_rate": 3.1900313912244326e-05, + "loss": 0.5959, + "step": 15950 + }, + { + "epoch": 0.46563548810739824, + "grad_norm": 1.1998292550313885, + "learning_rate": 3.188966535875537e-05, + "loss": 0.5464, + "step": 15955 + }, + { + "epoch": 0.46578140960163433, + "grad_norm": 1.171475298071113, + "learning_rate": 3.1879015783050855e-05, + "loss": 0.6397, + "step": 15960 + }, + { + "epoch": 0.4659273310958704, + "grad_norm": 1.1307752464317156, + "learning_rate": 3.1868365187610736e-05, + "loss": 0.63, + "step": 15965 + }, + { + "epoch": 0.4660732525901065, + "grad_norm": 1.1731864130109129, + "learning_rate": 3.185771357491519e-05, + "loss": 0.6413, + "step": 15970 + }, + { + "epoch": 0.4662191740843426, + "grad_norm": 1.0920781863690463, + "learning_rate": 3.1847060947444643e-05, + "loss": 0.6109, + "step": 15975 + }, + { + "epoch": 0.46636509557857875, + "grad_norm": 0.9719439390500204, + "learning_rate": 3.183640730767977e-05, + "loss": 0.6134, + "step": 15980 + }, + { + "epoch": 0.46651101707281484, + "grad_norm": 0.9675295952738631, + "learning_rate": 3.1825752658101474e-05, + "loss": 0.6123, + "step": 15985 + }, + { + "epoch": 0.4666569385670509, + "grad_norm": 1.0373621345418962, + "learning_rate": 3.181509700119087e-05, + "loss": 0.5901, + "step": 15990 + }, + { + "epoch": 0.466802860061287, + "grad_norm": 1.0249244672043027, + "learning_rate": 3.1804440339429344e-05, + "loss": 0.6608, + "step": 15995 + }, + { + "epoch": 0.4669487815555231, + "grad_norm": 1.0724877856702746, + "learning_rate": 3.1793782675298485e-05, + "loss": 0.6656, + "step": 16000 + }, + { + "epoch": 0.46709470304975925, + "grad_norm": 1.1952657635486619, + "learning_rate": 3.178312401128014e-05, + "loss": 0.6083, + "step": 16005 + }, + { + "epoch": 0.46724062454399534, + "grad_norm": 1.0690913191527671, + "learning_rate": 3.177246434985638e-05, + "loss": 0.5899, + "step": 16010 + }, + { + "epoch": 0.46738654603823143, + "grad_norm": 1.1834225817099195, + "learning_rate": 3.1761803693509486e-05, + "loss": 0.7149, + "step": 16015 + }, + { + "epoch": 0.4675324675324675, + "grad_norm": 1.2295242212235264, + "learning_rate": 3.175114204472201e-05, + "loss": 0.6701, + "step": 16020 + }, + { + "epoch": 0.4676783890267036, + "grad_norm": 1.3023199828516847, + "learning_rate": 3.174047940597671e-05, + "loss": 0.6676, + "step": 16025 + }, + { + "epoch": 0.46782431052093976, + "grad_norm": 0.9432176597114115, + "learning_rate": 3.1729815779756586e-05, + "loss": 0.5697, + "step": 16030 + }, + { + "epoch": 0.46797023201517585, + "grad_norm": 1.067917631624269, + "learning_rate": 3.171915116854486e-05, + "loss": 0.5941, + "step": 16035 + }, + { + "epoch": 0.46811615350941194, + "grad_norm": 0.8236801347322392, + "learning_rate": 3.170848557482498e-05, + "loss": 0.4979, + "step": 16040 + }, + { + "epoch": 0.468262075003648, + "grad_norm": 0.9382195157037129, + "learning_rate": 3.1697819001080636e-05, + "loss": 0.629, + "step": 16045 + }, + { + "epoch": 0.4684079964978841, + "grad_norm": 0.9991670203979078, + "learning_rate": 3.1687151449795735e-05, + "loss": 0.5543, + "step": 16050 + }, + { + "epoch": 0.46855391799212026, + "grad_norm": 1.138257288287831, + "learning_rate": 3.1676482923454406e-05, + "loss": 0.5942, + "step": 16055 + }, + { + "epoch": 0.46869983948635635, + "grad_norm": 0.9383685087244243, + "learning_rate": 3.166581342454104e-05, + "loss": 0.5821, + "step": 16060 + }, + { + "epoch": 0.46884576098059244, + "grad_norm": 1.0208261833720251, + "learning_rate": 3.165514295554021e-05, + "loss": 0.5804, + "step": 16065 + }, + { + "epoch": 0.46899168247482853, + "grad_norm": 1.1095146268454237, + "learning_rate": 3.164447151893673e-05, + "loss": 0.5952, + "step": 16070 + }, + { + "epoch": 0.4691376039690646, + "grad_norm": 1.012549555463579, + "learning_rate": 3.163379911721566e-05, + "loss": 0.6351, + "step": 16075 + }, + { + "epoch": 0.46928352546330077, + "grad_norm": 1.1788680249148444, + "learning_rate": 3.162312575286226e-05, + "loss": 0.5721, + "step": 16080 + }, + { + "epoch": 0.46942944695753686, + "grad_norm": 1.0263215462979607, + "learning_rate": 3.161245142836201e-05, + "loss": 0.5891, + "step": 16085 + }, + { + "epoch": 0.46957536845177295, + "grad_norm": 1.096561843456749, + "learning_rate": 3.160177614620065e-05, + "loss": 0.6271, + "step": 16090 + }, + { + "epoch": 0.46972128994600904, + "grad_norm": 0.9582351993218208, + "learning_rate": 3.15910999088641e-05, + "loss": 0.5922, + "step": 16095 + }, + { + "epoch": 0.4698672114402451, + "grad_norm": 1.1435621609478108, + "learning_rate": 3.158042271883853e-05, + "loss": 0.6033, + "step": 16100 + }, + { + "epoch": 0.47001313293448127, + "grad_norm": 0.9624437619131209, + "learning_rate": 3.156974457861032e-05, + "loss": 0.6375, + "step": 16105 + }, + { + "epoch": 0.47015905442871736, + "grad_norm": 1.0190270413678166, + "learning_rate": 3.155906549066607e-05, + "loss": 0.5898, + "step": 16110 + }, + { + "epoch": 0.47030497592295345, + "grad_norm": 1.1826229508212447, + "learning_rate": 3.1548385457492615e-05, + "loss": 0.5615, + "step": 16115 + }, + { + "epoch": 0.47045089741718954, + "grad_norm": 1.1285316377328358, + "learning_rate": 3.1537704481576996e-05, + "loss": 0.5909, + "step": 16120 + }, + { + "epoch": 0.47059681891142563, + "grad_norm": 1.6147334791451788, + "learning_rate": 3.1527022565406485e-05, + "loss": 0.6389, + "step": 16125 + }, + { + "epoch": 0.4707427404056618, + "grad_norm": 1.117986456933243, + "learning_rate": 3.151633971146854e-05, + "loss": 0.5609, + "step": 16130 + }, + { + "epoch": 0.47088866189989786, + "grad_norm": 0.9438890624816828, + "learning_rate": 3.150565592225089e-05, + "loss": 0.6322, + "step": 16135 + }, + { + "epoch": 0.47103458339413395, + "grad_norm": 0.9417274734364771, + "learning_rate": 3.1494971200241445e-05, + "loss": 0.5182, + "step": 16140 + }, + { + "epoch": 0.47118050488837004, + "grad_norm": 1.047334740433062, + "learning_rate": 3.1484285547928346e-05, + "loss": 0.5999, + "step": 16145 + }, + { + "epoch": 0.47132642638260613, + "grad_norm": 1.0935660709235446, + "learning_rate": 3.147359896779994e-05, + "loss": 0.6251, + "step": 16150 + }, + { + "epoch": 0.4714723478768423, + "grad_norm": 1.0378166892325567, + "learning_rate": 3.14629114623448e-05, + "loss": 0.5395, + "step": 16155 + }, + { + "epoch": 0.47161826937107837, + "grad_norm": 1.126316973640822, + "learning_rate": 3.1452223034051714e-05, + "loss": 0.6217, + "step": 16160 + }, + { + "epoch": 0.47176419086531446, + "grad_norm": 1.036274101388914, + "learning_rate": 3.144153368540967e-05, + "loss": 0.5966, + "step": 16165 + }, + { + "epoch": 0.47191011235955055, + "grad_norm": 1.0825445332083152, + "learning_rate": 3.1430843418907905e-05, + "loss": 0.6309, + "step": 16170 + }, + { + "epoch": 0.47205603385378664, + "grad_norm": 1.1010543405445865, + "learning_rate": 3.1420152237035824e-05, + "loss": 0.5654, + "step": 16175 + }, + { + "epoch": 0.4722019553480228, + "grad_norm": 1.291319672829736, + "learning_rate": 3.140946014228308e-05, + "loss": 0.6257, + "step": 16180 + }, + { + "epoch": 0.4723478768422589, + "grad_norm": 1.1042257214611904, + "learning_rate": 3.139876713713954e-05, + "loss": 0.6235, + "step": 16185 + }, + { + "epoch": 0.47249379833649496, + "grad_norm": 0.9629843497570609, + "learning_rate": 3.1388073224095236e-05, + "loss": 0.5866, + "step": 16190 + }, + { + "epoch": 0.47263971983073105, + "grad_norm": 1.0866402931963635, + "learning_rate": 3.137737840564048e-05, + "loss": 0.5875, + "step": 16195 + }, + { + "epoch": 0.47278564132496714, + "grad_norm": 1.059982232823143, + "learning_rate": 3.136668268426574e-05, + "loss": 0.5419, + "step": 16200 + }, + { + "epoch": 0.4729315628192033, + "grad_norm": 1.2767011738484482, + "learning_rate": 3.135598606246171e-05, + "loss": 0.6259, + "step": 16205 + }, + { + "epoch": 0.4730774843134394, + "grad_norm": 1.0727429676467213, + "learning_rate": 3.134528854271932e-05, + "loss": 0.5979, + "step": 16210 + }, + { + "epoch": 0.47322340580767547, + "grad_norm": 0.935253270093171, + "learning_rate": 3.1334590127529676e-05, + "loss": 0.6169, + "step": 16215 + }, + { + "epoch": 0.47336932730191156, + "grad_norm": 0.9088583229338272, + "learning_rate": 3.13238908193841e-05, + "loss": 0.5648, + "step": 16220 + }, + { + "epoch": 0.47351524879614765, + "grad_norm": 1.0388776791127656, + "learning_rate": 3.131319062077412e-05, + "loss": 0.6178, + "step": 16225 + }, + { + "epoch": 0.4736611702903838, + "grad_norm": 1.058695593297206, + "learning_rate": 3.1302489534191496e-05, + "loss": 0.549, + "step": 16230 + }, + { + "epoch": 0.4738070917846199, + "grad_norm": 0.9573896971764649, + "learning_rate": 3.129178756212816e-05, + "loss": 0.6673, + "step": 16235 + }, + { + "epoch": 0.473953013278856, + "grad_norm": 1.1798643992256792, + "learning_rate": 3.128108470707627e-05, + "loss": 0.5958, + "step": 16240 + }, + { + "epoch": 0.47409893477309206, + "grad_norm": 0.9749965823953239, + "learning_rate": 3.127038097152819e-05, + "loss": 0.6324, + "step": 16245 + }, + { + "epoch": 0.47424485626732815, + "grad_norm": 1.0032067699445637, + "learning_rate": 3.125967635797648e-05, + "loss": 0.5879, + "step": 16250 + }, + { + "epoch": 0.4743907777615643, + "grad_norm": 1.0337100139887638, + "learning_rate": 3.1248970868913926e-05, + "loss": 0.5537, + "step": 16255 + }, + { + "epoch": 0.4745366992558004, + "grad_norm": 1.068809433506519, + "learning_rate": 3.123826450683347e-05, + "loss": 0.5618, + "step": 16260 + }, + { + "epoch": 0.4746826207500365, + "grad_norm": 1.0405455587530177, + "learning_rate": 3.1227557274228314e-05, + "loss": 0.5527, + "step": 16265 + }, + { + "epoch": 0.47482854224427257, + "grad_norm": 1.0661025990390622, + "learning_rate": 3.1216849173591816e-05, + "loss": 0.5644, + "step": 16270 + }, + { + "epoch": 0.47497446373850866, + "grad_norm": 1.124048760964528, + "learning_rate": 3.120614020741758e-05, + "loss": 0.6207, + "step": 16275 + }, + { + "epoch": 0.4751203852327448, + "grad_norm": 0.9775749805402151, + "learning_rate": 3.1195430378199376e-05, + "loss": 0.5665, + "step": 16280 + }, + { + "epoch": 0.4752663067269809, + "grad_norm": 1.1100688950990143, + "learning_rate": 3.118471968843118e-05, + "loss": 0.5943, + "step": 16285 + }, + { + "epoch": 0.475412228221217, + "grad_norm": 1.0291055831161693, + "learning_rate": 3.11740081406072e-05, + "loss": 0.6502, + "step": 16290 + }, + { + "epoch": 0.47555814971545307, + "grad_norm": 1.0315300126889353, + "learning_rate": 3.11632957372218e-05, + "loss": 0.5642, + "step": 16295 + }, + { + "epoch": 0.47570407120968916, + "grad_norm": 0.9797770643424677, + "learning_rate": 3.1152582480769566e-05, + "loss": 0.5703, + "step": 16300 + }, + { + "epoch": 0.4758499927039253, + "grad_norm": 0.9462656804437485, + "learning_rate": 3.1141868373745285e-05, + "loss": 0.5997, + "step": 16305 + }, + { + "epoch": 0.4759959141981614, + "grad_norm": 0.9769441795159095, + "learning_rate": 3.113115341864393e-05, + "loss": 0.5707, + "step": 16310 + }, + { + "epoch": 0.4761418356923975, + "grad_norm": 1.2045468012988314, + "learning_rate": 3.1120437617960695e-05, + "loss": 0.6186, + "step": 16315 + }, + { + "epoch": 0.4762877571866336, + "grad_norm": 0.810664063243908, + "learning_rate": 3.110972097419093e-05, + "loss": 0.5105, + "step": 16320 + }, + { + "epoch": 0.47643367868086967, + "grad_norm": 1.0101058675561303, + "learning_rate": 3.1099003489830225e-05, + "loss": 0.619, + "step": 16325 + }, + { + "epoch": 0.4765796001751058, + "grad_norm": 1.3988922571449385, + "learning_rate": 3.1088285167374345e-05, + "loss": 0.5865, + "step": 16330 + }, + { + "epoch": 0.4767255216693419, + "grad_norm": 1.1101767982009851, + "learning_rate": 3.107756600931923e-05, + "loss": 0.5892, + "step": 16335 + }, + { + "epoch": 0.476871443163578, + "grad_norm": 1.0213661041585358, + "learning_rate": 3.106684601816106e-05, + "loss": 0.5502, + "step": 16340 + }, + { + "epoch": 0.4770173646578141, + "grad_norm": 0.983270686263814, + "learning_rate": 3.1056125196396174e-05, + "loss": 0.5851, + "step": 16345 + }, + { + "epoch": 0.47716328615205017, + "grad_norm": 1.0366162681146514, + "learning_rate": 3.104540354652113e-05, + "loss": 0.5291, + "step": 16350 + }, + { + "epoch": 0.4773092076462863, + "grad_norm": 1.216918503959385, + "learning_rate": 3.103468107103265e-05, + "loss": 0.6292, + "step": 16355 + }, + { + "epoch": 0.4774551291405224, + "grad_norm": 1.0767064027176807, + "learning_rate": 3.1023957772427656e-05, + "loss": 0.6254, + "step": 16360 + }, + { + "epoch": 0.4776010506347585, + "grad_norm": 1.1485535990927525, + "learning_rate": 3.101323365320329e-05, + "loss": 0.6227, + "step": 16365 + }, + { + "epoch": 0.4777469721289946, + "grad_norm": 1.0113956100330284, + "learning_rate": 3.100250871585684e-05, + "loss": 0.59, + "step": 16370 + }, + { + "epoch": 0.4778928936232307, + "grad_norm": 1.284691180996894, + "learning_rate": 3.099178296288583e-05, + "loss": 0.6798, + "step": 16375 + }, + { + "epoch": 0.4780388151174668, + "grad_norm": 0.9987102666277092, + "learning_rate": 3.0981056396787936e-05, + "loss": 0.5675, + "step": 16380 + }, + { + "epoch": 0.4781847366117029, + "grad_norm": 1.017702166853108, + "learning_rate": 3.097032902006105e-05, + "loss": 0.5337, + "step": 16385 + }, + { + "epoch": 0.478330658105939, + "grad_norm": 1.0309861509135925, + "learning_rate": 3.095960083520323e-05, + "loss": 0.5986, + "step": 16390 + }, + { + "epoch": 0.4784765796001751, + "grad_norm": 1.0609202762615813, + "learning_rate": 3.0948871844712746e-05, + "loss": 0.5649, + "step": 16395 + }, + { + "epoch": 0.4786225010944112, + "grad_norm": 1.05094668256943, + "learning_rate": 3.093814205108804e-05, + "loss": 0.6136, + "step": 16400 + }, + { + "epoch": 0.4787684225886473, + "grad_norm": 1.097103947953059, + "learning_rate": 3.092741145682774e-05, + "loss": 0.6099, + "step": 16405 + }, + { + "epoch": 0.4789143440828834, + "grad_norm": 1.0286617622228802, + "learning_rate": 3.091668006443067e-05, + "loss": 0.5911, + "step": 16410 + }, + { + "epoch": 0.4790602655771195, + "grad_norm": 0.9191288172393072, + "learning_rate": 3.090594787639584e-05, + "loss": 0.6147, + "step": 16415 + }, + { + "epoch": 0.4792061870713556, + "grad_norm": 1.1372882945790748, + "learning_rate": 3.089521489522243e-05, + "loss": 0.5636, + "step": 16420 + }, + { + "epoch": 0.47935210856559174, + "grad_norm": 1.0693982931276715, + "learning_rate": 3.088448112340982e-05, + "loss": 0.5732, + "step": 16425 + }, + { + "epoch": 0.47949803005982783, + "grad_norm": 1.07027696047337, + "learning_rate": 3.0873746563457567e-05, + "loss": 0.5641, + "step": 16430 + }, + { + "epoch": 0.4796439515540639, + "grad_norm": 0.9514757147042575, + "learning_rate": 3.0863011217865415e-05, + "loss": 0.6107, + "step": 16435 + }, + { + "epoch": 0.4797898730483, + "grad_norm": 0.8974841732761474, + "learning_rate": 3.0852275089133305e-05, + "loss": 0.5669, + "step": 16440 + }, + { + "epoch": 0.4799357945425361, + "grad_norm": 0.9491273704131361, + "learning_rate": 3.084153817976131e-05, + "loss": 0.5835, + "step": 16445 + }, + { + "epoch": 0.48008171603677224, + "grad_norm": 0.9386445190771993, + "learning_rate": 3.083080049224976e-05, + "loss": 0.5788, + "step": 16450 + }, + { + "epoch": 0.48022763753100833, + "grad_norm": 1.1587704595737656, + "learning_rate": 3.082006202909909e-05, + "loss": 0.6252, + "step": 16455 + }, + { + "epoch": 0.4803735590252444, + "grad_norm": 0.8934272110359924, + "learning_rate": 3.0809322792809974e-05, + "loss": 0.5625, + "step": 16460 + }, + { + "epoch": 0.4805194805194805, + "grad_norm": 1.0051712165565045, + "learning_rate": 3.079858278588324e-05, + "loss": 0.5512, + "step": 16465 + }, + { + "epoch": 0.4806654020137166, + "grad_norm": 0.924265478352745, + "learning_rate": 3.0787842010819896e-05, + "loss": 0.5612, + "step": 16470 + }, + { + "epoch": 0.48081132350795275, + "grad_norm": 1.1785058206193082, + "learning_rate": 3.0777100470121135e-05, + "loss": 0.5665, + "step": 16475 + }, + { + "epoch": 0.48095724500218884, + "grad_norm": 1.1115991749510084, + "learning_rate": 3.0766358166288325e-05, + "loss": 0.6064, + "step": 16480 + }, + { + "epoch": 0.4811031664964249, + "grad_norm": 1.057264070379488, + "learning_rate": 3.075561510182301e-05, + "loss": 0.5368, + "step": 16485 + }, + { + "epoch": 0.481249087990661, + "grad_norm": 1.02301090036176, + "learning_rate": 3.074487127922691e-05, + "loss": 0.5116, + "step": 16490 + }, + { + "epoch": 0.4813950094848971, + "grad_norm": 0.9098775770077692, + "learning_rate": 3.073412670100193e-05, + "loss": 0.55, + "step": 16495 + }, + { + "epoch": 0.48154093097913325, + "grad_norm": 1.0395773719043275, + "learning_rate": 3.0723381369650144e-05, + "loss": 0.6279, + "step": 16500 + }, + { + "epoch": 0.48168685247336934, + "grad_norm": 1.0969355584335883, + "learning_rate": 3.0712635287673806e-05, + "loss": 0.6313, + "step": 16505 + }, + { + "epoch": 0.48183277396760543, + "grad_norm": 1.0646441403544518, + "learning_rate": 3.070188845757534e-05, + "loss": 0.5381, + "step": 16510 + }, + { + "epoch": 0.4819786954618415, + "grad_norm": 1.0026345696667833, + "learning_rate": 3.069114088185735e-05, + "loss": 0.6036, + "step": 16515 + }, + { + "epoch": 0.4821246169560776, + "grad_norm": 1.1186335478111695, + "learning_rate": 3.068039256302259e-05, + "loss": 0.5318, + "step": 16520 + }, + { + "epoch": 0.48227053845031376, + "grad_norm": 1.0032186205758673, + "learning_rate": 3.066964350357403e-05, + "loss": 0.6589, + "step": 16525 + }, + { + "epoch": 0.48241645994454985, + "grad_norm": 1.1064033076938589, + "learning_rate": 3.065889370601479e-05, + "loss": 0.5713, + "step": 16530 + }, + { + "epoch": 0.48256238143878594, + "grad_norm": 0.8758987582809168, + "learning_rate": 3.064814317284814e-05, + "loss": 0.5113, + "step": 16535 + }, + { + "epoch": 0.482708302933022, + "grad_norm": 1.0407519143116468, + "learning_rate": 3.063739190657756e-05, + "loss": 0.599, + "step": 16540 + }, + { + "epoch": 0.4828542244272581, + "grad_norm": 1.1284490311184938, + "learning_rate": 3.0626639909706686e-05, + "loss": 0.5651, + "step": 16545 + }, + { + "epoch": 0.48300014592149426, + "grad_norm": 0.9132063911299511, + "learning_rate": 3.0615887184739306e-05, + "loss": 0.5937, + "step": 16550 + }, + { + "epoch": 0.48314606741573035, + "grad_norm": 0.9788954521872006, + "learning_rate": 3.06051337341794e-05, + "loss": 0.5512, + "step": 16555 + }, + { + "epoch": 0.48329198890996644, + "grad_norm": 1.095220995826207, + "learning_rate": 3.05943795605311e-05, + "loss": 0.588, + "step": 16560 + }, + { + "epoch": 0.48343791040420253, + "grad_norm": 1.101814668046757, + "learning_rate": 3.0583624666298744e-05, + "loss": 0.5677, + "step": 16565 + }, + { + "epoch": 0.4835838318984386, + "grad_norm": 1.0767350502821125, + "learning_rate": 3.057286905398678e-05, + "loss": 0.5672, + "step": 16570 + }, + { + "epoch": 0.48372975339267477, + "grad_norm": 1.0145287764316528, + "learning_rate": 3.0562112726099874e-05, + "loss": 0.5659, + "step": 16575 + }, + { + "epoch": 0.48387567488691086, + "grad_norm": 0.8951101173973824, + "learning_rate": 3.0551355685142824e-05, + "loss": 0.5689, + "step": 16580 + }, + { + "epoch": 0.48402159638114695, + "grad_norm": 1.0388098563140675, + "learning_rate": 3.054059793362062e-05, + "loss": 0.6006, + "step": 16585 + }, + { + "epoch": 0.48416751787538304, + "grad_norm": 1.094177681656709, + "learning_rate": 3.052983947403839e-05, + "loss": 0.5793, + "step": 16590 + }, + { + "epoch": 0.4843134393696191, + "grad_norm": 0.9406811228138509, + "learning_rate": 3.0519080308901446e-05, + "loss": 0.5824, + "step": 16595 + }, + { + "epoch": 0.48445936086385527, + "grad_norm": 1.235780288180848, + "learning_rate": 3.0508320440715276e-05, + "loss": 0.6251, + "step": 16600 + }, + { + "epoch": 0.48460528235809136, + "grad_norm": 1.101615762660566, + "learning_rate": 3.04975598719855e-05, + "loss": 0.6339, + "step": 16605 + }, + { + "epoch": 0.48475120385232745, + "grad_norm": 0.8748141487967571, + "learning_rate": 3.048679860521793e-05, + "loss": 0.5477, + "step": 16610 + }, + { + "epoch": 0.48489712534656354, + "grad_norm": 1.1252109265861094, + "learning_rate": 3.0476036642918515e-05, + "loss": 0.635, + "step": 16615 + }, + { + "epoch": 0.48504304684079963, + "grad_norm": 1.1513319214737583, + "learning_rate": 3.046527398759339e-05, + "loss": 0.6009, + "step": 16620 + }, + { + "epoch": 0.4851889683350358, + "grad_norm": 1.2551123248600837, + "learning_rate": 3.0454510641748822e-05, + "loss": 0.6141, + "step": 16625 + }, + { + "epoch": 0.48533488982927186, + "grad_norm": 1.031188379187192, + "learning_rate": 3.044374660789128e-05, + "loss": 0.6948, + "step": 16630 + }, + { + "epoch": 0.48548081132350795, + "grad_norm": 0.9956876355652408, + "learning_rate": 3.0432981888527363e-05, + "loss": 0.5561, + "step": 16635 + }, + { + "epoch": 0.48562673281774404, + "grad_norm": 1.1056522815640566, + "learning_rate": 3.042221648616383e-05, + "loss": 0.598, + "step": 16640 + }, + { + "epoch": 0.48577265431198013, + "grad_norm": 1.0722816944801006, + "learning_rate": 3.041145040330761e-05, + "loss": 0.6027, + "step": 16645 + }, + { + "epoch": 0.4859185758062163, + "grad_norm": 1.1311406265845234, + "learning_rate": 3.040068364246579e-05, + "loss": 0.5923, + "step": 16650 + }, + { + "epoch": 0.48606449730045237, + "grad_norm": 0.9701960581456185, + "learning_rate": 3.0389916206145602e-05, + "loss": 0.5999, + "step": 16655 + }, + { + "epoch": 0.48621041879468846, + "grad_norm": 1.1194453829949578, + "learning_rate": 3.037914809685446e-05, + "loss": 0.5755, + "step": 16660 + }, + { + "epoch": 0.48635634028892455, + "grad_norm": 1.1052857023913059, + "learning_rate": 3.0368379317099904e-05, + "loss": 0.5864, + "step": 16665 + }, + { + "epoch": 0.48650226178316064, + "grad_norm": 1.1250900258444685, + "learning_rate": 3.0357609869389657e-05, + "loss": 0.606, + "step": 16670 + }, + { + "epoch": 0.4866481832773968, + "grad_norm": 1.1594551943657159, + "learning_rate": 3.0346839756231577e-05, + "loss": 0.5219, + "step": 16675 + }, + { + "epoch": 0.4867941047716329, + "grad_norm": 1.0504391740277301, + "learning_rate": 3.033606898013369e-05, + "loss": 0.5188, + "step": 16680 + }, + { + "epoch": 0.48694002626586896, + "grad_norm": 0.9968746885784527, + "learning_rate": 3.0325297543604174e-05, + "loss": 0.5715, + "step": 16685 + }, + { + "epoch": 0.48708594776010505, + "grad_norm": 0.9227738120060264, + "learning_rate": 3.031452544915136e-05, + "loss": 0.5631, + "step": 16690 + }, + { + "epoch": 0.48723186925434114, + "grad_norm": 1.079810228938708, + "learning_rate": 3.0303752699283728e-05, + "loss": 0.6024, + "step": 16695 + }, + { + "epoch": 0.4873777907485773, + "grad_norm": 1.006269550053113, + "learning_rate": 3.0292979296509906e-05, + "loss": 0.576, + "step": 16700 + }, + { + "epoch": 0.4875237122428134, + "grad_norm": 0.9359264658024572, + "learning_rate": 3.02822052433387e-05, + "loss": 0.5991, + "step": 16705 + }, + { + "epoch": 0.48766963373704947, + "grad_norm": 1.0477115565446178, + "learning_rate": 3.0271430542279033e-05, + "loss": 0.5938, + "step": 16710 + }, + { + "epoch": 0.48781555523128556, + "grad_norm": 1.1035232161775983, + "learning_rate": 3.026065519584001e-05, + "loss": 0.6556, + "step": 16715 + }, + { + "epoch": 0.48796147672552165, + "grad_norm": 0.9596841525527193, + "learning_rate": 3.0249879206530857e-05, + "loss": 0.5701, + "step": 16720 + }, + { + "epoch": 0.4881073982197578, + "grad_norm": 1.0794334018029093, + "learning_rate": 3.023910257686098e-05, + "loss": 0.6171, + "step": 16725 + }, + { + "epoch": 0.4882533197139939, + "grad_norm": 0.9116137211414866, + "learning_rate": 3.0228325309339907e-05, + "loss": 0.6255, + "step": 16730 + }, + { + "epoch": 0.48839924120823, + "grad_norm": 0.895479894629434, + "learning_rate": 3.0217547406477327e-05, + "loss": 0.5234, + "step": 16735 + }, + { + "epoch": 0.48854516270246606, + "grad_norm": 1.2151650945823496, + "learning_rate": 3.0206768870783086e-05, + "loss": 0.5981, + "step": 16740 + }, + { + "epoch": 0.48869108419670215, + "grad_norm": 1.0855610104739521, + "learning_rate": 3.0195989704767154e-05, + "loss": 0.5799, + "step": 16745 + }, + { + "epoch": 0.4888370056909383, + "grad_norm": 1.3638768224077351, + "learning_rate": 3.018520991093966e-05, + "loss": 0.5795, + "step": 16750 + }, + { + "epoch": 0.4889829271851744, + "grad_norm": 1.039542191491571, + "learning_rate": 3.0174429491810897e-05, + "loss": 0.5466, + "step": 16755 + }, + { + "epoch": 0.4891288486794105, + "grad_norm": 1.2436322898659926, + "learning_rate": 3.0163648449891275e-05, + "loss": 0.6047, + "step": 16760 + }, + { + "epoch": 0.48927477017364657, + "grad_norm": 1.0463341469666652, + "learning_rate": 3.0152866787691365e-05, + "loss": 0.5878, + "step": 16765 + }, + { + "epoch": 0.48942069166788266, + "grad_norm": 0.9986395190908836, + "learning_rate": 3.014208450772187e-05, + "loss": 0.6148, + "step": 16770 + }, + { + "epoch": 0.4895666131621188, + "grad_norm": 0.9508145375780831, + "learning_rate": 3.013130161249365e-05, + "loss": 0.5526, + "step": 16775 + }, + { + "epoch": 0.4897125346563549, + "grad_norm": 1.1855955482255367, + "learning_rate": 3.0120518104517713e-05, + "loss": 0.6268, + "step": 16780 + }, + { + "epoch": 0.489858456150591, + "grad_norm": 1.0269621056162834, + "learning_rate": 3.0109733986305182e-05, + "loss": 0.5716, + "step": 16785 + }, + { + "epoch": 0.49000437764482707, + "grad_norm": 1.139210936347588, + "learning_rate": 3.009894926036736e-05, + "loss": 0.5592, + "step": 16790 + }, + { + "epoch": 0.49015029913906316, + "grad_norm": 1.173464924838714, + "learning_rate": 3.0088163929215668e-05, + "loss": 0.6348, + "step": 16795 + }, + { + "epoch": 0.4902962206332993, + "grad_norm": 0.8586206797685373, + "learning_rate": 3.0077377995361662e-05, + "loss": 0.5568, + "step": 16800 + }, + { + "epoch": 0.4904421421275354, + "grad_norm": 1.1914081996807593, + "learning_rate": 3.0066591461317047e-05, + "loss": 0.6222, + "step": 16805 + }, + { + "epoch": 0.4905880636217715, + "grad_norm": 0.8844532684601611, + "learning_rate": 3.0055804329593678e-05, + "loss": 0.6, + "step": 16810 + }, + { + "epoch": 0.4907339851160076, + "grad_norm": 1.1051806903627077, + "learning_rate": 3.004501660270353e-05, + "loss": 0.6008, + "step": 16815 + }, + { + "epoch": 0.49087990661024367, + "grad_norm": 1.1046818058469494, + "learning_rate": 3.0034228283158738e-05, + "loss": 0.5839, + "step": 16820 + }, + { + "epoch": 0.4910258281044798, + "grad_norm": 0.9772652604582976, + "learning_rate": 3.0023439373471556e-05, + "loss": 0.6183, + "step": 16825 + }, + { + "epoch": 0.4911717495987159, + "grad_norm": 0.99527660067659, + "learning_rate": 3.0012649876154396e-05, + "loss": 0.6558, + "step": 16830 + }, + { + "epoch": 0.491317671092952, + "grad_norm": 1.1049813052801287, + "learning_rate": 3.0001859793719778e-05, + "loss": 0.5754, + "step": 16835 + }, + { + "epoch": 0.4914635925871881, + "grad_norm": 1.0804230143070144, + "learning_rate": 2.999106912868038e-05, + "loss": 0.565, + "step": 16840 + }, + { + "epoch": 0.49160951408142417, + "grad_norm": 0.9667300843938154, + "learning_rate": 2.9980277883549014e-05, + "loss": 0.5602, + "step": 16845 + }, + { + "epoch": 0.4917554355756603, + "grad_norm": 1.112504019395495, + "learning_rate": 2.9969486060838616e-05, + "loss": 0.6162, + "step": 16850 + }, + { + "epoch": 0.4919013570698964, + "grad_norm": 1.0123761453453675, + "learning_rate": 2.9958693663062266e-05, + "loss": 0.6031, + "step": 16855 + }, + { + "epoch": 0.4920472785641325, + "grad_norm": 0.9328559186875812, + "learning_rate": 2.9947900692733172e-05, + "loss": 0.5659, + "step": 16860 + }, + { + "epoch": 0.4921932000583686, + "grad_norm": 1.135837648885987, + "learning_rate": 2.9937107152364684e-05, + "loss": 0.6413, + "step": 16865 + }, + { + "epoch": 0.4923391215526047, + "grad_norm": 1.0315662764197113, + "learning_rate": 2.9926313044470287e-05, + "loss": 0.5843, + "step": 16870 + }, + { + "epoch": 0.4924850430468408, + "grad_norm": 1.056989609274433, + "learning_rate": 2.9915518371563573e-05, + "loss": 0.5851, + "step": 16875 + }, + { + "epoch": 0.4926309645410769, + "grad_norm": 1.117294212353933, + "learning_rate": 2.990472313615829e-05, + "loss": 0.5715, + "step": 16880 + }, + { + "epoch": 0.492776886035313, + "grad_norm": 1.0668832151570473, + "learning_rate": 2.9893927340768323e-05, + "loss": 0.5732, + "step": 16885 + }, + { + "epoch": 0.4929228075295491, + "grad_norm": 1.0437664606850778, + "learning_rate": 2.988313098790766e-05, + "loss": 0.5577, + "step": 16890 + }, + { + "epoch": 0.4930687290237852, + "grad_norm": 1.1182632433749098, + "learning_rate": 2.987233408009043e-05, + "loss": 0.5727, + "step": 16895 + }, + { + "epoch": 0.4932146505180213, + "grad_norm": 0.9806326308210983, + "learning_rate": 2.9861536619830908e-05, + "loss": 0.5108, + "step": 16900 + }, + { + "epoch": 0.4933605720122574, + "grad_norm": 1.0632672939693888, + "learning_rate": 2.9850738609643476e-05, + "loss": 0.6147, + "step": 16905 + }, + { + "epoch": 0.4935064935064935, + "grad_norm": 1.0202524332496135, + "learning_rate": 2.9839940052042658e-05, + "loss": 0.6579, + "step": 16910 + }, + { + "epoch": 0.4936524150007296, + "grad_norm": 0.9673321870150993, + "learning_rate": 2.9829140949543095e-05, + "loss": 0.6454, + "step": 16915 + }, + { + "epoch": 0.4937983364949657, + "grad_norm": 0.9250203025433781, + "learning_rate": 2.9818341304659558e-05, + "loss": 0.5605, + "step": 16920 + }, + { + "epoch": 0.49394425798920183, + "grad_norm": 1.17805820620995, + "learning_rate": 2.980754111990695e-05, + "loss": 0.6092, + "step": 16925 + }, + { + "epoch": 0.4940901794834379, + "grad_norm": 0.9049157028388981, + "learning_rate": 2.9796740397800294e-05, + "loss": 0.5629, + "step": 16930 + }, + { + "epoch": 0.494236100977674, + "grad_norm": 0.9992000241130848, + "learning_rate": 2.978593914085474e-05, + "loss": 0.5351, + "step": 16935 + }, + { + "epoch": 0.4943820224719101, + "grad_norm": 1.0880880154066537, + "learning_rate": 2.977513735158557e-05, + "loss": 0.562, + "step": 16940 + }, + { + "epoch": 0.4945279439661462, + "grad_norm": 1.025528763606253, + "learning_rate": 2.9764335032508174e-05, + "loss": 0.5494, + "step": 16945 + }, + { + "epoch": 0.49467386546038233, + "grad_norm": 0.9083104501889877, + "learning_rate": 2.975353218613807e-05, + "loss": 0.57, + "step": 16950 + }, + { + "epoch": 0.4948197869546184, + "grad_norm": 1.0445661233546493, + "learning_rate": 2.9742728814990906e-05, + "loss": 0.5815, + "step": 16955 + }, + { + "epoch": 0.4949657084488545, + "grad_norm": 1.1064254230623647, + "learning_rate": 2.9731924921582456e-05, + "loss": 0.6064, + "step": 16960 + }, + { + "epoch": 0.4951116299430906, + "grad_norm": 0.9843510816262551, + "learning_rate": 2.9721120508428596e-05, + "loss": 0.5466, + "step": 16965 + }, + { + "epoch": 0.4952575514373267, + "grad_norm": 1.1181502424225254, + "learning_rate": 2.9710315578045346e-05, + "loss": 0.633, + "step": 16970 + }, + { + "epoch": 0.49540347293156284, + "grad_norm": 1.084398136372078, + "learning_rate": 2.969951013294883e-05, + "loss": 0.559, + "step": 16975 + }, + { + "epoch": 0.49554939442579893, + "grad_norm": 1.1208990016419313, + "learning_rate": 2.9688704175655307e-05, + "loss": 0.6358, + "step": 16980 + }, + { + "epoch": 0.495695315920035, + "grad_norm": 0.869429962777699, + "learning_rate": 2.9677897708681128e-05, + "loss": 0.5541, + "step": 16985 + }, + { + "epoch": 0.4958412374142711, + "grad_norm": 1.0566064012035865, + "learning_rate": 2.9667090734542803e-05, + "loss": 0.5793, + "step": 16990 + }, + { + "epoch": 0.4959871589085072, + "grad_norm": 0.8986133297879705, + "learning_rate": 2.9656283255756915e-05, + "loss": 0.5479, + "step": 16995 + }, + { + "epoch": 0.49613308040274334, + "grad_norm": 1.1670480632866398, + "learning_rate": 2.9645475274840214e-05, + "loss": 0.5958, + "step": 17000 + }, + { + "epoch": 0.49627900189697943, + "grad_norm": 1.0595418026447003, + "learning_rate": 2.9634666794309507e-05, + "loss": 0.576, + "step": 17005 + }, + { + "epoch": 0.4964249233912155, + "grad_norm": 1.0343596360481417, + "learning_rate": 2.9623857816681773e-05, + "loss": 0.6534, + "step": 17010 + }, + { + "epoch": 0.4965708448854516, + "grad_norm": 1.2118023238726043, + "learning_rate": 2.9613048344474087e-05, + "loss": 0.5977, + "step": 17015 + }, + { + "epoch": 0.4967167663796877, + "grad_norm": 0.8946716938813952, + "learning_rate": 2.9602238380203623e-05, + "loss": 0.5996, + "step": 17020 + }, + { + "epoch": 0.49686268787392385, + "grad_norm": 1.0965484099482996, + "learning_rate": 2.9591427926387693e-05, + "loss": 0.5717, + "step": 17025 + }, + { + "epoch": 0.49700860936815994, + "grad_norm": 1.203404582937764, + "learning_rate": 2.95806169855437e-05, + "loss": 0.585, + "step": 17030 + }, + { + "epoch": 0.497154530862396, + "grad_norm": 1.0241335862567653, + "learning_rate": 2.956980556018919e-05, + "loss": 0.5898, + "step": 17035 + }, + { + "epoch": 0.4973004523566321, + "grad_norm": 1.212790525642013, + "learning_rate": 2.9558993652841795e-05, + "loss": 0.6325, + "step": 17040 + }, + { + "epoch": 0.4974463738508682, + "grad_norm": 1.0672694059007521, + "learning_rate": 2.9548181266019277e-05, + "loss": 0.586, + "step": 17045 + }, + { + "epoch": 0.49759229534510435, + "grad_norm": 1.067985076293012, + "learning_rate": 2.9537368402239496e-05, + "loss": 0.6287, + "step": 17050 + }, + { + "epoch": 0.49773821683934044, + "grad_norm": 1.1210875052539528, + "learning_rate": 2.9526555064020427e-05, + "loss": 0.573, + "step": 17055 + }, + { + "epoch": 0.49788413833357653, + "grad_norm": 1.089874287773011, + "learning_rate": 2.9515741253880164e-05, + "loss": 0.591, + "step": 17060 + }, + { + "epoch": 0.4980300598278126, + "grad_norm": 1.0910041107342514, + "learning_rate": 2.95049269743369e-05, + "loss": 0.5831, + "step": 17065 + }, + { + "epoch": 0.4981759813220487, + "grad_norm": 0.9203871016363342, + "learning_rate": 2.9494112227908955e-05, + "loss": 0.5516, + "step": 17070 + }, + { + "epoch": 0.49832190281628486, + "grad_norm": 0.9816651505927119, + "learning_rate": 2.948329701711472e-05, + "loss": 0.5643, + "step": 17075 + }, + { + "epoch": 0.49846782431052095, + "grad_norm": 1.1430532292228752, + "learning_rate": 2.9472481344472742e-05, + "loss": 0.5994, + "step": 17080 + }, + { + "epoch": 0.49861374580475704, + "grad_norm": 1.0681291883308852, + "learning_rate": 2.9461665212501644e-05, + "loss": 0.5766, + "step": 17085 + }, + { + "epoch": 0.4987596672989931, + "grad_norm": 1.2322660112356114, + "learning_rate": 2.9450848623720155e-05, + "loss": 0.5961, + "step": 17090 + }, + { + "epoch": 0.4989055887932292, + "grad_norm": 1.1629068826495832, + "learning_rate": 2.944003158064713e-05, + "loss": 0.5381, + "step": 17095 + }, + { + "epoch": 0.49905151028746536, + "grad_norm": 1.1404379842908996, + "learning_rate": 2.9429214085801525e-05, + "loss": 0.6537, + "step": 17100 + }, + { + "epoch": 0.49919743178170145, + "grad_norm": 1.0940607875008523, + "learning_rate": 2.941839614170238e-05, + "loss": 0.5714, + "step": 17105 + }, + { + "epoch": 0.49934335327593754, + "grad_norm": 1.0299423097868212, + "learning_rate": 2.940757775086887e-05, + "loss": 0.6254, + "step": 17110 + }, + { + "epoch": 0.49948927477017363, + "grad_norm": 0.9625733565818848, + "learning_rate": 2.939675891582025e-05, + "loss": 0.5756, + "step": 17115 + }, + { + "epoch": 0.4996351962644097, + "grad_norm": 1.1749270933065001, + "learning_rate": 2.93859396390759e-05, + "loss": 0.544, + "step": 17120 + }, + { + "epoch": 0.49978111775864587, + "grad_norm": 1.1419196198724284, + "learning_rate": 2.937511992315527e-05, + "loss": 0.5359, + "step": 17125 + }, + { + "epoch": 0.49992703925288196, + "grad_norm": 1.0460114285291608, + "learning_rate": 2.936429977057795e-05, + "loss": 0.5768, + "step": 17130 + }, + { + "epoch": 0.500072960747118, + "grad_norm": 0.9495569851449527, + "learning_rate": 2.9353479183863608e-05, + "loss": 0.5854, + "step": 17135 + }, + { + "epoch": 0.5002188822413541, + "grad_norm": 1.0947634323998576, + "learning_rate": 2.934265816553202e-05, + "loss": 0.5748, + "step": 17140 + }, + { + "epoch": 0.5003648037355902, + "grad_norm": 1.0812227117142743, + "learning_rate": 2.9331836718103072e-05, + "loss": 0.5982, + "step": 17145 + }, + { + "epoch": 0.5005107252298263, + "grad_norm": 1.1583502319131789, + "learning_rate": 2.9321014844096715e-05, + "loss": 0.6301, + "step": 17150 + }, + { + "epoch": 0.5006566467240624, + "grad_norm": 1.0244266547667469, + "learning_rate": 2.9310192546033055e-05, + "loss": 0.5376, + "step": 17155 + }, + { + "epoch": 0.5008025682182986, + "grad_norm": 1.0900180064398863, + "learning_rate": 2.929936982643225e-05, + "loss": 0.5834, + "step": 17160 + }, + { + "epoch": 0.5009484897125347, + "grad_norm": 1.100390758039939, + "learning_rate": 2.9288546687814573e-05, + "loss": 0.5617, + "step": 17165 + }, + { + "epoch": 0.5010944112067708, + "grad_norm": 1.1087549270427306, + "learning_rate": 2.9277723132700392e-05, + "loss": 0.6625, + "step": 17170 + }, + { + "epoch": 0.5012403327010069, + "grad_norm": 1.1275671835770074, + "learning_rate": 2.9266899163610173e-05, + "loss": 0.5694, + "step": 17175 + }, + { + "epoch": 0.501386254195243, + "grad_norm": 1.301754425842359, + "learning_rate": 2.9256074783064492e-05, + "loss": 0.616, + "step": 17180 + }, + { + "epoch": 0.501532175689479, + "grad_norm": 0.9801953455936939, + "learning_rate": 2.9245249993583996e-05, + "loss": 0.6054, + "step": 17185 + }, + { + "epoch": 0.5016780971837151, + "grad_norm": 1.3169724917244068, + "learning_rate": 2.9234424797689448e-05, + "loss": 0.5957, + "step": 17190 + }, + { + "epoch": 0.5018240186779512, + "grad_norm": 1.0964113878658048, + "learning_rate": 2.9223599197901692e-05, + "loss": 0.5315, + "step": 17195 + }, + { + "epoch": 0.5019699401721873, + "grad_norm": 1.116645100721211, + "learning_rate": 2.9212773196741673e-05, + "loss": 0.6488, + "step": 17200 + }, + { + "epoch": 0.5021158616664234, + "grad_norm": 1.1340473647607214, + "learning_rate": 2.9201946796730417e-05, + "loss": 0.5862, + "step": 17205 + }, + { + "epoch": 0.5022617831606596, + "grad_norm": 0.9973146248286466, + "learning_rate": 2.9191120000389066e-05, + "loss": 0.543, + "step": 17210 + }, + { + "epoch": 0.5024077046548957, + "grad_norm": 0.8232630283002779, + "learning_rate": 2.9180292810238836e-05, + "loss": 0.5329, + "step": 17215 + }, + { + "epoch": 0.5025536261491318, + "grad_norm": 1.0003801033715258, + "learning_rate": 2.916946522880104e-05, + "loss": 0.5445, + "step": 17220 + }, + { + "epoch": 0.5026995476433679, + "grad_norm": 0.8959837413992002, + "learning_rate": 2.91586372585971e-05, + "loss": 0.5692, + "step": 17225 + }, + { + "epoch": 0.502845469137604, + "grad_norm": 1.0163054443683688, + "learning_rate": 2.9147808902148488e-05, + "loss": 0.6168, + "step": 17230 + }, + { + "epoch": 0.5029913906318401, + "grad_norm": 1.1127307451846904, + "learning_rate": 2.9136980161976797e-05, + "loss": 0.6375, + "step": 17235 + }, + { + "epoch": 0.5031373121260762, + "grad_norm": 1.076284732878839, + "learning_rate": 2.91261510406037e-05, + "loss": 0.5912, + "step": 17240 + }, + { + "epoch": 0.5032832336203122, + "grad_norm": 1.0139824967738633, + "learning_rate": 2.911532154055096e-05, + "loss": 0.5759, + "step": 17245 + }, + { + "epoch": 0.5034291551145483, + "grad_norm": 1.19276931955239, + "learning_rate": 2.9104491664340434e-05, + "loss": 0.6087, + "step": 17250 + }, + { + "epoch": 0.5035750766087844, + "grad_norm": 1.0511522848016808, + "learning_rate": 2.9093661414494057e-05, + "loss": 0.5587, + "step": 17255 + }, + { + "epoch": 0.5037209981030206, + "grad_norm": 0.9211614362139381, + "learning_rate": 2.9082830793533856e-05, + "loss": 0.5971, + "step": 17260 + }, + { + "epoch": 0.5038669195972567, + "grad_norm": 1.0145505004904072, + "learning_rate": 2.907199980398194e-05, + "loss": 0.5494, + "step": 17265 + }, + { + "epoch": 0.5040128410914928, + "grad_norm": 0.9686619057838534, + "learning_rate": 2.906116844836051e-05, + "loss": 0.5692, + "step": 17270 + }, + { + "epoch": 0.5041587625857289, + "grad_norm": 1.0442973920811878, + "learning_rate": 2.9050336729191857e-05, + "loss": 0.5579, + "step": 17275 + }, + { + "epoch": 0.504304684079965, + "grad_norm": 1.1239191601778034, + "learning_rate": 2.9039504648998328e-05, + "loss": 0.569, + "step": 17280 + }, + { + "epoch": 0.5044506055742011, + "grad_norm": 1.2276443448948418, + "learning_rate": 2.90286722103024e-05, + "loss": 0.5255, + "step": 17285 + }, + { + "epoch": 0.5045965270684372, + "grad_norm": 1.2029144942229784, + "learning_rate": 2.9017839415626584e-05, + "loss": 0.5834, + "step": 17290 + }, + { + "epoch": 0.5047424485626733, + "grad_norm": 0.9857402848906581, + "learning_rate": 2.9007006267493525e-05, + "loss": 0.5827, + "step": 17295 + }, + { + "epoch": 0.5048883700569093, + "grad_norm": 1.328432638628235, + "learning_rate": 2.8996172768425907e-05, + "loss": 0.5532, + "step": 17300 + }, + { + "epoch": 0.5050342915511454, + "grad_norm": 1.0045094529702179, + "learning_rate": 2.898533892094651e-05, + "loss": 0.5265, + "step": 17305 + }, + { + "epoch": 0.5051802130453816, + "grad_norm": 0.9651319526000371, + "learning_rate": 2.897450472757821e-05, + "loss": 0.6008, + "step": 17310 + }, + { + "epoch": 0.5053261345396177, + "grad_norm": 1.0001540404332667, + "learning_rate": 2.896367019084394e-05, + "loss": 0.6061, + "step": 17315 + }, + { + "epoch": 0.5054720560338538, + "grad_norm": 1.0646427289389395, + "learning_rate": 2.895283531326674e-05, + "loss": 0.5672, + "step": 17320 + }, + { + "epoch": 0.5056179775280899, + "grad_norm": 0.9596517328676479, + "learning_rate": 2.8942000097369688e-05, + "loss": 0.6075, + "step": 17325 + }, + { + "epoch": 0.505763899022326, + "grad_norm": 1.060332198099877, + "learning_rate": 2.893116454567599e-05, + "loss": 0.5943, + "step": 17330 + }, + { + "epoch": 0.5059098205165621, + "grad_norm": 1.0262001565236243, + "learning_rate": 2.8920328660708896e-05, + "loss": 0.5723, + "step": 17335 + }, + { + "epoch": 0.5060557420107982, + "grad_norm": 1.29144940433142, + "learning_rate": 2.8909492444991747e-05, + "loss": 0.5843, + "step": 17340 + }, + { + "epoch": 0.5062016635050343, + "grad_norm": 1.0781561513046394, + "learning_rate": 2.8898655901047962e-05, + "loss": 0.5963, + "step": 17345 + }, + { + "epoch": 0.5063475849992704, + "grad_norm": 1.117393026989122, + "learning_rate": 2.888781903140102e-05, + "loss": 0.5466, + "step": 17350 + }, + { + "epoch": 0.5064935064935064, + "grad_norm": 1.0736764317057645, + "learning_rate": 2.8876981838574503e-05, + "loss": 0.6062, + "step": 17355 + }, + { + "epoch": 0.5066394279877426, + "grad_norm": 1.17194833290902, + "learning_rate": 2.8866144325092043e-05, + "loss": 0.6087, + "step": 17360 + }, + { + "epoch": 0.5067853494819787, + "grad_norm": 0.9703464444738195, + "learning_rate": 2.8855306493477358e-05, + "loss": 0.6191, + "step": 17365 + }, + { + "epoch": 0.5069312709762148, + "grad_norm": 0.9410884438624202, + "learning_rate": 2.8844468346254255e-05, + "loss": 0.6091, + "step": 17370 + }, + { + "epoch": 0.5070771924704509, + "grad_norm": 1.2207950256953726, + "learning_rate": 2.8833629885946577e-05, + "loss": 0.6219, + "step": 17375 + }, + { + "epoch": 0.507223113964687, + "grad_norm": 0.8974910594482379, + "learning_rate": 2.882279111507828e-05, + "loss": 0.5796, + "step": 17380 + }, + { + "epoch": 0.5073690354589231, + "grad_norm": 1.1175635016786833, + "learning_rate": 2.8811952036173355e-05, + "loss": 0.6583, + "step": 17385 + }, + { + "epoch": 0.5075149569531592, + "grad_norm": 0.9872565826231327, + "learning_rate": 2.8801112651755905e-05, + "loss": 0.5826, + "step": 17390 + }, + { + "epoch": 0.5076608784473953, + "grad_norm": 1.1222368721191303, + "learning_rate": 2.8790272964350068e-05, + "loss": 0.599, + "step": 17395 + }, + { + "epoch": 0.5078067999416314, + "grad_norm": 0.9814023266013103, + "learning_rate": 2.8779432976480068e-05, + "loss": 0.5664, + "step": 17400 + }, + { + "epoch": 0.5079527214358674, + "grad_norm": 1.0228487372019415, + "learning_rate": 2.876859269067022e-05, + "loss": 0.546, + "step": 17405 + }, + { + "epoch": 0.5080986429301036, + "grad_norm": 1.092224997464901, + "learning_rate": 2.8757752109444863e-05, + "loss": 0.6167, + "step": 17410 + }, + { + "epoch": 0.5082445644243397, + "grad_norm": 1.1356116876873803, + "learning_rate": 2.8746911235328434e-05, + "loss": 0.5678, + "step": 17415 + }, + { + "epoch": 0.5083904859185758, + "grad_norm": 1.037995364495429, + "learning_rate": 2.8736070070845438e-05, + "loss": 0.6021, + "step": 17420 + }, + { + "epoch": 0.5085364074128119, + "grad_norm": 1.1460497432059433, + "learning_rate": 2.872522861852044e-05, + "loss": 0.6667, + "step": 17425 + }, + { + "epoch": 0.508682328907048, + "grad_norm": 1.0397089702505213, + "learning_rate": 2.871438688087807e-05, + "loss": 0.5932, + "step": 17430 + }, + { + "epoch": 0.5088282504012841, + "grad_norm": 0.945489975867667, + "learning_rate": 2.8703544860443038e-05, + "loss": 0.616, + "step": 17435 + }, + { + "epoch": 0.5089741718955202, + "grad_norm": 0.9287097347610275, + "learning_rate": 2.869270255974011e-05, + "loss": 0.5479, + "step": 17440 + }, + { + "epoch": 0.5091200933897563, + "grad_norm": 1.0789326227101652, + "learning_rate": 2.8681859981294112e-05, + "loss": 0.5944, + "step": 17445 + }, + { + "epoch": 0.5092660148839924, + "grad_norm": 0.9248640116604638, + "learning_rate": 2.8671017127629946e-05, + "loss": 0.5989, + "step": 17450 + }, + { + "epoch": 0.5094119363782285, + "grad_norm": 0.9435524458654166, + "learning_rate": 2.866017400127256e-05, + "loss": 0.5973, + "step": 17455 + }, + { + "epoch": 0.5095578578724647, + "grad_norm": 0.8594193716207245, + "learning_rate": 2.8649330604746998e-05, + "loss": 0.5361, + "step": 17460 + }, + { + "epoch": 0.5097037793667007, + "grad_norm": 1.1668699217409337, + "learning_rate": 2.863848694057834e-05, + "loss": 0.5746, + "step": 17465 + }, + { + "epoch": 0.5098497008609368, + "grad_norm": 1.0228725378398382, + "learning_rate": 2.8627643011291733e-05, + "loss": 0.5623, + "step": 17470 + }, + { + "epoch": 0.5099956223551729, + "grad_norm": 1.230049338564838, + "learning_rate": 2.8616798819412393e-05, + "loss": 0.6605, + "step": 17475 + }, + { + "epoch": 0.510141543849409, + "grad_norm": 0.9505145364168149, + "learning_rate": 2.8605954367465587e-05, + "loss": 0.5223, + "step": 17480 + }, + { + "epoch": 0.5102874653436451, + "grad_norm": 1.0710982882944622, + "learning_rate": 2.8595109657976655e-05, + "loss": 0.5854, + "step": 17485 + }, + { + "epoch": 0.5104333868378812, + "grad_norm": 1.2372012507606343, + "learning_rate": 2.858426469347098e-05, + "loss": 0.6904, + "step": 17490 + }, + { + "epoch": 0.5105793083321173, + "grad_norm": 0.9376638153286584, + "learning_rate": 2.8573419476474016e-05, + "loss": 0.5832, + "step": 17495 + }, + { + "epoch": 0.5107252298263534, + "grad_norm": 1.0187019226979341, + "learning_rate": 2.8562574009511294e-05, + "loss": 0.5794, + "step": 17500 + }, + { + "epoch": 0.5108711513205896, + "grad_norm": 1.1340581306982422, + "learning_rate": 2.8551728295108355e-05, + "loss": 0.581, + "step": 17505 + }, + { + "epoch": 0.5110170728148257, + "grad_norm": 1.086329309984347, + "learning_rate": 2.8540882335790847e-05, + "loss": 0.5763, + "step": 17510 + }, + { + "epoch": 0.5111629943090618, + "grad_norm": 1.0538057847745117, + "learning_rate": 2.8530036134084443e-05, + "loss": 0.5662, + "step": 17515 + }, + { + "epoch": 0.5113089158032978, + "grad_norm": 1.1087439246944333, + "learning_rate": 2.8519189692514893e-05, + "loss": 0.5533, + "step": 17520 + }, + { + "epoch": 0.5114548372975339, + "grad_norm": 1.034907845803554, + "learning_rate": 2.850834301360798e-05, + "loss": 0.5684, + "step": 17525 + }, + { + "epoch": 0.51160075879177, + "grad_norm": 1.1065832531853204, + "learning_rate": 2.8497496099889564e-05, + "loss": 0.5744, + "step": 17530 + }, + { + "epoch": 0.5117466802860061, + "grad_norm": 1.1935555148762191, + "learning_rate": 2.8486648953885558e-05, + "loss": 0.6193, + "step": 17535 + }, + { + "epoch": 0.5118926017802422, + "grad_norm": 1.2160283155081548, + "learning_rate": 2.8475801578121906e-05, + "loss": 0.5721, + "step": 17540 + }, + { + "epoch": 0.5120385232744783, + "grad_norm": 1.0087517141440323, + "learning_rate": 2.8464953975124635e-05, + "loss": 0.5693, + "step": 17545 + }, + { + "epoch": 0.5121844447687144, + "grad_norm": 1.046991254894614, + "learning_rate": 2.845410614741982e-05, + "loss": 0.5483, + "step": 17550 + }, + { + "epoch": 0.5123303662629506, + "grad_norm": 1.2566785090116803, + "learning_rate": 2.8443258097533555e-05, + "loss": 0.565, + "step": 17555 + }, + { + "epoch": 0.5124762877571867, + "grad_norm": 1.030136651257968, + "learning_rate": 2.843240982799203e-05, + "loss": 0.5895, + "step": 17560 + }, + { + "epoch": 0.5126222092514228, + "grad_norm": 1.0161408067941047, + "learning_rate": 2.8421561341321463e-05, + "loss": 0.6342, + "step": 17565 + }, + { + "epoch": 0.5127681307456589, + "grad_norm": 1.0011792332447376, + "learning_rate": 2.8410712640048126e-05, + "loss": 0.557, + "step": 17570 + }, + { + "epoch": 0.512914052239895, + "grad_norm": 1.2349378635849713, + "learning_rate": 2.8399863726698335e-05, + "loss": 0.5977, + "step": 17575 + }, + { + "epoch": 0.513059973734131, + "grad_norm": 1.0698154914571276, + "learning_rate": 2.8389014603798475e-05, + "loss": 0.5276, + "step": 17580 + }, + { + "epoch": 0.5132058952283671, + "grad_norm": 1.0035111125310359, + "learning_rate": 2.8378165273874957e-05, + "loss": 0.5122, + "step": 17585 + }, + { + "epoch": 0.5133518167226032, + "grad_norm": 1.0462222071126355, + "learning_rate": 2.836731573945426e-05, + "loss": 0.5474, + "step": 17590 + }, + { + "epoch": 0.5134977382168393, + "grad_norm": 0.9428497527551408, + "learning_rate": 2.8356466003062894e-05, + "loss": 0.5983, + "step": 17595 + }, + { + "epoch": 0.5136436597110754, + "grad_norm": 0.9175667941487953, + "learning_rate": 2.8345616067227415e-05, + "loss": 0.5236, + "step": 17600 + }, + { + "epoch": 0.5137895812053116, + "grad_norm": 1.0710161454231686, + "learning_rate": 2.8334765934474448e-05, + "loss": 0.6276, + "step": 17605 + }, + { + "epoch": 0.5139355026995477, + "grad_norm": 1.1499209811244444, + "learning_rate": 2.8323915607330635e-05, + "loss": 0.6508, + "step": 17610 + }, + { + "epoch": 0.5140814241937838, + "grad_norm": 1.3424452013333985, + "learning_rate": 2.8313065088322693e-05, + "loss": 0.6246, + "step": 17615 + }, + { + "epoch": 0.5142273456880199, + "grad_norm": 1.1100830168771811, + "learning_rate": 2.8302214379977354e-05, + "loss": 0.5859, + "step": 17620 + }, + { + "epoch": 0.514373267182256, + "grad_norm": 0.9934520833845969, + "learning_rate": 2.8291363484821422e-05, + "loss": 0.5781, + "step": 17625 + }, + { + "epoch": 0.514519188676492, + "grad_norm": 0.9340852176261393, + "learning_rate": 2.828051240538172e-05, + "loss": 0.57, + "step": 17630 + }, + { + "epoch": 0.5146651101707281, + "grad_norm": 1.13110737250981, + "learning_rate": 2.826966114418512e-05, + "loss": 0.5069, + "step": 17635 + }, + { + "epoch": 0.5148110316649642, + "grad_norm": 1.1937072726531144, + "learning_rate": 2.8258809703758553e-05, + "loss": 0.5909, + "step": 17640 + }, + { + "epoch": 0.5149569531592003, + "grad_norm": 1.1477588448061466, + "learning_rate": 2.8247958086628973e-05, + "loss": 0.6076, + "step": 17645 + }, + { + "epoch": 0.5151028746534364, + "grad_norm": 0.9959273415041177, + "learning_rate": 2.8237106295323384e-05, + "loss": 0.5674, + "step": 17650 + }, + { + "epoch": 0.5152487961476726, + "grad_norm": 1.0391448175149716, + "learning_rate": 2.8226254332368822e-05, + "loss": 0.5276, + "step": 17655 + }, + { + "epoch": 0.5153947176419087, + "grad_norm": 1.1358969477768783, + "learning_rate": 2.821540220029238e-05, + "loss": 0.6381, + "step": 17660 + }, + { + "epoch": 0.5155406391361448, + "grad_norm": 1.0831653988255865, + "learning_rate": 2.8204549901621173e-05, + "loss": 0.6323, + "step": 17665 + }, + { + "epoch": 0.5156865606303809, + "grad_norm": 0.9107220459306797, + "learning_rate": 2.8193697438882365e-05, + "loss": 0.6691, + "step": 17670 + }, + { + "epoch": 0.515832482124617, + "grad_norm": 1.0772234751138587, + "learning_rate": 2.8182844814603153e-05, + "loss": 0.6064, + "step": 17675 + }, + { + "epoch": 0.515978403618853, + "grad_norm": 1.1700468855762125, + "learning_rate": 2.817199203131076e-05, + "loss": 0.5742, + "step": 17680 + }, + { + "epoch": 0.5161243251130891, + "grad_norm": 1.0484436757591806, + "learning_rate": 2.816113909153249e-05, + "loss": 0.5272, + "step": 17685 + }, + { + "epoch": 0.5162702466073252, + "grad_norm": 1.2667027710767873, + "learning_rate": 2.8150285997795623e-05, + "loss": 0.6175, + "step": 17690 + }, + { + "epoch": 0.5164161681015613, + "grad_norm": 1.0548303782894322, + "learning_rate": 2.8139432752627525e-05, + "loss": 0.556, + "step": 17695 + }, + { + "epoch": 0.5165620895957974, + "grad_norm": 1.1023287926030914, + "learning_rate": 2.812857935855556e-05, + "loss": 0.6358, + "step": 17700 + }, + { + "epoch": 0.5167080110900336, + "grad_norm": 1.0250801973355461, + "learning_rate": 2.8117725818107153e-05, + "loss": 0.5528, + "step": 17705 + }, + { + "epoch": 0.5168539325842697, + "grad_norm": 1.043629832466394, + "learning_rate": 2.810687213380976e-05, + "loss": 0.5889, + "step": 17710 + }, + { + "epoch": 0.5169998540785058, + "grad_norm": 1.1944446589197606, + "learning_rate": 2.809601830819084e-05, + "loss": 0.6054, + "step": 17715 + }, + { + "epoch": 0.5171457755727419, + "grad_norm": 1.2055329412248967, + "learning_rate": 2.8085164343777935e-05, + "loss": 0.6046, + "step": 17720 + }, + { + "epoch": 0.517291697066978, + "grad_norm": 1.3661658444564035, + "learning_rate": 2.8074310243098585e-05, + "loss": 0.6454, + "step": 17725 + }, + { + "epoch": 0.5174376185612141, + "grad_norm": 0.9163626163591251, + "learning_rate": 2.8063456008680366e-05, + "loss": 0.5323, + "step": 17730 + }, + { + "epoch": 0.5175835400554502, + "grad_norm": 1.1353270728921936, + "learning_rate": 2.8052601643050896e-05, + "loss": 0.6316, + "step": 17735 + }, + { + "epoch": 0.5177294615496862, + "grad_norm": 1.0294023526885498, + "learning_rate": 2.8041747148737808e-05, + "loss": 0.564, + "step": 17740 + }, + { + "epoch": 0.5178753830439223, + "grad_norm": 1.1887120183300548, + "learning_rate": 2.8030892528268782e-05, + "loss": 0.6252, + "step": 17745 + }, + { + "epoch": 0.5180213045381584, + "grad_norm": 1.00884804778949, + "learning_rate": 2.802003778417151e-05, + "loss": 0.5356, + "step": 17750 + }, + { + "epoch": 0.5181672260323946, + "grad_norm": 0.9203056843011136, + "learning_rate": 2.800918291897372e-05, + "loss": 0.5749, + "step": 17755 + }, + { + "epoch": 0.5183131475266307, + "grad_norm": 1.0578741002197878, + "learning_rate": 2.7998327935203195e-05, + "loss": 0.5998, + "step": 17760 + }, + { + "epoch": 0.5184590690208668, + "grad_norm": 0.9952829087159073, + "learning_rate": 2.7987472835387697e-05, + "loss": 0.6376, + "step": 17765 + }, + { + "epoch": 0.5186049905151029, + "grad_norm": 1.057399174960197, + "learning_rate": 2.797661762205504e-05, + "loss": 0.5895, + "step": 17770 + }, + { + "epoch": 0.518750912009339, + "grad_norm": 1.1917805036341311, + "learning_rate": 2.7965762297733072e-05, + "loss": 0.6051, + "step": 17775 + }, + { + "epoch": 0.5188968335035751, + "grad_norm": 1.0186513632770766, + "learning_rate": 2.7954906864949658e-05, + "loss": 0.5908, + "step": 17780 + }, + { + "epoch": 0.5190427549978112, + "grad_norm": 1.234953846598363, + "learning_rate": 2.7944051326232678e-05, + "loss": 0.6182, + "step": 17785 + }, + { + "epoch": 0.5191886764920473, + "grad_norm": 0.9683449140657164, + "learning_rate": 2.7933195684110048e-05, + "loss": 0.5714, + "step": 17790 + }, + { + "epoch": 0.5193345979862833, + "grad_norm": 0.9991210469099925, + "learning_rate": 2.792233994110972e-05, + "loss": 0.5679, + "step": 17795 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 1.0102242166801076, + "learning_rate": 2.7911484099759642e-05, + "loss": 0.6158, + "step": 17800 + }, + { + "epoch": 0.5196264409747556, + "grad_norm": 1.1314710282394644, + "learning_rate": 2.7900628162587824e-05, + "loss": 0.6181, + "step": 17805 + }, + { + "epoch": 0.5197723624689917, + "grad_norm": 0.9824366087518185, + "learning_rate": 2.7889772132122244e-05, + "loss": 0.6119, + "step": 17810 + }, + { + "epoch": 0.5199182839632278, + "grad_norm": 1.2106457589656532, + "learning_rate": 2.787891601089094e-05, + "loss": 0.5836, + "step": 17815 + }, + { + "epoch": 0.5200642054574639, + "grad_norm": 0.8705621690175652, + "learning_rate": 2.786805980142197e-05, + "loss": 0.5748, + "step": 17820 + }, + { + "epoch": 0.5202101269517, + "grad_norm": 0.8682835390098648, + "learning_rate": 2.7857203506243396e-05, + "loss": 0.5666, + "step": 17825 + }, + { + "epoch": 0.5203560484459361, + "grad_norm": 0.9142846059466999, + "learning_rate": 2.784634712788332e-05, + "loss": 0.548, + "step": 17830 + }, + { + "epoch": 0.5205019699401722, + "grad_norm": 0.8955753944313541, + "learning_rate": 2.7835490668869847e-05, + "loss": 0.5036, + "step": 17835 + }, + { + "epoch": 0.5206478914344083, + "grad_norm": 0.9681870383507004, + "learning_rate": 2.7824634131731104e-05, + "loss": 0.5781, + "step": 17840 + }, + { + "epoch": 0.5207938129286444, + "grad_norm": 1.0304916224598168, + "learning_rate": 2.7813777518995242e-05, + "loss": 0.5827, + "step": 17845 + }, + { + "epoch": 0.5209397344228804, + "grad_norm": 0.9067291131415092, + "learning_rate": 2.7802920833190415e-05, + "loss": 0.5749, + "step": 17850 + }, + { + "epoch": 0.5210856559171166, + "grad_norm": 1.246931610098259, + "learning_rate": 2.7792064076844827e-05, + "loss": 0.556, + "step": 17855 + }, + { + "epoch": 0.5212315774113527, + "grad_norm": 1.1713973596950293, + "learning_rate": 2.7781207252486657e-05, + "loss": 0.5852, + "step": 17860 + }, + { + "epoch": 0.5213774989055888, + "grad_norm": 1.1311884209914094, + "learning_rate": 2.7770350362644133e-05, + "loss": 0.5603, + "step": 17865 + }, + { + "epoch": 0.5215234203998249, + "grad_norm": 1.1811047801665386, + "learning_rate": 2.7759493409845465e-05, + "loss": 0.5867, + "step": 17870 + }, + { + "epoch": 0.521669341894061, + "grad_norm": 0.9313827278867771, + "learning_rate": 2.774863639661892e-05, + "loss": 0.5281, + "step": 17875 + }, + { + "epoch": 0.5218152633882971, + "grad_norm": 1.0879431842096456, + "learning_rate": 2.7737779325492745e-05, + "loss": 0.6127, + "step": 17880 + }, + { + "epoch": 0.5219611848825332, + "grad_norm": 0.9248057638527426, + "learning_rate": 2.7726922198995213e-05, + "loss": 0.5508, + "step": 17885 + }, + { + "epoch": 0.5221071063767693, + "grad_norm": 1.181503588169849, + "learning_rate": 2.7716065019654608e-05, + "loss": 0.5795, + "step": 17890 + }, + { + "epoch": 0.5222530278710054, + "grad_norm": 1.1629537044355318, + "learning_rate": 2.770520778999922e-05, + "loss": 0.5591, + "step": 17895 + }, + { + "epoch": 0.5223989493652414, + "grad_norm": 1.006313492054334, + "learning_rate": 2.7694350512557377e-05, + "loss": 0.5426, + "step": 17900 + }, + { + "epoch": 0.5225448708594777, + "grad_norm": 1.0427865429987986, + "learning_rate": 2.7683493189857384e-05, + "loss": 0.6002, + "step": 17905 + }, + { + "epoch": 0.5226907923537137, + "grad_norm": 0.9388058197858173, + "learning_rate": 2.767263582442757e-05, + "loss": 0.5468, + "step": 17910 + }, + { + "epoch": 0.5228367138479498, + "grad_norm": 0.9567632746630504, + "learning_rate": 2.766177841879629e-05, + "loss": 0.5749, + "step": 17915 + }, + { + "epoch": 0.5229826353421859, + "grad_norm": 0.9371542371492836, + "learning_rate": 2.765092097549187e-05, + "loss": 0.5761, + "step": 17920 + }, + { + "epoch": 0.523128556836422, + "grad_norm": 0.9180796385506896, + "learning_rate": 2.76400634970427e-05, + "loss": 0.5573, + "step": 17925 + }, + { + "epoch": 0.5232744783306581, + "grad_norm": 0.99396388799466, + "learning_rate": 2.7629205985977115e-05, + "loss": 0.5328, + "step": 17930 + }, + { + "epoch": 0.5234203998248942, + "grad_norm": 1.0635640291886472, + "learning_rate": 2.761834844482351e-05, + "loss": 0.5835, + "step": 17935 + }, + { + "epoch": 0.5235663213191303, + "grad_norm": 1.010000245062592, + "learning_rate": 2.7607490876110252e-05, + "loss": 0.567, + "step": 17940 + }, + { + "epoch": 0.5237122428133664, + "grad_norm": 0.9038317069670091, + "learning_rate": 2.759663328236574e-05, + "loss": 0.514, + "step": 17945 + }, + { + "epoch": 0.5238581643076025, + "grad_norm": 1.0290717981877628, + "learning_rate": 2.7585775666118363e-05, + "loss": 0.5362, + "step": 17950 + }, + { + "epoch": 0.5240040858018387, + "grad_norm": 1.0183765847627329, + "learning_rate": 2.7574918029896518e-05, + "loss": 0.6522, + "step": 17955 + }, + { + "epoch": 0.5241500072960747, + "grad_norm": 0.870909947389803, + "learning_rate": 2.7564060376228616e-05, + "loss": 0.5514, + "step": 17960 + }, + { + "epoch": 0.5242959287903108, + "grad_norm": 1.0534903074153465, + "learning_rate": 2.7553202707643055e-05, + "loss": 0.6032, + "step": 17965 + }, + { + "epoch": 0.5244418502845469, + "grad_norm": 1.349471714667913, + "learning_rate": 2.7542345026668255e-05, + "loss": 0.5419, + "step": 17970 + }, + { + "epoch": 0.524587771778783, + "grad_norm": 1.1253828129285237, + "learning_rate": 2.7531487335832612e-05, + "loss": 0.5845, + "step": 17975 + }, + { + "epoch": 0.5247336932730191, + "grad_norm": 0.9818731466670688, + "learning_rate": 2.7520629637664568e-05, + "loss": 0.5717, + "step": 17980 + }, + { + "epoch": 0.5248796147672552, + "grad_norm": 1.1678802119041052, + "learning_rate": 2.750977193469253e-05, + "loss": 0.5859, + "step": 17985 + }, + { + "epoch": 0.5250255362614913, + "grad_norm": 0.8589280890552412, + "learning_rate": 2.7498914229444905e-05, + "loss": 0.5325, + "step": 17990 + }, + { + "epoch": 0.5251714577557274, + "grad_norm": 1.1247062103077785, + "learning_rate": 2.7488056524450124e-05, + "loss": 0.5986, + "step": 17995 + }, + { + "epoch": 0.5253173792499635, + "grad_norm": 1.1271651355640557, + "learning_rate": 2.74771988222366e-05, + "loss": 0.5751, + "step": 18000 + }, + { + "epoch": 0.5254633007441997, + "grad_norm": 1.0204284137484032, + "learning_rate": 2.7466341125332755e-05, + "loss": 0.542, + "step": 18005 + }, + { + "epoch": 0.5256092222384358, + "grad_norm": 1.053005753691077, + "learning_rate": 2.745548343626701e-05, + "loss": 0.5795, + "step": 18010 + }, + { + "epoch": 0.5257551437326718, + "grad_norm": 1.105690091608904, + "learning_rate": 2.744462575756779e-05, + "loss": 0.6801, + "step": 18015 + }, + { + "epoch": 0.5259010652269079, + "grad_norm": 1.178388275086113, + "learning_rate": 2.743376809176349e-05, + "loss": 0.6182, + "step": 18020 + }, + { + "epoch": 0.526046986721144, + "grad_norm": 0.9457427140225922, + "learning_rate": 2.742291044138252e-05, + "loss": 0.558, + "step": 18025 + }, + { + "epoch": 0.5261929082153801, + "grad_norm": 0.9708408525287296, + "learning_rate": 2.7412052808953297e-05, + "loss": 0.5676, + "step": 18030 + }, + { + "epoch": 0.5263388297096162, + "grad_norm": 1.0559828569230834, + "learning_rate": 2.740119519700423e-05, + "loss": 0.5693, + "step": 18035 + }, + { + "epoch": 0.5264847512038523, + "grad_norm": 0.9354915664613574, + "learning_rate": 2.7390337608063698e-05, + "loss": 0.5494, + "step": 18040 + }, + { + "epoch": 0.5266306726980884, + "grad_norm": 1.061313471885252, + "learning_rate": 2.7379480044660093e-05, + "loss": 0.6198, + "step": 18045 + }, + { + "epoch": 0.5267765941923245, + "grad_norm": 1.1000362431286346, + "learning_rate": 2.7368622509321824e-05, + "loss": 0.5672, + "step": 18050 + }, + { + "epoch": 0.5269225156865607, + "grad_norm": 0.9563107888921746, + "learning_rate": 2.7357765004577253e-05, + "loss": 0.5479, + "step": 18055 + }, + { + "epoch": 0.5270684371807968, + "grad_norm": 1.009484929738099, + "learning_rate": 2.7346907532954757e-05, + "loss": 0.4874, + "step": 18060 + }, + { + "epoch": 0.5272143586750329, + "grad_norm": 1.1151126734438044, + "learning_rate": 2.7336050096982708e-05, + "loss": 0.5965, + "step": 18065 + }, + { + "epoch": 0.527360280169269, + "grad_norm": 1.118748882626728, + "learning_rate": 2.7325192699189444e-05, + "loss": 0.5801, + "step": 18070 + }, + { + "epoch": 0.527506201663505, + "grad_norm": 1.0036223727339522, + "learning_rate": 2.7314335342103332e-05, + "loss": 0.5876, + "step": 18075 + }, + { + "epoch": 0.5276521231577411, + "grad_norm": 1.3129092811257328, + "learning_rate": 2.7303478028252693e-05, + "loss": 0.6509, + "step": 18080 + }, + { + "epoch": 0.5277980446519772, + "grad_norm": 1.14082140899177, + "learning_rate": 2.729262076016588e-05, + "loss": 0.6252, + "step": 18085 + }, + { + "epoch": 0.5279439661462133, + "grad_norm": 0.9213499227605806, + "learning_rate": 2.728176354037119e-05, + "loss": 0.5726, + "step": 18090 + }, + { + "epoch": 0.5280898876404494, + "grad_norm": 0.8878894157992827, + "learning_rate": 2.7270906371396944e-05, + "loss": 0.5358, + "step": 18095 + }, + { + "epoch": 0.5282358091346855, + "grad_norm": 1.1537371144424056, + "learning_rate": 2.7260049255771414e-05, + "loss": 0.5642, + "step": 18100 + }, + { + "epoch": 0.5283817306289217, + "grad_norm": 1.1289761175928628, + "learning_rate": 2.72491921960229e-05, + "loss": 0.6012, + "step": 18105 + }, + { + "epoch": 0.5285276521231578, + "grad_norm": 0.9920248849199294, + "learning_rate": 2.7238335194679676e-05, + "loss": 0.4891, + "step": 18110 + }, + { + "epoch": 0.5286735736173939, + "grad_norm": 0.9307938024749491, + "learning_rate": 2.7227478254269976e-05, + "loss": 0.553, + "step": 18115 + }, + { + "epoch": 0.52881949511163, + "grad_norm": 1.0576176657405045, + "learning_rate": 2.7216621377322062e-05, + "loss": 0.6572, + "step": 18120 + }, + { + "epoch": 0.528965416605866, + "grad_norm": 1.0177275315593213, + "learning_rate": 2.720576456636415e-05, + "loss": 0.5535, + "step": 18125 + }, + { + "epoch": 0.5291113381001021, + "grad_norm": 0.8761674993888076, + "learning_rate": 2.7194907823924464e-05, + "loss": 0.5118, + "step": 18130 + }, + { + "epoch": 0.5292572595943382, + "grad_norm": 1.2133053221102292, + "learning_rate": 2.718405115253118e-05, + "loss": 0.5895, + "step": 18135 + }, + { + "epoch": 0.5294031810885743, + "grad_norm": 1.055325401261371, + "learning_rate": 2.7173194554712485e-05, + "loss": 0.5534, + "step": 18140 + }, + { + "epoch": 0.5295491025828104, + "grad_norm": 1.0648408707662704, + "learning_rate": 2.7162338032996544e-05, + "loss": 0.5392, + "step": 18145 + }, + { + "epoch": 0.5296950240770465, + "grad_norm": 1.1425983438174918, + "learning_rate": 2.7151481589911492e-05, + "loss": 0.5726, + "step": 18150 + }, + { + "epoch": 0.5298409455712827, + "grad_norm": 0.9676007574513101, + "learning_rate": 2.7140625227985478e-05, + "loss": 0.6483, + "step": 18155 + }, + { + "epoch": 0.5299868670655188, + "grad_norm": 1.073930229541028, + "learning_rate": 2.712976894974658e-05, + "loss": 0.5833, + "step": 18160 + }, + { + "epoch": 0.5301327885597549, + "grad_norm": 1.043407332235955, + "learning_rate": 2.7118912757722896e-05, + "loss": 0.6067, + "step": 18165 + }, + { + "epoch": 0.530278710053991, + "grad_norm": 1.163245273461599, + "learning_rate": 2.7108056654442492e-05, + "loss": 0.6643, + "step": 18170 + }, + { + "epoch": 0.530424631548227, + "grad_norm": 1.0601916215571352, + "learning_rate": 2.7097200642433418e-05, + "loss": 0.5444, + "step": 18175 + }, + { + "epoch": 0.5305705530424631, + "grad_norm": 0.9801836412830268, + "learning_rate": 2.7086344724223713e-05, + "loss": 0.5907, + "step": 18180 + }, + { + "epoch": 0.5307164745366992, + "grad_norm": 0.9459059497626972, + "learning_rate": 2.7075488902341345e-05, + "loss": 0.5692, + "step": 18185 + }, + { + "epoch": 0.5308623960309353, + "grad_norm": 1.0340348833628825, + "learning_rate": 2.706463317931432e-05, + "loss": 0.6456, + "step": 18190 + }, + { + "epoch": 0.5310083175251714, + "grad_norm": 0.9488384241803544, + "learning_rate": 2.705377755767059e-05, + "loss": 0.5355, + "step": 18195 + }, + { + "epoch": 0.5311542390194075, + "grad_norm": 1.2440057463960081, + "learning_rate": 2.704292203993809e-05, + "loss": 0.5744, + "step": 18200 + }, + { + "epoch": 0.5313001605136437, + "grad_norm": 1.1103911197502438, + "learning_rate": 2.7032066628644738e-05, + "loss": 0.5997, + "step": 18205 + }, + { + "epoch": 0.5314460820078798, + "grad_norm": 0.9576074578486755, + "learning_rate": 2.70212113263184e-05, + "loss": 0.5129, + "step": 18210 + }, + { + "epoch": 0.5315920035021159, + "grad_norm": 0.9557281373829349, + "learning_rate": 2.7010356135486953e-05, + "loss": 0.5518, + "step": 18215 + }, + { + "epoch": 0.531737924996352, + "grad_norm": 1.0173171172414277, + "learning_rate": 2.6999501058678218e-05, + "loss": 0.5827, + "step": 18220 + }, + { + "epoch": 0.5318838464905881, + "grad_norm": 1.0428664299761135, + "learning_rate": 2.6988646098420018e-05, + "loss": 0.5446, + "step": 18225 + }, + { + "epoch": 0.5320297679848242, + "grad_norm": 0.9773760471360108, + "learning_rate": 2.6977791257240115e-05, + "loss": 0.5333, + "step": 18230 + }, + { + "epoch": 0.5321756894790602, + "grad_norm": 0.9527201148385231, + "learning_rate": 2.6966936537666282e-05, + "loss": 0.5868, + "step": 18235 + }, + { + "epoch": 0.5323216109732963, + "grad_norm": 0.921331536218337, + "learning_rate": 2.6956081942226226e-05, + "loss": 0.5974, + "step": 18240 + }, + { + "epoch": 0.5324675324675324, + "grad_norm": 1.2147578476322956, + "learning_rate": 2.6945227473447644e-05, + "loss": 0.5934, + "step": 18245 + }, + { + "epoch": 0.5326134539617686, + "grad_norm": 1.0689657581666354, + "learning_rate": 2.6934373133858216e-05, + "loss": 0.5346, + "step": 18250 + }, + { + "epoch": 0.5327593754560047, + "grad_norm": 1.0122966731967253, + "learning_rate": 2.6923518925985562e-05, + "loss": 0.6253, + "step": 18255 + }, + { + "epoch": 0.5329052969502408, + "grad_norm": 1.3474157785010537, + "learning_rate": 2.691266485235729e-05, + "loss": 0.5916, + "step": 18260 + }, + { + "epoch": 0.5330512184444769, + "grad_norm": 0.8534697236050557, + "learning_rate": 2.690181091550098e-05, + "loss": 0.5058, + "step": 18265 + }, + { + "epoch": 0.533197139938713, + "grad_norm": 0.9023092235402138, + "learning_rate": 2.689095711794417e-05, + "loss": 0.552, + "step": 18270 + }, + { + "epoch": 0.5333430614329491, + "grad_norm": 1.104183687348476, + "learning_rate": 2.6880103462214372e-05, + "loss": 0.6473, + "step": 18275 + }, + { + "epoch": 0.5334889829271852, + "grad_norm": 1.095553481408904, + "learning_rate": 2.6869249950839065e-05, + "loss": 0.5768, + "step": 18280 + }, + { + "epoch": 0.5336349044214213, + "grad_norm": 1.1393413773932635, + "learning_rate": 2.685839658634568e-05, + "loss": 0.55, + "step": 18285 + }, + { + "epoch": 0.5337808259156573, + "grad_norm": 0.9494067181565052, + "learning_rate": 2.684754337126163e-05, + "loss": 0.6175, + "step": 18290 + }, + { + "epoch": 0.5339267474098934, + "grad_norm": 1.1490515663327459, + "learning_rate": 2.6836690308114305e-05, + "loss": 0.5981, + "step": 18295 + }, + { + "epoch": 0.5340726689041296, + "grad_norm": 1.1841713961538063, + "learning_rate": 2.682583739943102e-05, + "loss": 0.6036, + "step": 18300 + }, + { + "epoch": 0.5342185903983657, + "grad_norm": 1.0900590191484214, + "learning_rate": 2.6814984647739084e-05, + "loss": 0.5771, + "step": 18305 + }, + { + "epoch": 0.5343645118926018, + "grad_norm": 0.9662420682836969, + "learning_rate": 2.680413205556578e-05, + "loss": 0.5793, + "step": 18310 + }, + { + "epoch": 0.5345104333868379, + "grad_norm": 0.972625122035825, + "learning_rate": 2.6793279625438312e-05, + "loss": 0.6047, + "step": 18315 + }, + { + "epoch": 0.534656354881074, + "grad_norm": 1.0018566165416312, + "learning_rate": 2.678242735988389e-05, + "loss": 0.6372, + "step": 18320 + }, + { + "epoch": 0.5348022763753101, + "grad_norm": 1.0622554693048338, + "learning_rate": 2.6771575261429653e-05, + "loss": 0.6144, + "step": 18325 + }, + { + "epoch": 0.5349481978695462, + "grad_norm": 1.012464937244321, + "learning_rate": 2.6760723332602722e-05, + "loss": 0.6442, + "step": 18330 + }, + { + "epoch": 0.5350941193637823, + "grad_norm": 0.9954906944075244, + "learning_rate": 2.674987157593017e-05, + "loss": 0.5788, + "step": 18335 + }, + { + "epoch": 0.5352400408580184, + "grad_norm": 0.8556905022391582, + "learning_rate": 2.6739019993939036e-05, + "loss": 0.5805, + "step": 18340 + }, + { + "epoch": 0.5353859623522544, + "grad_norm": 1.0922630885641271, + "learning_rate": 2.672816858915631e-05, + "loss": 0.5266, + "step": 18345 + }, + { + "epoch": 0.5355318838464906, + "grad_norm": 1.0387405695029193, + "learning_rate": 2.671731736410894e-05, + "loss": 0.5533, + "step": 18350 + }, + { + "epoch": 0.5356778053407267, + "grad_norm": 1.1310656840115603, + "learning_rate": 2.6706466321323835e-05, + "loss": 0.6014, + "step": 18355 + }, + { + "epoch": 0.5358237268349628, + "grad_norm": 0.9737755114287527, + "learning_rate": 2.6695615463327873e-05, + "loss": 0.5795, + "step": 18360 + }, + { + "epoch": 0.5359696483291989, + "grad_norm": 1.0501839104982529, + "learning_rate": 2.668476479264787e-05, + "loss": 0.5396, + "step": 18365 + }, + { + "epoch": 0.536115569823435, + "grad_norm": 1.027747273145144, + "learning_rate": 2.6673914311810623e-05, + "loss": 0.5666, + "step": 18370 + }, + { + "epoch": 0.5362614913176711, + "grad_norm": 1.1081186098697693, + "learning_rate": 2.6663064023342843e-05, + "loss": 0.5575, + "step": 18375 + }, + { + "epoch": 0.5364074128119072, + "grad_norm": 0.9431075609984424, + "learning_rate": 2.665221392977125e-05, + "loss": 0.5511, + "step": 18380 + }, + { + "epoch": 0.5365533343061433, + "grad_norm": 1.117275856317832, + "learning_rate": 2.664136403362246e-05, + "loss": 0.587, + "step": 18385 + }, + { + "epoch": 0.5366992558003794, + "grad_norm": 1.0224434254753776, + "learning_rate": 2.6630514337423113e-05, + "loss": 0.5647, + "step": 18390 + }, + { + "epoch": 0.5368451772946154, + "grad_norm": 1.1926220516333217, + "learning_rate": 2.661966484369972e-05, + "loss": 0.5802, + "step": 18395 + }, + { + "epoch": 0.5369910987888517, + "grad_norm": 1.1821218962088669, + "learning_rate": 2.6608815554978816e-05, + "loss": 0.5684, + "step": 18400 + }, + { + "epoch": 0.5371370202830877, + "grad_norm": 0.9535477946098685, + "learning_rate": 2.6597966473786855e-05, + "loss": 0.5426, + "step": 18405 + }, + { + "epoch": 0.5372829417773238, + "grad_norm": 1.0052778968371063, + "learning_rate": 2.658711760265026e-05, + "loss": 0.5204, + "step": 18410 + }, + { + "epoch": 0.5374288632715599, + "grad_norm": 0.9255092815843405, + "learning_rate": 2.657626894409537e-05, + "loss": 0.5866, + "step": 18415 + }, + { + "epoch": 0.537574784765796, + "grad_norm": 1.0578974978663298, + "learning_rate": 2.6565420500648513e-05, + "loss": 0.5784, + "step": 18420 + }, + { + "epoch": 0.5377207062600321, + "grad_norm": 1.0520200182587254, + "learning_rate": 2.6554572274835942e-05, + "loss": 0.6163, + "step": 18425 + }, + { + "epoch": 0.5378666277542682, + "grad_norm": 1.041134492589403, + "learning_rate": 2.6543724269183873e-05, + "loss": 0.6265, + "step": 18430 + }, + { + "epoch": 0.5380125492485043, + "grad_norm": 0.988964346619895, + "learning_rate": 2.6532876486218473e-05, + "loss": 0.5645, + "step": 18435 + }, + { + "epoch": 0.5381584707427404, + "grad_norm": 0.9453036825333526, + "learning_rate": 2.652202892846585e-05, + "loss": 0.5687, + "step": 18440 + }, + { + "epoch": 0.5383043922369765, + "grad_norm": 1.0533407036876061, + "learning_rate": 2.651118159845205e-05, + "loss": 0.6116, + "step": 18445 + }, + { + "epoch": 0.5384503137312127, + "grad_norm": 1.1041314700767115, + "learning_rate": 2.6500334498703083e-05, + "loss": 0.6, + "step": 18450 + }, + { + "epoch": 0.5385962352254487, + "grad_norm": 0.9228067326310603, + "learning_rate": 2.6489487631744896e-05, + "loss": 0.5538, + "step": 18455 + }, + { + "epoch": 0.5387421567196848, + "grad_norm": 1.175014935684281, + "learning_rate": 2.647864100010339e-05, + "loss": 0.5761, + "step": 18460 + }, + { + "epoch": 0.5388880782139209, + "grad_norm": 1.1950751675611238, + "learning_rate": 2.6467794606304413e-05, + "loss": 0.5528, + "step": 18465 + }, + { + "epoch": 0.539033999708157, + "grad_norm": 0.9191728133190892, + "learning_rate": 2.6456948452873726e-05, + "loss": 0.6361, + "step": 18470 + }, + { + "epoch": 0.5391799212023931, + "grad_norm": 1.130807323169007, + "learning_rate": 2.6446102542337076e-05, + "loss": 0.5488, + "step": 18475 + }, + { + "epoch": 0.5393258426966292, + "grad_norm": 1.025083859050189, + "learning_rate": 2.6435256877220128e-05, + "loss": 0.5727, + "step": 18480 + }, + { + "epoch": 0.5394717641908653, + "grad_norm": 1.0230226662784287, + "learning_rate": 2.6424411460048514e-05, + "loss": 0.5378, + "step": 18485 + }, + { + "epoch": 0.5396176856851014, + "grad_norm": 0.8624736322988603, + "learning_rate": 2.6413566293347775e-05, + "loss": 0.5018, + "step": 18490 + }, + { + "epoch": 0.5397636071793375, + "grad_norm": 1.1635472449443434, + "learning_rate": 2.640272137964342e-05, + "loss": 0.594, + "step": 18495 + }, + { + "epoch": 0.5399095286735737, + "grad_norm": 0.9694807524777028, + "learning_rate": 2.6391876721460873e-05, + "loss": 0.5659, + "step": 18500 + }, + { + "epoch": 0.5400554501678098, + "grad_norm": 1.0043352412930615, + "learning_rate": 2.6381032321325534e-05, + "loss": 0.5522, + "step": 18505 + }, + { + "epoch": 0.5402013716620458, + "grad_norm": 0.898720669135111, + "learning_rate": 2.6370188181762728e-05, + "loss": 0.5852, + "step": 18510 + }, + { + "epoch": 0.5403472931562819, + "grad_norm": 1.0805056715968975, + "learning_rate": 2.635934430529769e-05, + "loss": 0.5887, + "step": 18515 + }, + { + "epoch": 0.540493214650518, + "grad_norm": 0.9409565694363404, + "learning_rate": 2.6348500694455642e-05, + "loss": 0.5883, + "step": 18520 + }, + { + "epoch": 0.5406391361447541, + "grad_norm": 0.9626454078603776, + "learning_rate": 2.6337657351761713e-05, + "loss": 0.5996, + "step": 18525 + }, + { + "epoch": 0.5407850576389902, + "grad_norm": 1.156311791376103, + "learning_rate": 2.6326814279740974e-05, + "loss": 0.5654, + "step": 18530 + }, + { + "epoch": 0.5409309791332263, + "grad_norm": 1.0126293661836134, + "learning_rate": 2.631597148091845e-05, + "loss": 0.6135, + "step": 18535 + }, + { + "epoch": 0.5410769006274624, + "grad_norm": 0.9243149763631263, + "learning_rate": 2.6305128957819076e-05, + "loss": 0.5486, + "step": 18540 + }, + { + "epoch": 0.5412228221216985, + "grad_norm": 1.1708097882204893, + "learning_rate": 2.6294286712967735e-05, + "loss": 0.5783, + "step": 18545 + }, + { + "epoch": 0.5413687436159347, + "grad_norm": 1.1144951980913023, + "learning_rate": 2.6283444748889252e-05, + "loss": 0.5933, + "step": 18550 + }, + { + "epoch": 0.5415146651101708, + "grad_norm": 1.1215278280294547, + "learning_rate": 2.6272603068108386e-05, + "loss": 0.6662, + "step": 18555 + }, + { + "epoch": 0.5416605866044069, + "grad_norm": 0.9577801754610242, + "learning_rate": 2.626176167314982e-05, + "loss": 0.578, + "step": 18560 + }, + { + "epoch": 0.541806508098643, + "grad_norm": 1.0799355954306333, + "learning_rate": 2.6250920566538178e-05, + "loss": 0.6074, + "step": 18565 + }, + { + "epoch": 0.541952429592879, + "grad_norm": 1.1405245382424891, + "learning_rate": 2.624007975079801e-05, + "loss": 0.6529, + "step": 18570 + }, + { + "epoch": 0.5420983510871151, + "grad_norm": 1.0377902980799405, + "learning_rate": 2.6229239228453794e-05, + "loss": 0.5545, + "step": 18575 + }, + { + "epoch": 0.5422442725813512, + "grad_norm": 0.9353960676734543, + "learning_rate": 2.621839900202997e-05, + "loss": 0.5676, + "step": 18580 + }, + { + "epoch": 0.5423901940755873, + "grad_norm": 1.1176803279490075, + "learning_rate": 2.6207559074050863e-05, + "loss": 0.4961, + "step": 18585 + }, + { + "epoch": 0.5425361155698234, + "grad_norm": 0.9722832623021269, + "learning_rate": 2.619671944704077e-05, + "loss": 0.5487, + "step": 18590 + }, + { + "epoch": 0.5426820370640595, + "grad_norm": 0.9587465116780288, + "learning_rate": 2.6185880123523894e-05, + "loss": 0.5546, + "step": 18595 + }, + { + "epoch": 0.5428279585582957, + "grad_norm": 1.1696431270844794, + "learning_rate": 2.6175041106024366e-05, + "loss": 0.5058, + "step": 18600 + }, + { + "epoch": 0.5429738800525318, + "grad_norm": 1.0826394701815976, + "learning_rate": 2.6164202397066286e-05, + "loss": 0.5232, + "step": 18605 + }, + { + "epoch": 0.5431198015467679, + "grad_norm": 0.8796562439996058, + "learning_rate": 2.615336399917361e-05, + "loss": 0.5655, + "step": 18610 + }, + { + "epoch": 0.543265723041004, + "grad_norm": 0.9823904936670873, + "learning_rate": 2.614252591487027e-05, + "loss": 0.519, + "step": 18615 + }, + { + "epoch": 0.54341164453524, + "grad_norm": 1.028710295385897, + "learning_rate": 2.6131688146680133e-05, + "loss": 0.5346, + "step": 18620 + }, + { + "epoch": 0.5435575660294761, + "grad_norm": 1.0895924683222729, + "learning_rate": 2.612085069712697e-05, + "loss": 0.6019, + "step": 18625 + }, + { + "epoch": 0.5437034875237122, + "grad_norm": 1.0438265636717063, + "learning_rate": 2.6110013568734477e-05, + "loss": 0.5462, + "step": 18630 + }, + { + "epoch": 0.5438494090179483, + "grad_norm": 0.983221332578274, + "learning_rate": 2.609917676402629e-05, + "loss": 0.5941, + "step": 18635 + }, + { + "epoch": 0.5439953305121844, + "grad_norm": 1.1222904325094563, + "learning_rate": 2.6088340285525948e-05, + "loss": 0.6253, + "step": 18640 + }, + { + "epoch": 0.5441412520064205, + "grad_norm": 0.9621962174057327, + "learning_rate": 2.6077504135756935e-05, + "loss": 0.5769, + "step": 18645 + }, + { + "epoch": 0.5442871735006567, + "grad_norm": 1.1136769866907792, + "learning_rate": 2.6066668317242653e-05, + "loss": 0.5494, + "step": 18650 + }, + { + "epoch": 0.5444330949948928, + "grad_norm": 1.0281861679538178, + "learning_rate": 2.6055832832506434e-05, + "loss": 0.5728, + "step": 18655 + }, + { + "epoch": 0.5445790164891289, + "grad_norm": 1.2908076190791984, + "learning_rate": 2.6044997684071497e-05, + "loss": 0.618, + "step": 18660 + }, + { + "epoch": 0.544724937983365, + "grad_norm": 1.0261999511290776, + "learning_rate": 2.6034162874461038e-05, + "loss": 0.556, + "step": 18665 + }, + { + "epoch": 0.544870859477601, + "grad_norm": 1.0894673677022477, + "learning_rate": 2.602332840619812e-05, + "loss": 0.5053, + "step": 18670 + }, + { + "epoch": 0.5450167809718371, + "grad_norm": 1.0719473798968804, + "learning_rate": 2.6012494281805777e-05, + "loss": 0.6432, + "step": 18675 + }, + { + "epoch": 0.5451627024660732, + "grad_norm": 1.0206624581390917, + "learning_rate": 2.6001660503806906e-05, + "loss": 0.6032, + "step": 18680 + }, + { + "epoch": 0.5453086239603093, + "grad_norm": 1.5241969536797513, + "learning_rate": 2.599082707472438e-05, + "loss": 0.5755, + "step": 18685 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.0474371407233378, + "learning_rate": 2.5979993997080947e-05, + "loss": 0.6024, + "step": 18690 + }, + { + "epoch": 0.5456004669487815, + "grad_norm": 1.1193353735004417, + "learning_rate": 2.5969161273399306e-05, + "loss": 0.6066, + "step": 18695 + }, + { + "epoch": 0.5457463884430177, + "grad_norm": 1.2281871702574194, + "learning_rate": 2.5958328906202063e-05, + "loss": 0.6051, + "step": 18700 + }, + { + "epoch": 0.5458923099372538, + "grad_norm": 1.248497607609345, + "learning_rate": 2.594749689801172e-05, + "loss": 0.5628, + "step": 18705 + }, + { + "epoch": 0.5460382314314899, + "grad_norm": 1.0833910196660719, + "learning_rate": 2.5936665251350718e-05, + "loss": 0.6102, + "step": 18710 + }, + { + "epoch": 0.546184152925726, + "grad_norm": 1.094220701660935, + "learning_rate": 2.592583396874141e-05, + "loss": 0.5604, + "step": 18715 + }, + { + "epoch": 0.5463300744199621, + "grad_norm": 0.969939531675538, + "learning_rate": 2.591500305270606e-05, + "loss": 0.5243, + "step": 18720 + }, + { + "epoch": 0.5464759959141982, + "grad_norm": 1.1182319389173998, + "learning_rate": 2.5904172505766864e-05, + "loss": 0.5713, + "step": 18725 + }, + { + "epoch": 0.5466219174084342, + "grad_norm": 1.5037264282608949, + "learning_rate": 2.5893342330445892e-05, + "loss": 0.5946, + "step": 18730 + }, + { + "epoch": 0.5467678389026703, + "grad_norm": 1.1648680136260472, + "learning_rate": 2.5882512529265167e-05, + "loss": 0.5923, + "step": 18735 + }, + { + "epoch": 0.5469137603969064, + "grad_norm": 1.1128611647869884, + "learning_rate": 2.587168310474662e-05, + "loss": 0.5591, + "step": 18740 + }, + { + "epoch": 0.5470596818911425, + "grad_norm": 0.8953168106044155, + "learning_rate": 2.586085405941207e-05, + "loss": 0.5439, + "step": 18745 + }, + { + "epoch": 0.5472056033853787, + "grad_norm": 1.142966131321429, + "learning_rate": 2.5850025395783263e-05, + "loss": 0.5786, + "step": 18750 + }, + { + "epoch": 0.5473515248796148, + "grad_norm": 0.9578302790419457, + "learning_rate": 2.583919711638186e-05, + "loss": 0.564, + "step": 18755 + }, + { + "epoch": 0.5474974463738509, + "grad_norm": 1.1409318308312035, + "learning_rate": 2.582836922372942e-05, + "loss": 0.6189, + "step": 18760 + }, + { + "epoch": 0.547643367868087, + "grad_norm": 1.0133993165330566, + "learning_rate": 2.5817541720347434e-05, + "loss": 0.5323, + "step": 18765 + }, + { + "epoch": 0.5477892893623231, + "grad_norm": 0.9694775426494109, + "learning_rate": 2.5806714608757283e-05, + "loss": 0.5373, + "step": 18770 + }, + { + "epoch": 0.5479352108565592, + "grad_norm": 1.0412753794502596, + "learning_rate": 2.579588789148026e-05, + "loss": 0.5765, + "step": 18775 + }, + { + "epoch": 0.5480811323507953, + "grad_norm": 1.0722631643779192, + "learning_rate": 2.5785061571037566e-05, + "loss": 0.5325, + "step": 18780 + }, + { + "epoch": 0.5482270538450313, + "grad_norm": 0.9694552921580994, + "learning_rate": 2.5774235649950312e-05, + "loss": 0.6036, + "step": 18785 + }, + { + "epoch": 0.5483729753392674, + "grad_norm": 0.9047047292913784, + "learning_rate": 2.5763410130739517e-05, + "loss": 0.5399, + "step": 18790 + }, + { + "epoch": 0.5485188968335035, + "grad_norm": 1.0527515663682632, + "learning_rate": 2.5752585015926124e-05, + "loss": 0.5638, + "step": 18795 + }, + { + "epoch": 0.5486648183277397, + "grad_norm": 0.8847067385970466, + "learning_rate": 2.5741760308030933e-05, + "loss": 0.5668, + "step": 18800 + }, + { + "epoch": 0.5488107398219758, + "grad_norm": 1.0433680099661289, + "learning_rate": 2.5730936009574684e-05, + "loss": 0.6146, + "step": 18805 + }, + { + "epoch": 0.5489566613162119, + "grad_norm": 1.0960315526480513, + "learning_rate": 2.572011212307804e-05, + "loss": 0.6032, + "step": 18810 + }, + { + "epoch": 0.549102582810448, + "grad_norm": 1.008049271422144, + "learning_rate": 2.5709288651061526e-05, + "loss": 0.5668, + "step": 18815 + }, + { + "epoch": 0.5492485043046841, + "grad_norm": 1.0303661244700986, + "learning_rate": 2.56984655960456e-05, + "loss": 0.6205, + "step": 18820 + }, + { + "epoch": 0.5493944257989202, + "grad_norm": 1.0216156943528545, + "learning_rate": 2.56876429605506e-05, + "loss": 0.5744, + "step": 18825 + }, + { + "epoch": 0.5495403472931563, + "grad_norm": 0.9533687720611044, + "learning_rate": 2.5676820747096787e-05, + "loss": 0.5475, + "step": 18830 + }, + { + "epoch": 0.5496862687873924, + "grad_norm": 1.1001135006103824, + "learning_rate": 2.5665998958204313e-05, + "loss": 0.5929, + "step": 18835 + }, + { + "epoch": 0.5498321902816284, + "grad_norm": 1.0152341366158735, + "learning_rate": 2.5655177596393244e-05, + "loss": 0.5469, + "step": 18840 + }, + { + "epoch": 0.5499781117758645, + "grad_norm": 1.1219268949891719, + "learning_rate": 2.5644356664183512e-05, + "loss": 0.5614, + "step": 18845 + }, + { + "epoch": 0.5501240332701007, + "grad_norm": 0.8820918915949129, + "learning_rate": 2.5633536164095e-05, + "loss": 0.6017, + "step": 18850 + }, + { + "epoch": 0.5502699547643368, + "grad_norm": 1.0735395332406277, + "learning_rate": 2.562271609864744e-05, + "loss": 0.6183, + "step": 18855 + }, + { + "epoch": 0.5504158762585729, + "grad_norm": 1.195082020869037, + "learning_rate": 2.561189647036051e-05, + "loss": 0.569, + "step": 18860 + }, + { + "epoch": 0.550561797752809, + "grad_norm": 0.961023415528801, + "learning_rate": 2.5601077281753755e-05, + "loss": 0.5552, + "step": 18865 + }, + { + "epoch": 0.5507077192470451, + "grad_norm": 1.2236131441096125, + "learning_rate": 2.5590258535346605e-05, + "loss": 0.618, + "step": 18870 + }, + { + "epoch": 0.5508536407412812, + "grad_norm": 1.1281272706105105, + "learning_rate": 2.557944023365843e-05, + "loss": 0.6186, + "step": 18875 + }, + { + "epoch": 0.5509995622355173, + "grad_norm": 0.9494685945411812, + "learning_rate": 2.556862237920847e-05, + "loss": 0.541, + "step": 18880 + }, + { + "epoch": 0.5511454837297534, + "grad_norm": 1.1039442359528024, + "learning_rate": 2.5557804974515853e-05, + "loss": 0.6234, + "step": 18885 + }, + { + "epoch": 0.5512914052239895, + "grad_norm": 1.2397737831522957, + "learning_rate": 2.5546988022099633e-05, + "loss": 0.5842, + "step": 18890 + }, + { + "epoch": 0.5514373267182255, + "grad_norm": 0.9222605895253784, + "learning_rate": 2.553617152447872e-05, + "loss": 0.532, + "step": 18895 + }, + { + "epoch": 0.5515832482124617, + "grad_norm": 1.1054125419781198, + "learning_rate": 2.5525355484171942e-05, + "loss": 0.5844, + "step": 18900 + }, + { + "epoch": 0.5517291697066978, + "grad_norm": 1.0920177162832378, + "learning_rate": 2.551453990369802e-05, + "loss": 0.5977, + "step": 18905 + }, + { + "epoch": 0.5518750912009339, + "grad_norm": 1.0443406539143116, + "learning_rate": 2.5503724785575567e-05, + "loss": 0.5844, + "step": 18910 + }, + { + "epoch": 0.55202101269517, + "grad_norm": 1.1242148004092776, + "learning_rate": 2.549291013232309e-05, + "loss": 0.5725, + "step": 18915 + }, + { + "epoch": 0.5521669341894061, + "grad_norm": 0.978401193546021, + "learning_rate": 2.5482095946458966e-05, + "loss": 0.5488, + "step": 18920 + }, + { + "epoch": 0.5523128556836422, + "grad_norm": 1.0847012634123125, + "learning_rate": 2.5471282230501492e-05, + "loss": 0.5354, + "step": 18925 + }, + { + "epoch": 0.5524587771778783, + "grad_norm": 1.0358762658460097, + "learning_rate": 2.546046898696884e-05, + "loss": 0.5698, + "step": 18930 + }, + { + "epoch": 0.5526046986721144, + "grad_norm": 1.0891401532471086, + "learning_rate": 2.5449656218379074e-05, + "loss": 0.5633, + "step": 18935 + }, + { + "epoch": 0.5527506201663505, + "grad_norm": 1.0510193442296694, + "learning_rate": 2.5438843927250154e-05, + "loss": 0.6099, + "step": 18940 + }, + { + "epoch": 0.5528965416605865, + "grad_norm": 1.2183005943802543, + "learning_rate": 2.5428032116099915e-05, + "loss": 0.6016, + "step": 18945 + }, + { + "epoch": 0.5530424631548227, + "grad_norm": 1.013470118830834, + "learning_rate": 2.54172207874461e-05, + "loss": 0.5752, + "step": 18950 + }, + { + "epoch": 0.5531883846490588, + "grad_norm": 1.0779410603820585, + "learning_rate": 2.5406409943806332e-05, + "loss": 0.5764, + "step": 18955 + }, + { + "epoch": 0.5533343061432949, + "grad_norm": 1.0547807165614647, + "learning_rate": 2.5395599587698105e-05, + "loss": 0.5817, + "step": 18960 + }, + { + "epoch": 0.553480227637531, + "grad_norm": 1.008305177677653, + "learning_rate": 2.5384789721638814e-05, + "loss": 0.6211, + "step": 18965 + }, + { + "epoch": 0.5536261491317671, + "grad_norm": 1.099845859383687, + "learning_rate": 2.5373980348145744e-05, + "loss": 0.5909, + "step": 18970 + }, + { + "epoch": 0.5537720706260032, + "grad_norm": 1.006296163758954, + "learning_rate": 2.536317146973605e-05, + "loss": 0.5261, + "step": 18975 + }, + { + "epoch": 0.5539179921202393, + "grad_norm": 1.0087618764886408, + "learning_rate": 2.5352363088926794e-05, + "loss": 0.5847, + "step": 18980 + }, + { + "epoch": 0.5540639136144754, + "grad_norm": 1.0175269245985794, + "learning_rate": 2.5341555208234913e-05, + "loss": 0.6103, + "step": 18985 + }, + { + "epoch": 0.5542098351087115, + "grad_norm": 1.0676517263307896, + "learning_rate": 2.5330747830177198e-05, + "loss": 0.6017, + "step": 18990 + }, + { + "epoch": 0.5543557566029477, + "grad_norm": 0.927595789718737, + "learning_rate": 2.5319940957270376e-05, + "loss": 0.5658, + "step": 18995 + }, + { + "epoch": 0.5545016780971838, + "grad_norm": 1.0517677562821353, + "learning_rate": 2.530913459203102e-05, + "loss": 0.5708, + "step": 19000 + }, + { + "epoch": 0.5546475995914198, + "grad_norm": 0.9924221815992588, + "learning_rate": 2.5298328736975586e-05, + "loss": 0.5162, + "step": 19005 + }, + { + "epoch": 0.5547935210856559, + "grad_norm": 1.133808558330379, + "learning_rate": 2.5287523394620434e-05, + "loss": 0.6535, + "step": 19010 + }, + { + "epoch": 0.554939442579892, + "grad_norm": 1.2295521240001699, + "learning_rate": 2.5276718567481772e-05, + "loss": 0.5985, + "step": 19015 + }, + { + "epoch": 0.5550853640741281, + "grad_norm": 1.0829990655439399, + "learning_rate": 2.526591425807572e-05, + "loss": 0.5858, + "step": 19020 + }, + { + "epoch": 0.5552312855683642, + "grad_norm": 0.975581808513619, + "learning_rate": 2.525511046891826e-05, + "loss": 0.5663, + "step": 19025 + }, + { + "epoch": 0.5553772070626003, + "grad_norm": 0.9977770894440878, + "learning_rate": 2.5244307202525253e-05, + "loss": 0.5307, + "step": 19030 + }, + { + "epoch": 0.5555231285568364, + "grad_norm": 1.0209717023105354, + "learning_rate": 2.5233504461412442e-05, + "loss": 0.5669, + "step": 19035 + }, + { + "epoch": 0.5556690500510725, + "grad_norm": 1.1791880003927673, + "learning_rate": 2.5222702248095453e-05, + "loss": 0.588, + "step": 19040 + }, + { + "epoch": 0.5558149715453087, + "grad_norm": 1.1051184583095714, + "learning_rate": 2.521190056508977e-05, + "loss": 0.6078, + "step": 19045 + }, + { + "epoch": 0.5559608930395448, + "grad_norm": 0.9539411945786938, + "learning_rate": 2.5201099414910777e-05, + "loss": 0.5132, + "step": 19050 + }, + { + "epoch": 0.5561068145337809, + "grad_norm": 0.9693700415932675, + "learning_rate": 2.519029880007373e-05, + "loss": 0.5634, + "step": 19055 + }, + { + "epoch": 0.556252736028017, + "grad_norm": 1.1086048631443517, + "learning_rate": 2.5179498723093738e-05, + "loss": 0.5585, + "step": 19060 + }, + { + "epoch": 0.556398657522253, + "grad_norm": 1.0455769386020437, + "learning_rate": 2.5168699186485812e-05, + "loss": 0.5514, + "step": 19065 + }, + { + "epoch": 0.5565445790164891, + "grad_norm": 0.9256208382486878, + "learning_rate": 2.515790019276481e-05, + "loss": 0.5389, + "step": 19070 + }, + { + "epoch": 0.5566905005107252, + "grad_norm": 1.1502391874146716, + "learning_rate": 2.5147101744445502e-05, + "loss": 0.5288, + "step": 19075 + }, + { + "epoch": 0.5568364220049613, + "grad_norm": 1.0385892613077392, + "learning_rate": 2.51363038440425e-05, + "loss": 0.5402, + "step": 19080 + }, + { + "epoch": 0.5569823434991974, + "grad_norm": 1.0652073224815324, + "learning_rate": 2.5125506494070288e-05, + "loss": 0.5658, + "step": 19085 + }, + { + "epoch": 0.5571282649934335, + "grad_norm": 1.115274048936048, + "learning_rate": 2.5114709697043233e-05, + "loss": 0.5928, + "step": 19090 + }, + { + "epoch": 0.5572741864876697, + "grad_norm": 1.0188397659573443, + "learning_rate": 2.510391345547558e-05, + "loss": 0.5633, + "step": 19095 + }, + { + "epoch": 0.5574201079819058, + "grad_norm": 1.0801249548445246, + "learning_rate": 2.5093117771881435e-05, + "loss": 0.61, + "step": 19100 + }, + { + "epoch": 0.5575660294761419, + "grad_norm": 1.0460639149770012, + "learning_rate": 2.508232264877477e-05, + "loss": 0.5917, + "step": 19105 + }, + { + "epoch": 0.557711950970378, + "grad_norm": 1.076834998465646, + "learning_rate": 2.507152808866942e-05, + "loss": 0.5454, + "step": 19110 + }, + { + "epoch": 0.557857872464614, + "grad_norm": 0.9164329626296981, + "learning_rate": 2.5060734094079114e-05, + "loss": 0.6192, + "step": 19115 + }, + { + "epoch": 0.5580037939588501, + "grad_norm": 1.205042786583854, + "learning_rate": 2.504994066751743e-05, + "loss": 0.6095, + "step": 19120 + }, + { + "epoch": 0.5581497154530862, + "grad_norm": 1.010719854884888, + "learning_rate": 2.5039147811497832e-05, + "loss": 0.5475, + "step": 19125 + }, + { + "epoch": 0.5582956369473223, + "grad_norm": 1.1540967229947297, + "learning_rate": 2.502835552853362e-05, + "loss": 0.5794, + "step": 19130 + }, + { + "epoch": 0.5584415584415584, + "grad_norm": 1.0461724261051417, + "learning_rate": 2.5017563821137986e-05, + "loss": 0.507, + "step": 19135 + }, + { + "epoch": 0.5585874799357945, + "grad_norm": 1.0544502833352922, + "learning_rate": 2.5006772691823978e-05, + "loss": 0.5392, + "step": 19140 + }, + { + "epoch": 0.5587334014300307, + "grad_norm": 0.9651564140635615, + "learning_rate": 2.4995982143104512e-05, + "loss": 0.5443, + "step": 19145 + }, + { + "epoch": 0.5588793229242668, + "grad_norm": 1.092416606591082, + "learning_rate": 2.4985192177492384e-05, + "loss": 0.6354, + "step": 19150 + }, + { + "epoch": 0.5590252444185029, + "grad_norm": 1.1443818809993265, + "learning_rate": 2.4974402797500214e-05, + "loss": 0.6377, + "step": 19155 + }, + { + "epoch": 0.559171165912739, + "grad_norm": 1.1295841116010163, + "learning_rate": 2.4963614005640522e-05, + "loss": 0.5545, + "step": 19160 + }, + { + "epoch": 0.559317087406975, + "grad_norm": 1.0868486360816592, + "learning_rate": 2.4952825804425677e-05, + "loss": 0.6242, + "step": 19165 + }, + { + "epoch": 0.5594630089012111, + "grad_norm": 1.0443842412131272, + "learning_rate": 2.494203819636793e-05, + "loss": 0.5395, + "step": 19170 + }, + { + "epoch": 0.5596089303954472, + "grad_norm": 1.2821998649607063, + "learning_rate": 2.4931251183979363e-05, + "loss": 0.593, + "step": 19175 + }, + { + "epoch": 0.5597548518896833, + "grad_norm": 0.9135653226581955, + "learning_rate": 2.4920464769771922e-05, + "loss": 0.5753, + "step": 19180 + }, + { + "epoch": 0.5599007733839194, + "grad_norm": 1.0307893919342195, + "learning_rate": 2.4909678956257442e-05, + "loss": 0.6159, + "step": 19185 + }, + { + "epoch": 0.5600466948781555, + "grad_norm": 0.8833656854411767, + "learning_rate": 2.48988937459476e-05, + "loss": 0.5398, + "step": 19190 + }, + { + "epoch": 0.5601926163723917, + "grad_norm": 1.0503850689797165, + "learning_rate": 2.4888109141353933e-05, + "loss": 0.6072, + "step": 19195 + }, + { + "epoch": 0.5603385378666278, + "grad_norm": 1.1281983794219044, + "learning_rate": 2.4877325144987824e-05, + "loss": 0.5664, + "step": 19200 + }, + { + "epoch": 0.5604844593608639, + "grad_norm": 1.0848995924893812, + "learning_rate": 2.486654175936054e-05, + "loss": 0.5723, + "step": 19205 + }, + { + "epoch": 0.5606303808551, + "grad_norm": 1.0095072610759486, + "learning_rate": 2.4855758986983192e-05, + "loss": 0.5991, + "step": 19210 + }, + { + "epoch": 0.5607763023493361, + "grad_norm": 1.0925017368118464, + "learning_rate": 2.4844976830366752e-05, + "loss": 0.554, + "step": 19215 + }, + { + "epoch": 0.5609222238435722, + "grad_norm": 1.0161537070177942, + "learning_rate": 2.483419529202205e-05, + "loss": 0.4725, + "step": 19220 + }, + { + "epoch": 0.5610681453378082, + "grad_norm": 1.1843883117584293, + "learning_rate": 2.482341437445976e-05, + "loss": 0.5695, + "step": 19225 + }, + { + "epoch": 0.5612140668320443, + "grad_norm": 0.9304976985737102, + "learning_rate": 2.481263408019041e-05, + "loss": 0.5934, + "step": 19230 + }, + { + "epoch": 0.5613599883262804, + "grad_norm": 1.0408515652964907, + "learning_rate": 2.480185441172441e-05, + "loss": 0.601, + "step": 19235 + }, + { + "epoch": 0.5615059098205165, + "grad_norm": 1.1711645382896347, + "learning_rate": 2.4791075371572014e-05, + "loss": 0.6006, + "step": 19240 + }, + { + "epoch": 0.5616518313147527, + "grad_norm": 1.0104416067115156, + "learning_rate": 2.478029696224331e-05, + "loss": 0.5449, + "step": 19245 + }, + { + "epoch": 0.5617977528089888, + "grad_norm": 0.9905701187655629, + "learning_rate": 2.4769519186248253e-05, + "loss": 0.5688, + "step": 19250 + }, + { + "epoch": 0.5619436743032249, + "grad_norm": 0.9506585186110902, + "learning_rate": 2.4758742046096643e-05, + "loss": 0.5745, + "step": 19255 + }, + { + "epoch": 0.562089595797461, + "grad_norm": 1.0627845624652243, + "learning_rate": 2.4747965544298153e-05, + "loss": 0.5762, + "step": 19260 + }, + { + "epoch": 0.5622355172916971, + "grad_norm": 1.056693222541942, + "learning_rate": 2.4737189683362276e-05, + "loss": 0.581, + "step": 19265 + }, + { + "epoch": 0.5623814387859332, + "grad_norm": 1.2266561420640012, + "learning_rate": 2.47264144657984e-05, + "loss": 0.5532, + "step": 19270 + }, + { + "epoch": 0.5625273602801693, + "grad_norm": 1.1216603509846654, + "learning_rate": 2.4715639894115698e-05, + "loss": 0.6624, + "step": 19275 + }, + { + "epoch": 0.5626732817744053, + "grad_norm": 1.1223071449803248, + "learning_rate": 2.470486597082326e-05, + "loss": 0.5501, + "step": 19280 + }, + { + "epoch": 0.5628192032686414, + "grad_norm": 1.0848189018979422, + "learning_rate": 2.469409269842998e-05, + "loss": 0.5863, + "step": 19285 + }, + { + "epoch": 0.5629651247628775, + "grad_norm": 1.0779140525538955, + "learning_rate": 2.4683320079444632e-05, + "loss": 0.535, + "step": 19290 + }, + { + "epoch": 0.5631110462571137, + "grad_norm": 1.1538109088750106, + "learning_rate": 2.4672548116375797e-05, + "loss": 0.6798, + "step": 19295 + }, + { + "epoch": 0.5632569677513498, + "grad_norm": 1.1577009613692197, + "learning_rate": 2.466177681173194e-05, + "loss": 0.5687, + "step": 19300 + }, + { + "epoch": 0.5634028892455859, + "grad_norm": 1.0637343804154196, + "learning_rate": 2.4651006168021367e-05, + "loss": 0.541, + "step": 19305 + }, + { + "epoch": 0.563548810739822, + "grad_norm": 0.9878108564946632, + "learning_rate": 2.464023618775222e-05, + "loss": 0.5327, + "step": 19310 + }, + { + "epoch": 0.5636947322340581, + "grad_norm": 0.9287462175330049, + "learning_rate": 2.4629466873432483e-05, + "loss": 0.5314, + "step": 19315 + }, + { + "epoch": 0.5638406537282942, + "grad_norm": 0.8803254546998466, + "learning_rate": 2.4618698227570004e-05, + "loss": 0.5309, + "step": 19320 + }, + { + "epoch": 0.5639865752225303, + "grad_norm": 1.2028701200319487, + "learning_rate": 2.4607930252672446e-05, + "loss": 0.6191, + "step": 19325 + }, + { + "epoch": 0.5641324967167664, + "grad_norm": 0.8740160504273201, + "learning_rate": 2.459716295124735e-05, + "loss": 0.5593, + "step": 19330 + }, + { + "epoch": 0.5642784182110024, + "grad_norm": 1.0320604321621292, + "learning_rate": 2.458639632580207e-05, + "loss": 0.6246, + "step": 19335 + }, + { + "epoch": 0.5644243397052385, + "grad_norm": 1.0819877387168169, + "learning_rate": 2.4575630378843835e-05, + "loss": 0.5668, + "step": 19340 + }, + { + "epoch": 0.5645702611994747, + "grad_norm": 1.0981914701607622, + "learning_rate": 2.456486511287967e-05, + "loss": 0.5457, + "step": 19345 + }, + { + "epoch": 0.5647161826937108, + "grad_norm": 1.2711526163268574, + "learning_rate": 2.455410053041649e-05, + "loss": 0.6249, + "step": 19350 + }, + { + "epoch": 0.5648621041879469, + "grad_norm": 1.1266035804487176, + "learning_rate": 2.454333663396102e-05, + "loss": 0.5747, + "step": 19355 + }, + { + "epoch": 0.565008025682183, + "grad_norm": 1.0122364388672083, + "learning_rate": 2.4532573426019834e-05, + "loss": 0.5753, + "step": 19360 + }, + { + "epoch": 0.5651539471764191, + "grad_norm": 1.0185787207627244, + "learning_rate": 2.4521810909099358e-05, + "loss": 0.5999, + "step": 19365 + }, + { + "epoch": 0.5652998686706552, + "grad_norm": 1.1676105532008045, + "learning_rate": 2.4511049085705823e-05, + "loss": 0.541, + "step": 19370 + }, + { + "epoch": 0.5654457901648913, + "grad_norm": 0.9087974882154571, + "learning_rate": 2.4500287958345336e-05, + "loss": 0.5494, + "step": 19375 + }, + { + "epoch": 0.5655917116591274, + "grad_norm": 1.0401637410944768, + "learning_rate": 2.448952752952382e-05, + "loss": 0.6122, + "step": 19380 + }, + { + "epoch": 0.5657376331533635, + "grad_norm": 1.0380105477386217, + "learning_rate": 2.4478767801747056e-05, + "loss": 0.5644, + "step": 19385 + }, + { + "epoch": 0.5658835546475995, + "grad_norm": 1.0055441254996957, + "learning_rate": 2.4468008777520628e-05, + "loss": 0.5419, + "step": 19390 + }, + { + "epoch": 0.5660294761418357, + "grad_norm": 1.0857132155666829, + "learning_rate": 2.4457250459349983e-05, + "loss": 0.6004, + "step": 19395 + }, + { + "epoch": 0.5661753976360718, + "grad_norm": 1.087468370845473, + "learning_rate": 2.44464928497404e-05, + "loss": 0.5543, + "step": 19400 + }, + { + "epoch": 0.5663213191303079, + "grad_norm": 0.962954653391166, + "learning_rate": 2.443573595119698e-05, + "loss": 0.5419, + "step": 19405 + }, + { + "epoch": 0.566467240624544, + "grad_norm": 1.0725977718251574, + "learning_rate": 2.4424979766224686e-05, + "loss": 0.5627, + "step": 19410 + }, + { + "epoch": 0.5666131621187801, + "grad_norm": 1.0711329682147823, + "learning_rate": 2.4414224297328276e-05, + "loss": 0.6043, + "step": 19415 + }, + { + "epoch": 0.5667590836130162, + "grad_norm": 0.9184229927337886, + "learning_rate": 2.4403469547012376e-05, + "loss": 0.5424, + "step": 19420 + }, + { + "epoch": 0.5669050051072523, + "grad_norm": 0.9900840238780486, + "learning_rate": 2.4392715517781424e-05, + "loss": 0.5713, + "step": 19425 + }, + { + "epoch": 0.5670509266014884, + "grad_norm": 1.0326232022299162, + "learning_rate": 2.438196221213969e-05, + "loss": 0.5671, + "step": 19430 + }, + { + "epoch": 0.5671968480957245, + "grad_norm": 1.1297156208535026, + "learning_rate": 2.4371209632591306e-05, + "loss": 0.5583, + "step": 19435 + }, + { + "epoch": 0.5673427695899605, + "grad_norm": 1.2666884043574758, + "learning_rate": 2.4360457781640185e-05, + "loss": 0.6108, + "step": 19440 + }, + { + "epoch": 0.5674886910841968, + "grad_norm": 1.0482144735176806, + "learning_rate": 2.4349706661790106e-05, + "loss": 0.5555, + "step": 19445 + }, + { + "epoch": 0.5676346125784328, + "grad_norm": 1.0076283466884899, + "learning_rate": 2.4338956275544668e-05, + "loss": 0.5839, + "step": 19450 + }, + { + "epoch": 0.5677805340726689, + "grad_norm": 1.1059734727385848, + "learning_rate": 2.43282066254073e-05, + "loss": 0.5409, + "step": 19455 + }, + { + "epoch": 0.567926455566905, + "grad_norm": 1.03239931612895, + "learning_rate": 2.4317457713881263e-05, + "loss": 0.56, + "step": 19460 + }, + { + "epoch": 0.5680723770611411, + "grad_norm": 0.9518194947040801, + "learning_rate": 2.4306709543469634e-05, + "loss": 0.5484, + "step": 19465 + }, + { + "epoch": 0.5682182985553772, + "grad_norm": 1.0487805789207552, + "learning_rate": 2.4295962116675324e-05, + "loss": 0.6289, + "step": 19470 + }, + { + "epoch": 0.5683642200496133, + "grad_norm": 0.9889208969704194, + "learning_rate": 2.4285215436001075e-05, + "loss": 0.6138, + "step": 19475 + }, + { + "epoch": 0.5685101415438494, + "grad_norm": 1.103561269508087, + "learning_rate": 2.427446950394947e-05, + "loss": 0.5709, + "step": 19480 + }, + { + "epoch": 0.5686560630380855, + "grad_norm": 1.0537865038572112, + "learning_rate": 2.4263724323022867e-05, + "loss": 0.5095, + "step": 19485 + }, + { + "epoch": 0.5688019845323216, + "grad_norm": 1.0454335734105995, + "learning_rate": 2.4252979895723504e-05, + "loss": 0.5655, + "step": 19490 + }, + { + "epoch": 0.5689479060265578, + "grad_norm": 1.2187331331251503, + "learning_rate": 2.4242236224553416e-05, + "loss": 0.607, + "step": 19495 + }, + { + "epoch": 0.5690938275207938, + "grad_norm": 0.967919693370299, + "learning_rate": 2.423149331201447e-05, + "loss": 0.5801, + "step": 19500 + }, + { + "epoch": 0.5692397490150299, + "grad_norm": 0.9603491495727466, + "learning_rate": 2.4220751160608353e-05, + "loss": 0.562, + "step": 19505 + }, + { + "epoch": 0.569385670509266, + "grad_norm": 1.1393856886520102, + "learning_rate": 2.4210009772836566e-05, + "loss": 0.5568, + "step": 19510 + }, + { + "epoch": 0.5695315920035021, + "grad_norm": 0.9905720047753638, + "learning_rate": 2.4199269151200456e-05, + "loss": 0.5626, + "step": 19515 + }, + { + "epoch": 0.5696775134977382, + "grad_norm": 1.0127381621549063, + "learning_rate": 2.4188529298201168e-05, + "loss": 0.6223, + "step": 19520 + }, + { + "epoch": 0.5698234349919743, + "grad_norm": 1.043503433621324, + "learning_rate": 2.417779021633968e-05, + "loss": 0.5544, + "step": 19525 + }, + { + "epoch": 0.5699693564862104, + "grad_norm": 1.022900977933336, + "learning_rate": 2.4167051908116793e-05, + "loss": 0.5418, + "step": 19530 + }, + { + "epoch": 0.5701152779804465, + "grad_norm": 1.0064623542984361, + "learning_rate": 2.4156314376033117e-05, + "loss": 0.5038, + "step": 19535 + }, + { + "epoch": 0.5702611994746826, + "grad_norm": 1.1630485095625613, + "learning_rate": 2.414557762258909e-05, + "loss": 0.4865, + "step": 19540 + }, + { + "epoch": 0.5704071209689188, + "grad_norm": 1.0112976148485264, + "learning_rate": 2.4134841650284958e-05, + "loss": 0.5525, + "step": 19545 + }, + { + "epoch": 0.5705530424631549, + "grad_norm": 0.9943071438940063, + "learning_rate": 2.4124106461620804e-05, + "loss": 0.5762, + "step": 19550 + }, + { + "epoch": 0.570698963957391, + "grad_norm": 1.0972661288424221, + "learning_rate": 2.4113372059096505e-05, + "loss": 0.6109, + "step": 19555 + }, + { + "epoch": 0.570844885451627, + "grad_norm": 1.021607176701133, + "learning_rate": 2.410263844521177e-05, + "loss": 0.6179, + "step": 19560 + }, + { + "epoch": 0.5709908069458631, + "grad_norm": 0.9976981375922676, + "learning_rate": 2.4091905622466138e-05, + "loss": 0.5794, + "step": 19565 + }, + { + "epoch": 0.5711367284400992, + "grad_norm": 1.017355359592056, + "learning_rate": 2.4081173593358924e-05, + "loss": 0.6123, + "step": 19570 + }, + { + "epoch": 0.5712826499343353, + "grad_norm": 1.0947729252844716, + "learning_rate": 2.40704423603893e-05, + "loss": 0.598, + "step": 19575 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.1122979500011798, + "learning_rate": 2.405971192605622e-05, + "loss": 0.6051, + "step": 19580 + }, + { + "epoch": 0.5715744929228075, + "grad_norm": 1.0626110276751881, + "learning_rate": 2.4048982292858467e-05, + "loss": 0.549, + "step": 19585 + }, + { + "epoch": 0.5717204144170436, + "grad_norm": 0.9888621336869378, + "learning_rate": 2.4038253463294647e-05, + "loss": 0.5634, + "step": 19590 + }, + { + "epoch": 0.5718663359112798, + "grad_norm": 0.9603879643285596, + "learning_rate": 2.4027525439863157e-05, + "loss": 0.5787, + "step": 19595 + }, + { + "epoch": 0.5720122574055159, + "grad_norm": 1.0439401637810652, + "learning_rate": 2.4016798225062236e-05, + "loss": 0.5695, + "step": 19600 + }, + { + "epoch": 0.572158178899752, + "grad_norm": 1.0563232939961402, + "learning_rate": 2.4006071821389897e-05, + "loss": 0.6025, + "step": 19605 + }, + { + "epoch": 0.572304100393988, + "grad_norm": 1.1897741745411052, + "learning_rate": 2.3995346231343986e-05, + "loss": 0.5898, + "step": 19610 + }, + { + "epoch": 0.5724500218882241, + "grad_norm": 0.9907830940476184, + "learning_rate": 2.3984621457422164e-05, + "loss": 0.5818, + "step": 19615 + }, + { + "epoch": 0.5725959433824602, + "grad_norm": 0.8979939849343949, + "learning_rate": 2.3973897502121883e-05, + "loss": 0.5415, + "step": 19620 + }, + { + "epoch": 0.5727418648766963, + "grad_norm": 0.9506663375049602, + "learning_rate": 2.3963174367940443e-05, + "loss": 0.5594, + "step": 19625 + }, + { + "epoch": 0.5728877863709324, + "grad_norm": 1.053628239029449, + "learning_rate": 2.395245205737489e-05, + "loss": 0.5412, + "step": 19630 + }, + { + "epoch": 0.5730337078651685, + "grad_norm": 1.0324232359859886, + "learning_rate": 2.394173057292214e-05, + "loss": 0.6263, + "step": 19635 + }, + { + "epoch": 0.5731796293594046, + "grad_norm": 1.060672260120206, + "learning_rate": 2.3931009917078885e-05, + "loss": 0.5843, + "step": 19640 + }, + { + "epoch": 0.5733255508536408, + "grad_norm": 1.051380927302944, + "learning_rate": 2.3920290092341636e-05, + "loss": 0.5944, + "step": 19645 + }, + { + "epoch": 0.5734714723478769, + "grad_norm": 1.0332582720349894, + "learning_rate": 2.3909571101206684e-05, + "loss": 0.5514, + "step": 19650 + }, + { + "epoch": 0.573617393842113, + "grad_norm": 1.1165065892940602, + "learning_rate": 2.389885294617016e-05, + "loss": 0.5557, + "step": 19655 + }, + { + "epoch": 0.5737633153363491, + "grad_norm": 1.1243443192739717, + "learning_rate": 2.388813562972798e-05, + "loss": 0.5874, + "step": 19660 + }, + { + "epoch": 0.5739092368305851, + "grad_norm": 0.9690077446995694, + "learning_rate": 2.3877419154375886e-05, + "loss": 0.5524, + "step": 19665 + }, + { + "epoch": 0.5740551583248212, + "grad_norm": 1.101170878516414, + "learning_rate": 2.3866703522609406e-05, + "loss": 0.5503, + "step": 19670 + }, + { + "epoch": 0.5742010798190573, + "grad_norm": 1.0808283099924705, + "learning_rate": 2.3855988736923855e-05, + "loss": 0.5282, + "step": 19675 + }, + { + "epoch": 0.5743470013132934, + "grad_norm": 1.2252590800713856, + "learning_rate": 2.3845274799814393e-05, + "loss": 0.6092, + "step": 19680 + }, + { + "epoch": 0.5744929228075295, + "grad_norm": 1.033283015047248, + "learning_rate": 2.383456171377595e-05, + "loss": 0.5685, + "step": 19685 + }, + { + "epoch": 0.5746388443017657, + "grad_norm": 1.0292211122904824, + "learning_rate": 2.3823849481303268e-05, + "loss": 0.5483, + "step": 19690 + }, + { + "epoch": 0.5747847657960018, + "grad_norm": 0.9795220588868664, + "learning_rate": 2.381313810489091e-05, + "loss": 0.5186, + "step": 19695 + }, + { + "epoch": 0.5749306872902379, + "grad_norm": 1.009543404724265, + "learning_rate": 2.3802427587033184e-05, + "loss": 0.5586, + "step": 19700 + }, + { + "epoch": 0.575076608784474, + "grad_norm": 0.9526254455321203, + "learning_rate": 2.3791717930224255e-05, + "loss": 0.5565, + "step": 19705 + }, + { + "epoch": 0.5752225302787101, + "grad_norm": 1.05814446754326, + "learning_rate": 2.378100913695807e-05, + "loss": 0.5786, + "step": 19710 + }, + { + "epoch": 0.5753684517729462, + "grad_norm": 1.1072549291998413, + "learning_rate": 2.3770301209728364e-05, + "loss": 0.6009, + "step": 19715 + }, + { + "epoch": 0.5755143732671822, + "grad_norm": 1.1069411821699975, + "learning_rate": 2.3759594151028687e-05, + "loss": 0.5993, + "step": 19720 + }, + { + "epoch": 0.5756602947614183, + "grad_norm": 0.9687950745002407, + "learning_rate": 2.3748887963352363e-05, + "loss": 0.6346, + "step": 19725 + }, + { + "epoch": 0.5758062162556544, + "grad_norm": 0.9565434370262882, + "learning_rate": 2.3738182649192536e-05, + "loss": 0.5384, + "step": 19730 + }, + { + "epoch": 0.5759521377498905, + "grad_norm": 1.2963520126701809, + "learning_rate": 2.3727478211042137e-05, + "loss": 0.631, + "step": 19735 + }, + { + "epoch": 0.5760980592441267, + "grad_norm": 1.0277698762556078, + "learning_rate": 2.3716774651393902e-05, + "loss": 0.518, + "step": 19740 + }, + { + "epoch": 0.5762439807383628, + "grad_norm": 1.037264089457356, + "learning_rate": 2.3706071972740345e-05, + "loss": 0.5777, + "step": 19745 + }, + { + "epoch": 0.5763899022325989, + "grad_norm": 1.0686817206165269, + "learning_rate": 2.3695370177573784e-05, + "loss": 0.5847, + "step": 19750 + }, + { + "epoch": 0.576535823726835, + "grad_norm": 1.0219112677969322, + "learning_rate": 2.368466926838634e-05, + "loss": 0.5744, + "step": 19755 + }, + { + "epoch": 0.5766817452210711, + "grad_norm": 0.9198189234949002, + "learning_rate": 2.367396924766991e-05, + "loss": 0.5977, + "step": 19760 + }, + { + "epoch": 0.5768276667153072, + "grad_norm": 1.1688812470931758, + "learning_rate": 2.3663270117916208e-05, + "loss": 0.551, + "step": 19765 + }, + { + "epoch": 0.5769735882095433, + "grad_norm": 1.223149741639775, + "learning_rate": 2.3652571881616708e-05, + "loss": 0.6064, + "step": 19770 + }, + { + "epoch": 0.5771195097037793, + "grad_norm": 1.093432559624975, + "learning_rate": 2.3641874541262703e-05, + "loss": 0.5427, + "step": 19775 + }, + { + "epoch": 0.5772654311980154, + "grad_norm": 0.9943137345686238, + "learning_rate": 2.3631178099345274e-05, + "loss": 0.5146, + "step": 19780 + }, + { + "epoch": 0.5774113526922515, + "grad_norm": 0.9860914534655764, + "learning_rate": 2.362048255835528e-05, + "loss": 0.5971, + "step": 19785 + }, + { + "epoch": 0.5775572741864877, + "grad_norm": 1.137046556542198, + "learning_rate": 2.3609787920783384e-05, + "loss": 0.5623, + "step": 19790 + }, + { + "epoch": 0.5777031956807238, + "grad_norm": 1.1018774285870212, + "learning_rate": 2.359909418912002e-05, + "loss": 0.6113, + "step": 19795 + }, + { + "epoch": 0.5778491171749599, + "grad_norm": 1.3686580234016736, + "learning_rate": 2.358840136585543e-05, + "loss": 0.6183, + "step": 19800 + }, + { + "epoch": 0.577995038669196, + "grad_norm": 0.9878869735558028, + "learning_rate": 2.3577709453479635e-05, + "loss": 0.5758, + "step": 19805 + }, + { + "epoch": 0.5781409601634321, + "grad_norm": 1.0057911338574395, + "learning_rate": 2.356701845448246e-05, + "loss": 0.6007, + "step": 19810 + }, + { + "epoch": 0.5782868816576682, + "grad_norm": 1.034418996895888, + "learning_rate": 2.3556328371353488e-05, + "loss": 0.5579, + "step": 19815 + }, + { + "epoch": 0.5784328031519043, + "grad_norm": 1.0825349240768498, + "learning_rate": 2.3545639206582114e-05, + "loss": 0.5986, + "step": 19820 + }, + { + "epoch": 0.5785787246461404, + "grad_norm": 1.0761705246061184, + "learning_rate": 2.35349509626575e-05, + "loss": 0.5843, + "step": 19825 + }, + { + "epoch": 0.5787246461403764, + "grad_norm": 1.0338489498115588, + "learning_rate": 2.3524263642068606e-05, + "loss": 0.6431, + "step": 19830 + }, + { + "epoch": 0.5788705676346125, + "grad_norm": 1.1026292915996687, + "learning_rate": 2.3513577247304193e-05, + "loss": 0.5962, + "step": 19835 + }, + { + "epoch": 0.5790164891288487, + "grad_norm": 1.0591814340101953, + "learning_rate": 2.350289178085276e-05, + "loss": 0.5521, + "step": 19840 + }, + { + "epoch": 0.5791624106230848, + "grad_norm": 0.9524738320325364, + "learning_rate": 2.3492207245202626e-05, + "loss": 0.6025, + "step": 19845 + }, + { + "epoch": 0.5793083321173209, + "grad_norm": 0.9982975450306473, + "learning_rate": 2.3481523642841884e-05, + "loss": 0.5502, + "step": 19850 + }, + { + "epoch": 0.579454253611557, + "grad_norm": 1.2876912246017667, + "learning_rate": 2.3470840976258423e-05, + "loss": 0.6401, + "step": 19855 + }, + { + "epoch": 0.5796001751057931, + "grad_norm": 1.0360027701575842, + "learning_rate": 2.34601592479399e-05, + "loss": 0.5767, + "step": 19860 + }, + { + "epoch": 0.5797460966000292, + "grad_norm": 1.1903877611486364, + "learning_rate": 2.3449478460373736e-05, + "loss": 0.5755, + "step": 19865 + }, + { + "epoch": 0.5798920180942653, + "grad_norm": 1.05899201418877, + "learning_rate": 2.3438798616047165e-05, + "loss": 0.5655, + "step": 19870 + }, + { + "epoch": 0.5800379395885014, + "grad_norm": 0.9707993762008369, + "learning_rate": 2.3428119717447185e-05, + "loss": 0.5791, + "step": 19875 + }, + { + "epoch": 0.5801838610827375, + "grad_norm": 0.9943940114821047, + "learning_rate": 2.3417441767060578e-05, + "loss": 0.5499, + "step": 19880 + }, + { + "epoch": 0.5803297825769735, + "grad_norm": 1.0998746453248927, + "learning_rate": 2.3406764767373912e-05, + "loss": 0.5326, + "step": 19885 + }, + { + "epoch": 0.5804757040712097, + "grad_norm": 1.0192555530747083, + "learning_rate": 2.3396088720873517e-05, + "loss": 0.5286, + "step": 19890 + }, + { + "epoch": 0.5806216255654458, + "grad_norm": 1.0135247138597054, + "learning_rate": 2.3385413630045512e-05, + "loss": 0.609, + "step": 19895 + }, + { + "epoch": 0.5807675470596819, + "grad_norm": 0.9375723531144762, + "learning_rate": 2.337473949737578e-05, + "loss": 0.5623, + "step": 19900 + }, + { + "epoch": 0.580913468553918, + "grad_norm": 1.013587725649466, + "learning_rate": 2.336406632535002e-05, + "loss": 0.5897, + "step": 19905 + }, + { + "epoch": 0.5810593900481541, + "grad_norm": 1.0259578466540173, + "learning_rate": 2.3353394116453646e-05, + "loss": 0.5979, + "step": 19910 + }, + { + "epoch": 0.5812053115423902, + "grad_norm": 0.9098455463702352, + "learning_rate": 2.3342722873171902e-05, + "loss": 0.5405, + "step": 19915 + }, + { + "epoch": 0.5813512330366263, + "grad_norm": 0.9438563565428367, + "learning_rate": 2.333205259798978e-05, + "loss": 0.6148, + "step": 19920 + }, + { + "epoch": 0.5814971545308624, + "grad_norm": 0.9894844982728428, + "learning_rate": 2.3321383293392064e-05, + "loss": 0.5473, + "step": 19925 + }, + { + "epoch": 0.5816430760250985, + "grad_norm": 0.8859886805773258, + "learning_rate": 2.3310714961863285e-05, + "loss": 0.5764, + "step": 19930 + }, + { + "epoch": 0.5817889975193345, + "grad_norm": 1.4192123280097704, + "learning_rate": 2.330004760588777e-05, + "loss": 0.6193, + "step": 19935 + }, + { + "epoch": 0.5819349190135708, + "grad_norm": 1.113111075087676, + "learning_rate": 2.3289381227949613e-05, + "loss": 0.53, + "step": 19940 + }, + { + "epoch": 0.5820808405078068, + "grad_norm": 1.0784595956427026, + "learning_rate": 2.327871583053267e-05, + "loss": 0.5413, + "step": 19945 + }, + { + "epoch": 0.5822267620020429, + "grad_norm": 0.9414684261308156, + "learning_rate": 2.326805141612059e-05, + "loss": 0.568, + "step": 19950 + }, + { + "epoch": 0.582372683496279, + "grad_norm": 1.030533834064575, + "learning_rate": 2.3257387987196788e-05, + "loss": 0.5074, + "step": 19955 + }, + { + "epoch": 0.5825186049905151, + "grad_norm": 1.2681820979247564, + "learning_rate": 2.324672554624442e-05, + "loss": 0.615, + "step": 19960 + }, + { + "epoch": 0.5826645264847512, + "grad_norm": 1.3022550494463503, + "learning_rate": 2.323606409574645e-05, + "loss": 0.6014, + "step": 19965 + }, + { + "epoch": 0.5828104479789873, + "grad_norm": 1.1224614254956247, + "learning_rate": 2.3225403638185588e-05, + "loss": 0.617, + "step": 19970 + }, + { + "epoch": 0.5829563694732234, + "grad_norm": 1.0395465766918444, + "learning_rate": 2.321474417604433e-05, + "loss": 0.5199, + "step": 19975 + }, + { + "epoch": 0.5831022909674595, + "grad_norm": 0.9210448832036293, + "learning_rate": 2.320408571180493e-05, + "loss": 0.5587, + "step": 19980 + }, + { + "epoch": 0.5832482124616956, + "grad_norm": 1.147796740447498, + "learning_rate": 2.3193428247949394e-05, + "loss": 0.6102, + "step": 19985 + }, + { + "epoch": 0.5833941339559318, + "grad_norm": 0.9760878615299288, + "learning_rate": 2.318277178695953e-05, + "loss": 0.5722, + "step": 19990 + }, + { + "epoch": 0.5835400554501678, + "grad_norm": 1.0158807382115598, + "learning_rate": 2.3172116331316888e-05, + "loss": 0.5549, + "step": 19995 + }, + { + "epoch": 0.5836859769444039, + "grad_norm": 1.3370643707030505, + "learning_rate": 2.316146188350279e-05, + "loss": 0.5545, + "step": 20000 + }, + { + "epoch": 0.58383189843864, + "grad_norm": 1.0601650883439209, + "learning_rate": 2.3150808445998324e-05, + "loss": 0.6062, + "step": 20005 + }, + { + "epoch": 0.5839778199328761, + "grad_norm": 1.0832085047352267, + "learning_rate": 2.314015602128433e-05, + "loss": 0.55, + "step": 20010 + }, + { + "epoch": 0.5841237414271122, + "grad_norm": 1.0067164385115555, + "learning_rate": 2.312950461184144e-05, + "loss": 0.6001, + "step": 20015 + }, + { + "epoch": 0.5842696629213483, + "grad_norm": 0.9762789716483766, + "learning_rate": 2.3118854220150027e-05, + "loss": 0.5337, + "step": 20020 + }, + { + "epoch": 0.5844155844155844, + "grad_norm": 1.0868463009744185, + "learning_rate": 2.3108204848690246e-05, + "loss": 0.5766, + "step": 20025 + }, + { + "epoch": 0.5845615059098205, + "grad_norm": 1.0240216753493245, + "learning_rate": 2.3097556499941973e-05, + "loss": 0.6115, + "step": 20030 + }, + { + "epoch": 0.5847074274040566, + "grad_norm": 1.0366519873291238, + "learning_rate": 2.3086909176384906e-05, + "loss": 0.5603, + "step": 20035 + }, + { + "epoch": 0.5848533488982928, + "grad_norm": 1.1458312058321865, + "learning_rate": 2.3076262880498457e-05, + "loss": 0.5592, + "step": 20040 + }, + { + "epoch": 0.5849992703925289, + "grad_norm": 1.0041386035133433, + "learning_rate": 2.3065617614761813e-05, + "loss": 0.5861, + "step": 20045 + }, + { + "epoch": 0.585145191886765, + "grad_norm": 1.0279531474405932, + "learning_rate": 2.305497338165394e-05, + "loss": 0.5467, + "step": 20050 + }, + { + "epoch": 0.585291113381001, + "grad_norm": 1.0219938409677654, + "learning_rate": 2.304433018365352e-05, + "loss": 0.5204, + "step": 20055 + }, + { + "epoch": 0.5854370348752371, + "grad_norm": 1.0255116832824502, + "learning_rate": 2.303368802323903e-05, + "loss": 0.56, + "step": 20060 + }, + { + "epoch": 0.5855829563694732, + "grad_norm": 1.1657682065927872, + "learning_rate": 2.3023046902888708e-05, + "loss": 0.5401, + "step": 20065 + }, + { + "epoch": 0.5857288778637093, + "grad_norm": 0.9714736081473958, + "learning_rate": 2.3012406825080533e-05, + "loss": 0.572, + "step": 20070 + }, + { + "epoch": 0.5858747993579454, + "grad_norm": 1.0460751900486902, + "learning_rate": 2.300176779229225e-05, + "loss": 0.5198, + "step": 20075 + }, + { + "epoch": 0.5860207208521815, + "grad_norm": 1.0196186155194558, + "learning_rate": 2.2991129807001337e-05, + "loss": 0.5398, + "step": 20080 + }, + { + "epoch": 0.5861666423464176, + "grad_norm": 0.9835985548288224, + "learning_rate": 2.298049287168506e-05, + "loss": 0.5454, + "step": 20085 + }, + { + "epoch": 0.5863125638406538, + "grad_norm": 1.1305678948194302, + "learning_rate": 2.2969856988820426e-05, + "loss": 0.5504, + "step": 20090 + }, + { + "epoch": 0.5864584853348899, + "grad_norm": 1.031482607950574, + "learning_rate": 2.2959222160884214e-05, + "loss": 0.5782, + "step": 20095 + }, + { + "epoch": 0.586604406829126, + "grad_norm": 1.0639462238326853, + "learning_rate": 2.2948588390352914e-05, + "loss": 0.5499, + "step": 20100 + }, + { + "epoch": 0.586750328323362, + "grad_norm": 0.9649379735991427, + "learning_rate": 2.293795567970282e-05, + "loss": 0.5314, + "step": 20105 + }, + { + "epoch": 0.5868962498175981, + "grad_norm": 1.0786050965693676, + "learning_rate": 2.2927324031409952e-05, + "loss": 0.6091, + "step": 20110 + }, + { + "epoch": 0.5870421713118342, + "grad_norm": 1.2126269526539113, + "learning_rate": 2.2916693447950082e-05, + "loss": 0.6135, + "step": 20115 + }, + { + "epoch": 0.5871880928060703, + "grad_norm": 0.991849799542179, + "learning_rate": 2.290606393179875e-05, + "loss": 0.489, + "step": 20120 + }, + { + "epoch": 0.5873340143003064, + "grad_norm": 1.1246426269741396, + "learning_rate": 2.2895435485431228e-05, + "loss": 0.5282, + "step": 20125 + }, + { + "epoch": 0.5874799357945425, + "grad_norm": 1.1984968582242352, + "learning_rate": 2.288480811132255e-05, + "loss": 0.59, + "step": 20130 + }, + { + "epoch": 0.5876258572887786, + "grad_norm": 1.0430466042705444, + "learning_rate": 2.2874181811947497e-05, + "loss": 0.5648, + "step": 20135 + }, + { + "epoch": 0.5877717787830148, + "grad_norm": 1.0762150889898576, + "learning_rate": 2.2863556589780617e-05, + "loss": 0.617, + "step": 20140 + }, + { + "epoch": 0.5879177002772509, + "grad_norm": 1.175687932031314, + "learning_rate": 2.285293244729617e-05, + "loss": 0.6026, + "step": 20145 + }, + { + "epoch": 0.588063621771487, + "grad_norm": 1.2318501650849663, + "learning_rate": 2.2842309386968196e-05, + "loss": 0.5844, + "step": 20150 + }, + { + "epoch": 0.5882095432657231, + "grad_norm": 1.0552322794995843, + "learning_rate": 2.2831687411270476e-05, + "loss": 0.5873, + "step": 20155 + }, + { + "epoch": 0.5883554647599591, + "grad_norm": 0.865114369258492, + "learning_rate": 2.2821066522676528e-05, + "loss": 0.5376, + "step": 20160 + }, + { + "epoch": 0.5885013862541952, + "grad_norm": 1.2060802660238765, + "learning_rate": 2.2810446723659624e-05, + "loss": 0.534, + "step": 20165 + }, + { + "epoch": 0.5886473077484313, + "grad_norm": 0.9868617120119897, + "learning_rate": 2.279982801669279e-05, + "loss": 0.6097, + "step": 20170 + }, + { + "epoch": 0.5887932292426674, + "grad_norm": 0.9743787891241852, + "learning_rate": 2.278921040424879e-05, + "loss": 0.5535, + "step": 20175 + }, + { + "epoch": 0.5889391507369035, + "grad_norm": 0.939793853923095, + "learning_rate": 2.2778593888800125e-05, + "loss": 0.5277, + "step": 20180 + }, + { + "epoch": 0.5890850722311396, + "grad_norm": 0.927018543227344, + "learning_rate": 2.2767978472819056e-05, + "loss": 0.5168, + "step": 20185 + }, + { + "epoch": 0.5892309937253758, + "grad_norm": 0.9539244643890282, + "learning_rate": 2.2757364158777585e-05, + "loss": 0.5383, + "step": 20190 + }, + { + "epoch": 0.5893769152196119, + "grad_norm": 1.1828529197982565, + "learning_rate": 2.2746750949147438e-05, + "loss": 0.5599, + "step": 20195 + }, + { + "epoch": 0.589522836713848, + "grad_norm": 1.1124491374510874, + "learning_rate": 2.2736138846400107e-05, + "loss": 0.597, + "step": 20200 + }, + { + "epoch": 0.5896687582080841, + "grad_norm": 1.1347113234226767, + "learning_rate": 2.272552785300682e-05, + "loss": 0.5404, + "step": 20205 + }, + { + "epoch": 0.5898146797023202, + "grad_norm": 1.119478247378532, + "learning_rate": 2.2714917971438548e-05, + "loss": 0.5699, + "step": 20210 + }, + { + "epoch": 0.5899606011965562, + "grad_norm": 1.1012338535080322, + "learning_rate": 2.2704309204165996e-05, + "loss": 0.5579, + "step": 20215 + }, + { + "epoch": 0.5901065226907923, + "grad_norm": 0.9085055893323857, + "learning_rate": 2.269370155365962e-05, + "loss": 0.6082, + "step": 20220 + }, + { + "epoch": 0.5902524441850284, + "grad_norm": 1.312426097371305, + "learning_rate": 2.2683095022389594e-05, + "loss": 0.571, + "step": 20225 + }, + { + "epoch": 0.5903983656792645, + "grad_norm": 0.9496304002480994, + "learning_rate": 2.267248961282586e-05, + "loss": 0.4821, + "step": 20230 + }, + { + "epoch": 0.5905442871735006, + "grad_norm": 1.1349963182095353, + "learning_rate": 2.2661885327438075e-05, + "loss": 0.5381, + "step": 20235 + }, + { + "epoch": 0.5906902086677368, + "grad_norm": 0.8917739088768509, + "learning_rate": 2.265128216869567e-05, + "loss": 0.5422, + "step": 20240 + }, + { + "epoch": 0.5908361301619729, + "grad_norm": 1.2144918697567813, + "learning_rate": 2.2640680139067753e-05, + "loss": 0.553, + "step": 20245 + }, + { + "epoch": 0.590982051656209, + "grad_norm": 0.9986413615814256, + "learning_rate": 2.2630079241023233e-05, + "loss": 0.6059, + "step": 20250 + }, + { + "epoch": 0.5911279731504451, + "grad_norm": 0.9865061021953596, + "learning_rate": 2.261947947703071e-05, + "loss": 0.6178, + "step": 20255 + }, + { + "epoch": 0.5912738946446812, + "grad_norm": 0.8939777495483102, + "learning_rate": 2.260888084955854e-05, + "loss": 0.5693, + "step": 20260 + }, + { + "epoch": 0.5914198161389173, + "grad_norm": 1.005944084878975, + "learning_rate": 2.2598283361074828e-05, + "loss": 0.5119, + "step": 20265 + }, + { + "epoch": 0.5915657376331533, + "grad_norm": 0.9195304580861503, + "learning_rate": 2.2587687014047376e-05, + "loss": 0.5683, + "step": 20270 + }, + { + "epoch": 0.5917116591273894, + "grad_norm": 1.3277022005795922, + "learning_rate": 2.2577091810943746e-05, + "loss": 0.6543, + "step": 20275 + }, + { + "epoch": 0.5918575806216255, + "grad_norm": 1.0663342464068277, + "learning_rate": 2.256649775423123e-05, + "loss": 0.609, + "step": 20280 + }, + { + "epoch": 0.5920035021158616, + "grad_norm": 1.3978342850656502, + "learning_rate": 2.255590484637687e-05, + "loss": 0.5292, + "step": 20285 + }, + { + "epoch": 0.5921494236100978, + "grad_norm": 1.0049496463223158, + "learning_rate": 2.254531308984739e-05, + "loss": 0.5449, + "step": 20290 + }, + { + "epoch": 0.5922953451043339, + "grad_norm": 0.9847580686873297, + "learning_rate": 2.25347224871093e-05, + "loss": 0.5393, + "step": 20295 + }, + { + "epoch": 0.59244126659857, + "grad_norm": 1.0169967790279575, + "learning_rate": 2.2524133040628813e-05, + "loss": 0.544, + "step": 20300 + }, + { + "epoch": 0.5925871880928061, + "grad_norm": 1.1658272993065637, + "learning_rate": 2.251354475287188e-05, + "loss": 0.5779, + "step": 20305 + }, + { + "epoch": 0.5927331095870422, + "grad_norm": 1.056454703688431, + "learning_rate": 2.2502957626304195e-05, + "loss": 0.541, + "step": 20310 + }, + { + "epoch": 0.5928790310812783, + "grad_norm": 1.0823722829493652, + "learning_rate": 2.249237166339114e-05, + "loss": 0.5743, + "step": 20315 + }, + { + "epoch": 0.5930249525755144, + "grad_norm": 1.0635155505116682, + "learning_rate": 2.2481786866597877e-05, + "loss": 0.5646, + "step": 20320 + }, + { + "epoch": 0.5931708740697504, + "grad_norm": 1.0562418940884053, + "learning_rate": 2.247120323838926e-05, + "loss": 0.5693, + "step": 20325 + }, + { + "epoch": 0.5933167955639865, + "grad_norm": 0.938750548293275, + "learning_rate": 2.2460620781229896e-05, + "loss": 0.5882, + "step": 20330 + }, + { + "epoch": 0.5934627170582226, + "grad_norm": 0.9674781560476177, + "learning_rate": 2.2450039497584108e-05, + "loss": 0.5786, + "step": 20335 + }, + { + "epoch": 0.5936086385524588, + "grad_norm": 1.0382562079800113, + "learning_rate": 2.2439459389915936e-05, + "loss": 0.5517, + "step": 20340 + }, + { + "epoch": 0.5937545600466949, + "grad_norm": 1.0405564976104924, + "learning_rate": 2.242888046068915e-05, + "loss": 0.5889, + "step": 20345 + }, + { + "epoch": 0.593900481540931, + "grad_norm": 1.0170114741046987, + "learning_rate": 2.2418302712367266e-05, + "loss": 0.5458, + "step": 20350 + }, + { + "epoch": 0.5940464030351671, + "grad_norm": 1.162253256162768, + "learning_rate": 2.240772614741352e-05, + "loss": 0.5464, + "step": 20355 + }, + { + "epoch": 0.5941923245294032, + "grad_norm": 1.1075240930150376, + "learning_rate": 2.239715076829083e-05, + "loss": 0.5729, + "step": 20360 + }, + { + "epoch": 0.5943382460236393, + "grad_norm": 1.1112280697434358, + "learning_rate": 2.2386576577461897e-05, + "loss": 0.523, + "step": 20365 + }, + { + "epoch": 0.5944841675178754, + "grad_norm": 1.0792448966301387, + "learning_rate": 2.2376003577389104e-05, + "loss": 0.6105, + "step": 20370 + }, + { + "epoch": 0.5946300890121115, + "grad_norm": 0.9822900784735648, + "learning_rate": 2.2365431770534585e-05, + "loss": 0.558, + "step": 20375 + }, + { + "epoch": 0.5947760105063475, + "grad_norm": 1.4666583289560435, + "learning_rate": 2.235486115936018e-05, + "loss": 0.6297, + "step": 20380 + }, + { + "epoch": 0.5949219320005836, + "grad_norm": 1.0954487303190743, + "learning_rate": 2.234429174632744e-05, + "loss": 0.5386, + "step": 20385 + }, + { + "epoch": 0.5950678534948198, + "grad_norm": 1.0998707050841217, + "learning_rate": 2.2333723533897656e-05, + "loss": 0.5946, + "step": 20390 + }, + { + "epoch": 0.5952137749890559, + "grad_norm": 1.0577715665455427, + "learning_rate": 2.2323156524531847e-05, + "loss": 0.4987, + "step": 20395 + }, + { + "epoch": 0.595359696483292, + "grad_norm": 1.1000263172707412, + "learning_rate": 2.231259072069072e-05, + "loss": 0.5672, + "step": 20400 + }, + { + "epoch": 0.5955056179775281, + "grad_norm": 1.0931702251825106, + "learning_rate": 2.2302026124834735e-05, + "loss": 0.5784, + "step": 20405 + }, + { + "epoch": 0.5956515394717642, + "grad_norm": 1.1593187168785455, + "learning_rate": 2.229146273942404e-05, + "loss": 0.5309, + "step": 20410 + }, + { + "epoch": 0.5957974609660003, + "grad_norm": 1.024642083032205, + "learning_rate": 2.2280900566918527e-05, + "loss": 0.5243, + "step": 20415 + }, + { + "epoch": 0.5959433824602364, + "grad_norm": 0.9364630036692924, + "learning_rate": 2.2270339609777795e-05, + "loss": 0.597, + "step": 20420 + }, + { + "epoch": 0.5960893039544725, + "grad_norm": 1.0631097835374657, + "learning_rate": 2.2259779870461163e-05, + "loss": 0.5595, + "step": 20425 + }, + { + "epoch": 0.5962352254487086, + "grad_norm": 1.0818605687517882, + "learning_rate": 2.2249221351427652e-05, + "loss": 0.5166, + "step": 20430 + }, + { + "epoch": 0.5963811469429448, + "grad_norm": 1.1059413427306886, + "learning_rate": 2.223866405513602e-05, + "loss": 0.5328, + "step": 20435 + }, + { + "epoch": 0.5965270684371808, + "grad_norm": 1.0505419561483995, + "learning_rate": 2.222810798404473e-05, + "loss": 0.5071, + "step": 20440 + }, + { + "epoch": 0.5966729899314169, + "grad_norm": 0.9612051757730211, + "learning_rate": 2.2217553140611952e-05, + "loss": 0.5846, + "step": 20445 + }, + { + "epoch": 0.596818911425653, + "grad_norm": 1.0945574456110765, + "learning_rate": 2.220699952729559e-05, + "loss": 0.5803, + "step": 20450 + }, + { + "epoch": 0.5969648329198891, + "grad_norm": 1.0513870560234595, + "learning_rate": 2.2196447146553246e-05, + "loss": 0.6112, + "step": 20455 + }, + { + "epoch": 0.5971107544141252, + "grad_norm": 0.9943958297025546, + "learning_rate": 2.2185896000842228e-05, + "loss": 0.5442, + "step": 20460 + }, + { + "epoch": 0.5972566759083613, + "grad_norm": 1.1133996031294575, + "learning_rate": 2.2175346092619587e-05, + "loss": 0.4808, + "step": 20465 + }, + { + "epoch": 0.5974025974025974, + "grad_norm": 1.0980346172546491, + "learning_rate": 2.2164797424342043e-05, + "loss": 0.5182, + "step": 20470 + }, + { + "epoch": 0.5975485188968335, + "grad_norm": 1.1067198836407617, + "learning_rate": 2.215424999846608e-05, + "loss": 0.556, + "step": 20475 + }, + { + "epoch": 0.5976944403910696, + "grad_norm": 1.0759340430497362, + "learning_rate": 2.2143703817447836e-05, + "loss": 0.538, + "step": 20480 + }, + { + "epoch": 0.5978403618853058, + "grad_norm": 1.0707005495055217, + "learning_rate": 2.2133158883743194e-05, + "loss": 0.5509, + "step": 20485 + }, + { + "epoch": 0.5979862833795418, + "grad_norm": 0.9814784070439988, + "learning_rate": 2.212261519980775e-05, + "loss": 0.5682, + "step": 20490 + }, + { + "epoch": 0.5981322048737779, + "grad_norm": 1.0201940864950712, + "learning_rate": 2.2112072768096786e-05, + "loss": 0.5815, + "step": 20495 + }, + { + "epoch": 0.598278126368014, + "grad_norm": 1.0251839583017146, + "learning_rate": 2.2101531591065306e-05, + "loss": 0.5744, + "step": 20500 + }, + { + "epoch": 0.5984240478622501, + "grad_norm": 1.0458626970003326, + "learning_rate": 2.2090991671168027e-05, + "loss": 0.5177, + "step": 20505 + }, + { + "epoch": 0.5985699693564862, + "grad_norm": 0.9348861798189539, + "learning_rate": 2.2080453010859353e-05, + "loss": 0.5896, + "step": 20510 + }, + { + "epoch": 0.5987158908507223, + "grad_norm": 1.050269455195219, + "learning_rate": 2.2069915612593412e-05, + "loss": 0.5885, + "step": 20515 + }, + { + "epoch": 0.5988618123449584, + "grad_norm": 1.0804755063263796, + "learning_rate": 2.205937947882404e-05, + "loss": 0.6179, + "step": 20520 + }, + { + "epoch": 0.5990077338391945, + "grad_norm": 1.223588097639347, + "learning_rate": 2.2048844612004776e-05, + "loss": 0.5401, + "step": 20525 + }, + { + "epoch": 0.5991536553334306, + "grad_norm": 0.9293614599273766, + "learning_rate": 2.2038311014588854e-05, + "loss": 0.551, + "step": 20530 + }, + { + "epoch": 0.5992995768276668, + "grad_norm": 1.0805094735804892, + "learning_rate": 2.202777868902921e-05, + "loss": 0.5545, + "step": 20535 + }, + { + "epoch": 0.5994454983219029, + "grad_norm": 0.9488425597564626, + "learning_rate": 2.201724763777851e-05, + "loss": 0.5391, + "step": 20540 + }, + { + "epoch": 0.599591419816139, + "grad_norm": 0.9865826681656805, + "learning_rate": 2.2006717863289106e-05, + "loss": 0.592, + "step": 20545 + }, + { + "epoch": 0.599737341310375, + "grad_norm": 1.023242505353522, + "learning_rate": 2.1996189368013025e-05, + "loss": 0.5733, + "step": 20550 + }, + { + "epoch": 0.5998832628046111, + "grad_norm": 0.9959313552116021, + "learning_rate": 2.1985662154402053e-05, + "loss": 0.5934, + "step": 20555 + }, + { + "epoch": 0.6000291842988472, + "grad_norm": 1.1775251651305807, + "learning_rate": 2.1975136224907638e-05, + "loss": 0.5558, + "step": 20560 + }, + { + "epoch": 0.6001751057930833, + "grad_norm": 1.0374697182886492, + "learning_rate": 2.196461158198094e-05, + "loss": 0.6711, + "step": 20565 + }, + { + "epoch": 0.6003210272873194, + "grad_norm": 1.1039702053972185, + "learning_rate": 2.195408822807282e-05, + "loss": 0.5485, + "step": 20570 + }, + { + "epoch": 0.6004669487815555, + "grad_norm": 1.04655963743811, + "learning_rate": 2.194356616563383e-05, + "loss": 0.5363, + "step": 20575 + }, + { + "epoch": 0.6006128702757916, + "grad_norm": 1.0508311881698102, + "learning_rate": 2.1933045397114245e-05, + "loss": 0.5844, + "step": 20580 + }, + { + "epoch": 0.6007587917700278, + "grad_norm": 0.9366321134470322, + "learning_rate": 2.1922525924964004e-05, + "loss": 0.534, + "step": 20585 + }, + { + "epoch": 0.6009047132642639, + "grad_norm": 1.0380862691353556, + "learning_rate": 2.1912007751632767e-05, + "loss": 0.5768, + "step": 20590 + }, + { + "epoch": 0.6010506347585, + "grad_norm": 1.0366172556067461, + "learning_rate": 2.1901490879569908e-05, + "loss": 0.5643, + "step": 20595 + }, + { + "epoch": 0.601196556252736, + "grad_norm": 1.047360476975221, + "learning_rate": 2.1890975311224448e-05, + "loss": 0.5441, + "step": 20600 + }, + { + "epoch": 0.6013424777469721, + "grad_norm": 1.0172326180650557, + "learning_rate": 2.1880461049045144e-05, + "loss": 0.5771, + "step": 20605 + }, + { + "epoch": 0.6014883992412082, + "grad_norm": 1.2187568986612647, + "learning_rate": 2.1869948095480447e-05, + "loss": 0.5665, + "step": 20610 + }, + { + "epoch": 0.6016343207354443, + "grad_norm": 0.9930688453397093, + "learning_rate": 2.1859436452978483e-05, + "loss": 0.6182, + "step": 20615 + }, + { + "epoch": 0.6017802422296804, + "grad_norm": 1.0418819910204091, + "learning_rate": 2.184892612398709e-05, + "loss": 0.5332, + "step": 20620 + }, + { + "epoch": 0.6019261637239165, + "grad_norm": 0.9857143201984704, + "learning_rate": 2.1838417110953792e-05, + "loss": 0.5625, + "step": 20625 + }, + { + "epoch": 0.6020720852181526, + "grad_norm": 1.1584848470391331, + "learning_rate": 2.182790941632581e-05, + "loss": 0.6107, + "step": 20630 + }, + { + "epoch": 0.6022180067123888, + "grad_norm": 1.1059416565062257, + "learning_rate": 2.181740304255005e-05, + "loss": 0.5432, + "step": 20635 + }, + { + "epoch": 0.6023639282066249, + "grad_norm": 1.123736321120489, + "learning_rate": 2.1806897992073138e-05, + "loss": 0.5449, + "step": 20640 + }, + { + "epoch": 0.602509849700861, + "grad_norm": 0.956918412791628, + "learning_rate": 2.1796394267341346e-05, + "loss": 0.4948, + "step": 20645 + }, + { + "epoch": 0.6026557711950971, + "grad_norm": 0.9718674535469826, + "learning_rate": 2.1785891870800685e-05, + "loss": 0.5327, + "step": 20650 + }, + { + "epoch": 0.6028016926893331, + "grad_norm": 0.9388932894873323, + "learning_rate": 2.1775390804896816e-05, + "loss": 0.5519, + "step": 20655 + }, + { + "epoch": 0.6029476141835692, + "grad_norm": 1.0055246979123322, + "learning_rate": 2.1764891072075112e-05, + "loss": 0.5718, + "step": 20660 + }, + { + "epoch": 0.6030935356778053, + "grad_norm": 1.0017008070083586, + "learning_rate": 2.1754392674780643e-05, + "loss": 0.5489, + "step": 20665 + }, + { + "epoch": 0.6032394571720414, + "grad_norm": 0.8518831082289102, + "learning_rate": 2.1743895615458148e-05, + "loss": 0.5959, + "step": 20670 + }, + { + "epoch": 0.6033853786662775, + "grad_norm": 0.994183401431802, + "learning_rate": 2.173339989655206e-05, + "loss": 0.5938, + "step": 20675 + }, + { + "epoch": 0.6035313001605136, + "grad_norm": 0.8728277634365912, + "learning_rate": 2.172290552050651e-05, + "loss": 0.5572, + "step": 20680 + }, + { + "epoch": 0.6036772216547498, + "grad_norm": 1.0331559815136646, + "learning_rate": 2.171241248976531e-05, + "loss": 0.5819, + "step": 20685 + }, + { + "epoch": 0.6038231431489859, + "grad_norm": 0.9931145179801677, + "learning_rate": 2.1701920806771958e-05, + "loss": 0.5002, + "step": 20690 + }, + { + "epoch": 0.603969064643222, + "grad_norm": 1.294920213556819, + "learning_rate": 2.169143047396963e-05, + "loss": 0.5608, + "step": 20695 + }, + { + "epoch": 0.6041149861374581, + "grad_norm": 0.9987796593517074, + "learning_rate": 2.1680941493801197e-05, + "loss": 0.5195, + "step": 20700 + }, + { + "epoch": 0.6042609076316942, + "grad_norm": 1.082973899450036, + "learning_rate": 2.1670453868709222e-05, + "loss": 0.6269, + "step": 20705 + }, + { + "epoch": 0.6044068291259302, + "grad_norm": 1.186291341820937, + "learning_rate": 2.1659967601135955e-05, + "loss": 0.5906, + "step": 20710 + }, + { + "epoch": 0.6045527506201663, + "grad_norm": 1.0604562835596387, + "learning_rate": 2.1649482693523293e-05, + "loss": 0.557, + "step": 20715 + }, + { + "epoch": 0.6046986721144024, + "grad_norm": 1.314099975332506, + "learning_rate": 2.1638999148312855e-05, + "loss": 0.6083, + "step": 20720 + }, + { + "epoch": 0.6048445936086385, + "grad_norm": 1.0462001366275309, + "learning_rate": 2.1628516967945927e-05, + "loss": 0.5343, + "step": 20725 + }, + { + "epoch": 0.6049905151028746, + "grad_norm": 1.1311857720781664, + "learning_rate": 2.1618036154863485e-05, + "loss": 0.5856, + "step": 20730 + }, + { + "epoch": 0.6051364365971108, + "grad_norm": 1.143649893366773, + "learning_rate": 2.1607556711506194e-05, + "loss": 0.5912, + "step": 20735 + }, + { + "epoch": 0.6052823580913469, + "grad_norm": 0.8457230125523526, + "learning_rate": 2.159707864031436e-05, + "loss": 0.5514, + "step": 20740 + }, + { + "epoch": 0.605428279585583, + "grad_norm": 1.0579730671928256, + "learning_rate": 2.1586601943728013e-05, + "loss": 0.5711, + "step": 20745 + }, + { + "epoch": 0.6055742010798191, + "grad_norm": 1.004001186772471, + "learning_rate": 2.157612662418685e-05, + "loss": 0.521, + "step": 20750 + }, + { + "epoch": 0.6057201225740552, + "grad_norm": 1.0654443347250322, + "learning_rate": 2.1565652684130244e-05, + "loss": 0.5424, + "step": 20755 + }, + { + "epoch": 0.6058660440682913, + "grad_norm": 1.1286265359893384, + "learning_rate": 2.1555180125997247e-05, + "loss": 0.5678, + "step": 20760 + }, + { + "epoch": 0.6060119655625273, + "grad_norm": 1.1479693581431403, + "learning_rate": 2.1544708952226585e-05, + "loss": 0.562, + "step": 20765 + }, + { + "epoch": 0.6061578870567634, + "grad_norm": 1.06875988335612, + "learning_rate": 2.1534239165256658e-05, + "loss": 0.5419, + "step": 20770 + }, + { + "epoch": 0.6063038085509995, + "grad_norm": 1.1545276979146863, + "learning_rate": 2.1523770767525576e-05, + "loss": 0.5368, + "step": 20775 + }, + { + "epoch": 0.6064497300452356, + "grad_norm": 1.0947678661430025, + "learning_rate": 2.151330376147108e-05, + "loss": 0.6497, + "step": 20780 + }, + { + "epoch": 0.6065956515394718, + "grad_norm": 0.9092165975933016, + "learning_rate": 2.150283814953063e-05, + "loss": 0.5881, + "step": 20785 + }, + { + "epoch": 0.6067415730337079, + "grad_norm": 1.0249615315489535, + "learning_rate": 2.1492373934141315e-05, + "loss": 0.5502, + "step": 20790 + }, + { + "epoch": 0.606887494527944, + "grad_norm": 0.8678888065011715, + "learning_rate": 2.148191111773993e-05, + "loss": 0.5354, + "step": 20795 + }, + { + "epoch": 0.6070334160221801, + "grad_norm": 1.0305914310366362, + "learning_rate": 2.147144970276294e-05, + "loss": 0.5299, + "step": 20800 + }, + { + "epoch": 0.6071793375164162, + "grad_norm": 0.9095794650262555, + "learning_rate": 2.1460989691646495e-05, + "loss": 0.5835, + "step": 20805 + }, + { + "epoch": 0.6073252590106523, + "grad_norm": 1.2402571152238935, + "learning_rate": 2.145053108682637e-05, + "loss": 0.6104, + "step": 20810 + }, + { + "epoch": 0.6074711805048884, + "grad_norm": 1.2254193813354115, + "learning_rate": 2.1440073890738073e-05, + "loss": 0.5973, + "step": 20815 + }, + { + "epoch": 0.6076171019991244, + "grad_norm": 0.9822422301400647, + "learning_rate": 2.142961810581675e-05, + "loss": 0.538, + "step": 20820 + }, + { + "epoch": 0.6077630234933605, + "grad_norm": 0.9772408431179281, + "learning_rate": 2.1419163734497233e-05, + "loss": 0.615, + "step": 20825 + }, + { + "epoch": 0.6079089449875966, + "grad_norm": 1.005338618273472, + "learning_rate": 2.140871077921401e-05, + "loss": 0.5758, + "step": 20830 + }, + { + "epoch": 0.6080548664818328, + "grad_norm": 1.0106155205475722, + "learning_rate": 2.139825924240125e-05, + "loss": 0.5656, + "step": 20835 + }, + { + "epoch": 0.6082007879760689, + "grad_norm": 0.9939845411067095, + "learning_rate": 2.1387809126492777e-05, + "loss": 0.5504, + "step": 20840 + }, + { + "epoch": 0.608346709470305, + "grad_norm": 1.1478236779280577, + "learning_rate": 2.13773604339221e-05, + "loss": 0.5632, + "step": 20845 + }, + { + "epoch": 0.6084926309645411, + "grad_norm": 1.1606344762948329, + "learning_rate": 2.136691316712241e-05, + "loss": 0.5607, + "step": 20850 + }, + { + "epoch": 0.6086385524587772, + "grad_norm": 1.0153698575612413, + "learning_rate": 2.1356467328526537e-05, + "loss": 0.6162, + "step": 20855 + }, + { + "epoch": 0.6087844739530133, + "grad_norm": 1.0426421631438765, + "learning_rate": 2.1346022920566976e-05, + "loss": 0.5326, + "step": 20860 + }, + { + "epoch": 0.6089303954472494, + "grad_norm": 1.0776483199406754, + "learning_rate": 2.1335579945675926e-05, + "loss": 0.5695, + "step": 20865 + }, + { + "epoch": 0.6090763169414855, + "grad_norm": 1.0780409586606694, + "learning_rate": 2.1325138406285212e-05, + "loss": 0.5485, + "step": 20870 + }, + { + "epoch": 0.6092222384357215, + "grad_norm": 1.112676160883401, + "learning_rate": 2.1314698304826347e-05, + "loss": 0.5152, + "step": 20875 + }, + { + "epoch": 0.6093681599299576, + "grad_norm": 1.2861610403341261, + "learning_rate": 2.1304259643730508e-05, + "loss": 0.6097, + "step": 20880 + }, + { + "epoch": 0.6095140814241938, + "grad_norm": 1.0235654980519893, + "learning_rate": 2.1293822425428528e-05, + "loss": 0.5154, + "step": 20885 + }, + { + "epoch": 0.6096600029184299, + "grad_norm": 1.0248336427277436, + "learning_rate": 2.1283386652350897e-05, + "loss": 0.6362, + "step": 20890 + }, + { + "epoch": 0.609805924412666, + "grad_norm": 0.9238274484533109, + "learning_rate": 2.12729523269278e-05, + "loss": 0.538, + "step": 20895 + }, + { + "epoch": 0.6099518459069021, + "grad_norm": 1.1268433274417007, + "learning_rate": 2.1262519451589057e-05, + "loss": 0.6846, + "step": 20900 + }, + { + "epoch": 0.6100977674011382, + "grad_norm": 0.8409508299516611, + "learning_rate": 2.125208802876415e-05, + "loss": 0.555, + "step": 20905 + }, + { + "epoch": 0.6102436888953743, + "grad_norm": 1.060115998635965, + "learning_rate": 2.1241658060882237e-05, + "loss": 0.5642, + "step": 20910 + }, + { + "epoch": 0.6103896103896104, + "grad_norm": 1.0056216063178673, + "learning_rate": 2.123122955037213e-05, + "loss": 0.5804, + "step": 20915 + }, + { + "epoch": 0.6105355318838465, + "grad_norm": 1.1343809433512069, + "learning_rate": 2.12208024996623e-05, + "loss": 0.5894, + "step": 20920 + }, + { + "epoch": 0.6106814533780826, + "grad_norm": 1.2345845805770375, + "learning_rate": 2.1210376911180897e-05, + "loss": 0.5921, + "step": 20925 + }, + { + "epoch": 0.6108273748723186, + "grad_norm": 1.1387616682537929, + "learning_rate": 2.1199952787355687e-05, + "loss": 0.5643, + "step": 20930 + }, + { + "epoch": 0.6109732963665548, + "grad_norm": 1.1019699759279846, + "learning_rate": 2.1189530130614144e-05, + "loss": 0.5827, + "step": 20935 + }, + { + "epoch": 0.6111192178607909, + "grad_norm": 0.9621888290522305, + "learning_rate": 2.1179108943383362e-05, + "loss": 0.5376, + "step": 20940 + }, + { + "epoch": 0.611265139355027, + "grad_norm": 1.1429063699441584, + "learning_rate": 2.1168689228090116e-05, + "loss": 0.6299, + "step": 20945 + }, + { + "epoch": 0.6114110608492631, + "grad_norm": 0.9793385378375368, + "learning_rate": 2.1158270987160845e-05, + "loss": 0.582, + "step": 20950 + }, + { + "epoch": 0.6115569823434992, + "grad_norm": 0.9625935878807543, + "learning_rate": 2.1147854223021602e-05, + "loss": 0.5464, + "step": 20955 + }, + { + "epoch": 0.6117029038377353, + "grad_norm": 1.1470381136472476, + "learning_rate": 2.1137438938098146e-05, + "loss": 0.5253, + "step": 20960 + }, + { + "epoch": 0.6118488253319714, + "grad_norm": 1.028113568982448, + "learning_rate": 2.112702513481587e-05, + "loss": 0.5293, + "step": 20965 + }, + { + "epoch": 0.6119947468262075, + "grad_norm": 1.1336046067258037, + "learning_rate": 2.111661281559982e-05, + "loss": 0.5734, + "step": 20970 + }, + { + "epoch": 0.6121406683204436, + "grad_norm": 1.0997543051865089, + "learning_rate": 2.110620198287469e-05, + "loss": 0.6211, + "step": 20975 + }, + { + "epoch": 0.6122865898146796, + "grad_norm": 0.9912151064564366, + "learning_rate": 2.1095792639064848e-05, + "loss": 0.548, + "step": 20980 + }, + { + "epoch": 0.6124325113089159, + "grad_norm": 1.0613383488483452, + "learning_rate": 2.1085384786594293e-05, + "loss": 0.5345, + "step": 20985 + }, + { + "epoch": 0.6125784328031519, + "grad_norm": 1.3034219194078562, + "learning_rate": 2.10749784278867e-05, + "loss": 0.6727, + "step": 20990 + }, + { + "epoch": 0.612724354297388, + "grad_norm": 0.9465008086291266, + "learning_rate": 2.1064573565365386e-05, + "loss": 0.5688, + "step": 20995 + }, + { + "epoch": 0.6128702757916241, + "grad_norm": 1.0180498381045107, + "learning_rate": 2.1054170201453306e-05, + "loss": 0.5633, + "step": 21000 + }, + { + "epoch": 0.6130161972858602, + "grad_norm": 1.0657436655782013, + "learning_rate": 2.1043768338573078e-05, + "loss": 0.5402, + "step": 21005 + }, + { + "epoch": 0.6131621187800963, + "grad_norm": 1.004813186994838, + "learning_rate": 2.1033367979146977e-05, + "loss": 0.5228, + "step": 21010 + }, + { + "epoch": 0.6133080402743324, + "grad_norm": 0.9249121557971645, + "learning_rate": 2.1022969125596914e-05, + "loss": 0.5876, + "step": 21015 + }, + { + "epoch": 0.6134539617685685, + "grad_norm": 1.1860403940140252, + "learning_rate": 2.101257178034447e-05, + "loss": 0.5574, + "step": 21020 + }, + { + "epoch": 0.6135998832628046, + "grad_norm": 1.0965048000541653, + "learning_rate": 2.1002175945810848e-05, + "loss": 0.5706, + "step": 21025 + }, + { + "epoch": 0.6137458047570407, + "grad_norm": 1.3360917650644655, + "learning_rate": 2.0991781624416906e-05, + "loss": 0.6071, + "step": 21030 + }, + { + "epoch": 0.6138917262512769, + "grad_norm": 1.383901407067633, + "learning_rate": 2.0981388818583177e-05, + "loss": 0.6533, + "step": 21035 + }, + { + "epoch": 0.614037647745513, + "grad_norm": 1.1014294578665347, + "learning_rate": 2.0970997530729803e-05, + "loss": 0.5424, + "step": 21040 + }, + { + "epoch": 0.614183569239749, + "grad_norm": 1.1272101205528826, + "learning_rate": 2.0960607763276598e-05, + "loss": 0.5894, + "step": 21045 + }, + { + "epoch": 0.6143294907339851, + "grad_norm": 1.0062562554036623, + "learning_rate": 2.095021951864301e-05, + "loss": 0.595, + "step": 21050 + }, + { + "epoch": 0.6144754122282212, + "grad_norm": 1.124343377791846, + "learning_rate": 2.0939832799248132e-05, + "loss": 0.5899, + "step": 21055 + }, + { + "epoch": 0.6146213337224573, + "grad_norm": 1.0910739427240361, + "learning_rate": 2.092944760751071e-05, + "loss": 0.6024, + "step": 21060 + }, + { + "epoch": 0.6147672552166934, + "grad_norm": 0.9850902669143926, + "learning_rate": 2.0919063945849132e-05, + "loss": 0.6254, + "step": 21065 + }, + { + "epoch": 0.6149131767109295, + "grad_norm": 1.0522794167654212, + "learning_rate": 2.090868181668143e-05, + "loss": 0.5739, + "step": 21070 + }, + { + "epoch": 0.6150590982051656, + "grad_norm": 1.3670239792786028, + "learning_rate": 2.0898301222425264e-05, + "loss": 0.5878, + "step": 21075 + }, + { + "epoch": 0.6152050196994017, + "grad_norm": 1.174906405545156, + "learning_rate": 2.0887922165497963e-05, + "loss": 0.6097, + "step": 21080 + }, + { + "epoch": 0.6153509411936379, + "grad_norm": 1.1029151659068621, + "learning_rate": 2.087754464831648e-05, + "loss": 0.5913, + "step": 21085 + }, + { + "epoch": 0.615496862687874, + "grad_norm": 1.0722149946417212, + "learning_rate": 2.0867168673297404e-05, + "loss": 0.5466, + "step": 21090 + }, + { + "epoch": 0.61564278418211, + "grad_norm": 1.0671606458624274, + "learning_rate": 2.0856794242856987e-05, + "loss": 0.5784, + "step": 21095 + }, + { + "epoch": 0.6157887056763461, + "grad_norm": 0.9656602125896625, + "learning_rate": 2.08464213594111e-05, + "loss": 0.5053, + "step": 21100 + }, + { + "epoch": 0.6159346271705822, + "grad_norm": 0.8487333982574742, + "learning_rate": 2.083605002537527e-05, + "loss": 0.4968, + "step": 21105 + }, + { + "epoch": 0.6160805486648183, + "grad_norm": 1.0654263172881016, + "learning_rate": 2.082568024316465e-05, + "loss": 0.5615, + "step": 21110 + }, + { + "epoch": 0.6162264701590544, + "grad_norm": 0.9245827144939938, + "learning_rate": 2.081531201519405e-05, + "loss": 0.5623, + "step": 21115 + }, + { + "epoch": 0.6163723916532905, + "grad_norm": 1.0630141062163003, + "learning_rate": 2.080494534387789e-05, + "loss": 0.5863, + "step": 21120 + }, + { + "epoch": 0.6165183131475266, + "grad_norm": 1.2693387154464157, + "learning_rate": 2.079458023163025e-05, + "loss": 0.5639, + "step": 21125 + }, + { + "epoch": 0.6166642346417627, + "grad_norm": 0.9376850976772002, + "learning_rate": 2.0784216680864833e-05, + "loss": 0.5426, + "step": 21130 + }, + { + "epoch": 0.6168101561359989, + "grad_norm": 1.0769892123138154, + "learning_rate": 2.0773854693994994e-05, + "loss": 0.5566, + "step": 21135 + }, + { + "epoch": 0.616956077630235, + "grad_norm": 1.193443691146768, + "learning_rate": 2.0763494273433727e-05, + "loss": 0.5786, + "step": 21140 + }, + { + "epoch": 0.6171019991244711, + "grad_norm": 1.0189284087574257, + "learning_rate": 2.0753135421593616e-05, + "loss": 0.5368, + "step": 21145 + }, + { + "epoch": 0.6172479206187071, + "grad_norm": 7.954496033399136, + "learning_rate": 2.0742778140886936e-05, + "loss": 0.5891, + "step": 21150 + }, + { + "epoch": 0.6173938421129432, + "grad_norm": 1.0887223560450103, + "learning_rate": 2.0732422433725574e-05, + "loss": 0.5695, + "step": 21155 + }, + { + "epoch": 0.6175397636071793, + "grad_norm": 1.1922631437013145, + "learning_rate": 2.0722068302521048e-05, + "loss": 0.5527, + "step": 21160 + }, + { + "epoch": 0.6176856851014154, + "grad_norm": 0.9858764650298035, + "learning_rate": 2.0711715749684502e-05, + "loss": 0.6035, + "step": 21165 + }, + { + "epoch": 0.6178316065956515, + "grad_norm": 1.0346440495787257, + "learning_rate": 2.0701364777626725e-05, + "loss": 0.5369, + "step": 21170 + }, + { + "epoch": 0.6179775280898876, + "grad_norm": 0.9005077258704912, + "learning_rate": 2.0691015388758138e-05, + "loss": 0.5485, + "step": 21175 + }, + { + "epoch": 0.6181234495841238, + "grad_norm": 1.126403900209555, + "learning_rate": 2.068066758548879e-05, + "loss": 0.5753, + "step": 21180 + }, + { + "epoch": 0.6182693710783599, + "grad_norm": 1.190056036680942, + "learning_rate": 2.0670321370228362e-05, + "loss": 0.575, + "step": 21185 + }, + { + "epoch": 0.618415292572596, + "grad_norm": 1.0279237016730343, + "learning_rate": 2.0659976745386157e-05, + "loss": 0.5619, + "step": 21190 + }, + { + "epoch": 0.6185612140668321, + "grad_norm": 0.9287121372900066, + "learning_rate": 2.0649633713371107e-05, + "loss": 0.5276, + "step": 21195 + }, + { + "epoch": 0.6187071355610682, + "grad_norm": 0.965304169246816, + "learning_rate": 2.0639292276591798e-05, + "loss": 0.5693, + "step": 21200 + }, + { + "epoch": 0.6188530570553042, + "grad_norm": 1.0316334511895353, + "learning_rate": 2.062895243745641e-05, + "loss": 0.5203, + "step": 21205 + }, + { + "epoch": 0.6189989785495403, + "grad_norm": 0.9407941404535358, + "learning_rate": 2.0618614198372793e-05, + "loss": 0.5648, + "step": 21210 + }, + { + "epoch": 0.6191449000437764, + "grad_norm": 1.0383205896852723, + "learning_rate": 2.0608277561748372e-05, + "loss": 0.5419, + "step": 21215 + }, + { + "epoch": 0.6192908215380125, + "grad_norm": 0.8846479166681062, + "learning_rate": 2.0597942529990233e-05, + "loss": 0.5305, + "step": 21220 + }, + { + "epoch": 0.6194367430322486, + "grad_norm": 1.0870178223318203, + "learning_rate": 2.0587609105505096e-05, + "loss": 0.5516, + "step": 21225 + }, + { + "epoch": 0.6195826645264848, + "grad_norm": 0.9493673859021511, + "learning_rate": 2.0577277290699272e-05, + "loss": 0.5524, + "step": 21230 + }, + { + "epoch": 0.6197285860207209, + "grad_norm": 1.075991458806228, + "learning_rate": 2.0566947087978727e-05, + "loss": 0.5922, + "step": 21235 + }, + { + "epoch": 0.619874507514957, + "grad_norm": 1.0315632090199243, + "learning_rate": 2.0556618499749037e-05, + "loss": 0.5828, + "step": 21240 + }, + { + "epoch": 0.6200204290091931, + "grad_norm": 1.114033719898931, + "learning_rate": 2.054629152841541e-05, + "loss": 0.5625, + "step": 21245 + }, + { + "epoch": 0.6201663505034292, + "grad_norm": 0.8769256769172714, + "learning_rate": 2.053596617638267e-05, + "loss": 0.5527, + "step": 21250 + }, + { + "epoch": 0.6203122719976653, + "grad_norm": 0.9331616257283377, + "learning_rate": 2.052564244605529e-05, + "loss": 0.5256, + "step": 21255 + }, + { + "epoch": 0.6204581934919013, + "grad_norm": 0.9385707265453662, + "learning_rate": 2.0515320339837307e-05, + "loss": 0.5222, + "step": 21260 + }, + { + "epoch": 0.6206041149861374, + "grad_norm": 0.9865989544284782, + "learning_rate": 2.050499986013244e-05, + "loss": 0.6163, + "step": 21265 + }, + { + "epoch": 0.6207500364803735, + "grad_norm": 0.8994829113519723, + "learning_rate": 2.0494681009343997e-05, + "loss": 0.5327, + "step": 21270 + }, + { + "epoch": 0.6208959579746096, + "grad_norm": 1.0514526287984596, + "learning_rate": 2.0484363789874922e-05, + "loss": 0.5518, + "step": 21275 + }, + { + "epoch": 0.6210418794688458, + "grad_norm": 1.069301747313711, + "learning_rate": 2.0474048204127773e-05, + "loss": 0.5982, + "step": 21280 + }, + { + "epoch": 0.6211878009630819, + "grad_norm": 1.3294654380040898, + "learning_rate": 2.046373425450471e-05, + "loss": 0.5861, + "step": 21285 + }, + { + "epoch": 0.621333722457318, + "grad_norm": 0.9223621789935365, + "learning_rate": 2.0453421943407547e-05, + "loss": 0.5555, + "step": 21290 + }, + { + "epoch": 0.6214796439515541, + "grad_norm": 0.9501808462250673, + "learning_rate": 2.04431112732377e-05, + "loss": 0.5902, + "step": 21295 + }, + { + "epoch": 0.6216255654457902, + "grad_norm": 0.8997578243825174, + "learning_rate": 2.0432802246396176e-05, + "loss": 0.5294, + "step": 21300 + }, + { + "epoch": 0.6217714869400263, + "grad_norm": 1.1095027987155772, + "learning_rate": 2.0422494865283658e-05, + "loss": 0.621, + "step": 21305 + }, + { + "epoch": 0.6219174084342624, + "grad_norm": 1.0195718472616002, + "learning_rate": 2.0412189132300384e-05, + "loss": 0.5538, + "step": 21310 + }, + { + "epoch": 0.6220633299284984, + "grad_norm": 1.1603937337271175, + "learning_rate": 2.0401885049846253e-05, + "loss": 0.5653, + "step": 21315 + }, + { + "epoch": 0.6222092514227345, + "grad_norm": 0.857776049578617, + "learning_rate": 2.039158262032076e-05, + "loss": 0.5229, + "step": 21320 + }, + { + "epoch": 0.6223551729169706, + "grad_norm": 1.0633128806757628, + "learning_rate": 2.0381281846123012e-05, + "loss": 0.5651, + "step": 21325 + }, + { + "epoch": 0.6225010944112068, + "grad_norm": 0.9907112614236621, + "learning_rate": 2.037098272965175e-05, + "loss": 0.6081, + "step": 21330 + }, + { + "epoch": 0.6226470159054429, + "grad_norm": 1.2478561719135073, + "learning_rate": 2.0360685273305303e-05, + "loss": 0.5885, + "step": 21335 + }, + { + "epoch": 0.622792937399679, + "grad_norm": 1.0533024011191094, + "learning_rate": 2.0350389479481625e-05, + "loss": 0.6176, + "step": 21340 + }, + { + "epoch": 0.6229388588939151, + "grad_norm": 0.9586637821408782, + "learning_rate": 2.0340095350578285e-05, + "loss": 0.565, + "step": 21345 + }, + { + "epoch": 0.6230847803881512, + "grad_norm": 1.0426406732603255, + "learning_rate": 2.0329802888992485e-05, + "loss": 0.5626, + "step": 21350 + }, + { + "epoch": 0.6232307018823873, + "grad_norm": 1.0347510408200014, + "learning_rate": 2.031951209712098e-05, + "loss": 0.6089, + "step": 21355 + }, + { + "epoch": 0.6233766233766234, + "grad_norm": 0.9996708460516198, + "learning_rate": 2.0309222977360193e-05, + "loss": 0.5789, + "step": 21360 + }, + { + "epoch": 0.6235225448708595, + "grad_norm": 1.4525903403202267, + "learning_rate": 2.029893553210615e-05, + "loss": 0.5948, + "step": 21365 + }, + { + "epoch": 0.6236684663650955, + "grad_norm": 0.8834234802936349, + "learning_rate": 2.0288649763754448e-05, + "loss": 0.5683, + "step": 21370 + }, + { + "epoch": 0.6238143878593316, + "grad_norm": 1.073468640838132, + "learning_rate": 2.0278365674700343e-05, + "loss": 0.5599, + "step": 21375 + }, + { + "epoch": 0.6239603093535678, + "grad_norm": 0.9808194882641018, + "learning_rate": 2.0268083267338665e-05, + "loss": 0.564, + "step": 21380 + }, + { + "epoch": 0.6241062308478039, + "grad_norm": 1.0608993769212025, + "learning_rate": 2.0257802544063865e-05, + "loss": 0.6158, + "step": 21385 + }, + { + "epoch": 0.62425215234204, + "grad_norm": 1.1540170659309947, + "learning_rate": 2.024752350727e-05, + "loss": 0.6033, + "step": 21390 + }, + { + "epoch": 0.6243980738362761, + "grad_norm": 0.98006331599683, + "learning_rate": 2.023724615935075e-05, + "loss": 0.5733, + "step": 21395 + }, + { + "epoch": 0.6245439953305122, + "grad_norm": 0.967465655258965, + "learning_rate": 2.0226970502699373e-05, + "loss": 0.5419, + "step": 21400 + }, + { + "epoch": 0.6246899168247483, + "grad_norm": 1.0185109680764977, + "learning_rate": 2.0216696539708745e-05, + "loss": 0.5123, + "step": 21405 + }, + { + "epoch": 0.6248358383189844, + "grad_norm": 1.0499718379617553, + "learning_rate": 2.0206424272771364e-05, + "loss": 0.5261, + "step": 21410 + }, + { + "epoch": 0.6249817598132205, + "grad_norm": 1.144159539526579, + "learning_rate": 2.0196153704279303e-05, + "loss": 0.534, + "step": 21415 + }, + { + "epoch": 0.6251276813074566, + "grad_norm": 1.064938894036352, + "learning_rate": 2.018588483662427e-05, + "loss": 0.5612, + "step": 21420 + }, + { + "epoch": 0.6252736028016926, + "grad_norm": 1.0264622459731616, + "learning_rate": 2.0175617672197568e-05, + "loss": 0.5694, + "step": 21425 + }, + { + "epoch": 0.6254195242959288, + "grad_norm": 1.1168875367414954, + "learning_rate": 2.016535221339007e-05, + "loss": 0.5821, + "step": 21430 + }, + { + "epoch": 0.6255654457901649, + "grad_norm": 0.9744687105585814, + "learning_rate": 2.015508846259231e-05, + "loss": 0.5947, + "step": 21435 + }, + { + "epoch": 0.625711367284401, + "grad_norm": 0.9351018711588722, + "learning_rate": 2.014482642219437e-05, + "loss": 0.5697, + "step": 21440 + }, + { + "epoch": 0.6258572887786371, + "grad_norm": 1.074847075465745, + "learning_rate": 2.0134566094585976e-05, + "loss": 0.571, + "step": 21445 + }, + { + "epoch": 0.6260032102728732, + "grad_norm": 0.9232741818204088, + "learning_rate": 2.0124307482156428e-05, + "loss": 0.5622, + "step": 21450 + }, + { + "epoch": 0.6261491317671093, + "grad_norm": 1.1075312982604204, + "learning_rate": 2.0114050587294635e-05, + "loss": 0.5161, + "step": 21455 + }, + { + "epoch": 0.6262950532613454, + "grad_norm": 1.0498199037689169, + "learning_rate": 2.010379541238911e-05, + "loss": 0.5611, + "step": 21460 + }, + { + "epoch": 0.6264409747555815, + "grad_norm": 0.8194196037784405, + "learning_rate": 2.0093541959827956e-05, + "loss": 0.4801, + "step": 21465 + }, + { + "epoch": 0.6265868962498176, + "grad_norm": 0.9935096321243365, + "learning_rate": 2.0083290231998896e-05, + "loss": 0.5724, + "step": 21470 + }, + { + "epoch": 0.6267328177440536, + "grad_norm": 1.0599974385061437, + "learning_rate": 2.007304023128922e-05, + "loss": 0.5287, + "step": 21475 + }, + { + "epoch": 0.6268787392382899, + "grad_norm": 0.9435461840115053, + "learning_rate": 2.006279196008584e-05, + "loss": 0.56, + "step": 21480 + }, + { + "epoch": 0.6270246607325259, + "grad_norm": 0.9392940918791206, + "learning_rate": 2.005254542077525e-05, + "loss": 0.5103, + "step": 21485 + }, + { + "epoch": 0.627170582226762, + "grad_norm": 1.1049056641867625, + "learning_rate": 2.004230061574356e-05, + "loss": 0.5795, + "step": 21490 + }, + { + "epoch": 0.6273165037209981, + "grad_norm": 1.051987756658652, + "learning_rate": 2.0032057547376458e-05, + "loss": 0.5502, + "step": 21495 + }, + { + "epoch": 0.6274624252152342, + "grad_norm": 1.1779116305533983, + "learning_rate": 2.002181621805923e-05, + "loss": 0.6, + "step": 21500 + }, + { + "epoch": 0.6276083467094703, + "grad_norm": 0.9992861488429373, + "learning_rate": 2.0011576630176766e-05, + "loss": 0.5858, + "step": 21505 + }, + { + "epoch": 0.6277542682037064, + "grad_norm": 1.1511294131684007, + "learning_rate": 2.000133878611354e-05, + "loss": 0.5821, + "step": 21510 + }, + { + "epoch": 0.6279001896979425, + "grad_norm": 1.0451854485275764, + "learning_rate": 1.999110268825363e-05, + "loss": 0.5955, + "step": 21515 + }, + { + "epoch": 0.6280461111921786, + "grad_norm": 0.8563721435460955, + "learning_rate": 1.9980868338980708e-05, + "loss": 0.5137, + "step": 21520 + }, + { + "epoch": 0.6281920326864147, + "grad_norm": 1.0629345700263528, + "learning_rate": 1.9970635740678017e-05, + "loss": 0.5727, + "step": 21525 + }, + { + "epoch": 0.6283379541806509, + "grad_norm": 1.1178738371594295, + "learning_rate": 1.9960404895728425e-05, + "loss": 0.5467, + "step": 21530 + }, + { + "epoch": 0.628483875674887, + "grad_norm": 1.239028514520143, + "learning_rate": 1.995017580651436e-05, + "loss": 0.5712, + "step": 21535 + }, + { + "epoch": 0.628629797169123, + "grad_norm": 1.0309535186259977, + "learning_rate": 1.993994847541788e-05, + "loss": 0.532, + "step": 21540 + }, + { + "epoch": 0.6287757186633591, + "grad_norm": 1.0949356423924668, + "learning_rate": 1.992972290482058e-05, + "loss": 0.628, + "step": 21545 + }, + { + "epoch": 0.6289216401575952, + "grad_norm": 1.1365741643920848, + "learning_rate": 1.9919499097103707e-05, + "loss": 0.5629, + "step": 21550 + }, + { + "epoch": 0.6290675616518313, + "grad_norm": 0.9266869761695989, + "learning_rate": 1.9909277054648035e-05, + "loss": 0.5483, + "step": 21555 + }, + { + "epoch": 0.6292134831460674, + "grad_norm": 1.0233942261886486, + "learning_rate": 1.9899056779833968e-05, + "loss": 0.5333, + "step": 21560 + }, + { + "epoch": 0.6293594046403035, + "grad_norm": 1.0904015542221328, + "learning_rate": 1.9888838275041504e-05, + "loss": 0.5333, + "step": 21565 + }, + { + "epoch": 0.6295053261345396, + "grad_norm": 1.0064669175235683, + "learning_rate": 1.9878621542650194e-05, + "loss": 0.5703, + "step": 21570 + }, + { + "epoch": 0.6296512476287757, + "grad_norm": 0.8647918065236093, + "learning_rate": 1.98684065850392e-05, + "loss": 0.5671, + "step": 21575 + }, + { + "epoch": 0.6297971691230119, + "grad_norm": 1.0763160247628032, + "learning_rate": 1.9858193404587266e-05, + "loss": 0.5683, + "step": 21580 + }, + { + "epoch": 0.629943090617248, + "grad_norm": 0.9557088483317157, + "learning_rate": 1.9847982003672722e-05, + "loss": 0.5601, + "step": 21585 + }, + { + "epoch": 0.630089012111484, + "grad_norm": 1.0662843512762523, + "learning_rate": 1.9837772384673493e-05, + "loss": 0.5717, + "step": 21590 + }, + { + "epoch": 0.6302349336057201, + "grad_norm": 1.204606190876787, + "learning_rate": 1.9827564549967066e-05, + "loss": 0.5785, + "step": 21595 + }, + { + "epoch": 0.6303808550999562, + "grad_norm": 1.213489042518213, + "learning_rate": 1.981735850193052e-05, + "loss": 0.614, + "step": 21600 + }, + { + "epoch": 0.6305267765941923, + "grad_norm": 1.1613852732207153, + "learning_rate": 1.9807154242940538e-05, + "loss": 0.591, + "step": 21605 + }, + { + "epoch": 0.6306726980884284, + "grad_norm": 1.1104349952177215, + "learning_rate": 1.979695177537338e-05, + "loss": 0.4961, + "step": 21610 + }, + { + "epoch": 0.6308186195826645, + "grad_norm": 1.027669194751987, + "learning_rate": 1.9786751101604856e-05, + "loss": 0.5297, + "step": 21615 + }, + { + "epoch": 0.6309645410769006, + "grad_norm": 1.0842382631888179, + "learning_rate": 1.9776552224010408e-05, + "loss": 0.5428, + "step": 21620 + }, + { + "epoch": 0.6311104625711367, + "grad_norm": 1.1392991137613815, + "learning_rate": 1.9766355144965015e-05, + "loss": 0.5222, + "step": 21625 + }, + { + "epoch": 0.6312563840653729, + "grad_norm": 1.1786876753215634, + "learning_rate": 1.9756159866843266e-05, + "loss": 0.6008, + "step": 21630 + }, + { + "epoch": 0.631402305559609, + "grad_norm": 0.9846830610408701, + "learning_rate": 1.9745966392019334e-05, + "loss": 0.5126, + "step": 21635 + }, + { + "epoch": 0.6315482270538451, + "grad_norm": 1.1603368204216153, + "learning_rate": 1.9735774722866934e-05, + "loss": 0.5422, + "step": 21640 + }, + { + "epoch": 0.6316941485480811, + "grad_norm": 1.1548009339559104, + "learning_rate": 1.9725584861759404e-05, + "loss": 0.5643, + "step": 21645 + }, + { + "epoch": 0.6318400700423172, + "grad_norm": 0.9860952435174237, + "learning_rate": 1.9715396811069646e-05, + "loss": 0.5029, + "step": 21650 + }, + { + "epoch": 0.6319859915365533, + "grad_norm": 1.1576391095470537, + "learning_rate": 1.970521057317013e-05, + "loss": 0.5766, + "step": 21655 + }, + { + "epoch": 0.6321319130307894, + "grad_norm": 1.1608427762529696, + "learning_rate": 1.9695026150432917e-05, + "loss": 0.5499, + "step": 21660 + }, + { + "epoch": 0.6322778345250255, + "grad_norm": 1.0831664722466747, + "learning_rate": 1.9684843545229632e-05, + "loss": 0.5557, + "step": 21665 + }, + { + "epoch": 0.6324237560192616, + "grad_norm": 0.9752829167183303, + "learning_rate": 1.9674662759931488e-05, + "loss": 0.6056, + "step": 21670 + }, + { + "epoch": 0.6325696775134977, + "grad_norm": 1.0721075951551997, + "learning_rate": 1.9664483796909273e-05, + "loss": 0.478, + "step": 21675 + }, + { + "epoch": 0.6327155990077339, + "grad_norm": 1.1633242654425933, + "learning_rate": 1.9654306658533343e-05, + "loss": 0.6239, + "step": 21680 + }, + { + "epoch": 0.63286152050197, + "grad_norm": 0.951271485650146, + "learning_rate": 1.9644131347173657e-05, + "loss": 0.5801, + "step": 21685 + }, + { + "epoch": 0.6330074419962061, + "grad_norm": 1.1507745632632937, + "learning_rate": 1.9633957865199698e-05, + "loss": 0.5971, + "step": 21690 + }, + { + "epoch": 0.6331533634904422, + "grad_norm": 0.9349511367178429, + "learning_rate": 1.9623786214980565e-05, + "loss": 0.5231, + "step": 21695 + }, + { + "epoch": 0.6332992849846782, + "grad_norm": 1.0686143834784587, + "learning_rate": 1.9613616398884917e-05, + "loss": 0.5491, + "step": 21700 + }, + { + "epoch": 0.6334452064789143, + "grad_norm": 1.0066415742846488, + "learning_rate": 1.9603448419280984e-05, + "loss": 0.5576, + "step": 21705 + }, + { + "epoch": 0.6335911279731504, + "grad_norm": 1.0813473886407243, + "learning_rate": 1.959328227853656e-05, + "loss": 0.6038, + "step": 21710 + }, + { + "epoch": 0.6337370494673865, + "grad_norm": 1.0700488508256019, + "learning_rate": 1.9583117979019033e-05, + "loss": 0.5768, + "step": 21715 + }, + { + "epoch": 0.6338829709616226, + "grad_norm": 1.1626816701622698, + "learning_rate": 1.9572955523095348e-05, + "loss": 0.5268, + "step": 21720 + }, + { + "epoch": 0.6340288924558587, + "grad_norm": 1.0848277872998586, + "learning_rate": 1.9562794913132026e-05, + "loss": 0.5562, + "step": 21725 + }, + { + "epoch": 0.6341748139500949, + "grad_norm": 0.9970875121518702, + "learning_rate": 1.9552636151495148e-05, + "loss": 0.5048, + "step": 21730 + }, + { + "epoch": 0.634320735444331, + "grad_norm": 1.0701842205964907, + "learning_rate": 1.954247924055037e-05, + "loss": 0.596, + "step": 21735 + }, + { + "epoch": 0.6344666569385671, + "grad_norm": 0.9218209455053972, + "learning_rate": 1.9532324182662924e-05, + "loss": 0.5692, + "step": 21740 + }, + { + "epoch": 0.6346125784328032, + "grad_norm": 1.0352803640309893, + "learning_rate": 1.95221709801976e-05, + "loss": 0.5592, + "step": 21745 + }, + { + "epoch": 0.6347584999270393, + "grad_norm": 1.099854377686255, + "learning_rate": 1.9512019635518762e-05, + "loss": 0.5943, + "step": 21750 + }, + { + "epoch": 0.6349044214212753, + "grad_norm": 1.3136065269390034, + "learning_rate": 1.9501870150990353e-05, + "loss": 0.5205, + "step": 21755 + }, + { + "epoch": 0.6350503429155114, + "grad_norm": 1.1008757241372282, + "learning_rate": 1.9491722528975852e-05, + "loss": 0.5139, + "step": 21760 + }, + { + "epoch": 0.6351962644097475, + "grad_norm": 0.9960248558880715, + "learning_rate": 1.9481576771838327e-05, + "loss": 0.5867, + "step": 21765 + }, + { + "epoch": 0.6353421859039836, + "grad_norm": 0.9461912387161494, + "learning_rate": 1.9471432881940417e-05, + "loss": 0.521, + "step": 21770 + }, + { + "epoch": 0.6354881073982197, + "grad_norm": 0.9295808817064493, + "learning_rate": 1.9461290861644295e-05, + "loss": 0.5494, + "step": 21775 + }, + { + "epoch": 0.6356340288924559, + "grad_norm": 1.0333998877701482, + "learning_rate": 1.945115071331175e-05, + "loss": 0.5251, + "step": 21780 + }, + { + "epoch": 0.635779950386692, + "grad_norm": 1.0724527217686928, + "learning_rate": 1.9441012439304075e-05, + "loss": 0.5382, + "step": 21785 + }, + { + "epoch": 0.6359258718809281, + "grad_norm": 0.994721729197665, + "learning_rate": 1.943087604198217e-05, + "loss": 0.5614, + "step": 21790 + }, + { + "epoch": 0.6360717933751642, + "grad_norm": 0.895917359408065, + "learning_rate": 1.9420741523706493e-05, + "loss": 0.5689, + "step": 21795 + }, + { + "epoch": 0.6362177148694003, + "grad_norm": 1.1898866616462012, + "learning_rate": 1.9410608886837043e-05, + "loss": 0.5901, + "step": 21800 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.989276378289207, + "learning_rate": 1.9400478133733402e-05, + "loss": 0.557, + "step": 21805 + }, + { + "epoch": 0.6365095578578724, + "grad_norm": 0.9978327495797231, + "learning_rate": 1.9390349266754694e-05, + "loss": 0.5859, + "step": 21810 + }, + { + "epoch": 0.6366554793521085, + "grad_norm": 0.9133972603259738, + "learning_rate": 1.9380222288259618e-05, + "loss": 0.5451, + "step": 21815 + }, + { + "epoch": 0.6368014008463446, + "grad_norm": 1.0705297075965714, + "learning_rate": 1.9370097200606436e-05, + "loss": 0.5974, + "step": 21820 + }, + { + "epoch": 0.6369473223405807, + "grad_norm": 1.099350454401547, + "learning_rate": 1.935997400615297e-05, + "loss": 0.5848, + "step": 21825 + }, + { + "epoch": 0.6370932438348169, + "grad_norm": 1.033662516080168, + "learning_rate": 1.934985270725658e-05, + "loss": 0.5386, + "step": 21830 + }, + { + "epoch": 0.637239165329053, + "grad_norm": 1.0822618943713955, + "learning_rate": 1.933973330627421e-05, + "loss": 0.5235, + "step": 21835 + }, + { + "epoch": 0.6373850868232891, + "grad_norm": 1.0058398601548837, + "learning_rate": 1.9329615805562346e-05, + "loss": 0.5855, + "step": 21840 + }, + { + "epoch": 0.6375310083175252, + "grad_norm": 1.2577499370813194, + "learning_rate": 1.931950020747703e-05, + "loss": 0.5288, + "step": 21845 + }, + { + "epoch": 0.6376769298117613, + "grad_norm": 0.973305858285909, + "learning_rate": 1.9309386514373897e-05, + "loss": 0.4946, + "step": 21850 + }, + { + "epoch": 0.6378228513059974, + "grad_norm": 1.1003721817427605, + "learning_rate": 1.929927472860807e-05, + "loss": 0.5847, + "step": 21855 + }, + { + "epoch": 0.6379687728002335, + "grad_norm": 0.9773540960942197, + "learning_rate": 1.9289164852534287e-05, + "loss": 0.5433, + "step": 21860 + }, + { + "epoch": 0.6381146942944695, + "grad_norm": 1.0137804527527943, + "learning_rate": 1.9279056888506824e-05, + "loss": 0.5763, + "step": 21865 + }, + { + "epoch": 0.6382606157887056, + "grad_norm": 1.0864801326439824, + "learning_rate": 1.9268950838879508e-05, + "loss": 0.5342, + "step": 21870 + }, + { + "epoch": 0.6384065372829418, + "grad_norm": 1.103286580790859, + "learning_rate": 1.9258846706005724e-05, + "loss": 0.5906, + "step": 21875 + }, + { + "epoch": 0.6385524587771779, + "grad_norm": 1.0176982880976133, + "learning_rate": 1.9248744492238392e-05, + "loss": 0.5932, + "step": 21880 + }, + { + "epoch": 0.638698380271414, + "grad_norm": 1.1005023290539862, + "learning_rate": 1.923864419993001e-05, + "loss": 0.5779, + "step": 21885 + }, + { + "epoch": 0.6388443017656501, + "grad_norm": 1.088670336471166, + "learning_rate": 1.922854583143263e-05, + "loss": 0.6035, + "step": 21890 + }, + { + "epoch": 0.6389902232598862, + "grad_norm": 1.1254615369540781, + "learning_rate": 1.921844938909784e-05, + "loss": 0.5117, + "step": 21895 + }, + { + "epoch": 0.6391361447541223, + "grad_norm": 1.1787096577025074, + "learning_rate": 1.9208354875276774e-05, + "loss": 0.651, + "step": 21900 + }, + { + "epoch": 0.6392820662483584, + "grad_norm": 1.138862720019086, + "learning_rate": 1.9198262292320145e-05, + "loss": 0.5354, + "step": 21905 + }, + { + "epoch": 0.6394279877425945, + "grad_norm": 1.027769582525892, + "learning_rate": 1.9188171642578185e-05, + "loss": 0.5259, + "step": 21910 + }, + { + "epoch": 0.6395739092368306, + "grad_norm": 1.1632852126316648, + "learning_rate": 1.9178082928400697e-05, + "loss": 0.534, + "step": 21915 + }, + { + "epoch": 0.6397198307310666, + "grad_norm": 0.9554524439085132, + "learning_rate": 1.916799615213703e-05, + "loss": 0.5652, + "step": 21920 + }, + { + "epoch": 0.6398657522253028, + "grad_norm": 1.2185073638271997, + "learning_rate": 1.915791131613607e-05, + "loss": 0.5486, + "step": 21925 + }, + { + "epoch": 0.6400116737195389, + "grad_norm": 1.1126732615548693, + "learning_rate": 1.9147828422746266e-05, + "loss": 0.5596, + "step": 21930 + }, + { + "epoch": 0.640157595213775, + "grad_norm": 1.1670515873939935, + "learning_rate": 1.9137747474315604e-05, + "loss": 0.5681, + "step": 21935 + }, + { + "epoch": 0.6403035167080111, + "grad_norm": 1.0457325044935364, + "learning_rate": 1.9127668473191624e-05, + "loss": 0.541, + "step": 21940 + }, + { + "epoch": 0.6404494382022472, + "grad_norm": 0.9561656215353793, + "learning_rate": 1.9117591421721408e-05, + "loss": 0.5639, + "step": 21945 + }, + { + "epoch": 0.6405953596964833, + "grad_norm": 1.0593182257517024, + "learning_rate": 1.910751632225159e-05, + "loss": 0.5706, + "step": 21950 + }, + { + "epoch": 0.6407412811907194, + "grad_norm": 1.161237438501792, + "learning_rate": 1.9097443177128337e-05, + "loss": 0.5773, + "step": 21955 + }, + { + "epoch": 0.6408872026849555, + "grad_norm": 1.1797792195206869, + "learning_rate": 1.908737198869737e-05, + "loss": 0.4968, + "step": 21960 + }, + { + "epoch": 0.6410331241791916, + "grad_norm": 1.1104436113051526, + "learning_rate": 1.9077302759303966e-05, + "loss": 0.5648, + "step": 21965 + }, + { + "epoch": 0.6411790456734276, + "grad_norm": 1.1238236393226455, + "learning_rate": 1.9067235491292918e-05, + "loss": 0.5436, + "step": 21970 + }, + { + "epoch": 0.6413249671676639, + "grad_norm": 0.8945389670274669, + "learning_rate": 1.9057170187008584e-05, + "loss": 0.5398, + "step": 21975 + }, + { + "epoch": 0.6414708886618999, + "grad_norm": 1.0592094716344638, + "learning_rate": 1.9047106848794856e-05, + "loss": 0.5402, + "step": 21980 + }, + { + "epoch": 0.641616810156136, + "grad_norm": 0.8896220415718814, + "learning_rate": 1.903704547899517e-05, + "loss": 0.5036, + "step": 21985 + }, + { + "epoch": 0.6417627316503721, + "grad_norm": 1.0836446221395244, + "learning_rate": 1.9026986079952515e-05, + "loss": 0.5795, + "step": 21990 + }, + { + "epoch": 0.6419086531446082, + "grad_norm": 1.3836117609355192, + "learning_rate": 1.9016928654009386e-05, + "loss": 0.5628, + "step": 21995 + }, + { + "epoch": 0.6420545746388443, + "grad_norm": 0.9945319721452842, + "learning_rate": 1.9006873203507864e-05, + "loss": 0.5601, + "step": 22000 + }, + { + "epoch": 0.6422004961330804, + "grad_norm": 0.9716881496250388, + "learning_rate": 1.8996819730789533e-05, + "loss": 0.4592, + "step": 22005 + }, + { + "epoch": 0.6423464176273165, + "grad_norm": 0.9026869985544795, + "learning_rate": 1.8986768238195553e-05, + "loss": 0.5901, + "step": 22010 + }, + { + "epoch": 0.6424923391215526, + "grad_norm": 0.9544651782469936, + "learning_rate": 1.8976718728066584e-05, + "loss": 0.5758, + "step": 22015 + }, + { + "epoch": 0.6426382606157887, + "grad_norm": 1.0866303092292, + "learning_rate": 1.8966671202742848e-05, + "loss": 0.5692, + "step": 22020 + }, + { + "epoch": 0.6427841821100249, + "grad_norm": 1.0479128744449573, + "learning_rate": 1.8956625664564092e-05, + "loss": 0.4996, + "step": 22025 + }, + { + "epoch": 0.642930103604261, + "grad_norm": 1.1841361900011267, + "learning_rate": 1.8946582115869614e-05, + "loss": 0.5926, + "step": 22030 + }, + { + "epoch": 0.643076025098497, + "grad_norm": 0.9502305747451631, + "learning_rate": 1.893654055899824e-05, + "loss": 0.551, + "step": 22035 + }, + { + "epoch": 0.6432219465927331, + "grad_norm": 0.9737048360817947, + "learning_rate": 1.8926500996288348e-05, + "loss": 0.54, + "step": 22040 + }, + { + "epoch": 0.6433678680869692, + "grad_norm": 1.01431126887875, + "learning_rate": 1.8916463430077807e-05, + "loss": 0.5613, + "step": 22045 + }, + { + "epoch": 0.6435137895812053, + "grad_norm": 1.16528919494979, + "learning_rate": 1.8906427862704086e-05, + "loss": 0.5574, + "step": 22050 + }, + { + "epoch": 0.6436597110754414, + "grad_norm": 1.089080879137437, + "learning_rate": 1.889639429650412e-05, + "loss": 0.6184, + "step": 22055 + }, + { + "epoch": 0.6438056325696775, + "grad_norm": 0.9235669711542518, + "learning_rate": 1.888636273381445e-05, + "loss": 0.5565, + "step": 22060 + }, + { + "epoch": 0.6439515540639136, + "grad_norm": 0.8797908484836513, + "learning_rate": 1.887633317697108e-05, + "loss": 0.541, + "step": 22065 + }, + { + "epoch": 0.6440974755581497, + "grad_norm": 1.0525346594055296, + "learning_rate": 1.886630562830959e-05, + "loss": 0.5656, + "step": 22070 + }, + { + "epoch": 0.6442433970523859, + "grad_norm": 1.0602087884688443, + "learning_rate": 1.8856280090165085e-05, + "loss": 0.5885, + "step": 22075 + }, + { + "epoch": 0.644389318546622, + "grad_norm": 0.9797394718919068, + "learning_rate": 1.8846256564872204e-05, + "loss": 0.5385, + "step": 22080 + }, + { + "epoch": 0.644535240040858, + "grad_norm": 1.1045910284359404, + "learning_rate": 1.8836235054765112e-05, + "loss": 0.5981, + "step": 22085 + }, + { + "epoch": 0.6446811615350941, + "grad_norm": 1.100104331713952, + "learning_rate": 1.8826215562177492e-05, + "loss": 0.5967, + "step": 22090 + }, + { + "epoch": 0.6448270830293302, + "grad_norm": 0.9079469064204102, + "learning_rate": 1.8816198089442572e-05, + "loss": 0.5517, + "step": 22095 + }, + { + "epoch": 0.6449730045235663, + "grad_norm": 1.235865516274603, + "learning_rate": 1.880618263889311e-05, + "loss": 0.5699, + "step": 22100 + }, + { + "epoch": 0.6451189260178024, + "grad_norm": 1.0011448289760487, + "learning_rate": 1.87961692128614e-05, + "loss": 0.5333, + "step": 22105 + }, + { + "epoch": 0.6452648475120385, + "grad_norm": 1.0401675071128709, + "learning_rate": 1.8786157813679255e-05, + "loss": 0.5592, + "step": 22110 + }, + { + "epoch": 0.6454107690062746, + "grad_norm": 1.0748297092456756, + "learning_rate": 1.8776148443678e-05, + "loss": 0.5356, + "step": 22115 + }, + { + "epoch": 0.6455566905005107, + "grad_norm": 1.1052177985188683, + "learning_rate": 1.8766141105188517e-05, + "loss": 0.517, + "step": 22120 + }, + { + "epoch": 0.6457026119947469, + "grad_norm": 0.984848924715191, + "learning_rate": 1.8756135800541197e-05, + "loss": 0.5518, + "step": 22125 + }, + { + "epoch": 0.645848533488983, + "grad_norm": 1.0549945290660518, + "learning_rate": 1.8746132532065963e-05, + "loss": 0.5555, + "step": 22130 + }, + { + "epoch": 0.6459944549832191, + "grad_norm": 1.181573969277069, + "learning_rate": 1.8736131302092274e-05, + "loss": 0.5721, + "step": 22135 + }, + { + "epoch": 0.6461403764774551, + "grad_norm": 1.332459352082685, + "learning_rate": 1.872613211294908e-05, + "loss": 0.6013, + "step": 22140 + }, + { + "epoch": 0.6462862979716912, + "grad_norm": 1.0573805920249875, + "learning_rate": 1.8716134966964893e-05, + "loss": 0.5749, + "step": 22145 + }, + { + "epoch": 0.6464322194659273, + "grad_norm": 1.333648925851769, + "learning_rate": 1.8706139866467732e-05, + "loss": 0.5813, + "step": 22150 + }, + { + "epoch": 0.6465781409601634, + "grad_norm": 1.0744983311308165, + "learning_rate": 1.8696146813785154e-05, + "loss": 0.5566, + "step": 22155 + }, + { + "epoch": 0.6467240624543995, + "grad_norm": 1.0368161519532362, + "learning_rate": 1.868615581124421e-05, + "loss": 0.5731, + "step": 22160 + }, + { + "epoch": 0.6468699839486356, + "grad_norm": 1.0496330182202802, + "learning_rate": 1.8676166861171502e-05, + "loss": 0.5427, + "step": 22165 + }, + { + "epoch": 0.6470159054428717, + "grad_norm": 1.0630568673861143, + "learning_rate": 1.8666179965893137e-05, + "loss": 0.5808, + "step": 22170 + }, + { + "epoch": 0.6471618269371079, + "grad_norm": 0.9499046406576437, + "learning_rate": 1.865619512773476e-05, + "loss": 0.5244, + "step": 22175 + }, + { + "epoch": 0.647307748431344, + "grad_norm": 0.9201785651704663, + "learning_rate": 1.864621234902153e-05, + "loss": 0.5442, + "step": 22180 + }, + { + "epoch": 0.6474536699255801, + "grad_norm": 1.0565774919364457, + "learning_rate": 1.86362316320781e-05, + "loss": 0.4962, + "step": 22185 + }, + { + "epoch": 0.6475995914198162, + "grad_norm": 0.9500343992277334, + "learning_rate": 1.8626252979228685e-05, + "loss": 0.4972, + "step": 22190 + }, + { + "epoch": 0.6477455129140522, + "grad_norm": 1.0043234280239284, + "learning_rate": 1.8616276392797e-05, + "loss": 0.5513, + "step": 22195 + }, + { + "epoch": 0.6478914344082883, + "grad_norm": 0.9390246362665631, + "learning_rate": 1.8606301875106273e-05, + "loss": 0.5352, + "step": 22200 + }, + { + "epoch": 0.6480373559025244, + "grad_norm": 1.1867331778099164, + "learning_rate": 1.859632942847927e-05, + "loss": 0.5514, + "step": 22205 + }, + { + "epoch": 0.6481832773967605, + "grad_norm": 0.9779153018628504, + "learning_rate": 1.8586359055238244e-05, + "loss": 0.516, + "step": 22210 + }, + { + "epoch": 0.6483291988909966, + "grad_norm": 1.0016874791669665, + "learning_rate": 1.8576390757704988e-05, + "loss": 0.5597, + "step": 22215 + }, + { + "epoch": 0.6484751203852327, + "grad_norm": 1.0304749995347424, + "learning_rate": 1.856642453820081e-05, + "loss": 0.5492, + "step": 22220 + }, + { + "epoch": 0.6486210418794689, + "grad_norm": 0.9601316515364549, + "learning_rate": 1.8556460399046534e-05, + "loss": 0.583, + "step": 22225 + }, + { + "epoch": 0.648766963373705, + "grad_norm": 1.0460836305042074, + "learning_rate": 1.8546498342562486e-05, + "loss": 0.5289, + "step": 22230 + }, + { + "epoch": 0.6489128848679411, + "grad_norm": 1.0163466737266107, + "learning_rate": 1.8536538371068524e-05, + "loss": 0.5771, + "step": 22235 + }, + { + "epoch": 0.6490588063621772, + "grad_norm": 1.1007378726951997, + "learning_rate": 1.852658048688401e-05, + "loss": 0.5526, + "step": 22240 + }, + { + "epoch": 0.6492047278564133, + "grad_norm": 1.0903823821791667, + "learning_rate": 1.8516624692327828e-05, + "loss": 0.5858, + "step": 22245 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.974744903629621, + "learning_rate": 1.8506670989718366e-05, + "loss": 0.5162, + "step": 22250 + }, + { + "epoch": 0.6494965708448854, + "grad_norm": 0.9603235287937402, + "learning_rate": 1.849671938137353e-05, + "loss": 0.5416, + "step": 22255 + }, + { + "epoch": 0.6496424923391215, + "grad_norm": 1.1622975381596095, + "learning_rate": 1.848676986961073e-05, + "loss": 0.5405, + "step": 22260 + }, + { + "epoch": 0.6497884138333576, + "grad_norm": 1.0097143305689305, + "learning_rate": 1.8476822456746917e-05, + "loss": 0.5491, + "step": 22265 + }, + { + "epoch": 0.6499343353275937, + "grad_norm": 1.0565047478290637, + "learning_rate": 1.846687714509851e-05, + "loss": 0.5983, + "step": 22270 + }, + { + "epoch": 0.6500802568218299, + "grad_norm": 0.9464132043355717, + "learning_rate": 1.845693393698148e-05, + "loss": 0.6055, + "step": 22275 + }, + { + "epoch": 0.650226178316066, + "grad_norm": 1.1125014594981864, + "learning_rate": 1.8446992834711274e-05, + "loss": 0.5823, + "step": 22280 + }, + { + "epoch": 0.6503720998103021, + "grad_norm": 0.9498019695510368, + "learning_rate": 1.8437053840602864e-05, + "loss": 0.4941, + "step": 22285 + }, + { + "epoch": 0.6505180213045382, + "grad_norm": 1.0469344983050612, + "learning_rate": 1.8427116956970735e-05, + "loss": 0.5312, + "step": 22290 + }, + { + "epoch": 0.6506639427987743, + "grad_norm": 1.4131594940772032, + "learning_rate": 1.8417182186128878e-05, + "loss": 0.6171, + "step": 22295 + }, + { + "epoch": 0.6508098642930104, + "grad_norm": 1.060472795424812, + "learning_rate": 1.8407249530390784e-05, + "loss": 0.5409, + "step": 22300 + }, + { + "epoch": 0.6509557857872464, + "grad_norm": 0.9018584279056739, + "learning_rate": 1.8397318992069467e-05, + "loss": 0.5876, + "step": 22305 + }, + { + "epoch": 0.6511017072814825, + "grad_norm": 1.019629845212201, + "learning_rate": 1.8387390573477425e-05, + "loss": 0.5645, + "step": 22310 + }, + { + "epoch": 0.6512476287757186, + "grad_norm": 1.0307095221493323, + "learning_rate": 1.837746427692668e-05, + "loss": 0.5888, + "step": 22315 + }, + { + "epoch": 0.6513935502699547, + "grad_norm": 0.992104092831787, + "learning_rate": 1.836754010472876e-05, + "loss": 0.493, + "step": 22320 + }, + { + "epoch": 0.6515394717641909, + "grad_norm": 1.175775803478721, + "learning_rate": 1.83576180591947e-05, + "loss": 0.5539, + "step": 22325 + }, + { + "epoch": 0.651685393258427, + "grad_norm": 1.0238728909826367, + "learning_rate": 1.8347698142635017e-05, + "loss": 0.5662, + "step": 22330 + }, + { + "epoch": 0.6518313147526631, + "grad_norm": 1.0789561027299324, + "learning_rate": 1.8337780357359763e-05, + "loss": 0.5505, + "step": 22335 + }, + { + "epoch": 0.6519772362468992, + "grad_norm": 1.0342055484346018, + "learning_rate": 1.8327864705678465e-05, + "loss": 0.5413, + "step": 22340 + }, + { + "epoch": 0.6521231577411353, + "grad_norm": 1.0363116458595134, + "learning_rate": 1.8317951189900195e-05, + "loss": 0.5433, + "step": 22345 + }, + { + "epoch": 0.6522690792353714, + "grad_norm": 0.9839074521628217, + "learning_rate": 1.8308039812333465e-05, + "loss": 0.5476, + "step": 22350 + }, + { + "epoch": 0.6524150007296075, + "grad_norm": 1.0992590866491638, + "learning_rate": 1.8298130575286342e-05, + "loss": 0.5128, + "step": 22355 + }, + { + "epoch": 0.6525609222238435, + "grad_norm": 1.0029813189319117, + "learning_rate": 1.8288223481066374e-05, + "loss": 0.533, + "step": 22360 + }, + { + "epoch": 0.6527068437180796, + "grad_norm": 1.1285802077821032, + "learning_rate": 1.827831853198062e-05, + "loss": 0.5521, + "step": 22365 + }, + { + "epoch": 0.6528527652123157, + "grad_norm": 1.1021568507153408, + "learning_rate": 1.8268415730335636e-05, + "loss": 0.5666, + "step": 22370 + }, + { + "epoch": 0.6529986867065519, + "grad_norm": 1.04945154257413, + "learning_rate": 1.8258515078437456e-05, + "loss": 0.5898, + "step": 22375 + }, + { + "epoch": 0.653144608200788, + "grad_norm": 1.0780308853941516, + "learning_rate": 1.8248616578591642e-05, + "loss": 0.5103, + "step": 22380 + }, + { + "epoch": 0.6532905296950241, + "grad_norm": 1.051293775865153, + "learning_rate": 1.8238720233103247e-05, + "loss": 0.5512, + "step": 22385 + }, + { + "epoch": 0.6534364511892602, + "grad_norm": 1.0461492369882694, + "learning_rate": 1.822882604427681e-05, + "loss": 0.5729, + "step": 22390 + }, + { + "epoch": 0.6535823726834963, + "grad_norm": 1.1234318945254003, + "learning_rate": 1.8218934014416395e-05, + "loss": 0.565, + "step": 22395 + }, + { + "epoch": 0.6537282941777324, + "grad_norm": 0.9773601659570147, + "learning_rate": 1.8209044145825533e-05, + "loss": 0.5526, + "step": 22400 + }, + { + "epoch": 0.6538742156719685, + "grad_norm": 1.1022468630495483, + "learning_rate": 1.819915644080726e-05, + "loss": 0.6008, + "step": 22405 + }, + { + "epoch": 0.6540201371662046, + "grad_norm": 1.198216144642613, + "learning_rate": 1.8189270901664123e-05, + "loss": 0.5701, + "step": 22410 + }, + { + "epoch": 0.6541660586604406, + "grad_norm": 1.1542191270615358, + "learning_rate": 1.8179387530698154e-05, + "loss": 0.52, + "step": 22415 + }, + { + "epoch": 0.6543119801546767, + "grad_norm": 0.946965423472249, + "learning_rate": 1.8169506330210884e-05, + "loss": 0.561, + "step": 22420 + }, + { + "epoch": 0.6544579016489129, + "grad_norm": 1.1388805323564575, + "learning_rate": 1.815962730250331e-05, + "loss": 0.5207, + "step": 22425 + }, + { + "epoch": 0.654603823143149, + "grad_norm": 1.0627650927352519, + "learning_rate": 1.814975044987598e-05, + "loss": 0.5871, + "step": 22430 + }, + { + "epoch": 0.6547497446373851, + "grad_norm": 1.2193239971956373, + "learning_rate": 1.813987577462889e-05, + "loss": 0.6161, + "step": 22435 + }, + { + "epoch": 0.6548956661316212, + "grad_norm": 1.0275910890135207, + "learning_rate": 1.8130003279061546e-05, + "loss": 0.5446, + "step": 22440 + }, + { + "epoch": 0.6550415876258573, + "grad_norm": 0.9816511661689146, + "learning_rate": 1.812013296547294e-05, + "loss": 0.5511, + "step": 22445 + }, + { + "epoch": 0.6551875091200934, + "grad_norm": 1.1119925782065414, + "learning_rate": 1.811026483616155e-05, + "loss": 0.5222, + "step": 22450 + }, + { + "epoch": 0.6553334306143295, + "grad_norm": 0.971374082081296, + "learning_rate": 1.810039889342537e-05, + "loss": 0.5262, + "step": 22455 + }, + { + "epoch": 0.6554793521085656, + "grad_norm": 1.002649847142402, + "learning_rate": 1.809053513956186e-05, + "loss": 0.5393, + "step": 22460 + }, + { + "epoch": 0.6556252736028017, + "grad_norm": 0.9174788924590409, + "learning_rate": 1.8080673576867985e-05, + "loss": 0.5325, + "step": 22465 + }, + { + "epoch": 0.6557711950970377, + "grad_norm": 1.114159890731216, + "learning_rate": 1.807081420764019e-05, + "loss": 0.5961, + "step": 22470 + }, + { + "epoch": 0.6559171165912739, + "grad_norm": 1.1245713955494845, + "learning_rate": 1.8060957034174403e-05, + "loss": 0.5735, + "step": 22475 + }, + { + "epoch": 0.65606303808551, + "grad_norm": 1.216104686437999, + "learning_rate": 1.805110205876607e-05, + "loss": 0.5177, + "step": 22480 + }, + { + "epoch": 0.6562089595797461, + "grad_norm": 1.0833817666980703, + "learning_rate": 1.8041249283710094e-05, + "loss": 0.543, + "step": 22485 + }, + { + "epoch": 0.6563548810739822, + "grad_norm": 1.106600433974133, + "learning_rate": 1.80313987113009e-05, + "loss": 0.5953, + "step": 22490 + }, + { + "epoch": 0.6565008025682183, + "grad_norm": 1.0135322080600178, + "learning_rate": 1.8021550343832337e-05, + "loss": 0.4854, + "step": 22495 + }, + { + "epoch": 0.6566467240624544, + "grad_norm": 1.1687294715051906, + "learning_rate": 1.8011704183597817e-05, + "loss": 0.5513, + "step": 22500 + }, + { + "epoch": 0.6567926455566905, + "grad_norm": 1.0034398783796985, + "learning_rate": 1.8001860232890183e-05, + "loss": 0.4984, + "step": 22505 + }, + { + "epoch": 0.6569385670509266, + "grad_norm": 1.02623442681083, + "learning_rate": 1.7992018494001794e-05, + "loss": 0.5394, + "step": 22510 + }, + { + "epoch": 0.6570844885451627, + "grad_norm": 1.120452635378034, + "learning_rate": 1.7982178969224472e-05, + "loss": 0.5995, + "step": 22515 + }, + { + "epoch": 0.6572304100393987, + "grad_norm": 1.2458899168498645, + "learning_rate": 1.7972341660849535e-05, + "loss": 0.582, + "step": 22520 + }, + { + "epoch": 0.657376331533635, + "grad_norm": 1.187255381181032, + "learning_rate": 1.7962506571167797e-05, + "loss": 0.6208, + "step": 22525 + }, + { + "epoch": 0.657522253027871, + "grad_norm": 0.9448733478550574, + "learning_rate": 1.795267370246953e-05, + "loss": 0.57, + "step": 22530 + }, + { + "epoch": 0.6576681745221071, + "grad_norm": 1.1492476448366193, + "learning_rate": 1.7942843057044513e-05, + "loss": 0.5188, + "step": 22535 + }, + { + "epoch": 0.6578140960163432, + "grad_norm": 1.0409284854644185, + "learning_rate": 1.7933014637181982e-05, + "loss": 0.5496, + "step": 22540 + }, + { + "epoch": 0.6579600175105793, + "grad_norm": 0.9340754713217472, + "learning_rate": 1.7923188445170663e-05, + "loss": 0.513, + "step": 22545 + }, + { + "epoch": 0.6581059390048154, + "grad_norm": 1.0541531749339594, + "learning_rate": 1.7913364483298794e-05, + "loss": 0.5322, + "step": 22550 + }, + { + "epoch": 0.6582518604990515, + "grad_norm": 1.2384180599349084, + "learning_rate": 1.790354275385404e-05, + "loss": 0.5843, + "step": 22555 + }, + { + "epoch": 0.6583977819932876, + "grad_norm": 1.1377817639613113, + "learning_rate": 1.789372325912361e-05, + "loss": 0.5538, + "step": 22560 + }, + { + "epoch": 0.6585437034875237, + "grad_norm": 1.1194731863886143, + "learning_rate": 1.7883906001394108e-05, + "loss": 0.532, + "step": 22565 + }, + { + "epoch": 0.6586896249817598, + "grad_norm": 0.9954368267707016, + "learning_rate": 1.78740909829517e-05, + "loss": 0.5475, + "step": 22570 + }, + { + "epoch": 0.658835546475996, + "grad_norm": 0.8811513865950462, + "learning_rate": 1.7864278206081992e-05, + "loss": 0.5331, + "step": 22575 + }, + { + "epoch": 0.658981467970232, + "grad_norm": 0.9968644311605007, + "learning_rate": 1.785446767307006e-05, + "loss": 0.499, + "step": 22580 + }, + { + "epoch": 0.6591273894644681, + "grad_norm": 0.9675670382732324, + "learning_rate": 1.7844659386200495e-05, + "loss": 0.4832, + "step": 22585 + }, + { + "epoch": 0.6592733109587042, + "grad_norm": 0.9996172606231879, + "learning_rate": 1.7834853347757304e-05, + "loss": 0.5418, + "step": 22590 + }, + { + "epoch": 0.6594192324529403, + "grad_norm": 1.0081872140256811, + "learning_rate": 1.7825049560024033e-05, + "loss": 0.5555, + "step": 22595 + }, + { + "epoch": 0.6595651539471764, + "grad_norm": 0.9386331549370254, + "learning_rate": 1.7815248025283666e-05, + "loss": 0.5756, + "step": 22600 + }, + { + "epoch": 0.6597110754414125, + "grad_norm": 1.0637658638546388, + "learning_rate": 1.780544874581869e-05, + "loss": 0.5928, + "step": 22605 + }, + { + "epoch": 0.6598569969356486, + "grad_norm": 0.9725144027446253, + "learning_rate": 1.7795651723911015e-05, + "loss": 0.5687, + "step": 22610 + }, + { + "epoch": 0.6600029184298847, + "grad_norm": 1.0342286664043392, + "learning_rate": 1.7785856961842102e-05, + "loss": 0.5728, + "step": 22615 + }, + { + "epoch": 0.6601488399241209, + "grad_norm": 1.200101105127198, + "learning_rate": 1.7776064461892815e-05, + "loss": 0.591, + "step": 22620 + }, + { + "epoch": 0.660294761418357, + "grad_norm": 1.0592290529834663, + "learning_rate": 1.7766274226343526e-05, + "loss": 0.574, + "step": 22625 + }, + { + "epoch": 0.6604406829125931, + "grad_norm": 1.077231180645231, + "learning_rate": 1.7756486257474093e-05, + "loss": 0.5384, + "step": 22630 + }, + { + "epoch": 0.6605866044068291, + "grad_norm": 1.0051843217699519, + "learning_rate": 1.7746700557563795e-05, + "loss": 0.5756, + "step": 22635 + }, + { + "epoch": 0.6607325259010652, + "grad_norm": 0.9219904517506695, + "learning_rate": 1.7736917128891444e-05, + "loss": 0.5769, + "step": 22640 + }, + { + "epoch": 0.6608784473953013, + "grad_norm": 1.0555397452322748, + "learning_rate": 1.772713597373528e-05, + "loss": 0.6118, + "step": 22645 + }, + { + "epoch": 0.6610243688895374, + "grad_norm": 1.1494499274952148, + "learning_rate": 1.7717357094373026e-05, + "loss": 0.6019, + "step": 22650 + }, + { + "epoch": 0.6611702903837735, + "grad_norm": 1.2381588336204996, + "learning_rate": 1.77075804930819e-05, + "loss": 0.6031, + "step": 22655 + }, + { + "epoch": 0.6613162118780096, + "grad_norm": 0.9703289644232134, + "learning_rate": 1.7697806172138526e-05, + "loss": 0.6191, + "step": 22660 + }, + { + "epoch": 0.6614621333722457, + "grad_norm": 0.9002486966267685, + "learning_rate": 1.7688034133819066e-05, + "loss": 0.5301, + "step": 22665 + }, + { + "epoch": 0.6616080548664819, + "grad_norm": 1.3569416046244565, + "learning_rate": 1.7678264380399106e-05, + "loss": 0.6167, + "step": 22670 + }, + { + "epoch": 0.661753976360718, + "grad_norm": 1.114878161847666, + "learning_rate": 1.7668496914153736e-05, + "loss": 0.517, + "step": 22675 + }, + { + "epoch": 0.6618998978549541, + "grad_norm": 1.1529869418569423, + "learning_rate": 1.765873173735748e-05, + "loss": 0.5194, + "step": 22680 + }, + { + "epoch": 0.6620458193491902, + "grad_norm": 1.0543841509746303, + "learning_rate": 1.764896885228434e-05, + "loss": 0.5484, + "step": 22685 + }, + { + "epoch": 0.6621917408434262, + "grad_norm": 0.9161104234675111, + "learning_rate": 1.763920826120779e-05, + "loss": 0.5605, + "step": 22690 + }, + { + "epoch": 0.6623376623376623, + "grad_norm": 0.9798387389815199, + "learning_rate": 1.762944996640076e-05, + "loss": 0.5128, + "step": 22695 + }, + { + "epoch": 0.6624835838318984, + "grad_norm": 1.2902785652504538, + "learning_rate": 1.7619693970135665e-05, + "loss": 0.5258, + "step": 22700 + }, + { + "epoch": 0.6626295053261345, + "grad_norm": 1.0780989655203863, + "learning_rate": 1.7609940274684343e-05, + "loss": 0.5618, + "step": 22705 + }, + { + "epoch": 0.6627754268203706, + "grad_norm": 1.0592094884280832, + "learning_rate": 1.760018888231816e-05, + "loss": 0.5692, + "step": 22710 + }, + { + "epoch": 0.6629213483146067, + "grad_norm": 1.200159793838391, + "learning_rate": 1.7590439795307888e-05, + "loss": 0.5183, + "step": 22715 + }, + { + "epoch": 0.6630672698088429, + "grad_norm": 0.9049647984769915, + "learning_rate": 1.7580693015923782e-05, + "loss": 0.5048, + "step": 22720 + }, + { + "epoch": 0.663213191303079, + "grad_norm": 0.9736219233245142, + "learning_rate": 1.7570948546435584e-05, + "loss": 0.5807, + "step": 22725 + }, + { + "epoch": 0.6633591127973151, + "grad_norm": 1.0726031575698647, + "learning_rate": 1.7561206389112445e-05, + "loss": 0.6033, + "step": 22730 + }, + { + "epoch": 0.6635050342915512, + "grad_norm": 1.1279534663342017, + "learning_rate": 1.7551466546223033e-05, + "loss": 0.6007, + "step": 22735 + }, + { + "epoch": 0.6636509557857873, + "grad_norm": 0.9675569167676644, + "learning_rate": 1.754172902003543e-05, + "loss": 0.5263, + "step": 22740 + }, + { + "epoch": 0.6637968772800233, + "grad_norm": 1.0963318658072772, + "learning_rate": 1.7531993812817222e-05, + "loss": 0.591, + "step": 22745 + }, + { + "epoch": 0.6639427987742594, + "grad_norm": 1.0096277645767417, + "learning_rate": 1.7522260926835425e-05, + "loss": 0.5514, + "step": 22750 + }, + { + "epoch": 0.6640887202684955, + "grad_norm": 1.1492415048861329, + "learning_rate": 1.751253036435653e-05, + "loss": 0.5434, + "step": 22755 + }, + { + "epoch": 0.6642346417627316, + "grad_norm": 0.9946079184844736, + "learning_rate": 1.7502802127646466e-05, + "loss": 0.516, + "step": 22760 + }, + { + "epoch": 0.6643805632569677, + "grad_norm": 1.0035661071226694, + "learning_rate": 1.7493076218970636e-05, + "loss": 0.5326, + "step": 22765 + }, + { + "epoch": 0.6645264847512039, + "grad_norm": 1.095034181536273, + "learning_rate": 1.7483352640593926e-05, + "loss": 0.5193, + "step": 22770 + }, + { + "epoch": 0.66467240624544, + "grad_norm": 1.1257855144454236, + "learning_rate": 1.747363139478061e-05, + "loss": 0.5374, + "step": 22775 + }, + { + "epoch": 0.6648183277396761, + "grad_norm": 0.9187265742371556, + "learning_rate": 1.74639124837945e-05, + "loss": 0.5713, + "step": 22780 + }, + { + "epoch": 0.6649642492339122, + "grad_norm": 1.1038815515098346, + "learning_rate": 1.7454195909898804e-05, + "loss": 0.5757, + "step": 22785 + }, + { + "epoch": 0.6651101707281483, + "grad_norm": 1.08632253691434, + "learning_rate": 1.7444481675356208e-05, + "loss": 0.5896, + "step": 22790 + }, + { + "epoch": 0.6652560922223844, + "grad_norm": 0.8971477303461654, + "learning_rate": 1.7434769782428877e-05, + "loss": 0.5027, + "step": 22795 + }, + { + "epoch": 0.6654020137166204, + "grad_norm": 0.938212889591609, + "learning_rate": 1.742506023337837e-05, + "loss": 0.5505, + "step": 22800 + }, + { + "epoch": 0.6655479352108565, + "grad_norm": 0.9654345804146206, + "learning_rate": 1.7415353030465763e-05, + "loss": 0.5603, + "step": 22805 + }, + { + "epoch": 0.6656938567050926, + "grad_norm": 1.0788458471096207, + "learning_rate": 1.7405648175951545e-05, + "loss": 0.495, + "step": 22810 + }, + { + "epoch": 0.6658397781993287, + "grad_norm": 1.0970413151210807, + "learning_rate": 1.739594567209569e-05, + "loss": 0.5458, + "step": 22815 + }, + { + "epoch": 0.6659856996935649, + "grad_norm": 1.1318940324050657, + "learning_rate": 1.7386245521157596e-05, + "loss": 0.5287, + "step": 22820 + }, + { + "epoch": 0.666131621187801, + "grad_norm": 1.0576462003167644, + "learning_rate": 1.7376547725396123e-05, + "loss": 0.6149, + "step": 22825 + }, + { + "epoch": 0.6662775426820371, + "grad_norm": 1.0507391910214832, + "learning_rate": 1.7366852287069586e-05, + "loss": 0.5744, + "step": 22830 + }, + { + "epoch": 0.6664234641762732, + "grad_norm": 1.05866825635751, + "learning_rate": 1.7357159208435738e-05, + "loss": 0.5639, + "step": 22835 + }, + { + "epoch": 0.6665693856705093, + "grad_norm": 1.0220161270049948, + "learning_rate": 1.7347468491751817e-05, + "loss": 0.5405, + "step": 22840 + }, + { + "epoch": 0.6667153071647454, + "grad_norm": 0.9813985032978343, + "learning_rate": 1.7337780139274474e-05, + "loss": 0.5619, + "step": 22845 + }, + { + "epoch": 0.6668612286589815, + "grad_norm": 0.9746940027034712, + "learning_rate": 1.7328094153259822e-05, + "loss": 0.5152, + "step": 22850 + }, + { + "epoch": 0.6670071501532175, + "grad_norm": 1.1253890253561056, + "learning_rate": 1.731841053596342e-05, + "loss": 0.6213, + "step": 22855 + }, + { + "epoch": 0.6671530716474536, + "grad_norm": 1.0821344765087502, + "learning_rate": 1.7308729289640292e-05, + "loss": 0.5453, + "step": 22860 + }, + { + "epoch": 0.6672989931416897, + "grad_norm": 0.8571015965834469, + "learning_rate": 1.7299050416544886e-05, + "loss": 0.4823, + "step": 22865 + }, + { + "epoch": 0.6674449146359259, + "grad_norm": 1.0023458534924126, + "learning_rate": 1.7289373918931113e-05, + "loss": 0.5291, + "step": 22870 + }, + { + "epoch": 0.667590836130162, + "grad_norm": 1.0509939215782997, + "learning_rate": 1.7279699799052326e-05, + "loss": 0.5207, + "step": 22875 + }, + { + "epoch": 0.6677367576243981, + "grad_norm": 1.0096239479163684, + "learning_rate": 1.7270028059161313e-05, + "loss": 0.4976, + "step": 22880 + }, + { + "epoch": 0.6678826791186342, + "grad_norm": 1.0696030330092865, + "learning_rate": 1.7260358701510342e-05, + "loss": 0.5825, + "step": 22885 + }, + { + "epoch": 0.6680286006128703, + "grad_norm": 1.0517355077461708, + "learning_rate": 1.7250691728351088e-05, + "loss": 0.5244, + "step": 22890 + }, + { + "epoch": 0.6681745221071064, + "grad_norm": 1.060763802022097, + "learning_rate": 1.724102714193469e-05, + "loss": 0.5679, + "step": 22895 + }, + { + "epoch": 0.6683204436013425, + "grad_norm": 0.8891093644378575, + "learning_rate": 1.723136494451173e-05, + "loss": 0.5148, + "step": 22900 + }, + { + "epoch": 0.6684663650955786, + "grad_norm": 1.3978784078283886, + "learning_rate": 1.722170513833222e-05, + "loss": 0.6194, + "step": 22905 + }, + { + "epoch": 0.6686122865898146, + "grad_norm": 1.289247955978387, + "learning_rate": 1.7212047725645637e-05, + "loss": 0.5432, + "step": 22910 + }, + { + "epoch": 0.6687582080840507, + "grad_norm": 0.9265779295872825, + "learning_rate": 1.7202392708700888e-05, + "loss": 0.5345, + "step": 22915 + }, + { + "epoch": 0.6689041295782869, + "grad_norm": 0.9564012791854823, + "learning_rate": 1.7192740089746324e-05, + "loss": 0.4985, + "step": 22920 + }, + { + "epoch": 0.669050051072523, + "grad_norm": 1.3245925113172805, + "learning_rate": 1.718308987102973e-05, + "loss": 0.6034, + "step": 22925 + }, + { + "epoch": 0.6691959725667591, + "grad_norm": 0.9169733989032314, + "learning_rate": 1.7173442054798355e-05, + "loss": 0.5555, + "step": 22930 + }, + { + "epoch": 0.6693418940609952, + "grad_norm": 1.1582435262667794, + "learning_rate": 1.7163796643298866e-05, + "loss": 0.5721, + "step": 22935 + }, + { + "epoch": 0.6694878155552313, + "grad_norm": 1.0015232662577707, + "learning_rate": 1.7154153638777372e-05, + "loss": 0.5823, + "step": 22940 + }, + { + "epoch": 0.6696337370494674, + "grad_norm": 0.9721584186757738, + "learning_rate": 1.714451304347943e-05, + "loss": 0.545, + "step": 22945 + }, + { + "epoch": 0.6697796585437035, + "grad_norm": 1.0547979815866453, + "learning_rate": 1.713487485965003e-05, + "loss": 0.5437, + "step": 22950 + }, + { + "epoch": 0.6699255800379396, + "grad_norm": 0.9668802912572163, + "learning_rate": 1.7125239089533615e-05, + "loss": 0.5282, + "step": 22955 + }, + { + "epoch": 0.6700715015321757, + "grad_norm": 1.0523811420671794, + "learning_rate": 1.711560573537404e-05, + "loss": 0.5334, + "step": 22960 + }, + { + "epoch": 0.6702174230264117, + "grad_norm": 1.111174111101546, + "learning_rate": 1.7105974799414623e-05, + "loss": 0.5437, + "step": 22965 + }, + { + "epoch": 0.6703633445206479, + "grad_norm": 1.1306558488983185, + "learning_rate": 1.70963462838981e-05, + "loss": 0.5816, + "step": 22970 + }, + { + "epoch": 0.670509266014884, + "grad_norm": 1.0348954330549631, + "learning_rate": 1.708672019106664e-05, + "loss": 0.5201, + "step": 22975 + }, + { + "epoch": 0.6706551875091201, + "grad_norm": 1.0338216617873321, + "learning_rate": 1.7077096523161882e-05, + "loss": 0.5471, + "step": 22980 + }, + { + "epoch": 0.6708011090033562, + "grad_norm": 1.1166430746274223, + "learning_rate": 1.706747528242486e-05, + "loss": 0.5633, + "step": 22985 + }, + { + "epoch": 0.6709470304975923, + "grad_norm": 1.0854731011854513, + "learning_rate": 1.7057856471096074e-05, + "loss": 0.5102, + "step": 22990 + }, + { + "epoch": 0.6710929519918284, + "grad_norm": 1.0561931045642492, + "learning_rate": 1.7048240091415418e-05, + "loss": 0.6019, + "step": 22995 + }, + { + "epoch": 0.6712388734860645, + "grad_norm": 0.8548325967758942, + "learning_rate": 1.7038626145622275e-05, + "loss": 0.565, + "step": 23000 + }, + { + "epoch": 0.6713847949803006, + "grad_norm": 1.1306562088692707, + "learning_rate": 1.7029014635955414e-05, + "loss": 0.5857, + "step": 23005 + }, + { + "epoch": 0.6715307164745367, + "grad_norm": 0.8871696913778064, + "learning_rate": 1.7019405564653063e-05, + "loss": 0.4927, + "step": 23010 + }, + { + "epoch": 0.6716766379687727, + "grad_norm": 1.1596165485318664, + "learning_rate": 1.7009798933952872e-05, + "loss": 0.6298, + "step": 23015 + }, + { + "epoch": 0.671822559463009, + "grad_norm": 1.203428662080954, + "learning_rate": 1.7000194746091913e-05, + "loss": 0.5477, + "step": 23020 + }, + { + "epoch": 0.671968480957245, + "grad_norm": 1.1431052281582041, + "learning_rate": 1.6990593003306722e-05, + "loss": 0.5874, + "step": 23025 + }, + { + "epoch": 0.6721144024514811, + "grad_norm": 1.084437885125764, + "learning_rate": 1.6980993707833216e-05, + "loss": 0.5735, + "step": 23030 + }, + { + "epoch": 0.6722603239457172, + "grad_norm": 1.0482996860710294, + "learning_rate": 1.697139686190681e-05, + "loss": 0.5112, + "step": 23035 + }, + { + "epoch": 0.6724062454399533, + "grad_norm": 1.1694277303245686, + "learning_rate": 1.696180246776228e-05, + "loss": 0.5464, + "step": 23040 + }, + { + "epoch": 0.6725521669341894, + "grad_norm": 1.1584063032522418, + "learning_rate": 1.695221052763385e-05, + "loss": 0.6097, + "step": 23045 + }, + { + "epoch": 0.6726980884284255, + "grad_norm": 1.1321955920722366, + "learning_rate": 1.6942621043755205e-05, + "loss": 0.5793, + "step": 23050 + }, + { + "epoch": 0.6728440099226616, + "grad_norm": 1.0710594381512657, + "learning_rate": 1.6933034018359434e-05, + "loss": 0.539, + "step": 23055 + }, + { + "epoch": 0.6729899314168977, + "grad_norm": 1.0096259612558842, + "learning_rate": 1.6923449453679048e-05, + "loss": 0.5984, + "step": 23060 + }, + { + "epoch": 0.6731358529111338, + "grad_norm": 1.0617707963259246, + "learning_rate": 1.6913867351945978e-05, + "loss": 0.5238, + "step": 23065 + }, + { + "epoch": 0.67328177440537, + "grad_norm": 0.9839102714961664, + "learning_rate": 1.6904287715391625e-05, + "loss": 0.5367, + "step": 23070 + }, + { + "epoch": 0.673427695899606, + "grad_norm": 1.1334308891524825, + "learning_rate": 1.6894710546246774e-05, + "loss": 0.5851, + "step": 23075 + }, + { + "epoch": 0.6735736173938421, + "grad_norm": 1.11701429464701, + "learning_rate": 1.688513584674164e-05, + "loss": 0.5647, + "step": 23080 + }, + { + "epoch": 0.6737195388880782, + "grad_norm": 1.160265069265792, + "learning_rate": 1.687556361910588e-05, + "loss": 0.5709, + "step": 23085 + }, + { + "epoch": 0.6738654603823143, + "grad_norm": 0.9902998126548146, + "learning_rate": 1.6865993865568547e-05, + "loss": 0.5578, + "step": 23090 + }, + { + "epoch": 0.6740113818765504, + "grad_norm": 1.0502794718331292, + "learning_rate": 1.6856426588358167e-05, + "loss": 0.5684, + "step": 23095 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 1.0149458800754292, + "learning_rate": 1.684686178970263e-05, + "loss": 0.5036, + "step": 23100 + }, + { + "epoch": 0.6743032248650226, + "grad_norm": 0.9477917100921661, + "learning_rate": 1.6837299471829315e-05, + "loss": 0.5981, + "step": 23105 + }, + { + "epoch": 0.6744491463592587, + "grad_norm": 0.9576828679520093, + "learning_rate": 1.682773963696494e-05, + "loss": 0.571, + "step": 23110 + }, + { + "epoch": 0.6745950678534948, + "grad_norm": 1.0285771222944262, + "learning_rate": 1.6818182287335726e-05, + "loss": 0.6035, + "step": 23115 + }, + { + "epoch": 0.674740989347731, + "grad_norm": 1.0471372169567676, + "learning_rate": 1.6808627425167274e-05, + "loss": 0.5311, + "step": 23120 + }, + { + "epoch": 0.6748869108419671, + "grad_norm": 0.8947847519013676, + "learning_rate": 1.6799075052684587e-05, + "loss": 0.4482, + "step": 23125 + }, + { + "epoch": 0.6750328323362031, + "grad_norm": 0.9906370768566115, + "learning_rate": 1.678952517211216e-05, + "loss": 0.5068, + "step": 23130 + }, + { + "epoch": 0.6751787538304392, + "grad_norm": 1.0269642948236566, + "learning_rate": 1.6779977785673812e-05, + "loss": 0.5417, + "step": 23135 + }, + { + "epoch": 0.6753246753246753, + "grad_norm": 0.9840424733816381, + "learning_rate": 1.677043289559286e-05, + "loss": 0.527, + "step": 23140 + }, + { + "epoch": 0.6754705968189114, + "grad_norm": 0.995730947756101, + "learning_rate": 1.6760890504092004e-05, + "loss": 0.5638, + "step": 23145 + }, + { + "epoch": 0.6756165183131475, + "grad_norm": 1.1739752859882615, + "learning_rate": 1.675135061339337e-05, + "loss": 0.6313, + "step": 23150 + }, + { + "epoch": 0.6757624398073836, + "grad_norm": 1.0070692333879487, + "learning_rate": 1.674181322571849e-05, + "loss": 0.5724, + "step": 23155 + }, + { + "epoch": 0.6759083613016197, + "grad_norm": 0.9805496124952455, + "learning_rate": 1.6732278343288324e-05, + "loss": 0.5595, + "step": 23160 + }, + { + "epoch": 0.6760542827958558, + "grad_norm": 1.1891503261107619, + "learning_rate": 1.6722745968323262e-05, + "loss": 0.5501, + "step": 23165 + }, + { + "epoch": 0.676200204290092, + "grad_norm": 0.9459787552230361, + "learning_rate": 1.6713216103043078e-05, + "loss": 0.5178, + "step": 23170 + }, + { + "epoch": 0.6763461257843281, + "grad_norm": 1.0194841987911363, + "learning_rate": 1.6703688749667002e-05, + "loss": 0.5153, + "step": 23175 + }, + { + "epoch": 0.6764920472785642, + "grad_norm": 0.9394424189934754, + "learning_rate": 1.6694163910413623e-05, + "loss": 0.5457, + "step": 23180 + }, + { + "epoch": 0.6766379687728002, + "grad_norm": 1.0305380954741048, + "learning_rate": 1.668464158750101e-05, + "loss": 0.5356, + "step": 23185 + }, + { + "epoch": 0.6767838902670363, + "grad_norm": 1.0628366997866339, + "learning_rate": 1.6675121783146598e-05, + "loss": 0.5755, + "step": 23190 + }, + { + "epoch": 0.6769298117612724, + "grad_norm": 1.0204511676726125, + "learning_rate": 1.6665604499567238e-05, + "loss": 0.5893, + "step": 23195 + }, + { + "epoch": 0.6770757332555085, + "grad_norm": 1.2554300500924958, + "learning_rate": 1.6656089738979246e-05, + "loss": 0.6102, + "step": 23200 + }, + { + "epoch": 0.6772216547497446, + "grad_norm": 1.0584128865168627, + "learning_rate": 1.6646577503598266e-05, + "loss": 0.4976, + "step": 23205 + }, + { + "epoch": 0.6773675762439807, + "grad_norm": 0.9802831555039291, + "learning_rate": 1.6637067795639433e-05, + "loss": 0.565, + "step": 23210 + }, + { + "epoch": 0.6775134977382168, + "grad_norm": 0.9297394984569726, + "learning_rate": 1.662756061731725e-05, + "loss": 0.5141, + "step": 23215 + }, + { + "epoch": 0.677659419232453, + "grad_norm": 1.2610847416564823, + "learning_rate": 1.6618055970845637e-05, + "loss": 0.5798, + "step": 23220 + }, + { + "epoch": 0.6778053407266891, + "grad_norm": 1.03021309074525, + "learning_rate": 1.6608553858437934e-05, + "loss": 0.548, + "step": 23225 + }, + { + "epoch": 0.6779512622209252, + "grad_norm": 1.1822521636541803, + "learning_rate": 1.6599054282306875e-05, + "loss": 0.4995, + "step": 23230 + }, + { + "epoch": 0.6780971837151613, + "grad_norm": 1.0142994370813658, + "learning_rate": 1.6589557244664627e-05, + "loss": 0.5529, + "step": 23235 + }, + { + "epoch": 0.6782431052093973, + "grad_norm": 1.0297452818205204, + "learning_rate": 1.658006274772274e-05, + "loss": 0.5646, + "step": 23240 + }, + { + "epoch": 0.6783890267036334, + "grad_norm": 1.1473667017619555, + "learning_rate": 1.6570570793692204e-05, + "loss": 0.6576, + "step": 23245 + }, + { + "epoch": 0.6785349481978695, + "grad_norm": 1.0208846805799963, + "learning_rate": 1.6561081384783372e-05, + "loss": 0.5145, + "step": 23250 + }, + { + "epoch": 0.6786808696921056, + "grad_norm": 1.110999011653466, + "learning_rate": 1.6551594523206054e-05, + "loss": 0.5795, + "step": 23255 + }, + { + "epoch": 0.6788267911863417, + "grad_norm": 0.9602465407403457, + "learning_rate": 1.6542110211169433e-05, + "loss": 0.5574, + "step": 23260 + }, + { + "epoch": 0.6789727126805778, + "grad_norm": 0.9208055144682332, + "learning_rate": 1.6532628450882094e-05, + "loss": 0.5545, + "step": 23265 + }, + { + "epoch": 0.679118634174814, + "grad_norm": 0.9242384567633852, + "learning_rate": 1.6523149244552078e-05, + "loss": 0.5141, + "step": 23270 + }, + { + "epoch": 0.6792645556690501, + "grad_norm": 1.0816904339632034, + "learning_rate": 1.6513672594386754e-05, + "loss": 0.5539, + "step": 23275 + }, + { + "epoch": 0.6794104771632862, + "grad_norm": 1.080541560198899, + "learning_rate": 1.6504198502592967e-05, + "loss": 0.5101, + "step": 23280 + }, + { + "epoch": 0.6795563986575223, + "grad_norm": 1.0742492238456445, + "learning_rate": 1.649472697137691e-05, + "loss": 0.5258, + "step": 23285 + }, + { + "epoch": 0.6797023201517584, + "grad_norm": 0.9347587717116602, + "learning_rate": 1.6485258002944237e-05, + "loss": 0.5638, + "step": 23290 + }, + { + "epoch": 0.6798482416459944, + "grad_norm": 1.021894703208546, + "learning_rate": 1.6475791599499963e-05, + "loss": 0.558, + "step": 23295 + }, + { + "epoch": 0.6799941631402305, + "grad_norm": 0.9373868979781866, + "learning_rate": 1.6466327763248496e-05, + "loss": 0.5886, + "step": 23300 + }, + { + "epoch": 0.6801400846344666, + "grad_norm": 1.1164845953712361, + "learning_rate": 1.6456866496393693e-05, + "loss": 0.5123, + "step": 23305 + }, + { + "epoch": 0.6802860061287027, + "grad_norm": 1.1053382619595968, + "learning_rate": 1.6447407801138764e-05, + "loss": 0.5751, + "step": 23310 + }, + { + "epoch": 0.6804319276229388, + "grad_norm": 1.0749554731577158, + "learning_rate": 1.643795167968638e-05, + "loss": 0.535, + "step": 23315 + }, + { + "epoch": 0.680577849117175, + "grad_norm": 1.185607479490067, + "learning_rate": 1.6428498134238525e-05, + "loss": 0.5442, + "step": 23320 + }, + { + "epoch": 0.6807237706114111, + "grad_norm": 1.0152353292368814, + "learning_rate": 1.6419047166996677e-05, + "loss": 0.5343, + "step": 23325 + }, + { + "epoch": 0.6808696921056472, + "grad_norm": 1.0350232364893797, + "learning_rate": 1.640959878016165e-05, + "loss": 0.5414, + "step": 23330 + }, + { + "epoch": 0.6810156135998833, + "grad_norm": 1.1825879912847295, + "learning_rate": 1.640015297593367e-05, + "loss": 0.5637, + "step": 23335 + }, + { + "epoch": 0.6811615350941194, + "grad_norm": 1.0675058118255656, + "learning_rate": 1.6390709756512396e-05, + "loss": 0.5819, + "step": 23340 + }, + { + "epoch": 0.6813074565883555, + "grad_norm": 1.0598146914594901, + "learning_rate": 1.6381269124096826e-05, + "loss": 0.5448, + "step": 23345 + }, + { + "epoch": 0.6814533780825915, + "grad_norm": 1.0035441650626076, + "learning_rate": 1.6371831080885412e-05, + "loss": 0.5284, + "step": 23350 + }, + { + "epoch": 0.6815992995768276, + "grad_norm": 0.9034775875526377, + "learning_rate": 1.636239562907596e-05, + "loss": 0.5339, + "step": 23355 + }, + { + "epoch": 0.6817452210710637, + "grad_norm": 1.0782894915309393, + "learning_rate": 1.635296277086571e-05, + "loss": 0.567, + "step": 23360 + }, + { + "epoch": 0.6818911425652999, + "grad_norm": 0.9842612881379557, + "learning_rate": 1.634353250845127e-05, + "loss": 0.5745, + "step": 23365 + }, + { + "epoch": 0.682037064059536, + "grad_norm": 1.178868287168965, + "learning_rate": 1.633410484402865e-05, + "loss": 0.5723, + "step": 23370 + }, + { + "epoch": 0.6821829855537721, + "grad_norm": 0.9865464731188474, + "learning_rate": 1.6324679779793262e-05, + "loss": 0.5712, + "step": 23375 + }, + { + "epoch": 0.6823289070480082, + "grad_norm": 0.9605047187629729, + "learning_rate": 1.6315257317939897e-05, + "loss": 0.536, + "step": 23380 + }, + { + "epoch": 0.6824748285422443, + "grad_norm": 1.2088904999579724, + "learning_rate": 1.6305837460662775e-05, + "loss": 0.6091, + "step": 23385 + }, + { + "epoch": 0.6826207500364804, + "grad_norm": 1.1824381558147814, + "learning_rate": 1.6296420210155468e-05, + "loss": 0.5819, + "step": 23390 + }, + { + "epoch": 0.6827666715307165, + "grad_norm": 1.0609920495438732, + "learning_rate": 1.628700556861096e-05, + "loss": 0.4789, + "step": 23395 + }, + { + "epoch": 0.6829125930249526, + "grad_norm": 1.0475499021537709, + "learning_rate": 1.627759353822163e-05, + "loss": 0.5758, + "step": 23400 + }, + { + "epoch": 0.6830585145191886, + "grad_norm": 0.9269605777426536, + "learning_rate": 1.6268184121179238e-05, + "loss": 0.5593, + "step": 23405 + }, + { + "epoch": 0.6832044360134247, + "grad_norm": 1.4658124627300477, + "learning_rate": 1.6258777319674962e-05, + "loss": 0.5617, + "step": 23410 + }, + { + "epoch": 0.6833503575076609, + "grad_norm": 0.8773186284019495, + "learning_rate": 1.6249373135899318e-05, + "loss": 0.5225, + "step": 23415 + }, + { + "epoch": 0.683496279001897, + "grad_norm": 0.9814686065894171, + "learning_rate": 1.6239971572042276e-05, + "loss": 0.5202, + "step": 23420 + }, + { + "epoch": 0.6836422004961331, + "grad_norm": 1.1953645833053774, + "learning_rate": 1.6230572630293144e-05, + "loss": 0.6085, + "step": 23425 + }, + { + "epoch": 0.6837881219903692, + "grad_norm": 1.0469237309760777, + "learning_rate": 1.6221176312840666e-05, + "loss": 0.5334, + "step": 23430 + }, + { + "epoch": 0.6839340434846053, + "grad_norm": 0.9478701868735897, + "learning_rate": 1.6211782621872932e-05, + "loss": 0.579, + "step": 23435 + }, + { + "epoch": 0.6840799649788414, + "grad_norm": 1.1447166374528552, + "learning_rate": 1.6202391559577444e-05, + "loss": 0.4644, + "step": 23440 + }, + { + "epoch": 0.6842258864730775, + "grad_norm": 1.067339598444809, + "learning_rate": 1.6193003128141083e-05, + "loss": 0.6239, + "step": 23445 + }, + { + "epoch": 0.6843718079673136, + "grad_norm": 1.024253990127003, + "learning_rate": 1.6183617329750118e-05, + "loss": 0.5885, + "step": 23450 + }, + { + "epoch": 0.6845177294615497, + "grad_norm": 0.9265669984984773, + "learning_rate": 1.6174234166590213e-05, + "loss": 0.523, + "step": 23455 + }, + { + "epoch": 0.6846636509557857, + "grad_norm": 1.0874373479316508, + "learning_rate": 1.616485364084642e-05, + "loss": 0.5816, + "step": 23460 + }, + { + "epoch": 0.6848095724500219, + "grad_norm": 1.0376467727882255, + "learning_rate": 1.6155475754703154e-05, + "loss": 0.5503, + "step": 23465 + }, + { + "epoch": 0.684955493944258, + "grad_norm": 1.0895133678626407, + "learning_rate": 1.6146100510344243e-05, + "loss": 0.5281, + "step": 23470 + }, + { + "epoch": 0.6851014154384941, + "grad_norm": 1.268252728418005, + "learning_rate": 1.613672790995287e-05, + "loss": 0.684, + "step": 23475 + }, + { + "epoch": 0.6852473369327302, + "grad_norm": 0.9438313716507676, + "learning_rate": 1.6127357955711645e-05, + "loss": 0.5629, + "step": 23480 + }, + { + "epoch": 0.6853932584269663, + "grad_norm": 1.1380420485873997, + "learning_rate": 1.611799064980252e-05, + "loss": 0.5421, + "step": 23485 + }, + { + "epoch": 0.6855391799212024, + "grad_norm": 1.198483456071783, + "learning_rate": 1.6108625994406856e-05, + "loss": 0.5806, + "step": 23490 + }, + { + "epoch": 0.6856851014154385, + "grad_norm": 1.064611203381326, + "learning_rate": 1.6099263991705377e-05, + "loss": 0.5777, + "step": 23495 + }, + { + "epoch": 0.6858310229096746, + "grad_norm": 1.1879101637228973, + "learning_rate": 1.608990464387821e-05, + "loss": 0.5111, + "step": 23500 + }, + { + "epoch": 0.6859769444039107, + "grad_norm": 0.9454223676325956, + "learning_rate": 1.6080547953104855e-05, + "loss": 0.4858, + "step": 23505 + }, + { + "epoch": 0.6861228658981467, + "grad_norm": 0.9772241526624805, + "learning_rate": 1.6071193921564186e-05, + "loss": 0.5658, + "step": 23510 + }, + { + "epoch": 0.686268787392383, + "grad_norm": 1.255510697205806, + "learning_rate": 1.606184255143447e-05, + "loss": 0.6002, + "step": 23515 + }, + { + "epoch": 0.686414708886619, + "grad_norm": 0.9904281988027352, + "learning_rate": 1.605249384489333e-05, + "loss": 0.6192, + "step": 23520 + }, + { + "epoch": 0.6865606303808551, + "grad_norm": 1.1358177593119672, + "learning_rate": 1.604314780411781e-05, + "loss": 0.5435, + "step": 23525 + }, + { + "epoch": 0.6867065518750912, + "grad_norm": 1.0466279870437631, + "learning_rate": 1.6033804431284304e-05, + "loss": 0.5616, + "step": 23530 + }, + { + "epoch": 0.6868524733693273, + "grad_norm": 1.1618160603749006, + "learning_rate": 1.602446372856859e-05, + "loss": 0.5652, + "step": 23535 + }, + { + "epoch": 0.6869983948635634, + "grad_norm": 0.9998343717013705, + "learning_rate": 1.6015125698145813e-05, + "loss": 0.5309, + "step": 23540 + }, + { + "epoch": 0.6871443163577995, + "grad_norm": 1.2399625557201817, + "learning_rate": 1.6005790342190524e-05, + "loss": 0.5655, + "step": 23545 + }, + { + "epoch": 0.6872902378520356, + "grad_norm": 0.9459448030947272, + "learning_rate": 1.599645766287663e-05, + "loss": 0.5692, + "step": 23550 + }, + { + "epoch": 0.6874361593462717, + "grad_norm": 1.0752870262988208, + "learning_rate": 1.5987127662377422e-05, + "loss": 0.535, + "step": 23555 + }, + { + "epoch": 0.6875820808405078, + "grad_norm": 1.0667389282485402, + "learning_rate": 1.5977800342865562e-05, + "loss": 0.6001, + "step": 23560 + }, + { + "epoch": 0.687728002334744, + "grad_norm": 1.0609337750350845, + "learning_rate": 1.5968475706513074e-05, + "loss": 0.5241, + "step": 23565 + }, + { + "epoch": 0.68787392382898, + "grad_norm": 1.1431712469539306, + "learning_rate": 1.5959153755491406e-05, + "loss": 0.5194, + "step": 23570 + }, + { + "epoch": 0.6880198453232161, + "grad_norm": 1.0336957854878634, + "learning_rate": 1.594983449197133e-05, + "loss": 0.5721, + "step": 23575 + }, + { + "epoch": 0.6881657668174522, + "grad_norm": 1.0372929794021808, + "learning_rate": 1.5940517918123014e-05, + "loss": 0.5477, + "step": 23580 + }, + { + "epoch": 0.6883116883116883, + "grad_norm": 1.1715872333389117, + "learning_rate": 1.593120403611599e-05, + "loss": 0.564, + "step": 23585 + }, + { + "epoch": 0.6884576098059244, + "grad_norm": 0.9390364392736658, + "learning_rate": 1.5921892848119168e-05, + "loss": 0.489, + "step": 23590 + }, + { + "epoch": 0.6886035313001605, + "grad_norm": 0.9694152961788406, + "learning_rate": 1.5912584356300852e-05, + "loss": 0.5783, + "step": 23595 + }, + { + "epoch": 0.6887494527943966, + "grad_norm": 1.0175749742331377, + "learning_rate": 1.590327856282868e-05, + "loss": 0.6127, + "step": 23600 + }, + { + "epoch": 0.6888953742886327, + "grad_norm": 0.9958267435307991, + "learning_rate": 1.5893975469869683e-05, + "loss": 0.5605, + "step": 23605 + }, + { + "epoch": 0.6890412957828688, + "grad_norm": 1.0258320463814403, + "learning_rate": 1.5884675079590257e-05, + "loss": 0.5621, + "step": 23610 + }, + { + "epoch": 0.689187217277105, + "grad_norm": 1.0199016104137717, + "learning_rate": 1.587537739415618e-05, + "loss": 0.5188, + "step": 23615 + }, + { + "epoch": 0.6893331387713411, + "grad_norm": 0.9650422372422685, + "learning_rate": 1.5866082415732594e-05, + "loss": 0.599, + "step": 23620 + }, + { + "epoch": 0.6894790602655771, + "grad_norm": 1.0831343293958984, + "learning_rate": 1.5856790146483997e-05, + "loss": 0.5557, + "step": 23625 + }, + { + "epoch": 0.6896249817598132, + "grad_norm": 1.0055842576754728, + "learning_rate": 1.5847500588574274e-05, + "loss": 0.5785, + "step": 23630 + }, + { + "epoch": 0.6897709032540493, + "grad_norm": 1.3611713938555352, + "learning_rate": 1.5838213744166663e-05, + "loss": 0.5718, + "step": 23635 + }, + { + "epoch": 0.6899168247482854, + "grad_norm": 1.0885449456691279, + "learning_rate": 1.582892961542379e-05, + "loss": 0.573, + "step": 23640 + }, + { + "epoch": 0.6900627462425215, + "grad_norm": 0.989074675000031, + "learning_rate": 1.5819648204507643e-05, + "loss": 0.5723, + "step": 23645 + }, + { + "epoch": 0.6902086677367576, + "grad_norm": 0.95446603669653, + "learning_rate": 1.5810369513579558e-05, + "loss": 0.5519, + "step": 23650 + }, + { + "epoch": 0.6903545892309937, + "grad_norm": 1.184215700854847, + "learning_rate": 1.580109354480026e-05, + "loss": 0.5478, + "step": 23655 + }, + { + "epoch": 0.6905005107252298, + "grad_norm": 1.0701789108762954, + "learning_rate": 1.5791820300329817e-05, + "loss": 0.5363, + "step": 23660 + }, + { + "epoch": 0.690646432219466, + "grad_norm": 1.1438174326842303, + "learning_rate": 1.5782549782327695e-05, + "loss": 0.585, + "step": 23665 + }, + { + "epoch": 0.6907923537137021, + "grad_norm": 0.9157628825332, + "learning_rate": 1.5773281992952704e-05, + "loss": 0.5984, + "step": 23670 + }, + { + "epoch": 0.6909382752079382, + "grad_norm": 0.9762524213905028, + "learning_rate": 1.5764016934363017e-05, + "loss": 0.5547, + "step": 23675 + }, + { + "epoch": 0.6910841967021742, + "grad_norm": 1.0943813251343266, + "learning_rate": 1.575475460871617e-05, + "loss": 0.6572, + "step": 23680 + }, + { + "epoch": 0.6912301181964103, + "grad_norm": 1.4299729069645397, + "learning_rate": 1.5745495018169087e-05, + "loss": 0.5581, + "step": 23685 + }, + { + "epoch": 0.6913760396906464, + "grad_norm": 1.1122682820087713, + "learning_rate": 1.5736238164878024e-05, + "loss": 0.5561, + "step": 23690 + }, + { + "epoch": 0.6915219611848825, + "grad_norm": 1.042108564749287, + "learning_rate": 1.572698405099861e-05, + "loss": 0.5207, + "step": 23695 + }, + { + "epoch": 0.6916678826791186, + "grad_norm": 0.9900335337263171, + "learning_rate": 1.5717732678685848e-05, + "loss": 0.5052, + "step": 23700 + }, + { + "epoch": 0.6918138041733547, + "grad_norm": 1.3067463810264681, + "learning_rate": 1.5708484050094077e-05, + "loss": 0.6179, + "step": 23705 + }, + { + "epoch": 0.6919597256675908, + "grad_norm": 0.9617549799250384, + "learning_rate": 1.569923816737703e-05, + "loss": 0.4895, + "step": 23710 + }, + { + "epoch": 0.692105647161827, + "grad_norm": 1.1011768100805583, + "learning_rate": 1.5689995032687767e-05, + "loss": 0.5222, + "step": 23715 + }, + { + "epoch": 0.6922515686560631, + "grad_norm": 1.1232514821334867, + "learning_rate": 1.5680754648178754e-05, + "loss": 0.5949, + "step": 23720 + }, + { + "epoch": 0.6923974901502992, + "grad_norm": 1.3577272138291319, + "learning_rate": 1.5671517016001757e-05, + "loss": 0.5989, + "step": 23725 + }, + { + "epoch": 0.6925434116445353, + "grad_norm": 1.1006137366082411, + "learning_rate": 1.5662282138307932e-05, + "loss": 0.6073, + "step": 23730 + }, + { + "epoch": 0.6926893331387713, + "grad_norm": 1.1773173017553349, + "learning_rate": 1.565305001724781e-05, + "loss": 0.588, + "step": 23735 + }, + { + "epoch": 0.6928352546330074, + "grad_norm": 1.1620335338616863, + "learning_rate": 1.5643820654971248e-05, + "loss": 0.5832, + "step": 23740 + }, + { + "epoch": 0.6929811761272435, + "grad_norm": 1.031085682964398, + "learning_rate": 1.5634594053627497e-05, + "loss": 0.5757, + "step": 23745 + }, + { + "epoch": 0.6931270976214796, + "grad_norm": 1.004524683013652, + "learning_rate": 1.5625370215365112e-05, + "loss": 0.5818, + "step": 23750 + }, + { + "epoch": 0.6932730191157157, + "grad_norm": 0.9339729446136052, + "learning_rate": 1.5616149142332064e-05, + "loss": 0.5754, + "step": 23755 + }, + { + "epoch": 0.6934189406099518, + "grad_norm": 1.0004535089218087, + "learning_rate": 1.560693083667564e-05, + "loss": 0.4795, + "step": 23760 + }, + { + "epoch": 0.693564862104188, + "grad_norm": 0.9877533608974142, + "learning_rate": 1.559771530054249e-05, + "loss": 0.5071, + "step": 23765 + }, + { + "epoch": 0.6937107835984241, + "grad_norm": 1.0923836364874329, + "learning_rate": 1.5588502536078635e-05, + "loss": 0.5225, + "step": 23770 + }, + { + "epoch": 0.6938567050926602, + "grad_norm": 0.9781339788954408, + "learning_rate": 1.5579292545429424e-05, + "loss": 0.472, + "step": 23775 + }, + { + "epoch": 0.6940026265868963, + "grad_norm": 1.0269149935255157, + "learning_rate": 1.557008533073959e-05, + "loss": 0.4979, + "step": 23780 + }, + { + "epoch": 0.6941485480811324, + "grad_norm": 0.9612519605924633, + "learning_rate": 1.5560880894153194e-05, + "loss": 0.6085, + "step": 23785 + }, + { + "epoch": 0.6942944695753684, + "grad_norm": 1.093956388883857, + "learning_rate": 1.5551679237813683e-05, + "loss": 0.5393, + "step": 23790 + }, + { + "epoch": 0.6944403910696045, + "grad_norm": 0.9298277891789557, + "learning_rate": 1.55424803638638e-05, + "loss": 0.5564, + "step": 23795 + }, + { + "epoch": 0.6945863125638406, + "grad_norm": 1.1287123821467764, + "learning_rate": 1.5533284274445704e-05, + "loss": 0.5833, + "step": 23800 + }, + { + "epoch": 0.6947322340580767, + "grad_norm": 0.9747456728924071, + "learning_rate": 1.5524090971700858e-05, + "loss": 0.514, + "step": 23805 + }, + { + "epoch": 0.6948781555523128, + "grad_norm": 1.006958214950494, + "learning_rate": 1.5514900457770096e-05, + "loss": 0.5698, + "step": 23810 + }, + { + "epoch": 0.695024077046549, + "grad_norm": 1.0017558592261047, + "learning_rate": 1.5505712734793624e-05, + "loss": 0.5484, + "step": 23815 + }, + { + "epoch": 0.6951699985407851, + "grad_norm": 1.144835571577397, + "learning_rate": 1.549652780491094e-05, + "loss": 0.5765, + "step": 23820 + }, + { + "epoch": 0.6953159200350212, + "grad_norm": 0.9971907597869134, + "learning_rate": 1.548734567026095e-05, + "loss": 0.6106, + "step": 23825 + }, + { + "epoch": 0.6954618415292573, + "grad_norm": 0.9843461176192522, + "learning_rate": 1.5478166332981883e-05, + "loss": 0.5654, + "step": 23830 + }, + { + "epoch": 0.6956077630234934, + "grad_norm": 1.11990042432235, + "learning_rate": 1.5468989795211303e-05, + "loss": 0.5446, + "step": 23835 + }, + { + "epoch": 0.6957536845177295, + "grad_norm": 0.9887499455259151, + "learning_rate": 1.5459816059086167e-05, + "loss": 0.5462, + "step": 23840 + }, + { + "epoch": 0.6958996060119655, + "grad_norm": 0.9902404577868734, + "learning_rate": 1.545064512674272e-05, + "loss": 0.5598, + "step": 23845 + }, + { + "epoch": 0.6960455275062016, + "grad_norm": 0.9753796815314271, + "learning_rate": 1.5441477000316602e-05, + "loss": 0.5653, + "step": 23850 + }, + { + "epoch": 0.6961914490004377, + "grad_norm": 0.9872521467117548, + "learning_rate": 1.5432311681942775e-05, + "loss": 0.5295, + "step": 23855 + }, + { + "epoch": 0.6963373704946738, + "grad_norm": 1.2628690881733258, + "learning_rate": 1.542314917375557e-05, + "loss": 0.6121, + "step": 23860 + }, + { + "epoch": 0.69648329198891, + "grad_norm": 1.0891966269910203, + "learning_rate": 1.5413989477888628e-05, + "loss": 0.5384, + "step": 23865 + }, + { + "epoch": 0.6966292134831461, + "grad_norm": 0.9987778189723784, + "learning_rate": 1.5404832596474966e-05, + "loss": 0.5314, + "step": 23870 + }, + { + "epoch": 0.6967751349773822, + "grad_norm": 1.1407358691363665, + "learning_rate": 1.5395678531646937e-05, + "loss": 0.595, + "step": 23875 + }, + { + "epoch": 0.6969210564716183, + "grad_norm": 1.0973126262333854, + "learning_rate": 1.5386527285536222e-05, + "loss": 0.5673, + "step": 23880 + }, + { + "epoch": 0.6970669779658544, + "grad_norm": 1.062910583233567, + "learning_rate": 1.5377378860273888e-05, + "loss": 0.5299, + "step": 23885 + }, + { + "epoch": 0.6972128994600905, + "grad_norm": 1.0558694226895347, + "learning_rate": 1.5368233257990284e-05, + "loss": 0.6188, + "step": 23890 + }, + { + "epoch": 0.6973588209543266, + "grad_norm": 1.0133638996405185, + "learning_rate": 1.5359090480815152e-05, + "loss": 0.5552, + "step": 23895 + }, + { + "epoch": 0.6975047424485626, + "grad_norm": 1.1337502189115591, + "learning_rate": 1.5349950530877558e-05, + "loss": 0.5552, + "step": 23900 + }, + { + "epoch": 0.6976506639427987, + "grad_norm": 0.9759160535765449, + "learning_rate": 1.5340813410305897e-05, + "loss": 0.5235, + "step": 23905 + }, + { + "epoch": 0.6977965854370348, + "grad_norm": 0.9863873237566788, + "learning_rate": 1.5331679121227942e-05, + "loss": 0.6026, + "step": 23910 + }, + { + "epoch": 0.697942506931271, + "grad_norm": 0.8840693382169786, + "learning_rate": 1.532254766577076e-05, + "loss": 0.5632, + "step": 23915 + }, + { + "epoch": 0.6980884284255071, + "grad_norm": 1.079312551924166, + "learning_rate": 1.531341904606079e-05, + "loss": 0.5608, + "step": 23920 + }, + { + "epoch": 0.6982343499197432, + "grad_norm": 1.1708898818397133, + "learning_rate": 1.53042932642238e-05, + "loss": 0.5526, + "step": 23925 + }, + { + "epoch": 0.6983802714139793, + "grad_norm": 0.959678370872696, + "learning_rate": 1.5295170322384904e-05, + "loss": 0.5437, + "step": 23930 + }, + { + "epoch": 0.6985261929082154, + "grad_norm": 0.8939635222772591, + "learning_rate": 1.528605022266855e-05, + "loss": 0.5445, + "step": 23935 + }, + { + "epoch": 0.6986721144024515, + "grad_norm": 1.1218714853305787, + "learning_rate": 1.5276932967198517e-05, + "loss": 0.5974, + "step": 23940 + }, + { + "epoch": 0.6988180358966876, + "grad_norm": 1.0828812139718766, + "learning_rate": 1.526781855809793e-05, + "loss": 0.5247, + "step": 23945 + }, + { + "epoch": 0.6989639573909237, + "grad_norm": 0.9231800826504527, + "learning_rate": 1.5258706997489241e-05, + "loss": 0.5466, + "step": 23950 + }, + { + "epoch": 0.6991098788851597, + "grad_norm": 1.1109178502670414, + "learning_rate": 1.5249598287494277e-05, + "loss": 0.5898, + "step": 23955 + }, + { + "epoch": 0.6992558003793958, + "grad_norm": 1.0142001242438954, + "learning_rate": 1.5240492430234127e-05, + "loss": 0.5583, + "step": 23960 + }, + { + "epoch": 0.699401721873632, + "grad_norm": 1.0743897999192367, + "learning_rate": 1.5231389427829295e-05, + "loss": 0.5254, + "step": 23965 + }, + { + "epoch": 0.6995476433678681, + "grad_norm": 1.060414814772233, + "learning_rate": 1.5222289282399568e-05, + "loss": 0.5538, + "step": 23970 + }, + { + "epoch": 0.6996935648621042, + "grad_norm": 1.2878218447865446, + "learning_rate": 1.5213191996064094e-05, + "loss": 0.5409, + "step": 23975 + }, + { + "epoch": 0.6998394863563403, + "grad_norm": 1.063504307431234, + "learning_rate": 1.5204097570941351e-05, + "loss": 0.6097, + "step": 23980 + }, + { + "epoch": 0.6999854078505764, + "grad_norm": 1.1331507414321427, + "learning_rate": 1.519500600914912e-05, + "loss": 0.5285, + "step": 23985 + }, + { + "epoch": 0.7001313293448125, + "grad_norm": 0.942782986060499, + "learning_rate": 1.5185917312804571e-05, + "loss": 0.552, + "step": 23990 + }, + { + "epoch": 0.7002772508390486, + "grad_norm": 0.8879691954509794, + "learning_rate": 1.5176831484024152e-05, + "loss": 0.53, + "step": 23995 + }, + { + "epoch": 0.7004231723332847, + "grad_norm": 1.0041566890995757, + "learning_rate": 1.5167748524923687e-05, + "loss": 0.5795, + "step": 24000 + }, + { + "epoch": 0.7005690938275208, + "grad_norm": 0.9421668551810137, + "learning_rate": 1.5158668437618307e-05, + "loss": 0.5373, + "step": 24005 + }, + { + "epoch": 0.7007150153217568, + "grad_norm": 0.9887603240078826, + "learning_rate": 1.5149591224222476e-05, + "loss": 0.4987, + "step": 24010 + }, + { + "epoch": 0.700860936815993, + "grad_norm": 0.9910310966047815, + "learning_rate": 1.5140516886849993e-05, + "loss": 0.5325, + "step": 24015 + }, + { + "epoch": 0.7010068583102291, + "grad_norm": 1.1489471142732917, + "learning_rate": 1.5131445427613982e-05, + "loss": 0.5529, + "step": 24020 + }, + { + "epoch": 0.7011527798044652, + "grad_norm": 0.9926160338689359, + "learning_rate": 1.512237684862693e-05, + "loss": 0.5427, + "step": 24025 + }, + { + "epoch": 0.7012987012987013, + "grad_norm": 0.994655527164068, + "learning_rate": 1.511331115200058e-05, + "loss": 0.5698, + "step": 24030 + }, + { + "epoch": 0.7014446227929374, + "grad_norm": 1.021934161619113, + "learning_rate": 1.5104248339846083e-05, + "loss": 0.5669, + "step": 24035 + }, + { + "epoch": 0.7015905442871735, + "grad_norm": 1.086159428464733, + "learning_rate": 1.5095188414273865e-05, + "loss": 0.54, + "step": 24040 + }, + { + "epoch": 0.7017364657814096, + "grad_norm": 1.0082074909642036, + "learning_rate": 1.5086131377393715e-05, + "loss": 0.5725, + "step": 24045 + }, + { + "epoch": 0.7018823872756457, + "grad_norm": 1.0623725640192214, + "learning_rate": 1.5077077231314724e-05, + "loss": 0.5333, + "step": 24050 + }, + { + "epoch": 0.7020283087698818, + "grad_norm": 1.0580229008137838, + "learning_rate": 1.5068025978145323e-05, + "loss": 0.6016, + "step": 24055 + }, + { + "epoch": 0.702174230264118, + "grad_norm": 1.004582938060519, + "learning_rate": 1.5058977619993258e-05, + "loss": 0.5315, + "step": 24060 + }, + { + "epoch": 0.702320151758354, + "grad_norm": 0.9474439861790411, + "learning_rate": 1.5049932158965608e-05, + "loss": 0.5362, + "step": 24065 + }, + { + "epoch": 0.7024660732525901, + "grad_norm": 1.173901170174014, + "learning_rate": 1.5040889597168789e-05, + "loss": 0.5656, + "step": 24070 + }, + { + "epoch": 0.7026119947468262, + "grad_norm": 1.0026167024711883, + "learning_rate": 1.5031849936708528e-05, + "loss": 0.542, + "step": 24075 + }, + { + "epoch": 0.7027579162410623, + "grad_norm": 1.0418145379810961, + "learning_rate": 1.5022813179689871e-05, + "loss": 0.5634, + "step": 24080 + }, + { + "epoch": 0.7029038377352984, + "grad_norm": 1.0347587841121264, + "learning_rate": 1.5013779328217201e-05, + "loss": 0.5469, + "step": 24085 + }, + { + "epoch": 0.7030497592295345, + "grad_norm": 1.070501516634199, + "learning_rate": 1.5004748384394207e-05, + "loss": 0.574, + "step": 24090 + }, + { + "epoch": 0.7031956807237706, + "grad_norm": 1.155202963908396, + "learning_rate": 1.4995720350323932e-05, + "loss": 0.6062, + "step": 24095 + }, + { + "epoch": 0.7033416022180067, + "grad_norm": 1.0153245700415607, + "learning_rate": 1.4986695228108715e-05, + "loss": 0.584, + "step": 24100 + }, + { + "epoch": 0.7034875237122428, + "grad_norm": 1.0004052658001845, + "learning_rate": 1.4977673019850219e-05, + "loss": 0.4873, + "step": 24105 + }, + { + "epoch": 0.703633445206479, + "grad_norm": 1.0832171318691564, + "learning_rate": 1.4968653727649434e-05, + "loss": 0.547, + "step": 24110 + }, + { + "epoch": 0.7037793667007151, + "grad_norm": 0.9336063406392175, + "learning_rate": 1.4959637353606678e-05, + "loss": 0.4998, + "step": 24115 + }, + { + "epoch": 0.7039252881949511, + "grad_norm": 0.8745079174032, + "learning_rate": 1.495062389982158e-05, + "loss": 0.4909, + "step": 24120 + }, + { + "epoch": 0.7040712096891872, + "grad_norm": 1.0416569546274859, + "learning_rate": 1.4941613368393092e-05, + "loss": 0.5029, + "step": 24125 + }, + { + "epoch": 0.7042171311834233, + "grad_norm": 0.8952009517958974, + "learning_rate": 1.493260576141948e-05, + "loss": 0.5581, + "step": 24130 + }, + { + "epoch": 0.7043630526776594, + "grad_norm": 1.1097504765125694, + "learning_rate": 1.4923601080998325e-05, + "loss": 0.6083, + "step": 24135 + }, + { + "epoch": 0.7045089741718955, + "grad_norm": 1.1128181359323472, + "learning_rate": 1.4914599329226558e-05, + "loss": 0.5523, + "step": 24140 + }, + { + "epoch": 0.7046548956661316, + "grad_norm": 1.0606921931329718, + "learning_rate": 1.490560050820039e-05, + "loss": 0.5162, + "step": 24145 + }, + { + "epoch": 0.7048008171603677, + "grad_norm": 0.9151637047067894, + "learning_rate": 1.4896604620015364e-05, + "loss": 0.5032, + "step": 24150 + }, + { + "epoch": 0.7049467386546038, + "grad_norm": 0.8606924694357436, + "learning_rate": 1.4887611666766346e-05, + "loss": 0.6016, + "step": 24155 + }, + { + "epoch": 0.70509266014884, + "grad_norm": 1.0209408199054315, + "learning_rate": 1.4878621650547503e-05, + "loss": 0.5696, + "step": 24160 + }, + { + "epoch": 0.7052385816430761, + "grad_norm": 1.0772898736344854, + "learning_rate": 1.4869634573452346e-05, + "loss": 0.587, + "step": 24165 + }, + { + "epoch": 0.7053845031373122, + "grad_norm": 1.0632431085850207, + "learning_rate": 1.486065043757367e-05, + "loss": 0.5096, + "step": 24170 + }, + { + "epoch": 0.7055304246315482, + "grad_norm": 0.9315477859257142, + "learning_rate": 1.4851669245003601e-05, + "loss": 0.598, + "step": 24175 + }, + { + "epoch": 0.7056763461257843, + "grad_norm": 1.1768450356135882, + "learning_rate": 1.4842690997833574e-05, + "loss": 0.5837, + "step": 24180 + }, + { + "epoch": 0.7058222676200204, + "grad_norm": 1.0158217869466701, + "learning_rate": 1.4833715698154355e-05, + "loss": 0.5406, + "step": 24185 + }, + { + "epoch": 0.7059681891142565, + "grad_norm": 0.9858779197760544, + "learning_rate": 1.4824743348055997e-05, + "loss": 0.5284, + "step": 24190 + }, + { + "epoch": 0.7061141106084926, + "grad_norm": 1.0396290614849886, + "learning_rate": 1.4815773949627888e-05, + "loss": 0.5618, + "step": 24195 + }, + { + "epoch": 0.7062600321027287, + "grad_norm": 0.9548591255245135, + "learning_rate": 1.4806807504958715e-05, + "loss": 0.573, + "step": 24200 + }, + { + "epoch": 0.7064059535969648, + "grad_norm": 1.0617743027350097, + "learning_rate": 1.4797844016136475e-05, + "loss": 0.561, + "step": 24205 + }, + { + "epoch": 0.706551875091201, + "grad_norm": 1.1657765155102613, + "learning_rate": 1.4788883485248498e-05, + "loss": 0.5785, + "step": 24210 + }, + { + "epoch": 0.7066977965854371, + "grad_norm": 1.0929638255282905, + "learning_rate": 1.4779925914381403e-05, + "loss": 0.589, + "step": 24215 + }, + { + "epoch": 0.7068437180796732, + "grad_norm": 1.0610996139228235, + "learning_rate": 1.4770971305621134e-05, + "loss": 0.577, + "step": 24220 + }, + { + "epoch": 0.7069896395739093, + "grad_norm": 1.0448727877535875, + "learning_rate": 1.4762019661052926e-05, + "loss": 0.6013, + "step": 24225 + }, + { + "epoch": 0.7071355610681453, + "grad_norm": 1.049271368571552, + "learning_rate": 1.475307098276135e-05, + "loss": 0.5032, + "step": 24230 + }, + { + "epoch": 0.7072814825623814, + "grad_norm": 1.198834488235397, + "learning_rate": 1.4744125272830268e-05, + "loss": 0.5805, + "step": 24235 + }, + { + "epoch": 0.7074274040566175, + "grad_norm": 0.9728507946921953, + "learning_rate": 1.4735182533342857e-05, + "loss": 0.5473, + "step": 24240 + }, + { + "epoch": 0.7075733255508536, + "grad_norm": 1.0125589572724134, + "learning_rate": 1.4726242766381599e-05, + "loss": 0.491, + "step": 24245 + }, + { + "epoch": 0.7077192470450897, + "grad_norm": 0.9403536157881934, + "learning_rate": 1.4717305974028278e-05, + "loss": 0.5401, + "step": 24250 + }, + { + "epoch": 0.7078651685393258, + "grad_norm": 1.2439438694113967, + "learning_rate": 1.4708372158364004e-05, + "loss": 0.5912, + "step": 24255 + }, + { + "epoch": 0.708011090033562, + "grad_norm": 1.0570476213780557, + "learning_rate": 1.4699441321469187e-05, + "loss": 0.563, + "step": 24260 + }, + { + "epoch": 0.7081570115277981, + "grad_norm": 1.1344776024957806, + "learning_rate": 1.4690513465423527e-05, + "loss": 0.5351, + "step": 24265 + }, + { + "epoch": 0.7083029330220342, + "grad_norm": 0.9517116966818174, + "learning_rate": 1.4681588592306051e-05, + "loss": 0.5816, + "step": 24270 + }, + { + "epoch": 0.7084488545162703, + "grad_norm": 0.9254938061243251, + "learning_rate": 1.4672666704195065e-05, + "loss": 0.5402, + "step": 24275 + }, + { + "epoch": 0.7085947760105064, + "grad_norm": 1.0949906738144726, + "learning_rate": 1.4663747803168221e-05, + "loss": 0.5228, + "step": 24280 + }, + { + "epoch": 0.7087406975047424, + "grad_norm": 0.9209693860775959, + "learning_rate": 1.4654831891302437e-05, + "loss": 0.5459, + "step": 24285 + }, + { + "epoch": 0.7088866189989785, + "grad_norm": 1.0828236813098084, + "learning_rate": 1.4645918970673966e-05, + "loss": 0.5604, + "step": 24290 + }, + { + "epoch": 0.7090325404932146, + "grad_norm": 0.8573555380896565, + "learning_rate": 1.463700904335832e-05, + "loss": 0.4945, + "step": 24295 + }, + { + "epoch": 0.7091784619874507, + "grad_norm": 1.138820866835025, + "learning_rate": 1.462810211143037e-05, + "loss": 0.6028, + "step": 24300 + }, + { + "epoch": 0.7093243834816868, + "grad_norm": 1.1059502592991022, + "learning_rate": 1.4619198176964244e-05, + "loss": 0.5824, + "step": 24305 + }, + { + "epoch": 0.709470304975923, + "grad_norm": 1.1868373732211293, + "learning_rate": 1.4610297242033393e-05, + "loss": 0.5831, + "step": 24310 + }, + { + "epoch": 0.7096162264701591, + "grad_norm": 1.0295679619658424, + "learning_rate": 1.4601399308710573e-05, + "loss": 0.5342, + "step": 24315 + }, + { + "epoch": 0.7097621479643952, + "grad_norm": 1.2139491234112905, + "learning_rate": 1.4592504379067822e-05, + "loss": 0.554, + "step": 24320 + }, + { + "epoch": 0.7099080694586313, + "grad_norm": 0.9184199497569038, + "learning_rate": 1.4583612455176503e-05, + "loss": 0.4928, + "step": 24325 + }, + { + "epoch": 0.7100539909528674, + "grad_norm": 0.9289577816708264, + "learning_rate": 1.457472353910726e-05, + "loss": 0.531, + "step": 24330 + }, + { + "epoch": 0.7101999124471035, + "grad_norm": 1.0325969369599877, + "learning_rate": 1.4565837632930044e-05, + "loss": 0.4927, + "step": 24335 + }, + { + "epoch": 0.7103458339413395, + "grad_norm": 0.993337810512389, + "learning_rate": 1.4556954738714103e-05, + "loss": 0.543, + "step": 24340 + }, + { + "epoch": 0.7104917554355756, + "grad_norm": 1.0119432687559544, + "learning_rate": 1.4548074858527982e-05, + "loss": 0.5243, + "step": 24345 + }, + { + "epoch": 0.7106376769298117, + "grad_norm": 1.2655935035631212, + "learning_rate": 1.4539197994439534e-05, + "loss": 0.5375, + "step": 24350 + }, + { + "epoch": 0.7107835984240478, + "grad_norm": 1.152821115981948, + "learning_rate": 1.4530324148515895e-05, + "loss": 0.6121, + "step": 24355 + }, + { + "epoch": 0.710929519918284, + "grad_norm": 0.9490338918918519, + "learning_rate": 1.4521453322823526e-05, + "loss": 0.5566, + "step": 24360 + }, + { + "epoch": 0.7110754414125201, + "grad_norm": 0.9488672991645826, + "learning_rate": 1.451258551942813e-05, + "loss": 0.5579, + "step": 24365 + }, + { + "epoch": 0.7112213629067562, + "grad_norm": 1.0984510830498266, + "learning_rate": 1.4503720740394768e-05, + "loss": 0.5773, + "step": 24370 + }, + { + "epoch": 0.7113672844009923, + "grad_norm": 1.0620351005521815, + "learning_rate": 1.4494858987787762e-05, + "loss": 0.5476, + "step": 24375 + }, + { + "epoch": 0.7115132058952284, + "grad_norm": 1.1361207229615862, + "learning_rate": 1.4486000263670735e-05, + "loss": 0.5058, + "step": 24380 + }, + { + "epoch": 0.7116591273894645, + "grad_norm": 1.0414569084795966, + "learning_rate": 1.4477144570106607e-05, + "loss": 0.5166, + "step": 24385 + }, + { + "epoch": 0.7118050488837006, + "grad_norm": 1.1976358702952392, + "learning_rate": 1.4468291909157583e-05, + "loss": 0.5531, + "step": 24390 + }, + { + "epoch": 0.7119509703779366, + "grad_norm": 1.1590179723220841, + "learning_rate": 1.4459442282885188e-05, + "loss": 0.5616, + "step": 24395 + }, + { + "epoch": 0.7120968918721727, + "grad_norm": 1.1566752242120362, + "learning_rate": 1.4450595693350205e-05, + "loss": 0.5233, + "step": 24400 + }, + { + "epoch": 0.7122428133664088, + "grad_norm": 1.2389754022509472, + "learning_rate": 1.4441752142612752e-05, + "loss": 0.5789, + "step": 24405 + }, + { + "epoch": 0.712388734860645, + "grad_norm": 1.1453548437096257, + "learning_rate": 1.4432911632732187e-05, + "loss": 0.5979, + "step": 24410 + }, + { + "epoch": 0.7125346563548811, + "grad_norm": 1.1037437211897294, + "learning_rate": 1.4424074165767199e-05, + "loss": 0.5345, + "step": 24415 + }, + { + "epoch": 0.7126805778491172, + "grad_norm": 0.9551622470134965, + "learning_rate": 1.441523974377576e-05, + "loss": 0.6019, + "step": 24420 + }, + { + "epoch": 0.7128264993433533, + "grad_norm": 1.2130372013147819, + "learning_rate": 1.4406408368815122e-05, + "loss": 0.5643, + "step": 24425 + }, + { + "epoch": 0.7129724208375894, + "grad_norm": 0.9712835721280971, + "learning_rate": 1.4397580042941858e-05, + "loss": 0.5012, + "step": 24430 + }, + { + "epoch": 0.7131183423318255, + "grad_norm": 0.9058692467266198, + "learning_rate": 1.4388754768211776e-05, + "loss": 0.5313, + "step": 24435 + }, + { + "epoch": 0.7132642638260616, + "grad_norm": 1.1425079248796624, + "learning_rate": 1.4379932546680031e-05, + "loss": 0.555, + "step": 24440 + }, + { + "epoch": 0.7134101853202977, + "grad_norm": 0.9961630226072019, + "learning_rate": 1.4371113380401036e-05, + "loss": 0.5603, + "step": 24445 + }, + { + "epoch": 0.7135561068145337, + "grad_norm": 0.9630539402208909, + "learning_rate": 1.4362297271428487e-05, + "loss": 0.534, + "step": 24450 + }, + { + "epoch": 0.7137020283087698, + "grad_norm": 1.0306877211262744, + "learning_rate": 1.4353484221815403e-05, + "loss": 0.5331, + "step": 24455 + }, + { + "epoch": 0.713847949803006, + "grad_norm": 1.0358162654136978, + "learning_rate": 1.4344674233614035e-05, + "loss": 0.5989, + "step": 24460 + }, + { + "epoch": 0.7139938712972421, + "grad_norm": 0.8352713222910401, + "learning_rate": 1.4335867308875983e-05, + "loss": 0.5398, + "step": 24465 + }, + { + "epoch": 0.7141397927914782, + "grad_norm": 4.946832034723737, + "learning_rate": 1.4327063449652084e-05, + "loss": 0.5702, + "step": 24470 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.116624206980472, + "learning_rate": 1.4318262657992504e-05, + "loss": 0.5895, + "step": 24475 + }, + { + "epoch": 0.7144316357799504, + "grad_norm": 1.1823160676251852, + "learning_rate": 1.4309464935946642e-05, + "loss": 0.5692, + "step": 24480 + }, + { + "epoch": 0.7145775572741865, + "grad_norm": 0.984740016936657, + "learning_rate": 1.4300670285563233e-05, + "loss": 0.5259, + "step": 24485 + }, + { + "epoch": 0.7147234787684226, + "grad_norm": 1.023882732110191, + "learning_rate": 1.4291878708890272e-05, + "loss": 0.5628, + "step": 24490 + }, + { + "epoch": 0.7148694002626587, + "grad_norm": 1.166015227269716, + "learning_rate": 1.428309020797503e-05, + "loss": 0.5269, + "step": 24495 + }, + { + "epoch": 0.7150153217568948, + "grad_norm": 1.0807054018499804, + "learning_rate": 1.4274304784864098e-05, + "loss": 0.5174, + "step": 24500 + }, + { + "epoch": 0.7151612432511308, + "grad_norm": 0.8718926417295376, + "learning_rate": 1.4265522441603296e-05, + "loss": 0.5067, + "step": 24505 + }, + { + "epoch": 0.715307164745367, + "grad_norm": 1.1005682628862632, + "learning_rate": 1.4256743180237777e-05, + "loss": 0.5539, + "step": 24510 + }, + { + "epoch": 0.7154530862396031, + "grad_norm": 0.8802949528227003, + "learning_rate": 1.4247967002811952e-05, + "loss": 0.5229, + "step": 24515 + }, + { + "epoch": 0.7155990077338392, + "grad_norm": 1.0458262784878145, + "learning_rate": 1.4239193911369508e-05, + "loss": 0.5406, + "step": 24520 + }, + { + "epoch": 0.7157449292280753, + "grad_norm": 1.1943831586181977, + "learning_rate": 1.4230423907953449e-05, + "loss": 0.559, + "step": 24525 + }, + { + "epoch": 0.7158908507223114, + "grad_norm": 0.983925435455423, + "learning_rate": 1.4221656994605998e-05, + "loss": 0.5678, + "step": 24530 + }, + { + "epoch": 0.7160367722165475, + "grad_norm": 1.1176890264899106, + "learning_rate": 1.4212893173368725e-05, + "loss": 0.571, + "step": 24535 + }, + { + "epoch": 0.7161826937107836, + "grad_norm": 1.137288587410384, + "learning_rate": 1.4204132446282425e-05, + "loss": 0.5562, + "step": 24540 + }, + { + "epoch": 0.7163286152050197, + "grad_norm": 1.1724075084412144, + "learning_rate": 1.4195374815387219e-05, + "loss": 0.5735, + "step": 24545 + }, + { + "epoch": 0.7164745366992558, + "grad_norm": 1.161746572130148, + "learning_rate": 1.4186620282722473e-05, + "loss": 0.565, + "step": 24550 + }, + { + "epoch": 0.7166204581934918, + "grad_norm": 1.077664429766411, + "learning_rate": 1.4177868850326851e-05, + "loss": 0.5375, + "step": 24555 + }, + { + "epoch": 0.716766379687728, + "grad_norm": 1.1224995518381657, + "learning_rate": 1.4169120520238277e-05, + "loss": 0.5275, + "step": 24560 + }, + { + "epoch": 0.7169123011819641, + "grad_norm": 0.9830538347397293, + "learning_rate": 1.4160375294493961e-05, + "loss": 0.476, + "step": 24565 + }, + { + "epoch": 0.7170582226762002, + "grad_norm": 1.1436343034411778, + "learning_rate": 1.4151633175130417e-05, + "loss": 0.5839, + "step": 24570 + }, + { + "epoch": 0.7172041441704363, + "grad_norm": 1.085081887911008, + "learning_rate": 1.4142894164183376e-05, + "loss": 0.4675, + "step": 24575 + }, + { + "epoch": 0.7173500656646724, + "grad_norm": 1.0413742364384941, + "learning_rate": 1.4134158263687905e-05, + "loss": 0.57, + "step": 24580 + }, + { + "epoch": 0.7174959871589085, + "grad_norm": 1.000954814738591, + "learning_rate": 1.4125425475678315e-05, + "loss": 0.5328, + "step": 24585 + }, + { + "epoch": 0.7176419086531446, + "grad_norm": 1.0108958039595437, + "learning_rate": 1.411669580218819e-05, + "loss": 0.504, + "step": 24590 + }, + { + "epoch": 0.7177878301473807, + "grad_norm": 0.8884764384926542, + "learning_rate": 1.4107969245250419e-05, + "loss": 0.5396, + "step": 24595 + }, + { + "epoch": 0.7179337516416168, + "grad_norm": 0.9453175934815446, + "learning_rate": 1.4099245806897113e-05, + "loss": 0.5025, + "step": 24600 + }, + { + "epoch": 0.7180796731358529, + "grad_norm": 0.9482374808440415, + "learning_rate": 1.4090525489159711e-05, + "loss": 0.4951, + "step": 24605 + }, + { + "epoch": 0.7182255946300891, + "grad_norm": 0.8915304156398259, + "learning_rate": 1.4081808294068894e-05, + "loss": 0.5143, + "step": 24610 + }, + { + "epoch": 0.7183715161243251, + "grad_norm": 0.9699427457982547, + "learning_rate": 1.4073094223654626e-05, + "loss": 0.5501, + "step": 24615 + }, + { + "epoch": 0.7185174376185612, + "grad_norm": 1.0017473976094748, + "learning_rate": 1.4064383279946141e-05, + "loss": 0.5465, + "step": 24620 + }, + { + "epoch": 0.7186633591127973, + "grad_norm": 1.2777095277581743, + "learning_rate": 1.4055675464971946e-05, + "loss": 0.5337, + "step": 24625 + }, + { + "epoch": 0.7188092806070334, + "grad_norm": 1.1011030671152342, + "learning_rate": 1.4046970780759816e-05, + "loss": 0.6021, + "step": 24630 + }, + { + "epoch": 0.7189552021012695, + "grad_norm": 1.0296889790169528, + "learning_rate": 1.4038269229336787e-05, + "loss": 0.5714, + "step": 24635 + }, + { + "epoch": 0.7191011235955056, + "grad_norm": 1.2811505711799551, + "learning_rate": 1.4029570812729204e-05, + "loss": 0.5916, + "step": 24640 + }, + { + "epoch": 0.7192470450897417, + "grad_norm": 0.8802879534332432, + "learning_rate": 1.402087553296264e-05, + "loss": 0.5506, + "step": 24645 + }, + { + "epoch": 0.7193929665839778, + "grad_norm": 0.9912588371933395, + "learning_rate": 1.4012183392061956e-05, + "loss": 0.5129, + "step": 24650 + }, + { + "epoch": 0.7195388880782139, + "grad_norm": 1.1786932997067552, + "learning_rate": 1.4003494392051267e-05, + "loss": 0.6091, + "step": 24655 + }, + { + "epoch": 0.7196848095724501, + "grad_norm": 1.1311139214037087, + "learning_rate": 1.399480853495399e-05, + "loss": 0.5185, + "step": 24660 + }, + { + "epoch": 0.7198307310666862, + "grad_norm": 1.1387093840691218, + "learning_rate": 1.3986125822792789e-05, + "loss": 0.5914, + "step": 24665 + }, + { + "epoch": 0.7199766525609222, + "grad_norm": 0.9299079241464092, + "learning_rate": 1.3977446257589572e-05, + "loss": 0.5925, + "step": 24670 + }, + { + "epoch": 0.7201225740551583, + "grad_norm": 1.114214367232458, + "learning_rate": 1.3968769841365552e-05, + "loss": 0.5944, + "step": 24675 + }, + { + "epoch": 0.7202684955493944, + "grad_norm": 0.9740034346974353, + "learning_rate": 1.3960096576141197e-05, + "loss": 0.5178, + "step": 24680 + }, + { + "epoch": 0.7204144170436305, + "grad_norm": 1.0794001317087405, + "learning_rate": 1.395142646393624e-05, + "loss": 0.5678, + "step": 24685 + }, + { + "epoch": 0.7205603385378666, + "grad_norm": 1.0489859868659006, + "learning_rate": 1.3942759506769676e-05, + "loss": 0.5185, + "step": 24690 + }, + { + "epoch": 0.7207062600321027, + "grad_norm": 1.0772334983786078, + "learning_rate": 1.3934095706659765e-05, + "loss": 0.5215, + "step": 24695 + }, + { + "epoch": 0.7208521815263388, + "grad_norm": 0.9102516968369698, + "learning_rate": 1.3925435065624038e-05, + "loss": 0.5407, + "step": 24700 + }, + { + "epoch": 0.7209981030205749, + "grad_norm": 1.279974120818073, + "learning_rate": 1.3916777585679275e-05, + "loss": 0.5325, + "step": 24705 + }, + { + "epoch": 0.7211440245148111, + "grad_norm": 0.9517698128184349, + "learning_rate": 1.3908123268841552e-05, + "loss": 0.5235, + "step": 24710 + }, + { + "epoch": 0.7212899460090472, + "grad_norm": 0.984386956873802, + "learning_rate": 1.3899472117126181e-05, + "loss": 0.5812, + "step": 24715 + }, + { + "epoch": 0.7214358675032833, + "grad_norm": 0.9897334991811357, + "learning_rate": 1.3890824132547742e-05, + "loss": 0.5646, + "step": 24720 + }, + { + "epoch": 0.7215817889975193, + "grad_norm": 1.1999808261371183, + "learning_rate": 1.388217931712007e-05, + "loss": 0.5999, + "step": 24725 + }, + { + "epoch": 0.7217277104917554, + "grad_norm": 1.1141004763722877, + "learning_rate": 1.387353767285629e-05, + "loss": 0.583, + "step": 24730 + }, + { + "epoch": 0.7218736319859915, + "grad_norm": 1.1531681898759512, + "learning_rate": 1.3864899201768761e-05, + "loss": 0.5033, + "step": 24735 + }, + { + "epoch": 0.7220195534802276, + "grad_norm": 1.165137918761426, + "learning_rate": 1.3856263905869113e-05, + "loss": 0.5505, + "step": 24740 + }, + { + "epoch": 0.7221654749744637, + "grad_norm": 0.9499678365747813, + "learning_rate": 1.3847631787168236e-05, + "loss": 0.5691, + "step": 24745 + }, + { + "epoch": 0.7223113964686998, + "grad_norm": 0.9832657700898059, + "learning_rate": 1.3839002847676275e-05, + "loss": 0.5458, + "step": 24750 + }, + { + "epoch": 0.7224573179629359, + "grad_norm": 1.0809557200461168, + "learning_rate": 1.3830377089402646e-05, + "loss": 0.5766, + "step": 24755 + }, + { + "epoch": 0.7226032394571721, + "grad_norm": 1.0343446224534398, + "learning_rate": 1.3821754514356025e-05, + "loss": 0.598, + "step": 24760 + }, + { + "epoch": 0.7227491609514082, + "grad_norm": 0.9274016911121137, + "learning_rate": 1.3813135124544327e-05, + "loss": 0.5128, + "step": 24765 + }, + { + "epoch": 0.7228950824456443, + "grad_norm": 0.9317734839774331, + "learning_rate": 1.380451892197474e-05, + "loss": 0.5276, + "step": 24770 + }, + { + "epoch": 0.7230410039398804, + "grad_norm": 0.954064454326678, + "learning_rate": 1.3795905908653706e-05, + "loss": 0.5685, + "step": 24775 + }, + { + "epoch": 0.7231869254341164, + "grad_norm": 1.1493734976006906, + "learning_rate": 1.3787296086586937e-05, + "loss": 0.5907, + "step": 24780 + }, + { + "epoch": 0.7233328469283525, + "grad_norm": 0.8968722600358542, + "learning_rate": 1.3778689457779384e-05, + "loss": 0.5311, + "step": 24785 + }, + { + "epoch": 0.7234787684225886, + "grad_norm": 1.1242738594814456, + "learning_rate": 1.3770086024235263e-05, + "loss": 0.6352, + "step": 24790 + }, + { + "epoch": 0.7236246899168247, + "grad_norm": 1.1527975654634566, + "learning_rate": 1.3761485787958028e-05, + "loss": 0.6154, + "step": 24795 + }, + { + "epoch": 0.7237706114110608, + "grad_norm": 1.0697957783709997, + "learning_rate": 1.3752888750950433e-05, + "loss": 0.4973, + "step": 24800 + }, + { + "epoch": 0.723916532905297, + "grad_norm": 0.9825699910886434, + "learning_rate": 1.3744294915214443e-05, + "loss": 0.5287, + "step": 24805 + }, + { + "epoch": 0.7240624543995331, + "grad_norm": 1.052330833100793, + "learning_rate": 1.3735704282751294e-05, + "loss": 0.5519, + "step": 24810 + }, + { + "epoch": 0.7242083758937692, + "grad_norm": 1.0415379392135926, + "learning_rate": 1.3727116855561478e-05, + "loss": 0.5386, + "step": 24815 + }, + { + "epoch": 0.7243542973880053, + "grad_norm": 1.0789041218116648, + "learning_rate": 1.3718532635644727e-05, + "loss": 0.5248, + "step": 24820 + }, + { + "epoch": 0.7245002188822414, + "grad_norm": 1.1541761863394766, + "learning_rate": 1.370995162500005e-05, + "loss": 0.51, + "step": 24825 + }, + { + "epoch": 0.7246461403764775, + "grad_norm": 0.8833589612047396, + "learning_rate": 1.3701373825625694e-05, + "loss": 0.5075, + "step": 24830 + }, + { + "epoch": 0.7247920618707135, + "grad_norm": 1.0177952056470108, + "learning_rate": 1.3692799239519155e-05, + "loss": 0.5251, + "step": 24835 + }, + { + "epoch": 0.7249379833649496, + "grad_norm": 1.0753824812576223, + "learning_rate": 1.3684227868677191e-05, + "loss": 0.4957, + "step": 24840 + }, + { + "epoch": 0.7250839048591857, + "grad_norm": 1.1592437380251366, + "learning_rate": 1.3675659715095793e-05, + "loss": 0.5679, + "step": 24845 + }, + { + "epoch": 0.7252298263534218, + "grad_norm": 1.1888215712744017, + "learning_rate": 1.3667094780770234e-05, + "loss": 0.5594, + "step": 24850 + }, + { + "epoch": 0.725375747847658, + "grad_norm": 1.0947373918738463, + "learning_rate": 1.3658533067695012e-05, + "loss": 0.5442, + "step": 24855 + }, + { + "epoch": 0.7255216693418941, + "grad_norm": 1.1977489526984082, + "learning_rate": 1.3649974577863871e-05, + "loss": 0.6208, + "step": 24860 + }, + { + "epoch": 0.7256675908361302, + "grad_norm": 1.044143912132242, + "learning_rate": 1.3641419313269824e-05, + "loss": 0.535, + "step": 24865 + }, + { + "epoch": 0.7258135123303663, + "grad_norm": 0.8902421415342231, + "learning_rate": 1.3632867275905129e-05, + "loss": 0.5404, + "step": 24870 + }, + { + "epoch": 0.7259594338246024, + "grad_norm": 1.0042582937693387, + "learning_rate": 1.3624318467761282e-05, + "loss": 0.4978, + "step": 24875 + }, + { + "epoch": 0.7261053553188385, + "grad_norm": 1.0522703978741763, + "learning_rate": 1.3615772890829034e-05, + "loss": 0.5685, + "step": 24880 + }, + { + "epoch": 0.7262512768130746, + "grad_norm": 0.913587632733783, + "learning_rate": 1.3607230547098382e-05, + "loss": 0.5397, + "step": 24885 + }, + { + "epoch": 0.7263971983073106, + "grad_norm": 0.9836496182677489, + "learning_rate": 1.3598691438558564e-05, + "loss": 0.5273, + "step": 24890 + }, + { + "epoch": 0.7265431198015467, + "grad_norm": 1.1476049492298055, + "learning_rate": 1.3590155567198082e-05, + "loss": 0.5263, + "step": 24895 + }, + { + "epoch": 0.7266890412957828, + "grad_norm": 1.2436486421592616, + "learning_rate": 1.358162293500466e-05, + "loss": 0.6255, + "step": 24900 + }, + { + "epoch": 0.726834962790019, + "grad_norm": 1.0366010603056368, + "learning_rate": 1.3573093543965304e-05, + "loss": 0.5713, + "step": 24905 + }, + { + "epoch": 0.7269808842842551, + "grad_norm": 1.0646455422092935, + "learning_rate": 1.3564567396066213e-05, + "loss": 0.4962, + "step": 24910 + }, + { + "epoch": 0.7271268057784912, + "grad_norm": 1.0905562208383737, + "learning_rate": 1.355604449329288e-05, + "loss": 0.5693, + "step": 24915 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.9392806608497347, + "learning_rate": 1.3547524837630013e-05, + "loss": 0.5278, + "step": 24920 + }, + { + "epoch": 0.7274186487669634, + "grad_norm": 1.245128905418762, + "learning_rate": 1.3539008431061576e-05, + "loss": 0.56, + "step": 24925 + }, + { + "epoch": 0.7275645702611995, + "grad_norm": 0.9882712513203719, + "learning_rate": 1.3530495275570773e-05, + "loss": 0.5162, + "step": 24930 + }, + { + "epoch": 0.7277104917554356, + "grad_norm": 1.0235677160058934, + "learning_rate": 1.3521985373140046e-05, + "loss": 0.4985, + "step": 24935 + }, + { + "epoch": 0.7278564132496717, + "grad_norm": 1.260192405412068, + "learning_rate": 1.3513478725751094e-05, + "loss": 0.5742, + "step": 24940 + }, + { + "epoch": 0.7280023347439077, + "grad_norm": 1.0147625933917526, + "learning_rate": 1.3504975335384845e-05, + "loss": 0.5328, + "step": 24945 + }, + { + "epoch": 0.7281482562381438, + "grad_norm": 1.0419171675453127, + "learning_rate": 1.3496475204021472e-05, + "loss": 0.5402, + "step": 24950 + }, + { + "epoch": 0.72829417773238, + "grad_norm": 0.9945950867103498, + "learning_rate": 1.3487978333640391e-05, + "loss": 0.5159, + "step": 24955 + }, + { + "epoch": 0.7284400992266161, + "grad_norm": 1.0193792119020708, + "learning_rate": 1.3479484726220245e-05, + "loss": 0.5992, + "step": 24960 + }, + { + "epoch": 0.7285860207208522, + "grad_norm": 1.1210378157260457, + "learning_rate": 1.3470994383738946e-05, + "loss": 0.5718, + "step": 24965 + }, + { + "epoch": 0.7287319422150883, + "grad_norm": 0.9106248327010925, + "learning_rate": 1.3462507308173616e-05, + "loss": 0.5048, + "step": 24970 + }, + { + "epoch": 0.7288778637093244, + "grad_norm": 1.137208457783271, + "learning_rate": 1.345402350150065e-05, + "loss": 0.5303, + "step": 24975 + }, + { + "epoch": 0.7290237852035605, + "grad_norm": 1.0232967254671586, + "learning_rate": 1.344554296569563e-05, + "loss": 0.5517, + "step": 24980 + }, + { + "epoch": 0.7291697066977966, + "grad_norm": 1.084303467882424, + "learning_rate": 1.343706570273343e-05, + "loss": 0.5613, + "step": 24985 + }, + { + "epoch": 0.7293156281920327, + "grad_norm": 1.048543664385041, + "learning_rate": 1.3428591714588134e-05, + "loss": 0.5635, + "step": 24990 + }, + { + "epoch": 0.7294615496862688, + "grad_norm": 0.9990611698961829, + "learning_rate": 1.3420121003233055e-05, + "loss": 0.5502, + "step": 24995 + }, + { + "epoch": 0.7296074711805048, + "grad_norm": 1.1208309789969972, + "learning_rate": 1.3411653570640786e-05, + "loss": 0.5497, + "step": 25000 + }, + { + "epoch": 0.729753392674741, + "grad_norm": 1.1804782246716283, + "learning_rate": 1.3403189418783093e-05, + "loss": 0.5699, + "step": 25005 + }, + { + "epoch": 0.7298993141689771, + "grad_norm": 1.2854738742147318, + "learning_rate": 1.3394728549631035e-05, + "loss": 0.5345, + "step": 25010 + }, + { + "epoch": 0.7300452356632132, + "grad_norm": 0.8898250793964284, + "learning_rate": 1.3386270965154874e-05, + "loss": 0.5171, + "step": 25015 + }, + { + "epoch": 0.7301911571574493, + "grad_norm": 0.9419518674295984, + "learning_rate": 1.3377816667324116e-05, + "loss": 0.5343, + "step": 25020 + }, + { + "epoch": 0.7303370786516854, + "grad_norm": 1.0704773079951875, + "learning_rate": 1.336936565810751e-05, + "loss": 0.5452, + "step": 25025 + }, + { + "epoch": 0.7304830001459215, + "grad_norm": 1.1023298964262838, + "learning_rate": 1.3360917939473016e-05, + "loss": 0.5448, + "step": 25030 + }, + { + "epoch": 0.7306289216401576, + "grad_norm": 0.9900842006034813, + "learning_rate": 1.335247351338786e-05, + "loss": 0.5279, + "step": 25035 + }, + { + "epoch": 0.7307748431343937, + "grad_norm": 0.9998328715450364, + "learning_rate": 1.3344032381818469e-05, + "loss": 0.5159, + "step": 25040 + }, + { + "epoch": 0.7309207646286298, + "grad_norm": 1.0287709411333383, + "learning_rate": 1.3335594546730545e-05, + "loss": 0.5165, + "step": 25045 + }, + { + "epoch": 0.7310666861228658, + "grad_norm": 1.2239768990007813, + "learning_rate": 1.3327160010088958e-05, + "loss": 0.612, + "step": 25050 + }, + { + "epoch": 0.731212607617102, + "grad_norm": 0.992710013363525, + "learning_rate": 1.3318728773857874e-05, + "loss": 0.5674, + "step": 25055 + }, + { + "epoch": 0.7313585291113381, + "grad_norm": 0.9288355374173342, + "learning_rate": 1.3310300840000656e-05, + "loss": 0.542, + "step": 25060 + }, + { + "epoch": 0.7315044506055742, + "grad_norm": 0.9629195865983087, + "learning_rate": 1.3301876210479896e-05, + "loss": 0.5816, + "step": 25065 + }, + { + "epoch": 0.7316503720998103, + "grad_norm": 1.0422849435404384, + "learning_rate": 1.3293454887257456e-05, + "loss": 0.5501, + "step": 25070 + }, + { + "epoch": 0.7317962935940464, + "grad_norm": 0.9495058815796161, + "learning_rate": 1.3285036872294359e-05, + "loss": 0.5814, + "step": 25075 + }, + { + "epoch": 0.7319422150882825, + "grad_norm": 1.0803645519281164, + "learning_rate": 1.3276622167550928e-05, + "loss": 0.5822, + "step": 25080 + }, + { + "epoch": 0.7320881365825186, + "grad_norm": 0.9533636908469251, + "learning_rate": 1.3268210774986673e-05, + "loss": 0.5767, + "step": 25085 + }, + { + "epoch": 0.7322340580767547, + "grad_norm": 1.0520121643432652, + "learning_rate": 1.3259802696560333e-05, + "loss": 0.5609, + "step": 25090 + }, + { + "epoch": 0.7323799795709908, + "grad_norm": 1.0187550732776751, + "learning_rate": 1.3251397934229914e-05, + "loss": 0.5612, + "step": 25095 + }, + { + "epoch": 0.7325259010652269, + "grad_norm": 0.9645179245506396, + "learning_rate": 1.3242996489952585e-05, + "loss": 0.5555, + "step": 25100 + }, + { + "epoch": 0.7326718225594631, + "grad_norm": 1.0592009922412269, + "learning_rate": 1.3234598365684808e-05, + "loss": 0.509, + "step": 25105 + }, + { + "epoch": 0.7328177440536991, + "grad_norm": 1.2442606341880318, + "learning_rate": 1.322620356338223e-05, + "loss": 0.5082, + "step": 25110 + }, + { + "epoch": 0.7329636655479352, + "grad_norm": 1.0217467282651531, + "learning_rate": 1.3217812084999753e-05, + "loss": 0.5759, + "step": 25115 + }, + { + "epoch": 0.7331095870421713, + "grad_norm": 1.0329878328551165, + "learning_rate": 1.3209423932491466e-05, + "loss": 0.5197, + "step": 25120 + }, + { + "epoch": 0.7332555085364074, + "grad_norm": 1.1815872200404214, + "learning_rate": 1.3201039107810722e-05, + "loss": 0.5559, + "step": 25125 + }, + { + "epoch": 0.7334014300306435, + "grad_norm": 1.0587165280268696, + "learning_rate": 1.3192657612910083e-05, + "loss": 0.4893, + "step": 25130 + }, + { + "epoch": 0.7335473515248796, + "grad_norm": 1.0176391640366607, + "learning_rate": 1.3184279449741326e-05, + "loss": 0.5606, + "step": 25135 + }, + { + "epoch": 0.7336932730191157, + "grad_norm": 0.9573205379674394, + "learning_rate": 1.3175904620255486e-05, + "loss": 0.4937, + "step": 25140 + }, + { + "epoch": 0.7338391945133518, + "grad_norm": 0.9887979990747906, + "learning_rate": 1.316753312640277e-05, + "loss": 0.5585, + "step": 25145 + }, + { + "epoch": 0.7339851160075879, + "grad_norm": 0.9923632007516789, + "learning_rate": 1.315916497013266e-05, + "loss": 0.5518, + "step": 25150 + }, + { + "epoch": 0.7341310375018241, + "grad_norm": 1.0263889728903777, + "learning_rate": 1.3150800153393816e-05, + "loss": 0.5587, + "step": 25155 + }, + { + "epoch": 0.7342769589960602, + "grad_norm": 0.9832763646765168, + "learning_rate": 1.3142438678134161e-05, + "loss": 0.6034, + "step": 25160 + }, + { + "epoch": 0.7344228804902962, + "grad_norm": 0.9377201914182386, + "learning_rate": 1.3134080546300815e-05, + "loss": 0.5057, + "step": 25165 + }, + { + "epoch": 0.7345688019845323, + "grad_norm": 1.1672845600917523, + "learning_rate": 1.3125725759840123e-05, + "loss": 0.5072, + "step": 25170 + }, + { + "epoch": 0.7347147234787684, + "grad_norm": 1.0341457328193873, + "learning_rate": 1.3117374320697656e-05, + "loss": 0.5964, + "step": 25175 + }, + { + "epoch": 0.7348606449730045, + "grad_norm": 1.095503482067089, + "learning_rate": 1.310902623081819e-05, + "loss": 0.576, + "step": 25180 + }, + { + "epoch": 0.7350065664672406, + "grad_norm": 0.9048064511447627, + "learning_rate": 1.3100681492145757e-05, + "loss": 0.4863, + "step": 25185 + }, + { + "epoch": 0.7351524879614767, + "grad_norm": 0.9930625815555961, + "learning_rate": 1.309234010662356e-05, + "loss": 0.5317, + "step": 25190 + }, + { + "epoch": 0.7352984094557128, + "grad_norm": 0.947380651500713, + "learning_rate": 1.3084002076194065e-05, + "loss": 0.5415, + "step": 25195 + }, + { + "epoch": 0.7354443309499489, + "grad_norm": 1.2026570428396774, + "learning_rate": 1.3075667402798931e-05, + "loss": 0.6066, + "step": 25200 + }, + { + "epoch": 0.7355902524441851, + "grad_norm": 1.2133817124368318, + "learning_rate": 1.3067336088379034e-05, + "loss": 0.5386, + "step": 25205 + }, + { + "epoch": 0.7357361739384212, + "grad_norm": 1.0464365770984325, + "learning_rate": 1.3059008134874503e-05, + "loss": 0.5677, + "step": 25210 + }, + { + "epoch": 0.7358820954326573, + "grad_norm": 1.0467130848976993, + "learning_rate": 1.3050683544224624e-05, + "loss": 0.5598, + "step": 25215 + }, + { + "epoch": 0.7360280169268933, + "grad_norm": 0.9610077681154812, + "learning_rate": 1.3042362318367953e-05, + "loss": 0.5113, + "step": 25220 + }, + { + "epoch": 0.7361739384211294, + "grad_norm": 1.010748113549356, + "learning_rate": 1.3034044459242234e-05, + "loss": 0.5413, + "step": 25225 + }, + { + "epoch": 0.7363198599153655, + "grad_norm": 0.9593836407782844, + "learning_rate": 1.3025729968784447e-05, + "loss": 0.4786, + "step": 25230 + }, + { + "epoch": 0.7364657814096016, + "grad_norm": 1.01907740885405, + "learning_rate": 1.3017418848930773e-05, + "loss": 0.5506, + "step": 25235 + }, + { + "epoch": 0.7366117029038377, + "grad_norm": 0.9389540561197258, + "learning_rate": 1.3009111101616605e-05, + "loss": 0.5393, + "step": 25240 + }, + { + "epoch": 0.7367576243980738, + "grad_norm": 0.9366584227149366, + "learning_rate": 1.300080672877656e-05, + "loss": 0.5526, + "step": 25245 + }, + { + "epoch": 0.7369035458923099, + "grad_norm": 1.0513284063906845, + "learning_rate": 1.2992505732344463e-05, + "loss": 0.5371, + "step": 25250 + }, + { + "epoch": 0.7370494673865461, + "grad_norm": 1.0595101507416727, + "learning_rate": 1.2984208114253366e-05, + "loss": 0.514, + "step": 25255 + }, + { + "epoch": 0.7371953888807822, + "grad_norm": 0.9967972199191502, + "learning_rate": 1.2975913876435519e-05, + "loss": 0.5836, + "step": 25260 + }, + { + "epoch": 0.7373413103750183, + "grad_norm": 1.0115150034897316, + "learning_rate": 1.296762302082239e-05, + "loss": 0.5321, + "step": 25265 + }, + { + "epoch": 0.7374872318692544, + "grad_norm": 0.9694402181154759, + "learning_rate": 1.2959335549344657e-05, + "loss": 0.5051, + "step": 25270 + }, + { + "epoch": 0.7376331533634904, + "grad_norm": 1.2303331437095986, + "learning_rate": 1.2951051463932207e-05, + "loss": 0.5237, + "step": 25275 + }, + { + "epoch": 0.7377790748577265, + "grad_norm": 0.9520640601815741, + "learning_rate": 1.2942770766514167e-05, + "loss": 0.5198, + "step": 25280 + }, + { + "epoch": 0.7379249963519626, + "grad_norm": 0.9773029328247239, + "learning_rate": 1.293449345901882e-05, + "loss": 0.541, + "step": 25285 + }, + { + "epoch": 0.7380709178461987, + "grad_norm": 1.2470725909333529, + "learning_rate": 1.2926219543373718e-05, + "loss": 0.5776, + "step": 25290 + }, + { + "epoch": 0.7382168393404348, + "grad_norm": 1.141465175644101, + "learning_rate": 1.2917949021505582e-05, + "loss": 0.5972, + "step": 25295 + }, + { + "epoch": 0.7383627608346709, + "grad_norm": 0.9609788573037573, + "learning_rate": 1.2909681895340367e-05, + "loss": 0.5649, + "step": 25300 + }, + { + "epoch": 0.7385086823289071, + "grad_norm": 1.0364169636451106, + "learning_rate": 1.2901418166803223e-05, + "loss": 0.5828, + "step": 25305 + }, + { + "epoch": 0.7386546038231432, + "grad_norm": 0.9538950234671885, + "learning_rate": 1.2893157837818514e-05, + "loss": 0.5583, + "step": 25310 + }, + { + "epoch": 0.7388005253173793, + "grad_norm": 1.0643406247670861, + "learning_rate": 1.2884900910309813e-05, + "loss": 0.6018, + "step": 25315 + }, + { + "epoch": 0.7389464468116154, + "grad_norm": 1.0262789379219452, + "learning_rate": 1.2876647386199892e-05, + "loss": 0.5783, + "step": 25320 + }, + { + "epoch": 0.7390923683058515, + "grad_norm": 1.1026771368815542, + "learning_rate": 1.286839726741075e-05, + "loss": 0.5665, + "step": 25325 + }, + { + "epoch": 0.7392382898000875, + "grad_norm": 0.924100406038911, + "learning_rate": 1.2860150555863577e-05, + "loss": 0.5723, + "step": 25330 + }, + { + "epoch": 0.7393842112943236, + "grad_norm": 1.0483367425838177, + "learning_rate": 1.2851907253478774e-05, + "loss": 0.5323, + "step": 25335 + }, + { + "epoch": 0.7395301327885597, + "grad_norm": 1.0042384594865397, + "learning_rate": 1.2843667362175937e-05, + "loss": 0.5512, + "step": 25340 + }, + { + "epoch": 0.7396760542827958, + "grad_norm": 0.9384001947194012, + "learning_rate": 1.2835430883873898e-05, + "loss": 0.507, + "step": 25345 + }, + { + "epoch": 0.7398219757770319, + "grad_norm": 1.0745238063295024, + "learning_rate": 1.282719782049066e-05, + "loss": 0.565, + "step": 25350 + }, + { + "epoch": 0.7399678972712681, + "grad_norm": 1.090054603605742, + "learning_rate": 1.2818968173943457e-05, + "loss": 0.5802, + "step": 25355 + }, + { + "epoch": 0.7401138187655042, + "grad_norm": 1.1182360724752936, + "learning_rate": 1.2810741946148707e-05, + "loss": 0.5232, + "step": 25360 + }, + { + "epoch": 0.7402597402597403, + "grad_norm": 1.0473042664186767, + "learning_rate": 1.2802519139022034e-05, + "loss": 0.5216, + "step": 25365 + }, + { + "epoch": 0.7404056617539764, + "grad_norm": 1.0216923113681062, + "learning_rate": 1.279429975447829e-05, + "loss": 0.5621, + "step": 25370 + }, + { + "epoch": 0.7405515832482125, + "grad_norm": 1.092407081871658, + "learning_rate": 1.2786083794431505e-05, + "loss": 0.5767, + "step": 25375 + }, + { + "epoch": 0.7406975047424486, + "grad_norm": 0.9090653668478144, + "learning_rate": 1.2777871260794918e-05, + "loss": 0.5293, + "step": 25380 + }, + { + "epoch": 0.7408434262366846, + "grad_norm": 1.1172868093720418, + "learning_rate": 1.2769662155480968e-05, + "loss": 0.5669, + "step": 25385 + }, + { + "epoch": 0.7409893477309207, + "grad_norm": 1.1706064875136633, + "learning_rate": 1.2761456480401296e-05, + "loss": 0.5388, + "step": 25390 + }, + { + "epoch": 0.7411352692251568, + "grad_norm": 0.9929775739484752, + "learning_rate": 1.275325423746676e-05, + "loss": 0.5389, + "step": 25395 + }, + { + "epoch": 0.7412811907193929, + "grad_norm": 1.3032373806818933, + "learning_rate": 1.2745055428587393e-05, + "loss": 0.565, + "step": 25400 + }, + { + "epoch": 0.7414271122136291, + "grad_norm": 1.1310822533461034, + "learning_rate": 1.2736860055672447e-05, + "loss": 0.5667, + "step": 25405 + }, + { + "epoch": 0.7415730337078652, + "grad_norm": 1.126759467635421, + "learning_rate": 1.272866812063036e-05, + "loss": 0.6085, + "step": 25410 + }, + { + "epoch": 0.7417189552021013, + "grad_norm": 0.945681056225867, + "learning_rate": 1.2720479625368786e-05, + "loss": 0.5156, + "step": 25415 + }, + { + "epoch": 0.7418648766963374, + "grad_norm": 1.2618692978413297, + "learning_rate": 1.271229457179457e-05, + "loss": 0.597, + "step": 25420 + }, + { + "epoch": 0.7420107981905735, + "grad_norm": 1.043740837656037, + "learning_rate": 1.2704112961813747e-05, + "loss": 0.5383, + "step": 25425 + }, + { + "epoch": 0.7421567196848096, + "grad_norm": 1.0465747646212846, + "learning_rate": 1.269593479733156e-05, + "loss": 0.5424, + "step": 25430 + }, + { + "epoch": 0.7423026411790457, + "grad_norm": 0.8664855832434459, + "learning_rate": 1.2687760080252443e-05, + "loss": 0.4679, + "step": 25435 + }, + { + "epoch": 0.7424485626732817, + "grad_norm": 0.8744408774993557, + "learning_rate": 1.2679588812480043e-05, + "loss": 0.5036, + "step": 25440 + }, + { + "epoch": 0.7425944841675178, + "grad_norm": 1.031492824204485, + "learning_rate": 1.2671420995917185e-05, + "loss": 0.5321, + "step": 25445 + }, + { + "epoch": 0.7427404056617539, + "grad_norm": 0.9338973390288998, + "learning_rate": 1.2663256632465903e-05, + "loss": 0.525, + "step": 25450 + }, + { + "epoch": 0.7428863271559901, + "grad_norm": 1.2091478700003864, + "learning_rate": 1.2655095724027414e-05, + "loss": 0.5758, + "step": 25455 + }, + { + "epoch": 0.7430322486502262, + "grad_norm": 1.1075708304779006, + "learning_rate": 1.2646938272502134e-05, + "loss": 0.5101, + "step": 25460 + }, + { + "epoch": 0.7431781701444623, + "grad_norm": 1.0104157537561222, + "learning_rate": 1.263878427978969e-05, + "loss": 0.486, + "step": 25465 + }, + { + "epoch": 0.7433240916386984, + "grad_norm": 0.9563619346991841, + "learning_rate": 1.2630633747788891e-05, + "loss": 0.4798, + "step": 25470 + }, + { + "epoch": 0.7434700131329345, + "grad_norm": 1.118596412294096, + "learning_rate": 1.2622486678397732e-05, + "loss": 0.5559, + "step": 25475 + }, + { + "epoch": 0.7436159346271706, + "grad_norm": 1.0867858788235167, + "learning_rate": 1.2614343073513413e-05, + "loss": 0.5971, + "step": 25480 + }, + { + "epoch": 0.7437618561214067, + "grad_norm": 1.1108414145235843, + "learning_rate": 1.260620293503233e-05, + "loss": 0.4884, + "step": 25485 + }, + { + "epoch": 0.7439077776156428, + "grad_norm": 1.0101355416992428, + "learning_rate": 1.2598066264850061e-05, + "loss": 0.5085, + "step": 25490 + }, + { + "epoch": 0.7440536991098788, + "grad_norm": 1.1083711609452138, + "learning_rate": 1.2589933064861382e-05, + "loss": 0.5699, + "step": 25495 + }, + { + "epoch": 0.7441996206041149, + "grad_norm": 1.0628758072733857, + "learning_rate": 1.2581803336960262e-05, + "loss": 0.5774, + "step": 25500 + }, + { + "epoch": 0.7443455420983511, + "grad_norm": 0.9367883689633545, + "learning_rate": 1.2573677083039856e-05, + "loss": 0.5218, + "step": 25505 + }, + { + "epoch": 0.7444914635925872, + "grad_norm": 1.2783136425482464, + "learning_rate": 1.2565554304992522e-05, + "loss": 0.6178, + "step": 25510 + }, + { + "epoch": 0.7446373850868233, + "grad_norm": 0.9114066630977176, + "learning_rate": 1.2557435004709796e-05, + "loss": 0.5118, + "step": 25515 + }, + { + "epoch": 0.7447833065810594, + "grad_norm": 0.8338505604524789, + "learning_rate": 1.2549319184082408e-05, + "loss": 0.5539, + "step": 25520 + }, + { + "epoch": 0.7449292280752955, + "grad_norm": 0.9797838951828499, + "learning_rate": 1.254120684500028e-05, + "loss": 0.5078, + "step": 25525 + }, + { + "epoch": 0.7450751495695316, + "grad_norm": 1.0069589685082077, + "learning_rate": 1.2533097989352513e-05, + "loss": 0.5356, + "step": 25530 + }, + { + "epoch": 0.7452210710637677, + "grad_norm": 1.0403052166121876, + "learning_rate": 1.252499261902742e-05, + "loss": 0.5645, + "step": 25535 + }, + { + "epoch": 0.7453669925580038, + "grad_norm": 1.0450398430405794, + "learning_rate": 1.2516890735912477e-05, + "loss": 0.5561, + "step": 25540 + }, + { + "epoch": 0.7455129140522399, + "grad_norm": 0.9924120042741277, + "learning_rate": 1.250879234189437e-05, + "loss": 0.5514, + "step": 25545 + }, + { + "epoch": 0.745658835546476, + "grad_norm": 0.9063258798785097, + "learning_rate": 1.250069743885895e-05, + "loss": 0.5632, + "step": 25550 + }, + { + "epoch": 0.7458047570407121, + "grad_norm": 1.0469000263831814, + "learning_rate": 1.249260602869127e-05, + "loss": 0.5257, + "step": 25555 + }, + { + "epoch": 0.7459506785349482, + "grad_norm": 0.9803628707261173, + "learning_rate": 1.248451811327557e-05, + "loss": 0.5637, + "step": 25560 + }, + { + "epoch": 0.7460966000291843, + "grad_norm": 1.1460900109108125, + "learning_rate": 1.2476433694495266e-05, + "loss": 0.5822, + "step": 25565 + }, + { + "epoch": 0.7462425215234204, + "grad_norm": 1.0089698183451101, + "learning_rate": 1.246835277423297e-05, + "loss": 0.5512, + "step": 25570 + }, + { + "epoch": 0.7463884430176565, + "grad_norm": 0.9418966492405134, + "learning_rate": 1.2460275354370467e-05, + "loss": 0.6279, + "step": 25575 + }, + { + "epoch": 0.7465343645118926, + "grad_norm": 1.075479539471517, + "learning_rate": 1.2452201436788748e-05, + "loss": 0.4726, + "step": 25580 + }, + { + "epoch": 0.7466802860061287, + "grad_norm": 1.034681224530147, + "learning_rate": 1.2444131023367962e-05, + "loss": 0.5227, + "step": 25585 + }, + { + "epoch": 0.7468262075003648, + "grad_norm": 1.0711723820905, + "learning_rate": 1.243606411598748e-05, + "loss": 0.5799, + "step": 25590 + }, + { + "epoch": 0.7469721289946009, + "grad_norm": 1.0895783824566092, + "learning_rate": 1.2428000716525799e-05, + "loss": 0.5761, + "step": 25595 + }, + { + "epoch": 0.7471180504888371, + "grad_norm": 0.9819716620730959, + "learning_rate": 1.2419940826860655e-05, + "loss": 0.5177, + "step": 25600 + }, + { + "epoch": 0.7472639719830731, + "grad_norm": 0.8782455978579036, + "learning_rate": 1.2411884448868937e-05, + "loss": 0.5308, + "step": 25605 + }, + { + "epoch": 0.7474098934773092, + "grad_norm": 1.1614015957066017, + "learning_rate": 1.2403831584426716e-05, + "loss": 0.539, + "step": 25610 + }, + { + "epoch": 0.7475558149715453, + "grad_norm": 0.8748345954958823, + "learning_rate": 1.2395782235409274e-05, + "loss": 0.5738, + "step": 25615 + }, + { + "epoch": 0.7477017364657814, + "grad_norm": 0.9710031945082617, + "learning_rate": 1.2387736403691019e-05, + "loss": 0.5729, + "step": 25620 + }, + { + "epoch": 0.7478476579600175, + "grad_norm": 1.0450618853947413, + "learning_rate": 1.2379694091145602e-05, + "loss": 0.5896, + "step": 25625 + }, + { + "epoch": 0.7479935794542536, + "grad_norm": 1.003233099959815, + "learning_rate": 1.2371655299645815e-05, + "loss": 0.4996, + "step": 25630 + }, + { + "epoch": 0.7481395009484897, + "grad_norm": 0.9860045169036421, + "learning_rate": 1.2363620031063638e-05, + "loss": 0.5406, + "step": 25635 + }, + { + "epoch": 0.7482854224427258, + "grad_norm": 1.1722039167321137, + "learning_rate": 1.235558828727024e-05, + "loss": 0.6086, + "step": 25640 + }, + { + "epoch": 0.7484313439369619, + "grad_norm": 1.3099115426647863, + "learning_rate": 1.2347560070135948e-05, + "loss": 0.5371, + "step": 25645 + }, + { + "epoch": 0.7485772654311981, + "grad_norm": 0.8645467297104173, + "learning_rate": 1.2339535381530302e-05, + "loss": 0.5392, + "step": 25650 + }, + { + "epoch": 0.7487231869254342, + "grad_norm": 0.90651514980129, + "learning_rate": 1.2331514223321985e-05, + "loss": 0.496, + "step": 25655 + }, + { + "epoch": 0.7488691084196702, + "grad_norm": 1.0756932529184244, + "learning_rate": 1.2323496597378892e-05, + "loss": 0.5699, + "step": 25660 + }, + { + "epoch": 0.7490150299139063, + "grad_norm": 0.9666964534861574, + "learning_rate": 1.2315482505568052e-05, + "loss": 0.5679, + "step": 25665 + }, + { + "epoch": 0.7491609514081424, + "grad_norm": 1.0970593532615356, + "learning_rate": 1.2307471949755719e-05, + "loss": 0.5727, + "step": 25670 + }, + { + "epoch": 0.7493068729023785, + "grad_norm": 1.0683184475371457, + "learning_rate": 1.2299464931807286e-05, + "loss": 0.6314, + "step": 25675 + }, + { + "epoch": 0.7494527943966146, + "grad_norm": 0.9568494891869519, + "learning_rate": 1.2291461453587335e-05, + "loss": 0.5228, + "step": 25680 + }, + { + "epoch": 0.7495987158908507, + "grad_norm": 1.117201976766499, + "learning_rate": 1.2283461516959648e-05, + "loss": 0.5623, + "step": 25685 + }, + { + "epoch": 0.7497446373850868, + "grad_norm": 0.9464249873501117, + "learning_rate": 1.2275465123787128e-05, + "loss": 0.5471, + "step": 25690 + }, + { + "epoch": 0.7498905588793229, + "grad_norm": 0.9907759266612338, + "learning_rate": 1.2267472275931905e-05, + "loss": 0.4934, + "step": 25695 + }, + { + "epoch": 0.7500364803735591, + "grad_norm": 0.9282906386658726, + "learning_rate": 1.225948297525526e-05, + "loss": 0.4944, + "step": 25700 + }, + { + "epoch": 0.7501824018677952, + "grad_norm": 1.0302221469690025, + "learning_rate": 1.225149722361764e-05, + "loss": 0.5713, + "step": 25705 + }, + { + "epoch": 0.7503283233620313, + "grad_norm": 0.9536677406356936, + "learning_rate": 1.2243515022878699e-05, + "loss": 0.4942, + "step": 25710 + }, + { + "epoch": 0.7504742448562673, + "grad_norm": 1.122923194063469, + "learning_rate": 1.2235536374897215e-05, + "loss": 0.5914, + "step": 25715 + }, + { + "epoch": 0.7506201663505034, + "grad_norm": 1.0219895713651888, + "learning_rate": 1.2227561281531183e-05, + "loss": 0.5612, + "step": 25720 + }, + { + "epoch": 0.7507660878447395, + "grad_norm": 1.0491126926372052, + "learning_rate": 1.221958974463774e-05, + "loss": 0.593, + "step": 25725 + }, + { + "epoch": 0.7509120093389756, + "grad_norm": 0.9893507325396295, + "learning_rate": 1.2211621766073229e-05, + "loss": 0.4979, + "step": 25730 + }, + { + "epoch": 0.7510579308332117, + "grad_norm": 1.1725701414719172, + "learning_rate": 1.2203657347693115e-05, + "loss": 0.5798, + "step": 25735 + }, + { + "epoch": 0.7512038523274478, + "grad_norm": 0.863698238264407, + "learning_rate": 1.219569649135208e-05, + "loss": 0.5066, + "step": 25740 + }, + { + "epoch": 0.7513497738216839, + "grad_norm": 1.0188583985408846, + "learning_rate": 1.2187739198903953e-05, + "loss": 0.5638, + "step": 25745 + }, + { + "epoch": 0.7514956953159201, + "grad_norm": 0.979049262163374, + "learning_rate": 1.217978547220173e-05, + "loss": 0.5013, + "step": 25750 + }, + { + "epoch": 0.7516416168101562, + "grad_norm": 0.962044520822094, + "learning_rate": 1.217183531309761e-05, + "loss": 0.4915, + "step": 25755 + }, + { + "epoch": 0.7517875383043923, + "grad_norm": 0.9815783493050788, + "learning_rate": 1.21638887234429e-05, + "loss": 0.5138, + "step": 25760 + }, + { + "epoch": 0.7519334597986284, + "grad_norm": 1.098760217680226, + "learning_rate": 1.215594570508814e-05, + "loss": 0.5442, + "step": 25765 + }, + { + "epoch": 0.7520793812928644, + "grad_norm": 1.1703149140345381, + "learning_rate": 1.2148006259883001e-05, + "loss": 0.6309, + "step": 25770 + }, + { + "epoch": 0.7522253027871005, + "grad_norm": 0.9750847021164261, + "learning_rate": 1.2140070389676322e-05, + "loss": 0.565, + "step": 25775 + }, + { + "epoch": 0.7523712242813366, + "grad_norm": 1.0192018431041778, + "learning_rate": 1.2132138096316141e-05, + "loss": 0.5489, + "step": 25780 + }, + { + "epoch": 0.7525171457755727, + "grad_norm": 1.1034162032635708, + "learning_rate": 1.2124209381649615e-05, + "loss": 0.5704, + "step": 25785 + }, + { + "epoch": 0.7526630672698088, + "grad_norm": 1.0089886523816638, + "learning_rate": 1.2116284247523111e-05, + "loss": 0.5292, + "step": 25790 + }, + { + "epoch": 0.7528089887640449, + "grad_norm": 0.9485782122555003, + "learning_rate": 1.2108362695782133e-05, + "loss": 0.5778, + "step": 25795 + }, + { + "epoch": 0.7529549102582811, + "grad_norm": 1.1223467935927802, + "learning_rate": 1.2100444728271373e-05, + "loss": 0.5411, + "step": 25800 + }, + { + "epoch": 0.7531008317525172, + "grad_norm": 1.0289524117390865, + "learning_rate": 1.2092530346834676e-05, + "loss": 0.4867, + "step": 25805 + }, + { + "epoch": 0.7532467532467533, + "grad_norm": 1.0984035194440613, + "learning_rate": 1.2084619553315045e-05, + "loss": 0.5269, + "step": 25810 + }, + { + "epoch": 0.7533926747409894, + "grad_norm": 1.04075958371763, + "learning_rate": 1.2076712349554665e-05, + "loss": 0.5591, + "step": 25815 + }, + { + "epoch": 0.7535385962352255, + "grad_norm": 0.9385249767136463, + "learning_rate": 1.2068808737394865e-05, + "loss": 0.5413, + "step": 25820 + }, + { + "epoch": 0.7536845177294615, + "grad_norm": 0.9094181827674642, + "learning_rate": 1.2060908718676168e-05, + "loss": 0.4641, + "step": 25825 + }, + { + "epoch": 0.7538304392236976, + "grad_norm": 1.164689036327355, + "learning_rate": 1.2053012295238217e-05, + "loss": 0.5803, + "step": 25830 + }, + { + "epoch": 0.7539763607179337, + "grad_norm": 1.1286062858097257, + "learning_rate": 1.204511946891986e-05, + "loss": 0.56, + "step": 25835 + }, + { + "epoch": 0.7541222822121698, + "grad_norm": 1.1097275886625522, + "learning_rate": 1.2037230241559074e-05, + "loss": 0.5715, + "step": 25840 + }, + { + "epoch": 0.7542682037064059, + "grad_norm": 1.0564247130739828, + "learning_rate": 1.202934461499303e-05, + "loss": 0.5334, + "step": 25845 + }, + { + "epoch": 0.7544141252006421, + "grad_norm": 0.9972303161086802, + "learning_rate": 1.2021462591058037e-05, + "loss": 0.5436, + "step": 25850 + }, + { + "epoch": 0.7545600466948782, + "grad_norm": 1.169238260709802, + "learning_rate": 1.2013584171589565e-05, + "loss": 0.5476, + "step": 25855 + }, + { + "epoch": 0.7547059681891143, + "grad_norm": 1.0615169711425816, + "learning_rate": 1.2005709358422258e-05, + "loss": 0.5662, + "step": 25860 + }, + { + "epoch": 0.7548518896833504, + "grad_norm": 1.0080620948780274, + "learning_rate": 1.1997838153389904e-05, + "loss": 0.5513, + "step": 25865 + }, + { + "epoch": 0.7549978111775865, + "grad_norm": 0.9875728082709048, + "learning_rate": 1.1989970558325475e-05, + "loss": 0.4737, + "step": 25870 + }, + { + "epoch": 0.7551437326718226, + "grad_norm": 0.9693294617578385, + "learning_rate": 1.1982106575061076e-05, + "loss": 0.5388, + "step": 25875 + }, + { + "epoch": 0.7552896541660586, + "grad_norm": 0.987254170263494, + "learning_rate": 1.1974246205427989e-05, + "loss": 0.529, + "step": 25880 + }, + { + "epoch": 0.7554355756602947, + "grad_norm": 1.093244516130019, + "learning_rate": 1.1966389451256644e-05, + "loss": 0.5386, + "step": 25885 + }, + { + "epoch": 0.7555814971545308, + "grad_norm": 1.0410642537186547, + "learning_rate": 1.1958536314376625e-05, + "loss": 0.5726, + "step": 25890 + }, + { + "epoch": 0.7557274186487669, + "grad_norm": 0.9468226472007348, + "learning_rate": 1.1950686796616698e-05, + "loss": 0.5276, + "step": 25895 + }, + { + "epoch": 0.7558733401430031, + "grad_norm": 1.162583147170506, + "learning_rate": 1.1942840899804763e-05, + "loss": 0.5885, + "step": 25900 + }, + { + "epoch": 0.7560192616372392, + "grad_norm": 0.9937293801527998, + "learning_rate": 1.1934998625767882e-05, + "loss": 0.5107, + "step": 25905 + }, + { + "epoch": 0.7561651831314753, + "grad_norm": 1.035658591836072, + "learning_rate": 1.192715997633227e-05, + "loss": 0.5818, + "step": 25910 + }, + { + "epoch": 0.7563111046257114, + "grad_norm": 1.012141341232472, + "learning_rate": 1.1919324953323312e-05, + "loss": 0.5705, + "step": 25915 + }, + { + "epoch": 0.7564570261199475, + "grad_norm": 0.9660426062870733, + "learning_rate": 1.191149355856554e-05, + "loss": 0.4958, + "step": 25920 + }, + { + "epoch": 0.7566029476141836, + "grad_norm": 0.994359619903572, + "learning_rate": 1.1903665793882633e-05, + "loss": 0.6293, + "step": 25925 + }, + { + "epoch": 0.7567488691084197, + "grad_norm": 1.2261718997537736, + "learning_rate": 1.1895841661097438e-05, + "loss": 0.5709, + "step": 25930 + }, + { + "epoch": 0.7568947906026557, + "grad_norm": 0.9353713565401006, + "learning_rate": 1.188802116203194e-05, + "loss": 0.5279, + "step": 25935 + }, + { + "epoch": 0.7570407120968918, + "grad_norm": 1.039585047909423, + "learning_rate": 1.1880204298507305e-05, + "loss": 0.5527, + "step": 25940 + }, + { + "epoch": 0.7571866335911279, + "grad_norm": 1.0287603599490898, + "learning_rate": 1.1872391072343824e-05, + "loss": 0.5212, + "step": 25945 + }, + { + "epoch": 0.7573325550853641, + "grad_norm": 1.0005440195775186, + "learning_rate": 1.186458148536096e-05, + "loss": 0.5194, + "step": 25950 + }, + { + "epoch": 0.7574784765796002, + "grad_norm": 0.996583335606949, + "learning_rate": 1.1856775539377316e-05, + "loss": 0.5035, + "step": 25955 + }, + { + "epoch": 0.7576243980738363, + "grad_norm": 1.1174549431129843, + "learning_rate": 1.1848973236210644e-05, + "loss": 0.5421, + "step": 25960 + }, + { + "epoch": 0.7577703195680724, + "grad_norm": 1.019774627318254, + "learning_rate": 1.1841174577677873e-05, + "loss": 0.5255, + "step": 25965 + }, + { + "epoch": 0.7579162410623085, + "grad_norm": 0.9448878674420801, + "learning_rate": 1.1833379565595058e-05, + "loss": 0.4771, + "step": 25970 + }, + { + "epoch": 0.7580621625565446, + "grad_norm": 1.1242599847384511, + "learning_rate": 1.1825588201777413e-05, + "loss": 0.5653, + "step": 25975 + }, + { + "epoch": 0.7582080840507807, + "grad_norm": 1.4572133065036537, + "learning_rate": 1.1817800488039297e-05, + "loss": 0.5845, + "step": 25980 + }, + { + "epoch": 0.7583540055450168, + "grad_norm": 1.0459839130174469, + "learning_rate": 1.181001642619424e-05, + "loss": 0.5509, + "step": 25985 + }, + { + "epoch": 0.7584999270392528, + "grad_norm": 1.1006205740381068, + "learning_rate": 1.1802236018054896e-05, + "loss": 0.5806, + "step": 25990 + }, + { + "epoch": 0.7586458485334889, + "grad_norm": 1.0035135232840215, + "learning_rate": 1.179445926543308e-05, + "loss": 0.519, + "step": 25995 + }, + { + "epoch": 0.7587917700277251, + "grad_norm": 0.8542348077225248, + "learning_rate": 1.1786686170139753e-05, + "loss": 0.5104, + "step": 26000 + }, + { + "epoch": 0.7589376915219612, + "grad_norm": 1.1252292451347838, + "learning_rate": 1.1778916733985017e-05, + "loss": 0.5049, + "step": 26005 + }, + { + "epoch": 0.7590836130161973, + "grad_norm": 0.869429282723768, + "learning_rate": 1.1771150958778147e-05, + "loss": 0.5532, + "step": 26010 + }, + { + "epoch": 0.7592295345104334, + "grad_norm": 0.9781650608313145, + "learning_rate": 1.1763388846327542e-05, + "loss": 0.5441, + "step": 26015 + }, + { + "epoch": 0.7593754560046695, + "grad_norm": 1.006431306882641, + "learning_rate": 1.1755630398440753e-05, + "loss": 0.5183, + "step": 26020 + }, + { + "epoch": 0.7595213774989056, + "grad_norm": 0.957305906675393, + "learning_rate": 1.1747875616924482e-05, + "loss": 0.5426, + "step": 26025 + }, + { + "epoch": 0.7596672989931417, + "grad_norm": 1.208702423084591, + "learning_rate": 1.1740124503584567e-05, + "loss": 0.5521, + "step": 26030 + }, + { + "epoch": 0.7598132204873778, + "grad_norm": 0.9551373867755021, + "learning_rate": 1.173237706022601e-05, + "loss": 0.5069, + "step": 26035 + }, + { + "epoch": 0.7599591419816139, + "grad_norm": 1.0468417229080547, + "learning_rate": 1.1724633288652941e-05, + "loss": 0.537, + "step": 26040 + }, + { + "epoch": 0.7601050634758499, + "grad_norm": 1.069712247237243, + "learning_rate": 1.1716893190668648e-05, + "loss": 0.5907, + "step": 26045 + }, + { + "epoch": 0.7602509849700861, + "grad_norm": 1.119615581164806, + "learning_rate": 1.1709156768075547e-05, + "loss": 0.5598, + "step": 26050 + }, + { + "epoch": 0.7603969064643222, + "grad_norm": 1.2004525093680338, + "learning_rate": 1.1701424022675223e-05, + "loss": 0.5149, + "step": 26055 + }, + { + "epoch": 0.7605428279585583, + "grad_norm": 1.254879113452933, + "learning_rate": 1.1693694956268381e-05, + "loss": 0.5606, + "step": 26060 + }, + { + "epoch": 0.7606887494527944, + "grad_norm": 1.2463059205487388, + "learning_rate": 1.1685969570654878e-05, + "loss": 0.5539, + "step": 26065 + }, + { + "epoch": 0.7608346709470305, + "grad_norm": 0.9331520088335304, + "learning_rate": 1.1678247867633718e-05, + "loss": 0.5122, + "step": 26070 + }, + { + "epoch": 0.7609805924412666, + "grad_norm": 0.9776996854919341, + "learning_rate": 1.1670529849003036e-05, + "loss": 0.533, + "step": 26075 + }, + { + "epoch": 0.7611265139355027, + "grad_norm": 1.1555457217301646, + "learning_rate": 1.1662815516560128e-05, + "loss": 0.5441, + "step": 26080 + }, + { + "epoch": 0.7612724354297388, + "grad_norm": 1.0706136107791275, + "learning_rate": 1.1655104872101415e-05, + "loss": 0.5485, + "step": 26085 + }, + { + "epoch": 0.7614183569239749, + "grad_norm": 1.2403313969508123, + "learning_rate": 1.1647397917422464e-05, + "loss": 0.601, + "step": 26090 + }, + { + "epoch": 0.761564278418211, + "grad_norm": 1.1240766869317989, + "learning_rate": 1.163969465431798e-05, + "loss": 0.5524, + "step": 26095 + }, + { + "epoch": 0.7617101999124472, + "grad_norm": 1.0278855921577061, + "learning_rate": 1.163199508458182e-05, + "loss": 0.6085, + "step": 26100 + }, + { + "epoch": 0.7618561214066832, + "grad_norm": 0.8901685700546406, + "learning_rate": 1.162429921000697e-05, + "loss": 0.5121, + "step": 26105 + }, + { + "epoch": 0.7620020429009193, + "grad_norm": 1.01239652661771, + "learning_rate": 1.1616607032385555e-05, + "loss": 0.5829, + "step": 26110 + }, + { + "epoch": 0.7621479643951554, + "grad_norm": 1.0578567172497082, + "learning_rate": 1.1608918553508844e-05, + "loss": 0.5329, + "step": 26115 + }, + { + "epoch": 0.7622938858893915, + "grad_norm": 0.9941253904157246, + "learning_rate": 1.1601233775167235e-05, + "loss": 0.5112, + "step": 26120 + }, + { + "epoch": 0.7624398073836276, + "grad_norm": 1.0013257836423846, + "learning_rate": 1.1593552699150287e-05, + "loss": 0.5214, + "step": 26125 + }, + { + "epoch": 0.7625857288778637, + "grad_norm": 1.121875655198004, + "learning_rate": 1.1585875327246677e-05, + "loss": 0.5291, + "step": 26130 + }, + { + "epoch": 0.7627316503720998, + "grad_norm": 1.107408305727277, + "learning_rate": 1.157820166124422e-05, + "loss": 0.5906, + "step": 26135 + }, + { + "epoch": 0.7628775718663359, + "grad_norm": 1.0897101848323532, + "learning_rate": 1.1570531702929873e-05, + "loss": 0.5466, + "step": 26140 + }, + { + "epoch": 0.763023493360572, + "grad_norm": 1.0599471421470619, + "learning_rate": 1.1562865454089728e-05, + "loss": 0.5745, + "step": 26145 + }, + { + "epoch": 0.7631694148548082, + "grad_norm": 1.0288093740130015, + "learning_rate": 1.1555202916509022e-05, + "loss": 0.6144, + "step": 26150 + }, + { + "epoch": 0.7633153363490442, + "grad_norm": 0.9816796522952747, + "learning_rate": 1.1547544091972107e-05, + "loss": 0.5788, + "step": 26155 + }, + { + "epoch": 0.7634612578432803, + "grad_norm": 0.9014640350577715, + "learning_rate": 1.1539888982262504e-05, + "loss": 0.5244, + "step": 26160 + }, + { + "epoch": 0.7636071793375164, + "grad_norm": 0.9801727100503064, + "learning_rate": 1.1532237589162824e-05, + "loss": 0.497, + "step": 26165 + }, + { + "epoch": 0.7637531008317525, + "grad_norm": 1.1378514682531211, + "learning_rate": 1.1524589914454855e-05, + "loss": 0.5681, + "step": 26170 + }, + { + "epoch": 0.7638990223259886, + "grad_norm": 0.9751828940230503, + "learning_rate": 1.1516945959919494e-05, + "loss": 0.5117, + "step": 26175 + }, + { + "epoch": 0.7640449438202247, + "grad_norm": 0.9534722582594792, + "learning_rate": 1.1509305727336779e-05, + "loss": 0.5297, + "step": 26180 + }, + { + "epoch": 0.7641908653144608, + "grad_norm": 1.0229250415906461, + "learning_rate": 1.150166921848588e-05, + "loss": 0.5521, + "step": 26185 + }, + { + "epoch": 0.7643367868086969, + "grad_norm": 1.1939338833197222, + "learning_rate": 1.1494036435145095e-05, + "loss": 0.5367, + "step": 26190 + }, + { + "epoch": 0.764482708302933, + "grad_norm": 1.0443782898588452, + "learning_rate": 1.148640737909187e-05, + "loss": 0.4888, + "step": 26195 + }, + { + "epoch": 0.7646286297971692, + "grad_norm": 1.029712909853686, + "learning_rate": 1.1478782052102775e-05, + "loss": 0.4944, + "step": 26200 + }, + { + "epoch": 0.7647745512914053, + "grad_norm": 1.0299434999522206, + "learning_rate": 1.1471160455953502e-05, + "loss": 0.5602, + "step": 26205 + }, + { + "epoch": 0.7649204727856413, + "grad_norm": 1.0625170889149527, + "learning_rate": 1.1463542592418885e-05, + "loss": 0.4982, + "step": 26210 + }, + { + "epoch": 0.7650663942798774, + "grad_norm": 1.0761779506451865, + "learning_rate": 1.1455928463272878e-05, + "loss": 0.5888, + "step": 26215 + }, + { + "epoch": 0.7652123157741135, + "grad_norm": 1.260766552199907, + "learning_rate": 1.1448318070288587e-05, + "loss": 0.5737, + "step": 26220 + }, + { + "epoch": 0.7653582372683496, + "grad_norm": 1.0044722474088226, + "learning_rate": 1.1440711415238221e-05, + "loss": 0.5736, + "step": 26225 + }, + { + "epoch": 0.7655041587625857, + "grad_norm": 1.1677107519312429, + "learning_rate": 1.143310849989315e-05, + "loss": 0.537, + "step": 26230 + }, + { + "epoch": 0.7656500802568218, + "grad_norm": 0.9714881172532713, + "learning_rate": 1.1425509326023832e-05, + "loss": 0.5277, + "step": 26235 + }, + { + "epoch": 0.7657960017510579, + "grad_norm": 0.9536014941152121, + "learning_rate": 1.1417913895399896e-05, + "loss": 0.5519, + "step": 26240 + }, + { + "epoch": 0.765941923245294, + "grad_norm": 1.1474721367891554, + "learning_rate": 1.141032220979007e-05, + "loss": 0.5302, + "step": 26245 + }, + { + "epoch": 0.7660878447395302, + "grad_norm": 1.311854894178345, + "learning_rate": 1.1402734270962213e-05, + "loss": 0.5385, + "step": 26250 + }, + { + "epoch": 0.7662337662337663, + "grad_norm": 1.0170296395474896, + "learning_rate": 1.1395150080683341e-05, + "loss": 0.5164, + "step": 26255 + }, + { + "epoch": 0.7663796877280024, + "grad_norm": 1.0142818028958265, + "learning_rate": 1.1387569640719546e-05, + "loss": 0.52, + "step": 26260 + }, + { + "epoch": 0.7665256092222384, + "grad_norm": 1.087516983543965, + "learning_rate": 1.1379992952836096e-05, + "loss": 0.5252, + "step": 26265 + }, + { + "epoch": 0.7666715307164745, + "grad_norm": 1.0685628522153694, + "learning_rate": 1.1372420018797351e-05, + "loss": 0.6054, + "step": 26270 + }, + { + "epoch": 0.7668174522107106, + "grad_norm": 0.9459058502166803, + "learning_rate": 1.136485084036683e-05, + "loss": 0.5564, + "step": 26275 + }, + { + "epoch": 0.7669633737049467, + "grad_norm": 1.0297359623145743, + "learning_rate": 1.1357285419307131e-05, + "loss": 0.5403, + "step": 26280 + }, + { + "epoch": 0.7671092951991828, + "grad_norm": 0.9749961572036202, + "learning_rate": 1.1349723757380024e-05, + "loss": 0.549, + "step": 26285 + }, + { + "epoch": 0.7672552166934189, + "grad_norm": 0.9622158450289131, + "learning_rate": 1.1342165856346373e-05, + "loss": 0.5226, + "step": 26290 + }, + { + "epoch": 0.7674011381876551, + "grad_norm": 1.007275351717186, + "learning_rate": 1.1334611717966173e-05, + "loss": 0.5305, + "step": 26295 + }, + { + "epoch": 0.7675470596818912, + "grad_norm": 0.9833263425629261, + "learning_rate": 1.132706134399857e-05, + "loss": 0.5031, + "step": 26300 + }, + { + "epoch": 0.7676929811761273, + "grad_norm": 0.964950547844251, + "learning_rate": 1.1319514736201773e-05, + "loss": 0.5396, + "step": 26305 + }, + { + "epoch": 0.7678389026703634, + "grad_norm": 1.0440341375740938, + "learning_rate": 1.1311971896333179e-05, + "loss": 0.5559, + "step": 26310 + }, + { + "epoch": 0.7679848241645995, + "grad_norm": 0.986377169722253, + "learning_rate": 1.130443282614927e-05, + "loss": 0.5478, + "step": 26315 + }, + { + "epoch": 0.7681307456588355, + "grad_norm": 1.036366520073748, + "learning_rate": 1.1296897527405651e-05, + "loss": 0.549, + "step": 26320 + }, + { + "epoch": 0.7682766671530716, + "grad_norm": 1.0396657355979717, + "learning_rate": 1.1289366001857082e-05, + "loss": 0.584, + "step": 26325 + }, + { + "epoch": 0.7684225886473077, + "grad_norm": 1.1659084248424367, + "learning_rate": 1.1281838251257384e-05, + "loss": 0.5504, + "step": 26330 + }, + { + "epoch": 0.7685685101415438, + "grad_norm": 1.0224428467325715, + "learning_rate": 1.1274314277359563e-05, + "loss": 0.5653, + "step": 26335 + }, + { + "epoch": 0.7687144316357799, + "grad_norm": 1.0821596532302247, + "learning_rate": 1.12667940819157e-05, + "loss": 0.5621, + "step": 26340 + }, + { + "epoch": 0.7688603531300161, + "grad_norm": 0.9100038179071127, + "learning_rate": 1.1259277666677023e-05, + "loss": 0.5412, + "step": 26345 + }, + { + "epoch": 0.7690062746242522, + "grad_norm": 1.0237176826796293, + "learning_rate": 1.1251765033393868e-05, + "loss": 0.4852, + "step": 26350 + }, + { + "epoch": 0.7691521961184883, + "grad_norm": 0.927511991898227, + "learning_rate": 1.124425618381569e-05, + "loss": 0.5065, + "step": 26355 + }, + { + "epoch": 0.7692981176127244, + "grad_norm": 1.0127450452466895, + "learning_rate": 1.1236751119691066e-05, + "loss": 0.5419, + "step": 26360 + }, + { + "epoch": 0.7694440391069605, + "grad_norm": 1.0868699431838862, + "learning_rate": 1.1229249842767684e-05, + "loss": 0.5012, + "step": 26365 + }, + { + "epoch": 0.7695899606011966, + "grad_norm": 0.854644570271715, + "learning_rate": 1.1221752354792373e-05, + "loss": 0.5081, + "step": 26370 + }, + { + "epoch": 0.7697358820954326, + "grad_norm": 0.975006809185287, + "learning_rate": 1.1214258657511037e-05, + "loss": 0.511, + "step": 26375 + }, + { + "epoch": 0.7698818035896687, + "grad_norm": 1.2293865477658465, + "learning_rate": 1.1206768752668747e-05, + "loss": 0.646, + "step": 26380 + }, + { + "epoch": 0.7700277250839048, + "grad_norm": 1.0038895467301465, + "learning_rate": 1.119928264200966e-05, + "loss": 0.5184, + "step": 26385 + }, + { + "epoch": 0.7701736465781409, + "grad_norm": 0.9551484166198256, + "learning_rate": 1.1191800327277046e-05, + "loss": 0.5414, + "step": 26390 + }, + { + "epoch": 0.7703195680723771, + "grad_norm": 0.9301715407148176, + "learning_rate": 1.1184321810213326e-05, + "loss": 0.5365, + "step": 26395 + }, + { + "epoch": 0.7704654895666132, + "grad_norm": 0.9191611509869474, + "learning_rate": 1.1176847092559986e-05, + "loss": 0.4363, + "step": 26400 + }, + { + "epoch": 0.7706114110608493, + "grad_norm": 0.9442743898599173, + "learning_rate": 1.1169376176057666e-05, + "loss": 0.5614, + "step": 26405 + }, + { + "epoch": 0.7707573325550854, + "grad_norm": 1.0637324804971586, + "learning_rate": 1.1161909062446105e-05, + "loss": 0.5302, + "step": 26410 + }, + { + "epoch": 0.7709032540493215, + "grad_norm": 1.259719803526256, + "learning_rate": 1.1154445753464168e-05, + "loss": 0.602, + "step": 26415 + }, + { + "epoch": 0.7710491755435576, + "grad_norm": 1.0799875321457637, + "learning_rate": 1.1146986250849819e-05, + "loss": 0.5964, + "step": 26420 + }, + { + "epoch": 0.7711950970377937, + "grad_norm": 1.1335312158770563, + "learning_rate": 1.1139530556340147e-05, + "loss": 0.5964, + "step": 26425 + }, + { + "epoch": 0.7713410185320297, + "grad_norm": 1.0749432291433856, + "learning_rate": 1.1132078671671345e-05, + "loss": 0.5592, + "step": 26430 + }, + { + "epoch": 0.7714869400262658, + "grad_norm": 1.0907066776683048, + "learning_rate": 1.1124630598578716e-05, + "loss": 0.5381, + "step": 26435 + }, + { + "epoch": 0.7716328615205019, + "grad_norm": 1.0298183424681384, + "learning_rate": 1.1117186338796707e-05, + "loss": 0.5272, + "step": 26440 + }, + { + "epoch": 0.7717787830147381, + "grad_norm": 0.8572802119525016, + "learning_rate": 1.1109745894058825e-05, + "loss": 0.5091, + "step": 26445 + }, + { + "epoch": 0.7719247045089742, + "grad_norm": 1.0378810935211007, + "learning_rate": 1.1102309266097735e-05, + "loss": 0.5422, + "step": 26450 + }, + { + "epoch": 0.7720706260032103, + "grad_norm": 1.1183538579459082, + "learning_rate": 1.109487645664519e-05, + "loss": 0.5475, + "step": 26455 + }, + { + "epoch": 0.7722165474974464, + "grad_norm": 1.008846668051106, + "learning_rate": 1.1087447467432047e-05, + "loss": 0.4789, + "step": 26460 + }, + { + "epoch": 0.7723624689916825, + "grad_norm": 0.9548686984444584, + "learning_rate": 1.1080022300188313e-05, + "loss": 0.509, + "step": 26465 + }, + { + "epoch": 0.7725083904859186, + "grad_norm": 1.1319156145216078, + "learning_rate": 1.1072600956643046e-05, + "loss": 0.5573, + "step": 26470 + }, + { + "epoch": 0.7726543119801547, + "grad_norm": 1.068148796691251, + "learning_rate": 1.1065183438524461e-05, + "loss": 0.5305, + "step": 26475 + }, + { + "epoch": 0.7728002334743908, + "grad_norm": 0.9431366109806484, + "learning_rate": 1.1057769747559859e-05, + "loss": 0.4844, + "step": 26480 + }, + { + "epoch": 0.7729461549686268, + "grad_norm": 1.0910037856309902, + "learning_rate": 1.1050359885475663e-05, + "loss": 0.5952, + "step": 26485 + }, + { + "epoch": 0.7730920764628629, + "grad_norm": 0.900522554912647, + "learning_rate": 1.1042953853997398e-05, + "loss": 0.5109, + "step": 26490 + }, + { + "epoch": 0.7732379979570991, + "grad_norm": 1.192841715582392, + "learning_rate": 1.103555165484969e-05, + "loss": 0.5572, + "step": 26495 + }, + { + "epoch": 0.7733839194513352, + "grad_norm": 1.239785568993468, + "learning_rate": 1.1028153289756287e-05, + "loss": 0.5923, + "step": 26500 + }, + { + "epoch": 0.7735298409455713, + "grad_norm": 1.1439078975637549, + "learning_rate": 1.1020758760440023e-05, + "loss": 0.5617, + "step": 26505 + }, + { + "epoch": 0.7736757624398074, + "grad_norm": 0.8816858779540978, + "learning_rate": 1.1013368068622865e-05, + "loss": 0.5054, + "step": 26510 + }, + { + "epoch": 0.7738216839340435, + "grad_norm": 0.9004464248885659, + "learning_rate": 1.1005981216025874e-05, + "loss": 0.5363, + "step": 26515 + }, + { + "epoch": 0.7739676054282796, + "grad_norm": 1.0716891887745637, + "learning_rate": 1.0998598204369212e-05, + "loss": 0.58, + "step": 26520 + }, + { + "epoch": 0.7741135269225157, + "grad_norm": 1.2528211726648548, + "learning_rate": 1.0991219035372144e-05, + "loss": 0.5856, + "step": 26525 + }, + { + "epoch": 0.7742594484167518, + "grad_norm": 0.9244889707258868, + "learning_rate": 1.0983843710753062e-05, + "loss": 0.5167, + "step": 26530 + }, + { + "epoch": 0.7744053699109879, + "grad_norm": 1.001764835583752, + "learning_rate": 1.097647223222944e-05, + "loss": 0.544, + "step": 26535 + }, + { + "epoch": 0.7745512914052239, + "grad_norm": 1.0178996596269738, + "learning_rate": 1.0969104601517868e-05, + "loss": 0.5406, + "step": 26540 + }, + { + "epoch": 0.7746972128994601, + "grad_norm": 0.914968039839898, + "learning_rate": 1.0961740820334032e-05, + "loss": 0.4929, + "step": 26545 + }, + { + "epoch": 0.7748431343936962, + "grad_norm": 1.0325972459841566, + "learning_rate": 1.0954380890392721e-05, + "loss": 0.5273, + "step": 26550 + }, + { + "epoch": 0.7749890558879323, + "grad_norm": 1.0543724376917063, + "learning_rate": 1.0947024813407848e-05, + "loss": 0.5606, + "step": 26555 + }, + { + "epoch": 0.7751349773821684, + "grad_norm": 1.0691947109121767, + "learning_rate": 1.0939672591092404e-05, + "loss": 0.5347, + "step": 26560 + }, + { + "epoch": 0.7752808988764045, + "grad_norm": 1.067036295985987, + "learning_rate": 1.093232422515849e-05, + "loss": 0.5436, + "step": 26565 + }, + { + "epoch": 0.7754268203706406, + "grad_norm": 0.9275120333427185, + "learning_rate": 1.0924979717317313e-05, + "loss": 0.4744, + "step": 26570 + }, + { + "epoch": 0.7755727418648767, + "grad_norm": 1.0834631290577457, + "learning_rate": 1.0917639069279166e-05, + "loss": 0.5198, + "step": 26575 + }, + { + "epoch": 0.7757186633591128, + "grad_norm": 0.9495215822316148, + "learning_rate": 1.0910302282753477e-05, + "loss": 0.5175, + "step": 26580 + }, + { + "epoch": 0.7758645848533489, + "grad_norm": 0.9463328960838919, + "learning_rate": 1.0902969359448747e-05, + "loss": 0.4925, + "step": 26585 + }, + { + "epoch": 0.776010506347585, + "grad_norm": 0.9926699150133391, + "learning_rate": 1.089564030107258e-05, + "loss": 0.5342, + "step": 26590 + }, + { + "epoch": 0.7761564278418212, + "grad_norm": 1.100510166667213, + "learning_rate": 1.0888315109331675e-05, + "loss": 0.6052, + "step": 26595 + }, + { + "epoch": 0.7763023493360572, + "grad_norm": 1.1930276926883863, + "learning_rate": 1.0880993785931859e-05, + "loss": 0.529, + "step": 26600 + }, + { + "epoch": 0.7764482708302933, + "grad_norm": 0.9741370662357622, + "learning_rate": 1.087367633257803e-05, + "loss": 0.4951, + "step": 26605 + }, + { + "epoch": 0.7765941923245294, + "grad_norm": 1.0116020218370523, + "learning_rate": 1.0866362750974194e-05, + "loss": 0.4975, + "step": 26610 + }, + { + "epoch": 0.7767401138187655, + "grad_norm": 1.0023964323771666, + "learning_rate": 1.0859053042823455e-05, + "loss": 0.5178, + "step": 26615 + }, + { + "epoch": 0.7768860353130016, + "grad_norm": 1.1927319121120095, + "learning_rate": 1.0851747209828008e-05, + "loss": 0.5681, + "step": 26620 + }, + { + "epoch": 0.7770319568072377, + "grad_norm": 0.8799901206906776, + "learning_rate": 1.0844445253689167e-05, + "loss": 0.6037, + "step": 26625 + }, + { + "epoch": 0.7771778783014738, + "grad_norm": 0.8560348918531231, + "learning_rate": 1.0837147176107322e-05, + "loss": 0.5216, + "step": 26630 + }, + { + "epoch": 0.7773237997957099, + "grad_norm": 1.0970734945825442, + "learning_rate": 1.0829852978781964e-05, + "loss": 0.5243, + "step": 26635 + }, + { + "epoch": 0.777469721289946, + "grad_norm": 1.0071171015594016, + "learning_rate": 1.0822562663411686e-05, + "loss": 0.5847, + "step": 26640 + }, + { + "epoch": 0.7776156427841822, + "grad_norm": 0.9979222514006434, + "learning_rate": 1.081527623169417e-05, + "loss": 0.5273, + "step": 26645 + }, + { + "epoch": 0.7777615642784182, + "grad_norm": 0.9230597292966153, + "learning_rate": 1.08079936853262e-05, + "loss": 0.5078, + "step": 26650 + }, + { + "epoch": 0.7779074857726543, + "grad_norm": 1.1459497111705466, + "learning_rate": 1.080071502600366e-05, + "loss": 0.5336, + "step": 26655 + }, + { + "epoch": 0.7780534072668904, + "grad_norm": 1.0735928601569844, + "learning_rate": 1.0793440255421511e-05, + "loss": 0.6352, + "step": 26660 + }, + { + "epoch": 0.7781993287611265, + "grad_norm": 1.017195495499888, + "learning_rate": 1.0786169375273821e-05, + "loss": 0.5073, + "step": 26665 + }, + { + "epoch": 0.7783452502553626, + "grad_norm": 0.9589371604026143, + "learning_rate": 1.0778902387253758e-05, + "loss": 0.5338, + "step": 26670 + }, + { + "epoch": 0.7784911717495987, + "grad_norm": 1.0093054678252185, + "learning_rate": 1.0771639293053571e-05, + "loss": 0.5451, + "step": 26675 + }, + { + "epoch": 0.7786370932438348, + "grad_norm": 0.9894169640693417, + "learning_rate": 1.0764380094364607e-05, + "loss": 0.519, + "step": 26680 + }, + { + "epoch": 0.7787830147380709, + "grad_norm": 0.851796298818518, + "learning_rate": 1.0757124792877306e-05, + "loss": 0.4957, + "step": 26685 + }, + { + "epoch": 0.778928936232307, + "grad_norm": 1.1924757451502195, + "learning_rate": 1.0749873390281194e-05, + "loss": 0.5593, + "step": 26690 + }, + { + "epoch": 0.7790748577265432, + "grad_norm": 0.9872430020850951, + "learning_rate": 1.074262588826491e-05, + "loss": 0.5453, + "step": 26695 + }, + { + "epoch": 0.7792207792207793, + "grad_norm": 0.9859896595323326, + "learning_rate": 1.0735382288516155e-05, + "loss": 0.5708, + "step": 26700 + }, + { + "epoch": 0.7793667007150153, + "grad_norm": 1.0298045916555123, + "learning_rate": 1.0728142592721757e-05, + "loss": 0.5457, + "step": 26705 + }, + { + "epoch": 0.7795126222092514, + "grad_norm": 0.9858766725513713, + "learning_rate": 1.0720906802567593e-05, + "loss": 0.5481, + "step": 26710 + }, + { + "epoch": 0.7796585437034875, + "grad_norm": 0.9357213052473992, + "learning_rate": 1.0713674919738655e-05, + "loss": 0.5441, + "step": 26715 + }, + { + "epoch": 0.7798044651977236, + "grad_norm": 1.1807413164539537, + "learning_rate": 1.0706446945919033e-05, + "loss": 0.5438, + "step": 26720 + }, + { + "epoch": 0.7799503866919597, + "grad_norm": 1.2879374593889028, + "learning_rate": 1.069922288279189e-05, + "loss": 0.6062, + "step": 26725 + }, + { + "epoch": 0.7800963081861958, + "grad_norm": 0.9065153805960428, + "learning_rate": 1.0692002732039482e-05, + "loss": 0.4991, + "step": 26730 + }, + { + "epoch": 0.7802422296804319, + "grad_norm": 1.0029375533661282, + "learning_rate": 1.0684786495343155e-05, + "loss": 0.5325, + "step": 26735 + }, + { + "epoch": 0.780388151174668, + "grad_norm": 0.9139557501888654, + "learning_rate": 1.067757417438335e-05, + "loss": 0.5361, + "step": 26740 + }, + { + "epoch": 0.7805340726689042, + "grad_norm": 0.9690206571977329, + "learning_rate": 1.067036577083959e-05, + "loss": 0.5339, + "step": 26745 + }, + { + "epoch": 0.7806799941631403, + "grad_norm": 1.0111053450051546, + "learning_rate": 1.0663161286390482e-05, + "loss": 0.4814, + "step": 26750 + }, + { + "epoch": 0.7808259156573764, + "grad_norm": 1.0294714716851727, + "learning_rate": 1.0655960722713728e-05, + "loss": 0.5024, + "step": 26755 + }, + { + "epoch": 0.7809718371516124, + "grad_norm": 1.1111785810328374, + "learning_rate": 1.064876408148611e-05, + "loss": 0.5266, + "step": 26760 + }, + { + "epoch": 0.7811177586458485, + "grad_norm": 1.068480231956318, + "learning_rate": 1.0641571364383503e-05, + "loss": 0.5479, + "step": 26765 + }, + { + "epoch": 0.7812636801400846, + "grad_norm": 1.0774347239121638, + "learning_rate": 1.0634382573080862e-05, + "loss": 0.5834, + "step": 26770 + }, + { + "epoch": 0.7814096016343207, + "grad_norm": 1.0192380970542516, + "learning_rate": 1.0627197709252248e-05, + "loss": 0.4717, + "step": 26775 + }, + { + "epoch": 0.7815555231285568, + "grad_norm": 1.0218460545560653, + "learning_rate": 1.0620016774570765e-05, + "loss": 0.5378, + "step": 26780 + }, + { + "epoch": 0.7817014446227929, + "grad_norm": 0.9668280484456119, + "learning_rate": 1.0612839770708646e-05, + "loss": 0.5762, + "step": 26785 + }, + { + "epoch": 0.781847366117029, + "grad_norm": 1.00963578638126, + "learning_rate": 1.0605666699337186e-05, + "loss": 0.5227, + "step": 26790 + }, + { + "epoch": 0.7819932876112652, + "grad_norm": 1.1287921193671648, + "learning_rate": 1.0598497562126767e-05, + "loss": 0.5826, + "step": 26795 + }, + { + "epoch": 0.7821392091055013, + "grad_norm": 1.0768528026817945, + "learning_rate": 1.0591332360746855e-05, + "loss": 0.5147, + "step": 26800 + }, + { + "epoch": 0.7822851305997374, + "grad_norm": 0.960805922525826, + "learning_rate": 1.0584171096865997e-05, + "loss": 0.4767, + "step": 26805 + }, + { + "epoch": 0.7824310520939735, + "grad_norm": 1.0255582550356812, + "learning_rate": 1.057701377215184e-05, + "loss": 0.5375, + "step": 26810 + }, + { + "epoch": 0.7825769735882095, + "grad_norm": 1.1403794243365977, + "learning_rate": 1.0569860388271095e-05, + "loss": 0.5367, + "step": 26815 + }, + { + "epoch": 0.7827228950824456, + "grad_norm": 0.9387852519692074, + "learning_rate": 1.0562710946889557e-05, + "loss": 0.5077, + "step": 26820 + }, + { + "epoch": 0.7828688165766817, + "grad_norm": 1.0794088465977334, + "learning_rate": 1.0555565449672111e-05, + "loss": 0.5101, + "step": 26825 + }, + { + "epoch": 0.7830147380709178, + "grad_norm": 0.9323215720950511, + "learning_rate": 1.0548423898282711e-05, + "loss": 0.5012, + "step": 26830 + }, + { + "epoch": 0.7831606595651539, + "grad_norm": 0.9817844452175546, + "learning_rate": 1.0541286294384414e-05, + "loss": 0.5471, + "step": 26835 + }, + { + "epoch": 0.78330658105939, + "grad_norm": 0.97035189562513, + "learning_rate": 1.0534152639639333e-05, + "loss": 0.5537, + "step": 26840 + }, + { + "epoch": 0.7834525025536262, + "grad_norm": 1.0151783766404865, + "learning_rate": 1.0527022935708692e-05, + "loss": 0.5669, + "step": 26845 + }, + { + "epoch": 0.7835984240478623, + "grad_norm": 1.151043286721633, + "learning_rate": 1.0519897184252745e-05, + "loss": 0.5649, + "step": 26850 + }, + { + "epoch": 0.7837443455420984, + "grad_norm": 1.0827802907639867, + "learning_rate": 1.051277538693088e-05, + "loss": 0.4871, + "step": 26855 + }, + { + "epoch": 0.7838902670363345, + "grad_norm": 0.9723327917001047, + "learning_rate": 1.0505657545401534e-05, + "loss": 0.5535, + "step": 26860 + }, + { + "epoch": 0.7840361885305706, + "grad_norm": 0.9648375760637035, + "learning_rate": 1.0498543661322224e-05, + "loss": 0.5363, + "step": 26865 + }, + { + "epoch": 0.7841821100248066, + "grad_norm": 1.0416308165603414, + "learning_rate": 1.0491433736349566e-05, + "loss": 0.5554, + "step": 26870 + }, + { + "epoch": 0.7843280315190427, + "grad_norm": 1.1743857190044469, + "learning_rate": 1.0484327772139215e-05, + "loss": 0.5755, + "step": 26875 + }, + { + "epoch": 0.7844739530132788, + "grad_norm": 0.9892857571681145, + "learning_rate": 1.0477225770345952e-05, + "loss": 0.5097, + "step": 26880 + }, + { + "epoch": 0.7846198745075149, + "grad_norm": 0.8745383153696362, + "learning_rate": 1.0470127732623596e-05, + "loss": 0.4687, + "step": 26885 + }, + { + "epoch": 0.784765796001751, + "grad_norm": 1.0112344363569625, + "learning_rate": 1.046303366062506e-05, + "loss": 0.5436, + "step": 26890 + }, + { + "epoch": 0.7849117174959872, + "grad_norm": 1.1638534746943998, + "learning_rate": 1.0455943556002334e-05, + "loss": 0.5632, + "step": 26895 + }, + { + "epoch": 0.7850576389902233, + "grad_norm": 1.0813854473829274, + "learning_rate": 1.0448857420406473e-05, + "loss": 0.5754, + "step": 26900 + }, + { + "epoch": 0.7852035604844594, + "grad_norm": 0.9329066614128672, + "learning_rate": 1.044177525548763e-05, + "loss": 0.499, + "step": 26905 + }, + { + "epoch": 0.7853494819786955, + "grad_norm": 1.0739361060455275, + "learning_rate": 1.0434697062895005e-05, + "loss": 0.5461, + "step": 26910 + }, + { + "epoch": 0.7854954034729316, + "grad_norm": 1.1294853115478685, + "learning_rate": 1.0427622844276905e-05, + "loss": 0.5748, + "step": 26915 + }, + { + "epoch": 0.7856413249671677, + "grad_norm": 1.1184331586367497, + "learning_rate": 1.0420552601280671e-05, + "loss": 0.5316, + "step": 26920 + }, + { + "epoch": 0.7857872464614037, + "grad_norm": 1.0398845017954061, + "learning_rate": 1.0413486335552764e-05, + "loss": 0.5634, + "step": 26925 + }, + { + "epoch": 0.7859331679556398, + "grad_norm": 0.8762766074442039, + "learning_rate": 1.040642404873868e-05, + "loss": 0.4935, + "step": 26930 + }, + { + "epoch": 0.7860790894498759, + "grad_norm": 0.8562464857743646, + "learning_rate": 1.0399365742483005e-05, + "loss": 0.4839, + "step": 26935 + }, + { + "epoch": 0.786225010944112, + "grad_norm": 1.0347324867914962, + "learning_rate": 1.0392311418429418e-05, + "loss": 0.5101, + "step": 26940 + }, + { + "epoch": 0.7863709324383482, + "grad_norm": 1.0433639491144333, + "learning_rate": 1.038526107822062e-05, + "loss": 0.5855, + "step": 26945 + }, + { + "epoch": 0.7865168539325843, + "grad_norm": 0.8667111022655034, + "learning_rate": 1.0378214723498433e-05, + "loss": 0.5682, + "step": 26950 + }, + { + "epoch": 0.7866627754268204, + "grad_norm": 1.054413416099709, + "learning_rate": 1.0371172355903724e-05, + "loss": 0.5479, + "step": 26955 + }, + { + "epoch": 0.7868086969210565, + "grad_norm": 1.0628771361905176, + "learning_rate": 1.0364133977076452e-05, + "loss": 0.53, + "step": 26960 + }, + { + "epoch": 0.7869546184152926, + "grad_norm": 0.9159529243190648, + "learning_rate": 1.0357099588655621e-05, + "loss": 0.4473, + "step": 26965 + }, + { + "epoch": 0.7871005399095287, + "grad_norm": 0.915143704474563, + "learning_rate": 1.035006919227933e-05, + "loss": 0.5466, + "step": 26970 + }, + { + "epoch": 0.7872464614037648, + "grad_norm": 1.080810620375531, + "learning_rate": 1.0343042789584732e-05, + "loss": 0.5833, + "step": 26975 + }, + { + "epoch": 0.7873923828980008, + "grad_norm": 1.027914903709717, + "learning_rate": 1.0336020382208053e-05, + "loss": 0.5604, + "step": 26980 + }, + { + "epoch": 0.7875383043922369, + "grad_norm": 1.011852185977007, + "learning_rate": 1.0329001971784605e-05, + "loss": 0.5167, + "step": 26985 + }, + { + "epoch": 0.7876842258864731, + "grad_norm": 0.9771170988856054, + "learning_rate": 1.032198755994874e-05, + "loss": 0.5304, + "step": 26990 + }, + { + "epoch": 0.7878301473807092, + "grad_norm": 0.9985216506998189, + "learning_rate": 1.0314977148333905e-05, + "loss": 0.5136, + "step": 26995 + }, + { + "epoch": 0.7879760688749453, + "grad_norm": 1.102586021041923, + "learning_rate": 1.0307970738572603e-05, + "loss": 0.4896, + "step": 27000 + }, + { + "epoch": 0.7881219903691814, + "grad_norm": 1.0269495887648135, + "learning_rate": 1.0300968332296401e-05, + "loss": 0.5471, + "step": 27005 + }, + { + "epoch": 0.7882679118634175, + "grad_norm": 0.9876185922916727, + "learning_rate": 1.029396993113596e-05, + "loss": 0.5055, + "step": 27010 + }, + { + "epoch": 0.7884138333576536, + "grad_norm": 1.111808647339129, + "learning_rate": 1.0286975536720962e-05, + "loss": 0.5433, + "step": 27015 + }, + { + "epoch": 0.7885597548518897, + "grad_norm": 1.0860672616241285, + "learning_rate": 1.0279985150680197e-05, + "loss": 0.5345, + "step": 27020 + }, + { + "epoch": 0.7887056763461258, + "grad_norm": 1.0599904274563325, + "learning_rate": 1.0272998774641501e-05, + "loss": 0.5346, + "step": 27025 + }, + { + "epoch": 0.7888515978403619, + "grad_norm": 1.0194286224398008, + "learning_rate": 1.0266016410231794e-05, + "loss": 0.5971, + "step": 27030 + }, + { + "epoch": 0.7889975193345979, + "grad_norm": 1.0994546963287377, + "learning_rate": 1.025903805907704e-05, + "loss": 0.5518, + "step": 27035 + }, + { + "epoch": 0.7891434408288341, + "grad_norm": 0.9579309490500107, + "learning_rate": 1.0252063722802277e-05, + "loss": 0.5544, + "step": 27040 + }, + { + "epoch": 0.7892893623230702, + "grad_norm": 1.2102666732002723, + "learning_rate": 1.0245093403031616e-05, + "loss": 0.5956, + "step": 27045 + }, + { + "epoch": 0.7894352838173063, + "grad_norm": 0.9490959886886922, + "learning_rate": 1.0238127101388214e-05, + "loss": 0.5458, + "step": 27050 + }, + { + "epoch": 0.7895812053115424, + "grad_norm": 1.0129748720248426, + "learning_rate": 1.0231164819494323e-05, + "loss": 0.5645, + "step": 27055 + }, + { + "epoch": 0.7897271268057785, + "grad_norm": 0.853697521392681, + "learning_rate": 1.0224206558971229e-05, + "loss": 0.5081, + "step": 27060 + }, + { + "epoch": 0.7898730483000146, + "grad_norm": 1.0109532413804747, + "learning_rate": 1.0217252321439296e-05, + "loss": 0.5901, + "step": 27065 + }, + { + "epoch": 0.7900189697942507, + "grad_norm": 1.0761715446736881, + "learning_rate": 1.0210302108517944e-05, + "loss": 0.583, + "step": 27070 + }, + { + "epoch": 0.7901648912884868, + "grad_norm": 0.9822409724803611, + "learning_rate": 1.0203355921825658e-05, + "loss": 0.565, + "step": 27075 + }, + { + "epoch": 0.7903108127827229, + "grad_norm": 0.991866154896098, + "learning_rate": 1.0196413762980003e-05, + "loss": 0.5619, + "step": 27080 + }, + { + "epoch": 0.790456734276959, + "grad_norm": 1.1659941928515118, + "learning_rate": 1.0189475633597568e-05, + "loss": 0.5248, + "step": 27085 + }, + { + "epoch": 0.7906026557711952, + "grad_norm": 1.041721830783372, + "learning_rate": 1.0182541535294043e-05, + "loss": 0.477, + "step": 27090 + }, + { + "epoch": 0.7907485772654312, + "grad_norm": 0.9410758883506408, + "learning_rate": 1.0175611469684148e-05, + "loss": 0.5467, + "step": 27095 + }, + { + "epoch": 0.7908944987596673, + "grad_norm": 0.9387041882717445, + "learning_rate": 1.0168685438381697e-05, + "loss": 0.4909, + "step": 27100 + }, + { + "epoch": 0.7910404202539034, + "grad_norm": 0.9989749866872627, + "learning_rate": 1.0161763442999533e-05, + "loss": 0.5597, + "step": 27105 + }, + { + "epoch": 0.7911863417481395, + "grad_norm": 1.0448132223225775, + "learning_rate": 1.0154845485149575e-05, + "loss": 0.5143, + "step": 27110 + }, + { + "epoch": 0.7913322632423756, + "grad_norm": 1.0004083768223377, + "learning_rate": 1.0147931566442797e-05, + "loss": 0.5886, + "step": 27115 + }, + { + "epoch": 0.7914781847366117, + "grad_norm": 1.311983778781264, + "learning_rate": 1.0141021688489231e-05, + "loss": 0.5761, + "step": 27120 + }, + { + "epoch": 0.7916241062308478, + "grad_norm": 0.943616279915982, + "learning_rate": 1.0134115852897979e-05, + "loss": 0.5406, + "step": 27125 + }, + { + "epoch": 0.7917700277250839, + "grad_norm": 1.0654865437814889, + "learning_rate": 1.0127214061277191e-05, + "loss": 0.5732, + "step": 27130 + }, + { + "epoch": 0.79191594921932, + "grad_norm": 1.0622301473898175, + "learning_rate": 1.0120316315234076e-05, + "loss": 0.5184, + "step": 27135 + }, + { + "epoch": 0.7920618707135562, + "grad_norm": 1.054142344866929, + "learning_rate": 1.0113422616374904e-05, + "loss": 0.5107, + "step": 27140 + }, + { + "epoch": 0.7922077922077922, + "grad_norm": 1.1247809626017595, + "learning_rate": 1.0106532966304996e-05, + "loss": 0.5465, + "step": 27145 + }, + { + "epoch": 0.7923537137020283, + "grad_norm": 0.8494081010169099, + "learning_rate": 1.009964736662875e-05, + "loss": 0.4786, + "step": 27150 + }, + { + "epoch": 0.7924996351962644, + "grad_norm": 1.2234108431849424, + "learning_rate": 1.0092765818949596e-05, + "loss": 0.5675, + "step": 27155 + }, + { + "epoch": 0.7926455566905005, + "grad_norm": 1.005798287560001, + "learning_rate": 1.0085888324870032e-05, + "loss": 0.5503, + "step": 27160 + }, + { + "epoch": 0.7927914781847366, + "grad_norm": 0.8559603018776307, + "learning_rate": 1.0079014885991609e-05, + "loss": 0.5602, + "step": 27165 + }, + { + "epoch": 0.7929373996789727, + "grad_norm": 1.109763629231988, + "learning_rate": 1.007214550391494e-05, + "loss": 0.5538, + "step": 27170 + }, + { + "epoch": 0.7930833211732088, + "grad_norm": 1.0633799701930946, + "learning_rate": 1.0065280180239689e-05, + "loss": 0.4901, + "step": 27175 + }, + { + "epoch": 0.7932292426674449, + "grad_norm": 0.9956387740402242, + "learning_rate": 1.0058418916564573e-05, + "loss": 0.544, + "step": 27180 + }, + { + "epoch": 0.793375164161681, + "grad_norm": 1.0965245567499116, + "learning_rate": 1.0051561714487365e-05, + "loss": 0.6129, + "step": 27185 + }, + { + "epoch": 0.7935210856559172, + "grad_norm": 1.0379328588231362, + "learning_rate": 1.0044708575604887e-05, + "loss": 0.5376, + "step": 27190 + }, + { + "epoch": 0.7936670071501533, + "grad_norm": 1.0465975513049335, + "learning_rate": 1.0037859501513033e-05, + "loss": 0.6011, + "step": 27195 + }, + { + "epoch": 0.7938129286443893, + "grad_norm": 1.0420389861648298, + "learning_rate": 1.0031014493806728e-05, + "loss": 0.5387, + "step": 27200 + }, + { + "epoch": 0.7939588501386254, + "grad_norm": 1.0911596134064727, + "learning_rate": 1.002417355407997e-05, + "loss": 0.5997, + "step": 27205 + }, + { + "epoch": 0.7941047716328615, + "grad_norm": 1.058853823446391, + "learning_rate": 1.0017336683925782e-05, + "loss": 0.5651, + "step": 27210 + }, + { + "epoch": 0.7942506931270976, + "grad_norm": 1.0287017175405877, + "learning_rate": 1.0010503884936272e-05, + "loss": 0.5523, + "step": 27215 + }, + { + "epoch": 0.7943966146213337, + "grad_norm": 0.9463675898832514, + "learning_rate": 1.0003675158702584e-05, + "loss": 0.5135, + "step": 27220 + }, + { + "epoch": 0.7945425361155698, + "grad_norm": 1.0361634311254662, + "learning_rate": 9.996850506814907e-06, + "loss": 0.5745, + "step": 27225 + }, + { + "epoch": 0.7946884576098059, + "grad_norm": 0.98522863504564, + "learning_rate": 9.990029930862496e-06, + "loss": 0.5792, + "step": 27230 + }, + { + "epoch": 0.794834379104042, + "grad_norm": 1.093101275345474, + "learning_rate": 9.983213432433639e-06, + "loss": 0.58, + "step": 27235 + }, + { + "epoch": 0.7949803005982782, + "grad_norm": 1.0568704157421192, + "learning_rate": 9.9764010131157e-06, + "loss": 0.5275, + "step": 27240 + }, + { + "epoch": 0.7951262220925143, + "grad_norm": 1.1403919978120967, + "learning_rate": 9.969592674495065e-06, + "loss": 0.5589, + "step": 27245 + }, + { + "epoch": 0.7952721435867504, + "grad_norm": 1.0712487005921623, + "learning_rate": 9.962788418157192e-06, + "loss": 0.5623, + "step": 27250 + }, + { + "epoch": 0.7954180650809864, + "grad_norm": 1.0513781499284154, + "learning_rate": 9.955988245686575e-06, + "loss": 0.5061, + "step": 27255 + }, + { + "epoch": 0.7955639865752225, + "grad_norm": 0.8929031989395446, + "learning_rate": 9.949192158666755e-06, + "loss": 0.5969, + "step": 27260 + }, + { + "epoch": 0.7957099080694586, + "grad_norm": 0.8974400594187889, + "learning_rate": 9.94240015868034e-06, + "loss": 0.4722, + "step": 27265 + }, + { + "epoch": 0.7958558295636947, + "grad_norm": 1.1803723646969366, + "learning_rate": 9.935612247308965e-06, + "loss": 0.501, + "step": 27270 + }, + { + "epoch": 0.7960017510579308, + "grad_norm": 1.0467736708845314, + "learning_rate": 9.928828426133328e-06, + "loss": 0.5212, + "step": 27275 + }, + { + "epoch": 0.7961476725521669, + "grad_norm": 1.088564761438336, + "learning_rate": 9.92204869673316e-06, + "loss": 0.6101, + "step": 27280 + }, + { + "epoch": 0.796293594046403, + "grad_norm": 0.9767858884642183, + "learning_rate": 9.915273060687259e-06, + "loss": 0.484, + "step": 27285 + }, + { + "epoch": 0.7964395155406392, + "grad_norm": 0.9842297228226178, + "learning_rate": 9.90850151957345e-06, + "loss": 0.5226, + "step": 27290 + }, + { + "epoch": 0.7965854370348753, + "grad_norm": 1.1449838278588649, + "learning_rate": 9.901734074968616e-06, + "loss": 0.5761, + "step": 27295 + }, + { + "epoch": 0.7967313585291114, + "grad_norm": 0.8570547886807693, + "learning_rate": 9.894970728448686e-06, + "loss": 0.5156, + "step": 27300 + }, + { + "epoch": 0.7968772800233475, + "grad_norm": 1.2021593853417891, + "learning_rate": 9.888211481588615e-06, + "loss": 0.5957, + "step": 27305 + }, + { + "epoch": 0.7970232015175835, + "grad_norm": 0.9139561314783105, + "learning_rate": 9.881456335962444e-06, + "loss": 0.5374, + "step": 27310 + }, + { + "epoch": 0.7971691230118196, + "grad_norm": 1.0356776970393604, + "learning_rate": 9.87470529314322e-06, + "loss": 0.5348, + "step": 27315 + }, + { + "epoch": 0.7973150445060557, + "grad_norm": 1.1279862710750772, + "learning_rate": 9.867958354703057e-06, + "loss": 0.5423, + "step": 27320 + }, + { + "epoch": 0.7974609660002918, + "grad_norm": 1.0535608425941125, + "learning_rate": 9.861215522213097e-06, + "loss": 0.4999, + "step": 27325 + }, + { + "epoch": 0.7976068874945279, + "grad_norm": 0.9606345684864482, + "learning_rate": 9.854476797243537e-06, + "loss": 0.5397, + "step": 27330 + }, + { + "epoch": 0.797752808988764, + "grad_norm": 1.1621359622508396, + "learning_rate": 9.84774218136362e-06, + "loss": 0.5647, + "step": 27335 + }, + { + "epoch": 0.7978987304830002, + "grad_norm": 0.9540440460656431, + "learning_rate": 9.841011676141624e-06, + "loss": 0.5042, + "step": 27340 + }, + { + "epoch": 0.7980446519772363, + "grad_norm": 1.1311605935202607, + "learning_rate": 9.834285283144875e-06, + "loss": 0.5273, + "step": 27345 + }, + { + "epoch": 0.7981905734714724, + "grad_norm": 0.9087268902371635, + "learning_rate": 9.827563003939727e-06, + "loss": 0.4966, + "step": 27350 + }, + { + "epoch": 0.7983364949657085, + "grad_norm": 1.2067002378640321, + "learning_rate": 9.820844840091608e-06, + "loss": 0.5401, + "step": 27355 + }, + { + "epoch": 0.7984824164599446, + "grad_norm": 1.0419974564728558, + "learning_rate": 9.814130793164957e-06, + "loss": 0.5234, + "step": 27360 + }, + { + "epoch": 0.7986283379541806, + "grad_norm": 1.132389495889677, + "learning_rate": 9.807420864723268e-06, + "loss": 0.6014, + "step": 27365 + }, + { + "epoch": 0.7987742594484167, + "grad_norm": 0.9791412942275789, + "learning_rate": 9.80071505632907e-06, + "loss": 0.5452, + "step": 27370 + }, + { + "epoch": 0.7989201809426528, + "grad_norm": 0.9863250104020144, + "learning_rate": 9.794013369543933e-06, + "loss": 0.5504, + "step": 27375 + }, + { + "epoch": 0.7990661024368889, + "grad_norm": 1.136941525509386, + "learning_rate": 9.787315805928476e-06, + "loss": 0.5147, + "step": 27380 + }, + { + "epoch": 0.799212023931125, + "grad_norm": 0.9638669748080306, + "learning_rate": 9.780622367042347e-06, + "loss": 0.572, + "step": 27385 + }, + { + "epoch": 0.7993579454253612, + "grad_norm": 1.0038674685692957, + "learning_rate": 9.773933054444256e-06, + "loss": 0.5211, + "step": 27390 + }, + { + "epoch": 0.7995038669195973, + "grad_norm": 1.0054751085562437, + "learning_rate": 9.76724786969191e-06, + "loss": 0.5153, + "step": 27395 + }, + { + "epoch": 0.7996497884138334, + "grad_norm": 1.2150457173490514, + "learning_rate": 9.760566814342085e-06, + "loss": 0.5256, + "step": 27400 + }, + { + "epoch": 0.7997957099080695, + "grad_norm": 1.2763043732891401, + "learning_rate": 9.753889889950595e-06, + "loss": 0.5427, + "step": 27405 + }, + { + "epoch": 0.7999416314023056, + "grad_norm": 1.0284306448616833, + "learning_rate": 9.747217098072287e-06, + "loss": 0.5248, + "step": 27410 + }, + { + "epoch": 0.8000875528965417, + "grad_norm": 1.0841584801004394, + "learning_rate": 9.740548440261048e-06, + "loss": 0.5258, + "step": 27415 + }, + { + "epoch": 0.8002334743907777, + "grad_norm": 1.0289617759438767, + "learning_rate": 9.733883918069786e-06, + "loss": 0.5861, + "step": 27420 + }, + { + "epoch": 0.8003793958850138, + "grad_norm": 0.9702492187903877, + "learning_rate": 9.727223533050472e-06, + "loss": 0.5085, + "step": 27425 + }, + { + "epoch": 0.8005253173792499, + "grad_norm": 1.0128043042828136, + "learning_rate": 9.720567286754098e-06, + "loss": 0.5902, + "step": 27430 + }, + { + "epoch": 0.800671238873486, + "grad_norm": 1.0831884655380992, + "learning_rate": 9.713915180730698e-06, + "loss": 0.5444, + "step": 27435 + }, + { + "epoch": 0.8008171603677222, + "grad_norm": 0.919850515937568, + "learning_rate": 9.70726721652933e-06, + "loss": 0.5114, + "step": 27440 + }, + { + "epoch": 0.8009630818619583, + "grad_norm": 1.143320572954369, + "learning_rate": 9.7006233956981e-06, + "loss": 0.5174, + "step": 27445 + }, + { + "epoch": 0.8011090033561944, + "grad_norm": 0.9673356174348477, + "learning_rate": 9.69398371978415e-06, + "loss": 0.5348, + "step": 27450 + }, + { + "epoch": 0.8012549248504305, + "grad_norm": 0.8049751965545767, + "learning_rate": 9.687348190333649e-06, + "loss": 0.4988, + "step": 27455 + }, + { + "epoch": 0.8014008463446666, + "grad_norm": 0.9765064182314537, + "learning_rate": 9.680716808891815e-06, + "loss": 0.517, + "step": 27460 + }, + { + "epoch": 0.8015467678389027, + "grad_norm": 0.9913646612621594, + "learning_rate": 9.674089577002868e-06, + "loss": 0.5834, + "step": 27465 + }, + { + "epoch": 0.8016926893331388, + "grad_norm": 0.8744162882001801, + "learning_rate": 9.6674664962101e-06, + "loss": 0.5467, + "step": 27470 + }, + { + "epoch": 0.8018386108273748, + "grad_norm": 1.2778265176970571, + "learning_rate": 9.660847568055815e-06, + "loss": 0.5899, + "step": 27475 + }, + { + "epoch": 0.8019845323216109, + "grad_norm": 1.0611832238433687, + "learning_rate": 9.654232794081344e-06, + "loss": 0.5326, + "step": 27480 + }, + { + "epoch": 0.802130453815847, + "grad_norm": 1.0102272809417914, + "learning_rate": 9.64762217582708e-06, + "loss": 0.5744, + "step": 27485 + }, + { + "epoch": 0.8022763753100832, + "grad_norm": 0.9568474941629972, + "learning_rate": 9.641015714832408e-06, + "loss": 0.532, + "step": 27490 + }, + { + "epoch": 0.8024222968043193, + "grad_norm": 1.06775629283666, + "learning_rate": 9.63441341263578e-06, + "loss": 0.5586, + "step": 27495 + }, + { + "epoch": 0.8025682182985554, + "grad_norm": 1.1032546758194686, + "learning_rate": 9.627815270774662e-06, + "loss": 0.5222, + "step": 27500 + }, + { + "epoch": 0.8027141397927915, + "grad_norm": 0.9155934374699033, + "learning_rate": 9.621221290785548e-06, + "loss": 0.505, + "step": 27505 + }, + { + "epoch": 0.8028600612870276, + "grad_norm": 0.9401719563158842, + "learning_rate": 9.614631474203988e-06, + "loss": 0.5134, + "step": 27510 + }, + { + "epoch": 0.8030059827812637, + "grad_norm": 0.9868455245798152, + "learning_rate": 9.608045822564518e-06, + "loss": 0.5084, + "step": 27515 + }, + { + "epoch": 0.8031519042754998, + "grad_norm": 1.1627447409892855, + "learning_rate": 9.601464337400749e-06, + "loss": 0.5412, + "step": 27520 + }, + { + "epoch": 0.8032978257697359, + "grad_norm": 0.9617026133021087, + "learning_rate": 9.594887020245292e-06, + "loss": 0.473, + "step": 27525 + }, + { + "epoch": 0.8034437472639719, + "grad_norm": 0.9424687912754987, + "learning_rate": 9.588313872629812e-06, + "loss": 0.5175, + "step": 27530 + }, + { + "epoch": 0.803589668758208, + "grad_norm": 0.9679763521252085, + "learning_rate": 9.581744896084971e-06, + "loss": 0.5314, + "step": 27535 + }, + { + "epoch": 0.8037355902524442, + "grad_norm": 1.013203551640198, + "learning_rate": 9.57518009214049e-06, + "loss": 0.5023, + "step": 27540 + }, + { + "epoch": 0.8038815117466803, + "grad_norm": 0.9064389754273547, + "learning_rate": 9.568619462325105e-06, + "loss": 0.4931, + "step": 27545 + }, + { + "epoch": 0.8040274332409164, + "grad_norm": 1.0578281892980055, + "learning_rate": 9.56206300816657e-06, + "loss": 0.5472, + "step": 27550 + }, + { + "epoch": 0.8041733547351525, + "grad_norm": 1.0917287075421498, + "learning_rate": 9.555510731191703e-06, + "loss": 0.5168, + "step": 27555 + }, + { + "epoch": 0.8043192762293886, + "grad_norm": 0.9588788769701684, + "learning_rate": 9.548962632926294e-06, + "loss": 0.5174, + "step": 27560 + }, + { + "epoch": 0.8044651977236247, + "grad_norm": 1.0454322416561728, + "learning_rate": 9.54241871489521e-06, + "loss": 0.527, + "step": 27565 + }, + { + "epoch": 0.8046111192178608, + "grad_norm": 1.1135051545545753, + "learning_rate": 9.53587897862232e-06, + "loss": 0.5066, + "step": 27570 + }, + { + "epoch": 0.8047570407120969, + "grad_norm": 1.002594034012158, + "learning_rate": 9.529343425630515e-06, + "loss": 0.5238, + "step": 27575 + }, + { + "epoch": 0.804902962206333, + "grad_norm": 0.9086356407424167, + "learning_rate": 9.522812057441742e-06, + "loss": 0.5562, + "step": 27580 + }, + { + "epoch": 0.805048883700569, + "grad_norm": 0.9534181218130524, + "learning_rate": 9.516284875576923e-06, + "loss": 0.577, + "step": 27585 + }, + { + "epoch": 0.8051948051948052, + "grad_norm": 1.031107239357346, + "learning_rate": 9.509761881556057e-06, + "loss": 0.5257, + "step": 27590 + }, + { + "epoch": 0.8053407266890413, + "grad_norm": 0.9885917876951881, + "learning_rate": 9.503243076898133e-06, + "loss": 0.5435, + "step": 27595 + }, + { + "epoch": 0.8054866481832774, + "grad_norm": 1.0626693019751252, + "learning_rate": 9.496728463121188e-06, + "loss": 0.5472, + "step": 27600 + }, + { + "epoch": 0.8056325696775135, + "grad_norm": 0.9067944455597727, + "learning_rate": 9.490218041742255e-06, + "loss": 0.5945, + "step": 27605 + }, + { + "epoch": 0.8057784911717496, + "grad_norm": 1.1253369720637882, + "learning_rate": 9.483711814277421e-06, + "loss": 0.5659, + "step": 27610 + }, + { + "epoch": 0.8059244126659857, + "grad_norm": 0.9644438493688245, + "learning_rate": 9.477209782241782e-06, + "loss": 0.4896, + "step": 27615 + }, + { + "epoch": 0.8060703341602218, + "grad_norm": 0.9585658942490324, + "learning_rate": 9.470711947149442e-06, + "loss": 0.5252, + "step": 27620 + }, + { + "epoch": 0.8062162556544579, + "grad_norm": 1.1722305346644295, + "learning_rate": 9.46421831051357e-06, + "loss": 0.5723, + "step": 27625 + }, + { + "epoch": 0.806362177148694, + "grad_norm": 1.1177449741135377, + "learning_rate": 9.457728873846305e-06, + "loss": 0.5504, + "step": 27630 + }, + { + "epoch": 0.80650809864293, + "grad_norm": 0.9393704640937329, + "learning_rate": 9.451243638658847e-06, + "loss": 0.5277, + "step": 27635 + }, + { + "epoch": 0.8066540201371662, + "grad_norm": 1.0533317383940068, + "learning_rate": 9.444762606461395e-06, + "loss": 0.5479, + "step": 27640 + }, + { + "epoch": 0.8067999416314023, + "grad_norm": 1.1566669044090205, + "learning_rate": 9.43828577876319e-06, + "loss": 0.5711, + "step": 27645 + }, + { + "epoch": 0.8069458631256384, + "grad_norm": 1.018216470129085, + "learning_rate": 9.431813157072483e-06, + "loss": 0.5252, + "step": 27650 + }, + { + "epoch": 0.8070917846198745, + "grad_norm": 0.9507319286907391, + "learning_rate": 9.425344742896528e-06, + "loss": 0.5051, + "step": 27655 + }, + { + "epoch": 0.8072377061141106, + "grad_norm": 1.0691638812976023, + "learning_rate": 9.41888053774163e-06, + "loss": 0.6041, + "step": 27660 + }, + { + "epoch": 0.8073836276083467, + "grad_norm": 1.0972148078030926, + "learning_rate": 9.41242054311309e-06, + "loss": 0.5358, + "step": 27665 + }, + { + "epoch": 0.8075295491025828, + "grad_norm": 1.1118987443013721, + "learning_rate": 9.405964760515256e-06, + "loss": 0.5667, + "step": 27670 + }, + { + "epoch": 0.8076754705968189, + "grad_norm": 1.0795088052423096, + "learning_rate": 9.399513191451461e-06, + "loss": 0.4594, + "step": 27675 + }, + { + "epoch": 0.807821392091055, + "grad_norm": 1.0591274634186896, + "learning_rate": 9.393065837424084e-06, + "loss": 0.5397, + "step": 27680 + }, + { + "epoch": 0.8079673135852911, + "grad_norm": 1.0058882986557025, + "learning_rate": 9.386622699934501e-06, + "loss": 0.5361, + "step": 27685 + }, + { + "epoch": 0.8081132350795273, + "grad_norm": 1.0813658145107006, + "learning_rate": 9.380183780483121e-06, + "loss": 0.5435, + "step": 27690 + }, + { + "epoch": 0.8082591565737633, + "grad_norm": 1.0424913198809822, + "learning_rate": 9.373749080569378e-06, + "loss": 0.5833, + "step": 27695 + }, + { + "epoch": 0.8084050780679994, + "grad_norm": 1.0099481378119568, + "learning_rate": 9.367318601691689e-06, + "loss": 0.4944, + "step": 27700 + }, + { + "epoch": 0.8085509995622355, + "grad_norm": 1.2534436522532801, + "learning_rate": 9.360892345347533e-06, + "loss": 0.6047, + "step": 27705 + }, + { + "epoch": 0.8086969210564716, + "grad_norm": 1.0273105256661559, + "learning_rate": 9.354470313033368e-06, + "loss": 0.5628, + "step": 27710 + }, + { + "epoch": 0.8088428425507077, + "grad_norm": 1.0879328581582328, + "learning_rate": 9.348052506244697e-06, + "loss": 0.5364, + "step": 27715 + }, + { + "epoch": 0.8089887640449438, + "grad_norm": 1.2056678545686688, + "learning_rate": 9.341638926476021e-06, + "loss": 0.568, + "step": 27720 + }, + { + "epoch": 0.8091346855391799, + "grad_norm": 1.2228912264438538, + "learning_rate": 9.33522957522086e-06, + "loss": 0.5399, + "step": 27725 + }, + { + "epoch": 0.809280607033416, + "grad_norm": 1.0243165951012978, + "learning_rate": 9.328824453971752e-06, + "loss": 0.5146, + "step": 27730 + }, + { + "epoch": 0.8094265285276522, + "grad_norm": 1.0440796243789765, + "learning_rate": 9.322423564220246e-06, + "loss": 0.5282, + "step": 27735 + }, + { + "epoch": 0.8095724500218883, + "grad_norm": 1.139371110599913, + "learning_rate": 9.316026907456918e-06, + "loss": 0.5724, + "step": 27740 + }, + { + "epoch": 0.8097183715161244, + "grad_norm": 1.0597225520778788, + "learning_rate": 9.309634485171342e-06, + "loss": 0.5767, + "step": 27745 + }, + { + "epoch": 0.8098642930103604, + "grad_norm": 0.9747881523077964, + "learning_rate": 9.303246298852114e-06, + "loss": 0.4782, + "step": 27750 + }, + { + "epoch": 0.8100102145045965, + "grad_norm": 1.1222987751360458, + "learning_rate": 9.296862349986843e-06, + "loss": 0.4727, + "step": 27755 + }, + { + "epoch": 0.8101561359988326, + "grad_norm": 1.0610867429011148, + "learning_rate": 9.290482640062147e-06, + "loss": 0.5551, + "step": 27760 + }, + { + "epoch": 0.8103020574930687, + "grad_norm": 1.1151588296115174, + "learning_rate": 9.284107170563666e-06, + "loss": 0.5698, + "step": 27765 + }, + { + "epoch": 0.8104479789873048, + "grad_norm": 1.188136795631472, + "learning_rate": 9.277735942976049e-06, + "loss": 0.5736, + "step": 27770 + }, + { + "epoch": 0.8105939004815409, + "grad_norm": 0.9672101091187782, + "learning_rate": 9.271368958782947e-06, + "loss": 0.5233, + "step": 27775 + }, + { + "epoch": 0.810739821975777, + "grad_norm": 0.8573582914705824, + "learning_rate": 9.265006219467033e-06, + "loss": 0.5033, + "step": 27780 + }, + { + "epoch": 0.8108857434700132, + "grad_norm": 0.9623949590488653, + "learning_rate": 9.258647726509996e-06, + "loss": 0.4892, + "step": 27785 + }, + { + "epoch": 0.8110316649642493, + "grad_norm": 1.0744293737302077, + "learning_rate": 9.25229348139253e-06, + "loss": 0.498, + "step": 27790 + }, + { + "epoch": 0.8111775864584854, + "grad_norm": 1.1014697423501634, + "learning_rate": 9.245943485594332e-06, + "loss": 0.5976, + "step": 27795 + }, + { + "epoch": 0.8113235079527215, + "grad_norm": 0.987113832028931, + "learning_rate": 9.239597740594122e-06, + "loss": 0.5226, + "step": 27800 + }, + { + "epoch": 0.8114694294469575, + "grad_norm": 1.2630926845648756, + "learning_rate": 9.233256247869621e-06, + "loss": 0.5543, + "step": 27805 + }, + { + "epoch": 0.8116153509411936, + "grad_norm": 1.0864837101800717, + "learning_rate": 9.22691900889757e-06, + "loss": 0.4879, + "step": 27810 + }, + { + "epoch": 0.8117612724354297, + "grad_norm": 0.9343961114199191, + "learning_rate": 9.22058602515371e-06, + "loss": 0.4881, + "step": 27815 + }, + { + "epoch": 0.8119071939296658, + "grad_norm": 0.9889183974951017, + "learning_rate": 9.214257298112798e-06, + "loss": 0.5518, + "step": 27820 + }, + { + "epoch": 0.8120531154239019, + "grad_norm": 0.9559528303074131, + "learning_rate": 9.207932829248592e-06, + "loss": 0.5169, + "step": 27825 + }, + { + "epoch": 0.812199036918138, + "grad_norm": 1.137927860617841, + "learning_rate": 9.201612620033858e-06, + "loss": 0.5746, + "step": 27830 + }, + { + "epoch": 0.8123449584123742, + "grad_norm": 0.8940713593425644, + "learning_rate": 9.19529667194039e-06, + "loss": 0.4852, + "step": 27835 + }, + { + "epoch": 0.8124908799066103, + "grad_norm": 0.9521947145858813, + "learning_rate": 9.188984986438962e-06, + "loss": 0.5448, + "step": 27840 + }, + { + "epoch": 0.8126368014008464, + "grad_norm": 0.8714111086224701, + "learning_rate": 9.182677564999373e-06, + "loss": 0.4721, + "step": 27845 + }, + { + "epoch": 0.8127827228950825, + "grad_norm": 1.0190454412461618, + "learning_rate": 9.176374409090415e-06, + "loss": 0.5517, + "step": 27850 + }, + { + "epoch": 0.8129286443893186, + "grad_norm": 0.9827922185977388, + "learning_rate": 9.170075520179911e-06, + "loss": 0.581, + "step": 27855 + }, + { + "epoch": 0.8130745658835546, + "grad_norm": 1.2429780460859878, + "learning_rate": 9.163780899734664e-06, + "loss": 0.6059, + "step": 27860 + }, + { + "epoch": 0.8132204873777907, + "grad_norm": 1.1901717120842787, + "learning_rate": 9.157490549220499e-06, + "loss": 0.5467, + "step": 27865 + }, + { + "epoch": 0.8133664088720268, + "grad_norm": 1.199586783243458, + "learning_rate": 9.151204470102238e-06, + "loss": 0.5854, + "step": 27870 + }, + { + "epoch": 0.8135123303662629, + "grad_norm": 0.9925734983397883, + "learning_rate": 9.144922663843705e-06, + "loss": 0.5523, + "step": 27875 + }, + { + "epoch": 0.813658251860499, + "grad_norm": 0.996518396023555, + "learning_rate": 9.138645131907754e-06, + "loss": 0.5009, + "step": 27880 + }, + { + "epoch": 0.8138041733547352, + "grad_norm": 0.9651210286462376, + "learning_rate": 9.132371875756212e-06, + "loss": 0.5114, + "step": 27885 + }, + { + "epoch": 0.8139500948489713, + "grad_norm": 0.9642596226734768, + "learning_rate": 9.12610289684993e-06, + "loss": 0.532, + "step": 27890 + }, + { + "epoch": 0.8140960163432074, + "grad_norm": 0.9317355303371309, + "learning_rate": 9.119838196648749e-06, + "loss": 0.5634, + "step": 27895 + }, + { + "epoch": 0.8142419378374435, + "grad_norm": 1.1931870554692583, + "learning_rate": 9.113577776611528e-06, + "loss": 0.5678, + "step": 27900 + }, + { + "epoch": 0.8143878593316796, + "grad_norm": 0.9705321649499273, + "learning_rate": 9.107321638196128e-06, + "loss": 0.4981, + "step": 27905 + }, + { + "epoch": 0.8145337808259157, + "grad_norm": 0.9848314424466137, + "learning_rate": 9.101069782859397e-06, + "loss": 0.5374, + "step": 27910 + }, + { + "epoch": 0.8146797023201517, + "grad_norm": 0.9849529556958224, + "learning_rate": 9.094822212057202e-06, + "loss": 0.4833, + "step": 27915 + }, + { + "epoch": 0.8148256238143878, + "grad_norm": 1.113136201208091, + "learning_rate": 9.0885789272444e-06, + "loss": 0.4873, + "step": 27920 + }, + { + "epoch": 0.8149715453086239, + "grad_norm": 1.1096102421006975, + "learning_rate": 9.082339929874866e-06, + "loss": 0.5669, + "step": 27925 + }, + { + "epoch": 0.81511746680286, + "grad_norm": 1.0755514707794318, + "learning_rate": 9.076105221401463e-06, + "loss": 0.4828, + "step": 27930 + }, + { + "epoch": 0.8152633882970962, + "grad_norm": 1.3527511108946093, + "learning_rate": 9.069874803276057e-06, + "loss": 0.5815, + "step": 27935 + }, + { + "epoch": 0.8154093097913323, + "grad_norm": 1.1278287118302506, + "learning_rate": 9.06364867694952e-06, + "loss": 0.5447, + "step": 27940 + }, + { + "epoch": 0.8155552312855684, + "grad_norm": 1.2232301347475611, + "learning_rate": 9.057426843871717e-06, + "loss": 0.5642, + "step": 27945 + }, + { + "epoch": 0.8157011527798045, + "grad_norm": 1.1861420741796393, + "learning_rate": 9.051209305491525e-06, + "loss": 0.5632, + "step": 27950 + }, + { + "epoch": 0.8158470742740406, + "grad_norm": 1.0056647253659428, + "learning_rate": 9.044996063256805e-06, + "loss": 0.5142, + "step": 27955 + }, + { + "epoch": 0.8159929957682767, + "grad_norm": 1.171597051449102, + "learning_rate": 9.038787118614445e-06, + "loss": 0.5709, + "step": 27960 + }, + { + "epoch": 0.8161389172625128, + "grad_norm": 1.0595601349295236, + "learning_rate": 9.03258247301029e-06, + "loss": 0.5356, + "step": 27965 + }, + { + "epoch": 0.8162848387567488, + "grad_norm": 1.0476062087564835, + "learning_rate": 9.026382127889223e-06, + "loss": 0.5915, + "step": 27970 + }, + { + "epoch": 0.8164307602509849, + "grad_norm": 0.941600054199794, + "learning_rate": 9.02018608469511e-06, + "loss": 0.5205, + "step": 27975 + }, + { + "epoch": 0.816576681745221, + "grad_norm": 1.0309150295288003, + "learning_rate": 9.013994344870807e-06, + "loss": 0.5456, + "step": 27980 + }, + { + "epoch": 0.8167226032394572, + "grad_norm": 0.998801669629097, + "learning_rate": 9.00780690985818e-06, + "loss": 0.5401, + "step": 27985 + }, + { + "epoch": 0.8168685247336933, + "grad_norm": 1.079199015163746, + "learning_rate": 9.001623781098085e-06, + "loss": 0.5073, + "step": 27990 + }, + { + "epoch": 0.8170144462279294, + "grad_norm": 1.037364112733121, + "learning_rate": 8.99544496003039e-06, + "loss": 0.5135, + "step": 27995 + }, + { + "epoch": 0.8171603677221655, + "grad_norm": 0.9737609701743194, + "learning_rate": 8.989270448093943e-06, + "loss": 0.5131, + "step": 28000 + }, + { + "epoch": 0.8173062892164016, + "grad_norm": 1.0652790053330017, + "learning_rate": 8.98310024672659e-06, + "loss": 0.567, + "step": 28005 + }, + { + "epoch": 0.8174522107106377, + "grad_norm": 1.032025735214532, + "learning_rate": 8.976934357365183e-06, + "loss": 0.527, + "step": 28010 + }, + { + "epoch": 0.8175981322048738, + "grad_norm": 0.9831969342446276, + "learning_rate": 8.970772781445557e-06, + "loss": 0.5093, + "step": 28015 + }, + { + "epoch": 0.8177440536991099, + "grad_norm": 1.0413412740147545, + "learning_rate": 8.964615520402562e-06, + "loss": 0.5775, + "step": 28020 + }, + { + "epoch": 0.8178899751933459, + "grad_norm": 1.0107369579762189, + "learning_rate": 8.95846257567002e-06, + "loss": 0.5999, + "step": 28025 + }, + { + "epoch": 0.818035896687582, + "grad_norm": 0.9411778469432223, + "learning_rate": 8.952313948680773e-06, + "loss": 0.502, + "step": 28030 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.0656401668805855, + "learning_rate": 8.946169640866625e-06, + "loss": 0.5612, + "step": 28035 + }, + { + "epoch": 0.8183277396760543, + "grad_norm": 0.956707045458958, + "learning_rate": 8.940029653658407e-06, + "loss": 0.4698, + "step": 28040 + }, + { + "epoch": 0.8184736611702904, + "grad_norm": 0.8905059535363938, + "learning_rate": 8.933893988485927e-06, + "loss": 0.4924, + "step": 28045 + }, + { + "epoch": 0.8186195826645265, + "grad_norm": 1.1552375069009455, + "learning_rate": 8.927762646777988e-06, + "loss": 0.5288, + "step": 28050 + }, + { + "epoch": 0.8187655041587626, + "grad_norm": 1.1103000459352588, + "learning_rate": 8.921635629962383e-06, + "loss": 0.5049, + "step": 28055 + }, + { + "epoch": 0.8189114256529987, + "grad_norm": 1.0887474143422102, + "learning_rate": 8.915512939465903e-06, + "loss": 0.5544, + "step": 28060 + }, + { + "epoch": 0.8190573471472348, + "grad_norm": 1.2835730826144744, + "learning_rate": 8.909394576714342e-06, + "loss": 0.5687, + "step": 28065 + }, + { + "epoch": 0.8192032686414709, + "grad_norm": 0.891441482662223, + "learning_rate": 8.90328054313246e-06, + "loss": 0.5479, + "step": 28070 + }, + { + "epoch": 0.819349190135707, + "grad_norm": 1.0411416067683397, + "learning_rate": 8.89717084014404e-06, + "loss": 0.5512, + "step": 28075 + }, + { + "epoch": 0.819495111629943, + "grad_norm": 0.9101170804735373, + "learning_rate": 8.891065469171827e-06, + "loss": 0.4968, + "step": 28080 + }, + { + "epoch": 0.8196410331241792, + "grad_norm": 1.0574828395888316, + "learning_rate": 8.88496443163757e-06, + "loss": 0.5291, + "step": 28085 + }, + { + "epoch": 0.8197869546184153, + "grad_norm": 1.040209554091426, + "learning_rate": 8.87886772896202e-06, + "loss": 0.5508, + "step": 28090 + }, + { + "epoch": 0.8199328761126514, + "grad_norm": 1.088897895777659, + "learning_rate": 8.872775362564897e-06, + "loss": 0.5252, + "step": 28095 + }, + { + "epoch": 0.8200787976068875, + "grad_norm": 1.1177754534567514, + "learning_rate": 8.86668733386494e-06, + "loss": 0.5308, + "step": 28100 + }, + { + "epoch": 0.8202247191011236, + "grad_norm": 1.0953068065459572, + "learning_rate": 8.860603644279836e-06, + "loss": 0.5429, + "step": 28105 + }, + { + "epoch": 0.8203706405953597, + "grad_norm": 1.1047356183143968, + "learning_rate": 8.854524295226304e-06, + "loss": 0.5751, + "step": 28110 + }, + { + "epoch": 0.8205165620895958, + "grad_norm": 1.0343611392911816, + "learning_rate": 8.848449288120028e-06, + "loss": 0.5256, + "step": 28115 + }, + { + "epoch": 0.8206624835838319, + "grad_norm": 0.9750535414852431, + "learning_rate": 8.84237862437568e-06, + "loss": 0.5346, + "step": 28120 + }, + { + "epoch": 0.820808405078068, + "grad_norm": 0.9025739737203358, + "learning_rate": 8.836312305406946e-06, + "loss": 0.4982, + "step": 28125 + }, + { + "epoch": 0.820954326572304, + "grad_norm": 1.1475953183753795, + "learning_rate": 8.830250332626461e-06, + "loss": 0.5314, + "step": 28130 + }, + { + "epoch": 0.8211002480665403, + "grad_norm": 1.0874575516504863, + "learning_rate": 8.82419270744588e-06, + "loss": 0.5377, + "step": 28135 + }, + { + "epoch": 0.8212461695607763, + "grad_norm": 1.095453790300035, + "learning_rate": 8.818139431275829e-06, + "loss": 0.5775, + "step": 28140 + }, + { + "epoch": 0.8213920910550124, + "grad_norm": 1.1713486802735658, + "learning_rate": 8.812090505525938e-06, + "loss": 0.5893, + "step": 28145 + }, + { + "epoch": 0.8215380125492485, + "grad_norm": 1.216258057259605, + "learning_rate": 8.806045931604792e-06, + "loss": 0.5276, + "step": 28150 + }, + { + "epoch": 0.8216839340434846, + "grad_norm": 1.0214076925902091, + "learning_rate": 8.80000571092e-06, + "loss": 0.5746, + "step": 28155 + }, + { + "epoch": 0.8218298555377207, + "grad_norm": 1.0048347057821552, + "learning_rate": 8.793969844878135e-06, + "loss": 0.5198, + "step": 28160 + }, + { + "epoch": 0.8219757770319568, + "grad_norm": 0.9353233099340699, + "learning_rate": 8.787938334884754e-06, + "loss": 0.5083, + "step": 28165 + }, + { + "epoch": 0.8221216985261929, + "grad_norm": 0.9939063910839164, + "learning_rate": 8.781911182344426e-06, + "loss": 0.5522, + "step": 28170 + }, + { + "epoch": 0.822267620020429, + "grad_norm": 0.9079581624624045, + "learning_rate": 8.775888388660665e-06, + "loss": 0.5321, + "step": 28175 + }, + { + "epoch": 0.8224135415146651, + "grad_norm": 1.0726363179867262, + "learning_rate": 8.769869955236001e-06, + "loss": 0.5352, + "step": 28180 + }, + { + "epoch": 0.8225594630089013, + "grad_norm": 0.959918415461413, + "learning_rate": 8.763855883471938e-06, + "loss": 0.5042, + "step": 28185 + }, + { + "epoch": 0.8227053845031373, + "grad_norm": 1.0977239243156898, + "learning_rate": 8.75784617476896e-06, + "loss": 0.5076, + "step": 28190 + }, + { + "epoch": 0.8228513059973734, + "grad_norm": 0.970517053477958, + "learning_rate": 8.751840830526551e-06, + "loss": 0.5291, + "step": 28195 + }, + { + "epoch": 0.8229972274916095, + "grad_norm": 1.0383910002614876, + "learning_rate": 8.745839852143153e-06, + "loss": 0.5608, + "step": 28200 + }, + { + "epoch": 0.8231431489858456, + "grad_norm": 1.083696137125219, + "learning_rate": 8.739843241016218e-06, + "loss": 0.5174, + "step": 28205 + }, + { + "epoch": 0.8232890704800817, + "grad_norm": 0.9979652126326642, + "learning_rate": 8.733850998542159e-06, + "loss": 0.5277, + "step": 28210 + }, + { + "epoch": 0.8234349919743178, + "grad_norm": 1.1333915467510103, + "learning_rate": 8.727863126116391e-06, + "loss": 0.5568, + "step": 28215 + }, + { + "epoch": 0.8235809134685539, + "grad_norm": 1.0254876309538437, + "learning_rate": 8.7218796251333e-06, + "loss": 0.5118, + "step": 28220 + }, + { + "epoch": 0.82372683496279, + "grad_norm": 1.2602776551773491, + "learning_rate": 8.71590049698625e-06, + "loss": 0.5721, + "step": 28225 + }, + { + "epoch": 0.8238727564570261, + "grad_norm": 1.0831427282107986, + "learning_rate": 8.7099257430676e-06, + "loss": 0.5355, + "step": 28230 + }, + { + "epoch": 0.8240186779512623, + "grad_norm": 1.0171223226081503, + "learning_rate": 8.703955364768674e-06, + "loss": 0.488, + "step": 28235 + }, + { + "epoch": 0.8241645994454984, + "grad_norm": 0.9708185384944034, + "learning_rate": 8.697989363479802e-06, + "loss": 0.5465, + "step": 28240 + }, + { + "epoch": 0.8243105209397344, + "grad_norm": 1.105057452779647, + "learning_rate": 8.692027740590255e-06, + "loss": 0.5324, + "step": 28245 + }, + { + "epoch": 0.8244564424339705, + "grad_norm": 0.9668363008976052, + "learning_rate": 8.68607049748833e-06, + "loss": 0.5732, + "step": 28250 + }, + { + "epoch": 0.8246023639282066, + "grad_norm": 1.033607550446259, + "learning_rate": 8.680117635561274e-06, + "loss": 0.5439, + "step": 28255 + }, + { + "epoch": 0.8247482854224427, + "grad_norm": 1.1332460262963568, + "learning_rate": 8.674169156195314e-06, + "loss": 0.5383, + "step": 28260 + }, + { + "epoch": 0.8248942069166788, + "grad_norm": 0.9704722276677873, + "learning_rate": 8.668225060775684e-06, + "loss": 0.5049, + "step": 28265 + }, + { + "epoch": 0.8250401284109149, + "grad_norm": 1.0734520785174781, + "learning_rate": 8.662285350686555e-06, + "loss": 0.5593, + "step": 28270 + }, + { + "epoch": 0.825186049905151, + "grad_norm": 1.2226039397833643, + "learning_rate": 8.656350027311116e-06, + "loss": 0.5659, + "step": 28275 + }, + { + "epoch": 0.8253319713993871, + "grad_norm": 1.0981170973257064, + "learning_rate": 8.65041909203151e-06, + "loss": 0.5265, + "step": 28280 + }, + { + "epoch": 0.8254778928936233, + "grad_norm": 1.004740060648132, + "learning_rate": 8.644492546228872e-06, + "loss": 0.5655, + "step": 28285 + }, + { + "epoch": 0.8256238143878594, + "grad_norm": 1.0108501955307818, + "learning_rate": 8.638570391283308e-06, + "loss": 0.5371, + "step": 28290 + }, + { + "epoch": 0.8257697358820955, + "grad_norm": 0.899541588686353, + "learning_rate": 8.632652628573895e-06, + "loss": 0.4771, + "step": 28295 + }, + { + "epoch": 0.8259156573763315, + "grad_norm": 1.0296300858875094, + "learning_rate": 8.6267392594787e-06, + "loss": 0.5124, + "step": 28300 + }, + { + "epoch": 0.8260615788705676, + "grad_norm": 1.1133524034065478, + "learning_rate": 8.620830285374759e-06, + "loss": 0.5804, + "step": 28305 + }, + { + "epoch": 0.8262075003648037, + "grad_norm": 0.9411745150253564, + "learning_rate": 8.61492570763809e-06, + "loss": 0.5521, + "step": 28310 + }, + { + "epoch": 0.8263534218590398, + "grad_norm": 1.0599675669577968, + "learning_rate": 8.609025527643686e-06, + "loss": 0.5301, + "step": 28315 + }, + { + "epoch": 0.8264993433532759, + "grad_norm": 1.0840057842616877, + "learning_rate": 8.603129746765509e-06, + "loss": 0.5521, + "step": 28320 + }, + { + "epoch": 0.826645264847512, + "grad_norm": 1.0165467852720114, + "learning_rate": 8.5972383663765e-06, + "loss": 0.5052, + "step": 28325 + }, + { + "epoch": 0.8267911863417481, + "grad_norm": 0.8989814474686102, + "learning_rate": 8.591351387848587e-06, + "loss": 0.5164, + "step": 28330 + }, + { + "epoch": 0.8269371078359843, + "grad_norm": 0.9118256489721082, + "learning_rate": 8.585468812552659e-06, + "loss": 0.5644, + "step": 28335 + }, + { + "epoch": 0.8270830293302204, + "grad_norm": 1.0532162727122696, + "learning_rate": 8.579590641858573e-06, + "loss": 0.5577, + "step": 28340 + }, + { + "epoch": 0.8272289508244565, + "grad_norm": 1.257313215284296, + "learning_rate": 8.573716877135183e-06, + "loss": 0.5225, + "step": 28345 + }, + { + "epoch": 0.8273748723186926, + "grad_norm": 1.0170258356679212, + "learning_rate": 8.567847519750295e-06, + "loss": 0.5631, + "step": 28350 + }, + { + "epoch": 0.8275207938129286, + "grad_norm": 0.9929653597187704, + "learning_rate": 8.561982571070712e-06, + "loss": 0.5596, + "step": 28355 + }, + { + "epoch": 0.8276667153071647, + "grad_norm": 1.0160876758399313, + "learning_rate": 8.556122032462185e-06, + "loss": 0.5724, + "step": 28360 + }, + { + "epoch": 0.8278126368014008, + "grad_norm": 1.141845250660919, + "learning_rate": 8.550265905289459e-06, + "loss": 0.5187, + "step": 28365 + }, + { + "epoch": 0.8279585582956369, + "grad_norm": 1.1030789903200593, + "learning_rate": 8.544414190916233e-06, + "loss": 0.5549, + "step": 28370 + }, + { + "epoch": 0.828104479789873, + "grad_norm": 1.083840841464789, + "learning_rate": 8.53856689070519e-06, + "loss": 0.6088, + "step": 28375 + }, + { + "epoch": 0.8282504012841091, + "grad_norm": 1.0822565265068411, + "learning_rate": 8.532724006017992e-06, + "loss": 0.513, + "step": 28380 + }, + { + "epoch": 0.8283963227783453, + "grad_norm": 0.9196671456825369, + "learning_rate": 8.526885538215253e-06, + "loss": 0.5069, + "step": 28385 + }, + { + "epoch": 0.8285422442725814, + "grad_norm": 1.2076770396945309, + "learning_rate": 8.521051488656578e-06, + "loss": 0.5434, + "step": 28390 + }, + { + "epoch": 0.8286881657668175, + "grad_norm": 1.0448804213015241, + "learning_rate": 8.515221858700526e-06, + "loss": 0.5506, + "step": 28395 + }, + { + "epoch": 0.8288340872610536, + "grad_norm": 1.1183658470134425, + "learning_rate": 8.509396649704641e-06, + "loss": 0.5518, + "step": 28400 + }, + { + "epoch": 0.8289800087552897, + "grad_norm": 1.1529160634805338, + "learning_rate": 8.503575863025433e-06, + "loss": 0.5206, + "step": 28405 + }, + { + "epoch": 0.8291259302495257, + "grad_norm": 0.9076697602378971, + "learning_rate": 8.497759500018377e-06, + "loss": 0.5025, + "step": 28410 + }, + { + "epoch": 0.8292718517437618, + "grad_norm": 1.0777764594570893, + "learning_rate": 8.491947562037922e-06, + "loss": 0.5285, + "step": 28415 + }, + { + "epoch": 0.8294177732379979, + "grad_norm": 1.0732513751571446, + "learning_rate": 8.486140050437479e-06, + "loss": 0.51, + "step": 28420 + }, + { + "epoch": 0.829563694732234, + "grad_norm": 1.034793987011815, + "learning_rate": 8.480336966569451e-06, + "loss": 0.5097, + "step": 28425 + }, + { + "epoch": 0.8297096162264701, + "grad_norm": 1.0730145674433134, + "learning_rate": 8.474538311785188e-06, + "loss": 0.5204, + "step": 28430 + }, + { + "epoch": 0.8298555377207063, + "grad_norm": 1.0071562603218471, + "learning_rate": 8.468744087435013e-06, + "loss": 0.5266, + "step": 28435 + }, + { + "epoch": 0.8300014592149424, + "grad_norm": 0.9491159383525267, + "learning_rate": 8.462954294868223e-06, + "loss": 0.5544, + "step": 28440 + }, + { + "epoch": 0.8301473807091785, + "grad_norm": 1.0802805101983743, + "learning_rate": 8.457168935433068e-06, + "loss": 0.551, + "step": 28445 + }, + { + "epoch": 0.8302933022034146, + "grad_norm": 1.1076149901445336, + "learning_rate": 8.451388010476794e-06, + "loss": 0.524, + "step": 28450 + }, + { + "epoch": 0.8304392236976507, + "grad_norm": 1.0582825856257356, + "learning_rate": 8.445611521345587e-06, + "loss": 0.5551, + "step": 28455 + }, + { + "epoch": 0.8305851451918868, + "grad_norm": 1.1098473641565707, + "learning_rate": 8.439839469384614e-06, + "loss": 0.5536, + "step": 28460 + }, + { + "epoch": 0.8307310666861228, + "grad_norm": 1.0264150557442315, + "learning_rate": 8.434071855937998e-06, + "loss": 0.5425, + "step": 28465 + }, + { + "epoch": 0.8308769881803589, + "grad_norm": 1.2411340187542772, + "learning_rate": 8.428308682348845e-06, + "loss": 0.5388, + "step": 28470 + }, + { + "epoch": 0.831022909674595, + "grad_norm": 1.1225591692932502, + "learning_rate": 8.422549949959216e-06, + "loss": 0.5809, + "step": 28475 + }, + { + "epoch": 0.8311688311688312, + "grad_norm": 1.1221056016126487, + "learning_rate": 8.416795660110135e-06, + "loss": 0.5534, + "step": 28480 + }, + { + "epoch": 0.8313147526630673, + "grad_norm": 1.0054239316913236, + "learning_rate": 8.411045814141598e-06, + "loss": 0.5221, + "step": 28485 + }, + { + "epoch": 0.8314606741573034, + "grad_norm": 0.96367337559212, + "learning_rate": 8.40530041339256e-06, + "loss": 0.5608, + "step": 28490 + }, + { + "epoch": 0.8316065956515395, + "grad_norm": 1.149264757579184, + "learning_rate": 8.399559459200955e-06, + "loss": 0.5799, + "step": 28495 + }, + { + "epoch": 0.8317525171457756, + "grad_norm": 1.1033586848665635, + "learning_rate": 8.393822952903666e-06, + "loss": 0.5437, + "step": 28500 + }, + { + "epoch": 0.8318984386400117, + "grad_norm": 0.9867967183849311, + "learning_rate": 8.388090895836542e-06, + "loss": 0.544, + "step": 28505 + }, + { + "epoch": 0.8320443601342478, + "grad_norm": 0.9994423911169564, + "learning_rate": 8.382363289334402e-06, + "loss": 0.5356, + "step": 28510 + }, + { + "epoch": 0.8321902816284839, + "grad_norm": 0.8585144235536971, + "learning_rate": 8.376640134731024e-06, + "loss": 0.5004, + "step": 28515 + }, + { + "epoch": 0.8323362031227199, + "grad_norm": 1.100973418503936, + "learning_rate": 8.370921433359158e-06, + "loss": 0.5689, + "step": 28520 + }, + { + "epoch": 0.832482124616956, + "grad_norm": 1.10027922106769, + "learning_rate": 8.365207186550507e-06, + "loss": 0.5634, + "step": 28525 + }, + { + "epoch": 0.8326280461111922, + "grad_norm": 0.8913974395967492, + "learning_rate": 8.359497395635737e-06, + "loss": 0.4718, + "step": 28530 + }, + { + "epoch": 0.8327739676054283, + "grad_norm": 1.0219082854602857, + "learning_rate": 8.35379206194448e-06, + "loss": 0.5205, + "step": 28535 + }, + { + "epoch": 0.8329198890996644, + "grad_norm": 0.9634836976040918, + "learning_rate": 8.348091186805337e-06, + "loss": 0.4809, + "step": 28540 + }, + { + "epoch": 0.8330658105939005, + "grad_norm": 1.088646281085902, + "learning_rate": 8.342394771545856e-06, + "loss": 0.5374, + "step": 28545 + }, + { + "epoch": 0.8332117320881366, + "grad_norm": 0.986038026809769, + "learning_rate": 8.336702817492554e-06, + "loss": 0.5203, + "step": 28550 + }, + { + "epoch": 0.8333576535823727, + "grad_norm": 1.1086873720378472, + "learning_rate": 8.331015325970912e-06, + "loss": 0.5327, + "step": 28555 + }, + { + "epoch": 0.8335035750766088, + "grad_norm": 0.9822926656397626, + "learning_rate": 8.325332298305364e-06, + "loss": 0.4861, + "step": 28560 + }, + { + "epoch": 0.8336494965708449, + "grad_norm": 0.9687290575187417, + "learning_rate": 8.319653735819314e-06, + "loss": 0.5697, + "step": 28565 + }, + { + "epoch": 0.833795418065081, + "grad_norm": 0.9589075781558747, + "learning_rate": 8.313979639835115e-06, + "loss": 0.5325, + "step": 28570 + }, + { + "epoch": 0.833941339559317, + "grad_norm": 1.0331155781807098, + "learning_rate": 8.3083100116741e-06, + "loss": 0.5207, + "step": 28575 + }, + { + "epoch": 0.8340872610535532, + "grad_norm": 1.059006857155687, + "learning_rate": 8.30264485265653e-06, + "loss": 0.5363, + "step": 28580 + }, + { + "epoch": 0.8342331825477893, + "grad_norm": 1.0248103398743924, + "learning_rate": 8.296984164101656e-06, + "loss": 0.5517, + "step": 28585 + }, + { + "epoch": 0.8343791040420254, + "grad_norm": 1.019010704777994, + "learning_rate": 8.291327947327673e-06, + "loss": 0.578, + "step": 28590 + }, + { + "epoch": 0.8345250255362615, + "grad_norm": 1.0613855789308455, + "learning_rate": 8.285676203651729e-06, + "loss": 0.5535, + "step": 28595 + }, + { + "epoch": 0.8346709470304976, + "grad_norm": 1.1123191800024483, + "learning_rate": 8.280028934389947e-06, + "loss": 0.5931, + "step": 28600 + }, + { + "epoch": 0.8348168685247337, + "grad_norm": 0.9181602846127165, + "learning_rate": 8.27438614085739e-06, + "loss": 0.5171, + "step": 28605 + }, + { + "epoch": 0.8349627900189698, + "grad_norm": 1.0824412807674593, + "learning_rate": 8.268747824368098e-06, + "loss": 0.4973, + "step": 28610 + }, + { + "epoch": 0.8351087115132059, + "grad_norm": 1.146386789160925, + "learning_rate": 8.263113986235053e-06, + "loss": 0.5661, + "step": 28615 + }, + { + "epoch": 0.835254633007442, + "grad_norm": 0.9747110943565919, + "learning_rate": 8.257484627770198e-06, + "loss": 0.5244, + "step": 28620 + }, + { + "epoch": 0.835400554501678, + "grad_norm": 0.9233288735046112, + "learning_rate": 8.25185975028444e-06, + "loss": 0.4955, + "step": 28625 + }, + { + "epoch": 0.8355464759959143, + "grad_norm": 1.050275123363343, + "learning_rate": 8.246239355087629e-06, + "loss": 0.5425, + "step": 28630 + }, + { + "epoch": 0.8356923974901503, + "grad_norm": 0.9500637696242519, + "learning_rate": 8.240623443488582e-06, + "loss": 0.5454, + "step": 28635 + }, + { + "epoch": 0.8358383189843864, + "grad_norm": 1.1290070942804287, + "learning_rate": 8.23501201679507e-06, + "loss": 0.5765, + "step": 28640 + }, + { + "epoch": 0.8359842404786225, + "grad_norm": 1.0704560214774832, + "learning_rate": 8.229405076313828e-06, + "loss": 0.5862, + "step": 28645 + }, + { + "epoch": 0.8361301619728586, + "grad_norm": 1.1897397905140032, + "learning_rate": 8.223802623350517e-06, + "loss": 0.5786, + "step": 28650 + }, + { + "epoch": 0.8362760834670947, + "grad_norm": 1.0555545496877718, + "learning_rate": 8.218204659209786e-06, + "loss": 0.5486, + "step": 28655 + }, + { + "epoch": 0.8364220049613308, + "grad_norm": 1.159109589314259, + "learning_rate": 8.212611185195224e-06, + "loss": 0.5714, + "step": 28660 + }, + { + "epoch": 0.8365679264555669, + "grad_norm": 1.1027782964026718, + "learning_rate": 8.207022202609371e-06, + "loss": 0.5055, + "step": 28665 + }, + { + "epoch": 0.836713847949803, + "grad_norm": 1.1051130134821823, + "learning_rate": 8.201437712753738e-06, + "loss": 0.5505, + "step": 28670 + }, + { + "epoch": 0.8368597694440391, + "grad_norm": 1.0229282931915, + "learning_rate": 8.195857716928759e-06, + "loss": 0.4824, + "step": 28675 + }, + { + "epoch": 0.8370056909382753, + "grad_norm": 1.0781369446828093, + "learning_rate": 8.190282216433856e-06, + "loss": 0.5272, + "step": 28680 + }, + { + "epoch": 0.8371516124325113, + "grad_norm": 0.9882931415660549, + "learning_rate": 8.184711212567384e-06, + "loss": 0.5646, + "step": 28685 + }, + { + "epoch": 0.8372975339267474, + "grad_norm": 1.03038850668961, + "learning_rate": 8.179144706626653e-06, + "loss": 0.529, + "step": 28690 + }, + { + "epoch": 0.8374434554209835, + "grad_norm": 1.0291082492959558, + "learning_rate": 8.173582699907929e-06, + "loss": 0.5273, + "step": 28695 + }, + { + "epoch": 0.8375893769152196, + "grad_norm": 0.9611983840125431, + "learning_rate": 8.168025193706422e-06, + "loss": 0.556, + "step": 28700 + }, + { + "epoch": 0.8377352984094557, + "grad_norm": 1.1370373493521226, + "learning_rate": 8.162472189316318e-06, + "loss": 0.5355, + "step": 28705 + }, + { + "epoch": 0.8378812199036918, + "grad_norm": 1.0119000872423265, + "learning_rate": 8.156923688030719e-06, + "loss": 0.5184, + "step": 28710 + }, + { + "epoch": 0.8380271413979279, + "grad_norm": 1.1622485116258077, + "learning_rate": 8.151379691141716e-06, + "loss": 0.5407, + "step": 28715 + }, + { + "epoch": 0.838173062892164, + "grad_norm": 1.0062494274676266, + "learning_rate": 8.145840199940318e-06, + "loss": 0.5152, + "step": 28720 + }, + { + "epoch": 0.8383189843864001, + "grad_norm": 0.9899267035539158, + "learning_rate": 8.140305215716504e-06, + "loss": 0.5307, + "step": 28725 + }, + { + "epoch": 0.8384649058806363, + "grad_norm": 1.0412525428999533, + "learning_rate": 8.134774739759197e-06, + "loss": 0.533, + "step": 28730 + }, + { + "epoch": 0.8386108273748724, + "grad_norm": 1.0004537666802642, + "learning_rate": 8.129248773356271e-06, + "loss": 0.5007, + "step": 28735 + }, + { + "epoch": 0.8387567488691084, + "grad_norm": 1.0829581922079066, + "learning_rate": 8.12372731779456e-06, + "loss": 0.5432, + "step": 28740 + }, + { + "epoch": 0.8389026703633445, + "grad_norm": 0.9583958927864793, + "learning_rate": 8.118210374359821e-06, + "loss": 0.5093, + "step": 28745 + }, + { + "epoch": 0.8390485918575806, + "grad_norm": 0.913992901106083, + "learning_rate": 8.112697944336788e-06, + "loss": 0.5101, + "step": 28750 + }, + { + "epoch": 0.8391945133518167, + "grad_norm": 1.2014537477908251, + "learning_rate": 8.107190029009132e-06, + "loss": 0.5574, + "step": 28755 + }, + { + "epoch": 0.8393404348460528, + "grad_norm": 1.0132131985581958, + "learning_rate": 8.101686629659474e-06, + "loss": 0.5017, + "step": 28760 + }, + { + "epoch": 0.8394863563402889, + "grad_norm": 1.0710258587056813, + "learning_rate": 8.09618774756939e-06, + "loss": 0.5272, + "step": 28765 + }, + { + "epoch": 0.839632277834525, + "grad_norm": 1.2474873027348299, + "learning_rate": 8.090693384019383e-06, + "loss": 0.5155, + "step": 28770 + }, + { + "epoch": 0.8397781993287611, + "grad_norm": 1.20166957409939, + "learning_rate": 8.085203540288929e-06, + "loss": 0.5416, + "step": 28775 + }, + { + "epoch": 0.8399241208229973, + "grad_norm": 1.114488990711099, + "learning_rate": 8.07971821765643e-06, + "loss": 0.5266, + "step": 28780 + }, + { + "epoch": 0.8400700423172334, + "grad_norm": 1.0731360915713228, + "learning_rate": 8.074237417399264e-06, + "loss": 0.5445, + "step": 28785 + }, + { + "epoch": 0.8402159638114695, + "grad_norm": 1.1664358796426688, + "learning_rate": 8.068761140793715e-06, + "loss": 0.5681, + "step": 28790 + }, + { + "epoch": 0.8403618853057055, + "grad_norm": 1.1015874636305765, + "learning_rate": 8.063289389115057e-06, + "loss": 0.5387, + "step": 28795 + }, + { + "epoch": 0.8405078067999416, + "grad_norm": 0.9549948952316141, + "learning_rate": 8.057822163637476e-06, + "loss": 0.5227, + "step": 28800 + }, + { + "epoch": 0.8406537282941777, + "grad_norm": 1.029128241363529, + "learning_rate": 8.052359465634122e-06, + "loss": 0.5286, + "step": 28805 + }, + { + "epoch": 0.8407996497884138, + "grad_norm": 0.9186237891163542, + "learning_rate": 8.046901296377093e-06, + "loss": 0.5178, + "step": 28810 + }, + { + "epoch": 0.8409455712826499, + "grad_norm": 1.0339676015107346, + "learning_rate": 8.041447657137411e-06, + "loss": 0.582, + "step": 28815 + }, + { + "epoch": 0.841091492776886, + "grad_norm": 1.0642172182405518, + "learning_rate": 8.035998549185069e-06, + "loss": 0.5635, + "step": 28820 + }, + { + "epoch": 0.8412374142711221, + "grad_norm": 0.9247459867156482, + "learning_rate": 8.030553973788985e-06, + "loss": 0.5449, + "step": 28825 + }, + { + "epoch": 0.8413833357653583, + "grad_norm": 1.224868752668849, + "learning_rate": 8.025113932217043e-06, + "loss": 0.5414, + "step": 28830 + }, + { + "epoch": 0.8415292572595944, + "grad_norm": 0.9502953553082312, + "learning_rate": 8.01967842573605e-06, + "loss": 0.5298, + "step": 28835 + }, + { + "epoch": 0.8416751787538305, + "grad_norm": 1.0874792108850762, + "learning_rate": 8.014247455611764e-06, + "loss": 0.5548, + "step": 28840 + }, + { + "epoch": 0.8418211002480666, + "grad_norm": 0.9818337211906557, + "learning_rate": 8.00882102310889e-06, + "loss": 0.5296, + "step": 28845 + }, + { + "epoch": 0.8419670217423026, + "grad_norm": 1.175721516795925, + "learning_rate": 8.003399129491069e-06, + "loss": 0.5581, + "step": 28850 + }, + { + "epoch": 0.8421129432365387, + "grad_norm": 1.0470979128166304, + "learning_rate": 7.997981776020907e-06, + "loss": 0.5269, + "step": 28855 + }, + { + "epoch": 0.8422588647307748, + "grad_norm": 1.1079233534035513, + "learning_rate": 7.992568963959914e-06, + "loss": 0.5355, + "step": 28860 + }, + { + "epoch": 0.8424047862250109, + "grad_norm": 0.8833179696592207, + "learning_rate": 7.987160694568576e-06, + "loss": 0.5415, + "step": 28865 + }, + { + "epoch": 0.842550707719247, + "grad_norm": 0.9494362405020401, + "learning_rate": 7.98175696910631e-06, + "loss": 0.4982, + "step": 28870 + }, + { + "epoch": 0.8426966292134831, + "grad_norm": 1.2477074151620577, + "learning_rate": 7.976357788831468e-06, + "loss": 0.5826, + "step": 28875 + }, + { + "epoch": 0.8428425507077193, + "grad_norm": 0.9811005680142201, + "learning_rate": 7.970963155001362e-06, + "loss": 0.5423, + "step": 28880 + }, + { + "epoch": 0.8429884722019554, + "grad_norm": 1.0398149955487936, + "learning_rate": 7.965573068872218e-06, + "loss": 0.5023, + "step": 28885 + }, + { + "epoch": 0.8431343936961915, + "grad_norm": 1.004025921231077, + "learning_rate": 7.96018753169923e-06, + "loss": 0.5453, + "step": 28890 + }, + { + "epoch": 0.8432803151904276, + "grad_norm": 1.039886239170758, + "learning_rate": 7.954806544736513e-06, + "loss": 0.5124, + "step": 28895 + }, + { + "epoch": 0.8434262366846637, + "grad_norm": 1.0107755703127719, + "learning_rate": 7.949430109237137e-06, + "loss": 0.5231, + "step": 28900 + }, + { + "epoch": 0.8435721581788997, + "grad_norm": 0.990302120857384, + "learning_rate": 7.944058226453102e-06, + "loss": 0.5486, + "step": 28905 + }, + { + "epoch": 0.8437180796731358, + "grad_norm": 1.0223309498940814, + "learning_rate": 7.938690897635353e-06, + "loss": 0.5899, + "step": 28910 + }, + { + "epoch": 0.8438640011673719, + "grad_norm": 0.9605595533318095, + "learning_rate": 7.933328124033771e-06, + "loss": 0.5083, + "step": 28915 + }, + { + "epoch": 0.844009922661608, + "grad_norm": 0.9504214537266125, + "learning_rate": 7.927969906897173e-06, + "loss": 0.5552, + "step": 28920 + }, + { + "epoch": 0.8441558441558441, + "grad_norm": 1.0211576402563416, + "learning_rate": 7.922616247473332e-06, + "loss": 0.5527, + "step": 28925 + }, + { + "epoch": 0.8443017656500803, + "grad_norm": 0.9926792275004087, + "learning_rate": 7.91726714700894e-06, + "loss": 0.5095, + "step": 28930 + }, + { + "epoch": 0.8444476871443164, + "grad_norm": 1.0975816362819006, + "learning_rate": 7.911922606749634e-06, + "loss": 0.5363, + "step": 28935 + }, + { + "epoch": 0.8445936086385525, + "grad_norm": 0.9984023285668479, + "learning_rate": 7.906582627939994e-06, + "loss": 0.5522, + "step": 28940 + }, + { + "epoch": 0.8447395301327886, + "grad_norm": 1.043262724859837, + "learning_rate": 7.901247211823528e-06, + "loss": 0.5286, + "step": 28945 + }, + { + "epoch": 0.8448854516270247, + "grad_norm": 0.9674899976136536, + "learning_rate": 7.895916359642699e-06, + "loss": 0.4654, + "step": 28950 + }, + { + "epoch": 0.8450313731212608, + "grad_norm": 0.824003242529688, + "learning_rate": 7.890590072638877e-06, + "loss": 0.5189, + "step": 28955 + }, + { + "epoch": 0.8451772946154968, + "grad_norm": 1.0577531787174204, + "learning_rate": 7.885268352052402e-06, + "loss": 0.5382, + "step": 28960 + }, + { + "epoch": 0.8453232161097329, + "grad_norm": 1.0361514509787901, + "learning_rate": 7.87995119912253e-06, + "loss": 0.5172, + "step": 28965 + }, + { + "epoch": 0.845469137603969, + "grad_norm": 1.188232480309185, + "learning_rate": 7.874638615087462e-06, + "loss": 0.5337, + "step": 28970 + }, + { + "epoch": 0.8456150590982051, + "grad_norm": 1.0091505899203395, + "learning_rate": 7.869330601184336e-06, + "loss": 0.5618, + "step": 28975 + }, + { + "epoch": 0.8457609805924413, + "grad_norm": 1.0137750187902712, + "learning_rate": 7.864027158649215e-06, + "loss": 0.5584, + "step": 28980 + }, + { + "epoch": 0.8459069020866774, + "grad_norm": 0.992773766493352, + "learning_rate": 7.858728288717109e-06, + "loss": 0.5119, + "step": 28985 + }, + { + "epoch": 0.8460528235809135, + "grad_norm": 0.920133439807406, + "learning_rate": 7.853433992621953e-06, + "loss": 0.5336, + "step": 28990 + }, + { + "epoch": 0.8461987450751496, + "grad_norm": 0.8635377933533684, + "learning_rate": 7.848144271596631e-06, + "loss": 0.5141, + "step": 28995 + }, + { + "epoch": 0.8463446665693857, + "grad_norm": 0.9954243159172684, + "learning_rate": 7.842859126872949e-06, + "loss": 0.5689, + "step": 29000 + }, + { + "epoch": 0.8464905880636218, + "grad_norm": 1.0335549413132776, + "learning_rate": 7.837578559681652e-06, + "loss": 0.5354, + "step": 29005 + }, + { + "epoch": 0.8466365095578579, + "grad_norm": 1.0131267707665483, + "learning_rate": 7.832302571252418e-06, + "loss": 0.5459, + "step": 29010 + }, + { + "epoch": 0.8467824310520939, + "grad_norm": 0.9532925750661183, + "learning_rate": 7.82703116281386e-06, + "loss": 0.528, + "step": 29015 + }, + { + "epoch": 0.84692835254633, + "grad_norm": 1.0011634479318117, + "learning_rate": 7.821764335593524e-06, + "loss": 0.4985, + "step": 29020 + }, + { + "epoch": 0.8470742740405661, + "grad_norm": 0.916946067830353, + "learning_rate": 7.816502090817891e-06, + "loss": 0.4593, + "step": 29025 + }, + { + "epoch": 0.8472201955348023, + "grad_norm": 1.2908915911985754, + "learning_rate": 7.811244429712372e-06, + "loss": 0.5747, + "step": 29030 + }, + { + "epoch": 0.8473661170290384, + "grad_norm": 1.0475907074270305, + "learning_rate": 7.805991353501303e-06, + "loss": 0.5589, + "step": 29035 + }, + { + "epoch": 0.8475120385232745, + "grad_norm": 1.0050993561620114, + "learning_rate": 7.800742863407978e-06, + "loss": 0.5374, + "step": 29040 + }, + { + "epoch": 0.8476579600175106, + "grad_norm": 0.9157725715882323, + "learning_rate": 7.795498960654592e-06, + "loss": 0.5076, + "step": 29045 + }, + { + "epoch": 0.8478038815117467, + "grad_norm": 0.9866826543310594, + "learning_rate": 7.790259646462291e-06, + "loss": 0.5622, + "step": 29050 + }, + { + "epoch": 0.8479498030059828, + "grad_norm": 0.8709154790155849, + "learning_rate": 7.785024922051148e-06, + "loss": 0.5464, + "step": 29055 + }, + { + "epoch": 0.8480957245002189, + "grad_norm": 1.0186272960006317, + "learning_rate": 7.77979478864016e-06, + "loss": 0.5288, + "step": 29060 + }, + { + "epoch": 0.848241645994455, + "grad_norm": 1.181732058870617, + "learning_rate": 7.77456924744727e-06, + "loss": 0.5018, + "step": 29065 + }, + { + "epoch": 0.848387567488691, + "grad_norm": 0.9589909165194724, + "learning_rate": 7.769348299689342e-06, + "loss": 0.5384, + "step": 29070 + }, + { + "epoch": 0.8485334889829271, + "grad_norm": 0.8477710164307056, + "learning_rate": 7.764131946582164e-06, + "loss": 0.5258, + "step": 29075 + }, + { + "epoch": 0.8486794104771633, + "grad_norm": 0.9208784431749927, + "learning_rate": 7.758920189340462e-06, + "loss": 0.5413, + "step": 29080 + }, + { + "epoch": 0.8488253319713994, + "grad_norm": 1.2303515323518492, + "learning_rate": 7.753713029177899e-06, + "loss": 0.5726, + "step": 29085 + }, + { + "epoch": 0.8489712534656355, + "grad_norm": 0.9496198275718524, + "learning_rate": 7.748510467307055e-06, + "loss": 0.5267, + "step": 29090 + }, + { + "epoch": 0.8491171749598716, + "grad_norm": 0.9965271908352583, + "learning_rate": 7.743312504939442e-06, + "loss": 0.5863, + "step": 29095 + }, + { + "epoch": 0.8492630964541077, + "grad_norm": 1.099989548761296, + "learning_rate": 7.738119143285511e-06, + "loss": 0.5207, + "step": 29100 + }, + { + "epoch": 0.8494090179483438, + "grad_norm": 0.9593993920346373, + "learning_rate": 7.732930383554613e-06, + "loss": 0.5005, + "step": 29105 + }, + { + "epoch": 0.8495549394425799, + "grad_norm": 1.0007556620337354, + "learning_rate": 7.727746226955071e-06, + "loss": 0.5301, + "step": 29110 + }, + { + "epoch": 0.849700860936816, + "grad_norm": 0.9979739404571979, + "learning_rate": 7.7225666746941e-06, + "loss": 0.5144, + "step": 29115 + }, + { + "epoch": 0.849846782431052, + "grad_norm": 1.0030392955243845, + "learning_rate": 7.71739172797786e-06, + "loss": 0.5519, + "step": 29120 + }, + { + "epoch": 0.8499927039252881, + "grad_norm": 0.8989583501412702, + "learning_rate": 7.712221388011432e-06, + "loss": 0.5305, + "step": 29125 + }, + { + "epoch": 0.8501386254195243, + "grad_norm": 1.0627584780620454, + "learning_rate": 7.707055655998821e-06, + "loss": 0.5292, + "step": 29130 + }, + { + "epoch": 0.8502845469137604, + "grad_norm": 1.251002920468956, + "learning_rate": 7.701894533142974e-06, + "loss": 0.5609, + "step": 29135 + }, + { + "epoch": 0.8504304684079965, + "grad_norm": 0.9849854961762665, + "learning_rate": 7.696738020645751e-06, + "loss": 0.5382, + "step": 29140 + }, + { + "epoch": 0.8505763899022326, + "grad_norm": 1.0427387168596631, + "learning_rate": 7.691586119707942e-06, + "loss": 0.5319, + "step": 29145 + }, + { + "epoch": 0.8507223113964687, + "grad_norm": 1.1278618413710044, + "learning_rate": 7.686438831529258e-06, + "loss": 0.6103, + "step": 29150 + }, + { + "epoch": 0.8508682328907048, + "grad_norm": 1.057858831586323, + "learning_rate": 7.681296157308353e-06, + "loss": 0.5405, + "step": 29155 + }, + { + "epoch": 0.8510141543849409, + "grad_norm": 1.0191740421522106, + "learning_rate": 7.676158098242786e-06, + "loss": 0.5173, + "step": 29160 + }, + { + "epoch": 0.851160075879177, + "grad_norm": 0.9951918045696453, + "learning_rate": 7.671024655529053e-06, + "loss": 0.5042, + "step": 29165 + }, + { + "epoch": 0.8513059973734131, + "grad_norm": 0.9846268500510177, + "learning_rate": 7.66589583036257e-06, + "loss": 0.563, + "step": 29170 + }, + { + "epoch": 0.8514519188676493, + "grad_norm": 0.9567677584071608, + "learning_rate": 7.660771623937676e-06, + "loss": 0.5053, + "step": 29175 + }, + { + "epoch": 0.8515978403618853, + "grad_norm": 0.9677484857063607, + "learning_rate": 7.655652037447649e-06, + "loss": 0.5435, + "step": 29180 + }, + { + "epoch": 0.8517437618561214, + "grad_norm": 1.4106466861289337, + "learning_rate": 7.65053707208467e-06, + "loss": 0.6132, + "step": 29185 + }, + { + "epoch": 0.8518896833503575, + "grad_norm": 0.9950799720350395, + "learning_rate": 7.645426729039864e-06, + "loss": 0.5199, + "step": 29190 + }, + { + "epoch": 0.8520356048445936, + "grad_norm": 1.0698083048875968, + "learning_rate": 7.64032100950326e-06, + "loss": 0.5364, + "step": 29195 + }, + { + "epoch": 0.8521815263388297, + "grad_norm": 1.0565794819331658, + "learning_rate": 7.635219914663825e-06, + "loss": 0.5511, + "step": 29200 + }, + { + "epoch": 0.8523274478330658, + "grad_norm": 1.096010294736017, + "learning_rate": 7.630123445709445e-06, + "loss": 0.5493, + "step": 29205 + }, + { + "epoch": 0.8524733693273019, + "grad_norm": 1.098027658757928, + "learning_rate": 7.625031603826924e-06, + "loss": 0.5896, + "step": 29210 + }, + { + "epoch": 0.852619290821538, + "grad_norm": 0.9774815360167062, + "learning_rate": 7.619944390201997e-06, + "loss": 0.4918, + "step": 29215 + }, + { + "epoch": 0.8527652123157741, + "grad_norm": 1.0660118285118299, + "learning_rate": 7.614861806019312e-06, + "loss": 0.5297, + "step": 29220 + }, + { + "epoch": 0.8529111338100103, + "grad_norm": 1.105076660124108, + "learning_rate": 7.60978385246245e-06, + "loss": 0.4709, + "step": 29225 + }, + { + "epoch": 0.8530570553042464, + "grad_norm": 0.9820736367481939, + "learning_rate": 7.604710530713904e-06, + "loss": 0.5087, + "step": 29230 + }, + { + "epoch": 0.8532029767984824, + "grad_norm": 0.9671971371307803, + "learning_rate": 7.599641841955091e-06, + "loss": 0.5434, + "step": 29235 + }, + { + "epoch": 0.8533488982927185, + "grad_norm": 1.2751315450238296, + "learning_rate": 7.594577787366353e-06, + "loss": 0.5597, + "step": 29240 + }, + { + "epoch": 0.8534948197869546, + "grad_norm": 1.031491900253608, + "learning_rate": 7.5895183681269385e-06, + "loss": 0.5039, + "step": 29245 + }, + { + "epoch": 0.8536407412811907, + "grad_norm": 1.1107097811299178, + "learning_rate": 7.584463585415044e-06, + "loss": 0.5452, + "step": 29250 + }, + { + "epoch": 0.8537866627754268, + "grad_norm": 1.0858412343028454, + "learning_rate": 7.579413440407757e-06, + "loss": 0.5312, + "step": 29255 + }, + { + "epoch": 0.8539325842696629, + "grad_norm": 0.9569172340780231, + "learning_rate": 7.574367934281112e-06, + "loss": 0.5183, + "step": 29260 + }, + { + "epoch": 0.854078505763899, + "grad_norm": 0.9439550370251739, + "learning_rate": 7.56932706821003e-06, + "loss": 0.5064, + "step": 29265 + }, + { + "epoch": 0.8542244272581351, + "grad_norm": 1.0566462796605756, + "learning_rate": 7.564290843368388e-06, + "loss": 0.5768, + "step": 29270 + }, + { + "epoch": 0.8543703487523713, + "grad_norm": 1.0767325754111787, + "learning_rate": 7.5592592609289595e-06, + "loss": 0.5666, + "step": 29275 + }, + { + "epoch": 0.8545162702466074, + "grad_norm": 0.971360966282966, + "learning_rate": 7.554232322063436e-06, + "loss": 0.5559, + "step": 29280 + }, + { + "epoch": 0.8546621917408435, + "grad_norm": 0.9725763079953507, + "learning_rate": 7.549210027942448e-06, + "loss": 0.491, + "step": 29285 + }, + { + "epoch": 0.8548081132350795, + "grad_norm": 0.9526262228538975, + "learning_rate": 7.544192379735515e-06, + "loss": 0.5534, + "step": 29290 + }, + { + "epoch": 0.8549540347293156, + "grad_norm": 0.9350550607996951, + "learning_rate": 7.5391793786111e-06, + "loss": 0.5389, + "step": 29295 + }, + { + "epoch": 0.8550999562235517, + "grad_norm": 1.1512225106044671, + "learning_rate": 7.534171025736572e-06, + "loss": 0.5555, + "step": 29300 + }, + { + "epoch": 0.8552458777177878, + "grad_norm": 0.9629784255139604, + "learning_rate": 7.52916732227822e-06, + "loss": 0.5782, + "step": 29305 + }, + { + "epoch": 0.8553917992120239, + "grad_norm": 1.1062524292030902, + "learning_rate": 7.524168269401247e-06, + "loss": 0.5439, + "step": 29310 + }, + { + "epoch": 0.85553772070626, + "grad_norm": 1.0406242052793033, + "learning_rate": 7.519173868269773e-06, + "loss": 0.502, + "step": 29315 + }, + { + "epoch": 0.8556836422004961, + "grad_norm": 0.8514247689166575, + "learning_rate": 7.514184120046848e-06, + "loss": 0.5052, + "step": 29320 + }, + { + "epoch": 0.8558295636947323, + "grad_norm": 0.9981273701994329, + "learning_rate": 7.509199025894416e-06, + "loss": 0.5002, + "step": 29325 + }, + { + "epoch": 0.8559754851889684, + "grad_norm": 0.9299183496272749, + "learning_rate": 7.504218586973362e-06, + "loss": 0.5689, + "step": 29330 + }, + { + "epoch": 0.8561214066832045, + "grad_norm": 0.9649267101768552, + "learning_rate": 7.4992428044434615e-06, + "loss": 0.4939, + "step": 29335 + }, + { + "epoch": 0.8562673281774406, + "grad_norm": 1.0297776506641034, + "learning_rate": 7.4942716794634285e-06, + "loss": 0.5825, + "step": 29340 + }, + { + "epoch": 0.8564132496716766, + "grad_norm": 1.0835927166519497, + "learning_rate": 7.489305213190878e-06, + "loss": 0.5747, + "step": 29345 + }, + { + "epoch": 0.8565591711659127, + "grad_norm": 1.1930106256009805, + "learning_rate": 7.4843434067823406e-06, + "loss": 0.639, + "step": 29350 + }, + { + "epoch": 0.8567050926601488, + "grad_norm": 0.9035318334904757, + "learning_rate": 7.479386261393277e-06, + "loss": 0.4922, + "step": 29355 + }, + { + "epoch": 0.8568510141543849, + "grad_norm": 1.1758228799907051, + "learning_rate": 7.474433778178036e-06, + "loss": 0.5867, + "step": 29360 + }, + { + "epoch": 0.856996935648621, + "grad_norm": 1.1377733620064063, + "learning_rate": 7.469485958289909e-06, + "loss": 0.4864, + "step": 29365 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.9786212987924846, + "learning_rate": 7.464542802881081e-06, + "loss": 0.5254, + "step": 29370 + }, + { + "epoch": 0.8572887786370933, + "grad_norm": 1.2119135254173408, + "learning_rate": 7.4596043131026566e-06, + "loss": 0.5174, + "step": 29375 + }, + { + "epoch": 0.8574347001313294, + "grad_norm": 0.9651031770906952, + "learning_rate": 7.454670490104667e-06, + "loss": 0.5086, + "step": 29380 + }, + { + "epoch": 0.8575806216255655, + "grad_norm": 1.1193861403779517, + "learning_rate": 7.449741335036026e-06, + "loss": 0.5154, + "step": 29385 + }, + { + "epoch": 0.8577265431198016, + "grad_norm": 1.1527638564244398, + "learning_rate": 7.444816849044597e-06, + "loss": 0.5984, + "step": 29390 + }, + { + "epoch": 0.8578724646140377, + "grad_norm": 0.8773588572099367, + "learning_rate": 7.439897033277126e-06, + "loss": 0.481, + "step": 29395 + }, + { + "epoch": 0.8580183861082737, + "grad_norm": 0.8357452491954406, + "learning_rate": 7.4349818888793e-06, + "loss": 0.4871, + "step": 29400 + }, + { + "epoch": 0.8581643076025098, + "grad_norm": 0.9615751916353773, + "learning_rate": 7.43007141699568e-06, + "loss": 0.5532, + "step": 29405 + }, + { + "epoch": 0.8583102290967459, + "grad_norm": 1.0489755749147818, + "learning_rate": 7.42516561876978e-06, + "loss": 0.5293, + "step": 29410 + }, + { + "epoch": 0.858456150590982, + "grad_norm": 1.04571543714412, + "learning_rate": 7.420264495343999e-06, + "loss": 0.5052, + "step": 29415 + }, + { + "epoch": 0.8586020720852181, + "grad_norm": 0.9461085453647297, + "learning_rate": 7.415368047859651e-06, + "loss": 0.5069, + "step": 29420 + }, + { + "epoch": 0.8587479935794543, + "grad_norm": 0.9693742427159968, + "learning_rate": 7.4104762774569775e-06, + "loss": 0.548, + "step": 29425 + }, + { + "epoch": 0.8588939150736904, + "grad_norm": 1.1795387010120408, + "learning_rate": 7.405589185275105e-06, + "loss": 0.5123, + "step": 29430 + }, + { + "epoch": 0.8590398365679265, + "grad_norm": 0.9182819818812419, + "learning_rate": 7.4007067724520925e-06, + "loss": 0.5047, + "step": 29435 + }, + { + "epoch": 0.8591857580621626, + "grad_norm": 1.1434935929700647, + "learning_rate": 7.395829040124899e-06, + "loss": 0.5712, + "step": 29440 + }, + { + "epoch": 0.8593316795563987, + "grad_norm": 0.9869355446577552, + "learning_rate": 7.390955989429392e-06, + "loss": 0.5322, + "step": 29445 + }, + { + "epoch": 0.8594776010506348, + "grad_norm": 0.9635511933708542, + "learning_rate": 7.386087621500362e-06, + "loss": 0.5117, + "step": 29450 + }, + { + "epoch": 0.8596235225448708, + "grad_norm": 0.9698633334364771, + "learning_rate": 7.381223937471488e-06, + "loss": 0.5118, + "step": 29455 + }, + { + "epoch": 0.8597694440391069, + "grad_norm": 1.1973984980916046, + "learning_rate": 7.376364938475378e-06, + "loss": 0.5647, + "step": 29460 + }, + { + "epoch": 0.859915365533343, + "grad_norm": 0.9986681878543255, + "learning_rate": 7.371510625643532e-06, + "loss": 0.529, + "step": 29465 + }, + { + "epoch": 0.8600612870275791, + "grad_norm": 0.9926797038063817, + "learning_rate": 7.366661000106378e-06, + "loss": 0.475, + "step": 29470 + }, + { + "epoch": 0.8602072085218153, + "grad_norm": 1.0370216750595327, + "learning_rate": 7.361816062993236e-06, + "loss": 0.5075, + "step": 29475 + }, + { + "epoch": 0.8603531300160514, + "grad_norm": 0.9174706267697447, + "learning_rate": 7.35697581543234e-06, + "loss": 0.5385, + "step": 29480 + }, + { + "epoch": 0.8604990515102875, + "grad_norm": 1.1573158443910245, + "learning_rate": 7.352140258550835e-06, + "loss": 0.5261, + "step": 29485 + }, + { + "epoch": 0.8606449730045236, + "grad_norm": 1.0980753393416467, + "learning_rate": 7.347309393474763e-06, + "loss": 0.5769, + "step": 29490 + }, + { + "epoch": 0.8607908944987597, + "grad_norm": 0.9298358491377472, + "learning_rate": 7.3424832213290965e-06, + "loss": 0.5121, + "step": 29495 + }, + { + "epoch": 0.8609368159929958, + "grad_norm": 1.0873244015164854, + "learning_rate": 7.33766174323768e-06, + "loss": 0.5138, + "step": 29500 + }, + { + "epoch": 0.8610827374872319, + "grad_norm": 1.0637996349861991, + "learning_rate": 7.3328449603233e-06, + "loss": 0.5663, + "step": 29505 + }, + { + "epoch": 0.8612286589814679, + "grad_norm": 0.8792711426318676, + "learning_rate": 7.328032873707626e-06, + "loss": 0.5573, + "step": 29510 + }, + { + "epoch": 0.861374580475704, + "grad_norm": 1.0346410026508894, + "learning_rate": 7.323225484511246e-06, + "loss": 0.5422, + "step": 29515 + }, + { + "epoch": 0.8615205019699401, + "grad_norm": 1.196555233866233, + "learning_rate": 7.3184227938536534e-06, + "loss": 0.5513, + "step": 29520 + }, + { + "epoch": 0.8616664234641763, + "grad_norm": 0.9908639897417755, + "learning_rate": 7.313624802853241e-06, + "loss": 0.5878, + "step": 29525 + }, + { + "epoch": 0.8618123449584124, + "grad_norm": 0.8701600215744093, + "learning_rate": 7.308831512627308e-06, + "loss": 0.4933, + "step": 29530 + }, + { + "epoch": 0.8619582664526485, + "grad_norm": 1.0571731477392725, + "learning_rate": 7.30404292429206e-06, + "loss": 0.4879, + "step": 29535 + }, + { + "epoch": 0.8621041879468846, + "grad_norm": 0.9584279710696191, + "learning_rate": 7.2992590389626195e-06, + "loss": 0.5681, + "step": 29540 + }, + { + "epoch": 0.8622501094411207, + "grad_norm": 1.0962913336649283, + "learning_rate": 7.2944798577529965e-06, + "loss": 0.5326, + "step": 29545 + }, + { + "epoch": 0.8623960309353568, + "grad_norm": 1.1071142582479432, + "learning_rate": 7.289705381776113e-06, + "loss": 0.5746, + "step": 29550 + }, + { + "epoch": 0.8625419524295929, + "grad_norm": 0.9696910815006976, + "learning_rate": 7.284935612143795e-06, + "loss": 0.5628, + "step": 29555 + }, + { + "epoch": 0.862687873923829, + "grad_norm": 0.9200833296536041, + "learning_rate": 7.2801705499667695e-06, + "loss": 0.522, + "step": 29560 + }, + { + "epoch": 0.862833795418065, + "grad_norm": 1.1338086664391194, + "learning_rate": 7.275410196354675e-06, + "loss": 0.5725, + "step": 29565 + }, + { + "epoch": 0.8629797169123011, + "grad_norm": 0.944663776974379, + "learning_rate": 7.270654552416049e-06, + "loss": 0.5263, + "step": 29570 + }, + { + "epoch": 0.8631256384065373, + "grad_norm": 0.9883047085687345, + "learning_rate": 7.26590361925833e-06, + "loss": 0.5727, + "step": 29575 + }, + { + "epoch": 0.8632715599007734, + "grad_norm": 0.9492780244695722, + "learning_rate": 7.2611573979878585e-06, + "loss": 0.5263, + "step": 29580 + }, + { + "epoch": 0.8634174813950095, + "grad_norm": 1.1940330377591961, + "learning_rate": 7.256415889709885e-06, + "loss": 0.5638, + "step": 29585 + }, + { + "epoch": 0.8635634028892456, + "grad_norm": 0.8963232815679787, + "learning_rate": 7.25167909552856e-06, + "loss": 0.5048, + "step": 29590 + }, + { + "epoch": 0.8637093243834817, + "grad_norm": 1.0786162624483215, + "learning_rate": 7.24694701654693e-06, + "loss": 0.5413, + "step": 29595 + }, + { + "epoch": 0.8638552458777178, + "grad_norm": 1.0005664536563534, + "learning_rate": 7.2422196538669525e-06, + "loss": 0.5115, + "step": 29600 + }, + { + "epoch": 0.8640011673719539, + "grad_norm": 0.9931593721585926, + "learning_rate": 7.237497008589474e-06, + "loss": 0.512, + "step": 29605 + }, + { + "epoch": 0.86414708886619, + "grad_norm": 1.0513484033634384, + "learning_rate": 7.232779081814259e-06, + "loss": 0.5588, + "step": 29610 + }, + { + "epoch": 0.864293010360426, + "grad_norm": 1.0415464286457292, + "learning_rate": 7.228065874639964e-06, + "loss": 0.5176, + "step": 29615 + }, + { + "epoch": 0.8644389318546621, + "grad_norm": 0.9109553476060037, + "learning_rate": 7.223357388164144e-06, + "loss": 0.5474, + "step": 29620 + }, + { + "epoch": 0.8645848533488983, + "grad_norm": 1.1251893142339733, + "learning_rate": 7.218653623483259e-06, + "loss": 0.5073, + "step": 29625 + }, + { + "epoch": 0.8647307748431344, + "grad_norm": 0.9834942075279803, + "learning_rate": 7.213954581692664e-06, + "loss": 0.46, + "step": 29630 + }, + { + "epoch": 0.8648766963373705, + "grad_norm": 0.9584303757006949, + "learning_rate": 7.2092602638866285e-06, + "loss": 0.4925, + "step": 29635 + }, + { + "epoch": 0.8650226178316066, + "grad_norm": 1.2057430723260245, + "learning_rate": 7.204570671158305e-06, + "loss": 0.5172, + "step": 29640 + }, + { + "epoch": 0.8651685393258427, + "grad_norm": 1.06932847983272, + "learning_rate": 7.199885804599757e-06, + "loss": 0.5282, + "step": 29645 + }, + { + "epoch": 0.8653144608200788, + "grad_norm": 0.8615385000288169, + "learning_rate": 7.195205665301937e-06, + "loss": 0.4743, + "step": 29650 + }, + { + "epoch": 0.8654603823143149, + "grad_norm": 1.1677842032916155, + "learning_rate": 7.190530254354709e-06, + "loss": 0.5757, + "step": 29655 + }, + { + "epoch": 0.865606303808551, + "grad_norm": 0.9709761593785842, + "learning_rate": 7.1858595728468275e-06, + "loss": 0.5458, + "step": 29660 + }, + { + "epoch": 0.8657522253027871, + "grad_norm": 1.031251773091452, + "learning_rate": 7.181193621865948e-06, + "loss": 0.576, + "step": 29665 + }, + { + "epoch": 0.8658981467970231, + "grad_norm": 1.0651348781347871, + "learning_rate": 7.176532402498626e-06, + "loss": 0.5243, + "step": 29670 + }, + { + "epoch": 0.8660440682912594, + "grad_norm": 0.9553451754803618, + "learning_rate": 7.171875915830305e-06, + "loss": 0.5445, + "step": 29675 + }, + { + "epoch": 0.8661899897854954, + "grad_norm": 0.9830341963690129, + "learning_rate": 7.167224162945346e-06, + "loss": 0.5518, + "step": 29680 + }, + { + "epoch": 0.8663359112797315, + "grad_norm": 1.1750194195071062, + "learning_rate": 7.162577144926996e-06, + "loss": 0.5083, + "step": 29685 + }, + { + "epoch": 0.8664818327739676, + "grad_norm": 0.9967875008019401, + "learning_rate": 7.157934862857392e-06, + "loss": 0.5868, + "step": 29690 + }, + { + "epoch": 0.8666277542682037, + "grad_norm": 1.1928715392703513, + "learning_rate": 7.153297317817578e-06, + "loss": 0.5934, + "step": 29695 + }, + { + "epoch": 0.8667736757624398, + "grad_norm": 0.9750855717417999, + "learning_rate": 7.148664510887497e-06, + "loss": 0.5518, + "step": 29700 + }, + { + "epoch": 0.8669195972566759, + "grad_norm": 0.9146764263237812, + "learning_rate": 7.1440364431459835e-06, + "loss": 0.5112, + "step": 29705 + }, + { + "epoch": 0.867065518750912, + "grad_norm": 0.9199484514055865, + "learning_rate": 7.139413115670769e-06, + "loss": 0.4639, + "step": 29710 + }, + { + "epoch": 0.8672114402451481, + "grad_norm": 1.140457294429192, + "learning_rate": 7.134794529538484e-06, + "loss": 0.4956, + "step": 29715 + }, + { + "epoch": 0.8673573617393842, + "grad_norm": 0.9739593566755906, + "learning_rate": 7.130180685824643e-06, + "loss": 0.5628, + "step": 29720 + }, + { + "epoch": 0.8675032832336204, + "grad_norm": 0.9924272179574128, + "learning_rate": 7.125571585603677e-06, + "loss": 0.4985, + "step": 29725 + }, + { + "epoch": 0.8676492047278564, + "grad_norm": 0.9485921433147932, + "learning_rate": 7.120967229948897e-06, + "loss": 0.505, + "step": 29730 + }, + { + "epoch": 0.8677951262220925, + "grad_norm": 1.1359561736908115, + "learning_rate": 7.116367619932513e-06, + "loss": 0.5168, + "step": 29735 + }, + { + "epoch": 0.8679410477163286, + "grad_norm": 0.9757973307698785, + "learning_rate": 7.1117727566256305e-06, + "loss": 0.5353, + "step": 29740 + }, + { + "epoch": 0.8680869692105647, + "grad_norm": 0.8331331945573113, + "learning_rate": 7.107182641098243e-06, + "loss": 0.5122, + "step": 29745 + }, + { + "epoch": 0.8682328907048008, + "grad_norm": 0.9916706961854886, + "learning_rate": 7.1025972744192555e-06, + "loss": 0.5731, + "step": 29750 + }, + { + "epoch": 0.8683788121990369, + "grad_norm": 0.9894609679733036, + "learning_rate": 7.098016657656451e-06, + "loss": 0.5142, + "step": 29755 + }, + { + "epoch": 0.868524733693273, + "grad_norm": 1.0143109850872247, + "learning_rate": 7.093440791876513e-06, + "loss": 0.4951, + "step": 29760 + }, + { + "epoch": 0.8686706551875091, + "grad_norm": 0.9632059995792072, + "learning_rate": 7.088869678145011e-06, + "loss": 0.5362, + "step": 29765 + }, + { + "epoch": 0.8688165766817452, + "grad_norm": 1.0005181321903434, + "learning_rate": 7.084303317526423e-06, + "loss": 0.5181, + "step": 29770 + }, + { + "epoch": 0.8689624981759814, + "grad_norm": 0.8704204772259626, + "learning_rate": 7.079741711084107e-06, + "loss": 0.4972, + "step": 29775 + }, + { + "epoch": 0.8691084196702175, + "grad_norm": 0.9274588543343439, + "learning_rate": 7.075184859880321e-06, + "loss": 0.5346, + "step": 29780 + }, + { + "epoch": 0.8692543411644535, + "grad_norm": 0.9092844997181421, + "learning_rate": 7.070632764976209e-06, + "loss": 0.5339, + "step": 29785 + }, + { + "epoch": 0.8694002626586896, + "grad_norm": 0.9231899287060015, + "learning_rate": 7.0660854274318095e-06, + "loss": 0.4998, + "step": 29790 + }, + { + "epoch": 0.8695461841529257, + "grad_norm": 0.9455069001389227, + "learning_rate": 7.061542848306063e-06, + "loss": 0.5217, + "step": 29795 + }, + { + "epoch": 0.8696921056471618, + "grad_norm": 0.9226647398019713, + "learning_rate": 7.057005028656791e-06, + "loss": 0.5024, + "step": 29800 + }, + { + "epoch": 0.8698380271413979, + "grad_norm": 1.1095454945608603, + "learning_rate": 7.052471969540707e-06, + "loss": 0.5892, + "step": 29805 + }, + { + "epoch": 0.869983948635634, + "grad_norm": 1.0478962505815896, + "learning_rate": 7.047943672013419e-06, + "loss": 0.568, + "step": 29810 + }, + { + "epoch": 0.8701298701298701, + "grad_norm": 1.0584558024284247, + "learning_rate": 7.043420137129422e-06, + "loss": 0.5413, + "step": 29815 + }, + { + "epoch": 0.8702757916241062, + "grad_norm": 0.9954731433904819, + "learning_rate": 7.0389013659421135e-06, + "loss": 0.5306, + "step": 29820 + }, + { + "epoch": 0.8704217131183424, + "grad_norm": 0.9457259107495466, + "learning_rate": 7.034387359503766e-06, + "loss": 0.5554, + "step": 29825 + }, + { + "epoch": 0.8705676346125785, + "grad_norm": 0.8954323724038272, + "learning_rate": 7.0298781188655625e-06, + "loss": 0.5393, + "step": 29830 + }, + { + "epoch": 0.8707135561068146, + "grad_norm": 1.086648646672613, + "learning_rate": 7.025373645077549e-06, + "loss": 0.5881, + "step": 29835 + }, + { + "epoch": 0.8708594776010506, + "grad_norm": 1.026161268402807, + "learning_rate": 7.020873939188686e-06, + "loss": 0.5475, + "step": 29840 + }, + { + "epoch": 0.8710053990952867, + "grad_norm": 0.9888220065075831, + "learning_rate": 7.016379002246812e-06, + "loss": 0.5278, + "step": 29845 + }, + { + "epoch": 0.8711513205895228, + "grad_norm": 1.2230658721543257, + "learning_rate": 7.011888835298655e-06, + "loss": 0.5781, + "step": 29850 + }, + { + "epoch": 0.8712972420837589, + "grad_norm": 0.9751594696089496, + "learning_rate": 7.007403439389835e-06, + "loss": 0.5593, + "step": 29855 + }, + { + "epoch": 0.871443163577995, + "grad_norm": 1.072942579304287, + "learning_rate": 7.002922815564858e-06, + "loss": 0.5874, + "step": 29860 + }, + { + "epoch": 0.8715890850722311, + "grad_norm": 1.111257371597716, + "learning_rate": 6.998446964867125e-06, + "loss": 0.5341, + "step": 29865 + }, + { + "epoch": 0.8717350065664672, + "grad_norm": 1.221842096238057, + "learning_rate": 6.993975888338924e-06, + "loss": 0.5335, + "step": 29870 + }, + { + "epoch": 0.8718809280607034, + "grad_norm": 1.0993164854861834, + "learning_rate": 6.989509587021421e-06, + "loss": 0.55, + "step": 29875 + }, + { + "epoch": 0.8720268495549395, + "grad_norm": 0.8740989334193764, + "learning_rate": 6.985048061954684e-06, + "loss": 0.5821, + "step": 29880 + }, + { + "epoch": 0.8721727710491756, + "grad_norm": 1.0956713262952211, + "learning_rate": 6.980591314177652e-06, + "loss": 0.5488, + "step": 29885 + }, + { + "epoch": 0.8723186925434117, + "grad_norm": 1.0180792715670386, + "learning_rate": 6.976139344728178e-06, + "loss": 0.565, + "step": 29890 + }, + { + "epoch": 0.8724646140376477, + "grad_norm": 1.0862611416281598, + "learning_rate": 6.9716921546429715e-06, + "loss": 0.5653, + "step": 29895 + }, + { + "epoch": 0.8726105355318838, + "grad_norm": 1.044699719376724, + "learning_rate": 6.967249744957657e-06, + "loss": 0.559, + "step": 29900 + }, + { + "epoch": 0.8727564570261199, + "grad_norm": 0.963360536657404, + "learning_rate": 6.962812116706719e-06, + "loss": 0.5442, + "step": 29905 + }, + { + "epoch": 0.872902378520356, + "grad_norm": 1.1747464547669655, + "learning_rate": 6.958379270923553e-06, + "loss": 0.5444, + "step": 29910 + }, + { + "epoch": 0.8730483000145921, + "grad_norm": 1.173916687656853, + "learning_rate": 6.953951208640427e-06, + "loss": 0.5448, + "step": 29915 + }, + { + "epoch": 0.8731942215088283, + "grad_norm": 0.8322397775807132, + "learning_rate": 6.94952793088849e-06, + "loss": 0.4837, + "step": 29920 + }, + { + "epoch": 0.8733401430030644, + "grad_norm": 1.08338997339878, + "learning_rate": 6.945109438697803e-06, + "loss": 0.5691, + "step": 29925 + }, + { + "epoch": 0.8734860644973005, + "grad_norm": 0.9906516867313716, + "learning_rate": 6.9406957330972715e-06, + "loss": 0.5113, + "step": 29930 + }, + { + "epoch": 0.8736319859915366, + "grad_norm": 0.9179423426391685, + "learning_rate": 6.936286815114727e-06, + "loss": 0.4827, + "step": 29935 + }, + { + "epoch": 0.8737779074857727, + "grad_norm": 1.0304264892773132, + "learning_rate": 6.9318826857768565e-06, + "loss": 0.4878, + "step": 29940 + }, + { + "epoch": 0.8739238289800088, + "grad_norm": 1.1369862842478091, + "learning_rate": 6.927483346109259e-06, + "loss": 0.5065, + "step": 29945 + }, + { + "epoch": 0.8740697504742448, + "grad_norm": 1.0204419657974106, + "learning_rate": 6.923088797136385e-06, + "loss": 0.5895, + "step": 29950 + }, + { + "epoch": 0.8742156719684809, + "grad_norm": 1.0357751844490088, + "learning_rate": 6.918699039881601e-06, + "loss": 0.5442, + "step": 29955 + }, + { + "epoch": 0.874361593462717, + "grad_norm": 0.8748692793847861, + "learning_rate": 6.914314075367138e-06, + "loss": 0.5314, + "step": 29960 + }, + { + "epoch": 0.8745075149569531, + "grad_norm": 1.0199991152308767, + "learning_rate": 6.9099339046141114e-06, + "loss": 0.4941, + "step": 29965 + }, + { + "epoch": 0.8746534364511893, + "grad_norm": 1.0946435876259222, + "learning_rate": 6.905558528642541e-06, + "loss": 0.5069, + "step": 29970 + }, + { + "epoch": 0.8747993579454254, + "grad_norm": 1.0300316392027062, + "learning_rate": 6.901187948471301e-06, + "loss": 0.502, + "step": 29975 + }, + { + "epoch": 0.8749452794396615, + "grad_norm": 1.195950166837118, + "learning_rate": 6.8968221651181675e-06, + "loss": 0.5395, + "step": 29980 + }, + { + "epoch": 0.8750912009338976, + "grad_norm": 0.922518089654216, + "learning_rate": 6.892461179599796e-06, + "loss": 0.5127, + "step": 29985 + }, + { + "epoch": 0.8752371224281337, + "grad_norm": 1.059932475396812, + "learning_rate": 6.888104992931718e-06, + "loss": 0.5262, + "step": 29990 + }, + { + "epoch": 0.8753830439223698, + "grad_norm": 0.9833271019775266, + "learning_rate": 6.883753606128365e-06, + "loss": 0.5348, + "step": 29995 + }, + { + "epoch": 0.8755289654166059, + "grad_norm": 1.0137711799894418, + "learning_rate": 6.879407020203021e-06, + "loss": 0.5511, + "step": 30000 + }, + { + "epoch": 0.8756748869108419, + "grad_norm": 0.9471064424446669, + "learning_rate": 6.875065236167884e-06, + "loss": 0.5068, + "step": 30005 + }, + { + "epoch": 0.875820808405078, + "grad_norm": 0.9028574386618482, + "learning_rate": 6.870728255034013e-06, + "loss": 0.4875, + "step": 30010 + }, + { + "epoch": 0.8759667298993141, + "grad_norm": 1.0526732564936714, + "learning_rate": 6.8663960778113645e-06, + "loss": 0.5204, + "step": 30015 + }, + { + "epoch": 0.8761126513935503, + "grad_norm": 0.9489331372750632, + "learning_rate": 6.862068705508748e-06, + "loss": 0.5062, + "step": 30020 + }, + { + "epoch": 0.8762585728877864, + "grad_norm": 1.0788595693638467, + "learning_rate": 6.8577461391338915e-06, + "loss": 0.4838, + "step": 30025 + }, + { + "epoch": 0.8764044943820225, + "grad_norm": 1.0013434365668266, + "learning_rate": 6.853428379693374e-06, + "loss": 0.5429, + "step": 30030 + }, + { + "epoch": 0.8765504158762586, + "grad_norm": 1.0088690569590848, + "learning_rate": 6.84911542819267e-06, + "loss": 0.5428, + "step": 30035 + }, + { + "epoch": 0.8766963373704947, + "grad_norm": 1.062222335005417, + "learning_rate": 6.84480728563614e-06, + "loss": 0.5232, + "step": 30040 + }, + { + "epoch": 0.8768422588647308, + "grad_norm": 1.1126050866143993, + "learning_rate": 6.840503953026996e-06, + "loss": 0.5611, + "step": 30045 + }, + { + "epoch": 0.8769881803589669, + "grad_norm": 1.0350764177098937, + "learning_rate": 6.836205431367365e-06, + "loss": 0.5125, + "step": 30050 + }, + { + "epoch": 0.877134101853203, + "grad_norm": 0.9792411923890526, + "learning_rate": 6.8319117216582355e-06, + "loss": 0.511, + "step": 30055 + }, + { + "epoch": 0.877280023347439, + "grad_norm": 1.090824461939905, + "learning_rate": 6.827622824899468e-06, + "loss": 0.5186, + "step": 30060 + }, + { + "epoch": 0.8774259448416751, + "grad_norm": 0.9121237611156275, + "learning_rate": 6.823338742089831e-06, + "loss": 0.5134, + "step": 30065 + }, + { + "epoch": 0.8775718663359113, + "grad_norm": 1.147552196790515, + "learning_rate": 6.819059474226933e-06, + "loss": 0.533, + "step": 30070 + }, + { + "epoch": 0.8777177878301474, + "grad_norm": 0.9970530391326189, + "learning_rate": 6.814785022307293e-06, + "loss": 0.5163, + "step": 30075 + }, + { + "epoch": 0.8778637093243835, + "grad_norm": 1.056014804221753, + "learning_rate": 6.810515387326291e-06, + "loss": 0.5421, + "step": 30080 + }, + { + "epoch": 0.8780096308186196, + "grad_norm": 1.1091870848055414, + "learning_rate": 6.806250570278198e-06, + "loss": 0.5734, + "step": 30085 + }, + { + "epoch": 0.8781555523128557, + "grad_norm": 1.0023019814262806, + "learning_rate": 6.801990572156155e-06, + "loss": 0.5374, + "step": 30090 + }, + { + "epoch": 0.8783014738070918, + "grad_norm": 0.8659267476355923, + "learning_rate": 6.797735393952179e-06, + "loss": 0.4771, + "step": 30095 + }, + { + "epoch": 0.8784473953013279, + "grad_norm": 1.0798493557772468, + "learning_rate": 6.7934850366571694e-06, + "loss": 0.6133, + "step": 30100 + }, + { + "epoch": 0.878593316795564, + "grad_norm": 0.9723571269965117, + "learning_rate": 6.7892395012608965e-06, + "loss": 0.5505, + "step": 30105 + }, + { + "epoch": 0.8787392382898, + "grad_norm": 1.0537390830097002, + "learning_rate": 6.784998788752024e-06, + "loss": 0.531, + "step": 30110 + }, + { + "epoch": 0.8788851597840361, + "grad_norm": 0.9400690072094103, + "learning_rate": 6.780762900118068e-06, + "loss": 0.4691, + "step": 30115 + }, + { + "epoch": 0.8790310812782723, + "grad_norm": 1.105529440059472, + "learning_rate": 6.77653183634544e-06, + "loss": 0.5454, + "step": 30120 + }, + { + "epoch": 0.8791770027725084, + "grad_norm": 0.924528009053591, + "learning_rate": 6.772305598419426e-06, + "loss": 0.5263, + "step": 30125 + }, + { + "epoch": 0.8793229242667445, + "grad_norm": 0.9610767289945021, + "learning_rate": 6.7680841873241756e-06, + "loss": 0.6027, + "step": 30130 + }, + { + "epoch": 0.8794688457609806, + "grad_norm": 1.0567462707982134, + "learning_rate": 6.763867604042737e-06, + "loss": 0.5347, + "step": 30135 + }, + { + "epoch": 0.8796147672552167, + "grad_norm": 1.144421321692344, + "learning_rate": 6.759655849557002e-06, + "loss": 0.559, + "step": 30140 + }, + { + "epoch": 0.8797606887494528, + "grad_norm": 0.9496284752401181, + "learning_rate": 6.755448924847771e-06, + "loss": 0.5463, + "step": 30145 + }, + { + "epoch": 0.8799066102436889, + "grad_norm": 0.9636762022558271, + "learning_rate": 6.751246830894694e-06, + "loss": 0.5455, + "step": 30150 + }, + { + "epoch": 0.880052531737925, + "grad_norm": 0.9072418255046193, + "learning_rate": 6.747049568676318e-06, + "loss": 0.5138, + "step": 30155 + }, + { + "epoch": 0.8801984532321611, + "grad_norm": 0.8771845542226931, + "learning_rate": 6.74285713917005e-06, + "loss": 0.4933, + "step": 30160 + }, + { + "epoch": 0.8803443747263971, + "grad_norm": 1.027385444308803, + "learning_rate": 6.738669543352175e-06, + "loss": 0.5442, + "step": 30165 + }, + { + "epoch": 0.8804902962206334, + "grad_norm": 1.0769734088098941, + "learning_rate": 6.734486782197852e-06, + "loss": 0.5477, + "step": 30170 + }, + { + "epoch": 0.8806362177148694, + "grad_norm": 0.9652807090033174, + "learning_rate": 6.730308856681112e-06, + "loss": 0.4675, + "step": 30175 + }, + { + "epoch": 0.8807821392091055, + "grad_norm": 1.0411863227327887, + "learning_rate": 6.726135767774867e-06, + "loss": 0.4684, + "step": 30180 + }, + { + "epoch": 0.8809280607033416, + "grad_norm": 1.0788513084648004, + "learning_rate": 6.7219675164509e-06, + "loss": 0.5347, + "step": 30185 + }, + { + "epoch": 0.8810739821975777, + "grad_norm": 1.052854975731894, + "learning_rate": 6.717804103679865e-06, + "loss": 0.5459, + "step": 30190 + }, + { + "epoch": 0.8812199036918138, + "grad_norm": 0.9380997580514051, + "learning_rate": 6.7136455304312865e-06, + "loss": 0.5533, + "step": 30195 + }, + { + "epoch": 0.8813658251860499, + "grad_norm": 1.0993360559642138, + "learning_rate": 6.709491797673571e-06, + "loss": 0.4865, + "step": 30200 + }, + { + "epoch": 0.881511746680286, + "grad_norm": 1.0338201264669893, + "learning_rate": 6.70534290637399e-06, + "loss": 0.541, + "step": 30205 + }, + { + "epoch": 0.8816576681745221, + "grad_norm": 1.0762800010256062, + "learning_rate": 6.701198857498693e-06, + "loss": 0.4765, + "step": 30210 + }, + { + "epoch": 0.8818035896687582, + "grad_norm": 1.154709347065583, + "learning_rate": 6.6970596520126966e-06, + "loss": 0.5343, + "step": 30215 + }, + { + "epoch": 0.8819495111629944, + "grad_norm": 1.0069951836980788, + "learning_rate": 6.692925290879891e-06, + "loss": 0.4787, + "step": 30220 + }, + { + "epoch": 0.8820954326572304, + "grad_norm": 0.9903485754315747, + "learning_rate": 6.688795775063044e-06, + "loss": 0.4865, + "step": 30225 + }, + { + "epoch": 0.8822413541514665, + "grad_norm": 1.0509969502745817, + "learning_rate": 6.684671105523785e-06, + "loss": 0.5364, + "step": 30230 + }, + { + "epoch": 0.8823872756457026, + "grad_norm": 0.8607330113734835, + "learning_rate": 6.6805512832226244e-06, + "loss": 0.5167, + "step": 30235 + }, + { + "epoch": 0.8825331971399387, + "grad_norm": 0.8899420270677085, + "learning_rate": 6.676436309118942e-06, + "loss": 0.4924, + "step": 30240 + }, + { + "epoch": 0.8826791186341748, + "grad_norm": 1.0785084869859474, + "learning_rate": 6.672326184170974e-06, + "loss": 0.5297, + "step": 30245 + }, + { + "epoch": 0.8828250401284109, + "grad_norm": 0.9834880786436881, + "learning_rate": 6.6682209093358565e-06, + "loss": 0.497, + "step": 30250 + }, + { + "epoch": 0.882970961622647, + "grad_norm": 0.8676551705046306, + "learning_rate": 6.6641204855695695e-06, + "loss": 0.5584, + "step": 30255 + }, + { + "epoch": 0.8831168831168831, + "grad_norm": 0.9566525293696535, + "learning_rate": 6.660024913826976e-06, + "loss": 0.5782, + "step": 30260 + }, + { + "epoch": 0.8832628046111192, + "grad_norm": 1.0188560538696452, + "learning_rate": 6.6559341950618e-06, + "loss": 0.5682, + "step": 30265 + }, + { + "epoch": 0.8834087261053554, + "grad_norm": 1.0300495895815487, + "learning_rate": 6.6518483302266535e-06, + "loss": 0.5354, + "step": 30270 + }, + { + "epoch": 0.8835546475995915, + "grad_norm": 1.0097785168081432, + "learning_rate": 6.6477673202730006e-06, + "loss": 0.5195, + "step": 30275 + }, + { + "epoch": 0.8837005690938275, + "grad_norm": 0.9682368111525732, + "learning_rate": 6.643691166151178e-06, + "loss": 0.4895, + "step": 30280 + }, + { + "epoch": 0.8838464905880636, + "grad_norm": 1.0018786415950411, + "learning_rate": 6.639619868810398e-06, + "loss": 0.5206, + "step": 30285 + }, + { + "epoch": 0.8839924120822997, + "grad_norm": 0.9106761919254766, + "learning_rate": 6.635553429198735e-06, + "loss": 0.5409, + "step": 30290 + }, + { + "epoch": 0.8841383335765358, + "grad_norm": 1.1403622283456385, + "learning_rate": 6.631491848263137e-06, + "loss": 0.5546, + "step": 30295 + }, + { + "epoch": 0.8842842550707719, + "grad_norm": 0.8927811407997241, + "learning_rate": 6.6274351269494214e-06, + "loss": 0.4947, + "step": 30300 + }, + { + "epoch": 0.884430176565008, + "grad_norm": 0.962949364788676, + "learning_rate": 6.62338326620227e-06, + "loss": 0.5423, + "step": 30305 + }, + { + "epoch": 0.8845760980592441, + "grad_norm": 0.9036803757803599, + "learning_rate": 6.619336266965234e-06, + "loss": 0.5205, + "step": 30310 + }, + { + "epoch": 0.8847220195534802, + "grad_norm": 1.1031498594248523, + "learning_rate": 6.6152941301807286e-06, + "loss": 0.5498, + "step": 30315 + }, + { + "epoch": 0.8848679410477164, + "grad_norm": 1.0576554232408852, + "learning_rate": 6.611256856790048e-06, + "loss": 0.5196, + "step": 30320 + }, + { + "epoch": 0.8850138625419525, + "grad_norm": 0.9514805077270847, + "learning_rate": 6.607224447733343e-06, + "loss": 0.51, + "step": 30325 + }, + { + "epoch": 0.8851597840361886, + "grad_norm": 0.9177794419307634, + "learning_rate": 6.603196903949637e-06, + "loss": 0.5424, + "step": 30330 + }, + { + "epoch": 0.8853057055304246, + "grad_norm": 0.9285289116283991, + "learning_rate": 6.5991742263768125e-06, + "loss": 0.5017, + "step": 30335 + }, + { + "epoch": 0.8854516270246607, + "grad_norm": 1.0784913369271725, + "learning_rate": 6.595156415951636e-06, + "loss": 0.5322, + "step": 30340 + }, + { + "epoch": 0.8855975485188968, + "grad_norm": 1.1073906503610704, + "learning_rate": 6.591143473609722e-06, + "loss": 0.525, + "step": 30345 + }, + { + "epoch": 0.8857434700131329, + "grad_norm": 1.1577261909116483, + "learning_rate": 6.587135400285562e-06, + "loss": 0.5306, + "step": 30350 + }, + { + "epoch": 0.885889391507369, + "grad_norm": 1.152700078794717, + "learning_rate": 6.583132196912509e-06, + "loss": 0.5441, + "step": 30355 + }, + { + "epoch": 0.8860353130016051, + "grad_norm": 0.9863928107563286, + "learning_rate": 6.5791338644227815e-06, + "loss": 0.5159, + "step": 30360 + }, + { + "epoch": 0.8861812344958412, + "grad_norm": 0.9611006475972624, + "learning_rate": 6.575140403747474e-06, + "loss": 0.4632, + "step": 30365 + }, + { + "epoch": 0.8863271559900774, + "grad_norm": 1.0772509951562268, + "learning_rate": 6.5711518158165255e-06, + "loss": 0.5493, + "step": 30370 + }, + { + "epoch": 0.8864730774843135, + "grad_norm": 0.9417049634273706, + "learning_rate": 6.56716810155877e-06, + "loss": 0.5421, + "step": 30375 + }, + { + "epoch": 0.8866189989785496, + "grad_norm": 1.094028832386522, + "learning_rate": 6.563189261901874e-06, + "loss": 0.5116, + "step": 30380 + }, + { + "epoch": 0.8867649204727857, + "grad_norm": 1.1523471117240423, + "learning_rate": 6.559215297772395e-06, + "loss": 0.5905, + "step": 30385 + }, + { + "epoch": 0.8869108419670217, + "grad_norm": 1.0385926913926515, + "learning_rate": 6.555246210095739e-06, + "loss": 0.5225, + "step": 30390 + }, + { + "epoch": 0.8870567634612578, + "grad_norm": 1.0145460635822472, + "learning_rate": 6.551281999796187e-06, + "loss": 0.5548, + "step": 30395 + }, + { + "epoch": 0.8872026849554939, + "grad_norm": 0.8959369649060647, + "learning_rate": 6.547322667796875e-06, + "loss": 0.484, + "step": 30400 + }, + { + "epoch": 0.88734860644973, + "grad_norm": 1.143972763850105, + "learning_rate": 6.543368215019805e-06, + "loss": 0.5317, + "step": 30405 + }, + { + "epoch": 0.8874945279439661, + "grad_norm": 0.9856911896592033, + "learning_rate": 6.539418642385854e-06, + "loss": 0.504, + "step": 30410 + }, + { + "epoch": 0.8876404494382022, + "grad_norm": 1.0053296352322783, + "learning_rate": 6.535473950814749e-06, + "loss": 0.4891, + "step": 30415 + }, + { + "epoch": 0.8877863709324384, + "grad_norm": 1.1099608467866107, + "learning_rate": 6.531534141225083e-06, + "loss": 0.5808, + "step": 30420 + }, + { + "epoch": 0.8879322924266745, + "grad_norm": 0.9428015409725334, + "learning_rate": 6.527599214534319e-06, + "loss": 0.4899, + "step": 30425 + }, + { + "epoch": 0.8880782139209106, + "grad_norm": 1.0130410365871099, + "learning_rate": 6.5236691716587705e-06, + "loss": 0.548, + "step": 30430 + }, + { + "epoch": 0.8882241354151467, + "grad_norm": 1.1563452810017223, + "learning_rate": 6.5197440135136305e-06, + "loss": 0.5631, + "step": 30435 + }, + { + "epoch": 0.8883700569093828, + "grad_norm": 1.0189653894835977, + "learning_rate": 6.51582374101294e-06, + "loss": 0.5582, + "step": 30440 + }, + { + "epoch": 0.8885159784036188, + "grad_norm": 0.9860787064062873, + "learning_rate": 6.511908355069613e-06, + "loss": 0.5381, + "step": 30445 + }, + { + "epoch": 0.8886618998978549, + "grad_norm": 1.0889951926286519, + "learning_rate": 6.507997856595409e-06, + "loss": 0.5415, + "step": 30450 + }, + { + "epoch": 0.888807821392091, + "grad_norm": 0.9779808812381271, + "learning_rate": 6.504092246500972e-06, + "loss": 0.4962, + "step": 30455 + }, + { + "epoch": 0.8889537428863271, + "grad_norm": 1.1941593001987445, + "learning_rate": 6.500191525695794e-06, + "loss": 0.5913, + "step": 30460 + }, + { + "epoch": 0.8890996643805632, + "grad_norm": 0.8411551052934448, + "learning_rate": 6.496295695088227e-06, + "loss": 0.5503, + "step": 30465 + }, + { + "epoch": 0.8892455858747994, + "grad_norm": 0.9416121831704969, + "learning_rate": 6.492404755585488e-06, + "loss": 0.5031, + "step": 30470 + }, + { + "epoch": 0.8893915073690355, + "grad_norm": 1.0317188628948697, + "learning_rate": 6.488518708093653e-06, + "loss": 0.5422, + "step": 30475 + }, + { + "epoch": 0.8895374288632716, + "grad_norm": 1.0034729077229914, + "learning_rate": 6.4846375535176725e-06, + "loss": 0.5646, + "step": 30480 + }, + { + "epoch": 0.8896833503575077, + "grad_norm": 1.0098002117844118, + "learning_rate": 6.480761292761332e-06, + "loss": 0.5516, + "step": 30485 + }, + { + "epoch": 0.8898292718517438, + "grad_norm": 0.9164785215798202, + "learning_rate": 6.4768899267273005e-06, + "loss": 0.5004, + "step": 30490 + }, + { + "epoch": 0.8899751933459799, + "grad_norm": 0.9290579640605727, + "learning_rate": 6.4730234563170945e-06, + "loss": 0.5079, + "step": 30495 + }, + { + "epoch": 0.8901211148402159, + "grad_norm": 1.089062137643188, + "learning_rate": 6.469161882431089e-06, + "loss": 0.5335, + "step": 30500 + }, + { + "epoch": 0.890267036334452, + "grad_norm": 1.0619621333094895, + "learning_rate": 6.465305205968532e-06, + "loss": 0.5544, + "step": 30505 + }, + { + "epoch": 0.8904129578286881, + "grad_norm": 0.9297827314654659, + "learning_rate": 6.461453427827514e-06, + "loss": 0.5045, + "step": 30510 + }, + { + "epoch": 0.8905588793229242, + "grad_norm": 1.0819081817818819, + "learning_rate": 6.457606548905006e-06, + "loss": 0.5566, + "step": 30515 + }, + { + "epoch": 0.8907048008171604, + "grad_norm": 1.1006931447397736, + "learning_rate": 6.453764570096811e-06, + "loss": 0.5574, + "step": 30520 + }, + { + "epoch": 0.8908507223113965, + "grad_norm": 0.998948901241452, + "learning_rate": 6.449927492297618e-06, + "loss": 0.4858, + "step": 30525 + }, + { + "epoch": 0.8909966438056326, + "grad_norm": 0.9345295884985156, + "learning_rate": 6.446095316400954e-06, + "loss": 0.4846, + "step": 30530 + }, + { + "epoch": 0.8911425652998687, + "grad_norm": 1.0212378264166426, + "learning_rate": 6.442268043299214e-06, + "loss": 0.5222, + "step": 30535 + }, + { + "epoch": 0.8912884867941048, + "grad_norm": 0.9951932532848269, + "learning_rate": 6.438445673883657e-06, + "loss": 0.5258, + "step": 30540 + }, + { + "epoch": 0.8914344082883409, + "grad_norm": 1.0502407080762461, + "learning_rate": 6.43462820904438e-06, + "loss": 0.5669, + "step": 30545 + }, + { + "epoch": 0.891580329782577, + "grad_norm": 1.0536413917561414, + "learning_rate": 6.430815649670363e-06, + "loss": 0.5646, + "step": 30550 + }, + { + "epoch": 0.891726251276813, + "grad_norm": 1.0644268500888934, + "learning_rate": 6.427007996649427e-06, + "loss": 0.5524, + "step": 30555 + }, + { + "epoch": 0.8918721727710491, + "grad_norm": 0.998014763097728, + "learning_rate": 6.4232052508682545e-06, + "loss": 0.5336, + "step": 30560 + }, + { + "epoch": 0.8920180942652852, + "grad_norm": 0.9854768318073398, + "learning_rate": 6.419407413212389e-06, + "loss": 0.5575, + "step": 30565 + }, + { + "epoch": 0.8921640157595214, + "grad_norm": 1.0308011294562738, + "learning_rate": 6.415614484566221e-06, + "loss": 0.498, + "step": 30570 + }, + { + "epoch": 0.8923099372537575, + "grad_norm": 1.1074020671383793, + "learning_rate": 6.41182646581301e-06, + "loss": 0.555, + "step": 30575 + }, + { + "epoch": 0.8924558587479936, + "grad_norm": 1.1034009482283433, + "learning_rate": 6.408043357834867e-06, + "loss": 0.5582, + "step": 30580 + }, + { + "epoch": 0.8926017802422297, + "grad_norm": 0.9759480031010573, + "learning_rate": 6.404265161512765e-06, + "loss": 0.516, + "step": 30585 + }, + { + "epoch": 0.8927477017364658, + "grad_norm": 1.2376174869442904, + "learning_rate": 6.400491877726511e-06, + "loss": 0.5826, + "step": 30590 + }, + { + "epoch": 0.8928936232307019, + "grad_norm": 1.1279783951884765, + "learning_rate": 6.3967235073548025e-06, + "loss": 0.5925, + "step": 30595 + }, + { + "epoch": 0.893039544724938, + "grad_norm": 1.1641281275413724, + "learning_rate": 6.392960051275165e-06, + "loss": 0.5903, + "step": 30600 + }, + { + "epoch": 0.893185466219174, + "grad_norm": 1.093501993710345, + "learning_rate": 6.38920151036399e-06, + "loss": 0.5321, + "step": 30605 + }, + { + "epoch": 0.8933313877134101, + "grad_norm": 1.0689202478975515, + "learning_rate": 6.385447885496531e-06, + "loss": 0.6033, + "step": 30610 + }, + { + "epoch": 0.8934773092076462, + "grad_norm": 1.039256245212493, + "learning_rate": 6.381699177546877e-06, + "loss": 0.5536, + "step": 30615 + }, + { + "epoch": 0.8936232307018824, + "grad_norm": 1.1495482286410592, + "learning_rate": 6.3779553873879985e-06, + "loss": 0.5047, + "step": 30620 + }, + { + "epoch": 0.8937691521961185, + "grad_norm": 1.1706408482212765, + "learning_rate": 6.374216515891694e-06, + "loss": 0.5584, + "step": 30625 + }, + { + "epoch": 0.8939150736903546, + "grad_norm": 0.9046635457155036, + "learning_rate": 6.37048256392864e-06, + "loss": 0.4874, + "step": 30630 + }, + { + "epoch": 0.8940609951845907, + "grad_norm": 1.076673238601162, + "learning_rate": 6.366753532368354e-06, + "loss": 0.5154, + "step": 30635 + }, + { + "epoch": 0.8942069166788268, + "grad_norm": 0.9808198535416821, + "learning_rate": 6.36302942207921e-06, + "loss": 0.4801, + "step": 30640 + }, + { + "epoch": 0.8943528381730629, + "grad_norm": 0.9291082799777206, + "learning_rate": 6.359310233928435e-06, + "loss": 0.4638, + "step": 30645 + }, + { + "epoch": 0.894498759667299, + "grad_norm": 0.9941111289346053, + "learning_rate": 6.355595968782109e-06, + "loss": 0.5129, + "step": 30650 + }, + { + "epoch": 0.8946446811615351, + "grad_norm": 1.13546093091978, + "learning_rate": 6.35188662750518e-06, + "loss": 0.4884, + "step": 30655 + }, + { + "epoch": 0.8947906026557712, + "grad_norm": 1.0981551462292058, + "learning_rate": 6.348182210961423e-06, + "loss": 0.548, + "step": 30660 + }, + { + "epoch": 0.8949365241500074, + "grad_norm": 0.9983409027611266, + "learning_rate": 6.344482720013489e-06, + "loss": 0.5461, + "step": 30665 + }, + { + "epoch": 0.8950824456442434, + "grad_norm": 1.1207387387030325, + "learning_rate": 6.34078815552287e-06, + "loss": 0.5795, + "step": 30670 + }, + { + "epoch": 0.8952283671384795, + "grad_norm": 1.0027070461084964, + "learning_rate": 6.337098518349914e-06, + "loss": 0.5112, + "step": 30675 + }, + { + "epoch": 0.8953742886327156, + "grad_norm": 0.8992752510587828, + "learning_rate": 6.33341380935383e-06, + "loss": 0.5356, + "step": 30680 + }, + { + "epoch": 0.8955202101269517, + "grad_norm": 0.999846550642492, + "learning_rate": 6.329734029392658e-06, + "loss": 0.5636, + "step": 30685 + }, + { + "epoch": 0.8956661316211878, + "grad_norm": 0.8461577927220665, + "learning_rate": 6.326059179323314e-06, + "loss": 0.4809, + "step": 30690 + }, + { + "epoch": 0.8958120531154239, + "grad_norm": 1.0883345393068011, + "learning_rate": 6.322389260001548e-06, + "loss": 0.5363, + "step": 30695 + }, + { + "epoch": 0.89595797460966, + "grad_norm": 1.0750080410173237, + "learning_rate": 6.318724272281976e-06, + "loss": 0.4851, + "step": 30700 + }, + { + "epoch": 0.8961038961038961, + "grad_norm": 1.001055464648106, + "learning_rate": 6.315064217018057e-06, + "loss": 0.5549, + "step": 30705 + }, + { + "epoch": 0.8962498175981322, + "grad_norm": 1.097390498263082, + "learning_rate": 6.311409095062103e-06, + "loss": 0.5132, + "step": 30710 + }, + { + "epoch": 0.8963957390923684, + "grad_norm": 1.0217109796596096, + "learning_rate": 6.307758907265275e-06, + "loss": 0.5576, + "step": 30715 + }, + { + "epoch": 0.8965416605866044, + "grad_norm": 1.0436872364897183, + "learning_rate": 6.304113654477585e-06, + "loss": 0.5685, + "step": 30720 + }, + { + "epoch": 0.8966875820808405, + "grad_norm": 1.0420778674501554, + "learning_rate": 6.300473337547906e-06, + "loss": 0.5512, + "step": 30725 + }, + { + "epoch": 0.8968335035750766, + "grad_norm": 0.9639679566618371, + "learning_rate": 6.296837957323952e-06, + "loss": 0.5194, + "step": 30730 + }, + { + "epoch": 0.8969794250693127, + "grad_norm": 1.06983236185863, + "learning_rate": 6.293207514652284e-06, + "loss": 0.5219, + "step": 30735 + }, + { + "epoch": 0.8971253465635488, + "grad_norm": 1.0289358069190055, + "learning_rate": 6.289582010378324e-06, + "loss": 0.5011, + "step": 30740 + }, + { + "epoch": 0.8972712680577849, + "grad_norm": 1.1435896903479865, + "learning_rate": 6.285961445346332e-06, + "loss": 0.487, + "step": 30745 + }, + { + "epoch": 0.897417189552021, + "grad_norm": 1.0296068100496318, + "learning_rate": 6.2823458203994325e-06, + "loss": 0.4947, + "step": 30750 + }, + { + "epoch": 0.8975631110462571, + "grad_norm": 0.8520463791349125, + "learning_rate": 6.278735136379582e-06, + "loss": 0.5364, + "step": 30755 + }, + { + "epoch": 0.8977090325404932, + "grad_norm": 1.1820495439998748, + "learning_rate": 6.275129394127605e-06, + "loss": 0.5531, + "step": 30760 + }, + { + "epoch": 0.8978549540347294, + "grad_norm": 0.9335935533542379, + "learning_rate": 6.271528594483159e-06, + "loss": 0.5547, + "step": 30765 + }, + { + "epoch": 0.8980008755289655, + "grad_norm": 1.0175668164630614, + "learning_rate": 6.2679327382847595e-06, + "loss": 0.496, + "step": 30770 + }, + { + "epoch": 0.8981467970232015, + "grad_norm": 1.0026736727604557, + "learning_rate": 6.2643418263697724e-06, + "loss": 0.4938, + "step": 30775 + }, + { + "epoch": 0.8982927185174376, + "grad_norm": 0.9822216686811812, + "learning_rate": 6.260755859574405e-06, + "loss": 0.5322, + "step": 30780 + }, + { + "epoch": 0.8984386400116737, + "grad_norm": 0.9349368027537309, + "learning_rate": 6.257174838733715e-06, + "loss": 0.586, + "step": 30785 + }, + { + "epoch": 0.8985845615059098, + "grad_norm": 0.9742324521791845, + "learning_rate": 6.253598764681609e-06, + "loss": 0.4639, + "step": 30790 + }, + { + "epoch": 0.8987304830001459, + "grad_norm": 0.9703682221450168, + "learning_rate": 6.250027638250852e-06, + "loss": 0.5122, + "step": 30795 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.9846377186188879, + "learning_rate": 6.246461460273038e-06, + "loss": 0.5596, + "step": 30800 + }, + { + "epoch": 0.8990223259886181, + "grad_norm": 1.2070277199718502, + "learning_rate": 6.242900231578623e-06, + "loss": 0.5209, + "step": 30805 + }, + { + "epoch": 0.8991682474828542, + "grad_norm": 1.2038504457706052, + "learning_rate": 6.2393439529969035e-06, + "loss": 0.5228, + "step": 30810 + }, + { + "epoch": 0.8993141689770904, + "grad_norm": 0.9211033639959786, + "learning_rate": 6.235792625356023e-06, + "loss": 0.487, + "step": 30815 + }, + { + "epoch": 0.8994600904713265, + "grad_norm": 0.9888143242627709, + "learning_rate": 6.23224624948298e-06, + "loss": 0.5685, + "step": 30820 + }, + { + "epoch": 0.8996060119655626, + "grad_norm": 1.0831973209260362, + "learning_rate": 6.228704826203613e-06, + "loss": 0.5212, + "step": 30825 + }, + { + "epoch": 0.8997519334597986, + "grad_norm": 1.021032267486361, + "learning_rate": 6.225168356342607e-06, + "loss": 0.5091, + "step": 30830 + }, + { + "epoch": 0.8998978549540347, + "grad_norm": 0.9391812438786102, + "learning_rate": 6.2216368407234895e-06, + "loss": 0.4981, + "step": 30835 + }, + { + "epoch": 0.9000437764482708, + "grad_norm": 1.1233057834540503, + "learning_rate": 6.218110280168651e-06, + "loss": 0.5176, + "step": 30840 + }, + { + "epoch": 0.9001896979425069, + "grad_norm": 0.9746962411251152, + "learning_rate": 6.214588675499312e-06, + "loss": 0.539, + "step": 30845 + }, + { + "epoch": 0.900335619436743, + "grad_norm": 1.0070858772131686, + "learning_rate": 6.211072027535543e-06, + "loss": 0.5201, + "step": 30850 + }, + { + "epoch": 0.9004815409309791, + "grad_norm": 1.0557704283842555, + "learning_rate": 6.207560337096262e-06, + "loss": 0.6069, + "step": 30855 + }, + { + "epoch": 0.9006274624252152, + "grad_norm": 1.098024224395763, + "learning_rate": 6.204053604999226e-06, + "loss": 0.5088, + "step": 30860 + }, + { + "epoch": 0.9007733839194514, + "grad_norm": 1.0813060862779902, + "learning_rate": 6.200551832061055e-06, + "loss": 0.5206, + "step": 30865 + }, + { + "epoch": 0.9009193054136875, + "grad_norm": 1.0193421618087972, + "learning_rate": 6.197055019097194e-06, + "loss": 0.4993, + "step": 30870 + }, + { + "epoch": 0.9010652269079236, + "grad_norm": 1.104771089747469, + "learning_rate": 6.1935631669219436e-06, + "loss": 0.5193, + "step": 30875 + }, + { + "epoch": 0.9012111484021597, + "grad_norm": 0.921495152216103, + "learning_rate": 6.190076276348443e-06, + "loss": 0.522, + "step": 30880 + }, + { + "epoch": 0.9013570698963957, + "grad_norm": 1.071173638171577, + "learning_rate": 6.186594348188684e-06, + "loss": 0.5382, + "step": 30885 + }, + { + "epoch": 0.9015029913906318, + "grad_norm": 1.0051369129160548, + "learning_rate": 6.183117383253499e-06, + "loss": 0.4599, + "step": 30890 + }, + { + "epoch": 0.9016489128848679, + "grad_norm": 1.084468010080285, + "learning_rate": 6.179645382352563e-06, + "loss": 0.4742, + "step": 30895 + }, + { + "epoch": 0.901794834379104, + "grad_norm": 1.17305276879897, + "learning_rate": 6.176178346294397e-06, + "loss": 0.5709, + "step": 30900 + }, + { + "epoch": 0.9019407558733401, + "grad_norm": 1.0703681576706494, + "learning_rate": 6.172716275886358e-06, + "loss": 0.5468, + "step": 30905 + }, + { + "epoch": 0.9020866773675762, + "grad_norm": 1.0014560348550983, + "learning_rate": 6.169259171934666e-06, + "loss": 0.5097, + "step": 30910 + }, + { + "epoch": 0.9022325988618124, + "grad_norm": 0.9161118264601708, + "learning_rate": 6.165807035244364e-06, + "loss": 0.5348, + "step": 30915 + }, + { + "epoch": 0.9023785203560485, + "grad_norm": 1.033482234226185, + "learning_rate": 6.162359866619351e-06, + "loss": 0.5704, + "step": 30920 + }, + { + "epoch": 0.9025244418502846, + "grad_norm": 0.8376083515932418, + "learning_rate": 6.158917666862359e-06, + "loss": 0.476, + "step": 30925 + }, + { + "epoch": 0.9026703633445207, + "grad_norm": 0.981333761604822, + "learning_rate": 6.155480436774971e-06, + "loss": 0.5033, + "step": 30930 + }, + { + "epoch": 0.9028162848387568, + "grad_norm": 1.0863659765634708, + "learning_rate": 6.1520481771576145e-06, + "loss": 0.4721, + "step": 30935 + }, + { + "epoch": 0.9029622063329928, + "grad_norm": 0.9487429074383799, + "learning_rate": 6.148620888809549e-06, + "loss": 0.5785, + "step": 30940 + }, + { + "epoch": 0.9031081278272289, + "grad_norm": 0.9893157463033818, + "learning_rate": 6.145198572528888e-06, + "loss": 0.4978, + "step": 30945 + }, + { + "epoch": 0.903254049321465, + "grad_norm": 1.071340088807136, + "learning_rate": 6.141781229112573e-06, + "loss": 0.5504, + "step": 30950 + }, + { + "epoch": 0.9033999708157011, + "grad_norm": 1.0187993075512538, + "learning_rate": 6.138368859356408e-06, + "loss": 0.5676, + "step": 30955 + }, + { + "epoch": 0.9035458923099372, + "grad_norm": 0.9998555113521561, + "learning_rate": 6.1349614640550195e-06, + "loss": 0.4819, + "step": 30960 + }, + { + "epoch": 0.9036918138041734, + "grad_norm": 0.9418079183898496, + "learning_rate": 6.131559044001885e-06, + "loss": 0.473, + "step": 30965 + }, + { + "epoch": 0.9038377352984095, + "grad_norm": 1.1005790767556463, + "learning_rate": 6.128161599989324e-06, + "loss": 0.528, + "step": 30970 + }, + { + "epoch": 0.9039836567926456, + "grad_norm": 1.005161609810259, + "learning_rate": 6.124769132808485e-06, + "loss": 0.5238, + "step": 30975 + }, + { + "epoch": 0.9041295782868817, + "grad_norm": 0.8513289649031828, + "learning_rate": 6.121381643249381e-06, + "loss": 0.5252, + "step": 30980 + }, + { + "epoch": 0.9042754997811178, + "grad_norm": 0.989182404932506, + "learning_rate": 6.1179991321008456e-06, + "loss": 0.5577, + "step": 30985 + }, + { + "epoch": 0.9044214212753539, + "grad_norm": 0.9695530732676745, + "learning_rate": 6.1146216001505596e-06, + "loss": 0.4698, + "step": 30990 + }, + { + "epoch": 0.9045673427695899, + "grad_norm": 1.1652209252209018, + "learning_rate": 6.111249048185042e-06, + "loss": 0.5786, + "step": 30995 + }, + { + "epoch": 0.904713264263826, + "grad_norm": 0.9759425072951008, + "learning_rate": 6.107881476989656e-06, + "loss": 0.5037, + "step": 31000 + }, + { + "epoch": 0.9048591857580621, + "grad_norm": 0.9957514683998013, + "learning_rate": 6.10451888734861e-06, + "loss": 0.5471, + "step": 31005 + }, + { + "epoch": 0.9050051072522982, + "grad_norm": 0.9927921285007671, + "learning_rate": 6.101161280044938e-06, + "loss": 0.5007, + "step": 31010 + }, + { + "epoch": 0.9051510287465344, + "grad_norm": 1.0715586324501267, + "learning_rate": 6.097808655860526e-06, + "loss": 0.5838, + "step": 31015 + }, + { + "epoch": 0.9052969502407705, + "grad_norm": 1.0977381208203096, + "learning_rate": 6.094461015576088e-06, + "loss": 0.524, + "step": 31020 + }, + { + "epoch": 0.9054428717350066, + "grad_norm": 1.0654985153439427, + "learning_rate": 6.091118359971195e-06, + "loss": 0.5705, + "step": 31025 + }, + { + "epoch": 0.9055887932292427, + "grad_norm": 1.119556846864035, + "learning_rate": 6.087780689824239e-06, + "loss": 0.4362, + "step": 31030 + }, + { + "epoch": 0.9057347147234788, + "grad_norm": 1.0810583967789849, + "learning_rate": 6.084448005912463e-06, + "loss": 0.5764, + "step": 31035 + }, + { + "epoch": 0.9058806362177149, + "grad_norm": 1.254736426712116, + "learning_rate": 6.081120309011944e-06, + "loss": 0.5431, + "step": 31040 + }, + { + "epoch": 0.906026557711951, + "grad_norm": 1.021062231911103, + "learning_rate": 6.077797599897593e-06, + "loss": 0.4926, + "step": 31045 + }, + { + "epoch": 0.906172479206187, + "grad_norm": 1.0208524727279626, + "learning_rate": 6.0744798793431724e-06, + "loss": 0.5446, + "step": 31050 + }, + { + "epoch": 0.9063184007004231, + "grad_norm": 1.0155441200013657, + "learning_rate": 6.071167148121269e-06, + "loss": 0.5116, + "step": 31055 + }, + { + "epoch": 0.9064643221946592, + "grad_norm": 0.9578405124866007, + "learning_rate": 6.0678594070033235e-06, + "loss": 0.5449, + "step": 31060 + }, + { + "epoch": 0.9066102436888954, + "grad_norm": 1.1023921440251614, + "learning_rate": 6.064556656759596e-06, + "loss": 0.4987, + "step": 31065 + }, + { + "epoch": 0.9067561651831315, + "grad_norm": 1.0295573316020605, + "learning_rate": 6.061258898159196e-06, + "loss": 0.511, + "step": 31070 + }, + { + "epoch": 0.9069020866773676, + "grad_norm": 0.9426448869519591, + "learning_rate": 6.057966131970069e-06, + "loss": 0.5098, + "step": 31075 + }, + { + "epoch": 0.9070480081716037, + "grad_norm": 1.414166816596012, + "learning_rate": 6.054678358958997e-06, + "loss": 0.5892, + "step": 31080 + }, + { + "epoch": 0.9071939296658398, + "grad_norm": 1.0285945298071992, + "learning_rate": 6.0513955798916025e-06, + "loss": 0.5078, + "step": 31085 + }, + { + "epoch": 0.9073398511600759, + "grad_norm": 1.0085168820771129, + "learning_rate": 6.048117795532335e-06, + "loss": 0.5288, + "step": 31090 + }, + { + "epoch": 0.907485772654312, + "grad_norm": 0.92540792874259, + "learning_rate": 6.044845006644492e-06, + "loss": 0.4996, + "step": 31095 + }, + { + "epoch": 0.907631694148548, + "grad_norm": 0.9301329440583919, + "learning_rate": 6.041577213990205e-06, + "loss": 0.4886, + "step": 31100 + }, + { + "epoch": 0.9077776156427841, + "grad_norm": 0.9458118354448951, + "learning_rate": 6.03831441833044e-06, + "loss": 0.5403, + "step": 31105 + }, + { + "epoch": 0.9079235371370202, + "grad_norm": 0.8794406711524512, + "learning_rate": 6.035056620424998e-06, + "loss": 0.4868, + "step": 31110 + }, + { + "epoch": 0.9080694586312564, + "grad_norm": 1.1824427520914502, + "learning_rate": 6.031803821032513e-06, + "loss": 0.5416, + "step": 31115 + }, + { + "epoch": 0.9082153801254925, + "grad_norm": 1.0732724327646415, + "learning_rate": 6.028556020910472e-06, + "loss": 0.4917, + "step": 31120 + }, + { + "epoch": 0.9083613016197286, + "grad_norm": 0.9178265748231079, + "learning_rate": 6.0253132208151725e-06, + "loss": 0.5868, + "step": 31125 + }, + { + "epoch": 0.9085072231139647, + "grad_norm": 1.0284250625449678, + "learning_rate": 6.022075421501778e-06, + "loss": 0.5512, + "step": 31130 + }, + { + "epoch": 0.9086531446082008, + "grad_norm": 1.063822860478113, + "learning_rate": 6.018842623724253e-06, + "loss": 0.534, + "step": 31135 + }, + { + "epoch": 0.9087990661024369, + "grad_norm": 1.6631472430847554, + "learning_rate": 6.015614828235426e-06, + "loss": 0.5461, + "step": 31140 + }, + { + "epoch": 0.908944987596673, + "grad_norm": 1.0706749562589644, + "learning_rate": 6.012392035786947e-06, + "loss": 0.5286, + "step": 31145 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.9024397453829718, + "learning_rate": 6.009174247129299e-06, + "loss": 0.5136, + "step": 31150 + }, + { + "epoch": 0.9092368305851452, + "grad_norm": 0.923137024392914, + "learning_rate": 6.005961463011813e-06, + "loss": 0.5525, + "step": 31155 + }, + { + "epoch": 0.9093827520793812, + "grad_norm": 0.9854482912022399, + "learning_rate": 6.002753684182637e-06, + "loss": 0.5132, + "step": 31160 + }, + { + "epoch": 0.9095286735736174, + "grad_norm": 0.9583724268021456, + "learning_rate": 5.9995509113887666e-06, + "loss": 0.4881, + "step": 31165 + }, + { + "epoch": 0.9096745950678535, + "grad_norm": 1.1056715192931814, + "learning_rate": 5.9963531453760255e-06, + "loss": 0.5461, + "step": 31170 + }, + { + "epoch": 0.9098205165620896, + "grad_norm": 0.9983701486504825, + "learning_rate": 5.9931603868890735e-06, + "loss": 0.5326, + "step": 31175 + }, + { + "epoch": 0.9099664380563257, + "grad_norm": 1.0084330801741046, + "learning_rate": 5.989972636671409e-06, + "loss": 0.5402, + "step": 31180 + }, + { + "epoch": 0.9101123595505618, + "grad_norm": 1.0645425432848696, + "learning_rate": 5.986789895465348e-06, + "loss": 0.5829, + "step": 31185 + }, + { + "epoch": 0.9102582810447979, + "grad_norm": 1.1019803813358091, + "learning_rate": 5.983612164012062e-06, + "loss": 0.5166, + "step": 31190 + }, + { + "epoch": 0.910404202539034, + "grad_norm": 1.164090499797717, + "learning_rate": 5.980439443051539e-06, + "loss": 0.5006, + "step": 31195 + }, + { + "epoch": 0.9105501240332701, + "grad_norm": 1.005277959036037, + "learning_rate": 5.977271733322615e-06, + "loss": 0.5354, + "step": 31200 + }, + { + "epoch": 0.9106960455275062, + "grad_norm": 1.033691536534947, + "learning_rate": 5.9741090355629355e-06, + "loss": 0.6019, + "step": 31205 + }, + { + "epoch": 0.9108419670217422, + "grad_norm": 1.0253810412060618, + "learning_rate": 5.970951350509006e-06, + "loss": 0.553, + "step": 31210 + }, + { + "epoch": 0.9109878885159785, + "grad_norm": 1.2592189362163204, + "learning_rate": 5.96779867889615e-06, + "loss": 0.5902, + "step": 31215 + }, + { + "epoch": 0.9111338100102145, + "grad_norm": 1.0394923775141476, + "learning_rate": 5.964651021458521e-06, + "loss": 0.5146, + "step": 31220 + }, + { + "epoch": 0.9112797315044506, + "grad_norm": 1.0153763223121535, + "learning_rate": 5.961508378929117e-06, + "loss": 0.5195, + "step": 31225 + }, + { + "epoch": 0.9114256529986867, + "grad_norm": 0.9961698768918347, + "learning_rate": 5.958370752039753e-06, + "loss": 0.5614, + "step": 31230 + }, + { + "epoch": 0.9115715744929228, + "grad_norm": 1.0450466103250151, + "learning_rate": 5.9552381415210925e-06, + "loss": 0.5321, + "step": 31235 + }, + { + "epoch": 0.9117174959871589, + "grad_norm": 1.068705360286735, + "learning_rate": 5.952110548102617e-06, + "loss": 0.5059, + "step": 31240 + }, + { + "epoch": 0.911863417481395, + "grad_norm": 0.9964858760523883, + "learning_rate": 5.948987972512644e-06, + "loss": 0.5057, + "step": 31245 + }, + { + "epoch": 0.9120093389756311, + "grad_norm": 0.9289308056371659, + "learning_rate": 5.94587041547833e-06, + "loss": 0.4789, + "step": 31250 + }, + { + "epoch": 0.9121552604698672, + "grad_norm": 0.9809821991870397, + "learning_rate": 5.9427578777256485e-06, + "loss": 0.5331, + "step": 31255 + }, + { + "epoch": 0.9123011819641033, + "grad_norm": 0.9809166605745736, + "learning_rate": 5.9396503599794175e-06, + "loss": 0.5758, + "step": 31260 + }, + { + "epoch": 0.9124471034583395, + "grad_norm": 1.084114207476625, + "learning_rate": 5.9365478629632755e-06, + "loss": 0.5394, + "step": 31265 + }, + { + "epoch": 0.9125930249525755, + "grad_norm": 1.091027514947355, + "learning_rate": 5.933450387399709e-06, + "loss": 0.5398, + "step": 31270 + }, + { + "epoch": 0.9127389464468116, + "grad_norm": 1.0994095365611294, + "learning_rate": 5.930357934010004e-06, + "loss": 0.578, + "step": 31275 + }, + { + "epoch": 0.9128848679410477, + "grad_norm": 0.9351519000209817, + "learning_rate": 5.927270503514312e-06, + "loss": 0.5068, + "step": 31280 + }, + { + "epoch": 0.9130307894352838, + "grad_norm": 1.055765290402011, + "learning_rate": 5.924188096631592e-06, + "loss": 0.5253, + "step": 31285 + }, + { + "epoch": 0.9131767109295199, + "grad_norm": 0.952439205718905, + "learning_rate": 5.9211107140796385e-06, + "loss": 0.5045, + "step": 31290 + }, + { + "epoch": 0.913322632423756, + "grad_norm": 0.9719293979550976, + "learning_rate": 5.918038356575085e-06, + "loss": 0.4997, + "step": 31295 + }, + { + "epoch": 0.9134685539179921, + "grad_norm": 0.9971675384795458, + "learning_rate": 5.914971024833379e-06, + "loss": 0.4517, + "step": 31300 + }, + { + "epoch": 0.9136144754122282, + "grad_norm": 1.0809282011065169, + "learning_rate": 5.9119087195688115e-06, + "loss": 0.5774, + "step": 31305 + }, + { + "epoch": 0.9137603969064643, + "grad_norm": 0.9332638910340019, + "learning_rate": 5.908851441494491e-06, + "loss": 0.4972, + "step": 31310 + }, + { + "epoch": 0.9139063184007005, + "grad_norm": 1.017298725479635, + "learning_rate": 5.90579919132237e-06, + "loss": 0.5519, + "step": 31315 + }, + { + "epoch": 0.9140522398949366, + "grad_norm": 0.9581703860876212, + "learning_rate": 5.902751969763217e-06, + "loss": 0.5308, + "step": 31320 + }, + { + "epoch": 0.9141981613891726, + "grad_norm": 0.9804958493778927, + "learning_rate": 5.899709777526638e-06, + "loss": 0.5896, + "step": 31325 + }, + { + "epoch": 0.9143440828834087, + "grad_norm": 1.0702718158927564, + "learning_rate": 5.89667261532106e-06, + "loss": 0.5051, + "step": 31330 + }, + { + "epoch": 0.9144900043776448, + "grad_norm": 1.0007373590109032, + "learning_rate": 5.893640483853743e-06, + "loss": 0.5801, + "step": 31335 + }, + { + "epoch": 0.9146359258718809, + "grad_norm": 1.013508659308757, + "learning_rate": 5.890613383830778e-06, + "loss": 0.5356, + "step": 31340 + }, + { + "epoch": 0.914781847366117, + "grad_norm": 1.0521093857573989, + "learning_rate": 5.8875913159570855e-06, + "loss": 0.5245, + "step": 31345 + }, + { + "epoch": 0.9149277688603531, + "grad_norm": 0.9727732752795446, + "learning_rate": 5.884574280936402e-06, + "loss": 0.5143, + "step": 31350 + }, + { + "epoch": 0.9150736903545892, + "grad_norm": 0.9160916668246117, + "learning_rate": 5.881562279471305e-06, + "loss": 0.4882, + "step": 31355 + }, + { + "epoch": 0.9152196118488254, + "grad_norm": 0.9770555239823763, + "learning_rate": 5.878555312263192e-06, + "loss": 0.5365, + "step": 31360 + }, + { + "epoch": 0.9153655333430615, + "grad_norm": 1.0827212181009942, + "learning_rate": 5.8755533800123015e-06, + "loss": 0.5197, + "step": 31365 + }, + { + "epoch": 0.9155114548372976, + "grad_norm": 0.9711317016550641, + "learning_rate": 5.872556483417676e-06, + "loss": 0.5078, + "step": 31370 + }, + { + "epoch": 0.9156573763315337, + "grad_norm": 0.9395837642777, + "learning_rate": 5.869564623177208e-06, + "loss": 0.538, + "step": 31375 + }, + { + "epoch": 0.9158032978257697, + "grad_norm": 0.976118613543372, + "learning_rate": 5.866577799987601e-06, + "loss": 0.5407, + "step": 31380 + }, + { + "epoch": 0.9159492193200058, + "grad_norm": 0.8918711864452179, + "learning_rate": 5.863596014544399e-06, + "loss": 0.4543, + "step": 31385 + }, + { + "epoch": 0.9160951408142419, + "grad_norm": 1.0104467179834946, + "learning_rate": 5.860619267541967e-06, + "loss": 0.5385, + "step": 31390 + }, + { + "epoch": 0.916241062308478, + "grad_norm": 0.9839933987243363, + "learning_rate": 5.85764755967349e-06, + "loss": 0.5159, + "step": 31395 + }, + { + "epoch": 0.9163869838027141, + "grad_norm": 1.016251482201384, + "learning_rate": 5.854680891630992e-06, + "loss": 0.5603, + "step": 31400 + }, + { + "epoch": 0.9165329052969502, + "grad_norm": 1.0128208992399501, + "learning_rate": 5.8517192641053106e-06, + "loss": 0.5332, + "step": 31405 + }, + { + "epoch": 0.9166788267911864, + "grad_norm": 0.9604393492902995, + "learning_rate": 5.848762677786121e-06, + "loss": 0.5196, + "step": 31410 + }, + { + "epoch": 0.9168247482854225, + "grad_norm": 0.9598333133687205, + "learning_rate": 5.845811133361918e-06, + "loss": 0.5027, + "step": 31415 + }, + { + "epoch": 0.9169706697796586, + "grad_norm": 1.0289666002720075, + "learning_rate": 5.8428646315200246e-06, + "loss": 0.5226, + "step": 31420 + }, + { + "epoch": 0.9171165912738947, + "grad_norm": 0.9744226413128271, + "learning_rate": 5.839923172946586e-06, + "loss": 0.504, + "step": 31425 + }, + { + "epoch": 0.9172625127681308, + "grad_norm": 1.0100517730443925, + "learning_rate": 5.836986758326578e-06, + "loss": 0.512, + "step": 31430 + }, + { + "epoch": 0.9174084342623668, + "grad_norm": 1.120101253174796, + "learning_rate": 5.834055388343801e-06, + "loss": 0.5284, + "step": 31435 + }, + { + "epoch": 0.9175543557566029, + "grad_norm": 0.9778272476700308, + "learning_rate": 5.8311290636808765e-06, + "loss": 0.4887, + "step": 31440 + }, + { + "epoch": 0.917700277250839, + "grad_norm": 1.1721872563674882, + "learning_rate": 5.828207785019255e-06, + "loss": 0.5162, + "step": 31445 + }, + { + "epoch": 0.9178461987450751, + "grad_norm": 1.0692293926090946, + "learning_rate": 5.825291553039208e-06, + "loss": 0.5194, + "step": 31450 + }, + { + "epoch": 0.9179921202393112, + "grad_norm": 0.9269333426138349, + "learning_rate": 5.8223803684198415e-06, + "loss": 0.5488, + "step": 31455 + }, + { + "epoch": 0.9181380417335474, + "grad_norm": 1.1208102992878484, + "learning_rate": 5.819474231839076e-06, + "loss": 0.5116, + "step": 31460 + }, + { + "epoch": 0.9182839632277835, + "grad_norm": 1.0488380508477457, + "learning_rate": 5.816573143973657e-06, + "loss": 0.5478, + "step": 31465 + }, + { + "epoch": 0.9184298847220196, + "grad_norm": 0.9572381107664635, + "learning_rate": 5.813677105499158e-06, + "loss": 0.5214, + "step": 31470 + }, + { + "epoch": 0.9185758062162557, + "grad_norm": 1.196483551614418, + "learning_rate": 5.810786117089975e-06, + "loss": 0.504, + "step": 31475 + }, + { + "epoch": 0.9187217277104918, + "grad_norm": 1.0389718264573946, + "learning_rate": 5.807900179419332e-06, + "loss": 0.5248, + "step": 31480 + }, + { + "epoch": 0.9188676492047279, + "grad_norm": 1.1205777267900252, + "learning_rate": 5.805019293159271e-06, + "loss": 0.5722, + "step": 31485 + }, + { + "epoch": 0.9190135706989639, + "grad_norm": 1.2602142760453576, + "learning_rate": 5.802143458980663e-06, + "loss": 0.5987, + "step": 31490 + }, + { + "epoch": 0.9191594921932, + "grad_norm": 0.8809563533784155, + "learning_rate": 5.799272677553196e-06, + "loss": 0.5534, + "step": 31495 + }, + { + "epoch": 0.9193054136874361, + "grad_norm": 1.022134406859791, + "learning_rate": 5.796406949545384e-06, + "loss": 0.5028, + "step": 31500 + }, + { + "epoch": 0.9194513351816722, + "grad_norm": 0.9958885546350226, + "learning_rate": 5.79354627562457e-06, + "loss": 0.5144, + "step": 31505 + }, + { + "epoch": 0.9195972566759084, + "grad_norm": 0.9586393693455683, + "learning_rate": 5.790690656456913e-06, + "loss": 0.4971, + "step": 31510 + }, + { + "epoch": 0.9197431781701445, + "grad_norm": 0.9104926669708108, + "learning_rate": 5.787840092707397e-06, + "loss": 0.5012, + "step": 31515 + }, + { + "epoch": 0.9198890996643806, + "grad_norm": 0.8971201381839433, + "learning_rate": 5.7849945850398265e-06, + "loss": 0.4606, + "step": 31520 + }, + { + "epoch": 0.9200350211586167, + "grad_norm": 1.0757963411310465, + "learning_rate": 5.78215413411684e-06, + "loss": 0.538, + "step": 31525 + }, + { + "epoch": 0.9201809426528528, + "grad_norm": 0.8092683643260684, + "learning_rate": 5.779318740599879e-06, + "loss": 0.4993, + "step": 31530 + }, + { + "epoch": 0.9203268641470889, + "grad_norm": 1.0734661349566186, + "learning_rate": 5.776488405149225e-06, + "loss": 0.5671, + "step": 31535 + }, + { + "epoch": 0.920472785641325, + "grad_norm": 1.1005848563672969, + "learning_rate": 5.773663128423971e-06, + "loss": 0.5109, + "step": 31540 + }, + { + "epoch": 0.920618707135561, + "grad_norm": 1.1131244445886301, + "learning_rate": 5.7708429110820345e-06, + "loss": 0.5196, + "step": 31545 + }, + { + "epoch": 0.9207646286297971, + "grad_norm": 1.024710100947874, + "learning_rate": 5.7680277537801596e-06, + "loss": 0.5611, + "step": 31550 + }, + { + "epoch": 0.9209105501240332, + "grad_norm": 0.9815784808476141, + "learning_rate": 5.7652176571739055e-06, + "loss": 0.4933, + "step": 31555 + }, + { + "epoch": 0.9210564716182694, + "grad_norm": 0.9674750595692396, + "learning_rate": 5.762412621917658e-06, + "loss": 0.5259, + "step": 31560 + }, + { + "epoch": 0.9212023931125055, + "grad_norm": 1.1112468733005603, + "learning_rate": 5.759612648664617e-06, + "loss": 0.5642, + "step": 31565 + }, + { + "epoch": 0.9213483146067416, + "grad_norm": 0.9704411756490212, + "learning_rate": 5.756817738066813e-06, + "loss": 0.4597, + "step": 31570 + }, + { + "epoch": 0.9214942361009777, + "grad_norm": 0.8942571739452657, + "learning_rate": 5.754027890775093e-06, + "loss": 0.5536, + "step": 31575 + }, + { + "epoch": 0.9216401575952138, + "grad_norm": 1.0540959562003775, + "learning_rate": 5.751243107439126e-06, + "loss": 0.5605, + "step": 31580 + }, + { + "epoch": 0.9217860790894499, + "grad_norm": 1.0479665689525084, + "learning_rate": 5.748463388707398e-06, + "loss": 0.497, + "step": 31585 + }, + { + "epoch": 0.921932000583686, + "grad_norm": 1.015230370189131, + "learning_rate": 5.745688735227215e-06, + "loss": 0.5243, + "step": 31590 + }, + { + "epoch": 0.922077922077922, + "grad_norm": 1.0138227540966689, + "learning_rate": 5.742919147644714e-06, + "loss": 0.54, + "step": 31595 + }, + { + "epoch": 0.9222238435721581, + "grad_norm": 0.9213330067075063, + "learning_rate": 5.740154626604845e-06, + "loss": 0.5009, + "step": 31600 + }, + { + "epoch": 0.9223697650663942, + "grad_norm": 0.9607157300944271, + "learning_rate": 5.737395172751372e-06, + "loss": 0.5205, + "step": 31605 + }, + { + "epoch": 0.9225156865606304, + "grad_norm": 0.9925073745629928, + "learning_rate": 5.734640786726893e-06, + "loss": 0.5078, + "step": 31610 + }, + { + "epoch": 0.9226616080548665, + "grad_norm": 1.0988873460142738, + "learning_rate": 5.7318914691728095e-06, + "loss": 0.5357, + "step": 31615 + }, + { + "epoch": 0.9228075295491026, + "grad_norm": 1.1493430640061477, + "learning_rate": 5.7291472207293594e-06, + "loss": 0.519, + "step": 31620 + }, + { + "epoch": 0.9229534510433387, + "grad_norm": 0.8916772655107527, + "learning_rate": 5.726408042035586e-06, + "loss": 0.481, + "step": 31625 + }, + { + "epoch": 0.9230993725375748, + "grad_norm": 0.993415329232206, + "learning_rate": 5.723673933729367e-06, + "loss": 0.5904, + "step": 31630 + }, + { + "epoch": 0.9232452940318109, + "grad_norm": 1.0223446604284172, + "learning_rate": 5.720944896447379e-06, + "loss": 0.5169, + "step": 31635 + }, + { + "epoch": 0.923391215526047, + "grad_norm": 1.1120195717784853, + "learning_rate": 5.71822093082514e-06, + "loss": 0.4938, + "step": 31640 + }, + { + "epoch": 0.9235371370202831, + "grad_norm": 0.9257167384486059, + "learning_rate": 5.7155020374969706e-06, + "loss": 0.4983, + "step": 31645 + }, + { + "epoch": 0.9236830585145192, + "grad_norm": 0.9266532227557428, + "learning_rate": 5.712788217096014e-06, + "loss": 0.556, + "step": 31650 + }, + { + "epoch": 0.9238289800087552, + "grad_norm": 1.1213606221453962, + "learning_rate": 5.71007947025424e-06, + "loss": 0.5566, + "step": 31655 + }, + { + "epoch": 0.9239749015029914, + "grad_norm": 0.9728696939200219, + "learning_rate": 5.7073757976024225e-06, + "loss": 0.492, + "step": 31660 + }, + { + "epoch": 0.9241208229972275, + "grad_norm": 0.9735082313707822, + "learning_rate": 5.704677199770172e-06, + "loss": 0.5256, + "step": 31665 + }, + { + "epoch": 0.9242667444914636, + "grad_norm": 1.0611148840662614, + "learning_rate": 5.701983677385903e-06, + "loss": 0.5449, + "step": 31670 + }, + { + "epoch": 0.9244126659856997, + "grad_norm": 0.9387481771581796, + "learning_rate": 5.699295231076852e-06, + "loss": 0.5736, + "step": 31675 + }, + { + "epoch": 0.9245585874799358, + "grad_norm": 1.0832717938936667, + "learning_rate": 5.696611861469072e-06, + "loss": 0.5682, + "step": 31680 + }, + { + "epoch": 0.9247045089741719, + "grad_norm": 0.8510028898215768, + "learning_rate": 5.693933569187437e-06, + "loss": 0.5166, + "step": 31685 + }, + { + "epoch": 0.924850430468408, + "grad_norm": 1.1235643002882414, + "learning_rate": 5.6912603548556405e-06, + "loss": 0.5046, + "step": 31690 + }, + { + "epoch": 0.9249963519626441, + "grad_norm": 0.9179832343001979, + "learning_rate": 5.688592219096184e-06, + "loss": 0.5033, + "step": 31695 + }, + { + "epoch": 0.9251422734568802, + "grad_norm": 1.1102966969451658, + "learning_rate": 5.685929162530405e-06, + "loss": 0.4932, + "step": 31700 + }, + { + "epoch": 0.9252881949511162, + "grad_norm": 0.969940724614742, + "learning_rate": 5.683271185778432e-06, + "loss": 0.4671, + "step": 31705 + }, + { + "epoch": 0.9254341164453525, + "grad_norm": 0.9642585952206327, + "learning_rate": 5.68061828945923e-06, + "loss": 0.5072, + "step": 31710 + }, + { + "epoch": 0.9255800379395885, + "grad_norm": 0.9583576991705849, + "learning_rate": 5.677970474190578e-06, + "loss": 0.5244, + "step": 31715 + }, + { + "epoch": 0.9257259594338246, + "grad_norm": 0.9014056225841488, + "learning_rate": 5.675327740589069e-06, + "loss": 0.539, + "step": 31720 + }, + { + "epoch": 0.9258718809280607, + "grad_norm": 1.0248490218534667, + "learning_rate": 5.672690089270107e-06, + "loss": 0.5196, + "step": 31725 + }, + { + "epoch": 0.9260178024222968, + "grad_norm": 0.9678155273003267, + "learning_rate": 5.670057520847923e-06, + "loss": 0.5237, + "step": 31730 + }, + { + "epoch": 0.9261637239165329, + "grad_norm": 1.0229218390382524, + "learning_rate": 5.66743003593556e-06, + "loss": 0.5519, + "step": 31735 + }, + { + "epoch": 0.926309645410769, + "grad_norm": 1.046174719379701, + "learning_rate": 5.664807635144873e-06, + "loss": 0.5102, + "step": 31740 + }, + { + "epoch": 0.9264555669050051, + "grad_norm": 1.071932799796896, + "learning_rate": 5.662190319086546e-06, + "loss": 0.5717, + "step": 31745 + }, + { + "epoch": 0.9266014883992412, + "grad_norm": 1.0729315072392658, + "learning_rate": 5.659578088370062e-06, + "loss": 0.5779, + "step": 31750 + }, + { + "epoch": 0.9267474098934773, + "grad_norm": 1.0915419069520298, + "learning_rate": 5.6569709436037265e-06, + "loss": 0.5585, + "step": 31755 + }, + { + "epoch": 0.9268933313877135, + "grad_norm": 1.0167125764508396, + "learning_rate": 5.654368885394666e-06, + "loss": 0.5309, + "step": 31760 + }, + { + "epoch": 0.9270392528819495, + "grad_norm": 1.0424229754296737, + "learning_rate": 5.651771914348817e-06, + "loss": 0.4994, + "step": 31765 + }, + { + "epoch": 0.9271851743761856, + "grad_norm": 1.1027655441406234, + "learning_rate": 5.649180031070937e-06, + "loss": 0.5154, + "step": 31770 + }, + { + "epoch": 0.9273310958704217, + "grad_norm": 0.9313753100857511, + "learning_rate": 5.646593236164586e-06, + "loss": 0.5091, + "step": 31775 + }, + { + "epoch": 0.9274770173646578, + "grad_norm": 1.0583117055235665, + "learning_rate": 5.644011530232152e-06, + "loss": 0.55, + "step": 31780 + }, + { + "epoch": 0.9276229388588939, + "grad_norm": 0.9143411243107543, + "learning_rate": 5.6414349138748375e-06, + "loss": 0.5434, + "step": 31785 + }, + { + "epoch": 0.92776886035313, + "grad_norm": 1.0386798196074016, + "learning_rate": 5.638863387692646e-06, + "loss": 0.5158, + "step": 31790 + }, + { + "epoch": 0.9279147818473661, + "grad_norm": 0.8997653072960133, + "learning_rate": 5.636296952284417e-06, + "loss": 0.4879, + "step": 31795 + }, + { + "epoch": 0.9280607033416022, + "grad_norm": 1.1642671816665333, + "learning_rate": 5.633735608247781e-06, + "loss": 0.5834, + "step": 31800 + }, + { + "epoch": 0.9282066248358383, + "grad_norm": 0.9874577506471407, + "learning_rate": 5.631179356179204e-06, + "loss": 0.4963, + "step": 31805 + }, + { + "epoch": 0.9283525463300745, + "grad_norm": 0.9642178660748422, + "learning_rate": 5.628628196673953e-06, + "loss": 0.5338, + "step": 31810 + }, + { + "epoch": 0.9284984678243106, + "grad_norm": 0.9570004738418421, + "learning_rate": 5.626082130326117e-06, + "loss": 0.5318, + "step": 31815 + }, + { + "epoch": 0.9286443893185466, + "grad_norm": 1.186193679196771, + "learning_rate": 5.623541157728586e-06, + "loss": 0.5355, + "step": 31820 + }, + { + "epoch": 0.9287903108127827, + "grad_norm": 0.9451195718359684, + "learning_rate": 5.621005279473083e-06, + "loss": 0.4632, + "step": 31825 + }, + { + "epoch": 0.9289362323070188, + "grad_norm": 0.9625344111486446, + "learning_rate": 5.618474496150131e-06, + "loss": 0.5392, + "step": 31830 + }, + { + "epoch": 0.9290821538012549, + "grad_norm": 1.0328375691556269, + "learning_rate": 5.615948808349066e-06, + "loss": 0.5493, + "step": 31835 + }, + { + "epoch": 0.929228075295491, + "grad_norm": 1.0515395906021583, + "learning_rate": 5.6134282166580525e-06, + "loss": 0.4935, + "step": 31840 + }, + { + "epoch": 0.9293739967897271, + "grad_norm": 1.3085672807154634, + "learning_rate": 5.610912721664043e-06, + "loss": 0.5334, + "step": 31845 + }, + { + "epoch": 0.9295199182839632, + "grad_norm": 1.0576759196709575, + "learning_rate": 5.608402323952827e-06, + "loss": 0.4972, + "step": 31850 + }, + { + "epoch": 0.9296658397781993, + "grad_norm": 1.0185436077379475, + "learning_rate": 5.605897024108997e-06, + "loss": 0.5883, + "step": 31855 + }, + { + "epoch": 0.9298117612724355, + "grad_norm": 1.133586188129723, + "learning_rate": 5.603396822715953e-06, + "loss": 0.5504, + "step": 31860 + }, + { + "epoch": 0.9299576827666716, + "grad_norm": 1.0660002075547204, + "learning_rate": 5.600901720355923e-06, + "loss": 0.5242, + "step": 31865 + }, + { + "epoch": 0.9301036042609077, + "grad_norm": 1.038727774726783, + "learning_rate": 5.5984117176099295e-06, + "loss": 0.5507, + "step": 31870 + }, + { + "epoch": 0.9302495257551437, + "grad_norm": 1.0055715703660268, + "learning_rate": 5.595926815057819e-06, + "loss": 0.5145, + "step": 31875 + }, + { + "epoch": 0.9303954472493798, + "grad_norm": 0.947604687560181, + "learning_rate": 5.593447013278247e-06, + "loss": 0.5041, + "step": 31880 + }, + { + "epoch": 0.9305413687436159, + "grad_norm": 0.9269548880671084, + "learning_rate": 5.590972312848686e-06, + "loss": 0.5092, + "step": 31885 + }, + { + "epoch": 0.930687290237852, + "grad_norm": 0.9513306453284274, + "learning_rate": 5.58850271434541e-06, + "loss": 0.5386, + "step": 31890 + }, + { + "epoch": 0.9308332117320881, + "grad_norm": 0.9582900018746017, + "learning_rate": 5.586038218343515e-06, + "loss": 0.5096, + "step": 31895 + }, + { + "epoch": 0.9309791332263242, + "grad_norm": 0.8940592201219112, + "learning_rate": 5.5835788254169045e-06, + "loss": 0.5629, + "step": 31900 + }, + { + "epoch": 0.9311250547205603, + "grad_norm": 0.9547257387754181, + "learning_rate": 5.5811245361382916e-06, + "loss": 0.5526, + "step": 31905 + }, + { + "epoch": 0.9312709762147965, + "grad_norm": 1.0317926827619388, + "learning_rate": 5.57867535107921e-06, + "loss": 0.5165, + "step": 31910 + }, + { + "epoch": 0.9314168977090326, + "grad_norm": 0.9533641542598994, + "learning_rate": 5.576231270809987e-06, + "loss": 0.514, + "step": 31915 + }, + { + "epoch": 0.9315628192032687, + "grad_norm": 1.1410699736394636, + "learning_rate": 5.573792295899782e-06, + "loss": 0.5169, + "step": 31920 + }, + { + "epoch": 0.9317087406975048, + "grad_norm": 1.009128012617985, + "learning_rate": 5.571358426916551e-06, + "loss": 0.5187, + "step": 31925 + }, + { + "epoch": 0.9318546621917408, + "grad_norm": 1.0778917206906513, + "learning_rate": 5.568929664427068e-06, + "loss": 0.5613, + "step": 31930 + }, + { + "epoch": 0.9320005836859769, + "grad_norm": 0.9078292135919422, + "learning_rate": 5.566506008996918e-06, + "loss": 0.5431, + "step": 31935 + }, + { + "epoch": 0.932146505180213, + "grad_norm": 1.0442124037312723, + "learning_rate": 5.564087461190488e-06, + "loss": 0.4817, + "step": 31940 + }, + { + "epoch": 0.9322924266744491, + "grad_norm": 1.1363502299439294, + "learning_rate": 5.5616740215709884e-06, + "loss": 0.5645, + "step": 31945 + }, + { + "epoch": 0.9324383481686852, + "grad_norm": 0.9184147280609047, + "learning_rate": 5.559265690700426e-06, + "loss": 0.4927, + "step": 31950 + }, + { + "epoch": 0.9325842696629213, + "grad_norm": 1.0352520867868198, + "learning_rate": 5.556862469139635e-06, + "loss": 0.5382, + "step": 31955 + }, + { + "epoch": 0.9327301911571575, + "grad_norm": 0.9675309386048626, + "learning_rate": 5.554464357448243e-06, + "loss": 0.5296, + "step": 31960 + }, + { + "epoch": 0.9328761126513936, + "grad_norm": 1.0196329024275952, + "learning_rate": 5.552071356184701e-06, + "loss": 0.5657, + "step": 31965 + }, + { + "epoch": 0.9330220341456297, + "grad_norm": 0.9408299908649591, + "learning_rate": 5.549683465906261e-06, + "loss": 0.4653, + "step": 31970 + }, + { + "epoch": 0.9331679556398658, + "grad_norm": 0.9822200148482348, + "learning_rate": 5.5473006871689855e-06, + "loss": 0.5159, + "step": 31975 + }, + { + "epoch": 0.9333138771341019, + "grad_norm": 1.129894230800963, + "learning_rate": 5.5449230205277525e-06, + "loss": 0.5997, + "step": 31980 + }, + { + "epoch": 0.933459798628338, + "grad_norm": 0.9775420186286264, + "learning_rate": 5.5425504665362445e-06, + "loss": 0.5201, + "step": 31985 + }, + { + "epoch": 0.933605720122574, + "grad_norm": 0.9276292516317872, + "learning_rate": 5.5401830257469565e-06, + "loss": 0.5204, + "step": 31990 + }, + { + "epoch": 0.9337516416168101, + "grad_norm": 1.0036440236335673, + "learning_rate": 5.537820698711189e-06, + "loss": 0.5344, + "step": 31995 + }, + { + "epoch": 0.9338975631110462, + "grad_norm": 1.151969581015306, + "learning_rate": 5.535463485979058e-06, + "loss": 0.5687, + "step": 32000 + }, + { + "epoch": 0.9340434846052823, + "grad_norm": 1.0729535065934772, + "learning_rate": 5.533111388099481e-06, + "loss": 0.5547, + "step": 32005 + }, + { + "epoch": 0.9341894060995185, + "grad_norm": 0.9617476627129081, + "learning_rate": 5.5307644056201865e-06, + "loss": 0.4852, + "step": 32010 + }, + { + "epoch": 0.9343353275937546, + "grad_norm": 0.9527595038161738, + "learning_rate": 5.528422539087719e-06, + "loss": 0.5022, + "step": 32015 + }, + { + "epoch": 0.9344812490879907, + "grad_norm": 1.0696385905057664, + "learning_rate": 5.526085789047418e-06, + "loss": 0.5683, + "step": 32020 + }, + { + "epoch": 0.9346271705822268, + "grad_norm": 0.9308166600790008, + "learning_rate": 5.5237541560434465e-06, + "loss": 0.4923, + "step": 32025 + }, + { + "epoch": 0.9347730920764629, + "grad_norm": 1.1002910979082146, + "learning_rate": 5.521427640618766e-06, + "loss": 0.5422, + "step": 32030 + }, + { + "epoch": 0.934919013570699, + "grad_norm": 1.0904448248167755, + "learning_rate": 5.519106243315149e-06, + "loss": 0.5455, + "step": 32035 + }, + { + "epoch": 0.935064935064935, + "grad_norm": 1.012187526547558, + "learning_rate": 5.516789964673177e-06, + "loss": 0.506, + "step": 32040 + }, + { + "epoch": 0.9352108565591711, + "grad_norm": 1.1166942946625376, + "learning_rate": 5.514478805232236e-06, + "loss": 0.5288, + "step": 32045 + }, + { + "epoch": 0.9353567780534072, + "grad_norm": 0.9722735365239863, + "learning_rate": 5.512172765530527e-06, + "loss": 0.5274, + "step": 32050 + }, + { + "epoch": 0.9355026995476433, + "grad_norm": 1.0099030286564756, + "learning_rate": 5.509871846105051e-06, + "loss": 0.5677, + "step": 32055 + }, + { + "epoch": 0.9356486210418795, + "grad_norm": 1.0077647828759178, + "learning_rate": 5.507576047491619e-06, + "loss": 0.4838, + "step": 32060 + }, + { + "epoch": 0.9357945425361156, + "grad_norm": 1.0187261792820985, + "learning_rate": 5.505285370224855e-06, + "loss": 0.4953, + "step": 32065 + }, + { + "epoch": 0.9359404640303517, + "grad_norm": 0.9529094247193499, + "learning_rate": 5.502999814838182e-06, + "loss": 0.5926, + "step": 32070 + }, + { + "epoch": 0.9360863855245878, + "grad_norm": 0.9988246938941732, + "learning_rate": 5.500719381863837e-06, + "loss": 0.5284, + "step": 32075 + }, + { + "epoch": 0.9362323070188239, + "grad_norm": 0.9719893841296832, + "learning_rate": 5.498444071832861e-06, + "loss": 0.5038, + "step": 32080 + }, + { + "epoch": 0.93637822851306, + "grad_norm": 1.105894495324846, + "learning_rate": 5.496173885275104e-06, + "loss": 0.5161, + "step": 32085 + }, + { + "epoch": 0.936524150007296, + "grad_norm": 0.961983803468379, + "learning_rate": 5.493908822719214e-06, + "loss": 0.5093, + "step": 32090 + }, + { + "epoch": 0.9366700715015321, + "grad_norm": 1.104508324182209, + "learning_rate": 5.491648884692663e-06, + "loss": 0.5786, + "step": 32095 + }, + { + "epoch": 0.9368159929957682, + "grad_norm": 0.9654443278620762, + "learning_rate": 5.489394071721714e-06, + "loss": 0.5342, + "step": 32100 + }, + { + "epoch": 0.9369619144900044, + "grad_norm": 1.0744947223939199, + "learning_rate": 5.487144384331442e-06, + "loss": 0.5188, + "step": 32105 + }, + { + "epoch": 0.9371078359842405, + "grad_norm": 1.1634668821257546, + "learning_rate": 5.484899823045731e-06, + "loss": 0.5173, + "step": 32110 + }, + { + "epoch": 0.9372537574784766, + "grad_norm": 0.9303682489852825, + "learning_rate": 5.482660388387265e-06, + "loss": 0.5217, + "step": 32115 + }, + { + "epoch": 0.9373996789727127, + "grad_norm": 0.8910344252256887, + "learning_rate": 5.4804260808775454e-06, + "loss": 0.5511, + "step": 32120 + }, + { + "epoch": 0.9375456004669488, + "grad_norm": 0.9723850296906433, + "learning_rate": 5.478196901036867e-06, + "loss": 0.5001, + "step": 32125 + }, + { + "epoch": 0.9376915219611849, + "grad_norm": 0.9962596224624716, + "learning_rate": 5.475972849384338e-06, + "loss": 0.5553, + "step": 32130 + }, + { + "epoch": 0.937837443455421, + "grad_norm": 1.0634195109270448, + "learning_rate": 5.473753926437866e-06, + "loss": 0.5193, + "step": 32135 + }, + { + "epoch": 0.9379833649496571, + "grad_norm": 0.8449855096674338, + "learning_rate": 5.471540132714176e-06, + "loss": 0.4895, + "step": 32140 + }, + { + "epoch": 0.9381292864438932, + "grad_norm": 0.9397992786899602, + "learning_rate": 5.469331468728788e-06, + "loss": 0.4916, + "step": 32145 + }, + { + "epoch": 0.9382752079381292, + "grad_norm": 0.9235692598768837, + "learning_rate": 5.46712793499603e-06, + "loss": 0.6033, + "step": 32150 + }, + { + "epoch": 0.9384211294323654, + "grad_norm": 1.026363759239074, + "learning_rate": 5.464929532029033e-06, + "loss": 0.5888, + "step": 32155 + }, + { + "epoch": 0.9385670509266015, + "grad_norm": 0.989640622068954, + "learning_rate": 5.46273626033974e-06, + "loss": 0.5364, + "step": 32160 + }, + { + "epoch": 0.9387129724208376, + "grad_norm": 1.021372727540997, + "learning_rate": 5.460548120438898e-06, + "loss": 0.5486, + "step": 32165 + }, + { + "epoch": 0.9388588939150737, + "grad_norm": 0.9049922969362844, + "learning_rate": 5.458365112836051e-06, + "loss": 0.508, + "step": 32170 + }, + { + "epoch": 0.9390048154093098, + "grad_norm": 0.9961778348984796, + "learning_rate": 5.456187238039556e-06, + "loss": 0.5453, + "step": 32175 + }, + { + "epoch": 0.9391507369035459, + "grad_norm": 1.0193700775334373, + "learning_rate": 5.45401449655657e-06, + "loss": 0.5238, + "step": 32180 + }, + { + "epoch": 0.939296658397782, + "grad_norm": 0.9176568059248664, + "learning_rate": 5.451846888893054e-06, + "loss": 0.4589, + "step": 32185 + }, + { + "epoch": 0.9394425798920181, + "grad_norm": 1.0711435355519072, + "learning_rate": 5.449684415553783e-06, + "loss": 0.5262, + "step": 32190 + }, + { + "epoch": 0.9395885013862542, + "grad_norm": 1.0266778237076375, + "learning_rate": 5.447527077042325e-06, + "loss": 0.5264, + "step": 32195 + }, + { + "epoch": 0.9397344228804902, + "grad_norm": 1.0181382804539882, + "learning_rate": 5.445374873861056e-06, + "loss": 0.5762, + "step": 32200 + }, + { + "epoch": 0.9398803443747265, + "grad_norm": 1.0351835837230354, + "learning_rate": 5.443227806511157e-06, + "loss": 0.5357, + "step": 32205 + }, + { + "epoch": 0.9400262658689625, + "grad_norm": 1.0539822136537988, + "learning_rate": 5.4410858754926166e-06, + "loss": 0.5838, + "step": 32210 + }, + { + "epoch": 0.9401721873631986, + "grad_norm": 1.059967724648873, + "learning_rate": 5.438949081304219e-06, + "loss": 0.4786, + "step": 32215 + }, + { + "epoch": 0.9403181088574347, + "grad_norm": 1.093441054360934, + "learning_rate": 5.436817424443558e-06, + "loss": 0.5903, + "step": 32220 + }, + { + "epoch": 0.9404640303516708, + "grad_norm": 1.0062845438569128, + "learning_rate": 5.434690905407029e-06, + "loss": 0.5805, + "step": 32225 + }, + { + "epoch": 0.9406099518459069, + "grad_norm": 0.9561755804812622, + "learning_rate": 5.4325695246898315e-06, + "loss": 0.5293, + "step": 32230 + }, + { + "epoch": 0.940755873340143, + "grad_norm": 0.9427322986943857, + "learning_rate": 5.430453282785971e-06, + "loss": 0.5073, + "step": 32235 + }, + { + "epoch": 0.9409017948343791, + "grad_norm": 0.8729784735374143, + "learning_rate": 5.428342180188248e-06, + "loss": 0.4975, + "step": 32240 + }, + { + "epoch": 0.9410477163286152, + "grad_norm": 1.0720648333779812, + "learning_rate": 5.4262362173882835e-06, + "loss": 0.5735, + "step": 32245 + }, + { + "epoch": 0.9411936378228513, + "grad_norm": 0.969168330805528, + "learning_rate": 5.424135394876478e-06, + "loss": 0.5364, + "step": 32250 + }, + { + "epoch": 0.9413395593170875, + "grad_norm": 0.9505643008936041, + "learning_rate": 5.422039713142058e-06, + "loss": 0.4968, + "step": 32255 + }, + { + "epoch": 0.9414854808113235, + "grad_norm": 0.968023828653141, + "learning_rate": 5.419949172673032e-06, + "loss": 0.5219, + "step": 32260 + }, + { + "epoch": 0.9416314023055596, + "grad_norm": 1.0173988966125602, + "learning_rate": 5.417863773956228e-06, + "loss": 0.5272, + "step": 32265 + }, + { + "epoch": 0.9417773237997957, + "grad_norm": 1.0581380216305314, + "learning_rate": 5.415783517477268e-06, + "loss": 0.4939, + "step": 32270 + }, + { + "epoch": 0.9419232452940318, + "grad_norm": 1.1092085771557247, + "learning_rate": 5.413708403720577e-06, + "loss": 0.5666, + "step": 32275 + }, + { + "epoch": 0.9420691667882679, + "grad_norm": 1.1165806864666583, + "learning_rate": 5.411638433169389e-06, + "loss": 0.5455, + "step": 32280 + }, + { + "epoch": 0.942215088282504, + "grad_norm": 1.0678423743130478, + "learning_rate": 5.409573606305731e-06, + "loss": 0.482, + "step": 32285 + }, + { + "epoch": 0.9423610097767401, + "grad_norm": 0.9500444389606177, + "learning_rate": 5.407513923610437e-06, + "loss": 0.5205, + "step": 32290 + }, + { + "epoch": 0.9425069312709762, + "grad_norm": 1.036409361618088, + "learning_rate": 5.405459385563143e-06, + "loss": 0.5134, + "step": 32295 + }, + { + "epoch": 0.9426528527652123, + "grad_norm": 0.9331912339013145, + "learning_rate": 5.4034099926422835e-06, + "loss": 0.5074, + "step": 32300 + }, + { + "epoch": 0.9427987742594485, + "grad_norm": 0.9503658750409534, + "learning_rate": 5.401365745325106e-06, + "loss": 0.5562, + "step": 32305 + }, + { + "epoch": 0.9429446957536846, + "grad_norm": 0.9821297261699153, + "learning_rate": 5.399326644087645e-06, + "loss": 0.5508, + "step": 32310 + }, + { + "epoch": 0.9430906172479206, + "grad_norm": 0.9258730356943676, + "learning_rate": 5.397292689404746e-06, + "loss": 0.4497, + "step": 32315 + }, + { + "epoch": 0.9432365387421567, + "grad_norm": 1.0341894543120496, + "learning_rate": 5.3952638817500515e-06, + "loss": 0.5157, + "step": 32320 + }, + { + "epoch": 0.9433824602363928, + "grad_norm": 0.8866010774437126, + "learning_rate": 5.3932402215960075e-06, + "loss": 0.4654, + "step": 32325 + }, + { + "epoch": 0.9435283817306289, + "grad_norm": 1.030424889629682, + "learning_rate": 5.391221709413862e-06, + "loss": 0.522, + "step": 32330 + }, + { + "epoch": 0.943674303224865, + "grad_norm": 1.1016410270209107, + "learning_rate": 5.3892083456736615e-06, + "loss": 0.5656, + "step": 32335 + }, + { + "epoch": 0.9438202247191011, + "grad_norm": 0.9657615339305116, + "learning_rate": 5.387200130844262e-06, + "loss": 0.5648, + "step": 32340 + }, + { + "epoch": 0.9439661462133372, + "grad_norm": 1.0085048621724384, + "learning_rate": 5.385197065393303e-06, + "loss": 0.4893, + "step": 32345 + }, + { + "epoch": 0.9441120677075733, + "grad_norm": 0.9533405179225334, + "learning_rate": 5.383199149787245e-06, + "loss": 0.4724, + "step": 32350 + }, + { + "epoch": 0.9442579892018095, + "grad_norm": 1.1375491503550434, + "learning_rate": 5.3812063844913356e-06, + "loss": 0.5123, + "step": 32355 + }, + { + "epoch": 0.9444039106960456, + "grad_norm": 1.1568315364097925, + "learning_rate": 5.379218769969628e-06, + "loss": 0.4994, + "step": 32360 + }, + { + "epoch": 0.9445498321902817, + "grad_norm": 1.031978122242187, + "learning_rate": 5.377236306684977e-06, + "loss": 0.5083, + "step": 32365 + }, + { + "epoch": 0.9446957536845177, + "grad_norm": 0.9164732549671073, + "learning_rate": 5.375258995099031e-06, + "loss": 0.4753, + "step": 32370 + }, + { + "epoch": 0.9448416751787538, + "grad_norm": 1.095053920064882, + "learning_rate": 5.373286835672251e-06, + "loss": 0.5164, + "step": 32375 + }, + { + "epoch": 0.9449875966729899, + "grad_norm": 1.0993939498070473, + "learning_rate": 5.371319828863883e-06, + "loss": 0.5911, + "step": 32380 + }, + { + "epoch": 0.945133518167226, + "grad_norm": 1.0714038720499457, + "learning_rate": 5.369357975131992e-06, + "loss": 0.5227, + "step": 32385 + }, + { + "epoch": 0.9452794396614621, + "grad_norm": 1.0515651481500352, + "learning_rate": 5.367401274933423e-06, + "loss": 0.5194, + "step": 32390 + }, + { + "epoch": 0.9454253611556982, + "grad_norm": 1.0114511510710738, + "learning_rate": 5.365449728723835e-06, + "loss": 0.5478, + "step": 32395 + }, + { + "epoch": 0.9455712826499343, + "grad_norm": 1.0717098404795151, + "learning_rate": 5.363503336957679e-06, + "loss": 0.4959, + "step": 32400 + }, + { + "epoch": 0.9457172041441705, + "grad_norm": 1.0217572457427606, + "learning_rate": 5.36156210008821e-06, + "loss": 0.5399, + "step": 32405 + }, + { + "epoch": 0.9458631256384066, + "grad_norm": 1.048430174293286, + "learning_rate": 5.359626018567484e-06, + "loss": 0.5221, + "step": 32410 + }, + { + "epoch": 0.9460090471326427, + "grad_norm": 1.1518842693964366, + "learning_rate": 5.35769509284635e-06, + "loss": 0.5202, + "step": 32415 + }, + { + "epoch": 0.9461549686268788, + "grad_norm": 0.9690391741965505, + "learning_rate": 5.355769323374462e-06, + "loss": 0.5099, + "step": 32420 + }, + { + "epoch": 0.9463008901211148, + "grad_norm": 1.1001506832752115, + "learning_rate": 5.353848710600266e-06, + "loss": 0.5197, + "step": 32425 + }, + { + "epoch": 0.9464468116153509, + "grad_norm": 0.9128926299634978, + "learning_rate": 5.351933254971022e-06, + "loss": 0.5783, + "step": 32430 + }, + { + "epoch": 0.946592733109587, + "grad_norm": 1.0067123464497771, + "learning_rate": 5.3500229569327775e-06, + "loss": 0.5875, + "step": 32435 + }, + { + "epoch": 0.9467386546038231, + "grad_norm": 1.020425317530466, + "learning_rate": 5.348117816930374e-06, + "loss": 0.5047, + "step": 32440 + }, + { + "epoch": 0.9468845760980592, + "grad_norm": 0.9901664790546404, + "learning_rate": 5.346217835407464e-06, + "loss": 0.5509, + "step": 32445 + }, + { + "epoch": 0.9470304975922953, + "grad_norm": 0.9363910630107029, + "learning_rate": 5.344323012806491e-06, + "loss": 0.4902, + "step": 32450 + }, + { + "epoch": 0.9471764190865315, + "grad_norm": 0.9268987380651581, + "learning_rate": 5.342433349568707e-06, + "loss": 0.4917, + "step": 32455 + }, + { + "epoch": 0.9473223405807676, + "grad_norm": 1.0074519871226457, + "learning_rate": 5.340548846134146e-06, + "loss": 0.5126, + "step": 32460 + }, + { + "epoch": 0.9474682620750037, + "grad_norm": 1.0024981816391079, + "learning_rate": 5.338669502941655e-06, + "loss": 0.5051, + "step": 32465 + }, + { + "epoch": 0.9476141835692398, + "grad_norm": 1.0811704900680763, + "learning_rate": 5.336795320428876e-06, + "loss": 0.5108, + "step": 32470 + }, + { + "epoch": 0.9477601050634759, + "grad_norm": 0.9178559495477222, + "learning_rate": 5.334926299032241e-06, + "loss": 0.4762, + "step": 32475 + }, + { + "epoch": 0.947906026557712, + "grad_norm": 1.2133271272432984, + "learning_rate": 5.333062439186995e-06, + "loss": 0.5634, + "step": 32480 + }, + { + "epoch": 0.948051948051948, + "grad_norm": 1.0319536826568692, + "learning_rate": 5.331203741327163e-06, + "loss": 0.5916, + "step": 32485 + }, + { + "epoch": 0.9481978695461841, + "grad_norm": 1.0526814990908873, + "learning_rate": 5.329350205885583e-06, + "loss": 0.5443, + "step": 32490 + }, + { + "epoch": 0.9483437910404202, + "grad_norm": 0.9725696915607186, + "learning_rate": 5.3275018332938844e-06, + "loss": 0.5224, + "step": 32495 + }, + { + "epoch": 0.9484897125346563, + "grad_norm": 1.0901222653279823, + "learning_rate": 5.3256586239824964e-06, + "loss": 0.5006, + "step": 32500 + }, + { + "epoch": 0.9486356340288925, + "grad_norm": 0.8570398912908441, + "learning_rate": 5.323820578380645e-06, + "loss": 0.5193, + "step": 32505 + }, + { + "epoch": 0.9487815555231286, + "grad_norm": 1.0173662891091206, + "learning_rate": 5.321987696916352e-06, + "loss": 0.5386, + "step": 32510 + }, + { + "epoch": 0.9489274770173647, + "grad_norm": 0.9657507906669571, + "learning_rate": 5.320159980016438e-06, + "loss": 0.4992, + "step": 32515 + }, + { + "epoch": 0.9490733985116008, + "grad_norm": 0.9361403786008836, + "learning_rate": 5.31833742810652e-06, + "loss": 0.5024, + "step": 32520 + }, + { + "epoch": 0.9492193200058369, + "grad_norm": 1.034992469883588, + "learning_rate": 5.316520041611018e-06, + "loss": 0.5307, + "step": 32525 + }, + { + "epoch": 0.949365241500073, + "grad_norm": 0.9389121182420452, + "learning_rate": 5.314707820953137e-06, + "loss": 0.4787, + "step": 32530 + }, + { + "epoch": 0.949511162994309, + "grad_norm": 1.0691424888725687, + "learning_rate": 5.312900766554895e-06, + "loss": 0.5613, + "step": 32535 + }, + { + "epoch": 0.9496570844885451, + "grad_norm": 0.8533700947589191, + "learning_rate": 5.311098878837091e-06, + "loss": 0.4551, + "step": 32540 + }, + { + "epoch": 0.9498030059827812, + "grad_norm": 1.0216651568801873, + "learning_rate": 5.3093021582193296e-06, + "loss": 0.5606, + "step": 32545 + }, + { + "epoch": 0.9499489274770173, + "grad_norm": 0.9944316795042544, + "learning_rate": 5.307510605120014e-06, + "loss": 0.5108, + "step": 32550 + }, + { + "epoch": 0.9500948489712535, + "grad_norm": 1.001027602886065, + "learning_rate": 5.305724219956338e-06, + "loss": 0.5028, + "step": 32555 + }, + { + "epoch": 0.9502407704654896, + "grad_norm": 0.9413797617653485, + "learning_rate": 5.303943003144296e-06, + "loss": 0.5269, + "step": 32560 + }, + { + "epoch": 0.9503866919597257, + "grad_norm": 1.0480738113292865, + "learning_rate": 5.302166955098676e-06, + "loss": 0.5623, + "step": 32565 + }, + { + "epoch": 0.9505326134539618, + "grad_norm": 1.0271440494518247, + "learning_rate": 5.300396076233069e-06, + "loss": 0.533, + "step": 32570 + }, + { + "epoch": 0.9506785349481979, + "grad_norm": 1.1121075776593368, + "learning_rate": 5.29863036695985e-06, + "loss": 0.526, + "step": 32575 + }, + { + "epoch": 0.950824456442434, + "grad_norm": 1.0256968168566702, + "learning_rate": 5.296869827690204e-06, + "loss": 0.5257, + "step": 32580 + }, + { + "epoch": 0.95097037793667, + "grad_norm": 1.0409042006338227, + "learning_rate": 5.2951144588341005e-06, + "loss": 0.5052, + "step": 32585 + }, + { + "epoch": 0.9511162994309061, + "grad_norm": 0.912893867936593, + "learning_rate": 5.293364260800312e-06, + "loss": 0.5164, + "step": 32590 + }, + { + "epoch": 0.9512622209251422, + "grad_norm": 1.0098995168495661, + "learning_rate": 5.291619233996403e-06, + "loss": 0.5931, + "step": 32595 + }, + { + "epoch": 0.9514081424193783, + "grad_norm": 1.040839479143544, + "learning_rate": 5.289879378828741e-06, + "loss": 0.5649, + "step": 32600 + }, + { + "epoch": 0.9515540639136145, + "grad_norm": 1.0551045640183863, + "learning_rate": 5.288144695702476e-06, + "loss": 0.5477, + "step": 32605 + }, + { + "epoch": 0.9516999854078506, + "grad_norm": 0.934896284795098, + "learning_rate": 5.286415185021568e-06, + "loss": 0.4793, + "step": 32610 + }, + { + "epoch": 0.9518459069020867, + "grad_norm": 0.9506652452349018, + "learning_rate": 5.28469084718876e-06, + "loss": 0.513, + "step": 32615 + }, + { + "epoch": 0.9519918283963228, + "grad_norm": 1.139952722775066, + "learning_rate": 5.2829716826056045e-06, + "loss": 0.5275, + "step": 32620 + }, + { + "epoch": 0.9521377498905589, + "grad_norm": 1.035508919818366, + "learning_rate": 5.281257691672431e-06, + "loss": 0.5248, + "step": 32625 + }, + { + "epoch": 0.952283671384795, + "grad_norm": 1.101384977301379, + "learning_rate": 5.279548874788382e-06, + "loss": 0.5746, + "step": 32630 + }, + { + "epoch": 0.9524295928790311, + "grad_norm": 1.1202838667513801, + "learning_rate": 5.277845232351384e-06, + "loss": 0.5734, + "step": 32635 + }, + { + "epoch": 0.9525755143732672, + "grad_norm": 0.9949726020492728, + "learning_rate": 5.276146764758161e-06, + "loss": 0.5642, + "step": 32640 + }, + { + "epoch": 0.9527214358675032, + "grad_norm": 0.9575314480512831, + "learning_rate": 5.2744534724042395e-06, + "loss": 0.4667, + "step": 32645 + }, + { + "epoch": 0.9528673573617393, + "grad_norm": 1.0148875212566582, + "learning_rate": 5.272765355683926e-06, + "loss": 0.5618, + "step": 32650 + }, + { + "epoch": 0.9530132788559755, + "grad_norm": 0.99515839850543, + "learning_rate": 5.271082414990332e-06, + "loss": 0.5293, + "step": 32655 + }, + { + "epoch": 0.9531592003502116, + "grad_norm": 0.9904444532458596, + "learning_rate": 5.269404650715365e-06, + "loss": 0.5821, + "step": 32660 + }, + { + "epoch": 0.9533051218444477, + "grad_norm": 1.1847514427465977, + "learning_rate": 5.267732063249721e-06, + "loss": 0.5702, + "step": 32665 + }, + { + "epoch": 0.9534510433386838, + "grad_norm": 1.0141353687465244, + "learning_rate": 5.266064652982895e-06, + "loss": 0.4969, + "step": 32670 + }, + { + "epoch": 0.9535969648329199, + "grad_norm": 0.9100795609414316, + "learning_rate": 5.2644024203031716e-06, + "loss": 0.5364, + "step": 32675 + }, + { + "epoch": 0.953742886327156, + "grad_norm": 0.9902050761953441, + "learning_rate": 5.262745365597634e-06, + "loss": 0.5303, + "step": 32680 + }, + { + "epoch": 0.9538888078213921, + "grad_norm": 0.9428925797883027, + "learning_rate": 5.261093489252162e-06, + "loss": 0.4707, + "step": 32685 + }, + { + "epoch": 0.9540347293156282, + "grad_norm": 0.9522013198823092, + "learning_rate": 5.259446791651422e-06, + "loss": 0.5141, + "step": 32690 + }, + { + "epoch": 0.9541806508098643, + "grad_norm": 1.0495746841068556, + "learning_rate": 5.257805273178879e-06, + "loss": 0.5102, + "step": 32695 + }, + { + "epoch": 0.9543265723041003, + "grad_norm": 1.1007383179722245, + "learning_rate": 5.256168934216791e-06, + "loss": 0.5405, + "step": 32700 + }, + { + "epoch": 0.9544724937983365, + "grad_norm": 0.9679425435095155, + "learning_rate": 5.254537775146213e-06, + "loss": 0.4906, + "step": 32705 + }, + { + "epoch": 0.9546184152925726, + "grad_norm": 0.976114342532864, + "learning_rate": 5.2529117963469855e-06, + "loss": 0.5285, + "step": 32710 + }, + { + "epoch": 0.9547643367868087, + "grad_norm": 0.9719846692548411, + "learning_rate": 5.251290998197752e-06, + "loss": 0.489, + "step": 32715 + }, + { + "epoch": 0.9549102582810448, + "grad_norm": 0.9713635488915366, + "learning_rate": 5.249675381075945e-06, + "loss": 0.5648, + "step": 32720 + }, + { + "epoch": 0.9550561797752809, + "grad_norm": 0.9314669581398894, + "learning_rate": 5.24806494535779e-06, + "loss": 0.5166, + "step": 32725 + }, + { + "epoch": 0.955202101269517, + "grad_norm": 1.0382269272043272, + "learning_rate": 5.2464596914183065e-06, + "loss": 0.5181, + "step": 32730 + }, + { + "epoch": 0.9553480227637531, + "grad_norm": 0.9144298973759393, + "learning_rate": 5.244859619631312e-06, + "loss": 0.4862, + "step": 32735 + }, + { + "epoch": 0.9554939442579892, + "grad_norm": 1.016100758082206, + "learning_rate": 5.24326473036941e-06, + "loss": 0.5064, + "step": 32740 + }, + { + "epoch": 0.9556398657522253, + "grad_norm": 0.9166681959333947, + "learning_rate": 5.241675024003999e-06, + "loss": 0.4981, + "step": 32745 + }, + { + "epoch": 0.9557857872464613, + "grad_norm": 1.1089264006598776, + "learning_rate": 5.240090500905271e-06, + "loss": 0.5701, + "step": 32750 + }, + { + "epoch": 0.9559317087406975, + "grad_norm": 0.8845834130119041, + "learning_rate": 5.238511161442217e-06, + "loss": 0.5381, + "step": 32755 + }, + { + "epoch": 0.9560776302349336, + "grad_norm": 1.0760980263918716, + "learning_rate": 5.236937005982612e-06, + "loss": 0.5033, + "step": 32760 + }, + { + "epoch": 0.9562235517291697, + "grad_norm": 0.93997371920883, + "learning_rate": 5.23536803489303e-06, + "loss": 0.4803, + "step": 32765 + }, + { + "epoch": 0.9563694732234058, + "grad_norm": 1.114164484132752, + "learning_rate": 5.2338042485388315e-06, + "loss": 0.5201, + "step": 32770 + }, + { + "epoch": 0.9565153947176419, + "grad_norm": 1.0438821301319572, + "learning_rate": 5.232245647284176e-06, + "loss": 0.5296, + "step": 32775 + }, + { + "epoch": 0.956661316211878, + "grad_norm": 0.9643308512194376, + "learning_rate": 5.230692231492011e-06, + "loss": 0.5093, + "step": 32780 + }, + { + "epoch": 0.9568072377061141, + "grad_norm": 0.9827159002137831, + "learning_rate": 5.229144001524079e-06, + "loss": 0.5094, + "step": 32785 + }, + { + "epoch": 0.9569531592003502, + "grad_norm": 0.9540932215899559, + "learning_rate": 5.227600957740917e-06, + "loss": 0.5222, + "step": 32790 + }, + { + "epoch": 0.9570990806945863, + "grad_norm": 1.0394539900180224, + "learning_rate": 5.226063100501851e-06, + "loss": 0.5111, + "step": 32795 + }, + { + "epoch": 0.9572450021888224, + "grad_norm": 0.962093786861143, + "learning_rate": 5.224530430164993e-06, + "loss": 0.546, + "step": 32800 + }, + { + "epoch": 0.9573909236830586, + "grad_norm": 0.9440000777362706, + "learning_rate": 5.223002947087262e-06, + "loss": 0.5269, + "step": 32805 + }, + { + "epoch": 0.9575368451772946, + "grad_norm": 1.0633015198672835, + "learning_rate": 5.221480651624359e-06, + "loss": 0.5397, + "step": 32810 + }, + { + "epoch": 0.9576827666715307, + "grad_norm": 0.9617027669553335, + "learning_rate": 5.2199635441307775e-06, + "loss": 0.5303, + "step": 32815 + }, + { + "epoch": 0.9578286881657668, + "grad_norm": 0.8119501727917513, + "learning_rate": 5.218451624959804e-06, + "loss": 0.4668, + "step": 32820 + }, + { + "epoch": 0.9579746096600029, + "grad_norm": 0.8831345798253704, + "learning_rate": 5.216944894463519e-06, + "loss": 0.4946, + "step": 32825 + }, + { + "epoch": 0.958120531154239, + "grad_norm": 1.0502557724917378, + "learning_rate": 5.215443352992791e-06, + "loss": 0.5613, + "step": 32830 + }, + { + "epoch": 0.9582664526484751, + "grad_norm": 0.8841140001128457, + "learning_rate": 5.213947000897282e-06, + "loss": 0.4991, + "step": 32835 + }, + { + "epoch": 0.9584123741427112, + "grad_norm": 0.858299719075058, + "learning_rate": 5.212455838525449e-06, + "loss": 0.5047, + "step": 32840 + }, + { + "epoch": 0.9585582956369473, + "grad_norm": 0.9821504706577947, + "learning_rate": 5.210969866224531e-06, + "loss": 0.5616, + "step": 32845 + }, + { + "epoch": 0.9587042171311835, + "grad_norm": 1.2101543767330076, + "learning_rate": 5.2094890843405696e-06, + "loss": 0.6015, + "step": 32850 + }, + { + "epoch": 0.9588501386254196, + "grad_norm": 0.9942147599361678, + "learning_rate": 5.208013493218389e-06, + "loss": 0.5285, + "step": 32855 + }, + { + "epoch": 0.9589960601196557, + "grad_norm": 1.3218168419484149, + "learning_rate": 5.206543093201611e-06, + "loss": 0.5893, + "step": 32860 + }, + { + "epoch": 0.9591419816138917, + "grad_norm": 0.8421553959586671, + "learning_rate": 5.205077884632646e-06, + "loss": 0.4549, + "step": 32865 + }, + { + "epoch": 0.9592879031081278, + "grad_norm": 0.919440616794099, + "learning_rate": 5.2036178678526904e-06, + "loss": 0.4463, + "step": 32870 + }, + { + "epoch": 0.9594338246023639, + "grad_norm": 0.9696298943745347, + "learning_rate": 5.202163043201743e-06, + "loss": 0.4907, + "step": 32875 + }, + { + "epoch": 0.9595797460966, + "grad_norm": 0.8747653288765233, + "learning_rate": 5.200713411018582e-06, + "loss": 0.4758, + "step": 32880 + }, + { + "epoch": 0.9597256675908361, + "grad_norm": 1.052103595342119, + "learning_rate": 5.199268971640783e-06, + "loss": 0.5292, + "step": 32885 + }, + { + "epoch": 0.9598715890850722, + "grad_norm": 0.9555986822809132, + "learning_rate": 5.1978297254047095e-06, + "loss": 0.5132, + "step": 32890 + }, + { + "epoch": 0.9600175105793083, + "grad_norm": 1.116101319679959, + "learning_rate": 5.196395672645518e-06, + "loss": 0.5324, + "step": 32895 + }, + { + "epoch": 0.9601634320735445, + "grad_norm": 0.8891671994728823, + "learning_rate": 5.1949668136971556e-06, + "loss": 0.5196, + "step": 32900 + }, + { + "epoch": 0.9603093535677806, + "grad_norm": 0.8898822687487035, + "learning_rate": 5.193543148892358e-06, + "loss": 0.5461, + "step": 32905 + }, + { + "epoch": 0.9604552750620167, + "grad_norm": 0.9721937975495203, + "learning_rate": 5.192124678562651e-06, + "loss": 0.5088, + "step": 32910 + }, + { + "epoch": 0.9606011965562528, + "grad_norm": 1.0005016735438703, + "learning_rate": 5.190711403038352e-06, + "loss": 0.4949, + "step": 32915 + }, + { + "epoch": 0.9607471180504888, + "grad_norm": 1.0446026730281885, + "learning_rate": 5.18930332264857e-06, + "loss": 0.5171, + "step": 32920 + }, + { + "epoch": 0.9608930395447249, + "grad_norm": 1.0611777673683729, + "learning_rate": 5.187900437721199e-06, + "loss": 0.484, + "step": 32925 + }, + { + "epoch": 0.961038961038961, + "grad_norm": 0.9666390137632779, + "learning_rate": 5.1865027485829345e-06, + "loss": 0.5161, + "step": 32930 + }, + { + "epoch": 0.9611848825331971, + "grad_norm": 0.9420953812880385, + "learning_rate": 5.185110255559249e-06, + "loss": 0.5186, + "step": 32935 + }, + { + "epoch": 0.9613308040274332, + "grad_norm": 1.0845116014942744, + "learning_rate": 5.1837229589744086e-06, + "loss": 0.5055, + "step": 32940 + }, + { + "epoch": 0.9614767255216693, + "grad_norm": 1.004766470078887, + "learning_rate": 5.182340859151478e-06, + "loss": 0.5698, + "step": 32945 + }, + { + "epoch": 0.9616226470159055, + "grad_norm": 0.9818628144336805, + "learning_rate": 5.180963956412298e-06, + "loss": 0.4901, + "step": 32950 + }, + { + "epoch": 0.9617685685101416, + "grad_norm": 1.0234294705338192, + "learning_rate": 5.179592251077512e-06, + "loss": 0.5834, + "step": 32955 + }, + { + "epoch": 0.9619144900043777, + "grad_norm": 0.9008938291871342, + "learning_rate": 5.178225743466542e-06, + "loss": 0.5034, + "step": 32960 + }, + { + "epoch": 0.9620604114986138, + "grad_norm": 1.1751660764035112, + "learning_rate": 5.17686443389761e-06, + "loss": 0.5235, + "step": 32965 + }, + { + "epoch": 0.9622063329928499, + "grad_norm": 0.9187648440525964, + "learning_rate": 5.175508322687718e-06, + "loss": 0.5371, + "step": 32970 + }, + { + "epoch": 0.962352254487086, + "grad_norm": 0.9558112646720623, + "learning_rate": 5.174157410152661e-06, + "loss": 0.5556, + "step": 32975 + }, + { + "epoch": 0.962498175981322, + "grad_norm": 1.0042514547229604, + "learning_rate": 5.172811696607031e-06, + "loss": 0.4797, + "step": 32980 + }, + { + "epoch": 0.9626440974755581, + "grad_norm": 0.9847438386513686, + "learning_rate": 5.171471182364195e-06, + "loss": 0.5639, + "step": 32985 + }, + { + "epoch": 0.9627900189697942, + "grad_norm": 0.9932990069174642, + "learning_rate": 5.170135867736322e-06, + "loss": 0.4755, + "step": 32990 + }, + { + "epoch": 0.9629359404640303, + "grad_norm": 1.0292678148982501, + "learning_rate": 5.168805753034359e-06, + "loss": 0.5335, + "step": 32995 + }, + { + "epoch": 0.9630818619582665, + "grad_norm": 0.959641399038719, + "learning_rate": 5.167480838568058e-06, + "loss": 0.5271, + "step": 33000 + }, + { + "epoch": 0.9632277834525026, + "grad_norm": 0.9690860574424337, + "learning_rate": 5.1661611246459395e-06, + "loss": 0.5601, + "step": 33005 + }, + { + "epoch": 0.9633737049467387, + "grad_norm": 0.9848156239347772, + "learning_rate": 5.164846611575328e-06, + "loss": 0.5075, + "step": 33010 + }, + { + "epoch": 0.9635196264409748, + "grad_norm": 1.0002301548731347, + "learning_rate": 5.163537299662336e-06, + "loss": 0.4866, + "step": 33015 + }, + { + "epoch": 0.9636655479352109, + "grad_norm": 1.2093082617707185, + "learning_rate": 5.162233189211853e-06, + "loss": 0.5509, + "step": 33020 + }, + { + "epoch": 0.963811469429447, + "grad_norm": 0.9514686470490332, + "learning_rate": 5.160934280527574e-06, + "loss": 0.5274, + "step": 33025 + }, + { + "epoch": 0.963957390923683, + "grad_norm": 1.0041336624940398, + "learning_rate": 5.159640573911969e-06, + "loss": 0.5364, + "step": 33030 + }, + { + "epoch": 0.9641033124179191, + "grad_norm": 1.091454286976059, + "learning_rate": 5.158352069666304e-06, + "loss": 0.5062, + "step": 33035 + }, + { + "epoch": 0.9642492339121552, + "grad_norm": 1.088978983775512, + "learning_rate": 5.1570687680906295e-06, + "loss": 0.5349, + "step": 33040 + }, + { + "epoch": 0.9643951554063913, + "grad_norm": 0.9832650522237123, + "learning_rate": 5.155790669483787e-06, + "loss": 0.5017, + "step": 33045 + }, + { + "epoch": 0.9645410769006275, + "grad_norm": 0.9790684900733043, + "learning_rate": 5.1545177741434084e-06, + "loss": 0.526, + "step": 33050 + }, + { + "epoch": 0.9646869983948636, + "grad_norm": 1.057200653760597, + "learning_rate": 5.153250082365904e-06, + "loss": 0.5167, + "step": 33055 + }, + { + "epoch": 0.9648329198890997, + "grad_norm": 0.940885123767378, + "learning_rate": 5.151987594446485e-06, + "loss": 0.5414, + "step": 33060 + }, + { + "epoch": 0.9649788413833358, + "grad_norm": 1.191675067518615, + "learning_rate": 5.150730310679145e-06, + "loss": 0.4929, + "step": 33065 + }, + { + "epoch": 0.9651247628775719, + "grad_norm": 1.003718608937224, + "learning_rate": 5.149478231356668e-06, + "loss": 0.5561, + "step": 33070 + }, + { + "epoch": 0.965270684371808, + "grad_norm": 0.82691596437521, + "learning_rate": 5.148231356770617e-06, + "loss": 0.507, + "step": 33075 + }, + { + "epoch": 0.965416605866044, + "grad_norm": 1.0216014318969204, + "learning_rate": 5.146989687211356e-06, + "loss": 0.5369, + "step": 33080 + }, + { + "epoch": 0.9655625273602801, + "grad_norm": 1.054176341278807, + "learning_rate": 5.1457532229680315e-06, + "loss": 0.5557, + "step": 33085 + }, + { + "epoch": 0.9657084488545162, + "grad_norm": 0.9851936942970125, + "learning_rate": 5.144521964328571e-06, + "loss": 0.4779, + "step": 33090 + }, + { + "epoch": 0.9658543703487523, + "grad_norm": 0.9838057145646232, + "learning_rate": 5.143295911579706e-06, + "loss": 0.4917, + "step": 33095 + }, + { + "epoch": 0.9660002918429885, + "grad_norm": 1.0725837704920778, + "learning_rate": 5.142075065006935e-06, + "loss": 0.5353, + "step": 33100 + }, + { + "epoch": 0.9661462133372246, + "grad_norm": 0.9648922133667956, + "learning_rate": 5.140859424894561e-06, + "loss": 0.5435, + "step": 33105 + }, + { + "epoch": 0.9662921348314607, + "grad_norm": 1.0604852329179624, + "learning_rate": 5.139648991525667e-06, + "loss": 0.5639, + "step": 33110 + }, + { + "epoch": 0.9664380563256968, + "grad_norm": 1.190154121547011, + "learning_rate": 5.1384437651821265e-06, + "loss": 0.5585, + "step": 33115 + }, + { + "epoch": 0.9665839778199329, + "grad_norm": 0.9956171957700265, + "learning_rate": 5.137243746144599e-06, + "loss": 0.535, + "step": 33120 + }, + { + "epoch": 0.966729899314169, + "grad_norm": 0.8802547214672635, + "learning_rate": 5.136048934692528e-06, + "loss": 0.5361, + "step": 33125 + }, + { + "epoch": 0.9668758208084051, + "grad_norm": 1.0011791380197759, + "learning_rate": 5.134859331104151e-06, + "loss": 0.5344, + "step": 33130 + }, + { + "epoch": 0.9670217423026412, + "grad_norm": 1.0765221212822242, + "learning_rate": 5.133674935656487e-06, + "loss": 0.556, + "step": 33135 + }, + { + "epoch": 0.9671676637968772, + "grad_norm": 1.319692278979502, + "learning_rate": 5.1324957486253464e-06, + "loss": 0.5475, + "step": 33140 + }, + { + "epoch": 0.9673135852911133, + "grad_norm": 1.0379433584240518, + "learning_rate": 5.131321770285326e-06, + "loss": 0.5471, + "step": 33145 + }, + { + "epoch": 0.9674595067853495, + "grad_norm": 1.0714375389036257, + "learning_rate": 5.130153000909807e-06, + "loss": 0.5159, + "step": 33150 + }, + { + "epoch": 0.9676054282795856, + "grad_norm": 1.1460201847533646, + "learning_rate": 5.128989440770958e-06, + "loss": 0.5708, + "step": 33155 + }, + { + "epoch": 0.9677513497738217, + "grad_norm": 0.9951473014589465, + "learning_rate": 5.127831090139736e-06, + "loss": 0.5317, + "step": 33160 + }, + { + "epoch": 0.9678972712680578, + "grad_norm": 1.0649119610585445, + "learning_rate": 5.12667794928589e-06, + "loss": 0.5425, + "step": 33165 + }, + { + "epoch": 0.9680431927622939, + "grad_norm": 0.9072497002866126, + "learning_rate": 5.125530018477939e-06, + "loss": 0.5484, + "step": 33170 + }, + { + "epoch": 0.96818911425653, + "grad_norm": 1.1678431382956997, + "learning_rate": 5.12438729798321e-06, + "loss": 0.5346, + "step": 33175 + }, + { + "epoch": 0.9683350357507661, + "grad_norm": 0.9897926605156816, + "learning_rate": 5.123249788067804e-06, + "loss": 0.5004, + "step": 33180 + }, + { + "epoch": 0.9684809572450022, + "grad_norm": 0.9735793420759921, + "learning_rate": 5.122117488996612e-06, + "loss": 0.5117, + "step": 33185 + }, + { + "epoch": 0.9686268787392383, + "grad_norm": 1.0087351247524305, + "learning_rate": 5.1209904010333096e-06, + "loss": 0.503, + "step": 33190 + }, + { + "epoch": 0.9687728002334743, + "grad_norm": 1.0583610493142284, + "learning_rate": 5.119868524440364e-06, + "loss": 0.5127, + "step": 33195 + }, + { + "epoch": 0.9689187217277105, + "grad_norm": 1.0238923474747301, + "learning_rate": 5.1187518594790174e-06, + "loss": 0.4973, + "step": 33200 + }, + { + "epoch": 0.9690646432219466, + "grad_norm": 0.8435041410971695, + "learning_rate": 5.11764040640931e-06, + "loss": 0.5011, + "step": 33205 + }, + { + "epoch": 0.9692105647161827, + "grad_norm": 1.0151820301918038, + "learning_rate": 5.116534165490069e-06, + "loss": 0.5011, + "step": 33210 + }, + { + "epoch": 0.9693564862104188, + "grad_norm": 0.9971972187325656, + "learning_rate": 5.1154331369788974e-06, + "loss": 0.4803, + "step": 33215 + }, + { + "epoch": 0.9695024077046549, + "grad_norm": 1.1223327247478896, + "learning_rate": 5.114337321132194e-06, + "loss": 0.5584, + "step": 33220 + }, + { + "epoch": 0.969648329198891, + "grad_norm": 1.0468772696652713, + "learning_rate": 5.113246718205137e-06, + "loss": 0.4865, + "step": 33225 + }, + { + "epoch": 0.9697942506931271, + "grad_norm": 1.1348237523854359, + "learning_rate": 5.1121613284516924e-06, + "loss": 0.4956, + "step": 33230 + }, + { + "epoch": 0.9699401721873632, + "grad_norm": 0.8658675113336013, + "learning_rate": 5.111081152124618e-06, + "loss": 0.5014, + "step": 33235 + }, + { + "epoch": 0.9700860936815993, + "grad_norm": 1.0762854808496933, + "learning_rate": 5.110006189475454e-06, + "loss": 0.5397, + "step": 33240 + }, + { + "epoch": 0.9702320151758353, + "grad_norm": 0.8423060453443036, + "learning_rate": 5.108936440754519e-06, + "loss": 0.4841, + "step": 33245 + }, + { + "epoch": 0.9703779366700716, + "grad_norm": 0.9461089986587222, + "learning_rate": 5.107871906210928e-06, + "loss": 0.5691, + "step": 33250 + }, + { + "epoch": 0.9705238581643076, + "grad_norm": 0.9278039398188089, + "learning_rate": 5.106812586092576e-06, + "loss": 0.523, + "step": 33255 + }, + { + "epoch": 0.9706697796585437, + "grad_norm": 1.012146005817263, + "learning_rate": 5.105758480646149e-06, + "loss": 0.5332, + "step": 33260 + }, + { + "epoch": 0.9708157011527798, + "grad_norm": 1.0517619488957122, + "learning_rate": 5.104709590117112e-06, + "loss": 0.566, + "step": 33265 + }, + { + "epoch": 0.9709616226470159, + "grad_norm": 1.0288930009520139, + "learning_rate": 5.103665914749721e-06, + "loss": 0.5468, + "step": 33270 + }, + { + "epoch": 0.971107544141252, + "grad_norm": 1.0933419406856637, + "learning_rate": 5.102627454787012e-06, + "loss": 0.4999, + "step": 33275 + }, + { + "epoch": 0.9712534656354881, + "grad_norm": 1.0138066205381293, + "learning_rate": 5.101594210470813e-06, + "loss": 0.5197, + "step": 33280 + }, + { + "epoch": 0.9713993871297242, + "grad_norm": 1.1901627962464099, + "learning_rate": 5.100566182041731e-06, + "loss": 0.5612, + "step": 33285 + }, + { + "epoch": 0.9715453086239603, + "grad_norm": 1.0175863314278761, + "learning_rate": 5.099543369739165e-06, + "loss": 0.5398, + "step": 33290 + }, + { + "epoch": 0.9716912301181964, + "grad_norm": 1.0861544032973653, + "learning_rate": 5.098525773801295e-06, + "loss": 0.5091, + "step": 33295 + }, + { + "epoch": 0.9718371516124326, + "grad_norm": 0.9290366814685095, + "learning_rate": 5.097513394465085e-06, + "loss": 0.5036, + "step": 33300 + }, + { + "epoch": 0.9719830731066686, + "grad_norm": 1.0448890955356152, + "learning_rate": 5.096506231966289e-06, + "loss": 0.581, + "step": 33305 + }, + { + "epoch": 0.9721289946009047, + "grad_norm": 0.9090545822960594, + "learning_rate": 5.0955042865394425e-06, + "loss": 0.4972, + "step": 33310 + }, + { + "epoch": 0.9722749160951408, + "grad_norm": 1.3530843560789703, + "learning_rate": 5.094507558417869e-06, + "loss": 0.58, + "step": 33315 + }, + { + "epoch": 0.9724208375893769, + "grad_norm": 0.9659651473516816, + "learning_rate": 5.093516047833673e-06, + "loss": 0.4957, + "step": 33320 + }, + { + "epoch": 0.972566759083613, + "grad_norm": 0.928527395145158, + "learning_rate": 5.092529755017747e-06, + "loss": 0.503, + "step": 33325 + }, + { + "epoch": 0.9727126805778491, + "grad_norm": 0.9342770201511649, + "learning_rate": 5.091548680199767e-06, + "loss": 0.4841, + "step": 33330 + }, + { + "epoch": 0.9728586020720852, + "grad_norm": 1.0330013272728686, + "learning_rate": 5.090572823608195e-06, + "loss": 0.5481, + "step": 33335 + }, + { + "epoch": 0.9730045235663213, + "grad_norm": 1.021005975163304, + "learning_rate": 5.089602185470278e-06, + "loss": 0.4873, + "step": 33340 + }, + { + "epoch": 0.9731504450605574, + "grad_norm": 0.8948410259376687, + "learning_rate": 5.088636766012046e-06, + "loss": 0.4945, + "step": 33345 + }, + { + "epoch": 0.9732963665547936, + "grad_norm": 0.8602093129756095, + "learning_rate": 5.0876765654583185e-06, + "loss": 0.4972, + "step": 33350 + }, + { + "epoch": 0.9734422880490297, + "grad_norm": 1.1249056725836115, + "learning_rate": 5.086721584032693e-06, + "loss": 0.5108, + "step": 33355 + }, + { + "epoch": 0.9735882095432657, + "grad_norm": 0.8970028911947296, + "learning_rate": 5.085771821957554e-06, + "loss": 0.5015, + "step": 33360 + }, + { + "epoch": 0.9737341310375018, + "grad_norm": 1.00764752847316, + "learning_rate": 5.084827279454072e-06, + "loss": 0.5445, + "step": 33365 + }, + { + "epoch": 0.9738800525317379, + "grad_norm": 1.0387737602810836, + "learning_rate": 5.083887956742202e-06, + "loss": 0.5585, + "step": 33370 + }, + { + "epoch": 0.974025974025974, + "grad_norm": 1.0749149699775726, + "learning_rate": 5.082953854040683e-06, + "loss": 0.6185, + "step": 33375 + }, + { + "epoch": 0.9741718955202101, + "grad_norm": 1.121355266355452, + "learning_rate": 5.08202497156704e-06, + "loss": 0.5013, + "step": 33380 + }, + { + "epoch": 0.9743178170144462, + "grad_norm": 0.993181963027802, + "learning_rate": 5.081101309537578e-06, + "loss": 0.4913, + "step": 33385 + }, + { + "epoch": 0.9744637385086823, + "grad_norm": 1.0152078631509958, + "learning_rate": 5.080182868167388e-06, + "loss": 0.495, + "step": 33390 + }, + { + "epoch": 0.9746096600029184, + "grad_norm": 0.9557924930629481, + "learning_rate": 5.079269647670351e-06, + "loss": 0.513, + "step": 33395 + }, + { + "epoch": 0.9747555814971546, + "grad_norm": 0.9643025103093668, + "learning_rate": 5.078361648259122e-06, + "loss": 0.5102, + "step": 33400 + }, + { + "epoch": 0.9749015029913907, + "grad_norm": 1.049116992295395, + "learning_rate": 5.077458870145146e-06, + "loss": 0.5245, + "step": 33405 + }, + { + "epoch": 0.9750474244856268, + "grad_norm": 1.0067790767085887, + "learning_rate": 5.076561313538658e-06, + "loss": 0.5375, + "step": 33410 + }, + { + "epoch": 0.9751933459798628, + "grad_norm": 1.0708950557925891, + "learning_rate": 5.075668978648664e-06, + "loss": 0.5352, + "step": 33415 + }, + { + "epoch": 0.9753392674740989, + "grad_norm": 0.987469132979006, + "learning_rate": 5.0747818656829625e-06, + "loss": 0.5341, + "step": 33420 + }, + { + "epoch": 0.975485188968335, + "grad_norm": 0.9560857706118973, + "learning_rate": 5.0738999748481366e-06, + "loss": 0.4944, + "step": 33425 + }, + { + "epoch": 0.9756311104625711, + "grad_norm": 0.9393343319458181, + "learning_rate": 5.073023306349552e-06, + "loss": 0.482, + "step": 33430 + }, + { + "epoch": 0.9757770319568072, + "grad_norm": 0.9749974002721633, + "learning_rate": 5.072151860391352e-06, + "loss": 0.5367, + "step": 33435 + }, + { + "epoch": 0.9759229534510433, + "grad_norm": 0.9803606779047043, + "learning_rate": 5.071285637176473e-06, + "loss": 0.5715, + "step": 33440 + }, + { + "epoch": 0.9760688749452794, + "grad_norm": 1.0588347798111502, + "learning_rate": 5.070424636906629e-06, + "loss": 0.5851, + "step": 33445 + }, + { + "epoch": 0.9762147964395156, + "grad_norm": 0.9797603512920966, + "learning_rate": 5.0695688597823225e-06, + "loss": 0.5652, + "step": 33450 + }, + { + "epoch": 0.9763607179337517, + "grad_norm": 0.9769923064909117, + "learning_rate": 5.068718306002837e-06, + "loss": 0.5213, + "step": 33455 + }, + { + "epoch": 0.9765066394279878, + "grad_norm": 1.09165998209017, + "learning_rate": 5.0678729757662354e-06, + "loss": 0.5766, + "step": 33460 + }, + { + "epoch": 0.9766525609222239, + "grad_norm": 1.0749137723608306, + "learning_rate": 5.067032869269374e-06, + "loss": 0.5665, + "step": 33465 + }, + { + "epoch": 0.97679848241646, + "grad_norm": 1.0068148780791313, + "learning_rate": 5.066197986707887e-06, + "loss": 0.5195, + "step": 33470 + }, + { + "epoch": 0.976944403910696, + "grad_norm": 0.9967499572431862, + "learning_rate": 5.065368328276189e-06, + "loss": 0.5025, + "step": 33475 + }, + { + "epoch": 0.9770903254049321, + "grad_norm": 0.8860617550926022, + "learning_rate": 5.0645438941674836e-06, + "loss": 0.5272, + "step": 33480 + }, + { + "epoch": 0.9772362468991682, + "grad_norm": 0.9670277615613261, + "learning_rate": 5.0637246845737535e-06, + "loss": 0.517, + "step": 33485 + }, + { + "epoch": 0.9773821683934043, + "grad_norm": 1.1273733012637333, + "learning_rate": 5.06291069968577e-06, + "loss": 0.5639, + "step": 33490 + }, + { + "epoch": 0.9775280898876404, + "grad_norm": 0.8517430494352236, + "learning_rate": 5.0621019396930844e-06, + "loss": 0.5022, + "step": 33495 + }, + { + "epoch": 0.9776740113818766, + "grad_norm": 0.854923004141028, + "learning_rate": 5.0612984047840305e-06, + "loss": 0.5249, + "step": 33500 + }, + { + "epoch": 0.9778199328761127, + "grad_norm": 1.1043474655152912, + "learning_rate": 5.060500095145726e-06, + "loss": 0.564, + "step": 33505 + }, + { + "epoch": 0.9779658543703488, + "grad_norm": 1.0426307699685953, + "learning_rate": 5.059707010964071e-06, + "loss": 0.5621, + "step": 33510 + }, + { + "epoch": 0.9781117758645849, + "grad_norm": 1.1669967905970806, + "learning_rate": 5.058919152423754e-06, + "loss": 0.5449, + "step": 33515 + }, + { + "epoch": 0.978257697358821, + "grad_norm": 1.1715461189033711, + "learning_rate": 5.058136519708239e-06, + "loss": 0.5272, + "step": 33520 + }, + { + "epoch": 0.978403618853057, + "grad_norm": 1.0968187603986936, + "learning_rate": 5.057359112999778e-06, + "loss": 0.5325, + "step": 33525 + }, + { + "epoch": 0.9785495403472931, + "grad_norm": 0.8341579304262031, + "learning_rate": 5.056586932479406e-06, + "loss": 0.4742, + "step": 33530 + }, + { + "epoch": 0.9786954618415292, + "grad_norm": 0.9399784407931918, + "learning_rate": 5.055819978326936e-06, + "loss": 0.5053, + "step": 33535 + }, + { + "epoch": 0.9788413833357653, + "grad_norm": 1.040836098664231, + "learning_rate": 5.055058250720973e-06, + "loss": 0.5699, + "step": 33540 + }, + { + "epoch": 0.9789873048300015, + "grad_norm": 0.9576166852777381, + "learning_rate": 5.0543017498388925e-06, + "loss": 0.5314, + "step": 33545 + }, + { + "epoch": 0.9791332263242376, + "grad_norm": 1.0821381600671702, + "learning_rate": 5.0535504758568655e-06, + "loss": 0.5414, + "step": 33550 + }, + { + "epoch": 0.9792791478184737, + "grad_norm": 0.8874562276236875, + "learning_rate": 5.052804428949839e-06, + "loss": 0.4717, + "step": 33555 + }, + { + "epoch": 0.9794250693127098, + "grad_norm": 1.077466291656469, + "learning_rate": 5.052063609291541e-06, + "loss": 0.5072, + "step": 33560 + }, + { + "epoch": 0.9795709908069459, + "grad_norm": 0.9402816687641828, + "learning_rate": 5.051328017054488e-06, + "loss": 0.4963, + "step": 33565 + }, + { + "epoch": 0.979716912301182, + "grad_norm": 1.0665721824246712, + "learning_rate": 5.05059765240998e-06, + "loss": 0.5358, + "step": 33570 + }, + { + "epoch": 0.979862833795418, + "grad_norm": 1.0663817878247017, + "learning_rate": 5.049872515528087e-06, + "loss": 0.5911, + "step": 33575 + }, + { + "epoch": 0.9800087552896541, + "grad_norm": 0.9949668657119501, + "learning_rate": 5.049152606577677e-06, + "loss": 0.5098, + "step": 33580 + }, + { + "epoch": 0.9801546767838902, + "grad_norm": 1.0831224483291853, + "learning_rate": 5.048437925726394e-06, + "loss": 0.5786, + "step": 33585 + }, + { + "epoch": 0.9803005982781263, + "grad_norm": 1.1327200471693837, + "learning_rate": 5.047728473140661e-06, + "loss": 0.5327, + "step": 33590 + }, + { + "epoch": 0.9804465197723625, + "grad_norm": 1.0302789352412236, + "learning_rate": 5.047024248985693e-06, + "loss": 0.5411, + "step": 33595 + }, + { + "epoch": 0.9805924412665986, + "grad_norm": 1.1077679676381746, + "learning_rate": 5.046325253425478e-06, + "loss": 0.5933, + "step": 33600 + }, + { + "epoch": 0.9807383627608347, + "grad_norm": 1.0753854572626638, + "learning_rate": 5.045631486622792e-06, + "loss": 0.5936, + "step": 33605 + }, + { + "epoch": 0.9808842842550708, + "grad_norm": 1.0920498382630135, + "learning_rate": 5.044942948739185e-06, + "loss": 0.5008, + "step": 33610 + }, + { + "epoch": 0.9810302057493069, + "grad_norm": 0.8933129497704895, + "learning_rate": 5.044259639935007e-06, + "loss": 0.4747, + "step": 33615 + }, + { + "epoch": 0.981176127243543, + "grad_norm": 1.0305549440006463, + "learning_rate": 5.043581560369372e-06, + "loss": 0.527, + "step": 33620 + }, + { + "epoch": 0.9813220487377791, + "grad_norm": 0.9580121087779867, + "learning_rate": 5.042908710200183e-06, + "loss": 0.5347, + "step": 33625 + }, + { + "epoch": 0.9814679702320152, + "grad_norm": 0.9205918761564317, + "learning_rate": 5.04224108958413e-06, + "loss": 0.4701, + "step": 33630 + }, + { + "epoch": 0.9816138917262512, + "grad_norm": 1.2236944920625357, + "learning_rate": 5.041578698676678e-06, + "loss": 0.5496, + "step": 33635 + }, + { + "epoch": 0.9817598132204873, + "grad_norm": 0.9490949498357534, + "learning_rate": 5.040921537632076e-06, + "loss": 0.4985, + "step": 33640 + }, + { + "epoch": 0.9819057347147235, + "grad_norm": 1.1907449418833003, + "learning_rate": 5.04026960660336e-06, + "loss": 0.5483, + "step": 33645 + }, + { + "epoch": 0.9820516562089596, + "grad_norm": 1.110200459051081, + "learning_rate": 5.039622905742343e-06, + "loss": 0.4873, + "step": 33650 + }, + { + "epoch": 0.9821975777031957, + "grad_norm": 1.0729031205278334, + "learning_rate": 5.038981435199619e-06, + "loss": 0.5015, + "step": 33655 + }, + { + "epoch": 0.9823434991974318, + "grad_norm": 1.0289432450989846, + "learning_rate": 5.0383451951245675e-06, + "loss": 0.5541, + "step": 33660 + }, + { + "epoch": 0.9824894206916679, + "grad_norm": 0.9722733112237959, + "learning_rate": 5.037714185665353e-06, + "loss": 0.5272, + "step": 33665 + }, + { + "epoch": 0.982635342185904, + "grad_norm": 1.064043484288904, + "learning_rate": 5.037088406968911e-06, + "loss": 0.5103, + "step": 33670 + }, + { + "epoch": 0.9827812636801401, + "grad_norm": 1.0346664860819423, + "learning_rate": 5.036467859180968e-06, + "loss": 0.5282, + "step": 33675 + }, + { + "epoch": 0.9829271851743762, + "grad_norm": 0.9895544493447913, + "learning_rate": 5.035852542446034e-06, + "loss": 0.4942, + "step": 33680 + }, + { + "epoch": 0.9830731066686123, + "grad_norm": 0.8857360534632025, + "learning_rate": 5.035242456907394e-06, + "loss": 0.5021, + "step": 33685 + }, + { + "epoch": 0.9832190281628483, + "grad_norm": 1.0209309071157355, + "learning_rate": 5.034637602707115e-06, + "loss": 0.5533, + "step": 33690 + }, + { + "epoch": 0.9833649496570845, + "grad_norm": 0.9655930898221335, + "learning_rate": 5.034037979986055e-06, + "loss": 0.5099, + "step": 33695 + }, + { + "epoch": 0.9835108711513206, + "grad_norm": 1.108373019645238, + "learning_rate": 5.033443588883844e-06, + "loss": 0.4903, + "step": 33700 + }, + { + "epoch": 0.9836567926455567, + "grad_norm": 1.034365216081099, + "learning_rate": 5.032854429538894e-06, + "loss": 0.5776, + "step": 33705 + }, + { + "epoch": 0.9838027141397928, + "grad_norm": 0.9427036341063435, + "learning_rate": 5.0322705020884065e-06, + "loss": 0.5601, + "step": 33710 + }, + { + "epoch": 0.9839486356340289, + "grad_norm": 1.1051007774815587, + "learning_rate": 5.031691806668358e-06, + "loss": 0.5656, + "step": 33715 + }, + { + "epoch": 0.984094557128265, + "grad_norm": 1.202707695806946, + "learning_rate": 5.03111834341351e-06, + "loss": 0.55, + "step": 33720 + }, + { + "epoch": 0.9842404786225011, + "grad_norm": 1.1083712032784958, + "learning_rate": 5.030550112457401e-06, + "loss": 0.4812, + "step": 33725 + }, + { + "epoch": 0.9843864001167372, + "grad_norm": 1.1442206629294946, + "learning_rate": 5.0299871139323565e-06, + "loss": 0.5685, + "step": 33730 + }, + { + "epoch": 0.9845323216109733, + "grad_norm": 0.9140169945017929, + "learning_rate": 5.029429347969483e-06, + "loss": 0.5002, + "step": 33735 + }, + { + "epoch": 0.9846782431052093, + "grad_norm": 0.9578531251436923, + "learning_rate": 5.028876814698661e-06, + "loss": 0.5434, + "step": 33740 + }, + { + "epoch": 0.9848241645994456, + "grad_norm": 0.9265414791068719, + "learning_rate": 5.028329514248563e-06, + "loss": 0.4876, + "step": 33745 + }, + { + "epoch": 0.9849700860936816, + "grad_norm": 1.106580671006795, + "learning_rate": 5.027787446746639e-06, + "loss": 0.5594, + "step": 33750 + }, + { + "epoch": 0.9851160075879177, + "grad_norm": 1.0000048943538333, + "learning_rate": 5.027250612319115e-06, + "loss": 0.4994, + "step": 33755 + }, + { + "epoch": 0.9852619290821538, + "grad_norm": 0.9279773868928076, + "learning_rate": 5.0267190110910045e-06, + "loss": 0.5034, + "step": 33760 + }, + { + "epoch": 0.9854078505763899, + "grad_norm": 1.0872808378834786, + "learning_rate": 5.026192643186102e-06, + "loss": 0.5498, + "step": 33765 + }, + { + "epoch": 0.985553772070626, + "grad_norm": 0.938516462965969, + "learning_rate": 5.025671508726983e-06, + "loss": 0.501, + "step": 33770 + }, + { + "epoch": 0.9856996935648621, + "grad_norm": 1.263096457109755, + "learning_rate": 5.025155607835002e-06, + "loss": 0.5403, + "step": 33775 + }, + { + "epoch": 0.9858456150590982, + "grad_norm": 0.9929012980015474, + "learning_rate": 5.024644940630293e-06, + "loss": 0.6056, + "step": 33780 + }, + { + "epoch": 0.9859915365533343, + "grad_norm": 1.206529714361955, + "learning_rate": 5.024139507231782e-06, + "loss": 0.539, + "step": 33785 + }, + { + "epoch": 0.9861374580475704, + "grad_norm": 1.0379883567226897, + "learning_rate": 5.023639307757162e-06, + "loss": 0.5352, + "step": 33790 + }, + { + "epoch": 0.9862833795418066, + "grad_norm": 1.1306631842019477, + "learning_rate": 5.023144342322916e-06, + "loss": 0.5866, + "step": 33795 + }, + { + "epoch": 0.9864293010360426, + "grad_norm": 1.0108377813317388, + "learning_rate": 5.022654611044306e-06, + "loss": 0.572, + "step": 33800 + }, + { + "epoch": 0.9865752225302787, + "grad_norm": 0.9320636134038177, + "learning_rate": 5.022170114035375e-06, + "loss": 0.5147, + "step": 33805 + }, + { + "epoch": 0.9867211440245148, + "grad_norm": 1.0175455601670906, + "learning_rate": 5.021690851408948e-06, + "loss": 0.5335, + "step": 33810 + }, + { + "epoch": 0.9868670655187509, + "grad_norm": 1.0505046296815395, + "learning_rate": 5.021216823276629e-06, + "loss": 0.548, + "step": 33815 + }, + { + "epoch": 0.987012987012987, + "grad_norm": 1.0668456587494082, + "learning_rate": 5.020748029748804e-06, + "loss": 0.5384, + "step": 33820 + }, + { + "epoch": 0.9871589085072231, + "grad_norm": 0.9007015843794126, + "learning_rate": 5.020284470934643e-06, + "loss": 0.4901, + "step": 33825 + }, + { + "epoch": 0.9873048300014592, + "grad_norm": 1.1310276536231032, + "learning_rate": 5.019826146942089e-06, + "loss": 0.5645, + "step": 33830 + }, + { + "epoch": 0.9874507514956953, + "grad_norm": 0.9722862756883272, + "learning_rate": 5.0193730578778765e-06, + "loss": 0.498, + "step": 33835 + }, + { + "epoch": 0.9875966729899314, + "grad_norm": 0.9734766858183307, + "learning_rate": 5.018925203847516e-06, + "loss": 0.4848, + "step": 33840 + }, + { + "epoch": 0.9877425944841676, + "grad_norm": 1.24537530114486, + "learning_rate": 5.018482584955293e-06, + "loss": 0.5845, + "step": 33845 + }, + { + "epoch": 0.9878885159784037, + "grad_norm": 1.1734094489109732, + "learning_rate": 5.018045201304286e-06, + "loss": 0.5326, + "step": 33850 + }, + { + "epoch": 0.9880344374726397, + "grad_norm": 1.0180124376027657, + "learning_rate": 5.017613052996342e-06, + "loss": 0.5296, + "step": 33855 + }, + { + "epoch": 0.9881803589668758, + "grad_norm": 0.9177150422853549, + "learning_rate": 5.0171861401321025e-06, + "loss": 0.4881, + "step": 33860 + }, + { + "epoch": 0.9883262804611119, + "grad_norm": 1.0814809806392454, + "learning_rate": 5.016764462810975e-06, + "loss": 0.5144, + "step": 33865 + }, + { + "epoch": 0.988472201955348, + "grad_norm": 0.9146467195207497, + "learning_rate": 5.0163480211311565e-06, + "loss": 0.4895, + "step": 33870 + }, + { + "epoch": 0.9886181234495841, + "grad_norm": 1.0301122390412922, + "learning_rate": 5.015936815189624e-06, + "loss": 0.5572, + "step": 33875 + }, + { + "epoch": 0.9887640449438202, + "grad_norm": 1.0361172551464364, + "learning_rate": 5.015530845082138e-06, + "loss": 0.5444, + "step": 33880 + }, + { + "epoch": 0.9889099664380563, + "grad_norm": 1.140939330237156, + "learning_rate": 5.01513011090323e-06, + "loss": 0.5709, + "step": 33885 + }, + { + "epoch": 0.9890558879322924, + "grad_norm": 0.9633466382471937, + "learning_rate": 5.0147346127462235e-06, + "loss": 0.4635, + "step": 33890 + }, + { + "epoch": 0.9892018094265286, + "grad_norm": 0.9618784257000533, + "learning_rate": 5.014344350703214e-06, + "loss": 0.5946, + "step": 33895 + }, + { + "epoch": 0.9893477309207647, + "grad_norm": 0.9715137131827606, + "learning_rate": 5.013959324865082e-06, + "loss": 0.5803, + "step": 33900 + }, + { + "epoch": 0.9894936524150008, + "grad_norm": 1.210351679528532, + "learning_rate": 5.01357953532149e-06, + "loss": 0.5834, + "step": 33905 + }, + { + "epoch": 0.9896395739092368, + "grad_norm": 1.001088802477084, + "learning_rate": 5.013204982160878e-06, + "loss": 0.5233, + "step": 33910 + }, + { + "epoch": 0.9897854954034729, + "grad_norm": 1.079061766315509, + "learning_rate": 5.012835665470469e-06, + "loss": 0.5374, + "step": 33915 + }, + { + "epoch": 0.989931416897709, + "grad_norm": 0.9744772838191876, + "learning_rate": 5.012471585336263e-06, + "loss": 0.4811, + "step": 33920 + }, + { + "epoch": 0.9900773383919451, + "grad_norm": 1.1886066580660766, + "learning_rate": 5.012112741843044e-06, + "loss": 0.5516, + "step": 33925 + }, + { + "epoch": 0.9902232598861812, + "grad_norm": 1.0043509799454422, + "learning_rate": 5.011759135074372e-06, + "loss": 0.5405, + "step": 33930 + }, + { + "epoch": 0.9903691813804173, + "grad_norm": 0.9229418285152787, + "learning_rate": 5.011410765112597e-06, + "loss": 0.5003, + "step": 33935 + }, + { + "epoch": 0.9905151028746534, + "grad_norm": 1.037200535420287, + "learning_rate": 5.0110676320388395e-06, + "loss": 0.4649, + "step": 33940 + }, + { + "epoch": 0.9906610243688896, + "grad_norm": 0.9104571270919357, + "learning_rate": 5.010729735933006e-06, + "loss": 0.5037, + "step": 33945 + }, + { + "epoch": 0.9908069458631257, + "grad_norm": 1.0056870234001343, + "learning_rate": 5.01039707687378e-06, + "loss": 0.5149, + "step": 33950 + }, + { + "epoch": 0.9909528673573618, + "grad_norm": 1.0665776610375306, + "learning_rate": 5.01006965493863e-06, + "loss": 0.5629, + "step": 33955 + }, + { + "epoch": 0.9910987888515979, + "grad_norm": 0.9572439710461341, + "learning_rate": 5.009747470203801e-06, + "loss": 0.4838, + "step": 33960 + }, + { + "epoch": 0.991244710345834, + "grad_norm": 1.0837510316096048, + "learning_rate": 5.009430522744322e-06, + "loss": 0.5326, + "step": 33965 + }, + { + "epoch": 0.99139063184007, + "grad_norm": 1.007204258097148, + "learning_rate": 5.009118812633996e-06, + "loss": 0.5782, + "step": 33970 + }, + { + "epoch": 0.9915365533343061, + "grad_norm": 1.0399550064040173, + "learning_rate": 5.008812339945411e-06, + "loss": 0.5193, + "step": 33975 + }, + { + "epoch": 0.9916824748285422, + "grad_norm": 0.9896983949080145, + "learning_rate": 5.0085111047499384e-06, + "loss": 0.5305, + "step": 33980 + }, + { + "epoch": 0.9918283963227783, + "grad_norm": 0.9621123880163975, + "learning_rate": 5.008215107117724e-06, + "loss": 0.5182, + "step": 33985 + }, + { + "epoch": 0.9919743178170144, + "grad_norm": 0.8447976358203553, + "learning_rate": 5.007924347117697e-06, + "loss": 0.4877, + "step": 33990 + }, + { + "epoch": 0.9921202393112506, + "grad_norm": 1.1596474947702167, + "learning_rate": 5.007638824817563e-06, + "loss": 0.5708, + "step": 33995 + }, + { + "epoch": 0.9922661608054867, + "grad_norm": 1.074628382692495, + "learning_rate": 5.007358540283818e-06, + "loss": 0.5672, + "step": 34000 + }, + { + "epoch": 0.9924120822997228, + "grad_norm": 0.9154829466566006, + "learning_rate": 5.007083493581728e-06, + "loss": 0.4771, + "step": 34005 + }, + { + "epoch": 0.9925580037939589, + "grad_norm": 1.074246880963391, + "learning_rate": 5.0068136847753385e-06, + "loss": 0.5565, + "step": 34010 + }, + { + "epoch": 0.992703925288195, + "grad_norm": 0.9229558777937205, + "learning_rate": 5.006549113927486e-06, + "loss": 0.5182, + "step": 34015 + }, + { + "epoch": 0.992849846782431, + "grad_norm": 1.1462405119761574, + "learning_rate": 5.006289781099778e-06, + "loss": 0.5588, + "step": 34020 + }, + { + "epoch": 0.9929957682766671, + "grad_norm": 1.0710422413291885, + "learning_rate": 5.006035686352608e-06, + "loss": 0.5618, + "step": 34025 + }, + { + "epoch": 0.9931416897709032, + "grad_norm": 0.9111509681353691, + "learning_rate": 5.005786829745139e-06, + "loss": 0.5442, + "step": 34030 + }, + { + "epoch": 0.9932876112651393, + "grad_norm": 0.9510318450701138, + "learning_rate": 5.005543211335331e-06, + "loss": 0.5496, + "step": 34035 + }, + { + "epoch": 0.9934335327593754, + "grad_norm": 1.0008996083381918, + "learning_rate": 5.005304831179909e-06, + "loss": 0.5313, + "step": 34040 + }, + { + "epoch": 0.9935794542536116, + "grad_norm": 1.036343656349479, + "learning_rate": 5.005071689334387e-06, + "loss": 0.5071, + "step": 34045 + }, + { + "epoch": 0.9937253757478477, + "grad_norm": 1.0064022594062205, + "learning_rate": 5.004843785853055e-06, + "loss": 0.5189, + "step": 34050 + }, + { + "epoch": 0.9938712972420838, + "grad_norm": 1.0727308088819358, + "learning_rate": 5.0046211207889865e-06, + "loss": 0.498, + "step": 34055 + }, + { + "epoch": 0.9940172187363199, + "grad_norm": 1.011996656846501, + "learning_rate": 5.004403694194032e-06, + "loss": 0.5572, + "step": 34060 + }, + { + "epoch": 0.994163140230556, + "grad_norm": 1.090875683762067, + "learning_rate": 5.004191506118822e-06, + "loss": 0.5221, + "step": 34065 + }, + { + "epoch": 0.994309061724792, + "grad_norm": 0.9333200560570092, + "learning_rate": 5.00398455661277e-06, + "loss": 0.4956, + "step": 34070 + }, + { + "epoch": 0.9944549832190281, + "grad_norm": 0.9882746033956104, + "learning_rate": 5.003782845724071e-06, + "loss": 0.5227, + "step": 34075 + }, + { + "epoch": 0.9946009047132642, + "grad_norm": 0.8267576198682784, + "learning_rate": 5.003586373499691e-06, + "loss": 0.5028, + "step": 34080 + }, + { + "epoch": 0.9947468262075003, + "grad_norm": 0.9879584677953103, + "learning_rate": 5.003395139985384e-06, + "loss": 0.5377, + "step": 34085 + }, + { + "epoch": 0.9948927477017364, + "grad_norm": 0.9719402077931502, + "learning_rate": 5.003209145225686e-06, + "loss": 0.5038, + "step": 34090 + }, + { + "epoch": 0.9950386691959726, + "grad_norm": 1.046885445097955, + "learning_rate": 5.003028389263907e-06, + "loss": 0.5311, + "step": 34095 + }, + { + "epoch": 0.9951845906902087, + "grad_norm": 0.9376607022492564, + "learning_rate": 5.002852872142138e-06, + "loss": 0.537, + "step": 34100 + }, + { + "epoch": 0.9953305121844448, + "grad_norm": 1.1437560781956682, + "learning_rate": 5.0026825939012535e-06, + "loss": 0.5348, + "step": 34105 + }, + { + "epoch": 0.9954764336786809, + "grad_norm": 0.9876093898709207, + "learning_rate": 5.002517554580903e-06, + "loss": 0.5113, + "step": 34110 + }, + { + "epoch": 0.995622355172917, + "grad_norm": 1.1891251598070707, + "learning_rate": 5.002357754219525e-06, + "loss": 0.5266, + "step": 34115 + }, + { + "epoch": 0.9957682766671531, + "grad_norm": 1.199713564049697, + "learning_rate": 5.002203192854326e-06, + "loss": 0.551, + "step": 34120 + }, + { + "epoch": 0.9959141981613892, + "grad_norm": 1.0276670045212444, + "learning_rate": 5.0020538705213e-06, + "loss": 0.5597, + "step": 34125 + }, + { + "epoch": 0.9960601196556252, + "grad_norm": 0.8596206355535277, + "learning_rate": 5.0019097872552216e-06, + "loss": 0.5082, + "step": 34130 + }, + { + "epoch": 0.9962060411498613, + "grad_norm": 1.1390781389722806, + "learning_rate": 5.001770943089642e-06, + "loss": 0.5143, + "step": 34135 + }, + { + "epoch": 0.9963519626440974, + "grad_norm": 1.010091785793645, + "learning_rate": 5.001637338056892e-06, + "loss": 0.4944, + "step": 34140 + }, + { + "epoch": 0.9964978841383336, + "grad_norm": 1.1178732238965712, + "learning_rate": 5.001508972188087e-06, + "loss": 0.5127, + "step": 34145 + }, + { + "epoch": 0.9966438056325697, + "grad_norm": 1.061360375157705, + "learning_rate": 5.001385845513118e-06, + "loss": 0.5202, + "step": 34150 + }, + { + "epoch": 0.9967897271268058, + "grad_norm": 0.997627126386671, + "learning_rate": 5.001267958060657e-06, + "loss": 0.5369, + "step": 34155 + }, + { + "epoch": 0.9969356486210419, + "grad_norm": 1.0447716836138679, + "learning_rate": 5.001155309858156e-06, + "loss": 0.5427, + "step": 34160 + }, + { + "epoch": 0.997081570115278, + "grad_norm": 0.9997743865793186, + "learning_rate": 5.001047900931849e-06, + "loss": 0.4906, + "step": 34165 + }, + { + "epoch": 0.9972274916095141, + "grad_norm": 1.1352521600880445, + "learning_rate": 5.000945731306746e-06, + "loss": 0.5921, + "step": 34170 + }, + { + "epoch": 0.9973734131037502, + "grad_norm": 0.9336651797943503, + "learning_rate": 5.000848801006639e-06, + "loss": 0.5408, + "step": 34175 + }, + { + "epoch": 0.9975193345979863, + "grad_norm": 0.8528773724163246, + "learning_rate": 5.000757110054101e-06, + "loss": 0.5046, + "step": 34180 + }, + { + "epoch": 0.9976652560922223, + "grad_norm": 1.1731211514032513, + "learning_rate": 5.0006706584704865e-06, + "loss": 0.5576, + "step": 34185 + }, + { + "epoch": 0.9978111775864584, + "grad_norm": 1.2239020358344568, + "learning_rate": 5.0005894462759244e-06, + "loss": 0.547, + "step": 34190 + }, + { + "epoch": 0.9979570990806946, + "grad_norm": 1.193873006903444, + "learning_rate": 5.000513473489325e-06, + "loss": 0.5358, + "step": 34195 + }, + { + "epoch": 0.9981030205749307, + "grad_norm": 0.9949503406352171, + "learning_rate": 5.000442740128385e-06, + "loss": 0.5466, + "step": 34200 + }, + { + "epoch": 0.9982489420691668, + "grad_norm": 1.0040847301012406, + "learning_rate": 5.0003772462095715e-06, + "loss": 0.5184, + "step": 34205 + }, + { + "epoch": 0.9983948635634029, + "grad_norm": 0.9666355319129454, + "learning_rate": 5.000316991748136e-06, + "loss": 0.5559, + "step": 34210 + }, + { + "epoch": 0.998540785057639, + "grad_norm": 0.9893589176004742, + "learning_rate": 5.000261976758115e-06, + "loss": 0.5457, + "step": 34215 + }, + { + "epoch": 0.9986867065518751, + "grad_norm": 0.9431329088837169, + "learning_rate": 5.000212201252313e-06, + "loss": 0.5169, + "step": 34220 + }, + { + "epoch": 0.9988326280461112, + "grad_norm": 1.053905405555924, + "learning_rate": 5.000167665242325e-06, + "loss": 0.5972, + "step": 34225 + }, + { + "epoch": 0.9989785495403473, + "grad_norm": 1.1493154532957772, + "learning_rate": 5.000128368738522e-06, + "loss": 0.5305, + "step": 34230 + }, + { + "epoch": 0.9991244710345834, + "grad_norm": 1.094965406863394, + "learning_rate": 5.000094311750053e-06, + "loss": 0.5338, + "step": 34235 + }, + { + "epoch": 0.9992703925288194, + "grad_norm": 1.0393104106214965, + "learning_rate": 5.000065494284851e-06, + "loss": 0.5055, + "step": 34240 + }, + { + "epoch": 0.9994163140230556, + "grad_norm": 1.0626403252982533, + "learning_rate": 5.000041916349627e-06, + "loss": 0.5483, + "step": 34245 + }, + { + "epoch": 0.9995622355172917, + "grad_norm": 1.0130078106020866, + "learning_rate": 5.000023577949868e-06, + "loss": 0.5814, + "step": 34250 + }, + { + "epoch": 0.9997081570115278, + "grad_norm": 1.080029261493259, + "learning_rate": 5.0000104790898465e-06, + "loss": 0.5956, + "step": 34255 + }, + { + "epoch": 0.9998540785057639, + "grad_norm": 0.8605516671437958, + "learning_rate": 5.000002619772615e-06, + "loss": 0.5014, + "step": 34260 + }, + { + "epoch": 1.0, + "grad_norm": 1.1940589638955315, + "learning_rate": 5e-06, + "loss": 0.4856, + "step": 34265 + }, + { + "epoch": 1.0, + "step": 34265, + "total_flos": 488058617069568.0, + "train_loss": 0.5987731399213277, + "train_runtime": 55800.5629, + "train_samples_per_second": 2.456, + "train_steps_per_second": 0.614 + } + ], + "logging_steps": 5, + "max_steps": 34265, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 488058617069568.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}