diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13158 @@ +{ + "best_global_step": 9356, + "best_metric": 0.9798825256975033, + "best_model_checkpoint": "runs/de_sapbert/checkpoint-9356", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 9356, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010688328345446773, + "grad_norm": 683.587890625, + "learning_rate": 8.547008547008549e-08, + "loss": 50.3236, + "step": 5 + }, + { + "epoch": 0.0021376656690893546, + "grad_norm": 546.2178955078125, + "learning_rate": 1.9230769230769234e-07, + "loss": 59.3515, + "step": 10 + }, + { + "epoch": 0.0032064985036340315, + "grad_norm": 602.040771484375, + "learning_rate": 2.991452991452992e-07, + "loss": 62.6096, + "step": 15 + }, + { + "epoch": 0.004275331338178709, + "grad_norm": 697.5074462890625, + "learning_rate": 4.05982905982906e-07, + "loss": 71.224, + "step": 20 + }, + { + "epoch": 0.005344164172723386, + "grad_norm": 629.8714599609375, + "learning_rate": 5.128205128205128e-07, + "loss": 58.249, + "step": 25 + }, + { + "epoch": 0.006412997007268063, + "grad_norm": 850.3056640625, + "learning_rate": 6.196581196581197e-07, + "loss": 53.1647, + "step": 30 + }, + { + "epoch": 0.007481829841812741, + "grad_norm": 535.2101440429688, + "learning_rate": 7.264957264957266e-07, + "loss": 74.0864, + "step": 35 + }, + { + "epoch": 0.008550662676357419, + "grad_norm": 877.2177734375, + "learning_rate": 8.333333333333333e-07, + "loss": 68.1396, + "step": 40 + }, + { + "epoch": 0.009619495510902095, + "grad_norm": 827.8037109375, + "learning_rate": 9.401709401709402e-07, + "loss": 67.5764, + "step": 45 + }, + { + "epoch": 0.010688328345446772, + "grad_norm": 738.37158203125, + "learning_rate": 1.047008547008547e-06, + "loss": 59.9784, + "step": 50 + }, + { + "epoch": 0.01175716117999145, + "grad_norm": 698.6246337890625, + "learning_rate": 1.153846153846154e-06, + "loss": 51.3131, + "step": 55 + }, + { + "epoch": 0.012825994014536126, + "grad_norm": 683.5339965820312, + "learning_rate": 1.2606837606837608e-06, + "loss": 68.0457, + "step": 60 + }, + { + "epoch": 0.013894826849080803, + "grad_norm": 766.8721923828125, + "learning_rate": 1.3675213675213678e-06, + "loss": 58.9018, + "step": 65 + }, + { + "epoch": 0.014963659683625482, + "grad_norm": 694.8308715820312, + "learning_rate": 1.4743589743589745e-06, + "loss": 59.4911, + "step": 70 + }, + { + "epoch": 0.01603249251817016, + "grad_norm": 585.236572265625, + "learning_rate": 1.5811965811965813e-06, + "loss": 58.0199, + "step": 75 + }, + { + "epoch": 0.017101325352714837, + "grad_norm": 778.44287109375, + "learning_rate": 1.6880341880341883e-06, + "loss": 63.7679, + "step": 80 + }, + { + "epoch": 0.018170158187259512, + "grad_norm": 646.8430786132812, + "learning_rate": 1.794871794871795e-06, + "loss": 60.7314, + "step": 85 + }, + { + "epoch": 0.01923899102180419, + "grad_norm": 989.9755249023438, + "learning_rate": 1.9017094017094018e-06, + "loss": 77.8096, + "step": 90 + }, + { + "epoch": 0.020307823856348866, + "grad_norm": 834.5411987304688, + "learning_rate": 2.008547008547009e-06, + "loss": 66.2088, + "step": 95 + }, + { + "epoch": 0.021376656690893545, + "grad_norm": 643.298095703125, + "learning_rate": 2.1153846153846155e-06, + "loss": 65.6054, + "step": 100 + }, + { + "epoch": 0.02244548952543822, + "grad_norm": 1178.525146484375, + "learning_rate": 2.222222222222222e-06, + "loss": 80.6449, + "step": 105 + }, + { + "epoch": 0.0235143223599829, + "grad_norm": 697.0106201171875, + "learning_rate": 2.3290598290598295e-06, + "loss": 69.9855, + "step": 110 + }, + { + "epoch": 0.024583155194527577, + "grad_norm": 783.412841796875, + "learning_rate": 2.435897435897436e-06, + "loss": 56.6307, + "step": 115 + }, + { + "epoch": 0.025651988029072252, + "grad_norm": 692.8482666015625, + "learning_rate": 2.542735042735043e-06, + "loss": 46.5356, + "step": 120 + }, + { + "epoch": 0.02672082086361693, + "grad_norm": 731.8605346679688, + "learning_rate": 2.64957264957265e-06, + "loss": 57.7131, + "step": 125 + }, + { + "epoch": 0.027789653698161606, + "grad_norm": 710.8729858398438, + "learning_rate": 2.756410256410257e-06, + "loss": 49.0737, + "step": 130 + }, + { + "epoch": 0.028858486532706284, + "grad_norm": 608.5044555664062, + "learning_rate": 2.8632478632478635e-06, + "loss": 50.7495, + "step": 135 + }, + { + "epoch": 0.029927319367250963, + "grad_norm": 723.5994873046875, + "learning_rate": 2.9700854700854705e-06, + "loss": 57.1029, + "step": 140 + }, + { + "epoch": 0.030996152201795638, + "grad_norm": 697.4583740234375, + "learning_rate": 3.0769230769230774e-06, + "loss": 45.4298, + "step": 145 + }, + { + "epoch": 0.03206498503634032, + "grad_norm": 567.723388671875, + "learning_rate": 3.183760683760684e-06, + "loss": 57.8409, + "step": 150 + }, + { + "epoch": 0.03313381787088499, + "grad_norm": 688.6309814453125, + "learning_rate": 3.290598290598291e-06, + "loss": 68.8388, + "step": 155 + }, + { + "epoch": 0.034202650705429674, + "grad_norm": 628.3114624023438, + "learning_rate": 3.397435897435898e-06, + "loss": 64.5809, + "step": 160 + }, + { + "epoch": 0.03527148353997435, + "grad_norm": 556.7693481445312, + "learning_rate": 3.5042735042735045e-06, + "loss": 54.5407, + "step": 165 + }, + { + "epoch": 0.036340316374519024, + "grad_norm": 494.6075744628906, + "learning_rate": 3.6111111111111115e-06, + "loss": 48.341, + "step": 170 + }, + { + "epoch": 0.0374091492090637, + "grad_norm": 554.0121459960938, + "learning_rate": 3.7179487179487184e-06, + "loss": 44.3806, + "step": 175 + }, + { + "epoch": 0.03847798204360838, + "grad_norm": 758.5612182617188, + "learning_rate": 3.8247863247863246e-06, + "loss": 59.123, + "step": 180 + }, + { + "epoch": 0.03954681487815306, + "grad_norm": 716.5413208007812, + "learning_rate": 3.9316239316239315e-06, + "loss": 59.4863, + "step": 185 + }, + { + "epoch": 0.04061564771269773, + "grad_norm": 503.7677307128906, + "learning_rate": 4.0384615384615385e-06, + "loss": 65.4498, + "step": 190 + }, + { + "epoch": 0.041684480547242414, + "grad_norm": 613.170654296875, + "learning_rate": 4.145299145299146e-06, + "loss": 52.9526, + "step": 195 + }, + { + "epoch": 0.04275331338178709, + "grad_norm": 725.29833984375, + "learning_rate": 4.2521367521367524e-06, + "loss": 46.6744, + "step": 200 + }, + { + "epoch": 0.043822146216331764, + "grad_norm": 525.0426635742188, + "learning_rate": 4.358974358974359e-06, + "loss": 37.1728, + "step": 205 + }, + { + "epoch": 0.04489097905087644, + "grad_norm": 627.0368041992188, + "learning_rate": 4.465811965811966e-06, + "loss": 63.3973, + "step": 210 + }, + { + "epoch": 0.04595981188542112, + "grad_norm": 501.9342956542969, + "learning_rate": 4.5726495726495725e-06, + "loss": 51.1136, + "step": 215 + }, + { + "epoch": 0.0470286447199658, + "grad_norm": 535.6387329101562, + "learning_rate": 4.6794871794871795e-06, + "loss": 40.9712, + "step": 220 + }, + { + "epoch": 0.04809747755451047, + "grad_norm": 492.3857421875, + "learning_rate": 4.786324786324787e-06, + "loss": 46.4765, + "step": 225 + }, + { + "epoch": 0.049166310389055154, + "grad_norm": 579.775390625, + "learning_rate": 4.8931623931623934e-06, + "loss": 47.3894, + "step": 230 + }, + { + "epoch": 0.05023514322359983, + "grad_norm": 530.4070434570312, + "learning_rate": 5e-06, + "loss": 36.866, + "step": 235 + }, + { + "epoch": 0.051303976058144504, + "grad_norm": 476.2954406738281, + "learning_rate": 5.1068376068376065e-06, + "loss": 32.3704, + "step": 240 + }, + { + "epoch": 0.052372808892689186, + "grad_norm": 412.430419921875, + "learning_rate": 5.213675213675214e-06, + "loss": 36.0298, + "step": 245 + }, + { + "epoch": 0.05344164172723386, + "grad_norm": 467.79412841796875, + "learning_rate": 5.320512820512821e-06, + "loss": 43.7503, + "step": 250 + }, + { + "epoch": 0.05451047456177854, + "grad_norm": 395.0292663574219, + "learning_rate": 5.4273504273504275e-06, + "loss": 33.9929, + "step": 255 + }, + { + "epoch": 0.05557930739632321, + "grad_norm": 543.4691162109375, + "learning_rate": 5.534188034188035e-06, + "loss": 38.4924, + "step": 260 + }, + { + "epoch": 0.056648140230867894, + "grad_norm": 446.6466369628906, + "learning_rate": 5.641025641025641e-06, + "loss": 30.8329, + "step": 265 + }, + { + "epoch": 0.05771697306541257, + "grad_norm": 409.0653381347656, + "learning_rate": 5.7478632478632475e-06, + "loss": 29.0382, + "step": 270 + }, + { + "epoch": 0.058785805899957244, + "grad_norm": 397.3152770996094, + "learning_rate": 5.854700854700855e-06, + "loss": 25.2408, + "step": 275 + }, + { + "epoch": 0.059854638734501926, + "grad_norm": 324.9696350097656, + "learning_rate": 5.961538461538462e-06, + "loss": 30.9856, + "step": 280 + }, + { + "epoch": 0.0609234715690466, + "grad_norm": 439.66241455078125, + "learning_rate": 6.0683760683760684e-06, + "loss": 25.987, + "step": 285 + }, + { + "epoch": 0.061992304403591277, + "grad_norm": 299.0304260253906, + "learning_rate": 6.175213675213676e-06, + "loss": 24.4493, + "step": 290 + }, + { + "epoch": 0.06306113723813596, + "grad_norm": 324.8219909667969, + "learning_rate": 6.282051282051282e-06, + "loss": 25.6893, + "step": 295 + }, + { + "epoch": 0.06412997007268063, + "grad_norm": 419.6073303222656, + "learning_rate": 6.3888888888888885e-06, + "loss": 29.4356, + "step": 300 + }, + { + "epoch": 0.06519880290722531, + "grad_norm": 299.56048583984375, + "learning_rate": 6.495726495726496e-06, + "loss": 22.2, + "step": 305 + }, + { + "epoch": 0.06626763574176998, + "grad_norm": 303.1939697265625, + "learning_rate": 6.602564102564103e-06, + "loss": 24.3048, + "step": 310 + }, + { + "epoch": 0.06733646857631466, + "grad_norm": 203.7785186767578, + "learning_rate": 6.7094017094017094e-06, + "loss": 18.5714, + "step": 315 + }, + { + "epoch": 0.06840530141085935, + "grad_norm": 292.95050048828125, + "learning_rate": 6.816239316239317e-06, + "loss": 17.4822, + "step": 320 + }, + { + "epoch": 0.06947413424540402, + "grad_norm": 145.5703125, + "learning_rate": 6.923076923076923e-06, + "loss": 15.8966, + "step": 325 + }, + { + "epoch": 0.0705429670799487, + "grad_norm": 261.8589782714844, + "learning_rate": 7.02991452991453e-06, + "loss": 14.8771, + "step": 330 + }, + { + "epoch": 0.07161179991449337, + "grad_norm": 215.4441375732422, + "learning_rate": 7.136752136752137e-06, + "loss": 16.0905, + "step": 335 + }, + { + "epoch": 0.07268063274903805, + "grad_norm": 206.23388671875, + "learning_rate": 7.243589743589744e-06, + "loss": 11.9737, + "step": 340 + }, + { + "epoch": 0.07374946558358272, + "grad_norm": 200.727294921875, + "learning_rate": 7.350427350427351e-06, + "loss": 12.1407, + "step": 345 + }, + { + "epoch": 0.0748182984181274, + "grad_norm": 183.34083557128906, + "learning_rate": 7.457264957264958e-06, + "loss": 11.5492, + "step": 350 + }, + { + "epoch": 0.07588713125267209, + "grad_norm": 155.31253051757812, + "learning_rate": 7.564102564102564e-06, + "loss": 13.0664, + "step": 355 + }, + { + "epoch": 0.07695596408721676, + "grad_norm": 141.0535125732422, + "learning_rate": 7.670940170940172e-06, + "loss": 10.0428, + "step": 360 + }, + { + "epoch": 0.07802479692176144, + "grad_norm": 164.8147430419922, + "learning_rate": 7.77777777777778e-06, + "loss": 9.2962, + "step": 365 + }, + { + "epoch": 0.07909362975630611, + "grad_norm": 113.84266662597656, + "learning_rate": 7.884615384615384e-06, + "loss": 8.6304, + "step": 370 + }, + { + "epoch": 0.08016246259085079, + "grad_norm": 90.67302703857422, + "learning_rate": 7.991452991452993e-06, + "loss": 5.7954, + "step": 375 + }, + { + "epoch": 0.08123129542539546, + "grad_norm": 77.02782440185547, + "learning_rate": 8.098290598290598e-06, + "loss": 6.0213, + "step": 380 + }, + { + "epoch": 0.08230012825994014, + "grad_norm": 77.24604797363281, + "learning_rate": 8.205128205128205e-06, + "loss": 6.8873, + "step": 385 + }, + { + "epoch": 0.08336896109448483, + "grad_norm": 78.4577407836914, + "learning_rate": 8.311965811965812e-06, + "loss": 5.6347, + "step": 390 + }, + { + "epoch": 0.0844377939290295, + "grad_norm": 70.10798645019531, + "learning_rate": 8.41880341880342e-06, + "loss": 5.7346, + "step": 395 + }, + { + "epoch": 0.08550662676357418, + "grad_norm": 53.02711486816406, + "learning_rate": 8.525641025641026e-06, + "loss": 4.2817, + "step": 400 + }, + { + "epoch": 0.08657545959811885, + "grad_norm": 58.17922592163086, + "learning_rate": 8.632478632478633e-06, + "loss": 3.9817, + "step": 405 + }, + { + "epoch": 0.08764429243266353, + "grad_norm": 47.072662353515625, + "learning_rate": 8.73931623931624e-06, + "loss": 3.1871, + "step": 410 + }, + { + "epoch": 0.0887131252672082, + "grad_norm": 35.73280715942383, + "learning_rate": 8.846153846153847e-06, + "loss": 3.2088, + "step": 415 + }, + { + "epoch": 0.08978195810175288, + "grad_norm": 40.767581939697266, + "learning_rate": 8.952991452991454e-06, + "loss": 3.5216, + "step": 420 + }, + { + "epoch": 0.09085079093629757, + "grad_norm": 32.53750991821289, + "learning_rate": 9.059829059829061e-06, + "loss": 2.3657, + "step": 425 + }, + { + "epoch": 0.09191962377084224, + "grad_norm": 31.6849422454834, + "learning_rate": 9.166666666666666e-06, + "loss": 2.3054, + "step": 430 + }, + { + "epoch": 0.09298845660538692, + "grad_norm": 29.081796646118164, + "learning_rate": 9.273504273504275e-06, + "loss": 2.174, + "step": 435 + }, + { + "epoch": 0.0940572894399316, + "grad_norm": 34.65196990966797, + "learning_rate": 9.38034188034188e-06, + "loss": 2.4017, + "step": 440 + }, + { + "epoch": 0.09512612227447627, + "grad_norm": 19.63212776184082, + "learning_rate": 9.487179487179487e-06, + "loss": 2.1189, + "step": 445 + }, + { + "epoch": 0.09619495510902094, + "grad_norm": 24.055822372436523, + "learning_rate": 9.594017094017094e-06, + "loss": 2.3965, + "step": 450 + }, + { + "epoch": 0.09726378794356563, + "grad_norm": 18.823020935058594, + "learning_rate": 9.700854700854701e-06, + "loss": 1.7638, + "step": 455 + }, + { + "epoch": 0.09833262077811031, + "grad_norm": 16.97188949584961, + "learning_rate": 9.807692307692308e-06, + "loss": 1.4081, + "step": 460 + }, + { + "epoch": 0.09940145361265498, + "grad_norm": 18.629823684692383, + "learning_rate": 9.914529914529915e-06, + "loss": 1.5501, + "step": 465 + }, + { + "epoch": 0.10047028644719966, + "grad_norm": 16.413358688354492, + "learning_rate": 1.0021367521367522e-05, + "loss": 1.4015, + "step": 470 + }, + { + "epoch": 0.10153911928174433, + "grad_norm": 14.555413246154785, + "learning_rate": 1.012820512820513e-05, + "loss": 1.3726, + "step": 475 + }, + { + "epoch": 0.10260795211628901, + "grad_norm": 17.65382194519043, + "learning_rate": 1.0235042735042734e-05, + "loss": 1.1044, + "step": 480 + }, + { + "epoch": 0.10367678495083368, + "grad_norm": 13.994580268859863, + "learning_rate": 1.0341880341880343e-05, + "loss": 1.1651, + "step": 485 + }, + { + "epoch": 0.10474561778537837, + "grad_norm": 13.052831649780273, + "learning_rate": 1.044871794871795e-05, + "loss": 1.1674, + "step": 490 + }, + { + "epoch": 0.10581445061992305, + "grad_norm": 16.172752380371094, + "learning_rate": 1.0555555555555557e-05, + "loss": 1.2274, + "step": 495 + }, + { + "epoch": 0.10688328345446772, + "grad_norm": 12.005922317504883, + "learning_rate": 1.0662393162393162e-05, + "loss": 1.0606, + "step": 500 + }, + { + "epoch": 0.1079521162890124, + "grad_norm": 13.400876998901367, + "learning_rate": 1.076923076923077e-05, + "loss": 1.2207, + "step": 505 + }, + { + "epoch": 0.10902094912355707, + "grad_norm": 14.658180236816406, + "learning_rate": 1.0876068376068376e-05, + "loss": 1.12, + "step": 510 + }, + { + "epoch": 0.11008978195810175, + "grad_norm": 11.943291664123535, + "learning_rate": 1.0982905982905985e-05, + "loss": 0.9925, + "step": 515 + }, + { + "epoch": 0.11115861479264642, + "grad_norm": 10.507229804992676, + "learning_rate": 1.1089743589743592e-05, + "loss": 0.9542, + "step": 520 + }, + { + "epoch": 0.11222744762719111, + "grad_norm": 12.802043914794922, + "learning_rate": 1.1196581196581197e-05, + "loss": 1.1911, + "step": 525 + }, + { + "epoch": 0.11329628046173579, + "grad_norm": 10.767659187316895, + "learning_rate": 1.1303418803418804e-05, + "loss": 1.2418, + "step": 530 + }, + { + "epoch": 0.11436511329628046, + "grad_norm": 12.087440490722656, + "learning_rate": 1.1410256410256411e-05, + "loss": 1.0926, + "step": 535 + }, + { + "epoch": 0.11543394613082514, + "grad_norm": 9.390274047851562, + "learning_rate": 1.1517094017094016e-05, + "loss": 0.9052, + "step": 540 + }, + { + "epoch": 0.11650277896536981, + "grad_norm": 10.022379875183105, + "learning_rate": 1.1623931623931625e-05, + "loss": 0.9725, + "step": 545 + }, + { + "epoch": 0.11757161179991449, + "grad_norm": 12.384991645812988, + "learning_rate": 1.1730769230769232e-05, + "loss": 1.0631, + "step": 550 + }, + { + "epoch": 0.11864044463445918, + "grad_norm": 10.217049598693848, + "learning_rate": 1.1837606837606839e-05, + "loss": 1.0069, + "step": 555 + }, + { + "epoch": 0.11970927746900385, + "grad_norm": 9.567984580993652, + "learning_rate": 1.1944444444444444e-05, + "loss": 0.9301, + "step": 560 + }, + { + "epoch": 0.12077811030354853, + "grad_norm": 7.623697280883789, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.7413, + "step": 565 + }, + { + "epoch": 0.1218469431380932, + "grad_norm": 12.299339294433594, + "learning_rate": 1.2158119658119658e-05, + "loss": 1.0359, + "step": 570 + }, + { + "epoch": 0.12291577597263788, + "grad_norm": 10.420650482177734, + "learning_rate": 1.2264957264957267e-05, + "loss": 0.9793, + "step": 575 + }, + { + "epoch": 0.12398460880718255, + "grad_norm": 9.585742950439453, + "learning_rate": 1.2371794871794874e-05, + "loss": 0.7139, + "step": 580 + }, + { + "epoch": 0.12505344164172724, + "grad_norm": 8.004850387573242, + "learning_rate": 1.247863247863248e-05, + "loss": 1.0595, + "step": 585 + }, + { + "epoch": 0.12612227447627192, + "grad_norm": 9.993664741516113, + "learning_rate": 1.2585470085470086e-05, + "loss": 0.8869, + "step": 590 + }, + { + "epoch": 0.1271911073108166, + "grad_norm": 9.177787780761719, + "learning_rate": 1.2692307692307693e-05, + "loss": 0.8318, + "step": 595 + }, + { + "epoch": 0.12825994014536127, + "grad_norm": 10.756118774414062, + "learning_rate": 1.2799145299145298e-05, + "loss": 0.7328, + "step": 600 + }, + { + "epoch": 0.12932877297990594, + "grad_norm": 9.876399040222168, + "learning_rate": 1.2905982905982907e-05, + "loss": 0.7785, + "step": 605 + }, + { + "epoch": 0.13039760581445062, + "grad_norm": 9.022135734558105, + "learning_rate": 1.3012820512820514e-05, + "loss": 0.8507, + "step": 610 + }, + { + "epoch": 0.1314664386489953, + "grad_norm": 9.838420867919922, + "learning_rate": 1.3119658119658121e-05, + "loss": 1.0039, + "step": 615 + }, + { + "epoch": 0.13253527148353997, + "grad_norm": 11.163064002990723, + "learning_rate": 1.3226495726495728e-05, + "loss": 0.7514, + "step": 620 + }, + { + "epoch": 0.13360410431808464, + "grad_norm": 7.968421459197998, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.6666, + "step": 625 + }, + { + "epoch": 0.13467293715262932, + "grad_norm": 9.873268127441406, + "learning_rate": 1.3440170940170942e-05, + "loss": 0.6639, + "step": 630 + }, + { + "epoch": 0.135741769987174, + "grad_norm": 7.193119049072266, + "learning_rate": 1.3547008547008549e-05, + "loss": 0.77, + "step": 635 + }, + { + "epoch": 0.1368106028217187, + "grad_norm": 9.957530975341797, + "learning_rate": 1.3653846153846156e-05, + "loss": 0.8079, + "step": 640 + }, + { + "epoch": 0.13787943565626337, + "grad_norm": 9.682191848754883, + "learning_rate": 1.3760683760683761e-05, + "loss": 0.8061, + "step": 645 + }, + { + "epoch": 0.13894826849080805, + "grad_norm": 7.611622333526611, + "learning_rate": 1.3867521367521368e-05, + "loss": 0.7666, + "step": 650 + }, + { + "epoch": 0.14001710132535272, + "grad_norm": 9.452914237976074, + "learning_rate": 1.3974358974358975e-05, + "loss": 0.7835, + "step": 655 + }, + { + "epoch": 0.1410859341598974, + "grad_norm": 7.584820747375488, + "learning_rate": 1.4081196581196584e-05, + "loss": 0.7319, + "step": 660 + }, + { + "epoch": 0.14215476699444207, + "grad_norm": 7.296449661254883, + "learning_rate": 1.4188034188034189e-05, + "loss": 0.5945, + "step": 665 + }, + { + "epoch": 0.14322359982898675, + "grad_norm": 8.2069730758667, + "learning_rate": 1.4294871794871796e-05, + "loss": 0.7008, + "step": 670 + }, + { + "epoch": 0.14429243266353142, + "grad_norm": 8.537630081176758, + "learning_rate": 1.4401709401709403e-05, + "loss": 0.9728, + "step": 675 + }, + { + "epoch": 0.1453612654980761, + "grad_norm": 8.83273696899414, + "learning_rate": 1.450854700854701e-05, + "loss": 0.7346, + "step": 680 + }, + { + "epoch": 0.14643009833262077, + "grad_norm": 9.676769256591797, + "learning_rate": 1.4615384615384615e-05, + "loss": 0.8463, + "step": 685 + }, + { + "epoch": 0.14749893116716545, + "grad_norm": 8.45507526397705, + "learning_rate": 1.4722222222222224e-05, + "loss": 0.7251, + "step": 690 + }, + { + "epoch": 0.14856776400171012, + "grad_norm": 9.440534591674805, + "learning_rate": 1.4829059829059831e-05, + "loss": 0.7444, + "step": 695 + }, + { + "epoch": 0.1496365968362548, + "grad_norm": 7.491715431213379, + "learning_rate": 1.4935897435897438e-05, + "loss": 0.6496, + "step": 700 + }, + { + "epoch": 0.15070542967079947, + "grad_norm": 6.313747406005859, + "learning_rate": 1.5042735042735043e-05, + "loss": 0.7623, + "step": 705 + }, + { + "epoch": 0.15177426250534418, + "grad_norm": 7.4156060218811035, + "learning_rate": 1.514957264957265e-05, + "loss": 0.8285, + "step": 710 + }, + { + "epoch": 0.15284309533988885, + "grad_norm": 8.984630584716797, + "learning_rate": 1.5256410256410257e-05, + "loss": 0.886, + "step": 715 + }, + { + "epoch": 0.15391192817443353, + "grad_norm": 7.339222431182861, + "learning_rate": 1.5363247863247866e-05, + "loss": 0.7274, + "step": 720 + }, + { + "epoch": 0.1549807610089782, + "grad_norm": 7.579738140106201, + "learning_rate": 1.5470085470085473e-05, + "loss": 0.8669, + "step": 725 + }, + { + "epoch": 0.15604959384352288, + "grad_norm": 8.190738677978516, + "learning_rate": 1.557692307692308e-05, + "loss": 0.6741, + "step": 730 + }, + { + "epoch": 0.15711842667806755, + "grad_norm": 7.186857223510742, + "learning_rate": 1.5683760683760683e-05, + "loss": 0.7396, + "step": 735 + }, + { + "epoch": 0.15818725951261223, + "grad_norm": 6.874704360961914, + "learning_rate": 1.579059829059829e-05, + "loss": 0.7197, + "step": 740 + }, + { + "epoch": 0.1592560923471569, + "grad_norm": 9.56429386138916, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.9099, + "step": 745 + }, + { + "epoch": 0.16032492518170158, + "grad_norm": 7.520778179168701, + "learning_rate": 1.6004273504273508e-05, + "loss": 0.7344, + "step": 750 + }, + { + "epoch": 0.16139375801624625, + "grad_norm": 5.668506622314453, + "learning_rate": 1.6111111111111115e-05, + "loss": 0.6107, + "step": 755 + }, + { + "epoch": 0.16246259085079093, + "grad_norm": 9.869526863098145, + "learning_rate": 1.6217948717948718e-05, + "loss": 0.6929, + "step": 760 + }, + { + "epoch": 0.1635314236853356, + "grad_norm": 8.029936790466309, + "learning_rate": 1.6324786324786325e-05, + "loss": 0.7019, + "step": 765 + }, + { + "epoch": 0.16460025651988028, + "grad_norm": 8.148101806640625, + "learning_rate": 1.6431623931623932e-05, + "loss": 0.5759, + "step": 770 + }, + { + "epoch": 0.16566908935442498, + "grad_norm": 7.96873664855957, + "learning_rate": 1.653846153846154e-05, + "loss": 0.5736, + "step": 775 + }, + { + "epoch": 0.16673792218896966, + "grad_norm": 9.016830444335938, + "learning_rate": 1.6645299145299146e-05, + "loss": 0.5878, + "step": 780 + }, + { + "epoch": 0.16780675502351433, + "grad_norm": 7.439241886138916, + "learning_rate": 1.6752136752136753e-05, + "loss": 0.7349, + "step": 785 + }, + { + "epoch": 0.168875587858059, + "grad_norm": 9.404788970947266, + "learning_rate": 1.685897435897436e-05, + "loss": 0.7691, + "step": 790 + }, + { + "epoch": 0.16994442069260368, + "grad_norm": 6.773132801055908, + "learning_rate": 1.6965811965811967e-05, + "loss": 0.6228, + "step": 795 + }, + { + "epoch": 0.17101325352714836, + "grad_norm": 6.86265754699707, + "learning_rate": 1.7072649572649574e-05, + "loss": 0.6394, + "step": 800 + }, + { + "epoch": 0.17208208636169303, + "grad_norm": 6.647765159606934, + "learning_rate": 1.717948717948718e-05, + "loss": 0.624, + "step": 805 + }, + { + "epoch": 0.1731509191962377, + "grad_norm": 6.882334232330322, + "learning_rate": 1.7286324786324788e-05, + "loss": 0.6487, + "step": 810 + }, + { + "epoch": 0.17421975203078238, + "grad_norm": 8.620728492736816, + "learning_rate": 1.7393162393162395e-05, + "loss": 0.6001, + "step": 815 + }, + { + "epoch": 0.17528858486532706, + "grad_norm": 7.544363021850586, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.6685, + "step": 820 + }, + { + "epoch": 0.17635741769987173, + "grad_norm": 8.941640853881836, + "learning_rate": 1.760683760683761e-05, + "loss": 0.8176, + "step": 825 + }, + { + "epoch": 0.1774262505344164, + "grad_norm": 6.829235553741455, + "learning_rate": 1.7713675213675216e-05, + "loss": 0.6417, + "step": 830 + }, + { + "epoch": 0.17849508336896108, + "grad_norm": 7.0878705978393555, + "learning_rate": 1.7820512820512823e-05, + "loss": 0.6353, + "step": 835 + }, + { + "epoch": 0.17956391620350576, + "grad_norm": 9.062278747558594, + "learning_rate": 1.792735042735043e-05, + "loss": 0.5946, + "step": 840 + }, + { + "epoch": 0.18063274903805046, + "grad_norm": 7.967255115509033, + "learning_rate": 1.8034188034188037e-05, + "loss": 0.6122, + "step": 845 + }, + { + "epoch": 0.18170158187259514, + "grad_norm": 7.076515197753906, + "learning_rate": 1.8141025641025644e-05, + "loss": 0.6499, + "step": 850 + }, + { + "epoch": 0.1827704147071398, + "grad_norm": 7.658061981201172, + "learning_rate": 1.8247863247863247e-05, + "loss": 0.601, + "step": 855 + }, + { + "epoch": 0.1838392475416845, + "grad_norm": 7.605343341827393, + "learning_rate": 1.8354700854700854e-05, + "loss": 0.5404, + "step": 860 + }, + { + "epoch": 0.18490808037622916, + "grad_norm": 6.7278900146484375, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.6574, + "step": 865 + }, + { + "epoch": 0.18597691321077384, + "grad_norm": 6.190138339996338, + "learning_rate": 1.856837606837607e-05, + "loss": 0.5377, + "step": 870 + }, + { + "epoch": 0.1870457460453185, + "grad_norm": 5.700743198394775, + "learning_rate": 1.867521367521368e-05, + "loss": 0.5537, + "step": 875 + }, + { + "epoch": 0.1881145788798632, + "grad_norm": 9.452567100524902, + "learning_rate": 1.8782051282051282e-05, + "loss": 0.7377, + "step": 880 + }, + { + "epoch": 0.18918341171440786, + "grad_norm": 8.572616577148438, + "learning_rate": 1.888888888888889e-05, + "loss": 0.8466, + "step": 885 + }, + { + "epoch": 0.19025224454895254, + "grad_norm": 6.903915882110596, + "learning_rate": 1.8995726495726496e-05, + "loss": 0.4808, + "step": 890 + }, + { + "epoch": 0.1913210773834972, + "grad_norm": 6.828094959259033, + "learning_rate": 1.9102564102564106e-05, + "loss": 0.5482, + "step": 895 + }, + { + "epoch": 0.1923899102180419, + "grad_norm": 9.153865814208984, + "learning_rate": 1.920940170940171e-05, + "loss": 0.6588, + "step": 900 + }, + { + "epoch": 0.19345874305258656, + "grad_norm": 8.290953636169434, + "learning_rate": 1.9316239316239317e-05, + "loss": 0.6953, + "step": 905 + }, + { + "epoch": 0.19452757588713127, + "grad_norm": 6.111261367797852, + "learning_rate": 1.9423076923076924e-05, + "loss": 0.4803, + "step": 910 + }, + { + "epoch": 0.19559640872167594, + "grad_norm": 8.402656555175781, + "learning_rate": 1.952991452991453e-05, + "loss": 0.6869, + "step": 915 + }, + { + "epoch": 0.19666524155622062, + "grad_norm": 5.929531574249268, + "learning_rate": 1.9636752136752138e-05, + "loss": 0.5393, + "step": 920 + }, + { + "epoch": 0.1977340743907653, + "grad_norm": 7.195873260498047, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.4829, + "step": 925 + }, + { + "epoch": 0.19880290722530997, + "grad_norm": 8.35781192779541, + "learning_rate": 1.9850427350427352e-05, + "loss": 0.5071, + "step": 930 + }, + { + "epoch": 0.19987174005985464, + "grad_norm": 7.967381954193115, + "learning_rate": 1.995726495726496e-05, + "loss": 0.6828, + "step": 935 + }, + { + "epoch": 0.20094057289439932, + "grad_norm": 7.339260101318359, + "learning_rate": 1.999287410926366e-05, + "loss": 0.6468, + "step": 940 + }, + { + "epoch": 0.202009405728944, + "grad_norm": 6.559432029724121, + "learning_rate": 1.9980997624703088e-05, + "loss": 0.6169, + "step": 945 + }, + { + "epoch": 0.20307823856348867, + "grad_norm": 6.627939701080322, + "learning_rate": 1.996912114014252e-05, + "loss": 0.5389, + "step": 950 + }, + { + "epoch": 0.20414707139803334, + "grad_norm": 6.5172119140625, + "learning_rate": 1.995724465558195e-05, + "loss": 0.4527, + "step": 955 + }, + { + "epoch": 0.20521590423257802, + "grad_norm": 6.31046199798584, + "learning_rate": 1.994536817102138e-05, + "loss": 0.5405, + "step": 960 + }, + { + "epoch": 0.2062847370671227, + "grad_norm": 6.0619425773620605, + "learning_rate": 1.993349168646081e-05, + "loss": 0.6583, + "step": 965 + }, + { + "epoch": 0.20735356990166737, + "grad_norm": 7.9872941970825195, + "learning_rate": 1.992161520190024e-05, + "loss": 0.7037, + "step": 970 + }, + { + "epoch": 0.20842240273621207, + "grad_norm": 6.310743808746338, + "learning_rate": 1.9909738717339668e-05, + "loss": 0.564, + "step": 975 + }, + { + "epoch": 0.20949123557075675, + "grad_norm": 6.636473655700684, + "learning_rate": 1.9897862232779098e-05, + "loss": 0.4531, + "step": 980 + }, + { + "epoch": 0.21056006840530142, + "grad_norm": 6.087254047393799, + "learning_rate": 1.988598574821853e-05, + "loss": 0.5454, + "step": 985 + }, + { + "epoch": 0.2116289012398461, + "grad_norm": 6.705723762512207, + "learning_rate": 1.987410926365796e-05, + "loss": 0.4836, + "step": 990 + }, + { + "epoch": 0.21269773407439077, + "grad_norm": 6.238287448883057, + "learning_rate": 1.9862232779097387e-05, + "loss": 0.5458, + "step": 995 + }, + { + "epoch": 0.21376656690893545, + "grad_norm": 5.439404010772705, + "learning_rate": 1.985035629453682e-05, + "loss": 0.5202, + "step": 1000 + }, + { + "epoch": 0.21483539974348012, + "grad_norm": 5.1525654792785645, + "learning_rate": 1.9838479809976248e-05, + "loss": 0.5532, + "step": 1005 + }, + { + "epoch": 0.2159042325780248, + "grad_norm": 6.952949047088623, + "learning_rate": 1.9826603325415678e-05, + "loss": 0.63, + "step": 1010 + }, + { + "epoch": 0.21697306541256947, + "grad_norm": 5.5152788162231445, + "learning_rate": 1.981472684085511e-05, + "loss": 0.6299, + "step": 1015 + }, + { + "epoch": 0.21804189824711415, + "grad_norm": 7.090893745422363, + "learning_rate": 1.980285035629454e-05, + "loss": 0.5405, + "step": 1020 + }, + { + "epoch": 0.21911073108165882, + "grad_norm": 5.758279323577881, + "learning_rate": 1.979097387173397e-05, + "loss": 0.4379, + "step": 1025 + }, + { + "epoch": 0.2201795639162035, + "grad_norm": 10.006664276123047, + "learning_rate": 1.9779097387173397e-05, + "loss": 0.7446, + "step": 1030 + }, + { + "epoch": 0.22124839675074817, + "grad_norm": 6.134273529052734, + "learning_rate": 1.9767220902612828e-05, + "loss": 0.6652, + "step": 1035 + }, + { + "epoch": 0.22231722958529285, + "grad_norm": 7.395203113555908, + "learning_rate": 1.9755344418052258e-05, + "loss": 0.6873, + "step": 1040 + }, + { + "epoch": 0.22338606241983755, + "grad_norm": 4.713867664337158, + "learning_rate": 1.974346793349169e-05, + "loss": 0.4904, + "step": 1045 + }, + { + "epoch": 0.22445489525438223, + "grad_norm": 6.1399359703063965, + "learning_rate": 1.973159144893112e-05, + "loss": 0.5763, + "step": 1050 + }, + { + "epoch": 0.2255237280889269, + "grad_norm": 7.5219879150390625, + "learning_rate": 1.971971496437055e-05, + "loss": 0.5801, + "step": 1055 + }, + { + "epoch": 0.22659256092347158, + "grad_norm": 5.3690619468688965, + "learning_rate": 1.9707838479809977e-05, + "loss": 0.518, + "step": 1060 + }, + { + "epoch": 0.22766139375801625, + "grad_norm": 7.701142311096191, + "learning_rate": 1.9695961995249407e-05, + "loss": 0.5928, + "step": 1065 + }, + { + "epoch": 0.22873022659256093, + "grad_norm": 5.431284427642822, + "learning_rate": 1.9684085510688838e-05, + "loss": 0.5575, + "step": 1070 + }, + { + "epoch": 0.2297990594271056, + "grad_norm": 5.841889381408691, + "learning_rate": 1.967220902612827e-05, + "loss": 0.497, + "step": 1075 + }, + { + "epoch": 0.23086789226165028, + "grad_norm": 6.84688663482666, + "learning_rate": 1.9660332541567696e-05, + "loss": 0.4607, + "step": 1080 + }, + { + "epoch": 0.23193672509619495, + "grad_norm": 7.2094645500183105, + "learning_rate": 1.964845605700713e-05, + "loss": 0.5531, + "step": 1085 + }, + { + "epoch": 0.23300555793073963, + "grad_norm": 8.189807891845703, + "learning_rate": 1.9636579572446557e-05, + "loss": 0.5372, + "step": 1090 + }, + { + "epoch": 0.2340743907652843, + "grad_norm": 6.64928674697876, + "learning_rate": 1.9624703087885987e-05, + "loss": 0.4641, + "step": 1095 + }, + { + "epoch": 0.23514322359982898, + "grad_norm": 5.907129764556885, + "learning_rate": 1.9612826603325418e-05, + "loss": 0.4724, + "step": 1100 + }, + { + "epoch": 0.23621205643437365, + "grad_norm": 5.550957202911377, + "learning_rate": 1.960095011876485e-05, + "loss": 0.4422, + "step": 1105 + }, + { + "epoch": 0.23728088926891835, + "grad_norm": 5.1877899169921875, + "learning_rate": 1.9589073634204276e-05, + "loss": 0.4571, + "step": 1110 + }, + { + "epoch": 0.23834972210346303, + "grad_norm": 6.098719120025635, + "learning_rate": 1.9577197149643706e-05, + "loss": 0.5104, + "step": 1115 + }, + { + "epoch": 0.2394185549380077, + "grad_norm": 5.2909770011901855, + "learning_rate": 1.9565320665083137e-05, + "loss": 0.5217, + "step": 1120 + }, + { + "epoch": 0.24048738777255238, + "grad_norm": 7.134459018707275, + "learning_rate": 1.9553444180522567e-05, + "loss": 0.4481, + "step": 1125 + }, + { + "epoch": 0.24155622060709706, + "grad_norm": 7.295234203338623, + "learning_rate": 1.9541567695961994e-05, + "loss": 0.5244, + "step": 1130 + }, + { + "epoch": 0.24262505344164173, + "grad_norm": 6.2853193283081055, + "learning_rate": 1.952969121140143e-05, + "loss": 0.4593, + "step": 1135 + }, + { + "epoch": 0.2436938862761864, + "grad_norm": 6.563023567199707, + "learning_rate": 1.9517814726840856e-05, + "loss": 0.5566, + "step": 1140 + }, + { + "epoch": 0.24476271911073108, + "grad_norm": 5.089166164398193, + "learning_rate": 1.9505938242280286e-05, + "loss": 0.5473, + "step": 1145 + }, + { + "epoch": 0.24583155194527576, + "grad_norm": 4.706131458282471, + "learning_rate": 1.9494061757719717e-05, + "loss": 0.3649, + "step": 1150 + }, + { + "epoch": 0.24690038477982043, + "grad_norm": 7.967005729675293, + "learning_rate": 1.9482185273159147e-05, + "loss": 0.5466, + "step": 1155 + }, + { + "epoch": 0.2479692176143651, + "grad_norm": 6.625776767730713, + "learning_rate": 1.9470308788598574e-05, + "loss": 0.4687, + "step": 1160 + }, + { + "epoch": 0.24903805044890978, + "grad_norm": 6.631053447723389, + "learning_rate": 1.9458432304038005e-05, + "loss": 0.521, + "step": 1165 + }, + { + "epoch": 0.2501068832834545, + "grad_norm": 5.4099555015563965, + "learning_rate": 1.944655581947744e-05, + "loss": 0.4539, + "step": 1170 + }, + { + "epoch": 0.25117571611799916, + "grad_norm": 6.302776336669922, + "learning_rate": 1.9434679334916866e-05, + "loss": 0.5975, + "step": 1175 + }, + { + "epoch": 0.25224454895254383, + "grad_norm": 6.055430889129639, + "learning_rate": 1.9422802850356297e-05, + "loss": 0.4108, + "step": 1180 + }, + { + "epoch": 0.2533133817870885, + "grad_norm": 6.507791519165039, + "learning_rate": 1.9410926365795727e-05, + "loss": 0.4052, + "step": 1185 + }, + { + "epoch": 0.2543822146216332, + "grad_norm": 4.61367130279541, + "learning_rate": 1.9399049881235158e-05, + "loss": 0.3805, + "step": 1190 + }, + { + "epoch": 0.25545104745617786, + "grad_norm": 7.931022644042969, + "learning_rate": 1.9387173396674585e-05, + "loss": 0.508, + "step": 1195 + }, + { + "epoch": 0.25651988029072254, + "grad_norm": 7.164324760437012, + "learning_rate": 1.9375296912114015e-05, + "loss": 0.4517, + "step": 1200 + }, + { + "epoch": 0.2575887131252672, + "grad_norm": 5.5631890296936035, + "learning_rate": 1.9363420427553446e-05, + "loss": 0.5546, + "step": 1205 + }, + { + "epoch": 0.2586575459598119, + "grad_norm": 6.603108882904053, + "learning_rate": 1.9351543942992876e-05, + "loss": 0.5427, + "step": 1210 + }, + { + "epoch": 0.25972637879435656, + "grad_norm": 6.033792018890381, + "learning_rate": 1.9339667458432304e-05, + "loss": 0.4869, + "step": 1215 + }, + { + "epoch": 0.26079521162890124, + "grad_norm": 6.105428218841553, + "learning_rate": 1.9327790973871738e-05, + "loss": 0.5849, + "step": 1220 + }, + { + "epoch": 0.2618640444634459, + "grad_norm": 5.2770161628723145, + "learning_rate": 1.9315914489311165e-05, + "loss": 0.4608, + "step": 1225 + }, + { + "epoch": 0.2629328772979906, + "grad_norm": 8.29669189453125, + "learning_rate": 1.9304038004750595e-05, + "loss": 0.6211, + "step": 1230 + }, + { + "epoch": 0.26400171013253526, + "grad_norm": 5.556075572967529, + "learning_rate": 1.9292161520190026e-05, + "loss": 0.5136, + "step": 1235 + }, + { + "epoch": 0.26507054296707994, + "grad_norm": 6.262655258178711, + "learning_rate": 1.9280285035629456e-05, + "loss": 0.4788, + "step": 1240 + }, + { + "epoch": 0.2661393758016246, + "grad_norm": 7.117279052734375, + "learning_rate": 1.9268408551068884e-05, + "loss": 0.4763, + "step": 1245 + }, + { + "epoch": 0.2672082086361693, + "grad_norm": 7.450889587402344, + "learning_rate": 1.9256532066508314e-05, + "loss": 0.5398, + "step": 1250 + }, + { + "epoch": 0.26827704147071396, + "grad_norm": 6.718365669250488, + "learning_rate": 1.9244655581947745e-05, + "loss": 0.5782, + "step": 1255 + }, + { + "epoch": 0.26934587430525864, + "grad_norm": 5.374184608459473, + "learning_rate": 1.9232779097387175e-05, + "loss": 0.4568, + "step": 1260 + }, + { + "epoch": 0.2704147071398033, + "grad_norm": 4.485583305358887, + "learning_rate": 1.9220902612826606e-05, + "loss": 0.3874, + "step": 1265 + }, + { + "epoch": 0.271483539974348, + "grad_norm": 5.478529930114746, + "learning_rate": 1.9209026128266036e-05, + "loss": 0.4604, + "step": 1270 + }, + { + "epoch": 0.27255237280889266, + "grad_norm": 5.911350727081299, + "learning_rate": 1.9197149643705463e-05, + "loss": 0.4202, + "step": 1275 + }, + { + "epoch": 0.2736212056434374, + "grad_norm": 7.953678131103516, + "learning_rate": 1.9185273159144894e-05, + "loss": 0.4467, + "step": 1280 + }, + { + "epoch": 0.27469003847798207, + "grad_norm": 6.318038463592529, + "learning_rate": 1.9173396674584325e-05, + "loss": 0.3631, + "step": 1285 + }, + { + "epoch": 0.27575887131252674, + "grad_norm": 7.290485382080078, + "learning_rate": 1.9161520190023755e-05, + "loss": 0.5917, + "step": 1290 + }, + { + "epoch": 0.2768277041470714, + "grad_norm": 6.057776927947998, + "learning_rate": 1.9149643705463182e-05, + "loss": 0.4315, + "step": 1295 + }, + { + "epoch": 0.2778965369816161, + "grad_norm": 5.482032775878906, + "learning_rate": 1.9137767220902613e-05, + "loss": 0.3898, + "step": 1300 + }, + { + "epoch": 0.27896536981616077, + "grad_norm": 7.219336032867432, + "learning_rate": 1.9125890736342047e-05, + "loss": 0.5546, + "step": 1305 + }, + { + "epoch": 0.28003420265070544, + "grad_norm": 6.556499481201172, + "learning_rate": 1.9114014251781474e-05, + "loss": 0.4766, + "step": 1310 + }, + { + "epoch": 0.2811030354852501, + "grad_norm": 8.849128723144531, + "learning_rate": 1.9102137767220904e-05, + "loss": 0.5883, + "step": 1315 + }, + { + "epoch": 0.2821718683197948, + "grad_norm": 6.604886054992676, + "learning_rate": 1.9090261282660335e-05, + "loss": 0.4837, + "step": 1320 + }, + { + "epoch": 0.28324070115433947, + "grad_norm": 6.3507304191589355, + "learning_rate": 1.9078384798099766e-05, + "loss": 0.4576, + "step": 1325 + }, + { + "epoch": 0.28430953398888414, + "grad_norm": 7.592872619628906, + "learning_rate": 1.9066508313539193e-05, + "loss": 0.3894, + "step": 1330 + }, + { + "epoch": 0.2853783668234288, + "grad_norm": 7.806624889373779, + "learning_rate": 1.9054631828978623e-05, + "loss": 0.5239, + "step": 1335 + }, + { + "epoch": 0.2864471996579735, + "grad_norm": 5.70356559753418, + "learning_rate": 1.9042755344418054e-05, + "loss": 0.4255, + "step": 1340 + }, + { + "epoch": 0.28751603249251817, + "grad_norm": 6.017592906951904, + "learning_rate": 1.9030878859857484e-05, + "loss": 0.3944, + "step": 1345 + }, + { + "epoch": 0.28858486532706284, + "grad_norm": 8.678641319274902, + "learning_rate": 1.9019002375296915e-05, + "loss": 0.6219, + "step": 1350 + }, + { + "epoch": 0.2896536981616075, + "grad_norm": 5.638593673706055, + "learning_rate": 1.9007125890736345e-05, + "loss": 0.4791, + "step": 1355 + }, + { + "epoch": 0.2907225309961522, + "grad_norm": 6.662184238433838, + "learning_rate": 1.8995249406175773e-05, + "loss": 0.4101, + "step": 1360 + }, + { + "epoch": 0.29179136383069687, + "grad_norm": 5.850408554077148, + "learning_rate": 1.8983372921615203e-05, + "loss": 0.4422, + "step": 1365 + }, + { + "epoch": 0.29286019666524155, + "grad_norm": 6.298422813415527, + "learning_rate": 1.8971496437054634e-05, + "loss": 0.4426, + "step": 1370 + }, + { + "epoch": 0.2939290294997862, + "grad_norm": 6.113378524780273, + "learning_rate": 1.8959619952494064e-05, + "loss": 0.4108, + "step": 1375 + }, + { + "epoch": 0.2949978623343309, + "grad_norm": 5.136318206787109, + "learning_rate": 1.894774346793349e-05, + "loss": 0.4734, + "step": 1380 + }, + { + "epoch": 0.29606669516887557, + "grad_norm": 7.1877760887146, + "learning_rate": 1.8935866983372922e-05, + "loss": 0.4678, + "step": 1385 + }, + { + "epoch": 0.29713552800342025, + "grad_norm": 7.322067737579346, + "learning_rate": 1.8923990498812352e-05, + "loss": 0.4334, + "step": 1390 + }, + { + "epoch": 0.2982043608379649, + "grad_norm": 4.906497001647949, + "learning_rate": 1.8912114014251783e-05, + "loss": 0.4211, + "step": 1395 + }, + { + "epoch": 0.2992731936725096, + "grad_norm": 4.929844379425049, + "learning_rate": 1.8900237529691214e-05, + "loss": 0.5807, + "step": 1400 + }, + { + "epoch": 0.30034202650705427, + "grad_norm": 6.196166515350342, + "learning_rate": 1.8888361045130644e-05, + "loss": 0.5956, + "step": 1405 + }, + { + "epoch": 0.30141085934159895, + "grad_norm": 5.226170539855957, + "learning_rate": 1.887648456057007e-05, + "loss": 0.4175, + "step": 1410 + }, + { + "epoch": 0.3024796921761437, + "grad_norm": 4.843142509460449, + "learning_rate": 1.8864608076009502e-05, + "loss": 0.4233, + "step": 1415 + }, + { + "epoch": 0.30354852501068835, + "grad_norm": 5.112825393676758, + "learning_rate": 1.8852731591448932e-05, + "loss": 0.4118, + "step": 1420 + }, + { + "epoch": 0.304617357845233, + "grad_norm": 6.756041526794434, + "learning_rate": 1.8840855106888363e-05, + "loss": 0.3919, + "step": 1425 + }, + { + "epoch": 0.3056861906797777, + "grad_norm": 5.811524868011475, + "learning_rate": 1.882897862232779e-05, + "loss": 0.5152, + "step": 1430 + }, + { + "epoch": 0.3067550235143224, + "grad_norm": 5.891305446624756, + "learning_rate": 1.8817102137767224e-05, + "loss": 0.393, + "step": 1435 + }, + { + "epoch": 0.30782385634886705, + "grad_norm": 6.220530986785889, + "learning_rate": 1.880522565320665e-05, + "loss": 0.3765, + "step": 1440 + }, + { + "epoch": 0.30889268918341173, + "grad_norm": 6.09738826751709, + "learning_rate": 1.8793349168646082e-05, + "loss": 0.4476, + "step": 1445 + }, + { + "epoch": 0.3099615220179564, + "grad_norm": 4.718704700469971, + "learning_rate": 1.8781472684085512e-05, + "loss": 0.5094, + "step": 1450 + }, + { + "epoch": 0.3110303548525011, + "grad_norm": 5.264518737792969, + "learning_rate": 1.8769596199524943e-05, + "loss": 0.4496, + "step": 1455 + }, + { + "epoch": 0.31209918768704575, + "grad_norm": 5.551924705505371, + "learning_rate": 1.8757719714964373e-05, + "loss": 0.4305, + "step": 1460 + }, + { + "epoch": 0.31316802052159043, + "grad_norm": 4.252546787261963, + "learning_rate": 1.87458432304038e-05, + "loss": 0.495, + "step": 1465 + }, + { + "epoch": 0.3142368533561351, + "grad_norm": 4.372467517852783, + "learning_rate": 1.8733966745843235e-05, + "loss": 0.5561, + "step": 1470 + }, + { + "epoch": 0.3153056861906798, + "grad_norm": 6.216442108154297, + "learning_rate": 1.872209026128266e-05, + "loss": 0.4432, + "step": 1475 + }, + { + "epoch": 0.31637451902522445, + "grad_norm": 3.8125741481781006, + "learning_rate": 1.8710213776722092e-05, + "loss": 0.3382, + "step": 1480 + }, + { + "epoch": 0.31744335185976913, + "grad_norm": 5.539150714874268, + "learning_rate": 1.8698337292161523e-05, + "loss": 0.4209, + "step": 1485 + }, + { + "epoch": 0.3185121846943138, + "grad_norm": 6.593637466430664, + "learning_rate": 1.8686460807600953e-05, + "loss": 0.3776, + "step": 1490 + }, + { + "epoch": 0.3195810175288585, + "grad_norm": 5.109198570251465, + "learning_rate": 1.867458432304038e-05, + "loss": 0.4271, + "step": 1495 + }, + { + "epoch": 0.32064985036340315, + "grad_norm": 7.083045959472656, + "learning_rate": 1.866270783847981e-05, + "loss": 0.5628, + "step": 1500 + }, + { + "epoch": 0.32171868319794783, + "grad_norm": 7.068709850311279, + "learning_rate": 1.865083135391924e-05, + "loss": 0.4438, + "step": 1505 + }, + { + "epoch": 0.3227875160324925, + "grad_norm": 4.62941312789917, + "learning_rate": 1.8638954869358672e-05, + "loss": 0.3863, + "step": 1510 + }, + { + "epoch": 0.3238563488670372, + "grad_norm": 4.4039788246154785, + "learning_rate": 1.86270783847981e-05, + "loss": 0.5629, + "step": 1515 + }, + { + "epoch": 0.32492518170158186, + "grad_norm": 6.153443813323975, + "learning_rate": 1.8615201900237533e-05, + "loss": 0.4851, + "step": 1520 + }, + { + "epoch": 0.32599401453612653, + "grad_norm": 4.207914352416992, + "learning_rate": 1.860332541567696e-05, + "loss": 0.3386, + "step": 1525 + }, + { + "epoch": 0.3270628473706712, + "grad_norm": 5.669225692749023, + "learning_rate": 1.859144893111639e-05, + "loss": 0.4546, + "step": 1530 + }, + { + "epoch": 0.3281316802052159, + "grad_norm": 6.5213775634765625, + "learning_rate": 1.857957244655582e-05, + "loss": 0.4147, + "step": 1535 + }, + { + "epoch": 0.32920051303976056, + "grad_norm": 5.153679370880127, + "learning_rate": 1.8567695961995252e-05, + "loss": 0.403, + "step": 1540 + }, + { + "epoch": 0.33026934587430523, + "grad_norm": 5.655941009521484, + "learning_rate": 1.855581947743468e-05, + "loss": 0.4155, + "step": 1545 + }, + { + "epoch": 0.33133817870884996, + "grad_norm": 6.6513895988464355, + "learning_rate": 1.854394299287411e-05, + "loss": 0.4831, + "step": 1550 + }, + { + "epoch": 0.33240701154339464, + "grad_norm": 5.994706153869629, + "learning_rate": 1.853206650831354e-05, + "loss": 0.388, + "step": 1555 + }, + { + "epoch": 0.3334758443779393, + "grad_norm": 4.132383346557617, + "learning_rate": 1.852019002375297e-05, + "loss": 0.4184, + "step": 1560 + }, + { + "epoch": 0.334544677212484, + "grad_norm": 4.975070953369141, + "learning_rate": 1.8508313539192398e-05, + "loss": 0.3645, + "step": 1565 + }, + { + "epoch": 0.33561351004702866, + "grad_norm": 6.475266933441162, + "learning_rate": 1.8496437054631832e-05, + "loss": 0.4653, + "step": 1570 + }, + { + "epoch": 0.33668234288157334, + "grad_norm": 5.302603244781494, + "learning_rate": 1.848456057007126e-05, + "loss": 0.5196, + "step": 1575 + }, + { + "epoch": 0.337751175716118, + "grad_norm": 6.404365539550781, + "learning_rate": 1.847268408551069e-05, + "loss": 0.5252, + "step": 1580 + }, + { + "epoch": 0.3388200085506627, + "grad_norm": 5.3015923500061035, + "learning_rate": 1.846080760095012e-05, + "loss": 0.6971, + "step": 1585 + }, + { + "epoch": 0.33988884138520736, + "grad_norm": 6.321039199829102, + "learning_rate": 1.844893111638955e-05, + "loss": 0.3881, + "step": 1590 + }, + { + "epoch": 0.34095767421975204, + "grad_norm": 4.614476680755615, + "learning_rate": 1.843705463182898e-05, + "loss": 0.4302, + "step": 1595 + }, + { + "epoch": 0.3420265070542967, + "grad_norm": 5.174408912658691, + "learning_rate": 1.842517814726841e-05, + "loss": 0.3701, + "step": 1600 + }, + { + "epoch": 0.3430953398888414, + "grad_norm": 4.7469706535339355, + "learning_rate": 1.8413301662707842e-05, + "loss": 0.3893, + "step": 1605 + }, + { + "epoch": 0.34416417272338606, + "grad_norm": 5.967380046844482, + "learning_rate": 1.840142517814727e-05, + "loss": 0.4246, + "step": 1610 + }, + { + "epoch": 0.34523300555793074, + "grad_norm": 4.841580867767334, + "learning_rate": 1.83895486935867e-05, + "loss": 0.3006, + "step": 1615 + }, + { + "epoch": 0.3463018383924754, + "grad_norm": 5.739339351654053, + "learning_rate": 1.837767220902613e-05, + "loss": 0.7078, + "step": 1620 + }, + { + "epoch": 0.3473706712270201, + "grad_norm": 5.888680458068848, + "learning_rate": 1.836579572446556e-05, + "loss": 0.375, + "step": 1625 + }, + { + "epoch": 0.34843950406156476, + "grad_norm": 6.077122211456299, + "learning_rate": 1.835391923990499e-05, + "loss": 0.4425, + "step": 1630 + }, + { + "epoch": 0.34950833689610944, + "grad_norm": 6.087640762329102, + "learning_rate": 1.834204275534442e-05, + "loss": 0.3841, + "step": 1635 + }, + { + "epoch": 0.3505771697306541, + "grad_norm": 7.3536529541015625, + "learning_rate": 1.833016627078385e-05, + "loss": 0.4075, + "step": 1640 + }, + { + "epoch": 0.3516460025651988, + "grad_norm": 6.833067893981934, + "learning_rate": 1.831828978622328e-05, + "loss": 0.5383, + "step": 1645 + }, + { + "epoch": 0.35271483539974346, + "grad_norm": 5.849217414855957, + "learning_rate": 1.8306413301662707e-05, + "loss": 0.4623, + "step": 1650 + }, + { + "epoch": 0.35378366823428814, + "grad_norm": 5.285182952880859, + "learning_rate": 1.829453681710214e-05, + "loss": 0.3986, + "step": 1655 + }, + { + "epoch": 0.3548525010688328, + "grad_norm": 5.706902980804443, + "learning_rate": 1.8282660332541568e-05, + "loss": 0.4038, + "step": 1660 + }, + { + "epoch": 0.3559213339033775, + "grad_norm": 4.221705436706543, + "learning_rate": 1.8270783847981e-05, + "loss": 0.5615, + "step": 1665 + }, + { + "epoch": 0.35699016673792217, + "grad_norm": 6.5307745933532715, + "learning_rate": 1.825890736342043e-05, + "loss": 0.5535, + "step": 1670 + }, + { + "epoch": 0.35805899957246684, + "grad_norm": 5.936892509460449, + "learning_rate": 1.824703087885986e-05, + "loss": 0.3316, + "step": 1675 + }, + { + "epoch": 0.3591278324070115, + "grad_norm": 4.413790702819824, + "learning_rate": 1.8235154394299287e-05, + "loss": 0.3916, + "step": 1680 + }, + { + "epoch": 0.36019666524155625, + "grad_norm": 5.399665355682373, + "learning_rate": 1.8223277909738718e-05, + "loss": 0.3723, + "step": 1685 + }, + { + "epoch": 0.3612654980761009, + "grad_norm": 8.413554191589355, + "learning_rate": 1.8211401425178148e-05, + "loss": 0.5188, + "step": 1690 + }, + { + "epoch": 0.3623343309106456, + "grad_norm": 3.7601664066314697, + "learning_rate": 1.819952494061758e-05, + "loss": 0.3817, + "step": 1695 + }, + { + "epoch": 0.36340316374519027, + "grad_norm": 5.661569595336914, + "learning_rate": 1.818764845605701e-05, + "loss": 0.4036, + "step": 1700 + }, + { + "epoch": 0.36447199657973495, + "grad_norm": 6.07588005065918, + "learning_rate": 1.817577197149644e-05, + "loss": 0.4224, + "step": 1705 + }, + { + "epoch": 0.3655408294142796, + "grad_norm": 5.329127311706543, + "learning_rate": 1.8163895486935867e-05, + "loss": 0.4171, + "step": 1710 + }, + { + "epoch": 0.3666096622488243, + "grad_norm": 7.156865119934082, + "learning_rate": 1.8152019002375298e-05, + "loss": 0.4122, + "step": 1715 + }, + { + "epoch": 0.367678495083369, + "grad_norm": 5.72195291519165, + "learning_rate": 1.8140142517814728e-05, + "loss": 0.4024, + "step": 1720 + }, + { + "epoch": 0.36874732791791365, + "grad_norm": 4.991401672363281, + "learning_rate": 1.812826603325416e-05, + "loss": 0.3882, + "step": 1725 + }, + { + "epoch": 0.3698161607524583, + "grad_norm": 4.662073612213135, + "learning_rate": 1.811638954869359e-05, + "loss": 0.3282, + "step": 1730 + }, + { + "epoch": 0.370884993587003, + "grad_norm": 5.966677188873291, + "learning_rate": 1.8104513064133016e-05, + "loss": 0.2961, + "step": 1735 + }, + { + "epoch": 0.3719538264215477, + "grad_norm": 5.708690166473389, + "learning_rate": 1.809263657957245e-05, + "loss": 0.4568, + "step": 1740 + }, + { + "epoch": 0.37302265925609235, + "grad_norm": 5.69785213470459, + "learning_rate": 1.8080760095011877e-05, + "loss": 0.3544, + "step": 1745 + }, + { + "epoch": 0.374091492090637, + "grad_norm": 6.101360321044922, + "learning_rate": 1.8068883610451308e-05, + "loss": 0.4282, + "step": 1750 + }, + { + "epoch": 0.3751603249251817, + "grad_norm": 6.585791110992432, + "learning_rate": 1.805700712589074e-05, + "loss": 0.3798, + "step": 1755 + }, + { + "epoch": 0.3762291577597264, + "grad_norm": 5.618402481079102, + "learning_rate": 1.804513064133017e-05, + "loss": 0.4618, + "step": 1760 + }, + { + "epoch": 0.37729799059427105, + "grad_norm": 6.637610912322998, + "learning_rate": 1.8033254156769596e-05, + "loss": 0.3876, + "step": 1765 + }, + { + "epoch": 0.3783668234288157, + "grad_norm": 4.9898295402526855, + "learning_rate": 1.8021377672209027e-05, + "loss": 0.3393, + "step": 1770 + }, + { + "epoch": 0.3794356562633604, + "grad_norm": 6.182595252990723, + "learning_rate": 1.8009501187648457e-05, + "loss": 0.4276, + "step": 1775 + }, + { + "epoch": 0.3805044890979051, + "grad_norm": 5.460147380828857, + "learning_rate": 1.7997624703087888e-05, + "loss": 0.4641, + "step": 1780 + }, + { + "epoch": 0.38157332193244975, + "grad_norm": 4.0731940269470215, + "learning_rate": 1.798574821852732e-05, + "loss": 0.4248, + "step": 1785 + }, + { + "epoch": 0.3826421547669944, + "grad_norm": 3.6468496322631836, + "learning_rate": 1.797387173396675e-05, + "loss": 0.3601, + "step": 1790 + }, + { + "epoch": 0.3837109876015391, + "grad_norm": 3.701404094696045, + "learning_rate": 1.7961995249406176e-05, + "loss": 0.3384, + "step": 1795 + }, + { + "epoch": 0.3847798204360838, + "grad_norm": 6.082109451293945, + "learning_rate": 1.7950118764845607e-05, + "loss": 0.3598, + "step": 1800 + }, + { + "epoch": 0.38584865327062845, + "grad_norm": 4.901666164398193, + "learning_rate": 1.7938242280285037e-05, + "loss": 0.4363, + "step": 1805 + }, + { + "epoch": 0.3869174861051731, + "grad_norm": 3.848799467086792, + "learning_rate": 1.7926365795724468e-05, + "loss": 0.335, + "step": 1810 + }, + { + "epoch": 0.38798631893971786, + "grad_norm": 4.457520484924316, + "learning_rate": 1.7914489311163895e-05, + "loss": 0.3892, + "step": 1815 + }, + { + "epoch": 0.38905515177426253, + "grad_norm": 6.423126697540283, + "learning_rate": 1.7902612826603326e-05, + "loss": 0.4103, + "step": 1820 + }, + { + "epoch": 0.3901239846088072, + "grad_norm": 5.50001335144043, + "learning_rate": 1.7890736342042756e-05, + "loss": 0.3959, + "step": 1825 + }, + { + "epoch": 0.3911928174433519, + "grad_norm": 3.85994553565979, + "learning_rate": 1.7878859857482187e-05, + "loss": 0.3593, + "step": 1830 + }, + { + "epoch": 0.39226165027789656, + "grad_norm": 6.009896278381348, + "learning_rate": 1.7866983372921617e-05, + "loss": 0.4366, + "step": 1835 + }, + { + "epoch": 0.39333048311244123, + "grad_norm": 4.844223499298096, + "learning_rate": 1.7855106888361048e-05, + "loss": 0.3633, + "step": 1840 + }, + { + "epoch": 0.3943993159469859, + "grad_norm": 5.032964706420898, + "learning_rate": 1.7843230403800475e-05, + "loss": 0.4333, + "step": 1845 + }, + { + "epoch": 0.3954681487815306, + "grad_norm": 5.1685872077941895, + "learning_rate": 1.7831353919239905e-05, + "loss": 0.4825, + "step": 1850 + }, + { + "epoch": 0.39653698161607526, + "grad_norm": 5.741828918457031, + "learning_rate": 1.7819477434679336e-05, + "loss": 0.3041, + "step": 1855 + }, + { + "epoch": 0.39760581445061993, + "grad_norm": 5.440220832824707, + "learning_rate": 1.7807600950118767e-05, + "loss": 0.3416, + "step": 1860 + }, + { + "epoch": 0.3986746472851646, + "grad_norm": 4.476759433746338, + "learning_rate": 1.7795724465558197e-05, + "loss": 0.4578, + "step": 1865 + }, + { + "epoch": 0.3997434801197093, + "grad_norm": 6.7310991287231445, + "learning_rate": 1.7783847980997628e-05, + "loss": 0.5441, + "step": 1870 + }, + { + "epoch": 0.40081231295425396, + "grad_norm": 5.929594993591309, + "learning_rate": 1.7771971496437058e-05, + "loss": 0.5124, + "step": 1875 + }, + { + "epoch": 0.40188114578879863, + "grad_norm": 4.516419410705566, + "learning_rate": 1.7760095011876485e-05, + "loss": 0.4026, + "step": 1880 + }, + { + "epoch": 0.4029499786233433, + "grad_norm": 5.7698798179626465, + "learning_rate": 1.7748218527315916e-05, + "loss": 0.4537, + "step": 1885 + }, + { + "epoch": 0.404018811457888, + "grad_norm": 4.604269027709961, + "learning_rate": 1.7736342042755346e-05, + "loss": 0.3983, + "step": 1890 + }, + { + "epoch": 0.40508764429243266, + "grad_norm": 6.4217610359191895, + "learning_rate": 1.7724465558194777e-05, + "loss": 0.3894, + "step": 1895 + }, + { + "epoch": 0.40615647712697733, + "grad_norm": 5.296751022338867, + "learning_rate": 1.7712589073634204e-05, + "loss": 0.4747, + "step": 1900 + }, + { + "epoch": 0.407225309961522, + "grad_norm": 4.870068550109863, + "learning_rate": 1.7700712589073638e-05, + "loss": 0.4042, + "step": 1905 + }, + { + "epoch": 0.4082941427960667, + "grad_norm": 4.312191486358643, + "learning_rate": 1.7688836104513065e-05, + "loss": 0.3477, + "step": 1910 + }, + { + "epoch": 0.40936297563061136, + "grad_norm": 5.281498432159424, + "learning_rate": 1.7676959619952496e-05, + "loss": 0.4512, + "step": 1915 + }, + { + "epoch": 0.41043180846515603, + "grad_norm": 4.401067733764648, + "learning_rate": 1.7665083135391926e-05, + "loss": 0.3451, + "step": 1920 + }, + { + "epoch": 0.4115006412997007, + "grad_norm": 5.28626012802124, + "learning_rate": 1.7653206650831357e-05, + "loss": 0.3983, + "step": 1925 + }, + { + "epoch": 0.4125694741342454, + "grad_norm": 5.951436519622803, + "learning_rate": 1.7641330166270784e-05, + "loss": 0.62, + "step": 1930 + }, + { + "epoch": 0.41363830696879006, + "grad_norm": 3.4126088619232178, + "learning_rate": 1.7629453681710215e-05, + "loss": 0.4632, + "step": 1935 + }, + { + "epoch": 0.41470713980333473, + "grad_norm": 4.540611267089844, + "learning_rate": 1.7617577197149645e-05, + "loss": 0.3757, + "step": 1940 + }, + { + "epoch": 0.4157759726378794, + "grad_norm": 5.913720607757568, + "learning_rate": 1.7605700712589076e-05, + "loss": 0.4752, + "step": 1945 + }, + { + "epoch": 0.41684480547242414, + "grad_norm": 4.386907577514648, + "learning_rate": 1.7593824228028503e-05, + "loss": 0.453, + "step": 1950 + }, + { + "epoch": 0.4179136383069688, + "grad_norm": 4.836590766906738, + "learning_rate": 1.7581947743467937e-05, + "loss": 0.4348, + "step": 1955 + }, + { + "epoch": 0.4189824711415135, + "grad_norm": 4.215417861938477, + "learning_rate": 1.7570071258907364e-05, + "loss": 0.3944, + "step": 1960 + }, + { + "epoch": 0.42005130397605817, + "grad_norm": 5.9303789138793945, + "learning_rate": 1.7558194774346795e-05, + "loss": 0.3702, + "step": 1965 + }, + { + "epoch": 0.42112013681060284, + "grad_norm": 5.648311138153076, + "learning_rate": 1.7546318289786225e-05, + "loss": 0.3888, + "step": 1970 + }, + { + "epoch": 0.4221889696451475, + "grad_norm": 5.413701057434082, + "learning_rate": 1.7534441805225656e-05, + "loss": 0.3968, + "step": 1975 + }, + { + "epoch": 0.4232578024796922, + "grad_norm": 4.331090450286865, + "learning_rate": 1.7522565320665083e-05, + "loss": 0.3609, + "step": 1980 + }, + { + "epoch": 0.42432663531423687, + "grad_norm": 4.991115093231201, + "learning_rate": 1.7510688836104513e-05, + "loss": 0.4328, + "step": 1985 + }, + { + "epoch": 0.42539546814878154, + "grad_norm": 5.451033115386963, + "learning_rate": 1.7498812351543944e-05, + "loss": 0.516, + "step": 1990 + }, + { + "epoch": 0.4264643009833262, + "grad_norm": 5.011542320251465, + "learning_rate": 1.7486935866983374e-05, + "loss": 0.3605, + "step": 1995 + }, + { + "epoch": 0.4275331338178709, + "grad_norm": 5.4983086585998535, + "learning_rate": 1.74750593824228e-05, + "loss": 0.3094, + "step": 2000 + }, + { + "epoch": 0.42860196665241557, + "grad_norm": 5.928680896759033, + "learning_rate": 1.7463182897862236e-05, + "loss": 0.3866, + "step": 2005 + }, + { + "epoch": 0.42967079948696024, + "grad_norm": 4.630986213684082, + "learning_rate": 1.7451306413301666e-05, + "loss": 0.3943, + "step": 2010 + }, + { + "epoch": 0.4307396323215049, + "grad_norm": 4.091104030609131, + "learning_rate": 1.7439429928741093e-05, + "loss": 0.3931, + "step": 2015 + }, + { + "epoch": 0.4318084651560496, + "grad_norm": 6.031238555908203, + "learning_rate": 1.7427553444180524e-05, + "loss": 0.3728, + "step": 2020 + }, + { + "epoch": 0.43287729799059427, + "grad_norm": 4.81741189956665, + "learning_rate": 1.7415676959619954e-05, + "loss": 0.2953, + "step": 2025 + }, + { + "epoch": 0.43394613082513894, + "grad_norm": 5.144311904907227, + "learning_rate": 1.7403800475059385e-05, + "loss": 0.3547, + "step": 2030 + }, + { + "epoch": 0.4350149636596836, + "grad_norm": 4.806643009185791, + "learning_rate": 1.7391923990498812e-05, + "loss": 0.4275, + "step": 2035 + }, + { + "epoch": 0.4360837964942283, + "grad_norm": 4.138782501220703, + "learning_rate": 1.7380047505938246e-05, + "loss": 0.2959, + "step": 2040 + }, + { + "epoch": 0.43715262932877297, + "grad_norm": 5.7593255043029785, + "learning_rate": 1.7368171021377673e-05, + "loss": 0.3245, + "step": 2045 + }, + { + "epoch": 0.43822146216331764, + "grad_norm": 4.043095588684082, + "learning_rate": 1.7356294536817104e-05, + "loss": 0.4148, + "step": 2050 + }, + { + "epoch": 0.4392902949978623, + "grad_norm": 4.848685264587402, + "learning_rate": 1.7344418052256534e-05, + "loss": 0.3542, + "step": 2055 + }, + { + "epoch": 0.440359127832407, + "grad_norm": 5.738672256469727, + "learning_rate": 1.7332541567695965e-05, + "loss": 0.4493, + "step": 2060 + }, + { + "epoch": 0.44142796066695167, + "grad_norm": 4.470565319061279, + "learning_rate": 1.7320665083135392e-05, + "loss": 0.376, + "step": 2065 + }, + { + "epoch": 0.44249679350149634, + "grad_norm": 4.22749137878418, + "learning_rate": 1.7308788598574823e-05, + "loss": 0.322, + "step": 2070 + }, + { + "epoch": 0.443565626336041, + "grad_norm": 5.158305644989014, + "learning_rate": 1.7296912114014253e-05, + "loss": 0.3227, + "step": 2075 + }, + { + "epoch": 0.4446344591705857, + "grad_norm": 6.257720947265625, + "learning_rate": 1.7285035629453684e-05, + "loss": 0.3046, + "step": 2080 + }, + { + "epoch": 0.4457032920051304, + "grad_norm": 5.981179237365723, + "learning_rate": 1.727315914489311e-05, + "loss": 0.3064, + "step": 2085 + }, + { + "epoch": 0.4467721248396751, + "grad_norm": 5.584667682647705, + "learning_rate": 1.7261282660332545e-05, + "loss": 0.3199, + "step": 2090 + }, + { + "epoch": 0.4478409576742198, + "grad_norm": 5.660790920257568, + "learning_rate": 1.7249406175771972e-05, + "loss": 0.3774, + "step": 2095 + }, + { + "epoch": 0.44890979050876445, + "grad_norm": 4.129720687866211, + "learning_rate": 1.7237529691211402e-05, + "loss": 0.3212, + "step": 2100 + }, + { + "epoch": 0.4499786233433091, + "grad_norm": 3.2054107189178467, + "learning_rate": 1.7225653206650833e-05, + "loss": 0.3302, + "step": 2105 + }, + { + "epoch": 0.4510474561778538, + "grad_norm": 3.934522867202759, + "learning_rate": 1.7213776722090264e-05, + "loss": 0.3205, + "step": 2110 + }, + { + "epoch": 0.4521162890123985, + "grad_norm": 5.592263221740723, + "learning_rate": 1.720190023752969e-05, + "loss": 0.3673, + "step": 2115 + }, + { + "epoch": 0.45318512184694315, + "grad_norm": 5.707674026489258, + "learning_rate": 1.719002375296912e-05, + "loss": 0.3752, + "step": 2120 + }, + { + "epoch": 0.4542539546814878, + "grad_norm": 4.284328937530518, + "learning_rate": 1.7178147268408552e-05, + "loss": 0.3192, + "step": 2125 + }, + { + "epoch": 0.4553227875160325, + "grad_norm": 4.87931489944458, + "learning_rate": 1.7166270783847982e-05, + "loss": 0.3372, + "step": 2130 + }, + { + "epoch": 0.4563916203505772, + "grad_norm": 5.3206048011779785, + "learning_rate": 1.7154394299287413e-05, + "loss": 0.321, + "step": 2135 + }, + { + "epoch": 0.45746045318512185, + "grad_norm": 5.118194103240967, + "learning_rate": 1.7142517814726843e-05, + "loss": 0.4086, + "step": 2140 + }, + { + "epoch": 0.4585292860196665, + "grad_norm": 5.390005111694336, + "learning_rate": 1.7130641330166274e-05, + "loss": 0.4092, + "step": 2145 + }, + { + "epoch": 0.4595981188542112, + "grad_norm": 6.221261978149414, + "learning_rate": 1.71187648456057e-05, + "loss": 0.4591, + "step": 2150 + }, + { + "epoch": 0.4606669516887559, + "grad_norm": 4.9464497566223145, + "learning_rate": 1.7106888361045132e-05, + "loss": 0.503, + "step": 2155 + }, + { + "epoch": 0.46173578452330055, + "grad_norm": 6.745388984680176, + "learning_rate": 1.7095011876484562e-05, + "loss": 0.4767, + "step": 2160 + }, + { + "epoch": 0.4628046173578452, + "grad_norm": 5.506555080413818, + "learning_rate": 1.7083135391923993e-05, + "loss": 0.3425, + "step": 2165 + }, + { + "epoch": 0.4638734501923899, + "grad_norm": 5.21577787399292, + "learning_rate": 1.707125890736342e-05, + "loss": 0.372, + "step": 2170 + }, + { + "epoch": 0.4649422830269346, + "grad_norm": 4.69103479385376, + "learning_rate": 1.7059382422802854e-05, + "loss": 0.4671, + "step": 2175 + }, + { + "epoch": 0.46601111586147925, + "grad_norm": 4.060796737670898, + "learning_rate": 1.704750593824228e-05, + "loss": 0.3767, + "step": 2180 + }, + { + "epoch": 0.4670799486960239, + "grad_norm": 6.448695659637451, + "learning_rate": 1.703562945368171e-05, + "loss": 0.3096, + "step": 2185 + }, + { + "epoch": 0.4681487815305686, + "grad_norm": 4.255459308624268, + "learning_rate": 1.7023752969121142e-05, + "loss": 0.3654, + "step": 2190 + }, + { + "epoch": 0.4692176143651133, + "grad_norm": 5.383869647979736, + "learning_rate": 1.7011876484560573e-05, + "loss": 0.4243, + "step": 2195 + }, + { + "epoch": 0.47028644719965795, + "grad_norm": 4.97196102142334, + "learning_rate": 1.7e-05, + "loss": 0.4411, + "step": 2200 + }, + { + "epoch": 0.47135528003420263, + "grad_norm": 4.9628071784973145, + "learning_rate": 1.698812351543943e-05, + "loss": 0.5001, + "step": 2205 + }, + { + "epoch": 0.4724241128687473, + "grad_norm": 5.05242919921875, + "learning_rate": 1.697624703087886e-05, + "loss": 0.3485, + "step": 2210 + }, + { + "epoch": 0.473492945703292, + "grad_norm": 4.373459339141846, + "learning_rate": 1.696437054631829e-05, + "loss": 0.3546, + "step": 2215 + }, + { + "epoch": 0.4745617785378367, + "grad_norm": 5.0651397705078125, + "learning_rate": 1.6952494061757722e-05, + "loss": 0.4037, + "step": 2220 + }, + { + "epoch": 0.4756306113723814, + "grad_norm": 6.026737213134766, + "learning_rate": 1.6940617577197153e-05, + "loss": 0.3114, + "step": 2225 + }, + { + "epoch": 0.47669944420692606, + "grad_norm": 4.404332160949707, + "learning_rate": 1.692874109263658e-05, + "loss": 0.3422, + "step": 2230 + }, + { + "epoch": 0.47776827704147073, + "grad_norm": 5.780966281890869, + "learning_rate": 1.691686460807601e-05, + "loss": 0.4241, + "step": 2235 + }, + { + "epoch": 0.4788371098760154, + "grad_norm": 5.648661136627197, + "learning_rate": 1.690498812351544e-05, + "loss": 0.3912, + "step": 2240 + }, + { + "epoch": 0.4799059427105601, + "grad_norm": 3.616197109222412, + "learning_rate": 1.689311163895487e-05, + "loss": 0.3748, + "step": 2245 + }, + { + "epoch": 0.48097477554510476, + "grad_norm": 4.634115219116211, + "learning_rate": 1.68812351543943e-05, + "loss": 0.3746, + "step": 2250 + }, + { + "epoch": 0.48204360837964944, + "grad_norm": 4.268435478210449, + "learning_rate": 1.686935866983373e-05, + "loss": 0.3544, + "step": 2255 + }, + { + "epoch": 0.4831124412141941, + "grad_norm": 4.208693504333496, + "learning_rate": 1.685748218527316e-05, + "loss": 0.3246, + "step": 2260 + }, + { + "epoch": 0.4841812740487388, + "grad_norm": 7.521546840667725, + "learning_rate": 1.684560570071259e-05, + "loss": 0.3739, + "step": 2265 + }, + { + "epoch": 0.48525010688328346, + "grad_norm": 5.12343692779541, + "learning_rate": 1.683372921615202e-05, + "loss": 0.3606, + "step": 2270 + }, + { + "epoch": 0.48631893971782814, + "grad_norm": 6.54265022277832, + "learning_rate": 1.682185273159145e-05, + "loss": 0.3891, + "step": 2275 + }, + { + "epoch": 0.4873877725523728, + "grad_norm": 4.471118450164795, + "learning_rate": 1.680997624703088e-05, + "loss": 0.2855, + "step": 2280 + }, + { + "epoch": 0.4884566053869175, + "grad_norm": 7.488130569458008, + "learning_rate": 1.679809976247031e-05, + "loss": 0.4829, + "step": 2285 + }, + { + "epoch": 0.48952543822146216, + "grad_norm": 6.3466033935546875, + "learning_rate": 1.678622327790974e-05, + "loss": 0.4529, + "step": 2290 + }, + { + "epoch": 0.49059427105600684, + "grad_norm": 7.353418350219727, + "learning_rate": 1.677434679334917e-05, + "loss": 0.4819, + "step": 2295 + }, + { + "epoch": 0.4916631038905515, + "grad_norm": 4.575865745544434, + "learning_rate": 1.67624703087886e-05, + "loss": 0.3453, + "step": 2300 + }, + { + "epoch": 0.4927319367250962, + "grad_norm": 4.988368511199951, + "learning_rate": 1.675059382422803e-05, + "loss": 0.3425, + "step": 2305 + }, + { + "epoch": 0.49380076955964086, + "grad_norm": 5.05146598815918, + "learning_rate": 1.6738717339667462e-05, + "loss": 0.2884, + "step": 2310 + }, + { + "epoch": 0.49486960239418554, + "grad_norm": 6.10252571105957, + "learning_rate": 1.672684085510689e-05, + "loss": 0.3184, + "step": 2315 + }, + { + "epoch": 0.4959384352287302, + "grad_norm": 5.356700420379639, + "learning_rate": 1.671496437054632e-05, + "loss": 0.3043, + "step": 2320 + }, + { + "epoch": 0.4970072680632749, + "grad_norm": 4.550732135772705, + "learning_rate": 1.670308788598575e-05, + "loss": 0.3746, + "step": 2325 + }, + { + "epoch": 0.49807610089781956, + "grad_norm": 4.781940937042236, + "learning_rate": 1.669121140142518e-05, + "loss": 0.4023, + "step": 2330 + }, + { + "epoch": 0.49914493373236424, + "grad_norm": 3.1689300537109375, + "learning_rate": 1.6679334916864608e-05, + "loss": 0.2994, + "step": 2335 + }, + { + "epoch": 0.500213766566909, + "grad_norm": 5.919034004211426, + "learning_rate": 1.6667458432304042e-05, + "loss": 0.3858, + "step": 2340 + }, + { + "epoch": 0.5012825994014536, + "grad_norm": 4.044144153594971, + "learning_rate": 1.665558194774347e-05, + "loss": 0.3488, + "step": 2345 + }, + { + "epoch": 0.5023514322359983, + "grad_norm": 5.063786506652832, + "learning_rate": 1.66437054631829e-05, + "loss": 0.4467, + "step": 2350 + }, + { + "epoch": 0.5034202650705429, + "grad_norm": 4.159796237945557, + "learning_rate": 1.663182897862233e-05, + "loss": 0.3199, + "step": 2355 + }, + { + "epoch": 0.5044890979050877, + "grad_norm": 4.232370853424072, + "learning_rate": 1.661995249406176e-05, + "loss": 0.3124, + "step": 2360 + }, + { + "epoch": 0.5055579307396323, + "grad_norm": 3.8301782608032227, + "learning_rate": 1.6608076009501188e-05, + "loss": 0.3229, + "step": 2365 + }, + { + "epoch": 0.506626763574177, + "grad_norm": 5.729179382324219, + "learning_rate": 1.6596199524940618e-05, + "loss": 0.3416, + "step": 2370 + }, + { + "epoch": 0.5076955964087216, + "grad_norm": 4.137636184692383, + "learning_rate": 1.658432304038005e-05, + "loss": 0.3555, + "step": 2375 + }, + { + "epoch": 0.5087644292432664, + "grad_norm": 6.014377593994141, + "learning_rate": 1.657244655581948e-05, + "loss": 0.3078, + "step": 2380 + }, + { + "epoch": 0.509833262077811, + "grad_norm": 5.031920909881592, + "learning_rate": 1.6560570071258906e-05, + "loss": 0.3105, + "step": 2385 + }, + { + "epoch": 0.5109020949123557, + "grad_norm": 4.162966728210449, + "learning_rate": 1.654869358669834e-05, + "loss": 0.3565, + "step": 2390 + }, + { + "epoch": 0.5119709277469003, + "grad_norm": 5.57382345199585, + "learning_rate": 1.6536817102137768e-05, + "loss": 0.3113, + "step": 2395 + }, + { + "epoch": 0.5130397605814451, + "grad_norm": 6.443201065063477, + "learning_rate": 1.6524940617577198e-05, + "loss": 0.4326, + "step": 2400 + }, + { + "epoch": 0.5141085934159897, + "grad_norm": 7.050893306732178, + "learning_rate": 1.651306413301663e-05, + "loss": 0.4395, + "step": 2405 + }, + { + "epoch": 0.5151774262505344, + "grad_norm": 4.315305709838867, + "learning_rate": 1.650118764845606e-05, + "loss": 0.3549, + "step": 2410 + }, + { + "epoch": 0.516246259085079, + "grad_norm": 3.76841402053833, + "learning_rate": 1.6489311163895486e-05, + "loss": 0.4271, + "step": 2415 + }, + { + "epoch": 0.5173150919196238, + "grad_norm": 4.878926753997803, + "learning_rate": 1.6477434679334917e-05, + "loss": 0.3136, + "step": 2420 + }, + { + "epoch": 0.5183839247541685, + "grad_norm": 4.831075668334961, + "learning_rate": 1.646555819477435e-05, + "loss": 0.3235, + "step": 2425 + }, + { + "epoch": 0.5194527575887131, + "grad_norm": 4.886428356170654, + "learning_rate": 1.6453681710213778e-05, + "loss": 0.2909, + "step": 2430 + }, + { + "epoch": 0.5205215904232579, + "grad_norm": 5.281339645385742, + "learning_rate": 1.644180522565321e-05, + "loss": 0.3296, + "step": 2435 + }, + { + "epoch": 0.5215904232578025, + "grad_norm": 4.9752516746521, + "learning_rate": 1.642992874109264e-05, + "loss": 0.4124, + "step": 2440 + }, + { + "epoch": 0.5226592560923472, + "grad_norm": 5.5705952644348145, + "learning_rate": 1.641805225653207e-05, + "loss": 0.4444, + "step": 2445 + }, + { + "epoch": 0.5237280889268918, + "grad_norm": 4.4641499519348145, + "learning_rate": 1.6406175771971497e-05, + "loss": 0.3649, + "step": 2450 + }, + { + "epoch": 0.5247969217614366, + "grad_norm": 4.909672260284424, + "learning_rate": 1.6394299287410927e-05, + "loss": 0.3897, + "step": 2455 + }, + { + "epoch": 0.5258657545959812, + "grad_norm": 5.340948581695557, + "learning_rate": 1.6382422802850358e-05, + "loss": 0.36, + "step": 2460 + }, + { + "epoch": 0.5269345874305259, + "grad_norm": 5.204975128173828, + "learning_rate": 1.637054631828979e-05, + "loss": 0.3899, + "step": 2465 + }, + { + "epoch": 0.5280034202650705, + "grad_norm": 5.030284881591797, + "learning_rate": 1.6358669833729216e-05, + "loss": 0.3303, + "step": 2470 + }, + { + "epoch": 0.5290722530996153, + "grad_norm": 3.7952535152435303, + "learning_rate": 1.634679334916865e-05, + "loss": 0.3115, + "step": 2475 + }, + { + "epoch": 0.5301410859341599, + "grad_norm": 5.823569297790527, + "learning_rate": 1.6334916864608077e-05, + "loss": 0.4637, + "step": 2480 + }, + { + "epoch": 0.5312099187687046, + "grad_norm": 6.1813483238220215, + "learning_rate": 1.6323040380047507e-05, + "loss": 0.4402, + "step": 2485 + }, + { + "epoch": 0.5322787516032492, + "grad_norm": 3.668980360031128, + "learning_rate": 1.6311163895486938e-05, + "loss": 0.2825, + "step": 2490 + }, + { + "epoch": 0.533347584437794, + "grad_norm": 4.954606056213379, + "learning_rate": 1.629928741092637e-05, + "loss": 0.3389, + "step": 2495 + }, + { + "epoch": 0.5344164172723386, + "grad_norm": 4.136919021606445, + "learning_rate": 1.6287410926365796e-05, + "loss": 0.3513, + "step": 2500 + }, + { + "epoch": 0.5354852501068833, + "grad_norm": 5.383963108062744, + "learning_rate": 1.6275534441805226e-05, + "loss": 0.4301, + "step": 2505 + }, + { + "epoch": 0.5365540829414279, + "grad_norm": 4.818902015686035, + "learning_rate": 1.6263657957244657e-05, + "loss": 0.3733, + "step": 2510 + }, + { + "epoch": 0.5376229157759727, + "grad_norm": 4.797301769256592, + "learning_rate": 1.6251781472684087e-05, + "loss": 0.3241, + "step": 2515 + }, + { + "epoch": 0.5386917486105173, + "grad_norm": 5.040024757385254, + "learning_rate": 1.6239904988123514e-05, + "loss": 0.3457, + "step": 2520 + }, + { + "epoch": 0.539760581445062, + "grad_norm": 5.214640140533447, + "learning_rate": 1.622802850356295e-05, + "loss": 0.4533, + "step": 2525 + }, + { + "epoch": 0.5408294142796066, + "grad_norm": 3.6819052696228027, + "learning_rate": 1.6216152019002375e-05, + "loss": 0.3397, + "step": 2530 + }, + { + "epoch": 0.5418982471141514, + "grad_norm": 4.882740020751953, + "learning_rate": 1.6204275534441806e-05, + "loss": 0.4097, + "step": 2535 + }, + { + "epoch": 0.542967079948696, + "grad_norm": 4.784149646759033, + "learning_rate": 1.6192399049881237e-05, + "loss": 0.3083, + "step": 2540 + }, + { + "epoch": 0.5440359127832407, + "grad_norm": 5.621673107147217, + "learning_rate": 1.6180522565320667e-05, + "loss": 0.3199, + "step": 2545 + }, + { + "epoch": 0.5451047456177853, + "grad_norm": 5.204516887664795, + "learning_rate": 1.6168646080760094e-05, + "loss": 0.3971, + "step": 2550 + }, + { + "epoch": 0.54617357845233, + "grad_norm": 4.5771026611328125, + "learning_rate": 1.6156769596199525e-05, + "loss": 0.5229, + "step": 2555 + }, + { + "epoch": 0.5472424112868748, + "grad_norm": 5.919792652130127, + "learning_rate": 1.6144893111638955e-05, + "loss": 0.3765, + "step": 2560 + }, + { + "epoch": 0.5483112441214194, + "grad_norm": 4.573512554168701, + "learning_rate": 1.6133016627078386e-05, + "loss": 0.3727, + "step": 2565 + }, + { + "epoch": 0.5493800769559641, + "grad_norm": 3.752349615097046, + "learning_rate": 1.6121140142517816e-05, + "loss": 0.3252, + "step": 2570 + }, + { + "epoch": 0.5504489097905088, + "grad_norm": 3.7579493522644043, + "learning_rate": 1.6109263657957247e-05, + "loss": 0.2897, + "step": 2575 + }, + { + "epoch": 0.5515177426250535, + "grad_norm": 3.408615827560425, + "learning_rate": 1.6097387173396678e-05, + "loss": 0.2867, + "step": 2580 + }, + { + "epoch": 0.5525865754595981, + "grad_norm": 6.79346227645874, + "learning_rate": 1.6085510688836105e-05, + "loss": 0.3058, + "step": 2585 + }, + { + "epoch": 0.5536554082941428, + "grad_norm": 4.814434051513672, + "learning_rate": 1.6073634204275535e-05, + "loss": 0.461, + "step": 2590 + }, + { + "epoch": 0.5547242411286875, + "grad_norm": 4.379047393798828, + "learning_rate": 1.6061757719714966e-05, + "loss": 0.2341, + "step": 2595 + }, + { + "epoch": 0.5557930739632322, + "grad_norm": 7.072385787963867, + "learning_rate": 1.6049881235154396e-05, + "loss": 0.3446, + "step": 2600 + }, + { + "epoch": 0.5568619067977768, + "grad_norm": 6.0254411697387695, + "learning_rate": 1.6038004750593824e-05, + "loss": 0.2844, + "step": 2605 + }, + { + "epoch": 0.5579307396323215, + "grad_norm": 3.961240768432617, + "learning_rate": 1.6026128266033257e-05, + "loss": 0.3414, + "step": 2610 + }, + { + "epoch": 0.5589995724668662, + "grad_norm": 4.460314750671387, + "learning_rate": 1.6014251781472685e-05, + "loss": 0.3575, + "step": 2615 + }, + { + "epoch": 0.5600684053014109, + "grad_norm": 4.68889856338501, + "learning_rate": 1.6002375296912115e-05, + "loss": 0.3799, + "step": 2620 + }, + { + "epoch": 0.5611372381359555, + "grad_norm": 4.315304756164551, + "learning_rate": 1.5990498812351546e-05, + "loss": 0.3553, + "step": 2625 + }, + { + "epoch": 0.5622060709705002, + "grad_norm": 5.276904582977295, + "learning_rate": 1.5978622327790976e-05, + "loss": 0.3938, + "step": 2630 + }, + { + "epoch": 0.5632749038050449, + "grad_norm": 6.12239408493042, + "learning_rate": 1.5966745843230403e-05, + "loss": 0.4941, + "step": 2635 + }, + { + "epoch": 0.5643437366395896, + "grad_norm": 3.896017074584961, + "learning_rate": 1.5954869358669834e-05, + "loss": 0.3454, + "step": 2640 + }, + { + "epoch": 0.5654125694741342, + "grad_norm": 4.870078086853027, + "learning_rate": 1.5942992874109265e-05, + "loss": 0.3009, + "step": 2645 + }, + { + "epoch": 0.5664814023086789, + "grad_norm": 4.661655426025391, + "learning_rate": 1.5931116389548695e-05, + "loss": 0.4625, + "step": 2650 + }, + { + "epoch": 0.5675502351432236, + "grad_norm": 4.946725368499756, + "learning_rate": 1.5919239904988126e-05, + "loss": 0.3426, + "step": 2655 + }, + { + "epoch": 0.5686190679777683, + "grad_norm": 5.536448955535889, + "learning_rate": 1.5907363420427556e-05, + "loss": 0.3403, + "step": 2660 + }, + { + "epoch": 0.5696879008123129, + "grad_norm": 4.526655673980713, + "learning_rate": 1.5895486935866983e-05, + "loss": 0.3571, + "step": 2665 + }, + { + "epoch": 0.5707567336468576, + "grad_norm": 5.318846225738525, + "learning_rate": 1.5883610451306414e-05, + "loss": 0.235, + "step": 2670 + }, + { + "epoch": 0.5718255664814023, + "grad_norm": 4.3493571281433105, + "learning_rate": 1.5871733966745844e-05, + "loss": 0.3939, + "step": 2675 + }, + { + "epoch": 0.572894399315947, + "grad_norm": 4.984584331512451, + "learning_rate": 1.5859857482185275e-05, + "loss": 0.3041, + "step": 2680 + }, + { + "epoch": 0.5739632321504916, + "grad_norm": 5.118055820465088, + "learning_rate": 1.5847980997624702e-05, + "loss": 0.398, + "step": 2685 + }, + { + "epoch": 0.5750320649850363, + "grad_norm": 3.7693285942077637, + "learning_rate": 1.5836104513064136e-05, + "loss": 0.4327, + "step": 2690 + }, + { + "epoch": 0.5761008978195811, + "grad_norm": 4.113673210144043, + "learning_rate": 1.5824228028503563e-05, + "loss": 0.3622, + "step": 2695 + }, + { + "epoch": 0.5771697306541257, + "grad_norm": 4.5102386474609375, + "learning_rate": 1.5812351543942994e-05, + "loss": 0.3461, + "step": 2700 + }, + { + "epoch": 0.5782385634886704, + "grad_norm": 4.592400074005127, + "learning_rate": 1.5800475059382424e-05, + "loss": 0.409, + "step": 2705 + }, + { + "epoch": 0.579307396323215, + "grad_norm": 4.869931697845459, + "learning_rate": 1.5788598574821855e-05, + "loss": 0.3561, + "step": 2710 + }, + { + "epoch": 0.5803762291577598, + "grad_norm": 4.971279621124268, + "learning_rate": 1.5776722090261285e-05, + "loss": 0.3608, + "step": 2715 + }, + { + "epoch": 0.5814450619923044, + "grad_norm": 4.390021324157715, + "learning_rate": 1.5764845605700713e-05, + "loss": 0.3824, + "step": 2720 + }, + { + "epoch": 0.5825138948268491, + "grad_norm": 4.252533912658691, + "learning_rate": 1.5752969121140143e-05, + "loss": 0.3403, + "step": 2725 + }, + { + "epoch": 0.5835827276613937, + "grad_norm": 4.273214817047119, + "learning_rate": 1.5741092636579574e-05, + "loss": 0.4133, + "step": 2730 + }, + { + "epoch": 0.5846515604959385, + "grad_norm": 6.121555328369141, + "learning_rate": 1.5729216152019004e-05, + "loss": 0.4104, + "step": 2735 + }, + { + "epoch": 0.5857203933304831, + "grad_norm": 4.297682762145996, + "learning_rate": 1.5717339667458435e-05, + "loss": 0.2704, + "step": 2740 + }, + { + "epoch": 0.5867892261650278, + "grad_norm": 3.2164599895477295, + "learning_rate": 1.5705463182897865e-05, + "loss": 0.275, + "step": 2745 + }, + { + "epoch": 0.5878580589995724, + "grad_norm": 5.293271541595459, + "learning_rate": 1.5693586698337293e-05, + "loss": 0.3642, + "step": 2750 + }, + { + "epoch": 0.5889268918341172, + "grad_norm": 7.932840824127197, + "learning_rate": 1.5681710213776723e-05, + "loss": 0.3882, + "step": 2755 + }, + { + "epoch": 0.5899957246686618, + "grad_norm": 4.301117897033691, + "learning_rate": 1.5669833729216154e-05, + "loss": 0.445, + "step": 2760 + }, + { + "epoch": 0.5910645575032065, + "grad_norm": 5.11594820022583, + "learning_rate": 1.5657957244655584e-05, + "loss": 0.275, + "step": 2765 + }, + { + "epoch": 0.5921333903377511, + "grad_norm": 6.5174384117126465, + "learning_rate": 1.564608076009501e-05, + "loss": 0.3727, + "step": 2770 + }, + { + "epoch": 0.5932022231722959, + "grad_norm": 4.847846508026123, + "learning_rate": 1.5634204275534445e-05, + "loss": 0.3458, + "step": 2775 + }, + { + "epoch": 0.5942710560068405, + "grad_norm": 4.418210983276367, + "learning_rate": 1.5622327790973872e-05, + "loss": 0.3775, + "step": 2780 + }, + { + "epoch": 0.5953398888413852, + "grad_norm": 4.810405731201172, + "learning_rate": 1.5610451306413303e-05, + "loss": 0.3731, + "step": 2785 + }, + { + "epoch": 0.5964087216759298, + "grad_norm": 3.9812352657318115, + "learning_rate": 1.5598574821852734e-05, + "loss": 0.3073, + "step": 2790 + }, + { + "epoch": 0.5974775545104746, + "grad_norm": 4.542743682861328, + "learning_rate": 1.5586698337292164e-05, + "loss": 0.3113, + "step": 2795 + }, + { + "epoch": 0.5985463873450192, + "grad_norm": 4.736385345458984, + "learning_rate": 1.557482185273159e-05, + "loss": 0.3666, + "step": 2800 + }, + { + "epoch": 0.5996152201795639, + "grad_norm": 5.22001314163208, + "learning_rate": 1.5562945368171022e-05, + "loss": 0.3769, + "step": 2805 + }, + { + "epoch": 0.6006840530141085, + "grad_norm": 5.7952680587768555, + "learning_rate": 1.5551068883610452e-05, + "loss": 0.3267, + "step": 2810 + }, + { + "epoch": 0.6017528858486533, + "grad_norm": 4.174045085906982, + "learning_rate": 1.5539192399049883e-05, + "loss": 0.2513, + "step": 2815 + }, + { + "epoch": 0.6028217186831979, + "grad_norm": 3.869800090789795, + "learning_rate": 1.552731591448931e-05, + "loss": 0.273, + "step": 2820 + }, + { + "epoch": 0.6038905515177426, + "grad_norm": 4.380319118499756, + "learning_rate": 1.5515439429928744e-05, + "loss": 0.3712, + "step": 2825 + }, + { + "epoch": 0.6049593843522874, + "grad_norm": 3.972041368484497, + "learning_rate": 1.550356294536817e-05, + "loss": 0.4728, + "step": 2830 + }, + { + "epoch": 0.606028217186832, + "grad_norm": 5.212732791900635, + "learning_rate": 1.5491686460807602e-05, + "loss": 0.3608, + "step": 2835 + }, + { + "epoch": 0.6070970500213767, + "grad_norm": 5.559129238128662, + "learning_rate": 1.5479809976247032e-05, + "loss": 0.3765, + "step": 2840 + }, + { + "epoch": 0.6081658828559213, + "grad_norm": 4.520800590515137, + "learning_rate": 1.5467933491686463e-05, + "loss": 0.3285, + "step": 2845 + }, + { + "epoch": 0.609234715690466, + "grad_norm": 6.37885856628418, + "learning_rate": 1.5456057007125893e-05, + "loss": 0.4822, + "step": 2850 + }, + { + "epoch": 0.6103035485250107, + "grad_norm": 3.292531967163086, + "learning_rate": 1.544418052256532e-05, + "loss": 0.3185, + "step": 2855 + }, + { + "epoch": 0.6113723813595554, + "grad_norm": 4.683765411376953, + "learning_rate": 1.5432304038004754e-05, + "loss": 0.3027, + "step": 2860 + }, + { + "epoch": 0.6124412141941, + "grad_norm": 6.004202365875244, + "learning_rate": 1.542042755344418e-05, + "loss": 0.3819, + "step": 2865 + }, + { + "epoch": 0.6135100470286448, + "grad_norm": 4.668170928955078, + "learning_rate": 1.5408551068883612e-05, + "loss": 0.2819, + "step": 2870 + }, + { + "epoch": 0.6145788798631894, + "grad_norm": 6.2482781410217285, + "learning_rate": 1.5396674584323043e-05, + "loss": 0.3369, + "step": 2875 + }, + { + "epoch": 0.6156477126977341, + "grad_norm": 4.60993766784668, + "learning_rate": 1.5384798099762473e-05, + "loss": 0.331, + "step": 2880 + }, + { + "epoch": 0.6167165455322787, + "grad_norm": 5.188110828399658, + "learning_rate": 1.53729216152019e-05, + "loss": 0.3662, + "step": 2885 + }, + { + "epoch": 0.6177853783668235, + "grad_norm": 5.201088905334473, + "learning_rate": 1.536104513064133e-05, + "loss": 0.3472, + "step": 2890 + }, + { + "epoch": 0.6188542112013681, + "grad_norm": 5.363198280334473, + "learning_rate": 1.534916864608076e-05, + "loss": 0.387, + "step": 2895 + }, + { + "epoch": 0.6199230440359128, + "grad_norm": 5.238138198852539, + "learning_rate": 1.5337292161520192e-05, + "loss": 0.3017, + "step": 2900 + }, + { + "epoch": 0.6209918768704574, + "grad_norm": 4.188523769378662, + "learning_rate": 1.532541567695962e-05, + "loss": 0.2628, + "step": 2905 + }, + { + "epoch": 0.6220607097050022, + "grad_norm": 4.730754852294922, + "learning_rate": 1.5313539192399053e-05, + "loss": 0.2683, + "step": 2910 + }, + { + "epoch": 0.6231295425395468, + "grad_norm": 3.7036404609680176, + "learning_rate": 1.530166270783848e-05, + "loss": 0.3189, + "step": 2915 + }, + { + "epoch": 0.6241983753740915, + "grad_norm": 4.961543560028076, + "learning_rate": 1.528978622327791e-05, + "loss": 0.3389, + "step": 2920 + }, + { + "epoch": 0.6252672082086361, + "grad_norm": 4.376546859741211, + "learning_rate": 1.527790973871734e-05, + "loss": 0.3552, + "step": 2925 + }, + { + "epoch": 0.6263360410431809, + "grad_norm": 3.2792232036590576, + "learning_rate": 1.5266033254156772e-05, + "loss": 0.2784, + "step": 2930 + }, + { + "epoch": 0.6274048738777255, + "grad_norm": 4.739627838134766, + "learning_rate": 1.5254156769596201e-05, + "loss": 0.3416, + "step": 2935 + }, + { + "epoch": 0.6284737067122702, + "grad_norm": 4.889829635620117, + "learning_rate": 1.5242280285035631e-05, + "loss": 0.3805, + "step": 2940 + }, + { + "epoch": 0.6295425395468148, + "grad_norm": 5.562602519989014, + "learning_rate": 1.523040380047506e-05, + "loss": 0.4616, + "step": 2945 + }, + { + "epoch": 0.6306113723813596, + "grad_norm": 6.154614448547363, + "learning_rate": 1.521852731591449e-05, + "loss": 0.3584, + "step": 2950 + }, + { + "epoch": 0.6316802052159042, + "grad_norm": 4.117344856262207, + "learning_rate": 1.520665083135392e-05, + "loss": 0.3439, + "step": 2955 + }, + { + "epoch": 0.6327490380504489, + "grad_norm": 4.961648941040039, + "learning_rate": 1.519477434679335e-05, + "loss": 0.3569, + "step": 2960 + }, + { + "epoch": 0.6338178708849936, + "grad_norm": 4.030764579772949, + "learning_rate": 1.5182897862232779e-05, + "loss": 0.2775, + "step": 2965 + }, + { + "epoch": 0.6348867037195383, + "grad_norm": 5.615406036376953, + "learning_rate": 1.517102137767221e-05, + "loss": 0.3758, + "step": 2970 + }, + { + "epoch": 0.635955536554083, + "grad_norm": 5.250066757202148, + "learning_rate": 1.5159144893111638e-05, + "loss": 0.4178, + "step": 2975 + }, + { + "epoch": 0.6370243693886276, + "grad_norm": 3.862907648086548, + "learning_rate": 1.514726840855107e-05, + "loss": 0.2725, + "step": 2980 + }, + { + "epoch": 0.6380932022231723, + "grad_norm": 7.1906023025512695, + "learning_rate": 1.51353919239905e-05, + "loss": 0.4838, + "step": 2985 + }, + { + "epoch": 0.639162035057717, + "grad_norm": 4.240938663482666, + "learning_rate": 1.512351543942993e-05, + "loss": 0.3184, + "step": 2990 + }, + { + "epoch": 0.6402308678922617, + "grad_norm": 5.662024974822998, + "learning_rate": 1.511163895486936e-05, + "loss": 0.3265, + "step": 2995 + }, + { + "epoch": 0.6412997007268063, + "grad_norm": 5.721799850463867, + "learning_rate": 1.509976247030879e-05, + "loss": 0.3344, + "step": 3000 + }, + { + "epoch": 0.642368533561351, + "grad_norm": 4.524104118347168, + "learning_rate": 1.508788598574822e-05, + "loss": 0.284, + "step": 3005 + }, + { + "epoch": 0.6434373663958957, + "grad_norm": 4.907393455505371, + "learning_rate": 1.5076009501187649e-05, + "loss": 0.3337, + "step": 3010 + }, + { + "epoch": 0.6445061992304404, + "grad_norm": 4.567984580993652, + "learning_rate": 1.5064133016627081e-05, + "loss": 0.3085, + "step": 3015 + }, + { + "epoch": 0.645575032064985, + "grad_norm": 6.088601589202881, + "learning_rate": 1.505225653206651e-05, + "loss": 0.3685, + "step": 3020 + }, + { + "epoch": 0.6466438648995297, + "grad_norm": 5.842155456542969, + "learning_rate": 1.504038004750594e-05, + "loss": 0.4621, + "step": 3025 + }, + { + "epoch": 0.6477126977340744, + "grad_norm": 4.505978584289551, + "learning_rate": 1.502850356294537e-05, + "loss": 0.2958, + "step": 3030 + }, + { + "epoch": 0.6487815305686191, + "grad_norm": 3.832209825515747, + "learning_rate": 1.50166270783848e-05, + "loss": 0.4165, + "step": 3035 + }, + { + "epoch": 0.6498503634031637, + "grad_norm": 3.149580240249634, + "learning_rate": 1.5004750593824229e-05, + "loss": 0.336, + "step": 3040 + }, + { + "epoch": 0.6509191962377084, + "grad_norm": 4.5704121589660645, + "learning_rate": 1.499287410926366e-05, + "loss": 0.328, + "step": 3045 + }, + { + "epoch": 0.6519880290722531, + "grad_norm": 5.424034595489502, + "learning_rate": 1.4980997624703088e-05, + "loss": 0.3256, + "step": 3050 + }, + { + "epoch": 0.6530568619067978, + "grad_norm": 4.873384475708008, + "learning_rate": 1.4969121140142519e-05, + "loss": 0.2877, + "step": 3055 + }, + { + "epoch": 0.6541256947413424, + "grad_norm": 4.21671199798584, + "learning_rate": 1.4957244655581948e-05, + "loss": 0.3468, + "step": 3060 + }, + { + "epoch": 0.6551945275758871, + "grad_norm": 4.723153591156006, + "learning_rate": 1.494536817102138e-05, + "loss": 0.3836, + "step": 3065 + }, + { + "epoch": 0.6562633604104318, + "grad_norm": 4.3572587966918945, + "learning_rate": 1.4933491686460809e-05, + "loss": 0.3084, + "step": 3070 + }, + { + "epoch": 0.6573321932449765, + "grad_norm": 4.8245439529418945, + "learning_rate": 1.492161520190024e-05, + "loss": 0.2879, + "step": 3075 + }, + { + "epoch": 0.6584010260795211, + "grad_norm": 4.260484218597412, + "learning_rate": 1.4909738717339668e-05, + "loss": 0.2836, + "step": 3080 + }, + { + "epoch": 0.6594698589140658, + "grad_norm": 3.668529748916626, + "learning_rate": 1.4897862232779099e-05, + "loss": 0.3049, + "step": 3085 + }, + { + "epoch": 0.6605386917486105, + "grad_norm": 5.860143661499023, + "learning_rate": 1.4885985748218528e-05, + "loss": 0.3972, + "step": 3090 + }, + { + "epoch": 0.6616075245831552, + "grad_norm": 3.9581236839294434, + "learning_rate": 1.4874109263657958e-05, + "loss": 0.3189, + "step": 3095 + }, + { + "epoch": 0.6626763574176999, + "grad_norm": 2.8415067195892334, + "learning_rate": 1.4862232779097387e-05, + "loss": 0.207, + "step": 3100 + }, + { + "epoch": 0.6637451902522445, + "grad_norm": 5.096329689025879, + "learning_rate": 1.485035629453682e-05, + "loss": 0.2844, + "step": 3105 + }, + { + "epoch": 0.6648140230867893, + "grad_norm": 5.822755813598633, + "learning_rate": 1.4838479809976248e-05, + "loss": 0.3583, + "step": 3110 + }, + { + "epoch": 0.6658828559213339, + "grad_norm": 5.467360019683838, + "learning_rate": 1.4826603325415679e-05, + "loss": 0.2681, + "step": 3115 + }, + { + "epoch": 0.6669516887558786, + "grad_norm": 5.418729305267334, + "learning_rate": 1.4814726840855107e-05, + "loss": 0.3788, + "step": 3120 + }, + { + "epoch": 0.6680205215904232, + "grad_norm": 5.312787055969238, + "learning_rate": 1.4802850356294538e-05, + "loss": 0.3335, + "step": 3125 + }, + { + "epoch": 0.669089354424968, + "grad_norm": 4.632271766662598, + "learning_rate": 1.4790973871733969e-05, + "loss": 0.2958, + "step": 3130 + }, + { + "epoch": 0.6701581872595126, + "grad_norm": 5.137240886688232, + "learning_rate": 1.4779097387173397e-05, + "loss": 0.343, + "step": 3135 + }, + { + "epoch": 0.6712270200940573, + "grad_norm": 4.227065086364746, + "learning_rate": 1.4767220902612828e-05, + "loss": 0.3026, + "step": 3140 + }, + { + "epoch": 0.672295852928602, + "grad_norm": 4.9906110763549805, + "learning_rate": 1.4755344418052257e-05, + "loss": 0.3575, + "step": 3145 + }, + { + "epoch": 0.6733646857631467, + "grad_norm": 6.338077545166016, + "learning_rate": 1.4743467933491689e-05, + "loss": 0.3962, + "step": 3150 + }, + { + "epoch": 0.6744335185976913, + "grad_norm": 5.018848896026611, + "learning_rate": 1.4731591448931118e-05, + "loss": 0.292, + "step": 3155 + }, + { + "epoch": 0.675502351432236, + "grad_norm": 5.4188432693481445, + "learning_rate": 1.4719714964370548e-05, + "loss": 0.4226, + "step": 3160 + }, + { + "epoch": 0.6765711842667806, + "grad_norm": 5.020565032958984, + "learning_rate": 1.4707838479809977e-05, + "loss": 0.3999, + "step": 3165 + }, + { + "epoch": 0.6776400171013254, + "grad_norm": 5.457892894744873, + "learning_rate": 1.4695961995249408e-05, + "loss": 0.3949, + "step": 3170 + }, + { + "epoch": 0.67870884993587, + "grad_norm": 4.842294216156006, + "learning_rate": 1.4684085510688837e-05, + "loss": 0.2844, + "step": 3175 + }, + { + "epoch": 0.6797776827704147, + "grad_norm": 4.515163421630859, + "learning_rate": 1.4672209026128267e-05, + "loss": 0.3003, + "step": 3180 + }, + { + "epoch": 0.6808465156049593, + "grad_norm": 3.4031429290771484, + "learning_rate": 1.4660332541567696e-05, + "loss": 0.2636, + "step": 3185 + }, + { + "epoch": 0.6819153484395041, + "grad_norm": 4.693248748779297, + "learning_rate": 1.4648456057007128e-05, + "loss": 0.2334, + "step": 3190 + }, + { + "epoch": 0.6829841812740487, + "grad_norm": 4.690431118011475, + "learning_rate": 1.4636579572446557e-05, + "loss": 0.2574, + "step": 3195 + }, + { + "epoch": 0.6840530141085934, + "grad_norm": 3.9794492721557617, + "learning_rate": 1.4624703087885988e-05, + "loss": 0.3476, + "step": 3200 + }, + { + "epoch": 0.685121846943138, + "grad_norm": 4.062690258026123, + "learning_rate": 1.4612826603325417e-05, + "loss": 0.3763, + "step": 3205 + }, + { + "epoch": 0.6861906797776828, + "grad_norm": 2.888495683670044, + "learning_rate": 1.4600950118764847e-05, + "loss": 0.2873, + "step": 3210 + }, + { + "epoch": 0.6872595126122274, + "grad_norm": 4.061041355133057, + "learning_rate": 1.4589073634204276e-05, + "loss": 0.2859, + "step": 3215 + }, + { + "epoch": 0.6883283454467721, + "grad_norm": 5.954913139343262, + "learning_rate": 1.4577197149643707e-05, + "loss": 0.3335, + "step": 3220 + }, + { + "epoch": 0.6893971782813167, + "grad_norm": 4.9537434577941895, + "learning_rate": 1.4565320665083135e-05, + "loss": 0.3712, + "step": 3225 + }, + { + "epoch": 0.6904660111158615, + "grad_norm": 3.5754384994506836, + "learning_rate": 1.4553444180522566e-05, + "loss": 0.4072, + "step": 3230 + }, + { + "epoch": 0.6915348439504062, + "grad_norm": 6.583157062530518, + "learning_rate": 1.4541567695961995e-05, + "loss": 0.3442, + "step": 3235 + }, + { + "epoch": 0.6926036767849508, + "grad_norm": 4.144803524017334, + "learning_rate": 1.4529691211401427e-05, + "loss": 0.32, + "step": 3240 + }, + { + "epoch": 0.6936725096194956, + "grad_norm": 3.350670576095581, + "learning_rate": 1.4517814726840856e-05, + "loss": 0.3076, + "step": 3245 + }, + { + "epoch": 0.6947413424540402, + "grad_norm": 3.798152208328247, + "learning_rate": 1.4505938242280287e-05, + "loss": 0.3134, + "step": 3250 + }, + { + "epoch": 0.6958101752885849, + "grad_norm": 4.410452365875244, + "learning_rate": 1.4494061757719715e-05, + "loss": 0.3155, + "step": 3255 + }, + { + "epoch": 0.6968790081231295, + "grad_norm": 5.064853191375732, + "learning_rate": 1.4482185273159146e-05, + "loss": 0.3097, + "step": 3260 + }, + { + "epoch": 0.6979478409576743, + "grad_norm": 5.49769401550293, + "learning_rate": 1.4470308788598575e-05, + "loss": 0.2727, + "step": 3265 + }, + { + "epoch": 0.6990166737922189, + "grad_norm": 4.130645751953125, + "learning_rate": 1.4458432304038005e-05, + "loss": 0.3666, + "step": 3270 + }, + { + "epoch": 0.7000855066267636, + "grad_norm": 5.358222484588623, + "learning_rate": 1.4446555819477438e-05, + "loss": 0.3049, + "step": 3275 + }, + { + "epoch": 0.7011543394613082, + "grad_norm": 3.783137559890747, + "learning_rate": 1.4434679334916866e-05, + "loss": 0.3506, + "step": 3280 + }, + { + "epoch": 0.702223172295853, + "grad_norm": 4.486612319946289, + "learning_rate": 1.4422802850356297e-05, + "loss": 0.3027, + "step": 3285 + }, + { + "epoch": 0.7032920051303976, + "grad_norm": 5.604061126708984, + "learning_rate": 1.4410926365795726e-05, + "loss": 0.2706, + "step": 3290 + }, + { + "epoch": 0.7043608379649423, + "grad_norm": 5.663457870483398, + "learning_rate": 1.4399049881235156e-05, + "loss": 0.3165, + "step": 3295 + }, + { + "epoch": 0.7054296707994869, + "grad_norm": 4.874339580535889, + "learning_rate": 1.4387173396674585e-05, + "loss": 0.3567, + "step": 3300 + }, + { + "epoch": 0.7064985036340317, + "grad_norm": 5.478762626647949, + "learning_rate": 1.4375296912114016e-05, + "loss": 0.2795, + "step": 3305 + }, + { + "epoch": 0.7075673364685763, + "grad_norm": 4.213021278381348, + "learning_rate": 1.4363420427553445e-05, + "loss": 0.2905, + "step": 3310 + }, + { + "epoch": 0.708636169303121, + "grad_norm": 4.549129009246826, + "learning_rate": 1.4351543942992875e-05, + "loss": 0.2946, + "step": 3315 + }, + { + "epoch": 0.7097050021376656, + "grad_norm": 4.900253772735596, + "learning_rate": 1.4339667458432304e-05, + "loss": 0.298, + "step": 3320 + }, + { + "epoch": 0.7107738349722104, + "grad_norm": 5.591811656951904, + "learning_rate": 1.4327790973871736e-05, + "loss": 0.289, + "step": 3325 + }, + { + "epoch": 0.711842667806755, + "grad_norm": 3.1972029209136963, + "learning_rate": 1.4315914489311165e-05, + "loss": 0.3194, + "step": 3330 + }, + { + "epoch": 0.7129115006412997, + "grad_norm": 3.692401647567749, + "learning_rate": 1.4304038004750596e-05, + "loss": 0.2719, + "step": 3335 + }, + { + "epoch": 0.7139803334758443, + "grad_norm": 6.502699851989746, + "learning_rate": 1.4292161520190025e-05, + "loss": 0.3079, + "step": 3340 + }, + { + "epoch": 0.7150491663103891, + "grad_norm": 4.761363506317139, + "learning_rate": 1.4280285035629455e-05, + "loss": 0.3373, + "step": 3345 + }, + { + "epoch": 0.7161179991449337, + "grad_norm": 5.628553867340088, + "learning_rate": 1.4268408551068884e-05, + "loss": 0.3103, + "step": 3350 + }, + { + "epoch": 0.7171868319794784, + "grad_norm": 5.576054096221924, + "learning_rate": 1.4256532066508314e-05, + "loss": 0.3384, + "step": 3355 + }, + { + "epoch": 0.718255664814023, + "grad_norm": 4.364500999450684, + "learning_rate": 1.4244655581947743e-05, + "loss": 0.3785, + "step": 3360 + }, + { + "epoch": 0.7193244976485678, + "grad_norm": 2.8248353004455566, + "learning_rate": 1.4232779097387176e-05, + "loss": 0.2583, + "step": 3365 + }, + { + "epoch": 0.7203933304831125, + "grad_norm": 5.5604987144470215, + "learning_rate": 1.4220902612826604e-05, + "loss": 0.2992, + "step": 3370 + }, + { + "epoch": 0.7214621633176571, + "grad_norm": 4.8770527839660645, + "learning_rate": 1.4209026128266035e-05, + "loss": 0.2196, + "step": 3375 + }, + { + "epoch": 0.7225309961522018, + "grad_norm": 4.998085021972656, + "learning_rate": 1.4197149643705464e-05, + "loss": 0.3438, + "step": 3380 + }, + { + "epoch": 0.7235998289867465, + "grad_norm": 4.125364303588867, + "learning_rate": 1.4185273159144894e-05, + "loss": 0.333, + "step": 3385 + }, + { + "epoch": 0.7246686618212912, + "grad_norm": 5.174322605133057, + "learning_rate": 1.4173396674584323e-05, + "loss": 0.4422, + "step": 3390 + }, + { + "epoch": 0.7257374946558358, + "grad_norm": 4.850910186767578, + "learning_rate": 1.4161520190023754e-05, + "loss": 0.458, + "step": 3395 + }, + { + "epoch": 0.7268063274903805, + "grad_norm": 4.238053321838379, + "learning_rate": 1.4149643705463183e-05, + "loss": 0.2526, + "step": 3400 + }, + { + "epoch": 0.7278751603249252, + "grad_norm": 4.8868842124938965, + "learning_rate": 1.4137767220902613e-05, + "loss": 0.2443, + "step": 3405 + }, + { + "epoch": 0.7289439931594699, + "grad_norm": 6.352740287780762, + "learning_rate": 1.4125890736342045e-05, + "loss": 0.4024, + "step": 3410 + }, + { + "epoch": 0.7300128259940145, + "grad_norm": 3.7694151401519775, + "learning_rate": 1.4114014251781474e-05, + "loss": 0.3057, + "step": 3415 + }, + { + "epoch": 0.7310816588285592, + "grad_norm": 4.326847553253174, + "learning_rate": 1.4102137767220905e-05, + "loss": 0.3417, + "step": 3420 + }, + { + "epoch": 0.7321504916631039, + "grad_norm": 4.306587219238281, + "learning_rate": 1.4090261282660334e-05, + "loss": 0.3535, + "step": 3425 + }, + { + "epoch": 0.7332193244976486, + "grad_norm": 4.4991044998168945, + "learning_rate": 1.4078384798099764e-05, + "loss": 0.3814, + "step": 3430 + }, + { + "epoch": 0.7342881573321932, + "grad_norm": 4.0679779052734375, + "learning_rate": 1.4066508313539193e-05, + "loss": 0.3196, + "step": 3435 + }, + { + "epoch": 0.735356990166738, + "grad_norm": 4.0540666580200195, + "learning_rate": 1.4054631828978624e-05, + "loss": 0.2738, + "step": 3440 + }, + { + "epoch": 0.7364258230012826, + "grad_norm": 4.532857894897461, + "learning_rate": 1.4042755344418053e-05, + "loss": 0.2127, + "step": 3445 + }, + { + "epoch": 0.7374946558358273, + "grad_norm": 4.681793212890625, + "learning_rate": 1.4030878859857485e-05, + "loss": 0.2415, + "step": 3450 + }, + { + "epoch": 0.7385634886703719, + "grad_norm": 5.458173751831055, + "learning_rate": 1.4019002375296914e-05, + "loss": 0.368, + "step": 3455 + }, + { + "epoch": 0.7396323215049166, + "grad_norm": 4.303793430328369, + "learning_rate": 1.4007125890736344e-05, + "loss": 0.2965, + "step": 3460 + }, + { + "epoch": 0.7407011543394613, + "grad_norm": 5.24821138381958, + "learning_rate": 1.3995249406175773e-05, + "loss": 0.3676, + "step": 3465 + }, + { + "epoch": 0.741769987174006, + "grad_norm": 7.041927337646484, + "learning_rate": 1.3983372921615204e-05, + "loss": 0.4793, + "step": 3470 + }, + { + "epoch": 0.7428388200085506, + "grad_norm": 4.38003396987915, + "learning_rate": 1.3971496437054632e-05, + "loss": 0.2924, + "step": 3475 + }, + { + "epoch": 0.7439076528430953, + "grad_norm": 4.844277858734131, + "learning_rate": 1.3959619952494063e-05, + "loss": 0.3051, + "step": 3480 + }, + { + "epoch": 0.74497648567764, + "grad_norm": 4.943488121032715, + "learning_rate": 1.3947743467933492e-05, + "loss": 0.3206, + "step": 3485 + }, + { + "epoch": 0.7460453185121847, + "grad_norm": 3.5360701084136963, + "learning_rate": 1.3935866983372922e-05, + "loss": 0.3062, + "step": 3490 + }, + { + "epoch": 0.7471141513467293, + "grad_norm": 4.964517116546631, + "learning_rate": 1.3923990498812351e-05, + "loss": 0.3099, + "step": 3495 + }, + { + "epoch": 0.748182984181274, + "grad_norm": 4.1770124435424805, + "learning_rate": 1.3912114014251783e-05, + "loss": 0.3528, + "step": 3500 + }, + { + "epoch": 0.7492518170158188, + "grad_norm": 4.830697059631348, + "learning_rate": 1.3900237529691212e-05, + "loss": 0.3075, + "step": 3505 + }, + { + "epoch": 0.7503206498503634, + "grad_norm": 4.7558512687683105, + "learning_rate": 1.3888361045130643e-05, + "loss": 0.3132, + "step": 3510 + }, + { + "epoch": 0.7513894826849081, + "grad_norm": 5.082642555236816, + "learning_rate": 1.3876484560570072e-05, + "loss": 0.3789, + "step": 3515 + }, + { + "epoch": 0.7524583155194527, + "grad_norm": 5.486532211303711, + "learning_rate": 1.3864608076009502e-05, + "loss": 0.3316, + "step": 3520 + }, + { + "epoch": 0.7535271483539975, + "grad_norm": 4.763543605804443, + "learning_rate": 1.3852731591448931e-05, + "loss": 0.3113, + "step": 3525 + }, + { + "epoch": 0.7545959811885421, + "grad_norm": 4.146590709686279, + "learning_rate": 1.3840855106888362e-05, + "loss": 0.2481, + "step": 3530 + }, + { + "epoch": 0.7556648140230868, + "grad_norm": 4.292271614074707, + "learning_rate": 1.382897862232779e-05, + "loss": 0.3174, + "step": 3535 + }, + { + "epoch": 0.7567336468576314, + "grad_norm": 5.971374988555908, + "learning_rate": 1.3817102137767223e-05, + "loss": 0.3116, + "step": 3540 + }, + { + "epoch": 0.7578024796921762, + "grad_norm": 4.599390983581543, + "learning_rate": 1.3805225653206652e-05, + "loss": 0.29, + "step": 3545 + }, + { + "epoch": 0.7588713125267208, + "grad_norm": 3.7273731231689453, + "learning_rate": 1.3793349168646082e-05, + "loss": 0.33, + "step": 3550 + }, + { + "epoch": 0.7599401453612655, + "grad_norm": 3.681992530822754, + "learning_rate": 1.3781472684085513e-05, + "loss": 0.2002, + "step": 3555 + }, + { + "epoch": 0.7610089781958101, + "grad_norm": 5.324198246002197, + "learning_rate": 1.3769596199524942e-05, + "loss": 0.3566, + "step": 3560 + }, + { + "epoch": 0.7620778110303549, + "grad_norm": 4.434847354888916, + "learning_rate": 1.3757719714964372e-05, + "loss": 0.2618, + "step": 3565 + }, + { + "epoch": 0.7631466438648995, + "grad_norm": 5.279498100280762, + "learning_rate": 1.3745843230403801e-05, + "loss": 0.316, + "step": 3570 + }, + { + "epoch": 0.7642154766994442, + "grad_norm": 3.4741098880767822, + "learning_rate": 1.3733966745843233e-05, + "loss": 0.2997, + "step": 3575 + }, + { + "epoch": 0.7652843095339888, + "grad_norm": 4.7899909019470215, + "learning_rate": 1.372209026128266e-05, + "loss": 0.2809, + "step": 3580 + }, + { + "epoch": 0.7663531423685336, + "grad_norm": 4.318710803985596, + "learning_rate": 1.3710213776722093e-05, + "loss": 0.2023, + "step": 3585 + }, + { + "epoch": 0.7674219752030782, + "grad_norm": 4.148991107940674, + "learning_rate": 1.3698337292161522e-05, + "loss": 0.2726, + "step": 3590 + }, + { + "epoch": 0.7684908080376229, + "grad_norm": 5.0960373878479, + "learning_rate": 1.3686460807600952e-05, + "loss": 0.2878, + "step": 3595 + }, + { + "epoch": 0.7695596408721675, + "grad_norm": 5.928832530975342, + "learning_rate": 1.3674584323040381e-05, + "loss": 0.4026, + "step": 3600 + }, + { + "epoch": 0.7706284737067123, + "grad_norm": 4.24060583114624, + "learning_rate": 1.3662707838479811e-05, + "loss": 0.3205, + "step": 3605 + }, + { + "epoch": 0.7716973065412569, + "grad_norm": 4.517853736877441, + "learning_rate": 1.365083135391924e-05, + "loss": 0.3092, + "step": 3610 + }, + { + "epoch": 0.7727661393758016, + "grad_norm": 5.5383501052856445, + "learning_rate": 1.3638954869358671e-05, + "loss": 0.3249, + "step": 3615 + }, + { + "epoch": 0.7738349722103463, + "grad_norm": 3.5598056316375732, + "learning_rate": 1.36270783847981e-05, + "loss": 0.2898, + "step": 3620 + }, + { + "epoch": 0.774903805044891, + "grad_norm": 5.0517578125, + "learning_rate": 1.3615201900237532e-05, + "loss": 0.3464, + "step": 3625 + }, + { + "epoch": 0.7759726378794357, + "grad_norm": 4.764474868774414, + "learning_rate": 1.360332541567696e-05, + "loss": 0.3755, + "step": 3630 + }, + { + "epoch": 0.7770414707139803, + "grad_norm": 4.272229194641113, + "learning_rate": 1.3591448931116391e-05, + "loss": 0.3236, + "step": 3635 + }, + { + "epoch": 0.7781103035485251, + "grad_norm": 4.496946811676025, + "learning_rate": 1.357957244655582e-05, + "loss": 0.3298, + "step": 3640 + }, + { + "epoch": 0.7791791363830697, + "grad_norm": 3.3338801860809326, + "learning_rate": 1.356769596199525e-05, + "loss": 0.3301, + "step": 3645 + }, + { + "epoch": 0.7802479692176144, + "grad_norm": 4.775890350341797, + "learning_rate": 1.355581947743468e-05, + "loss": 0.2428, + "step": 3650 + }, + { + "epoch": 0.781316802052159, + "grad_norm": 3.7741811275482178, + "learning_rate": 1.354394299287411e-05, + "loss": 0.2789, + "step": 3655 + }, + { + "epoch": 0.7823856348867038, + "grad_norm": 5.699966907501221, + "learning_rate": 1.3532066508313539e-05, + "loss": 0.4398, + "step": 3660 + }, + { + "epoch": 0.7834544677212484, + "grad_norm": 5.20950174331665, + "learning_rate": 1.352019002375297e-05, + "loss": 0.3211, + "step": 3665 + }, + { + "epoch": 0.7845233005557931, + "grad_norm": 4.900545120239258, + "learning_rate": 1.3508313539192398e-05, + "loss": 0.3079, + "step": 3670 + }, + { + "epoch": 0.7855921333903377, + "grad_norm": 4.627389907836914, + "learning_rate": 1.349643705463183e-05, + "loss": 0.2765, + "step": 3675 + }, + { + "epoch": 0.7866609662248825, + "grad_norm": 3.996687889099121, + "learning_rate": 1.348456057007126e-05, + "loss": 0.2414, + "step": 3680 + }, + { + "epoch": 0.7877297990594271, + "grad_norm": 4.968347072601318, + "learning_rate": 1.347268408551069e-05, + "loss": 0.3142, + "step": 3685 + }, + { + "epoch": 0.7887986318939718, + "grad_norm": 5.365523815155029, + "learning_rate": 1.346080760095012e-05, + "loss": 0.4895, + "step": 3690 + }, + { + "epoch": 0.7898674647285164, + "grad_norm": 3.6716244220733643, + "learning_rate": 1.344893111638955e-05, + "loss": 0.3058, + "step": 3695 + }, + { + "epoch": 0.7909362975630612, + "grad_norm": 3.6110551357269287, + "learning_rate": 1.343705463182898e-05, + "loss": 0.2568, + "step": 3700 + }, + { + "epoch": 0.7920051303976058, + "grad_norm": 3.8466339111328125, + "learning_rate": 1.3425178147268409e-05, + "loss": 0.2505, + "step": 3705 + }, + { + "epoch": 0.7930739632321505, + "grad_norm": 6.473718643188477, + "learning_rate": 1.3413301662707841e-05, + "loss": 0.3416, + "step": 3710 + }, + { + "epoch": 0.7941427960666951, + "grad_norm": 4.931123733520508, + "learning_rate": 1.340142517814727e-05, + "loss": 0.2867, + "step": 3715 + }, + { + "epoch": 0.7952116289012399, + "grad_norm": 4.821789741516113, + "learning_rate": 1.33895486935867e-05, + "loss": 0.2696, + "step": 3720 + }, + { + "epoch": 0.7962804617357845, + "grad_norm": 3.5999889373779297, + "learning_rate": 1.337767220902613e-05, + "loss": 0.293, + "step": 3725 + }, + { + "epoch": 0.7973492945703292, + "grad_norm": 3.716235637664795, + "learning_rate": 1.336579572446556e-05, + "loss": 0.2741, + "step": 3730 + }, + { + "epoch": 0.7984181274048738, + "grad_norm": 3.1744401454925537, + "learning_rate": 1.3353919239904989e-05, + "loss": 0.3276, + "step": 3735 + }, + { + "epoch": 0.7994869602394186, + "grad_norm": 4.65699577331543, + "learning_rate": 1.334204275534442e-05, + "loss": 0.2688, + "step": 3740 + }, + { + "epoch": 0.8005557930739632, + "grad_norm": 3.338193416595459, + "learning_rate": 1.3330166270783848e-05, + "loss": 0.2408, + "step": 3745 + }, + { + "epoch": 0.8016246259085079, + "grad_norm": 4.22088098526001, + "learning_rate": 1.3318289786223279e-05, + "loss": 0.2926, + "step": 3750 + }, + { + "epoch": 0.8026934587430525, + "grad_norm": 5.624631881713867, + "learning_rate": 1.3306413301662708e-05, + "loss": 0.3119, + "step": 3755 + }, + { + "epoch": 0.8037622915775973, + "grad_norm": 3.8507394790649414, + "learning_rate": 1.329453681710214e-05, + "loss": 0.3018, + "step": 3760 + }, + { + "epoch": 0.804831124412142, + "grad_norm": 4.6665239334106445, + "learning_rate": 1.3282660332541569e-05, + "loss": 0.3448, + "step": 3765 + }, + { + "epoch": 0.8058999572466866, + "grad_norm": 4.100464344024658, + "learning_rate": 1.3270783847981e-05, + "loss": 0.3539, + "step": 3770 + }, + { + "epoch": 0.8069687900812313, + "grad_norm": 6.0533623695373535, + "learning_rate": 1.3258907363420428e-05, + "loss": 0.2776, + "step": 3775 + }, + { + "epoch": 0.808037622915776, + "grad_norm": 3.781015396118164, + "learning_rate": 1.3247030878859859e-05, + "loss": 0.2255, + "step": 3780 + }, + { + "epoch": 0.8091064557503207, + "grad_norm": 5.616995334625244, + "learning_rate": 1.3235154394299288e-05, + "loss": 0.2507, + "step": 3785 + }, + { + "epoch": 0.8101752885848653, + "grad_norm": 5.021564960479736, + "learning_rate": 1.3223277909738718e-05, + "loss": 0.3463, + "step": 3790 + }, + { + "epoch": 0.81124412141941, + "grad_norm": 4.946634769439697, + "learning_rate": 1.3211401425178147e-05, + "loss": 0.2849, + "step": 3795 + }, + { + "epoch": 0.8123129542539547, + "grad_norm": 3.1573128700256348, + "learning_rate": 1.319952494061758e-05, + "loss": 0.2678, + "step": 3800 + }, + { + "epoch": 0.8133817870884994, + "grad_norm": 5.302856922149658, + "learning_rate": 1.3187648456057008e-05, + "loss": 0.3446, + "step": 3805 + }, + { + "epoch": 0.814450619923044, + "grad_norm": 5.2195820808410645, + "learning_rate": 1.3175771971496439e-05, + "loss": 0.344, + "step": 3810 + }, + { + "epoch": 0.8155194527575887, + "grad_norm": 5.514340877532959, + "learning_rate": 1.3163895486935867e-05, + "loss": 0.3305, + "step": 3815 + }, + { + "epoch": 0.8165882855921334, + "grad_norm": 4.197089195251465, + "learning_rate": 1.3152019002375298e-05, + "loss": 0.2728, + "step": 3820 + }, + { + "epoch": 0.8176571184266781, + "grad_norm": 4.766973972320557, + "learning_rate": 1.3140142517814727e-05, + "loss": 0.4181, + "step": 3825 + }, + { + "epoch": 0.8187259512612227, + "grad_norm": 5.202324390411377, + "learning_rate": 1.3128266033254157e-05, + "loss": 0.3351, + "step": 3830 + }, + { + "epoch": 0.8197947840957674, + "grad_norm": 3.472627878189087, + "learning_rate": 1.311638954869359e-05, + "loss": 0.2646, + "step": 3835 + }, + { + "epoch": 0.8208636169303121, + "grad_norm": 4.589137554168701, + "learning_rate": 1.3104513064133017e-05, + "loss": 0.2628, + "step": 3840 + }, + { + "epoch": 0.8219324497648568, + "grad_norm": 3.9725475311279297, + "learning_rate": 1.3092636579572449e-05, + "loss": 0.2747, + "step": 3845 + }, + { + "epoch": 0.8230012825994014, + "grad_norm": 3.832432985305786, + "learning_rate": 1.3080760095011878e-05, + "loss": 0.2253, + "step": 3850 + }, + { + "epoch": 0.8240701154339461, + "grad_norm": 4.213531494140625, + "learning_rate": 1.3068883610451308e-05, + "loss": 0.2741, + "step": 3855 + }, + { + "epoch": 0.8251389482684908, + "grad_norm": 6.430481910705566, + "learning_rate": 1.3057007125890737e-05, + "loss": 0.3982, + "step": 3860 + }, + { + "epoch": 0.8262077811030355, + "grad_norm": 2.416151762008667, + "learning_rate": 1.3045130641330168e-05, + "loss": 0.3014, + "step": 3865 + }, + { + "epoch": 0.8272766139375801, + "grad_norm": 4.334439754486084, + "learning_rate": 1.3033254156769597e-05, + "loss": 0.2696, + "step": 3870 + }, + { + "epoch": 0.8283454467721248, + "grad_norm": 3.599234104156494, + "learning_rate": 1.3021377672209027e-05, + "loss": 0.2607, + "step": 3875 + }, + { + "epoch": 0.8294142796066695, + "grad_norm": 4.65981388092041, + "learning_rate": 1.3009501187648456e-05, + "loss": 0.3154, + "step": 3880 + }, + { + "epoch": 0.8304831124412142, + "grad_norm": 5.147418975830078, + "learning_rate": 1.2997624703087888e-05, + "loss": 0.3275, + "step": 3885 + }, + { + "epoch": 0.8315519452757588, + "grad_norm": 4.910894870758057, + "learning_rate": 1.2985748218527317e-05, + "loss": 0.274, + "step": 3890 + }, + { + "epoch": 0.8326207781103035, + "grad_norm": 3.3270483016967773, + "learning_rate": 1.2973871733966748e-05, + "loss": 0.3042, + "step": 3895 + }, + { + "epoch": 0.8336896109448483, + "grad_norm": 5.005611419677734, + "learning_rate": 1.2961995249406177e-05, + "loss": 0.2692, + "step": 3900 + }, + { + "epoch": 0.8347584437793929, + "grad_norm": 3.320770263671875, + "learning_rate": 1.2950118764845607e-05, + "loss": 0.2505, + "step": 3905 + }, + { + "epoch": 0.8358272766139376, + "grad_norm": 4.788522720336914, + "learning_rate": 1.2938242280285036e-05, + "loss": 0.3762, + "step": 3910 + }, + { + "epoch": 0.8368961094484823, + "grad_norm": 5.107404708862305, + "learning_rate": 1.2926365795724467e-05, + "loss": 0.2467, + "step": 3915 + }, + { + "epoch": 0.837964942283027, + "grad_norm": 3.5440781116485596, + "learning_rate": 1.2914489311163895e-05, + "loss": 0.2227, + "step": 3920 + }, + { + "epoch": 0.8390337751175716, + "grad_norm": 5.089791774749756, + "learning_rate": 1.2902612826603326e-05, + "loss": 0.2513, + "step": 3925 + }, + { + "epoch": 0.8401026079521163, + "grad_norm": 5.978660583496094, + "learning_rate": 1.2890736342042755e-05, + "loss": 0.313, + "step": 3930 + }, + { + "epoch": 0.841171440786661, + "grad_norm": 4.347848415374756, + "learning_rate": 1.2878859857482187e-05, + "loss": 0.265, + "step": 3935 + }, + { + "epoch": 0.8422402736212057, + "grad_norm": 5.038461208343506, + "learning_rate": 1.2866983372921616e-05, + "loss": 0.2839, + "step": 3940 + }, + { + "epoch": 0.8433091064557503, + "grad_norm": 4.367410659790039, + "learning_rate": 1.2855106888361046e-05, + "loss": 0.3432, + "step": 3945 + }, + { + "epoch": 0.844377939290295, + "grad_norm": 4.267697334289551, + "learning_rate": 1.2843230403800475e-05, + "loss": 0.2168, + "step": 3950 + }, + { + "epoch": 0.8454467721248397, + "grad_norm": 4.99351167678833, + "learning_rate": 1.2831353919239906e-05, + "loss": 0.3083, + "step": 3955 + }, + { + "epoch": 0.8465156049593844, + "grad_norm": 3.725167751312256, + "learning_rate": 1.2819477434679335e-05, + "loss": 0.3362, + "step": 3960 + }, + { + "epoch": 0.847584437793929, + "grad_norm": 4.825465679168701, + "learning_rate": 1.2807600950118765e-05, + "loss": 0.2897, + "step": 3965 + }, + { + "epoch": 0.8486532706284737, + "grad_norm": 4.231856822967529, + "learning_rate": 1.2795724465558198e-05, + "loss": 0.299, + "step": 3970 + }, + { + "epoch": 0.8497221034630184, + "grad_norm": 3.8439395427703857, + "learning_rate": 1.2783847980997626e-05, + "loss": 0.3421, + "step": 3975 + }, + { + "epoch": 0.8507909362975631, + "grad_norm": 4.338144779205322, + "learning_rate": 1.2771971496437057e-05, + "loss": 0.2886, + "step": 3980 + }, + { + "epoch": 0.8518597691321077, + "grad_norm": 5.123786449432373, + "learning_rate": 1.2760095011876486e-05, + "loss": 0.3563, + "step": 3985 + }, + { + "epoch": 0.8529286019666524, + "grad_norm": 5.506287574768066, + "learning_rate": 1.2748218527315916e-05, + "loss": 0.3204, + "step": 3990 + }, + { + "epoch": 0.853997434801197, + "grad_norm": 3.644973039627075, + "learning_rate": 1.2736342042755345e-05, + "loss": 0.3025, + "step": 3995 + }, + { + "epoch": 0.8550662676357418, + "grad_norm": 5.109133720397949, + "learning_rate": 1.2724465558194776e-05, + "loss": 0.2813, + "step": 4000 + }, + { + "epoch": 0.8561351004702864, + "grad_norm": 5.544173717498779, + "learning_rate": 1.2712589073634205e-05, + "loss": 0.2787, + "step": 4005 + }, + { + "epoch": 0.8572039333048311, + "grad_norm": 5.382670879364014, + "learning_rate": 1.2700712589073637e-05, + "loss": 0.2643, + "step": 4010 + }, + { + "epoch": 0.8582727661393758, + "grad_norm": 5.406363010406494, + "learning_rate": 1.2688836104513064e-05, + "loss": 0.291, + "step": 4015 + }, + { + "epoch": 0.8593415989739205, + "grad_norm": 3.5062954425811768, + "learning_rate": 1.2676959619952496e-05, + "loss": 0.2467, + "step": 4020 + }, + { + "epoch": 0.8604104318084651, + "grad_norm": 5.817686080932617, + "learning_rate": 1.2665083135391925e-05, + "loss": 0.3489, + "step": 4025 + }, + { + "epoch": 0.8614792646430098, + "grad_norm": 3.931792974472046, + "learning_rate": 1.2653206650831356e-05, + "loss": 0.2613, + "step": 4030 + }, + { + "epoch": 0.8625480974775546, + "grad_norm": 4.279338359832764, + "learning_rate": 1.2641330166270785e-05, + "loss": 0.3007, + "step": 4035 + }, + { + "epoch": 0.8636169303120992, + "grad_norm": 3.9646289348602295, + "learning_rate": 1.2629453681710215e-05, + "loss": 0.2685, + "step": 4040 + }, + { + "epoch": 0.8646857631466439, + "grad_norm": 5.029911518096924, + "learning_rate": 1.2617577197149644e-05, + "loss": 0.2984, + "step": 4045 + }, + { + "epoch": 0.8657545959811885, + "grad_norm": 4.78744649887085, + "learning_rate": 1.2605700712589074e-05, + "loss": 0.2321, + "step": 4050 + }, + { + "epoch": 0.8668234288157333, + "grad_norm": 3.825188636779785, + "learning_rate": 1.2593824228028503e-05, + "loss": 0.2417, + "step": 4055 + }, + { + "epoch": 0.8678922616502779, + "grad_norm": 4.478353500366211, + "learning_rate": 1.2581947743467936e-05, + "loss": 0.3164, + "step": 4060 + }, + { + "epoch": 0.8689610944848226, + "grad_norm": 5.523867607116699, + "learning_rate": 1.2570071258907364e-05, + "loss": 0.3769, + "step": 4065 + }, + { + "epoch": 0.8700299273193672, + "grad_norm": 6.190155506134033, + "learning_rate": 1.2558194774346795e-05, + "loss": 0.3385, + "step": 4070 + }, + { + "epoch": 0.871098760153912, + "grad_norm": 4.058770179748535, + "learning_rate": 1.2546318289786224e-05, + "loss": 0.3135, + "step": 4075 + }, + { + "epoch": 0.8721675929884566, + "grad_norm": 5.607039928436279, + "learning_rate": 1.2534441805225654e-05, + "loss": 0.3295, + "step": 4080 + }, + { + "epoch": 0.8732364258230013, + "grad_norm": 4.902414321899414, + "learning_rate": 1.2522565320665083e-05, + "loss": 0.2992, + "step": 4085 + }, + { + "epoch": 0.8743052586575459, + "grad_norm": 4.188961505889893, + "learning_rate": 1.2510688836104514e-05, + "loss": 0.2723, + "step": 4090 + }, + { + "epoch": 0.8753740914920907, + "grad_norm": 4.536145210266113, + "learning_rate": 1.2498812351543943e-05, + "loss": 0.2805, + "step": 4095 + }, + { + "epoch": 0.8764429243266353, + "grad_norm": 3.7727832794189453, + "learning_rate": 1.2486935866983373e-05, + "loss": 0.2171, + "step": 4100 + }, + { + "epoch": 0.87751175716118, + "grad_norm": 4.528228759765625, + "learning_rate": 1.2475059382422802e-05, + "loss": 0.2618, + "step": 4105 + }, + { + "epoch": 0.8785805899957246, + "grad_norm": 4.920950412750244, + "learning_rate": 1.2463182897862234e-05, + "loss": 0.2994, + "step": 4110 + }, + { + "epoch": 0.8796494228302694, + "grad_norm": 4.851797580718994, + "learning_rate": 1.2451306413301665e-05, + "loss": 0.2866, + "step": 4115 + }, + { + "epoch": 0.880718255664814, + "grad_norm": 3.021509885787964, + "learning_rate": 1.2439429928741094e-05, + "loss": 0.2091, + "step": 4120 + }, + { + "epoch": 0.8817870884993587, + "grad_norm": 5.19913911819458, + "learning_rate": 1.2427553444180524e-05, + "loss": 0.3285, + "step": 4125 + }, + { + "epoch": 0.8828559213339033, + "grad_norm": 4.311760902404785, + "learning_rate": 1.2415676959619953e-05, + "loss": 0.2854, + "step": 4130 + }, + { + "epoch": 0.8839247541684481, + "grad_norm": 5.5093994140625, + "learning_rate": 1.2403800475059384e-05, + "loss": 0.3004, + "step": 4135 + }, + { + "epoch": 0.8849935870029927, + "grad_norm": 3.5908706188201904, + "learning_rate": 1.2391923990498813e-05, + "loss": 0.2335, + "step": 4140 + }, + { + "epoch": 0.8860624198375374, + "grad_norm": 3.561647653579712, + "learning_rate": 1.2380047505938245e-05, + "loss": 0.2919, + "step": 4145 + }, + { + "epoch": 0.887131252672082, + "grad_norm": 3.1781160831451416, + "learning_rate": 1.2368171021377674e-05, + "loss": 0.2786, + "step": 4150 + }, + { + "epoch": 0.8882000855066268, + "grad_norm": 4.471413612365723, + "learning_rate": 1.2356294536817104e-05, + "loss": 0.3312, + "step": 4155 + }, + { + "epoch": 0.8892689183411714, + "grad_norm": 5.232965469360352, + "learning_rate": 1.2344418052256533e-05, + "loss": 0.2677, + "step": 4160 + }, + { + "epoch": 0.8903377511757161, + "grad_norm": 4.883133888244629, + "learning_rate": 1.2332541567695964e-05, + "loss": 0.2774, + "step": 4165 + }, + { + "epoch": 0.8914065840102608, + "grad_norm": 4.092249393463135, + "learning_rate": 1.2320665083135392e-05, + "loss": 0.2649, + "step": 4170 + }, + { + "epoch": 0.8924754168448055, + "grad_norm": 3.5607283115386963, + "learning_rate": 1.2308788598574823e-05, + "loss": 0.3119, + "step": 4175 + }, + { + "epoch": 0.8935442496793502, + "grad_norm": 4.573966026306152, + "learning_rate": 1.2296912114014252e-05, + "loss": 0.243, + "step": 4180 + }, + { + "epoch": 0.8946130825138948, + "grad_norm": 4.2962775230407715, + "learning_rate": 1.2285035629453684e-05, + "loss": 0.292, + "step": 4185 + }, + { + "epoch": 0.8956819153484396, + "grad_norm": 4.585544109344482, + "learning_rate": 1.2273159144893111e-05, + "loss": 0.352, + "step": 4190 + }, + { + "epoch": 0.8967507481829842, + "grad_norm": 4.529600143432617, + "learning_rate": 1.2261282660332543e-05, + "loss": 0.2422, + "step": 4195 + }, + { + "epoch": 0.8978195810175289, + "grad_norm": 2.9587581157684326, + "learning_rate": 1.2249406175771972e-05, + "loss": 0.2427, + "step": 4200 + }, + { + "epoch": 0.8988884138520735, + "grad_norm": 4.409660339355469, + "learning_rate": 1.2237529691211403e-05, + "loss": 0.2246, + "step": 4205 + }, + { + "epoch": 0.8999572466866183, + "grad_norm": 3.328666925430298, + "learning_rate": 1.2225653206650832e-05, + "loss": 0.2275, + "step": 4210 + }, + { + "epoch": 0.9010260795211629, + "grad_norm": 4.411447048187256, + "learning_rate": 1.2213776722090262e-05, + "loss": 0.3766, + "step": 4215 + }, + { + "epoch": 0.9020949123557076, + "grad_norm": 3.3779454231262207, + "learning_rate": 1.2201900237529691e-05, + "loss": 0.2748, + "step": 4220 + }, + { + "epoch": 0.9031637451902522, + "grad_norm": 5.558443069458008, + "learning_rate": 1.2190023752969122e-05, + "loss": 0.2941, + "step": 4225 + }, + { + "epoch": 0.904232578024797, + "grad_norm": 3.7313380241394043, + "learning_rate": 1.217814726840855e-05, + "loss": 0.2693, + "step": 4230 + }, + { + "epoch": 0.9053014108593416, + "grad_norm": 3.5401077270507812, + "learning_rate": 1.2166270783847983e-05, + "loss": 0.3058, + "step": 4235 + }, + { + "epoch": 0.9063702436938863, + "grad_norm": 3.6305854320526123, + "learning_rate": 1.2154394299287412e-05, + "loss": 0.2167, + "step": 4240 + }, + { + "epoch": 0.9074390765284309, + "grad_norm": 4.208883285522461, + "learning_rate": 1.2142517814726842e-05, + "loss": 0.2371, + "step": 4245 + }, + { + "epoch": 0.9085079093629757, + "grad_norm": 4.586354732513428, + "learning_rate": 1.2130641330166273e-05, + "loss": 0.2199, + "step": 4250 + }, + { + "epoch": 0.9095767421975203, + "grad_norm": 3.673724889755249, + "learning_rate": 1.2118764845605702e-05, + "loss": 0.2803, + "step": 4255 + }, + { + "epoch": 0.910645575032065, + "grad_norm": 4.0301337242126465, + "learning_rate": 1.2106888361045132e-05, + "loss": 0.2765, + "step": 4260 + }, + { + "epoch": 0.9117144078666096, + "grad_norm": 4.114202976226807, + "learning_rate": 1.2095011876484561e-05, + "loss": 0.2734, + "step": 4265 + }, + { + "epoch": 0.9127832407011544, + "grad_norm": 6.415131568908691, + "learning_rate": 1.2083135391923993e-05, + "loss": 0.3345, + "step": 4270 + }, + { + "epoch": 0.913852073535699, + "grad_norm": 4.800512790679932, + "learning_rate": 1.207125890736342e-05, + "loss": 0.2873, + "step": 4275 + }, + { + "epoch": 0.9149209063702437, + "grad_norm": 4.536464214324951, + "learning_rate": 1.2059382422802853e-05, + "loss": 0.2506, + "step": 4280 + }, + { + "epoch": 0.9159897392047883, + "grad_norm": 4.594064235687256, + "learning_rate": 1.2047505938242281e-05, + "loss": 0.2335, + "step": 4285 + }, + { + "epoch": 0.917058572039333, + "grad_norm": 5.493027687072754, + "learning_rate": 1.2035629453681712e-05, + "loss": 0.3218, + "step": 4290 + }, + { + "epoch": 0.9181274048738777, + "grad_norm": 4.560657501220703, + "learning_rate": 1.2023752969121141e-05, + "loss": 0.2971, + "step": 4295 + }, + { + "epoch": 0.9191962377084224, + "grad_norm": 3.5777430534362793, + "learning_rate": 1.2011876484560571e-05, + "loss": 0.2296, + "step": 4300 + }, + { + "epoch": 0.9202650705429671, + "grad_norm": 4.112082481384277, + "learning_rate": 1.2e-05, + "loss": 0.3087, + "step": 4305 + }, + { + "epoch": 0.9213339033775118, + "grad_norm": 3.815093994140625, + "learning_rate": 1.1988123515439431e-05, + "loss": 0.3353, + "step": 4310 + }, + { + "epoch": 0.9224027362120565, + "grad_norm": 5.078567028045654, + "learning_rate": 1.197624703087886e-05, + "loss": 0.3046, + "step": 4315 + }, + { + "epoch": 0.9234715690466011, + "grad_norm": 3.549429178237915, + "learning_rate": 1.1964370546318292e-05, + "loss": 0.3431, + "step": 4320 + }, + { + "epoch": 0.9245404018811458, + "grad_norm": 4.466531276702881, + "learning_rate": 1.195249406175772e-05, + "loss": 0.2707, + "step": 4325 + }, + { + "epoch": 0.9256092347156905, + "grad_norm": 5.423553943634033, + "learning_rate": 1.1940617577197151e-05, + "loss": 0.284, + "step": 4330 + }, + { + "epoch": 0.9266780675502352, + "grad_norm": 4.436051845550537, + "learning_rate": 1.192874109263658e-05, + "loss": 0.2714, + "step": 4335 + }, + { + "epoch": 0.9277469003847798, + "grad_norm": 4.404295444488525, + "learning_rate": 1.191686460807601e-05, + "loss": 0.2751, + "step": 4340 + }, + { + "epoch": 0.9288157332193245, + "grad_norm": 4.390391826629639, + "learning_rate": 1.190498812351544e-05, + "loss": 0.3047, + "step": 4345 + }, + { + "epoch": 0.9298845660538692, + "grad_norm": 4.6937479972839355, + "learning_rate": 1.189311163895487e-05, + "loss": 0.2867, + "step": 4350 + }, + { + "epoch": 0.9309533988884139, + "grad_norm": 4.352549076080322, + "learning_rate": 1.1881235154394299e-05, + "loss": 0.2895, + "step": 4355 + }, + { + "epoch": 0.9320222317229585, + "grad_norm": 4.013473033905029, + "learning_rate": 1.186935866983373e-05, + "loss": 0.2749, + "step": 4360 + }, + { + "epoch": 0.9330910645575032, + "grad_norm": 3.603860378265381, + "learning_rate": 1.1857482185273158e-05, + "loss": 0.2549, + "step": 4365 + }, + { + "epoch": 0.9341598973920479, + "grad_norm": 5.079062461853027, + "learning_rate": 1.184560570071259e-05, + "loss": 0.2648, + "step": 4370 + }, + { + "epoch": 0.9352287302265926, + "grad_norm": 6.029326438903809, + "learning_rate": 1.183372921615202e-05, + "loss": 0.3001, + "step": 4375 + }, + { + "epoch": 0.9362975630611372, + "grad_norm": 4.8559041023254395, + "learning_rate": 1.182185273159145e-05, + "loss": 0.3198, + "step": 4380 + }, + { + "epoch": 0.9373663958956819, + "grad_norm": 4.295980453491211, + "learning_rate": 1.1809976247030879e-05, + "loss": 0.2583, + "step": 4385 + }, + { + "epoch": 0.9384352287302266, + "grad_norm": 6.648914337158203, + "learning_rate": 1.179809976247031e-05, + "loss": 0.2894, + "step": 4390 + }, + { + "epoch": 0.9395040615647713, + "grad_norm": 5.454647064208984, + "learning_rate": 1.178622327790974e-05, + "loss": 0.3017, + "step": 4395 + }, + { + "epoch": 0.9405728943993159, + "grad_norm": 5.520369529724121, + "learning_rate": 1.1774346793349169e-05, + "loss": 0.2754, + "step": 4400 + }, + { + "epoch": 0.9416417272338606, + "grad_norm": 3.847935914993286, + "learning_rate": 1.1762470308788601e-05, + "loss": 0.3289, + "step": 4405 + }, + { + "epoch": 0.9427105600684053, + "grad_norm": 4.063333988189697, + "learning_rate": 1.175059382422803e-05, + "loss": 0.2787, + "step": 4410 + }, + { + "epoch": 0.94377939290295, + "grad_norm": 4.977645397186279, + "learning_rate": 1.173871733966746e-05, + "loss": 0.2406, + "step": 4415 + }, + { + "epoch": 0.9448482257374946, + "grad_norm": 4.375988483428955, + "learning_rate": 1.172684085510689e-05, + "loss": 0.3144, + "step": 4420 + }, + { + "epoch": 0.9459170585720393, + "grad_norm": 4.656064987182617, + "learning_rate": 1.171496437054632e-05, + "loss": 0.3237, + "step": 4425 + }, + { + "epoch": 0.946985891406584, + "grad_norm": 4.027129650115967, + "learning_rate": 1.1703087885985749e-05, + "loss": 0.2641, + "step": 4430 + }, + { + "epoch": 0.9480547242411287, + "grad_norm": 4.126834869384766, + "learning_rate": 1.169121140142518e-05, + "loss": 0.2875, + "step": 4435 + }, + { + "epoch": 0.9491235570756734, + "grad_norm": 3.4707841873168945, + "learning_rate": 1.1679334916864608e-05, + "loss": 0.3211, + "step": 4440 + }, + { + "epoch": 0.950192389910218, + "grad_norm": 2.8617501258850098, + "learning_rate": 1.166745843230404e-05, + "loss": 0.2403, + "step": 4445 + }, + { + "epoch": 0.9512612227447628, + "grad_norm": 4.50408935546875, + "learning_rate": 1.1655581947743468e-05, + "loss": 0.3018, + "step": 4450 + }, + { + "epoch": 0.9523300555793074, + "grad_norm": 3.976015329360962, + "learning_rate": 1.16437054631829e-05, + "loss": 0.2531, + "step": 4455 + }, + { + "epoch": 0.9533988884138521, + "grad_norm": 6.214652061462402, + "learning_rate": 1.1631828978622329e-05, + "loss": 0.349, + "step": 4460 + }, + { + "epoch": 0.9544677212483967, + "grad_norm": 3.969996929168701, + "learning_rate": 1.161995249406176e-05, + "loss": 0.238, + "step": 4465 + }, + { + "epoch": 0.9555365540829415, + "grad_norm": 3.9902470111846924, + "learning_rate": 1.1608076009501188e-05, + "loss": 0.2768, + "step": 4470 + }, + { + "epoch": 0.9566053869174861, + "grad_norm": 4.20414924621582, + "learning_rate": 1.1596199524940619e-05, + "loss": 0.2944, + "step": 4475 + }, + { + "epoch": 0.9576742197520308, + "grad_norm": 3.5199337005615234, + "learning_rate": 1.1584323040380048e-05, + "loss": 0.3043, + "step": 4480 + }, + { + "epoch": 0.9587430525865754, + "grad_norm": 3.7765684127807617, + "learning_rate": 1.1572446555819478e-05, + "loss": 0.2434, + "step": 4485 + }, + { + "epoch": 0.9598118854211202, + "grad_norm": 3.9338152408599854, + "learning_rate": 1.1560570071258907e-05, + "loss": 0.2451, + "step": 4490 + }, + { + "epoch": 0.9608807182556648, + "grad_norm": 2.86897873878479, + "learning_rate": 1.154869358669834e-05, + "loss": 0.213, + "step": 4495 + }, + { + "epoch": 0.9619495510902095, + "grad_norm": 4.536627292633057, + "learning_rate": 1.1536817102137768e-05, + "loss": 0.26, + "step": 4500 + }, + { + "epoch": 0.9630183839247541, + "grad_norm": 5.863621234893799, + "learning_rate": 1.1524940617577199e-05, + "loss": 0.3173, + "step": 4505 + }, + { + "epoch": 0.9640872167592989, + "grad_norm": 5.156888008117676, + "learning_rate": 1.1513064133016627e-05, + "loss": 0.2745, + "step": 4510 + }, + { + "epoch": 0.9651560495938435, + "grad_norm": 3.947845220565796, + "learning_rate": 1.1501187648456058e-05, + "loss": 0.2824, + "step": 4515 + }, + { + "epoch": 0.9662248824283882, + "grad_norm": 3.6855573654174805, + "learning_rate": 1.1489311163895487e-05, + "loss": 0.2769, + "step": 4520 + }, + { + "epoch": 0.9672937152629328, + "grad_norm": 3.929898977279663, + "learning_rate": 1.1477434679334917e-05, + "loss": 0.2464, + "step": 4525 + }, + { + "epoch": 0.9683625480974776, + "grad_norm": 3.9288270473480225, + "learning_rate": 1.146555819477435e-05, + "loss": 0.3213, + "step": 4530 + }, + { + "epoch": 0.9694313809320222, + "grad_norm": 5.536011219024658, + "learning_rate": 1.1453681710213777e-05, + "loss": 0.3606, + "step": 4535 + }, + { + "epoch": 0.9705002137665669, + "grad_norm": 3.3420379161834717, + "learning_rate": 1.1441805225653209e-05, + "loss": 0.2183, + "step": 4540 + }, + { + "epoch": 0.9715690466011115, + "grad_norm": 3.492932081222534, + "learning_rate": 1.1429928741092638e-05, + "loss": 0.2567, + "step": 4545 + }, + { + "epoch": 0.9726378794356563, + "grad_norm": 5.132521629333496, + "learning_rate": 1.1418052256532068e-05, + "loss": 0.2521, + "step": 4550 + }, + { + "epoch": 0.9737067122702009, + "grad_norm": 4.512472152709961, + "learning_rate": 1.1406175771971497e-05, + "loss": 0.2696, + "step": 4555 + }, + { + "epoch": 0.9747755451047456, + "grad_norm": 5.246362686157227, + "learning_rate": 1.1394299287410928e-05, + "loss": 0.3409, + "step": 4560 + }, + { + "epoch": 0.9758443779392902, + "grad_norm": 4.033038139343262, + "learning_rate": 1.1382422802850357e-05, + "loss": 0.2732, + "step": 4565 + }, + { + "epoch": 0.976913210773835, + "grad_norm": 4.162726879119873, + "learning_rate": 1.1370546318289787e-05, + "loss": 0.3003, + "step": 4570 + }, + { + "epoch": 0.9779820436083797, + "grad_norm": 5.6553730964660645, + "learning_rate": 1.1358669833729216e-05, + "loss": 0.3426, + "step": 4575 + }, + { + "epoch": 0.9790508764429243, + "grad_norm": 3.857776403427124, + "learning_rate": 1.1346793349168648e-05, + "loss": 0.2873, + "step": 4580 + }, + { + "epoch": 0.980119709277469, + "grad_norm": 4.109443187713623, + "learning_rate": 1.1334916864608077e-05, + "loss": 0.3, + "step": 4585 + }, + { + "epoch": 0.9811885421120137, + "grad_norm": 3.3073673248291016, + "learning_rate": 1.1323040380047508e-05, + "loss": 0.2074, + "step": 4590 + }, + { + "epoch": 0.9822573749465584, + "grad_norm": 3.0706233978271484, + "learning_rate": 1.1311163895486937e-05, + "loss": 0.2521, + "step": 4595 + }, + { + "epoch": 0.983326207781103, + "grad_norm": 5.8296356201171875, + "learning_rate": 1.1299287410926367e-05, + "loss": 0.3123, + "step": 4600 + }, + { + "epoch": 0.9843950406156478, + "grad_norm": 3.409862995147705, + "learning_rate": 1.1287410926365796e-05, + "loss": 0.2492, + "step": 4605 + }, + { + "epoch": 0.9854638734501924, + "grad_norm": 5.090631008148193, + "learning_rate": 1.1275534441805227e-05, + "loss": 0.3012, + "step": 4610 + }, + { + "epoch": 0.9865327062847371, + "grad_norm": 6.443350315093994, + "learning_rate": 1.1263657957244655e-05, + "loss": 0.2516, + "step": 4615 + }, + { + "epoch": 0.9876015391192817, + "grad_norm": 4.340301513671875, + "learning_rate": 1.1251781472684088e-05, + "loss": 0.3629, + "step": 4620 + }, + { + "epoch": 0.9886703719538265, + "grad_norm": 4.117158889770508, + "learning_rate": 1.1239904988123515e-05, + "loss": 0.2484, + "step": 4625 + }, + { + "epoch": 0.9897392047883711, + "grad_norm": 4.39588737487793, + "learning_rate": 1.1228028503562947e-05, + "loss": 0.2749, + "step": 4630 + }, + { + "epoch": 0.9908080376229158, + "grad_norm": 4.059388637542725, + "learning_rate": 1.1216152019002376e-05, + "loss": 0.2064, + "step": 4635 + }, + { + "epoch": 0.9918768704574604, + "grad_norm": 3.4412331581115723, + "learning_rate": 1.1204275534441806e-05, + "loss": 0.3089, + "step": 4640 + }, + { + "epoch": 0.9929457032920052, + "grad_norm": 4.691385746002197, + "learning_rate": 1.1192399049881235e-05, + "loss": 0.3145, + "step": 4645 + }, + { + "epoch": 0.9940145361265498, + "grad_norm": 3.472172737121582, + "learning_rate": 1.1180522565320666e-05, + "loss": 0.2357, + "step": 4650 + }, + { + "epoch": 0.9950833689610945, + "grad_norm": 4.1867289543151855, + "learning_rate": 1.1168646080760095e-05, + "loss": 0.2803, + "step": 4655 + }, + { + "epoch": 0.9961522017956391, + "grad_norm": 4.0518083572387695, + "learning_rate": 1.1156769596199525e-05, + "loss": 0.2437, + "step": 4660 + }, + { + "epoch": 0.9972210346301839, + "grad_norm": 3.507197141647339, + "learning_rate": 1.1144893111638954e-05, + "loss": 0.2708, + "step": 4665 + }, + { + "epoch": 0.9982898674647285, + "grad_norm": 5.1572585105896, + "learning_rate": 1.1133016627078386e-05, + "loss": 0.253, + "step": 4670 + }, + { + "epoch": 0.9993587002992732, + "grad_norm": 4.823436737060547, + "learning_rate": 1.1121140142517817e-05, + "loss": 0.2219, + "step": 4675 + }, + { + "epoch": 1.0, + "eval_loss": 0.1271175593137741, + "eval_mrr": 0.9770190895741555, + "eval_runtime": 313.9716, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 0.905, + "step": 4678 + }, + { + "epoch": 1.000427533133818, + "grad_norm": 5.583497047424316, + "learning_rate": 1.1109263657957246e-05, + "loss": 0.2621, + "step": 4680 + }, + { + "epoch": 1.0014963659683624, + "grad_norm": 4.658013343811035, + "learning_rate": 1.1097387173396676e-05, + "loss": 0.382, + "step": 4685 + }, + { + "epoch": 1.0025651988029072, + "grad_norm": 3.0044312477111816, + "learning_rate": 1.1085510688836105e-05, + "loss": 0.3026, + "step": 4690 + }, + { + "epoch": 1.003634031637452, + "grad_norm": 4.063423156738281, + "learning_rate": 1.1073634204275536e-05, + "loss": 0.3643, + "step": 4695 + }, + { + "epoch": 1.0047028644719966, + "grad_norm": 4.625239372253418, + "learning_rate": 1.1061757719714965e-05, + "loss": 0.382, + "step": 4700 + }, + { + "epoch": 1.0057716973065411, + "grad_norm": 3.8251540660858154, + "learning_rate": 1.1049881235154397e-05, + "loss": 0.3082, + "step": 4705 + }, + { + "epoch": 1.0068405301410859, + "grad_norm": 4.241628170013428, + "learning_rate": 1.1038004750593824e-05, + "loss": 0.3411, + "step": 4710 + }, + { + "epoch": 1.0079093629756306, + "grad_norm": 5.6527276039123535, + "learning_rate": 1.1026128266033256e-05, + "loss": 0.317, + "step": 4715 + }, + { + "epoch": 1.0089781958101753, + "grad_norm": 5.0404052734375, + "learning_rate": 1.1014251781472685e-05, + "loss": 0.4396, + "step": 4720 + }, + { + "epoch": 1.01004702864472, + "grad_norm": 4.585846900939941, + "learning_rate": 1.1002375296912116e-05, + "loss": 0.4034, + "step": 4725 + }, + { + "epoch": 1.0111158614792646, + "grad_norm": 4.704357624053955, + "learning_rate": 1.0990498812351544e-05, + "loss": 0.2875, + "step": 4730 + }, + { + "epoch": 1.0121846943138093, + "grad_norm": 5.956788063049316, + "learning_rate": 1.0978622327790975e-05, + "loss": 0.4919, + "step": 4735 + }, + { + "epoch": 1.013253527148354, + "grad_norm": 4.240102291107178, + "learning_rate": 1.0966745843230404e-05, + "loss": 0.3118, + "step": 4740 + }, + { + "epoch": 1.0143223599828988, + "grad_norm": 4.7897515296936035, + "learning_rate": 1.0954869358669834e-05, + "loss": 0.3976, + "step": 4745 + }, + { + "epoch": 1.0153911928174433, + "grad_norm": 3.1631078720092773, + "learning_rate": 1.0942992874109263e-05, + "loss": 0.2919, + "step": 4750 + }, + { + "epoch": 1.016460025651988, + "grad_norm": 4.258396148681641, + "learning_rate": 1.0931116389548696e-05, + "loss": 0.5247, + "step": 4755 + }, + { + "epoch": 1.0175288584865327, + "grad_norm": 3.010542392730713, + "learning_rate": 1.0919239904988124e-05, + "loss": 0.2126, + "step": 4760 + }, + { + "epoch": 1.0185976913210775, + "grad_norm": 3.0874409675598145, + "learning_rate": 1.0907363420427555e-05, + "loss": 0.3455, + "step": 4765 + }, + { + "epoch": 1.019666524155622, + "grad_norm": 4.446132183074951, + "learning_rate": 1.0895486935866984e-05, + "loss": 0.3498, + "step": 4770 + }, + { + "epoch": 1.0207353569901667, + "grad_norm": 4.1357502937316895, + "learning_rate": 1.0883610451306414e-05, + "loss": 0.29, + "step": 4775 + }, + { + "epoch": 1.0218041898247114, + "grad_norm": 6.850640296936035, + "learning_rate": 1.0871733966745843e-05, + "loss": 0.3684, + "step": 4780 + }, + { + "epoch": 1.0228730226592562, + "grad_norm": 3.9681396484375, + "learning_rate": 1.0859857482185274e-05, + "loss": 0.3033, + "step": 4785 + }, + { + "epoch": 1.0239418554938007, + "grad_norm": 3.521563768386841, + "learning_rate": 1.0847980997624703e-05, + "loss": 0.2747, + "step": 4790 + }, + { + "epoch": 1.0250106883283454, + "grad_norm": 4.060203552246094, + "learning_rate": 1.0836104513064135e-05, + "loss": 0.2813, + "step": 4795 + }, + { + "epoch": 1.0260795211628901, + "grad_norm": 3.187224864959717, + "learning_rate": 1.0824228028503562e-05, + "loss": 0.2945, + "step": 4800 + }, + { + "epoch": 1.0271483539974349, + "grad_norm": 3.6413896083831787, + "learning_rate": 1.0812351543942994e-05, + "loss": 0.421, + "step": 4805 + }, + { + "epoch": 1.0282171868319794, + "grad_norm": 4.686298847198486, + "learning_rate": 1.0800475059382423e-05, + "loss": 0.3365, + "step": 4810 + }, + { + "epoch": 1.029286019666524, + "grad_norm": 5.890400409698486, + "learning_rate": 1.0788598574821854e-05, + "loss": 0.3708, + "step": 4815 + }, + { + "epoch": 1.0303548525010688, + "grad_norm": 3.566652774810791, + "learning_rate": 1.0776722090261284e-05, + "loss": 0.4345, + "step": 4820 + }, + { + "epoch": 1.0314236853356136, + "grad_norm": 5.6578826904296875, + "learning_rate": 1.0764845605700713e-05, + "loss": 0.3728, + "step": 4825 + }, + { + "epoch": 1.032492518170158, + "grad_norm": 4.193053245544434, + "learning_rate": 1.0752969121140144e-05, + "loss": 0.3173, + "step": 4830 + }, + { + "epoch": 1.0335613510047028, + "grad_norm": 4.646356105804443, + "learning_rate": 1.0741092636579572e-05, + "loss": 0.2574, + "step": 4835 + }, + { + "epoch": 1.0346301838392475, + "grad_norm": 3.941087245941162, + "learning_rate": 1.0729216152019005e-05, + "loss": 0.3128, + "step": 4840 + }, + { + "epoch": 1.0356990166737923, + "grad_norm": 4.5648884773254395, + "learning_rate": 1.0717339667458434e-05, + "loss": 0.2542, + "step": 4845 + }, + { + "epoch": 1.036767849508337, + "grad_norm": 3.661923408508301, + "learning_rate": 1.0705463182897864e-05, + "loss": 0.1649, + "step": 4850 + }, + { + "epoch": 1.0378366823428815, + "grad_norm": 5.052914619445801, + "learning_rate": 1.0693586698337293e-05, + "loss": 0.4091, + "step": 4855 + }, + { + "epoch": 1.0389055151774262, + "grad_norm": 5.769303321838379, + "learning_rate": 1.0681710213776724e-05, + "loss": 0.3553, + "step": 4860 + }, + { + "epoch": 1.039974348011971, + "grad_norm": 8.323318481445312, + "learning_rate": 1.0669833729216152e-05, + "loss": 0.5474, + "step": 4865 + }, + { + "epoch": 1.0410431808465157, + "grad_norm": 5.351403713226318, + "learning_rate": 1.0657957244655583e-05, + "loss": 0.3586, + "step": 4870 + }, + { + "epoch": 1.0421120136810602, + "grad_norm": 3.5083069801330566, + "learning_rate": 1.0646080760095012e-05, + "loss": 0.2589, + "step": 4875 + }, + { + "epoch": 1.043180846515605, + "grad_norm": 3.8574445247650146, + "learning_rate": 1.0634204275534444e-05, + "loss": 0.3143, + "step": 4880 + }, + { + "epoch": 1.0442496793501497, + "grad_norm": 3.950756311416626, + "learning_rate": 1.0622327790973871e-05, + "loss": 0.3248, + "step": 4885 + }, + { + "epoch": 1.0453185121846944, + "grad_norm": 5.606834411621094, + "learning_rate": 1.0610451306413303e-05, + "loss": 0.4631, + "step": 4890 + }, + { + "epoch": 1.046387345019239, + "grad_norm": 4.092567443847656, + "learning_rate": 1.0598574821852732e-05, + "loss": 0.2977, + "step": 4895 + }, + { + "epoch": 1.0474561778537836, + "grad_norm": 5.365922451019287, + "learning_rate": 1.0586698337292163e-05, + "loss": 0.2487, + "step": 4900 + }, + { + "epoch": 1.0485250106883284, + "grad_norm": 5.173450946807861, + "learning_rate": 1.0574821852731592e-05, + "loss": 0.3554, + "step": 4905 + }, + { + "epoch": 1.049593843522873, + "grad_norm": 5.21553373336792, + "learning_rate": 1.0562945368171022e-05, + "loss": 0.3579, + "step": 4910 + }, + { + "epoch": 1.0506626763574176, + "grad_norm": 4.973548889160156, + "learning_rate": 1.0551068883610451e-05, + "loss": 0.3562, + "step": 4915 + }, + { + "epoch": 1.0517315091919623, + "grad_norm": 6.216787815093994, + "learning_rate": 1.0539192399049882e-05, + "loss": 0.4625, + "step": 4920 + }, + { + "epoch": 1.052800342026507, + "grad_norm": 4.355904579162598, + "learning_rate": 1.052731591448931e-05, + "loss": 0.2834, + "step": 4925 + }, + { + "epoch": 1.0538691748610518, + "grad_norm": 4.44107723236084, + "learning_rate": 1.0515439429928743e-05, + "loss": 0.4072, + "step": 4930 + }, + { + "epoch": 1.0549380076955963, + "grad_norm": 5.2141289710998535, + "learning_rate": 1.0503562945368172e-05, + "loss": 0.2831, + "step": 4935 + }, + { + "epoch": 1.056006840530141, + "grad_norm": 4.4729228019714355, + "learning_rate": 1.0491686460807602e-05, + "loss": 0.2642, + "step": 4940 + }, + { + "epoch": 1.0570756733646858, + "grad_norm": 4.615827560424805, + "learning_rate": 1.0479809976247031e-05, + "loss": 0.2887, + "step": 4945 + }, + { + "epoch": 1.0581445061992305, + "grad_norm": 4.060108661651611, + "learning_rate": 1.0467933491686462e-05, + "loss": 0.4394, + "step": 4950 + }, + { + "epoch": 1.059213339033775, + "grad_norm": 3.323357582092285, + "learning_rate": 1.0456057007125892e-05, + "loss": 0.2957, + "step": 4955 + }, + { + "epoch": 1.0602821718683197, + "grad_norm": 3.9010369777679443, + "learning_rate": 1.0444180522565321e-05, + "loss": 0.298, + "step": 4960 + }, + { + "epoch": 1.0613510047028645, + "grad_norm": 4.847980976104736, + "learning_rate": 1.0432304038004753e-05, + "loss": 0.2643, + "step": 4965 + }, + { + "epoch": 1.0624198375374092, + "grad_norm": 4.916622638702393, + "learning_rate": 1.0420427553444182e-05, + "loss": 0.3147, + "step": 4970 + }, + { + "epoch": 1.063488670371954, + "grad_norm": 6.059121131896973, + "learning_rate": 1.0408551068883613e-05, + "loss": 0.3433, + "step": 4975 + }, + { + "epoch": 1.0645575032064984, + "grad_norm": 4.212458610534668, + "learning_rate": 1.0396674584323041e-05, + "loss": 0.2797, + "step": 4980 + }, + { + "epoch": 1.0656263360410432, + "grad_norm": 3.9374332427978516, + "learning_rate": 1.0384798099762472e-05, + "loss": 0.2958, + "step": 4985 + }, + { + "epoch": 1.066695168875588, + "grad_norm": 7.207469940185547, + "learning_rate": 1.0372921615201901e-05, + "loss": 0.3972, + "step": 4990 + }, + { + "epoch": 1.0677640017101326, + "grad_norm": 5.122316360473633, + "learning_rate": 1.0361045130641331e-05, + "loss": 0.3361, + "step": 4995 + }, + { + "epoch": 1.0688328345446771, + "grad_norm": 3.4103052616119385, + "learning_rate": 1.034916864608076e-05, + "loss": 0.3174, + "step": 5000 + }, + { + "epoch": 1.0699016673792219, + "grad_norm": 4.129265308380127, + "learning_rate": 1.033729216152019e-05, + "loss": 0.3346, + "step": 5005 + }, + { + "epoch": 1.0709705002137666, + "grad_norm": 4.027009963989258, + "learning_rate": 1.032541567695962e-05, + "loss": 0.326, + "step": 5010 + }, + { + "epoch": 1.0720393330483113, + "grad_norm": 3.362579822540283, + "learning_rate": 1.0313539192399052e-05, + "loss": 0.2975, + "step": 5015 + }, + { + "epoch": 1.0731081658828558, + "grad_norm": 5.225454330444336, + "learning_rate": 1.030166270783848e-05, + "loss": 0.3303, + "step": 5020 + }, + { + "epoch": 1.0741769987174006, + "grad_norm": 3.756742000579834, + "learning_rate": 1.0289786223277911e-05, + "loss": 0.2285, + "step": 5025 + }, + { + "epoch": 1.0752458315519453, + "grad_norm": 4.867086887359619, + "learning_rate": 1.027790973871734e-05, + "loss": 0.3798, + "step": 5030 + }, + { + "epoch": 1.07631466438649, + "grad_norm": 4.204124927520752, + "learning_rate": 1.026603325415677e-05, + "loss": 0.2882, + "step": 5035 + }, + { + "epoch": 1.0773834972210345, + "grad_norm": 4.995541095733643, + "learning_rate": 1.02541567695962e-05, + "loss": 0.4249, + "step": 5040 + }, + { + "epoch": 1.0784523300555793, + "grad_norm": 4.921726226806641, + "learning_rate": 1.024228028503563e-05, + "loss": 0.4139, + "step": 5045 + }, + { + "epoch": 1.079521162890124, + "grad_norm": 5.5460734367370605, + "learning_rate": 1.0230403800475059e-05, + "loss": 0.4502, + "step": 5050 + }, + { + "epoch": 1.0805899957246687, + "grad_norm": 4.828423023223877, + "learning_rate": 1.0218527315914491e-05, + "loss": 0.3521, + "step": 5055 + }, + { + "epoch": 1.0816588285592132, + "grad_norm": 3.87648344039917, + "learning_rate": 1.0206650831353918e-05, + "loss": 0.342, + "step": 5060 + }, + { + "epoch": 1.082727661393758, + "grad_norm": 4.833287715911865, + "learning_rate": 1.019477434679335e-05, + "loss": 0.294, + "step": 5065 + }, + { + "epoch": 1.0837964942283027, + "grad_norm": 4.559665679931641, + "learning_rate": 1.018289786223278e-05, + "loss": 0.2994, + "step": 5070 + }, + { + "epoch": 1.0848653270628474, + "grad_norm": 4.908376216888428, + "learning_rate": 1.017102137767221e-05, + "loss": 0.3467, + "step": 5075 + }, + { + "epoch": 1.085934159897392, + "grad_norm": 6.717689514160156, + "learning_rate": 1.0159144893111639e-05, + "loss": 0.443, + "step": 5080 + }, + { + "epoch": 1.0870029927319367, + "grad_norm": 3.5398693084716797, + "learning_rate": 1.014726840855107e-05, + "loss": 0.2759, + "step": 5085 + }, + { + "epoch": 1.0880718255664814, + "grad_norm": 4.501621246337891, + "learning_rate": 1.0135391923990498e-05, + "loss": 0.2614, + "step": 5090 + }, + { + "epoch": 1.0891406584010261, + "grad_norm": 5.194151401519775, + "learning_rate": 1.0123515439429929e-05, + "loss": 0.3649, + "step": 5095 + }, + { + "epoch": 1.0902094912355706, + "grad_norm": 4.68430757522583, + "learning_rate": 1.0111638954869361e-05, + "loss": 0.3699, + "step": 5100 + }, + { + "epoch": 1.0912783240701154, + "grad_norm": 6.266822814941406, + "learning_rate": 1.009976247030879e-05, + "loss": 0.3785, + "step": 5105 + }, + { + "epoch": 1.09234715690466, + "grad_norm": 5.040585517883301, + "learning_rate": 1.008788598574822e-05, + "loss": 0.325, + "step": 5110 + }, + { + "epoch": 1.0934159897392048, + "grad_norm": 6.65608024597168, + "learning_rate": 1.007600950118765e-05, + "loss": 0.3954, + "step": 5115 + }, + { + "epoch": 1.0944848225737496, + "grad_norm": 4.3081488609313965, + "learning_rate": 1.006413301662708e-05, + "loss": 0.2887, + "step": 5120 + }, + { + "epoch": 1.095553655408294, + "grad_norm": 3.5742883682250977, + "learning_rate": 1.0052256532066509e-05, + "loss": 0.2565, + "step": 5125 + }, + { + "epoch": 1.0966224882428388, + "grad_norm": 4.272683620452881, + "learning_rate": 1.004038004750594e-05, + "loss": 0.2661, + "step": 5130 + }, + { + "epoch": 1.0976913210773835, + "grad_norm": 3.7064707279205322, + "learning_rate": 1.0028503562945368e-05, + "loss": 0.286, + "step": 5135 + }, + { + "epoch": 1.0987601539119283, + "grad_norm": 6.0004963874816895, + "learning_rate": 1.00166270783848e-05, + "loss": 0.3969, + "step": 5140 + }, + { + "epoch": 1.0998289867464728, + "grad_norm": 6.301701068878174, + "learning_rate": 1.0004750593824228e-05, + "loss": 0.2451, + "step": 5145 + }, + { + "epoch": 1.1008978195810175, + "grad_norm": 5.471083164215088, + "learning_rate": 9.992874109263658e-06, + "loss": 0.2472, + "step": 5150 + }, + { + "epoch": 1.1019666524155622, + "grad_norm": 6.456992149353027, + "learning_rate": 9.980997624703089e-06, + "loss": 0.286, + "step": 5155 + }, + { + "epoch": 1.103035485250107, + "grad_norm": 6.616683006286621, + "learning_rate": 9.969121140142518e-06, + "loss": 0.3109, + "step": 5160 + }, + { + "epoch": 1.1041043180846515, + "grad_norm": 5.748746871948242, + "learning_rate": 9.95724465558195e-06, + "loss": 0.441, + "step": 5165 + }, + { + "epoch": 1.1051731509191962, + "grad_norm": 4.254424571990967, + "learning_rate": 9.945368171021379e-06, + "loss": 0.3494, + "step": 5170 + }, + { + "epoch": 1.106241983753741, + "grad_norm": 6.022365093231201, + "learning_rate": 9.93349168646081e-06, + "loss": 0.345, + "step": 5175 + }, + { + "epoch": 1.1073108165882857, + "grad_norm": 3.26804518699646, + "learning_rate": 9.921615201900238e-06, + "loss": 0.2503, + "step": 5180 + }, + { + "epoch": 1.1083796494228302, + "grad_norm": 3.100945234298706, + "learning_rate": 9.909738717339669e-06, + "loss": 0.3922, + "step": 5185 + }, + { + "epoch": 1.109448482257375, + "grad_norm": 4.631006717681885, + "learning_rate": 9.897862232779099e-06, + "loss": 0.3429, + "step": 5190 + }, + { + "epoch": 1.1105173150919196, + "grad_norm": 4.623953819274902, + "learning_rate": 9.885985748218528e-06, + "loss": 0.3437, + "step": 5195 + }, + { + "epoch": 1.1115861479264644, + "grad_norm": 3.877652406692505, + "learning_rate": 9.874109263657959e-06, + "loss": 0.2751, + "step": 5200 + }, + { + "epoch": 1.1126549807610089, + "grad_norm": 4.4313225746154785, + "learning_rate": 9.862232779097387e-06, + "loss": 0.3634, + "step": 5205 + }, + { + "epoch": 1.1137238135955536, + "grad_norm": 5.426332473754883, + "learning_rate": 9.850356294536818e-06, + "loss": 0.2685, + "step": 5210 + }, + { + "epoch": 1.1147926464300983, + "grad_norm": 3.7707009315490723, + "learning_rate": 9.838479809976248e-06, + "loss": 0.2121, + "step": 5215 + }, + { + "epoch": 1.115861479264643, + "grad_norm": 3.5573911666870117, + "learning_rate": 9.826603325415677e-06, + "loss": 0.4083, + "step": 5220 + }, + { + "epoch": 1.1169303120991876, + "grad_norm": 3.2365455627441406, + "learning_rate": 9.814726840855108e-06, + "loss": 0.2497, + "step": 5225 + }, + { + "epoch": 1.1179991449337323, + "grad_norm": 3.604321241378784, + "learning_rate": 9.802850356294538e-06, + "loss": 0.3521, + "step": 5230 + }, + { + "epoch": 1.119067977768277, + "grad_norm": 4.779599666595459, + "learning_rate": 9.790973871733967e-06, + "loss": 0.3048, + "step": 5235 + }, + { + "epoch": 1.1201368106028218, + "grad_norm": 3.685837745666504, + "learning_rate": 9.779097387173398e-06, + "loss": 0.2622, + "step": 5240 + }, + { + "epoch": 1.1212056434373663, + "grad_norm": 4.687803268432617, + "learning_rate": 9.767220902612827e-06, + "loss": 0.2651, + "step": 5245 + }, + { + "epoch": 1.122274476271911, + "grad_norm": 3.861872911453247, + "learning_rate": 9.755344418052257e-06, + "loss": 0.2971, + "step": 5250 + }, + { + "epoch": 1.1233433091064557, + "grad_norm": 5.285345077514648, + "learning_rate": 9.743467933491688e-06, + "loss": 0.3186, + "step": 5255 + }, + { + "epoch": 1.1244121419410005, + "grad_norm": 4.946507930755615, + "learning_rate": 9.731591448931117e-06, + "loss": 0.3277, + "step": 5260 + }, + { + "epoch": 1.1254809747755452, + "grad_norm": 3.527979612350464, + "learning_rate": 9.719714964370547e-06, + "loss": 0.2521, + "step": 5265 + }, + { + "epoch": 1.1265498076100897, + "grad_norm": 4.42695426940918, + "learning_rate": 9.707838479809976e-06, + "loss": 0.3114, + "step": 5270 + }, + { + "epoch": 1.1276186404446344, + "grad_norm": 2.8614661693573, + "learning_rate": 9.695961995249407e-06, + "loss": 0.3582, + "step": 5275 + }, + { + "epoch": 1.1286874732791792, + "grad_norm": 4.813942909240723, + "learning_rate": 9.684085510688837e-06, + "loss": 0.3617, + "step": 5280 + }, + { + "epoch": 1.129756306113724, + "grad_norm": 4.115063667297363, + "learning_rate": 9.672209026128266e-06, + "loss": 0.328, + "step": 5285 + }, + { + "epoch": 1.1308251389482684, + "grad_norm": 2.9611902236938477, + "learning_rate": 9.660332541567697e-06, + "loss": 0.2627, + "step": 5290 + }, + { + "epoch": 1.1318939717828131, + "grad_norm": 4.242338180541992, + "learning_rate": 9.648456057007125e-06, + "loss": 0.3054, + "step": 5295 + }, + { + "epoch": 1.1329628046173579, + "grad_norm": 3.4355380535125732, + "learning_rate": 9.636579572446556e-06, + "loss": 0.5342, + "step": 5300 + }, + { + "epoch": 1.1340316374519026, + "grad_norm": 3.823155641555786, + "learning_rate": 9.624703087885987e-06, + "loss": 0.2956, + "step": 5305 + }, + { + "epoch": 1.1351004702864471, + "grad_norm": 3.815985679626465, + "learning_rate": 9.612826603325417e-06, + "loss": 0.3008, + "step": 5310 + }, + { + "epoch": 1.1361693031209918, + "grad_norm": 6.562064170837402, + "learning_rate": 9.600950118764848e-06, + "loss": 0.4844, + "step": 5315 + }, + { + "epoch": 1.1372381359555366, + "grad_norm": 4.454050540924072, + "learning_rate": 9.589073634204276e-06, + "loss": 0.2941, + "step": 5320 + }, + { + "epoch": 1.1383069687900813, + "grad_norm": 4.194582462310791, + "learning_rate": 9.577197149643707e-06, + "loss": 0.3928, + "step": 5325 + }, + { + "epoch": 1.1393758016246258, + "grad_norm": 5.349386215209961, + "learning_rate": 9.565320665083136e-06, + "loss": 0.2804, + "step": 5330 + }, + { + "epoch": 1.1404446344591705, + "grad_norm": 4.539825916290283, + "learning_rate": 9.553444180522566e-06, + "loss": 0.2605, + "step": 5335 + }, + { + "epoch": 1.1415134672937153, + "grad_norm": 4.817893028259277, + "learning_rate": 9.541567695961997e-06, + "loss": 0.2871, + "step": 5340 + }, + { + "epoch": 1.14258230012826, + "grad_norm": 3.5281782150268555, + "learning_rate": 9.529691211401426e-06, + "loss": 0.2106, + "step": 5345 + }, + { + "epoch": 1.1436511329628045, + "grad_norm": 5.409219264984131, + "learning_rate": 9.517814726840856e-06, + "loss": 0.3053, + "step": 5350 + }, + { + "epoch": 1.1447199657973492, + "grad_norm": 4.795240879058838, + "learning_rate": 9.505938242280285e-06, + "loss": 0.3719, + "step": 5355 + }, + { + "epoch": 1.145788798631894, + "grad_norm": 6.551200866699219, + "learning_rate": 9.494061757719716e-06, + "loss": 0.396, + "step": 5360 + }, + { + "epoch": 1.1468576314664387, + "grad_norm": 4.7790703773498535, + "learning_rate": 9.482185273159146e-06, + "loss": 0.4105, + "step": 5365 + }, + { + "epoch": 1.1479264643009834, + "grad_norm": 5.062493801116943, + "learning_rate": 9.470308788598575e-06, + "loss": 0.3882, + "step": 5370 + }, + { + "epoch": 1.148995297135528, + "grad_norm": 4.342947006225586, + "learning_rate": 9.458432304038006e-06, + "loss": 0.2815, + "step": 5375 + }, + { + "epoch": 1.1500641299700727, + "grad_norm": 4.391014099121094, + "learning_rate": 9.446555819477435e-06, + "loss": 0.2477, + "step": 5380 + }, + { + "epoch": 1.1511329628046174, + "grad_norm": 3.2322447299957275, + "learning_rate": 9.434679334916865e-06, + "loss": 0.2798, + "step": 5385 + }, + { + "epoch": 1.152201795639162, + "grad_norm": 3.8520939350128174, + "learning_rate": 9.422802850356296e-06, + "loss": 0.2758, + "step": 5390 + }, + { + "epoch": 1.1532706284737066, + "grad_norm": 3.970700740814209, + "learning_rate": 9.410926365795725e-06, + "loss": 0.2938, + "step": 5395 + }, + { + "epoch": 1.1543394613082514, + "grad_norm": 4.378193378448486, + "learning_rate": 9.399049881235155e-06, + "loss": 0.3946, + "step": 5400 + }, + { + "epoch": 1.155408294142796, + "grad_norm": 3.779149293899536, + "learning_rate": 9.387173396674586e-06, + "loss": 0.2348, + "step": 5405 + }, + { + "epoch": 1.1564771269773408, + "grad_norm": 3.6311495304107666, + "learning_rate": 9.375296912114015e-06, + "loss": 0.2701, + "step": 5410 + }, + { + "epoch": 1.1575459598118853, + "grad_norm": 4.026009559631348, + "learning_rate": 9.363420427553445e-06, + "loss": 0.3154, + "step": 5415 + }, + { + "epoch": 1.15861479264643, + "grad_norm": 3.796111583709717, + "learning_rate": 9.351543942992874e-06, + "loss": 0.2617, + "step": 5420 + }, + { + "epoch": 1.1596836254809748, + "grad_norm": 4.301056861877441, + "learning_rate": 9.339667458432304e-06, + "loss": 0.3529, + "step": 5425 + }, + { + "epoch": 1.1607524583155195, + "grad_norm": 4.392220973968506, + "learning_rate": 9.327790973871735e-06, + "loss": 0.2858, + "step": 5430 + }, + { + "epoch": 1.161821291150064, + "grad_norm": 3.698474168777466, + "learning_rate": 9.315914489311164e-06, + "loss": 0.42, + "step": 5435 + }, + { + "epoch": 1.1628901239846088, + "grad_norm": 4.409991264343262, + "learning_rate": 9.304038004750594e-06, + "loss": 0.3894, + "step": 5440 + }, + { + "epoch": 1.1639589568191535, + "grad_norm": 2.799488067626953, + "learning_rate": 9.292161520190025e-06, + "loss": 0.3073, + "step": 5445 + }, + { + "epoch": 1.1650277896536982, + "grad_norm": 3.6285009384155273, + "learning_rate": 9.280285035629456e-06, + "loss": 0.2824, + "step": 5450 + }, + { + "epoch": 1.1660966224882428, + "grad_norm": 4.096553802490234, + "learning_rate": 9.268408551068884e-06, + "loss": 0.3139, + "step": 5455 + }, + { + "epoch": 1.1671654553227875, + "grad_norm": 6.436227798461914, + "learning_rate": 9.256532066508315e-06, + "loss": 0.4651, + "step": 5460 + }, + { + "epoch": 1.1682342881573322, + "grad_norm": 4.163245677947998, + "learning_rate": 9.244655581947744e-06, + "loss": 0.2611, + "step": 5465 + }, + { + "epoch": 1.169303120991877, + "grad_norm": 4.249100208282471, + "learning_rate": 9.232779097387174e-06, + "loss": 0.2663, + "step": 5470 + }, + { + "epoch": 1.1703719538264215, + "grad_norm": 4.877579212188721, + "learning_rate": 9.220902612826605e-06, + "loss": 0.252, + "step": 5475 + }, + { + "epoch": 1.1714407866609662, + "grad_norm": 4.387494087219238, + "learning_rate": 9.209026128266034e-06, + "loss": 0.4996, + "step": 5480 + }, + { + "epoch": 1.172509619495511, + "grad_norm": 3.6732518672943115, + "learning_rate": 9.197149643705464e-06, + "loss": 0.2786, + "step": 5485 + }, + { + "epoch": 1.1735784523300556, + "grad_norm": 4.684414386749268, + "learning_rate": 9.185273159144895e-06, + "loss": 0.2628, + "step": 5490 + }, + { + "epoch": 1.1746472851646002, + "grad_norm": 5.551144599914551, + "learning_rate": 9.173396674584324e-06, + "loss": 0.3326, + "step": 5495 + }, + { + "epoch": 1.1757161179991449, + "grad_norm": 3.942741632461548, + "learning_rate": 9.161520190023754e-06, + "loss": 0.3294, + "step": 5500 + }, + { + "epoch": 1.1767849508336896, + "grad_norm": 4.97520637512207, + "learning_rate": 9.149643705463183e-06, + "loss": 0.341, + "step": 5505 + }, + { + "epoch": 1.1778537836682343, + "grad_norm": 4.264441967010498, + "learning_rate": 9.137767220902614e-06, + "loss": 0.2878, + "step": 5510 + }, + { + "epoch": 1.178922616502779, + "grad_norm": 5.5287299156188965, + "learning_rate": 9.125890736342044e-06, + "loss": 0.354, + "step": 5515 + }, + { + "epoch": 1.1799914493373236, + "grad_norm": 2.997340679168701, + "learning_rate": 9.114014251781473e-06, + "loss": 0.2667, + "step": 5520 + }, + { + "epoch": 1.1810602821718683, + "grad_norm": 4.381051540374756, + "learning_rate": 9.102137767220904e-06, + "loss": 0.2932, + "step": 5525 + }, + { + "epoch": 1.182129115006413, + "grad_norm": 3.4648494720458984, + "learning_rate": 9.090261282660332e-06, + "loss": 0.2608, + "step": 5530 + }, + { + "epoch": 1.1831979478409576, + "grad_norm": 4.567250728607178, + "learning_rate": 9.078384798099763e-06, + "loss": 0.3078, + "step": 5535 + }, + { + "epoch": 1.1842667806755023, + "grad_norm": 4.373274326324463, + "learning_rate": 9.066508313539194e-06, + "loss": 0.4028, + "step": 5540 + }, + { + "epoch": 1.185335613510047, + "grad_norm": 4.338989734649658, + "learning_rate": 9.054631828978622e-06, + "loss": 0.3429, + "step": 5545 + }, + { + "epoch": 1.1864044463445917, + "grad_norm": 4.9778008460998535, + "learning_rate": 9.042755344418053e-06, + "loss": 0.3481, + "step": 5550 + }, + { + "epoch": 1.1874732791791365, + "grad_norm": 4.068686008453369, + "learning_rate": 9.030878859857482e-06, + "loss": 0.2931, + "step": 5555 + }, + { + "epoch": 1.188542112013681, + "grad_norm": 3.909130096435547, + "learning_rate": 9.019002375296912e-06, + "loss": 0.2719, + "step": 5560 + }, + { + "epoch": 1.1896109448482257, + "grad_norm": 4.785898208618164, + "learning_rate": 9.007125890736343e-06, + "loss": 0.3968, + "step": 5565 + }, + { + "epoch": 1.1906797776827704, + "grad_norm": 5.576188087463379, + "learning_rate": 8.995249406175772e-06, + "loss": 0.4653, + "step": 5570 + }, + { + "epoch": 1.1917486105173152, + "grad_norm": 3.010072946548462, + "learning_rate": 8.983372921615202e-06, + "loss": 0.2727, + "step": 5575 + }, + { + "epoch": 1.1928174433518597, + "grad_norm": 4.709297180175781, + "learning_rate": 8.971496437054633e-06, + "loss": 0.4521, + "step": 5580 + }, + { + "epoch": 1.1938862761864044, + "grad_norm": 5.573824405670166, + "learning_rate": 8.959619952494063e-06, + "loss": 0.3042, + "step": 5585 + }, + { + "epoch": 1.1949551090209491, + "grad_norm": 4.321738243103027, + "learning_rate": 8.947743467933492e-06, + "loss": 0.2687, + "step": 5590 + }, + { + "epoch": 1.1960239418554939, + "grad_norm": 5.602605819702148, + "learning_rate": 8.935866983372923e-06, + "loss": 0.2892, + "step": 5595 + }, + { + "epoch": 1.1970927746900384, + "grad_norm": 3.6464884281158447, + "learning_rate": 8.923990498812353e-06, + "loss": 0.2515, + "step": 5600 + }, + { + "epoch": 1.1981616075245831, + "grad_norm": 3.9809868335723877, + "learning_rate": 8.912114014251782e-06, + "loss": 0.3022, + "step": 5605 + }, + { + "epoch": 1.1992304403591278, + "grad_norm": 4.83494758605957, + "learning_rate": 8.900237529691213e-06, + "loss": 0.4018, + "step": 5610 + }, + { + "epoch": 1.2002992731936726, + "grad_norm": 3.961460590362549, + "learning_rate": 8.888361045130642e-06, + "loss": 0.2532, + "step": 5615 + }, + { + "epoch": 1.2013681060282173, + "grad_norm": 2.4498984813690186, + "learning_rate": 8.876484560570072e-06, + "loss": 0.2285, + "step": 5620 + }, + { + "epoch": 1.2024369388627618, + "grad_norm": 3.654311418533325, + "learning_rate": 8.864608076009503e-06, + "loss": 0.3307, + "step": 5625 + }, + { + "epoch": 1.2035057716973065, + "grad_norm": 4.238831996917725, + "learning_rate": 8.852731591448932e-06, + "loss": 0.2353, + "step": 5630 + }, + { + "epoch": 1.2045746045318513, + "grad_norm": 3.811962842941284, + "learning_rate": 8.840855106888362e-06, + "loss": 0.3137, + "step": 5635 + }, + { + "epoch": 1.2056434373663958, + "grad_norm": 3.8361501693725586, + "learning_rate": 8.828978622327791e-06, + "loss": 0.2549, + "step": 5640 + }, + { + "epoch": 1.2067122702009405, + "grad_norm": 4.136886119842529, + "learning_rate": 8.817102137767222e-06, + "loss": 0.2901, + "step": 5645 + }, + { + "epoch": 1.2077811030354852, + "grad_norm": 4.573363304138184, + "learning_rate": 8.805225653206652e-06, + "loss": 0.4423, + "step": 5650 + }, + { + "epoch": 1.20884993587003, + "grad_norm": 4.777524948120117, + "learning_rate": 8.793349168646081e-06, + "loss": 0.2959, + "step": 5655 + }, + { + "epoch": 1.2099187687045747, + "grad_norm": 4.250500679016113, + "learning_rate": 8.781472684085511e-06, + "loss": 0.2523, + "step": 5660 + }, + { + "epoch": 1.2109876015391192, + "grad_norm": 4.024094581604004, + "learning_rate": 8.769596199524942e-06, + "loss": 0.1746, + "step": 5665 + }, + { + "epoch": 1.212056434373664, + "grad_norm": 4.290604591369629, + "learning_rate": 8.757719714964371e-06, + "loss": 0.3541, + "step": 5670 + }, + { + "epoch": 1.2131252672082087, + "grad_norm": 3.597705125808716, + "learning_rate": 8.745843230403801e-06, + "loss": 0.2801, + "step": 5675 + }, + { + "epoch": 1.2141941000427534, + "grad_norm": 5.059614181518555, + "learning_rate": 8.73396674584323e-06, + "loss": 0.2846, + "step": 5680 + }, + { + "epoch": 1.215262932877298, + "grad_norm": 3.8920083045959473, + "learning_rate": 8.722090261282661e-06, + "loss": 0.3503, + "step": 5685 + }, + { + "epoch": 1.2163317657118426, + "grad_norm": 4.512190818786621, + "learning_rate": 8.710213776722091e-06, + "loss": 0.2831, + "step": 5690 + }, + { + "epoch": 1.2174005985463874, + "grad_norm": 4.729888916015625, + "learning_rate": 8.69833729216152e-06, + "loss": 0.2708, + "step": 5695 + }, + { + "epoch": 1.218469431380932, + "grad_norm": 4.533064365386963, + "learning_rate": 8.68646080760095e-06, + "loss": 0.2846, + "step": 5700 + }, + { + "epoch": 1.2195382642154766, + "grad_norm": 3.9406075477600098, + "learning_rate": 8.67458432304038e-06, + "loss": 0.3136, + "step": 5705 + }, + { + "epoch": 1.2206070970500214, + "grad_norm": 6.5291924476623535, + "learning_rate": 8.66270783847981e-06, + "loss": 0.3377, + "step": 5710 + }, + { + "epoch": 1.221675929884566, + "grad_norm": 4.291172981262207, + "learning_rate": 8.65083135391924e-06, + "loss": 0.4648, + "step": 5715 + }, + { + "epoch": 1.2227447627191108, + "grad_norm": 5.999503135681152, + "learning_rate": 8.63895486935867e-06, + "loss": 0.371, + "step": 5720 + }, + { + "epoch": 1.2238135955536553, + "grad_norm": 3.673821449279785, + "learning_rate": 8.6270783847981e-06, + "loss": 0.3419, + "step": 5725 + }, + { + "epoch": 1.2248824283882, + "grad_norm": 4.6455607414245605, + "learning_rate": 8.61520190023753e-06, + "loss": 0.3034, + "step": 5730 + }, + { + "epoch": 1.2259512612227448, + "grad_norm": 4.375533103942871, + "learning_rate": 8.603325415676961e-06, + "loss": 0.2912, + "step": 5735 + }, + { + "epoch": 1.2270200940572895, + "grad_norm": 4.1931376457214355, + "learning_rate": 8.59144893111639e-06, + "loss": 0.3251, + "step": 5740 + }, + { + "epoch": 1.228088926891834, + "grad_norm": 7.451878547668457, + "learning_rate": 8.57957244655582e-06, + "loss": 0.3989, + "step": 5745 + }, + { + "epoch": 1.2291577597263788, + "grad_norm": 5.163100242614746, + "learning_rate": 8.567695961995251e-06, + "loss": 0.3193, + "step": 5750 + }, + { + "epoch": 1.2302265925609235, + "grad_norm": 6.099165439605713, + "learning_rate": 8.55581947743468e-06, + "loss": 0.3586, + "step": 5755 + }, + { + "epoch": 1.2312954253954682, + "grad_norm": 3.8234498500823975, + "learning_rate": 8.54394299287411e-06, + "loss": 0.2832, + "step": 5760 + }, + { + "epoch": 1.232364258230013, + "grad_norm": 4.173794269561768, + "learning_rate": 8.53206650831354e-06, + "loss": 0.389, + "step": 5765 + }, + { + "epoch": 1.2334330910645575, + "grad_norm": 4.987196922302246, + "learning_rate": 8.52019002375297e-06, + "loss": 0.3889, + "step": 5770 + }, + { + "epoch": 1.2345019238991022, + "grad_norm": 3.354900360107422, + "learning_rate": 8.5083135391924e-06, + "loss": 0.2243, + "step": 5775 + }, + { + "epoch": 1.235570756733647, + "grad_norm": 4.882574558258057, + "learning_rate": 8.49643705463183e-06, + "loss": 0.2416, + "step": 5780 + }, + { + "epoch": 1.2366395895681914, + "grad_norm": 4.3282790184021, + "learning_rate": 8.48456057007126e-06, + "loss": 0.3066, + "step": 5785 + }, + { + "epoch": 1.2377084224027362, + "grad_norm": 5.309357166290283, + "learning_rate": 8.472684085510689e-06, + "loss": 0.3227, + "step": 5790 + }, + { + "epoch": 1.2387772552372809, + "grad_norm": 3.708139181137085, + "learning_rate": 8.46080760095012e-06, + "loss": 0.3194, + "step": 5795 + }, + { + "epoch": 1.2398460880718256, + "grad_norm": 5.823927879333496, + "learning_rate": 8.44893111638955e-06, + "loss": 0.3851, + "step": 5800 + }, + { + "epoch": 1.2409149209063703, + "grad_norm": 5.825521945953369, + "learning_rate": 8.437054631828979e-06, + "loss": 0.2793, + "step": 5805 + }, + { + "epoch": 1.2419837537409149, + "grad_norm": 4.350478172302246, + "learning_rate": 8.42517814726841e-06, + "loss": 0.2482, + "step": 5810 + }, + { + "epoch": 1.2430525865754596, + "grad_norm": 4.824470043182373, + "learning_rate": 8.413301662707838e-06, + "loss": 0.3311, + "step": 5815 + }, + { + "epoch": 1.2441214194100043, + "grad_norm": 4.695113182067871, + "learning_rate": 8.401425178147269e-06, + "loss": 0.2456, + "step": 5820 + }, + { + "epoch": 1.245190252244549, + "grad_norm": 5.539307594299316, + "learning_rate": 8.3895486935867e-06, + "loss": 0.3267, + "step": 5825 + }, + { + "epoch": 1.2462590850790936, + "grad_norm": 4.055349826812744, + "learning_rate": 8.377672209026128e-06, + "loss": 0.2496, + "step": 5830 + }, + { + "epoch": 1.2473279179136383, + "grad_norm": 4.012608051300049, + "learning_rate": 8.365795724465559e-06, + "loss": 0.2896, + "step": 5835 + }, + { + "epoch": 1.248396750748183, + "grad_norm": 4.369838714599609, + "learning_rate": 8.35391923990499e-06, + "loss": 0.2717, + "step": 5840 + }, + { + "epoch": 1.2494655835827277, + "grad_norm": 7.311318874359131, + "learning_rate": 8.342042755344418e-06, + "loss": 0.344, + "step": 5845 + }, + { + "epoch": 1.2505344164172723, + "grad_norm": 3.8691282272338867, + "learning_rate": 8.330166270783849e-06, + "loss": 0.1997, + "step": 5850 + }, + { + "epoch": 1.251603249251817, + "grad_norm": 4.140939712524414, + "learning_rate": 8.318289786223278e-06, + "loss": 0.2454, + "step": 5855 + }, + { + "epoch": 1.2526720820863617, + "grad_norm": 4.034096717834473, + "learning_rate": 8.306413301662708e-06, + "loss": 0.2923, + "step": 5860 + }, + { + "epoch": 1.2537409149209064, + "grad_norm": 4.175270080566406, + "learning_rate": 8.294536817102139e-06, + "loss": 0.268, + "step": 5865 + }, + { + "epoch": 1.2548097477554512, + "grad_norm": 5.182862758636475, + "learning_rate": 8.28266033254157e-06, + "loss": 0.2469, + "step": 5870 + }, + { + "epoch": 1.2558785805899957, + "grad_norm": 3.4455058574676514, + "learning_rate": 8.270783847980998e-06, + "loss": 0.2488, + "step": 5875 + }, + { + "epoch": 1.2569474134245404, + "grad_norm": 3.5229389667510986, + "learning_rate": 8.258907363420429e-06, + "loss": 0.2809, + "step": 5880 + }, + { + "epoch": 1.2580162462590851, + "grad_norm": 5.2068071365356445, + "learning_rate": 8.247030878859859e-06, + "loss": 0.2967, + "step": 5885 + }, + { + "epoch": 1.2590850790936297, + "grad_norm": 5.500560283660889, + "learning_rate": 8.235154394299288e-06, + "loss": 0.3366, + "step": 5890 + }, + { + "epoch": 1.2601539119281744, + "grad_norm": 3.9053938388824463, + "learning_rate": 8.223277909738719e-06, + "loss": 0.2968, + "step": 5895 + }, + { + "epoch": 1.2612227447627191, + "grad_norm": 3.7163820266723633, + "learning_rate": 8.211401425178147e-06, + "loss": 0.2, + "step": 5900 + }, + { + "epoch": 1.2622915775972638, + "grad_norm": 4.347673416137695, + "learning_rate": 8.199524940617578e-06, + "loss": 0.2761, + "step": 5905 + }, + { + "epoch": 1.2633604104318086, + "grad_norm": 3.297481060028076, + "learning_rate": 8.187648456057008e-06, + "loss": 0.2177, + "step": 5910 + }, + { + "epoch": 1.264429243266353, + "grad_norm": 5.587257385253906, + "learning_rate": 8.175771971496437e-06, + "loss": 0.2721, + "step": 5915 + }, + { + "epoch": 1.2654980761008978, + "grad_norm": 3.562802791595459, + "learning_rate": 8.163895486935868e-06, + "loss": 0.2592, + "step": 5920 + }, + { + "epoch": 1.2665669089354425, + "grad_norm": 5.265760898590088, + "learning_rate": 8.152019002375298e-06, + "loss": 0.453, + "step": 5925 + }, + { + "epoch": 1.267635741769987, + "grad_norm": 4.091883182525635, + "learning_rate": 8.140142517814727e-06, + "loss": 0.1952, + "step": 5930 + }, + { + "epoch": 1.2687045746045318, + "grad_norm": 4.552518844604492, + "learning_rate": 8.128266033254158e-06, + "loss": 0.3269, + "step": 5935 + }, + { + "epoch": 1.2697734074390765, + "grad_norm": 4.755618572235107, + "learning_rate": 8.116389548693587e-06, + "loss": 0.2776, + "step": 5940 + }, + { + "epoch": 1.2708422402736212, + "grad_norm": 4.392646312713623, + "learning_rate": 8.104513064133017e-06, + "loss": 0.2627, + "step": 5945 + }, + { + "epoch": 1.271911073108166, + "grad_norm": 2.6964704990386963, + "learning_rate": 8.092636579572448e-06, + "loss": 0.2524, + "step": 5950 + }, + { + "epoch": 1.2729799059427105, + "grad_norm": 3.914213180541992, + "learning_rate": 8.080760095011877e-06, + "loss": 0.2713, + "step": 5955 + }, + { + "epoch": 1.2740487387772552, + "grad_norm": 3.009427785873413, + "learning_rate": 8.068883610451307e-06, + "loss": 0.2618, + "step": 5960 + }, + { + "epoch": 1.2751175716118, + "grad_norm": 4.362711429595947, + "learning_rate": 8.057007125890736e-06, + "loss": 0.2735, + "step": 5965 + }, + { + "epoch": 1.2761864044463445, + "grad_norm": 5.038128852844238, + "learning_rate": 8.045130641330167e-06, + "loss": 0.327, + "step": 5970 + }, + { + "epoch": 1.2772552372808892, + "grad_norm": 5.867886543273926, + "learning_rate": 8.033254156769597e-06, + "loss": 0.3294, + "step": 5975 + }, + { + "epoch": 1.278324070115434, + "grad_norm": 3.8101038932800293, + "learning_rate": 8.021377672209026e-06, + "loss": 0.3299, + "step": 5980 + }, + { + "epoch": 1.2793929029499786, + "grad_norm": 4.082939624786377, + "learning_rate": 8.009501187648457e-06, + "loss": 0.2328, + "step": 5985 + }, + { + "epoch": 1.2804617357845234, + "grad_norm": 5.352798938751221, + "learning_rate": 7.997624703087885e-06, + "loss": 0.2554, + "step": 5990 + }, + { + "epoch": 1.281530568619068, + "grad_norm": 2.7532148361206055, + "learning_rate": 7.985748218527316e-06, + "loss": 0.3285, + "step": 5995 + }, + { + "epoch": 1.2825994014536126, + "grad_norm": 4.2501349449157715, + "learning_rate": 7.973871733966747e-06, + "loss": 0.2884, + "step": 6000 + }, + { + "epoch": 1.2836682342881574, + "grad_norm": 3.0817322731018066, + "learning_rate": 7.961995249406177e-06, + "loss": 0.3201, + "step": 6005 + }, + { + "epoch": 1.284737067122702, + "grad_norm": 4.214169502258301, + "learning_rate": 7.950118764845608e-06, + "loss": 0.3529, + "step": 6010 + }, + { + "epoch": 1.2858058999572468, + "grad_norm": 4.896885871887207, + "learning_rate": 7.938242280285036e-06, + "loss": 0.3113, + "step": 6015 + }, + { + "epoch": 1.2868747327917913, + "grad_norm": 4.869765758514404, + "learning_rate": 7.926365795724467e-06, + "loss": 0.386, + "step": 6020 + }, + { + "epoch": 1.287943565626336, + "grad_norm": 4.720851421356201, + "learning_rate": 7.914489311163896e-06, + "loss": 0.2453, + "step": 6025 + }, + { + "epoch": 1.2890123984608808, + "grad_norm": 4.764908790588379, + "learning_rate": 7.902612826603326e-06, + "loss": 0.3845, + "step": 6030 + }, + { + "epoch": 1.2900812312954253, + "grad_norm": 4.5335845947265625, + "learning_rate": 7.890736342042757e-06, + "loss": 0.3311, + "step": 6035 + }, + { + "epoch": 1.29115006412997, + "grad_norm": 5.650118350982666, + "learning_rate": 7.878859857482186e-06, + "loss": 0.3556, + "step": 6040 + }, + { + "epoch": 1.2922188969645148, + "grad_norm": 4.7145209312438965, + "learning_rate": 7.866983372921616e-06, + "loss": 0.2491, + "step": 6045 + }, + { + "epoch": 1.2932877297990595, + "grad_norm": 5.045220851898193, + "learning_rate": 7.855106888361045e-06, + "loss": 0.3478, + "step": 6050 + }, + { + "epoch": 1.2943565626336042, + "grad_norm": 3.746929407119751, + "learning_rate": 7.843230403800476e-06, + "loss": 0.175, + "step": 6055 + }, + { + "epoch": 1.2954253954681487, + "grad_norm": 3.4932451248168945, + "learning_rate": 7.831353919239906e-06, + "loss": 0.2475, + "step": 6060 + }, + { + "epoch": 1.2964942283026935, + "grad_norm": 4.507287502288818, + "learning_rate": 7.819477434679335e-06, + "loss": 0.2793, + "step": 6065 + }, + { + "epoch": 1.2975630611372382, + "grad_norm": 3.872846841812134, + "learning_rate": 7.807600950118766e-06, + "loss": 0.3745, + "step": 6070 + }, + { + "epoch": 1.2986318939717827, + "grad_norm": 3.80639910697937, + "learning_rate": 7.795724465558195e-06, + "loss": 0.2619, + "step": 6075 + }, + { + "epoch": 1.2997007268063274, + "grad_norm": 4.278339862823486, + "learning_rate": 7.783847980997625e-06, + "loss": 0.2882, + "step": 6080 + }, + { + "epoch": 1.3007695596408722, + "grad_norm": 3.2503674030303955, + "learning_rate": 7.771971496437056e-06, + "loss": 0.2651, + "step": 6085 + }, + { + "epoch": 1.3018383924754169, + "grad_norm": 3.709991216659546, + "learning_rate": 7.760095011876485e-06, + "loss": 0.3257, + "step": 6090 + }, + { + "epoch": 1.3029072253099616, + "grad_norm": 4.797738075256348, + "learning_rate": 7.748218527315915e-06, + "loss": 0.2626, + "step": 6095 + }, + { + "epoch": 1.3039760581445061, + "grad_norm": 3.289095163345337, + "learning_rate": 7.736342042755346e-06, + "loss": 0.3391, + "step": 6100 + }, + { + "epoch": 1.3050448909790509, + "grad_norm": 5.237732410430908, + "learning_rate": 7.724465558194774e-06, + "loss": 0.2868, + "step": 6105 + }, + { + "epoch": 1.3061137238135956, + "grad_norm": 3.3352086544036865, + "learning_rate": 7.712589073634205e-06, + "loss": 0.2358, + "step": 6110 + }, + { + "epoch": 1.30718255664814, + "grad_norm": 4.8291168212890625, + "learning_rate": 7.700712589073634e-06, + "loss": 0.3505, + "step": 6115 + }, + { + "epoch": 1.308251389482685, + "grad_norm": 6.421624183654785, + "learning_rate": 7.688836104513064e-06, + "loss": 0.3643, + "step": 6120 + }, + { + "epoch": 1.3093202223172296, + "grad_norm": 2.7074790000915527, + "learning_rate": 7.676959619952495e-06, + "loss": 0.2729, + "step": 6125 + }, + { + "epoch": 1.3103890551517743, + "grad_norm": 4.26420783996582, + "learning_rate": 7.665083135391924e-06, + "loss": 0.2363, + "step": 6130 + }, + { + "epoch": 1.311457887986319, + "grad_norm": 6.1749773025512695, + "learning_rate": 7.653206650831354e-06, + "loss": 0.3173, + "step": 6135 + }, + { + "epoch": 1.3125267208208635, + "grad_norm": 3.3525917530059814, + "learning_rate": 7.641330166270783e-06, + "loss": 0.2797, + "step": 6140 + }, + { + "epoch": 1.3135955536554083, + "grad_norm": 3.1302783489227295, + "learning_rate": 7.629453681710216e-06, + "loss": 0.3054, + "step": 6145 + }, + { + "epoch": 1.314664386489953, + "grad_norm": 3.0552220344543457, + "learning_rate": 7.617577197149645e-06, + "loss": 0.2596, + "step": 6150 + }, + { + "epoch": 1.3157332193244977, + "grad_norm": 5.424324035644531, + "learning_rate": 7.605700712589075e-06, + "loss": 0.3871, + "step": 6155 + }, + { + "epoch": 1.3168020521590424, + "grad_norm": 4.735466480255127, + "learning_rate": 7.593824228028505e-06, + "loss": 0.2798, + "step": 6160 + }, + { + "epoch": 1.317870884993587, + "grad_norm": 5.158178806304932, + "learning_rate": 7.581947743467934e-06, + "loss": 0.2532, + "step": 6165 + }, + { + "epoch": 1.3189397178281317, + "grad_norm": 5.720581531524658, + "learning_rate": 7.570071258907364e-06, + "loss": 0.2649, + "step": 6170 + }, + { + "epoch": 1.3200085506626764, + "grad_norm": 4.740435600280762, + "learning_rate": 7.5581947743467946e-06, + "loss": 0.3457, + "step": 6175 + }, + { + "epoch": 1.321077383497221, + "grad_norm": 4.528372287750244, + "learning_rate": 7.546318289786224e-06, + "loss": 0.4062, + "step": 6180 + }, + { + "epoch": 1.3221462163317657, + "grad_norm": 5.7430243492126465, + "learning_rate": 7.534441805225654e-06, + "loss": 0.4318, + "step": 6185 + }, + { + "epoch": 1.3232150491663104, + "grad_norm": 3.7349984645843506, + "learning_rate": 7.522565320665084e-06, + "loss": 0.2305, + "step": 6190 + }, + { + "epoch": 1.3242838820008551, + "grad_norm": 3.384366273880005, + "learning_rate": 7.510688836104514e-06, + "loss": 0.218, + "step": 6195 + }, + { + "epoch": 1.3253527148353998, + "grad_norm": 4.311688423156738, + "learning_rate": 7.498812351543944e-06, + "loss": 0.2738, + "step": 6200 + }, + { + "epoch": 1.3264215476699444, + "grad_norm": 3.9737985134124756, + "learning_rate": 7.486935866983374e-06, + "loss": 0.3043, + "step": 6205 + }, + { + "epoch": 1.327490380504489, + "grad_norm": 3.2927355766296387, + "learning_rate": 7.475059382422803e-06, + "loss": 0.2055, + "step": 6210 + }, + { + "epoch": 1.3285592133390338, + "grad_norm": 4.364592552185059, + "learning_rate": 7.463182897862233e-06, + "loss": 0.2528, + "step": 6215 + }, + { + "epoch": 1.3296280461735783, + "grad_norm": 4.896527290344238, + "learning_rate": 7.451306413301664e-06, + "loss": 0.3514, + "step": 6220 + }, + { + "epoch": 1.330696879008123, + "grad_norm": 3.7543258666992188, + "learning_rate": 7.439429928741093e-06, + "loss": 0.3199, + "step": 6225 + }, + { + "epoch": 1.3317657118426678, + "grad_norm": 4.389688491821289, + "learning_rate": 7.427553444180523e-06, + "loss": 0.2453, + "step": 6230 + }, + { + "epoch": 1.3328345446772125, + "grad_norm": 5.297595500946045, + "learning_rate": 7.415676959619953e-06, + "loss": 0.3237, + "step": 6235 + }, + { + "epoch": 1.3339033775117572, + "grad_norm": 4.290585041046143, + "learning_rate": 7.403800475059383e-06, + "loss": 0.3409, + "step": 6240 + }, + { + "epoch": 1.3349722103463018, + "grad_norm": 3.8684494495391846, + "learning_rate": 7.391923990498813e-06, + "loss": 0.268, + "step": 6245 + }, + { + "epoch": 1.3360410431808465, + "grad_norm": 7.344365119934082, + "learning_rate": 7.380047505938243e-06, + "loss": 0.4072, + "step": 6250 + }, + { + "epoch": 1.3371098760153912, + "grad_norm": 4.403175354003906, + "learning_rate": 7.368171021377672e-06, + "loss": 0.3601, + "step": 6255 + }, + { + "epoch": 1.338178708849936, + "grad_norm": 4.6706414222717285, + "learning_rate": 7.356294536817102e-06, + "loss": 0.3997, + "step": 6260 + }, + { + "epoch": 1.3392475416844807, + "grad_norm": 3.4723129272460938, + "learning_rate": 7.344418052256533e-06, + "loss": 0.2062, + "step": 6265 + }, + { + "epoch": 1.3403163745190252, + "grad_norm": 3.8669190406799316, + "learning_rate": 7.332541567695962e-06, + "loss": 0.2703, + "step": 6270 + }, + { + "epoch": 1.34138520735357, + "grad_norm": 4.620151519775391, + "learning_rate": 7.320665083135392e-06, + "loss": 0.2498, + "step": 6275 + }, + { + "epoch": 1.3424540401881147, + "grad_norm": 4.765347480773926, + "learning_rate": 7.308788598574822e-06, + "loss": 0.3418, + "step": 6280 + }, + { + "epoch": 1.3435228730226592, + "grad_norm": 3.9806559085845947, + "learning_rate": 7.296912114014253e-06, + "loss": 0.2046, + "step": 6285 + }, + { + "epoch": 1.344591705857204, + "grad_norm": 6.489411354064941, + "learning_rate": 7.285035629453683e-06, + "loss": 0.3267, + "step": 6290 + }, + { + "epoch": 1.3456605386917486, + "grad_norm": 4.385682582855225, + "learning_rate": 7.2731591448931125e-06, + "loss": 0.2756, + "step": 6295 + }, + { + "epoch": 1.3467293715262934, + "grad_norm": 5.30741548538208, + "learning_rate": 7.261282660332542e-06, + "loss": 0.2808, + "step": 6300 + }, + { + "epoch": 1.347798204360838, + "grad_norm": 3.52230167388916, + "learning_rate": 7.249406175771973e-06, + "loss": 0.3037, + "step": 6305 + }, + { + "epoch": 1.3488670371953826, + "grad_norm": 3.3302509784698486, + "learning_rate": 7.2375296912114025e-06, + "loss": 0.3837, + "step": 6310 + }, + { + "epoch": 1.3499358700299273, + "grad_norm": 4.349034309387207, + "learning_rate": 7.225653206650832e-06, + "loss": 0.2496, + "step": 6315 + }, + { + "epoch": 1.351004702864472, + "grad_norm": 3.651261329650879, + "learning_rate": 7.213776722090262e-06, + "loss": 0.3131, + "step": 6320 + }, + { + "epoch": 1.3520735356990166, + "grad_norm": 4.3042144775390625, + "learning_rate": 7.201900237529692e-06, + "loss": 0.3038, + "step": 6325 + }, + { + "epoch": 1.3531423685335613, + "grad_norm": 4.746523380279541, + "learning_rate": 7.190023752969122e-06, + "loss": 0.2872, + "step": 6330 + }, + { + "epoch": 1.354211201368106, + "grad_norm": 3.058163642883301, + "learning_rate": 7.178147268408552e-06, + "loss": 0.3548, + "step": 6335 + }, + { + "epoch": 1.3552800342026508, + "grad_norm": 4.4561309814453125, + "learning_rate": 7.1662707838479815e-06, + "loss": 0.1994, + "step": 6340 + }, + { + "epoch": 1.3563488670371955, + "grad_norm": 3.580275535583496, + "learning_rate": 7.154394299287411e-06, + "loss": 0.2383, + "step": 6345 + }, + { + "epoch": 1.35741769987174, + "grad_norm": 4.0294671058654785, + "learning_rate": 7.142517814726842e-06, + "loss": 0.3412, + "step": 6350 + }, + { + "epoch": 1.3584865327062847, + "grad_norm": 4.032179355621338, + "learning_rate": 7.1306413301662715e-06, + "loss": 0.2392, + "step": 6355 + }, + { + "epoch": 1.3595553655408295, + "grad_norm": 3.6529910564422607, + "learning_rate": 7.118764845605701e-06, + "loss": 0.2946, + "step": 6360 + }, + { + "epoch": 1.360624198375374, + "grad_norm": 5.5632781982421875, + "learning_rate": 7.106888361045131e-06, + "loss": 0.3075, + "step": 6365 + }, + { + "epoch": 1.3616930312099187, + "grad_norm": 4.6378865242004395, + "learning_rate": 7.0950118764845614e-06, + "loss": 0.2695, + "step": 6370 + }, + { + "epoch": 1.3627618640444634, + "grad_norm": 4.01276969909668, + "learning_rate": 7.083135391923991e-06, + "loss": 0.2289, + "step": 6375 + }, + { + "epoch": 1.3638306968790082, + "grad_norm": 2.731029748916626, + "learning_rate": 7.071258907363421e-06, + "loss": 0.339, + "step": 6380 + }, + { + "epoch": 1.3648995297135529, + "grad_norm": 6.142641544342041, + "learning_rate": 7.0593824228028505e-06, + "loss": 0.3048, + "step": 6385 + }, + { + "epoch": 1.3659683625480974, + "grad_norm": 4.8854289054870605, + "learning_rate": 7.04750593824228e-06, + "loss": 0.3437, + "step": 6390 + }, + { + "epoch": 1.3670371953826421, + "grad_norm": 4.592909336090088, + "learning_rate": 7.035629453681711e-06, + "loss": 0.2587, + "step": 6395 + }, + { + "epoch": 1.3681060282171869, + "grad_norm": 4.572000026702881, + "learning_rate": 7.0237529691211405e-06, + "loss": 0.3156, + "step": 6400 + }, + { + "epoch": 1.3691748610517316, + "grad_norm": 6.196121692657471, + "learning_rate": 7.01187648456057e-06, + "loss": 0.2827, + "step": 6405 + }, + { + "epoch": 1.3702436938862763, + "grad_norm": 3.967109441757202, + "learning_rate": 7e-06, + "loss": 0.2337, + "step": 6410 + }, + { + "epoch": 1.3713125267208208, + "grad_norm": 3.1756539344787598, + "learning_rate": 6.98812351543943e-06, + "loss": 0.265, + "step": 6415 + }, + { + "epoch": 1.3723813595553656, + "grad_norm": 3.4292986392974854, + "learning_rate": 6.97624703087886e-06, + "loss": 0.2822, + "step": 6420 + }, + { + "epoch": 1.3734501923899103, + "grad_norm": 4.521055698394775, + "learning_rate": 6.964370546318291e-06, + "loss": 0.2697, + "step": 6425 + }, + { + "epoch": 1.3745190252244548, + "grad_norm": 3.9092273712158203, + "learning_rate": 6.95249406175772e-06, + "loss": 0.2136, + "step": 6430 + }, + { + "epoch": 1.3755878580589995, + "grad_norm": 3.5216240882873535, + "learning_rate": 6.940617577197151e-06, + "loss": 0.2549, + "step": 6435 + }, + { + "epoch": 1.3766566908935443, + "grad_norm": 5.987946510314941, + "learning_rate": 6.928741092636581e-06, + "loss": 0.2631, + "step": 6440 + }, + { + "epoch": 1.377725523728089, + "grad_norm": 5.098079681396484, + "learning_rate": 6.91686460807601e-06, + "loss": 0.324, + "step": 6445 + }, + { + "epoch": 1.3787943565626337, + "grad_norm": 4.314655303955078, + "learning_rate": 6.90498812351544e-06, + "loss": 0.3722, + "step": 6450 + }, + { + "epoch": 1.3798631893971782, + "grad_norm": 5.151162147521973, + "learning_rate": 6.893111638954871e-06, + "loss": 0.453, + "step": 6455 + }, + { + "epoch": 1.380932022231723, + "grad_norm": 4.187003135681152, + "learning_rate": 6.8812351543943e-06, + "loss": 0.2798, + "step": 6460 + }, + { + "epoch": 1.3820008550662677, + "grad_norm": 5.253510475158691, + "learning_rate": 6.86935866983373e-06, + "loss": 0.2605, + "step": 6465 + }, + { + "epoch": 1.3830696879008122, + "grad_norm": 2.9405324459075928, + "learning_rate": 6.85748218527316e-06, + "loss": 0.3834, + "step": 6470 + }, + { + "epoch": 1.384138520735357, + "grad_norm": 3.8434178829193115, + "learning_rate": 6.845605700712589e-06, + "loss": 0.2683, + "step": 6475 + }, + { + "epoch": 1.3852073535699017, + "grad_norm": 4.633339881896973, + "learning_rate": 6.83372921615202e-06, + "loss": 0.243, + "step": 6480 + }, + { + "epoch": 1.3862761864044464, + "grad_norm": 4.103108882904053, + "learning_rate": 6.82185273159145e-06, + "loss": 0.3287, + "step": 6485 + }, + { + "epoch": 1.3873450192389911, + "grad_norm": 4.187243938446045, + "learning_rate": 6.809976247030879e-06, + "loss": 0.2754, + "step": 6490 + }, + { + "epoch": 1.3884138520735356, + "grad_norm": 5.196486949920654, + "learning_rate": 6.798099762470309e-06, + "loss": 0.3701, + "step": 6495 + }, + { + "epoch": 1.3894826849080804, + "grad_norm": 4.622681140899658, + "learning_rate": 6.78622327790974e-06, + "loss": 0.2762, + "step": 6500 + }, + { + "epoch": 1.390551517742625, + "grad_norm": 2.859978675842285, + "learning_rate": 6.774346793349169e-06, + "loss": 0.266, + "step": 6505 + }, + { + "epoch": 1.3916203505771696, + "grad_norm": 5.8184332847595215, + "learning_rate": 6.762470308788599e-06, + "loss": 0.2958, + "step": 6510 + }, + { + "epoch": 1.3926891834117143, + "grad_norm": 3.787079334259033, + "learning_rate": 6.750593824228029e-06, + "loss": 0.2754, + "step": 6515 + }, + { + "epoch": 1.393758016246259, + "grad_norm": 4.132429599761963, + "learning_rate": 6.7387173396674584e-06, + "loss": 0.3634, + "step": 6520 + }, + { + "epoch": 1.3948268490808038, + "grad_norm": 5.011837005615234, + "learning_rate": 6.726840855106889e-06, + "loss": 0.2974, + "step": 6525 + }, + { + "epoch": 1.3958956819153485, + "grad_norm": 5.0287957191467285, + "learning_rate": 6.714964370546319e-06, + "loss": 0.3158, + "step": 6530 + }, + { + "epoch": 1.396964514749893, + "grad_norm": 3.846284866333008, + "learning_rate": 6.703087885985748e-06, + "loss": 0.2718, + "step": 6535 + }, + { + "epoch": 1.3980333475844378, + "grad_norm": 5.715949058532715, + "learning_rate": 6.691211401425178e-06, + "loss": 0.2586, + "step": 6540 + }, + { + "epoch": 1.3991021804189825, + "grad_norm": 2.0641372203826904, + "learning_rate": 6.679334916864609e-06, + "loss": 0.2071, + "step": 6545 + }, + { + "epoch": 1.4001710132535272, + "grad_norm": 3.989108085632324, + "learning_rate": 6.667458432304038e-06, + "loss": 0.3308, + "step": 6550 + }, + { + "epoch": 1.401239846088072, + "grad_norm": 5.488873481750488, + "learning_rate": 6.655581947743468e-06, + "loss": 0.2517, + "step": 6555 + }, + { + "epoch": 1.4023086789226165, + "grad_norm": 4.91823673248291, + "learning_rate": 6.643705463182898e-06, + "loss": 0.234, + "step": 6560 + }, + { + "epoch": 1.4033775117571612, + "grad_norm": 5.4402289390563965, + "learning_rate": 6.631828978622329e-06, + "loss": 0.3025, + "step": 6565 + }, + { + "epoch": 1.404446344591706, + "grad_norm": 5.417737007141113, + "learning_rate": 6.619952494061759e-06, + "loss": 0.2619, + "step": 6570 + }, + { + "epoch": 1.4055151774262504, + "grad_norm": 3.603675127029419, + "learning_rate": 6.6080760095011885e-06, + "loss": 0.2521, + "step": 6575 + }, + { + "epoch": 1.4065840102607952, + "grad_norm": 4.426266670227051, + "learning_rate": 6.596199524940618e-06, + "loss": 0.3113, + "step": 6580 + }, + { + "epoch": 1.40765284309534, + "grad_norm": 4.535027980804443, + "learning_rate": 6.584323040380049e-06, + "loss": 0.3537, + "step": 6585 + }, + { + "epoch": 1.4087216759298846, + "grad_norm": 3.585488796234131, + "learning_rate": 6.5724465558194785e-06, + "loss": 0.2386, + "step": 6590 + }, + { + "epoch": 1.4097905087644294, + "grad_norm": 5.358974456787109, + "learning_rate": 6.560570071258908e-06, + "loss": 0.3081, + "step": 6595 + }, + { + "epoch": 1.4108593415989739, + "grad_norm": 3.859417200088501, + "learning_rate": 6.548693586698338e-06, + "loss": 0.189, + "step": 6600 + }, + { + "epoch": 1.4119281744335186, + "grad_norm": 3.350184679031372, + "learning_rate": 6.536817102137768e-06, + "loss": 0.2643, + "step": 6605 + }, + { + "epoch": 1.4129970072680633, + "grad_norm": 3.4859519004821777, + "learning_rate": 6.524940617577198e-06, + "loss": 0.2206, + "step": 6610 + }, + { + "epoch": 1.4140658401026078, + "grad_norm": 6.238532543182373, + "learning_rate": 6.513064133016628e-06, + "loss": 0.3687, + "step": 6615 + }, + { + "epoch": 1.4151346729371526, + "grad_norm": 6.955577850341797, + "learning_rate": 6.5011876484560576e-06, + "loss": 0.3626, + "step": 6620 + }, + { + "epoch": 1.4162035057716973, + "grad_norm": 4.4574995040893555, + "learning_rate": 6.489311163895487e-06, + "loss": 0.2711, + "step": 6625 + }, + { + "epoch": 1.417272338606242, + "grad_norm": 4.533407211303711, + "learning_rate": 6.477434679334918e-06, + "loss": 0.3304, + "step": 6630 + }, + { + "epoch": 1.4183411714407868, + "grad_norm": 2.947624921798706, + "learning_rate": 6.4655581947743475e-06, + "loss": 0.321, + "step": 6635 + }, + { + "epoch": 1.4194100042753313, + "grad_norm": 4.557621955871582, + "learning_rate": 6.453681710213777e-06, + "loss": 0.319, + "step": 6640 + }, + { + "epoch": 1.420478837109876, + "grad_norm": 4.511264324188232, + "learning_rate": 6.441805225653207e-06, + "loss": 0.2363, + "step": 6645 + }, + { + "epoch": 1.4215476699444207, + "grad_norm": 4.200313568115234, + "learning_rate": 6.429928741092637e-06, + "loss": 0.2319, + "step": 6650 + }, + { + "epoch": 1.4226165027789655, + "grad_norm": 7.376286506652832, + "learning_rate": 6.418052256532067e-06, + "loss": 0.3526, + "step": 6655 + }, + { + "epoch": 1.4236853356135102, + "grad_norm": 4.415379047393799, + "learning_rate": 6.406175771971497e-06, + "loss": 0.4186, + "step": 6660 + }, + { + "epoch": 1.4247541684480547, + "grad_norm": 4.578277587890625, + "learning_rate": 6.394299287410927e-06, + "loss": 0.2846, + "step": 6665 + }, + { + "epoch": 1.4258230012825994, + "grad_norm": 4.811502456665039, + "learning_rate": 6.382422802850356e-06, + "loss": 0.3077, + "step": 6670 + }, + { + "epoch": 1.4268918341171442, + "grad_norm": 3.3036532402038574, + "learning_rate": 6.370546318289787e-06, + "loss": 0.3709, + "step": 6675 + }, + { + "epoch": 1.4279606669516887, + "grad_norm": 4.229010105133057, + "learning_rate": 6.3586698337292165e-06, + "loss": 0.3438, + "step": 6680 + }, + { + "epoch": 1.4290294997862334, + "grad_norm": 7.352675914764404, + "learning_rate": 6.346793349168646e-06, + "loss": 0.4159, + "step": 6685 + }, + { + "epoch": 1.4300983326207781, + "grad_norm": 3.935654878616333, + "learning_rate": 6.334916864608076e-06, + "loss": 0.3511, + "step": 6690 + }, + { + "epoch": 1.4311671654553229, + "grad_norm": 4.271127700805664, + "learning_rate": 6.323040380047506e-06, + "loss": 0.3061, + "step": 6695 + }, + { + "epoch": 1.4322359982898676, + "grad_norm": 4.57000207901001, + "learning_rate": 6.311163895486936e-06, + "loss": 0.2694, + "step": 6700 + }, + { + "epoch": 1.433304831124412, + "grad_norm": 4.243838787078857, + "learning_rate": 6.299287410926367e-06, + "loss": 0.3412, + "step": 6705 + }, + { + "epoch": 1.4343736639589568, + "grad_norm": 4.534287929534912, + "learning_rate": 6.2874109263657964e-06, + "loss": 0.2631, + "step": 6710 + }, + { + "epoch": 1.4354424967935016, + "grad_norm": 4.457566261291504, + "learning_rate": 6.275534441805227e-06, + "loss": 0.3879, + "step": 6715 + }, + { + "epoch": 1.436511329628046, + "grad_norm": 3.7211356163024902, + "learning_rate": 6.263657957244657e-06, + "loss": 0.2942, + "step": 6720 + }, + { + "epoch": 1.4375801624625908, + "grad_norm": 6.8076300621032715, + "learning_rate": 6.251781472684086e-06, + "loss": 0.3901, + "step": 6725 + }, + { + "epoch": 1.4386489952971355, + "grad_norm": 6.238668441772461, + "learning_rate": 6.239904988123516e-06, + "loss": 0.3192, + "step": 6730 + }, + { + "epoch": 1.4397178281316803, + "grad_norm": 4.374307155609131, + "learning_rate": 6.228028503562946e-06, + "loss": 0.2269, + "step": 6735 + }, + { + "epoch": 1.440786660966225, + "grad_norm": 5.202229976654053, + "learning_rate": 6.216152019002376e-06, + "loss": 0.372, + "step": 6740 + }, + { + "epoch": 1.4418554938007695, + "grad_norm": 4.483334064483643, + "learning_rate": 6.204275534441806e-06, + "loss": 0.2467, + "step": 6745 + }, + { + "epoch": 1.4429243266353142, + "grad_norm": 3.3366737365722656, + "learning_rate": 6.192399049881236e-06, + "loss": 0.2313, + "step": 6750 + }, + { + "epoch": 1.443993159469859, + "grad_norm": 6.443348407745361, + "learning_rate": 6.1805225653206655e-06, + "loss": 0.3538, + "step": 6755 + }, + { + "epoch": 1.4450619923044035, + "grad_norm": 3.4701974391937256, + "learning_rate": 6.168646080760096e-06, + "loss": 0.252, + "step": 6760 + }, + { + "epoch": 1.4461308251389482, + "grad_norm": 3.572749137878418, + "learning_rate": 6.156769596199526e-06, + "loss": 0.3049, + "step": 6765 + }, + { + "epoch": 1.447199657973493, + "grad_norm": 4.363938808441162, + "learning_rate": 6.144893111638955e-06, + "loss": 0.2796, + "step": 6770 + }, + { + "epoch": 1.4482684908080377, + "grad_norm": 3.493666172027588, + "learning_rate": 6.133016627078385e-06, + "loss": 0.2128, + "step": 6775 + }, + { + "epoch": 1.4493373236425824, + "grad_norm": 4.754271507263184, + "learning_rate": 6.121140142517815e-06, + "loss": 0.3407, + "step": 6780 + }, + { + "epoch": 1.450406156477127, + "grad_norm": 4.948278903961182, + "learning_rate": 6.109263657957245e-06, + "loss": 0.2196, + "step": 6785 + }, + { + "epoch": 1.4514749893116716, + "grad_norm": 4.344764709472656, + "learning_rate": 6.097387173396675e-06, + "loss": 0.2472, + "step": 6790 + }, + { + "epoch": 1.4525438221462164, + "grad_norm": 4.455203056335449, + "learning_rate": 6.085510688836105e-06, + "loss": 0.2788, + "step": 6795 + }, + { + "epoch": 1.453612654980761, + "grad_norm": 5.69878625869751, + "learning_rate": 6.0736342042755345e-06, + "loss": 0.305, + "step": 6800 + }, + { + "epoch": 1.4546814878153058, + "grad_norm": 4.746001243591309, + "learning_rate": 6.061757719714965e-06, + "loss": 0.3072, + "step": 6805 + }, + { + "epoch": 1.4557503206498503, + "grad_norm": 3.463618755340576, + "learning_rate": 6.049881235154395e-06, + "loss": 0.2879, + "step": 6810 + }, + { + "epoch": 1.456819153484395, + "grad_norm": 3.4969255924224854, + "learning_rate": 6.0380047505938244e-06, + "loss": 0.4406, + "step": 6815 + }, + { + "epoch": 1.4578879863189398, + "grad_norm": 3.6291632652282715, + "learning_rate": 6.026128266033254e-06, + "loss": 0.3371, + "step": 6820 + }, + { + "epoch": 1.4589568191534843, + "grad_norm": 4.0304765701293945, + "learning_rate": 6.014251781472684e-06, + "loss": 0.2775, + "step": 6825 + }, + { + "epoch": 1.460025651988029, + "grad_norm": 3.6861469745635986, + "learning_rate": 6.002375296912114e-06, + "loss": 0.2872, + "step": 6830 + }, + { + "epoch": 1.4610944848225738, + "grad_norm": 4.720432758331299, + "learning_rate": 5.990498812351544e-06, + "loss": 0.3056, + "step": 6835 + }, + { + "epoch": 1.4621633176571185, + "grad_norm": 3.8419721126556396, + "learning_rate": 5.978622327790974e-06, + "loss": 0.3183, + "step": 6840 + }, + { + "epoch": 1.4632321504916632, + "grad_norm": 4.320315361022949, + "learning_rate": 5.9667458432304035e-06, + "loss": 0.2801, + "step": 6845 + }, + { + "epoch": 1.4643009833262077, + "grad_norm": 4.07327127456665, + "learning_rate": 5.954869358669835e-06, + "loss": 0.2641, + "step": 6850 + }, + { + "epoch": 1.4653698161607525, + "grad_norm": 5.109342098236084, + "learning_rate": 5.942992874109265e-06, + "loss": 0.2903, + "step": 6855 + }, + { + "epoch": 1.4664386489952972, + "grad_norm": 5.147985458374023, + "learning_rate": 5.931116389548694e-06, + "loss": 0.4097, + "step": 6860 + }, + { + "epoch": 1.4675074818298417, + "grad_norm": 5.812030792236328, + "learning_rate": 5.919239904988124e-06, + "loss": 0.2133, + "step": 6865 + }, + { + "epoch": 1.4685763146643864, + "grad_norm": 4.3751220703125, + "learning_rate": 5.9073634204275545e-06, + "loss": 0.3064, + "step": 6870 + }, + { + "epoch": 1.4696451474989312, + "grad_norm": 3.8219094276428223, + "learning_rate": 5.895486935866984e-06, + "loss": 0.2629, + "step": 6875 + }, + { + "epoch": 1.470713980333476, + "grad_norm": 3.550219774246216, + "learning_rate": 5.883610451306414e-06, + "loss": 0.1846, + "step": 6880 + }, + { + "epoch": 1.4717828131680206, + "grad_norm": 4.344959259033203, + "learning_rate": 5.871733966745844e-06, + "loss": 0.2552, + "step": 6885 + }, + { + "epoch": 1.4728516460025651, + "grad_norm": 3.7821099758148193, + "learning_rate": 5.859857482185274e-06, + "loss": 0.2812, + "step": 6890 + }, + { + "epoch": 1.4739204788371099, + "grad_norm": 5.074913501739502, + "learning_rate": 5.847980997624704e-06, + "loss": 0.2796, + "step": 6895 + }, + { + "epoch": 1.4749893116716546, + "grad_norm": 5.702268600463867, + "learning_rate": 5.836104513064134e-06, + "loss": 0.3157, + "step": 6900 + }, + { + "epoch": 1.476058144506199, + "grad_norm": 4.769154071807861, + "learning_rate": 5.824228028503563e-06, + "loss": 0.271, + "step": 6905 + }, + { + "epoch": 1.4771269773407438, + "grad_norm": 3.915893077850342, + "learning_rate": 5.812351543942993e-06, + "loss": 0.2352, + "step": 6910 + }, + { + "epoch": 1.4781958101752886, + "grad_norm": 5.49572229385376, + "learning_rate": 5.8004750593824236e-06, + "loss": 0.3752, + "step": 6915 + }, + { + "epoch": 1.4792646430098333, + "grad_norm": 5.197114944458008, + "learning_rate": 5.788598574821853e-06, + "loss": 0.2811, + "step": 6920 + }, + { + "epoch": 1.480333475844378, + "grad_norm": 4.672935485839844, + "learning_rate": 5.776722090261283e-06, + "loss": 0.3303, + "step": 6925 + }, + { + "epoch": 1.4814023086789225, + "grad_norm": 3.5662314891815186, + "learning_rate": 5.764845605700713e-06, + "loss": 0.3382, + "step": 6930 + }, + { + "epoch": 1.4824711415134673, + "grad_norm": 3.7478342056274414, + "learning_rate": 5.752969121140143e-06, + "loss": 0.2235, + "step": 6935 + }, + { + "epoch": 1.483539974348012, + "grad_norm": 5.836414813995361, + "learning_rate": 5.741092636579573e-06, + "loss": 0.2446, + "step": 6940 + }, + { + "epoch": 1.4846088071825567, + "grad_norm": 4.945041179656982, + "learning_rate": 5.729216152019003e-06, + "loss": 0.2745, + "step": 6945 + }, + { + "epoch": 1.4856776400171015, + "grad_norm": 4.556496620178223, + "learning_rate": 5.717339667458432e-06, + "loss": 0.3061, + "step": 6950 + }, + { + "epoch": 1.486746472851646, + "grad_norm": 5.837685585021973, + "learning_rate": 5.705463182897862e-06, + "loss": 0.3059, + "step": 6955 + }, + { + "epoch": 1.4878153056861907, + "grad_norm": 3.4342663288116455, + "learning_rate": 5.6935866983372926e-06, + "loss": 0.2692, + "step": 6960 + }, + { + "epoch": 1.4888841385207354, + "grad_norm": 4.30683708190918, + "learning_rate": 5.681710213776722e-06, + "loss": 0.2853, + "step": 6965 + }, + { + "epoch": 1.48995297135528, + "grad_norm": 3.7401227951049805, + "learning_rate": 5.669833729216152e-06, + "loss": 0.238, + "step": 6970 + }, + { + "epoch": 1.4910218041898247, + "grad_norm": 3.991908311843872, + "learning_rate": 5.657957244655582e-06, + "loss": 0.3086, + "step": 6975 + }, + { + "epoch": 1.4920906370243694, + "grad_norm": 5.546383857727051, + "learning_rate": 5.646080760095012e-06, + "loss": 0.3016, + "step": 6980 + }, + { + "epoch": 1.4931594698589141, + "grad_norm": 4.429809093475342, + "learning_rate": 5.634204275534442e-06, + "loss": 0.3272, + "step": 6985 + }, + { + "epoch": 1.4942283026934589, + "grad_norm": 4.91778564453125, + "learning_rate": 5.6223277909738725e-06, + "loss": 0.3125, + "step": 6990 + }, + { + "epoch": 1.4952971355280034, + "grad_norm": 5.806905269622803, + "learning_rate": 5.610451306413302e-06, + "loss": 0.2777, + "step": 6995 + }, + { + "epoch": 1.496365968362548, + "grad_norm": 5.0485711097717285, + "learning_rate": 5.598574821852733e-06, + "loss": 0.3026, + "step": 7000 + }, + { + "epoch": 1.4974348011970928, + "grad_norm": 4.642349720001221, + "learning_rate": 5.5866983372921624e-06, + "loss": 0.2522, + "step": 7005 + }, + { + "epoch": 1.4985036340316373, + "grad_norm": 3.192457437515259, + "learning_rate": 5.574821852731592e-06, + "loss": 0.2487, + "step": 7010 + }, + { + "epoch": 1.499572466866182, + "grad_norm": 4.002120494842529, + "learning_rate": 5.562945368171022e-06, + "loss": 0.2316, + "step": 7015 + }, + { + "epoch": 1.5006412997007268, + "grad_norm": 4.840696334838867, + "learning_rate": 5.551068883610452e-06, + "loss": 0.2484, + "step": 7020 + }, + { + "epoch": 1.5017101325352715, + "grad_norm": 4.7393927574157715, + "learning_rate": 5.539192399049882e-06, + "loss": 0.2852, + "step": 7025 + }, + { + "epoch": 1.5027789653698163, + "grad_norm": 4.964815616607666, + "learning_rate": 5.527315914489312e-06, + "loss": 0.2944, + "step": 7030 + }, + { + "epoch": 1.5038477982043608, + "grad_norm": 4.7306342124938965, + "learning_rate": 5.5154394299287415e-06, + "loss": 0.3133, + "step": 7035 + }, + { + "epoch": 1.5049166310389055, + "grad_norm": 5.262001991271973, + "learning_rate": 5.503562945368171e-06, + "loss": 0.2557, + "step": 7040 + }, + { + "epoch": 1.5059854638734502, + "grad_norm": 4.136565685272217, + "learning_rate": 5.491686460807602e-06, + "loss": 0.232, + "step": 7045 + }, + { + "epoch": 1.5070542967079947, + "grad_norm": 3.917520046234131, + "learning_rate": 5.4798099762470315e-06, + "loss": 0.2635, + "step": 7050 + }, + { + "epoch": 1.5081231295425397, + "grad_norm": 5.6809210777282715, + "learning_rate": 5.467933491686461e-06, + "loss": 0.3033, + "step": 7055 + }, + { + "epoch": 1.5091919623770842, + "grad_norm": 3.7200369834899902, + "learning_rate": 5.456057007125891e-06, + "loss": 0.2477, + "step": 7060 + }, + { + "epoch": 1.510260795211629, + "grad_norm": 4.6949543952941895, + "learning_rate": 5.444180522565321e-06, + "loss": 0.2443, + "step": 7065 + }, + { + "epoch": 1.5113296280461737, + "grad_norm": 4.025641918182373, + "learning_rate": 5.432304038004751e-06, + "loss": 0.4329, + "step": 7070 + }, + { + "epoch": 1.5123984608807182, + "grad_norm": 3.7725117206573486, + "learning_rate": 5.420427553444181e-06, + "loss": 0.2682, + "step": 7075 + }, + { + "epoch": 1.513467293715263, + "grad_norm": 4.11836051940918, + "learning_rate": 5.4085510688836105e-06, + "loss": 0.3149, + "step": 7080 + }, + { + "epoch": 1.5145361265498076, + "grad_norm": 4.033612251281738, + "learning_rate": 5.39667458432304e-06, + "loss": 0.353, + "step": 7085 + }, + { + "epoch": 1.5156049593843521, + "grad_norm": 5.4751482009887695, + "learning_rate": 5.384798099762471e-06, + "loss": 0.2247, + "step": 7090 + }, + { + "epoch": 1.516673792218897, + "grad_norm": 4.203334808349609, + "learning_rate": 5.3729216152019005e-06, + "loss": 0.2862, + "step": 7095 + }, + { + "epoch": 1.5177426250534416, + "grad_norm": 5.31473970413208, + "learning_rate": 5.36104513064133e-06, + "loss": 0.287, + "step": 7100 + }, + { + "epoch": 1.5188114578879863, + "grad_norm": 4.896878719329834, + "learning_rate": 5.34916864608076e-06, + "loss": 0.3141, + "step": 7105 + }, + { + "epoch": 1.519880290722531, + "grad_norm": 3.62528133392334, + "learning_rate": 5.33729216152019e-06, + "loss": 0.4446, + "step": 7110 + }, + { + "epoch": 1.5209491235570756, + "grad_norm": 5.231464385986328, + "learning_rate": 5.32541567695962e-06, + "loss": 0.2853, + "step": 7115 + }, + { + "epoch": 1.5220179563916203, + "grad_norm": 3.0587196350097656, + "learning_rate": 5.31353919239905e-06, + "loss": 0.2662, + "step": 7120 + }, + { + "epoch": 1.523086789226165, + "grad_norm": 5.080547332763672, + "learning_rate": 5.3016627078384795e-06, + "loss": 0.2729, + "step": 7125 + }, + { + "epoch": 1.5241556220607098, + "grad_norm": 3.547877073287964, + "learning_rate": 5.289786223277911e-06, + "loss": 0.2376, + "step": 7130 + }, + { + "epoch": 1.5252244548952545, + "grad_norm": 3.9913973808288574, + "learning_rate": 5.277909738717341e-06, + "loss": 0.2434, + "step": 7135 + }, + { + "epoch": 1.526293287729799, + "grad_norm": 3.9852547645568848, + "learning_rate": 5.26603325415677e-06, + "loss": 0.302, + "step": 7140 + }, + { + "epoch": 1.5273621205643437, + "grad_norm": 3.660104274749756, + "learning_rate": 5.2541567695962e-06, + "loss": 0.2346, + "step": 7145 + }, + { + "epoch": 1.5284309533988885, + "grad_norm": 4.887364387512207, + "learning_rate": 5.242280285035631e-06, + "loss": 0.4036, + "step": 7150 + }, + { + "epoch": 1.529499786233433, + "grad_norm": 5.766690254211426, + "learning_rate": 5.23040380047506e-06, + "loss": 0.2902, + "step": 7155 + }, + { + "epoch": 1.530568619067978, + "grad_norm": 5.018100738525391, + "learning_rate": 5.21852731591449e-06, + "loss": 0.4254, + "step": 7160 + }, + { + "epoch": 1.5316374519025224, + "grad_norm": 2.8769116401672363, + "learning_rate": 5.20665083135392e-06, + "loss": 0.2863, + "step": 7165 + }, + { + "epoch": 1.5327062847370672, + "grad_norm": 4.766345024108887, + "learning_rate": 5.194774346793349e-06, + "loss": 0.2618, + "step": 7170 + }, + { + "epoch": 1.533775117571612, + "grad_norm": 4.371603012084961, + "learning_rate": 5.18289786223278e-06, + "loss": 0.3614, + "step": 7175 + }, + { + "epoch": 1.5348439504061564, + "grad_norm": 3.7386531829833984, + "learning_rate": 5.17102137767221e-06, + "loss": 0.3084, + "step": 7180 + }, + { + "epoch": 1.5359127832407011, + "grad_norm": 3.2616264820098877, + "learning_rate": 5.159144893111639e-06, + "loss": 0.2799, + "step": 7185 + }, + { + "epoch": 1.5369816160752459, + "grad_norm": 4.840415000915527, + "learning_rate": 5.147268408551069e-06, + "loss": 0.2843, + "step": 7190 + }, + { + "epoch": 1.5380504489097904, + "grad_norm": 2.643326997756958, + "learning_rate": 5.1353919239905e-06, + "loss": 0.255, + "step": 7195 + }, + { + "epoch": 1.5391192817443353, + "grad_norm": 3.9539496898651123, + "learning_rate": 5.123515439429929e-06, + "loss": 0.2278, + "step": 7200 + }, + { + "epoch": 1.5401881145788798, + "grad_norm": 4.173327922821045, + "learning_rate": 5.111638954869359e-06, + "loss": 0.3137, + "step": 7205 + }, + { + "epoch": 1.5412569474134246, + "grad_norm": 4.327914237976074, + "learning_rate": 5.099762470308789e-06, + "loss": 0.3365, + "step": 7210 + }, + { + "epoch": 1.5423257802479693, + "grad_norm": 2.9048960208892822, + "learning_rate": 5.087885985748218e-06, + "loss": 0.1981, + "step": 7215 + }, + { + "epoch": 1.5433946130825138, + "grad_norm": 4.26038932800293, + "learning_rate": 5.076009501187649e-06, + "loss": 0.2338, + "step": 7220 + }, + { + "epoch": 1.5444634459170585, + "grad_norm": 5.362328052520752, + "learning_rate": 5.064133016627079e-06, + "loss": 0.2692, + "step": 7225 + }, + { + "epoch": 1.5455322787516033, + "grad_norm": 4.408464431762695, + "learning_rate": 5.052256532066508e-06, + "loss": 0.2129, + "step": 7230 + }, + { + "epoch": 1.5466011115861478, + "grad_norm": 5.237843990325928, + "learning_rate": 5.040380047505938e-06, + "loss": 0.2395, + "step": 7235 + }, + { + "epoch": 1.5476699444206927, + "grad_norm": 6.1017045974731445, + "learning_rate": 5.028503562945369e-06, + "loss": 0.485, + "step": 7240 + }, + { + "epoch": 1.5487387772552372, + "grad_norm": 5.8066582679748535, + "learning_rate": 5.016627078384798e-06, + "loss": 0.2166, + "step": 7245 + }, + { + "epoch": 1.549807610089782, + "grad_norm": 6.7323899269104, + "learning_rate": 5.004750593824228e-06, + "loss": 0.2799, + "step": 7250 + }, + { + "epoch": 1.5508764429243267, + "grad_norm": 4.477848052978516, + "learning_rate": 4.9928741092636586e-06, + "loss": 0.2856, + "step": 7255 + }, + { + "epoch": 1.5519452757588712, + "grad_norm": 3.282881498336792, + "learning_rate": 4.980997624703088e-06, + "loss": 0.272, + "step": 7260 + }, + { + "epoch": 1.5530141085934162, + "grad_norm": 4.757537364959717, + "learning_rate": 4.969121140142518e-06, + "loss": 0.299, + "step": 7265 + }, + { + "epoch": 1.5540829414279607, + "grad_norm": 6.090857028961182, + "learning_rate": 4.9572446555819485e-06, + "loss": 0.3309, + "step": 7270 + }, + { + "epoch": 1.5551517742625054, + "grad_norm": 3.326892137527466, + "learning_rate": 4.945368171021378e-06, + "loss": 0.223, + "step": 7275 + }, + { + "epoch": 1.5562206070970501, + "grad_norm": 3.5346665382385254, + "learning_rate": 4.933491686460808e-06, + "loss": 0.2351, + "step": 7280 + }, + { + "epoch": 1.5572894399315946, + "grad_norm": 3.1125802993774414, + "learning_rate": 4.921615201900238e-06, + "loss": 0.2177, + "step": 7285 + }, + { + "epoch": 1.5583582727661394, + "grad_norm": 3.7614200115203857, + "learning_rate": 4.909738717339667e-06, + "loss": 0.2606, + "step": 7290 + }, + { + "epoch": 1.559427105600684, + "grad_norm": 3.761014223098755, + "learning_rate": 4.897862232779098e-06, + "loss": 0.3972, + "step": 7295 + }, + { + "epoch": 1.5604959384352286, + "grad_norm": 3.6661438941955566, + "learning_rate": 4.885985748218528e-06, + "loss": 0.2594, + "step": 7300 + }, + { + "epoch": 1.5615647712697736, + "grad_norm": 4.455360412597656, + "learning_rate": 4.874109263657958e-06, + "loss": 0.2934, + "step": 7305 + }, + { + "epoch": 1.562633604104318, + "grad_norm": 4.19691801071167, + "learning_rate": 4.862232779097388e-06, + "loss": 0.4105, + "step": 7310 + }, + { + "epoch": 1.5637024369388628, + "grad_norm": 4.041048049926758, + "learning_rate": 4.8503562945368175e-06, + "loss": 0.1971, + "step": 7315 + }, + { + "epoch": 1.5647712697734075, + "grad_norm": 3.2611756324768066, + "learning_rate": 4.838479809976247e-06, + "loss": 0.2107, + "step": 7320 + }, + { + "epoch": 1.565840102607952, + "grad_norm": 3.419591188430786, + "learning_rate": 4.826603325415678e-06, + "loss": 0.2441, + "step": 7325 + }, + { + "epoch": 1.5669089354424968, + "grad_norm": 4.567037105560303, + "learning_rate": 4.8147268408551075e-06, + "loss": 0.2413, + "step": 7330 + }, + { + "epoch": 1.5679777682770415, + "grad_norm": 3.887484550476074, + "learning_rate": 4.802850356294537e-06, + "loss": 0.2619, + "step": 7335 + }, + { + "epoch": 1.569046601111586, + "grad_norm": 4.95120906829834, + "learning_rate": 4.790973871733967e-06, + "loss": 0.3098, + "step": 7340 + }, + { + "epoch": 1.570115433946131, + "grad_norm": 4.205053806304932, + "learning_rate": 4.779097387173397e-06, + "loss": 0.3024, + "step": 7345 + }, + { + "epoch": 1.5711842667806755, + "grad_norm": 6.198763847351074, + "learning_rate": 4.767220902612827e-06, + "loss": 0.2548, + "step": 7350 + }, + { + "epoch": 1.5722530996152202, + "grad_norm": 4.158599853515625, + "learning_rate": 4.755344418052257e-06, + "loss": 0.2925, + "step": 7355 + }, + { + "epoch": 1.573321932449765, + "grad_norm": 3.3105695247650146, + "learning_rate": 4.7434679334916866e-06, + "loss": 0.2245, + "step": 7360 + }, + { + "epoch": 1.5743907652843094, + "grad_norm": 2.852360963821411, + "learning_rate": 4.731591448931116e-06, + "loss": 0.2612, + "step": 7365 + }, + { + "epoch": 1.5754595981188542, + "grad_norm": 5.082930564880371, + "learning_rate": 4.719714964370547e-06, + "loss": 0.4075, + "step": 7370 + }, + { + "epoch": 1.576528430953399, + "grad_norm": 3.626047372817993, + "learning_rate": 4.7078384798099765e-06, + "loss": 0.2457, + "step": 7375 + }, + { + "epoch": 1.5775972637879434, + "grad_norm": 3.2513113021850586, + "learning_rate": 4.695961995249407e-06, + "loss": 0.214, + "step": 7380 + }, + { + "epoch": 1.5786660966224884, + "grad_norm": 4.396987438201904, + "learning_rate": 4.684085510688837e-06, + "loss": 0.2761, + "step": 7385 + }, + { + "epoch": 1.5797349294570329, + "grad_norm": 4.177000045776367, + "learning_rate": 4.6722090261282665e-06, + "loss": 0.278, + "step": 7390 + }, + { + "epoch": 1.5808037622915776, + "grad_norm": 6.472886562347412, + "learning_rate": 4.660332541567696e-06, + "loss": 0.4008, + "step": 7395 + }, + { + "epoch": 1.5818725951261223, + "grad_norm": 5.244050979614258, + "learning_rate": 4.648456057007127e-06, + "loss": 0.3222, + "step": 7400 + }, + { + "epoch": 1.5829414279606668, + "grad_norm": 3.3180673122406006, + "learning_rate": 4.636579572446556e-06, + "loss": 0.2645, + "step": 7405 + }, + { + "epoch": 1.5840102607952118, + "grad_norm": 4.317756652832031, + "learning_rate": 4.624703087885986e-06, + "loss": 0.2121, + "step": 7410 + }, + { + "epoch": 1.5850790936297563, + "grad_norm": 5.13472843170166, + "learning_rate": 4.612826603325416e-06, + "loss": 0.2679, + "step": 7415 + }, + { + "epoch": 1.586147926464301, + "grad_norm": 4.850220680236816, + "learning_rate": 4.6009501187648455e-06, + "loss": 0.342, + "step": 7420 + }, + { + "epoch": 1.5872167592988458, + "grad_norm": 3.7907469272613525, + "learning_rate": 4.589073634204276e-06, + "loss": 0.2312, + "step": 7425 + }, + { + "epoch": 1.5882855921333903, + "grad_norm": 5.306363582611084, + "learning_rate": 4.577197149643706e-06, + "loss": 0.3332, + "step": 7430 + }, + { + "epoch": 1.589354424967935, + "grad_norm": 4.227755069732666, + "learning_rate": 4.5653206650831355e-06, + "loss": 0.2628, + "step": 7435 + }, + { + "epoch": 1.5904232578024797, + "grad_norm": 4.175191879272461, + "learning_rate": 4.553444180522565e-06, + "loss": 0.2824, + "step": 7440 + }, + { + "epoch": 1.5914920906370242, + "grad_norm": 4.70232629776001, + "learning_rate": 4.541567695961996e-06, + "loss": 0.3249, + "step": 7445 + }, + { + "epoch": 1.5925609234715692, + "grad_norm": 5.078143119812012, + "learning_rate": 4.5296912114014254e-06, + "loss": 0.3738, + "step": 7450 + }, + { + "epoch": 1.5936297563061137, + "grad_norm": 3.0150363445281982, + "learning_rate": 4.517814726840856e-06, + "loss": 0.3357, + "step": 7455 + }, + { + "epoch": 1.5946985891406584, + "grad_norm": 6.010279655456543, + "learning_rate": 4.505938242280286e-06, + "loss": 0.2563, + "step": 7460 + }, + { + "epoch": 1.5957674219752032, + "grad_norm": 4.169801712036133, + "learning_rate": 4.494061757719715e-06, + "loss": 0.2091, + "step": 7465 + }, + { + "epoch": 1.5968362548097477, + "grad_norm": 5.483653545379639, + "learning_rate": 4.482185273159145e-06, + "loss": 0.2425, + "step": 7470 + }, + { + "epoch": 1.5979050876442924, + "grad_norm": 3.874551773071289, + "learning_rate": 4.470308788598575e-06, + "loss": 0.2699, + "step": 7475 + }, + { + "epoch": 1.5989739204788371, + "grad_norm": 5.686993598937988, + "learning_rate": 4.458432304038005e-06, + "loss": 0.3409, + "step": 7480 + }, + { + "epoch": 1.6000427533133816, + "grad_norm": 4.527751922607422, + "learning_rate": 4.446555819477435e-06, + "loss": 0.2233, + "step": 7485 + }, + { + "epoch": 1.6011115861479266, + "grad_norm": 4.663357257843018, + "learning_rate": 4.434679334916865e-06, + "loss": 0.3269, + "step": 7490 + }, + { + "epoch": 1.602180418982471, + "grad_norm": 5.009659767150879, + "learning_rate": 4.4228028503562945e-06, + "loss": 0.3029, + "step": 7495 + }, + { + "epoch": 1.6032492518170158, + "grad_norm": 3.9787962436676025, + "learning_rate": 4.410926365795725e-06, + "loss": 0.2547, + "step": 7500 + }, + { + "epoch": 1.6043180846515606, + "grad_norm": 5.281296253204346, + "learning_rate": 4.399049881235155e-06, + "loss": 0.2855, + "step": 7505 + }, + { + "epoch": 1.605386917486105, + "grad_norm": 6.091033935546875, + "learning_rate": 4.387173396674584e-06, + "loss": 0.3106, + "step": 7510 + }, + { + "epoch": 1.6064557503206498, + "grad_norm": 5.57248067855835, + "learning_rate": 4.375296912114015e-06, + "loss": 0.262, + "step": 7515 + }, + { + "epoch": 1.6075245831551945, + "grad_norm": 4.538100242614746, + "learning_rate": 4.363420427553445e-06, + "loss": 0.3016, + "step": 7520 + }, + { + "epoch": 1.608593415989739, + "grad_norm": 2.859865665435791, + "learning_rate": 4.351543942992874e-06, + "loss": 0.2852, + "step": 7525 + }, + { + "epoch": 1.609662248824284, + "grad_norm": 4.841543197631836, + "learning_rate": 4.339667458432305e-06, + "loss": 0.3126, + "step": 7530 + }, + { + "epoch": 1.6107310816588285, + "grad_norm": 4.134354114532471, + "learning_rate": 4.327790973871735e-06, + "loss": 0.2779, + "step": 7535 + }, + { + "epoch": 1.6117999144933732, + "grad_norm": 5.4539875984191895, + "learning_rate": 4.315914489311164e-06, + "loss": 0.2811, + "step": 7540 + }, + { + "epoch": 1.612868747327918, + "grad_norm": 4.018299579620361, + "learning_rate": 4.304038004750594e-06, + "loss": 0.259, + "step": 7545 + }, + { + "epoch": 1.6139375801624625, + "grad_norm": 3.978214740753174, + "learning_rate": 4.292161520190024e-06, + "loss": 0.2602, + "step": 7550 + }, + { + "epoch": 1.6150064129970074, + "grad_norm": 4.782619953155518, + "learning_rate": 4.280285035629454e-06, + "loss": 0.2481, + "step": 7555 + }, + { + "epoch": 1.616075245831552, + "grad_norm": 4.34796142578125, + "learning_rate": 4.268408551068884e-06, + "loss": 0.2022, + "step": 7560 + }, + { + "epoch": 1.6171440786660967, + "grad_norm": 4.58864688873291, + "learning_rate": 4.256532066508314e-06, + "loss": 0.2943, + "step": 7565 + }, + { + "epoch": 1.6182129115006414, + "grad_norm": 3.2588422298431396, + "learning_rate": 4.244655581947743e-06, + "loss": 0.2144, + "step": 7570 + }, + { + "epoch": 1.619281744335186, + "grad_norm": 4.609071731567383, + "learning_rate": 4.232779097387174e-06, + "loss": 0.2589, + "step": 7575 + }, + { + "epoch": 1.6203505771697306, + "grad_norm": 3.8828067779541016, + "learning_rate": 4.220902612826604e-06, + "loss": 0.1999, + "step": 7580 + }, + { + "epoch": 1.6214194100042754, + "grad_norm": 5.068613052368164, + "learning_rate": 4.209026128266034e-06, + "loss": 0.3035, + "step": 7585 + }, + { + "epoch": 1.6224882428388199, + "grad_norm": 3.4416937828063965, + "learning_rate": 4.197149643705464e-06, + "loss": 0.2322, + "step": 7590 + }, + { + "epoch": 1.6235570756733648, + "grad_norm": 4.246146202087402, + "learning_rate": 4.185273159144894e-06, + "loss": 0.237, + "step": 7595 + }, + { + "epoch": 1.6246259085079093, + "grad_norm": 4.175546646118164, + "learning_rate": 4.173396674584323e-06, + "loss": 0.2815, + "step": 7600 + }, + { + "epoch": 1.625694741342454, + "grad_norm": 5.142884254455566, + "learning_rate": 4.161520190023753e-06, + "loss": 0.4136, + "step": 7605 + }, + { + "epoch": 1.6267635741769988, + "grad_norm": 4.261429309844971, + "learning_rate": 4.1496437054631835e-06, + "loss": 0.2474, + "step": 7610 + }, + { + "epoch": 1.6278324070115433, + "grad_norm": 5.0894646644592285, + "learning_rate": 4.137767220902613e-06, + "loss": 0.3143, + "step": 7615 + }, + { + "epoch": 1.628901239846088, + "grad_norm": 4.596246242523193, + "learning_rate": 4.125890736342043e-06, + "loss": 0.244, + "step": 7620 + }, + { + "epoch": 1.6299700726806328, + "grad_norm": 4.05454158782959, + "learning_rate": 4.114014251781473e-06, + "loss": 0.3134, + "step": 7625 + }, + { + "epoch": 1.6310389055151773, + "grad_norm": 5.604685306549072, + "learning_rate": 4.102137767220903e-06, + "loss": 0.2516, + "step": 7630 + }, + { + "epoch": 1.6321077383497222, + "grad_norm": 2.5428969860076904, + "learning_rate": 4.090261282660333e-06, + "loss": 0.3159, + "step": 7635 + }, + { + "epoch": 1.6331765711842667, + "grad_norm": 3.228505849838257, + "learning_rate": 4.078384798099763e-06, + "loss": 0.2519, + "step": 7640 + }, + { + "epoch": 1.6342454040188115, + "grad_norm": 5.0502753257751465, + "learning_rate": 4.066508313539192e-06, + "loss": 0.2785, + "step": 7645 + }, + { + "epoch": 1.6353142368533562, + "grad_norm": 3.824427366256714, + "learning_rate": 4.054631828978622e-06, + "loss": 0.2627, + "step": 7650 + }, + { + "epoch": 1.6363830696879007, + "grad_norm": 4.460954666137695, + "learning_rate": 4.0427553444180526e-06, + "loss": 0.2677, + "step": 7655 + }, + { + "epoch": 1.6374519025224454, + "grad_norm": 3.3890676498413086, + "learning_rate": 4.030878859857483e-06, + "loss": 0.1962, + "step": 7660 + }, + { + "epoch": 1.6385207353569902, + "grad_norm": 4.556974411010742, + "learning_rate": 4.019002375296913e-06, + "loss": 0.3779, + "step": 7665 + }, + { + "epoch": 1.639589568191535, + "grad_norm": 3.9803950786590576, + "learning_rate": 4.0071258907363425e-06, + "loss": 0.2731, + "step": 7670 + }, + { + "epoch": 1.6406584010260796, + "grad_norm": 3.7230427265167236, + "learning_rate": 3.995249406175772e-06, + "loss": 0.2806, + "step": 7675 + }, + { + "epoch": 1.6417272338606241, + "grad_norm": 3.6325037479400635, + "learning_rate": 3.983372921615202e-06, + "loss": 0.2582, + "step": 7680 + }, + { + "epoch": 1.6427960666951689, + "grad_norm": 4.024942398071289, + "learning_rate": 3.9714964370546325e-06, + "loss": 0.2064, + "step": 7685 + }, + { + "epoch": 1.6438648995297136, + "grad_norm": 4.7745819091796875, + "learning_rate": 3.959619952494062e-06, + "loss": 0.263, + "step": 7690 + }, + { + "epoch": 1.644933732364258, + "grad_norm": 3.8132996559143066, + "learning_rate": 3.947743467933492e-06, + "loss": 0.3126, + "step": 7695 + }, + { + "epoch": 1.646002565198803, + "grad_norm": 3.711763620376587, + "learning_rate": 3.9358669833729216e-06, + "loss": 0.2889, + "step": 7700 + }, + { + "epoch": 1.6470713980333476, + "grad_norm": 3.696894645690918, + "learning_rate": 3.923990498812352e-06, + "loss": 0.2372, + "step": 7705 + }, + { + "epoch": 1.6481402308678923, + "grad_norm": 5.242607593536377, + "learning_rate": 3.912114014251782e-06, + "loss": 0.2062, + "step": 7710 + }, + { + "epoch": 1.649209063702437, + "grad_norm": 3.8635284900665283, + "learning_rate": 3.9002375296912115e-06, + "loss": 0.3085, + "step": 7715 + }, + { + "epoch": 1.6502778965369815, + "grad_norm": 4.494617938995361, + "learning_rate": 3.888361045130641e-06, + "loss": 0.22, + "step": 7720 + }, + { + "epoch": 1.6513467293715263, + "grad_norm": 5.683468818664551, + "learning_rate": 3.876484560570072e-06, + "loss": 0.258, + "step": 7725 + }, + { + "epoch": 1.652415562206071, + "grad_norm": 7.1560845375061035, + "learning_rate": 3.8646080760095015e-06, + "loss": 0.2496, + "step": 7730 + }, + { + "epoch": 1.6534843950406155, + "grad_norm": 4.27496337890625, + "learning_rate": 3.852731591448932e-06, + "loss": 0.2975, + "step": 7735 + }, + { + "epoch": 1.6545532278751605, + "grad_norm": 5.494519233703613, + "learning_rate": 3.840855106888362e-06, + "loss": 0.2744, + "step": 7740 + }, + { + "epoch": 1.655622060709705, + "grad_norm": 4.088238716125488, + "learning_rate": 3.8289786223277914e-06, + "loss": 0.2255, + "step": 7745 + }, + { + "epoch": 1.6566908935442497, + "grad_norm": 3.627351760864258, + "learning_rate": 3.817102137767221e-06, + "loss": 0.2387, + "step": 7750 + }, + { + "epoch": 1.6577597263787944, + "grad_norm": 4.195761680603027, + "learning_rate": 3.8052256532066513e-06, + "loss": 0.2724, + "step": 7755 + }, + { + "epoch": 1.658828559213339, + "grad_norm": 4.758053779602051, + "learning_rate": 3.793349168646081e-06, + "loss": 0.282, + "step": 7760 + }, + { + "epoch": 1.6598973920478837, + "grad_norm": 3.427823066711426, + "learning_rate": 3.781472684085511e-06, + "loss": 0.166, + "step": 7765 + }, + { + "epoch": 1.6609662248824284, + "grad_norm": 4.784726142883301, + "learning_rate": 3.769596199524941e-06, + "loss": 0.2653, + "step": 7770 + }, + { + "epoch": 1.662035057716973, + "grad_norm": 4.018444538116455, + "learning_rate": 3.757719714964371e-06, + "loss": 0.2368, + "step": 7775 + }, + { + "epoch": 1.6631038905515179, + "grad_norm": 4.532012462615967, + "learning_rate": 3.7458432304038006e-06, + "loss": 0.2235, + "step": 7780 + }, + { + "epoch": 1.6641727233860624, + "grad_norm": 4.576938152313232, + "learning_rate": 3.7339667458432303e-06, + "loss": 0.309, + "step": 7785 + }, + { + "epoch": 1.665241556220607, + "grad_norm": 4.126202583312988, + "learning_rate": 3.7220902612826604e-06, + "loss": 0.3402, + "step": 7790 + }, + { + "epoch": 1.6663103890551518, + "grad_norm": 5.895056247711182, + "learning_rate": 3.710213776722091e-06, + "loss": 0.2741, + "step": 7795 + }, + { + "epoch": 1.6673792218896963, + "grad_norm": 5.252209663391113, + "learning_rate": 3.6983372921615207e-06, + "loss": 0.2282, + "step": 7800 + }, + { + "epoch": 1.6684480547242413, + "grad_norm": 5.411665439605713, + "learning_rate": 3.6864608076009504e-06, + "loss": 0.3233, + "step": 7805 + }, + { + "epoch": 1.6695168875587858, + "grad_norm": 3.801215887069702, + "learning_rate": 3.6745843230403805e-06, + "loss": 0.23, + "step": 7810 + }, + { + "epoch": 1.6705857203933305, + "grad_norm": 5.455605983734131, + "learning_rate": 3.6627078384798102e-06, + "loss": 0.22, + "step": 7815 + }, + { + "epoch": 1.6716545532278753, + "grad_norm": 3.8827927112579346, + "learning_rate": 3.6508313539192404e-06, + "loss": 0.216, + "step": 7820 + }, + { + "epoch": 1.6727233860624198, + "grad_norm": 3.9195375442504883, + "learning_rate": 3.63895486935867e-06, + "loss": 0.3479, + "step": 7825 + }, + { + "epoch": 1.6737922188969645, + "grad_norm": 4.495283603668213, + "learning_rate": 3.6270783847981e-06, + "loss": 0.2256, + "step": 7830 + }, + { + "epoch": 1.6748610517315092, + "grad_norm": 5.642339706420898, + "learning_rate": 3.61520190023753e-06, + "loss": 0.3205, + "step": 7835 + }, + { + "epoch": 1.6759298845660537, + "grad_norm": 5.5151495933532715, + "learning_rate": 3.60332541567696e-06, + "loss": 0.28, + "step": 7840 + }, + { + "epoch": 1.6769987174005987, + "grad_norm": 3.8195252418518066, + "learning_rate": 3.5914489311163897e-06, + "loss": 0.255, + "step": 7845 + }, + { + "epoch": 1.6780675502351432, + "grad_norm": 5.310424327850342, + "learning_rate": 3.5795724465558194e-06, + "loss": 0.2584, + "step": 7850 + }, + { + "epoch": 1.679136383069688, + "grad_norm": 5.491156101226807, + "learning_rate": 3.5676959619952495e-06, + "loss": 0.254, + "step": 7855 + }, + { + "epoch": 1.6802052159042327, + "grad_norm": 4.094849109649658, + "learning_rate": 3.5558194774346792e-06, + "loss": 0.2051, + "step": 7860 + }, + { + "epoch": 1.6812740487387772, + "grad_norm": 3.9543018341064453, + "learning_rate": 3.54394299287411e-06, + "loss": 0.2653, + "step": 7865 + }, + { + "epoch": 1.682342881573322, + "grad_norm": 4.145587921142578, + "learning_rate": 3.5320665083135395e-06, + "loss": 0.2882, + "step": 7870 + }, + { + "epoch": 1.6834117144078666, + "grad_norm": 3.4505057334899902, + "learning_rate": 3.5201900237529696e-06, + "loss": 0.2685, + "step": 7875 + }, + { + "epoch": 1.6844805472424111, + "grad_norm": 4.536677837371826, + "learning_rate": 3.5083135391923993e-06, + "loss": 0.2606, + "step": 7880 + }, + { + "epoch": 1.685549380076956, + "grad_norm": 5.157629013061523, + "learning_rate": 3.4964370546318295e-06, + "loss": 0.2266, + "step": 7885 + }, + { + "epoch": 1.6866182129115006, + "grad_norm": 4.595909595489502, + "learning_rate": 3.484560570071259e-06, + "loss": 0.2112, + "step": 7890 + }, + { + "epoch": 1.6876870457460453, + "grad_norm": 4.331202030181885, + "learning_rate": 3.4726840855106893e-06, + "loss": 0.245, + "step": 7895 + }, + { + "epoch": 1.68875587858059, + "grad_norm": 5.239740371704102, + "learning_rate": 3.460807600950119e-06, + "loss": 0.2144, + "step": 7900 + }, + { + "epoch": 1.6898247114151346, + "grad_norm": 3.1925699710845947, + "learning_rate": 3.448931116389549e-06, + "loss": 0.3334, + "step": 7905 + }, + { + "epoch": 1.6908935442496793, + "grad_norm": 3.5667247772216797, + "learning_rate": 3.437054631828979e-06, + "loss": 0.2754, + "step": 7910 + }, + { + "epoch": 1.691962377084224, + "grad_norm": 4.145174026489258, + "learning_rate": 3.4251781472684085e-06, + "loss": 0.3048, + "step": 7915 + }, + { + "epoch": 1.6930312099187685, + "grad_norm": 3.559020519256592, + "learning_rate": 3.4133016627078386e-06, + "loss": 0.2319, + "step": 7920 + }, + { + "epoch": 1.6941000427533135, + "grad_norm": 3.1762850284576416, + "learning_rate": 3.4014251781472683e-06, + "loss": 0.3505, + "step": 7925 + }, + { + "epoch": 1.695168875587858, + "grad_norm": 4.600183963775635, + "learning_rate": 3.3895486935866985e-06, + "loss": 0.3171, + "step": 7930 + }, + { + "epoch": 1.6962377084224027, + "grad_norm": 4.069181442260742, + "learning_rate": 3.3776722090261286e-06, + "loss": 0.2359, + "step": 7935 + }, + { + "epoch": 1.6973065412569475, + "grad_norm": 5.979001998901367, + "learning_rate": 3.3657957244655587e-06, + "loss": 0.259, + "step": 7940 + }, + { + "epoch": 1.698375374091492, + "grad_norm": 4.2909040451049805, + "learning_rate": 3.3539192399049884e-06, + "loss": 0.2345, + "step": 7945 + }, + { + "epoch": 1.699444206926037, + "grad_norm": 4.572742938995361, + "learning_rate": 3.3420427553444185e-06, + "loss": 0.2364, + "step": 7950 + }, + { + "epoch": 1.7005130397605814, + "grad_norm": 4.979130744934082, + "learning_rate": 3.3301662707838482e-06, + "loss": 0.3125, + "step": 7955 + }, + { + "epoch": 1.7015818725951262, + "grad_norm": 8.828888893127441, + "learning_rate": 3.3182897862232784e-06, + "loss": 0.3855, + "step": 7960 + }, + { + "epoch": 1.702650705429671, + "grad_norm": 3.5113627910614014, + "learning_rate": 3.306413301662708e-06, + "loss": 0.3085, + "step": 7965 + }, + { + "epoch": 1.7037195382642154, + "grad_norm": 3.138580322265625, + "learning_rate": 3.294536817102138e-06, + "loss": 0.2558, + "step": 7970 + }, + { + "epoch": 1.7047883710987601, + "grad_norm": 3.382124900817871, + "learning_rate": 3.282660332541568e-06, + "loss": 0.2863, + "step": 7975 + }, + { + "epoch": 1.7058572039333049, + "grad_norm": 4.64111328125, + "learning_rate": 3.2707838479809976e-06, + "loss": 0.2763, + "step": 7980 + }, + { + "epoch": 1.7069260367678494, + "grad_norm": 3.9928252696990967, + "learning_rate": 3.2589073634204277e-06, + "loss": 0.2307, + "step": 7985 + }, + { + "epoch": 1.7079948696023943, + "grad_norm": 4.402683258056641, + "learning_rate": 3.2470308788598574e-06, + "loss": 0.2604, + "step": 7990 + }, + { + "epoch": 1.7090637024369388, + "grad_norm": 4.634458541870117, + "learning_rate": 3.2351543942992876e-06, + "loss": 0.3524, + "step": 7995 + }, + { + "epoch": 1.7101325352714836, + "grad_norm": 3.9876441955566406, + "learning_rate": 3.2232779097387173e-06, + "loss": 0.2591, + "step": 8000 + }, + { + "epoch": 1.7112013681060283, + "grad_norm": 5.491477012634277, + "learning_rate": 3.211401425178148e-06, + "loss": 0.2945, + "step": 8005 + }, + { + "epoch": 1.7122702009405728, + "grad_norm": 3.348909378051758, + "learning_rate": 3.1995249406175775e-06, + "loss": 0.2698, + "step": 8010 + }, + { + "epoch": 1.7133390337751175, + "grad_norm": 2.3808627128601074, + "learning_rate": 3.1876484560570076e-06, + "loss": 0.2091, + "step": 8015 + }, + { + "epoch": 1.7144078666096623, + "grad_norm": 4.511120319366455, + "learning_rate": 3.1757719714964373e-06, + "loss": 0.2313, + "step": 8020 + }, + { + "epoch": 1.7154766994442068, + "grad_norm": 3.1614320278167725, + "learning_rate": 3.1638954869358675e-06, + "loss": 0.2935, + "step": 8025 + }, + { + "epoch": 1.7165455322787517, + "grad_norm": 4.708336353302002, + "learning_rate": 3.152019002375297e-06, + "loss": 0.2358, + "step": 8030 + }, + { + "epoch": 1.7176143651132962, + "grad_norm": 5.274806499481201, + "learning_rate": 3.1401425178147273e-06, + "loss": 0.2842, + "step": 8035 + }, + { + "epoch": 1.718683197947841, + "grad_norm": 4.673067569732666, + "learning_rate": 3.128266033254157e-06, + "loss": 0.26, + "step": 8040 + }, + { + "epoch": 1.7197520307823857, + "grad_norm": 7.412868499755859, + "learning_rate": 3.1163895486935867e-06, + "loss": 0.312, + "step": 8045 + }, + { + "epoch": 1.7208208636169302, + "grad_norm": 5.098508834838867, + "learning_rate": 3.104513064133017e-06, + "loss": 0.2776, + "step": 8050 + }, + { + "epoch": 1.721889696451475, + "grad_norm": 2.9823100566864014, + "learning_rate": 3.0926365795724465e-06, + "loss": 0.1612, + "step": 8055 + }, + { + "epoch": 1.7229585292860197, + "grad_norm": 3.906702995300293, + "learning_rate": 3.0807600950118767e-06, + "loss": 0.1803, + "step": 8060 + }, + { + "epoch": 1.7240273621205642, + "grad_norm": 4.462987899780273, + "learning_rate": 3.0688836104513064e-06, + "loss": 0.2677, + "step": 8065 + }, + { + "epoch": 1.7250961949551091, + "grad_norm": 3.3349108695983887, + "learning_rate": 3.0570071258907365e-06, + "loss": 0.2315, + "step": 8070 + }, + { + "epoch": 1.7261650277896536, + "grad_norm": 3.8888843059539795, + "learning_rate": 3.0451306413301666e-06, + "loss": 0.2583, + "step": 8075 + }, + { + "epoch": 1.7272338606241984, + "grad_norm": 3.5807013511657715, + "learning_rate": 3.0332541567695967e-06, + "loss": 0.2488, + "step": 8080 + }, + { + "epoch": 1.728302693458743, + "grad_norm": 4.443240165710449, + "learning_rate": 3.0213776722090264e-06, + "loss": 0.2379, + "step": 8085 + }, + { + "epoch": 1.7293715262932876, + "grad_norm": 4.572385311126709, + "learning_rate": 3.0095011876484566e-06, + "loss": 0.2637, + "step": 8090 + }, + { + "epoch": 1.7304403591278326, + "grad_norm": 3.8426921367645264, + "learning_rate": 2.9976247030878863e-06, + "loss": 0.2101, + "step": 8095 + }, + { + "epoch": 1.731509191962377, + "grad_norm": 3.6695351600646973, + "learning_rate": 2.9857482185273164e-06, + "loss": 0.283, + "step": 8100 + }, + { + "epoch": 1.7325780247969218, + "grad_norm": 4.494965076446533, + "learning_rate": 2.973871733966746e-06, + "loss": 0.2163, + "step": 8105 + }, + { + "epoch": 1.7336468576314665, + "grad_norm": 4.575949192047119, + "learning_rate": 2.961995249406176e-06, + "loss": 0.2474, + "step": 8110 + }, + { + "epoch": 1.734715690466011, + "grad_norm": 5.060282230377197, + "learning_rate": 2.950118764845606e-06, + "loss": 0.3346, + "step": 8115 + }, + { + "epoch": 1.7357845233005558, + "grad_norm": 5.1213274002075195, + "learning_rate": 2.9382422802850356e-06, + "loss": 0.2031, + "step": 8120 + }, + { + "epoch": 1.7368533561351005, + "grad_norm": 4.754722595214844, + "learning_rate": 2.9263657957244658e-06, + "loss": 0.2301, + "step": 8125 + }, + { + "epoch": 1.737922188969645, + "grad_norm": 3.7561304569244385, + "learning_rate": 2.9144893111638955e-06, + "loss": 0.3413, + "step": 8130 + }, + { + "epoch": 1.73899102180419, + "grad_norm": 4.434960842132568, + "learning_rate": 2.9026128266033256e-06, + "loss": 0.3121, + "step": 8135 + }, + { + "epoch": 1.7400598546387345, + "grad_norm": 3.5216495990753174, + "learning_rate": 2.8907363420427553e-06, + "loss": 0.2696, + "step": 8140 + }, + { + "epoch": 1.7411286874732792, + "grad_norm": 3.2195262908935547, + "learning_rate": 2.878859857482186e-06, + "loss": 0.1871, + "step": 8145 + }, + { + "epoch": 1.742197520307824, + "grad_norm": 2.6963675022125244, + "learning_rate": 2.8669833729216155e-06, + "loss": 0.2427, + "step": 8150 + }, + { + "epoch": 1.7432663531423684, + "grad_norm": 3.3632442951202393, + "learning_rate": 2.8551068883610457e-06, + "loss": 0.215, + "step": 8155 + }, + { + "epoch": 1.7443351859769132, + "grad_norm": 4.627504825592041, + "learning_rate": 2.8432304038004754e-06, + "loss": 0.2603, + "step": 8160 + }, + { + "epoch": 1.745404018811458, + "grad_norm": 4.896625995635986, + "learning_rate": 2.8313539192399055e-06, + "loss": 0.2149, + "step": 8165 + }, + { + "epoch": 1.7464728516460024, + "grad_norm": 3.6175167560577393, + "learning_rate": 2.819477434679335e-06, + "loss": 0.2961, + "step": 8170 + }, + { + "epoch": 1.7475416844805474, + "grad_norm": 2.9704079627990723, + "learning_rate": 2.807600950118765e-06, + "loss": 0.2363, + "step": 8175 + }, + { + "epoch": 1.7486105173150919, + "grad_norm": 5.211386203765869, + "learning_rate": 2.795724465558195e-06, + "loss": 0.2238, + "step": 8180 + }, + { + "epoch": 1.7496793501496366, + "grad_norm": 4.538329601287842, + "learning_rate": 2.7838479809976247e-06, + "loss": 0.2522, + "step": 8185 + }, + { + "epoch": 1.7507481829841813, + "grad_norm": 4.693541049957275, + "learning_rate": 2.771971496437055e-06, + "loss": 0.2314, + "step": 8190 + }, + { + "epoch": 1.7518170158187258, + "grad_norm": 6.232285499572754, + "learning_rate": 2.7600950118764846e-06, + "loss": 0.34, + "step": 8195 + }, + { + "epoch": 1.7528858486532708, + "grad_norm": 3.3624300956726074, + "learning_rate": 2.7482185273159147e-06, + "loss": 0.3113, + "step": 8200 + }, + { + "epoch": 1.7539546814878153, + "grad_norm": 4.9479193687438965, + "learning_rate": 2.7363420427553444e-06, + "loss": 0.2022, + "step": 8205 + }, + { + "epoch": 1.75502351432236, + "grad_norm": 2.4150469303131104, + "learning_rate": 2.7244655581947745e-06, + "loss": 0.2244, + "step": 8210 + }, + { + "epoch": 1.7560923471569048, + "grad_norm": 2.7240800857543945, + "learning_rate": 2.7125890736342046e-06, + "loss": 0.1794, + "step": 8215 + }, + { + "epoch": 1.7571611799914493, + "grad_norm": 5.584763526916504, + "learning_rate": 2.7007125890736348e-06, + "loss": 0.3058, + "step": 8220 + }, + { + "epoch": 1.758230012825994, + "grad_norm": 6.3999505043029785, + "learning_rate": 2.6888361045130645e-06, + "loss": 0.2495, + "step": 8225 + }, + { + "epoch": 1.7592988456605387, + "grad_norm": 3.8963570594787598, + "learning_rate": 2.6769596199524946e-06, + "loss": 0.3586, + "step": 8230 + }, + { + "epoch": 1.7603676784950832, + "grad_norm": 4.307738780975342, + "learning_rate": 2.6650831353919243e-06, + "loss": 0.2726, + "step": 8235 + }, + { + "epoch": 1.7614365113296282, + "grad_norm": 4.685522079467773, + "learning_rate": 2.653206650831354e-06, + "loss": 0.2774, + "step": 8240 + }, + { + "epoch": 1.7625053441641727, + "grad_norm": 3.685218095779419, + "learning_rate": 2.641330166270784e-06, + "loss": 0.3009, + "step": 8245 + }, + { + "epoch": 1.7635741769987174, + "grad_norm": 3.9087140560150146, + "learning_rate": 2.629453681710214e-06, + "loss": 0.2813, + "step": 8250 + }, + { + "epoch": 1.7646430098332622, + "grad_norm": 4.455633163452148, + "learning_rate": 2.617577197149644e-06, + "loss": 0.2942, + "step": 8255 + }, + { + "epoch": 1.7657118426678067, + "grad_norm": 3.3832907676696777, + "learning_rate": 2.6057007125890737e-06, + "loss": 0.2463, + "step": 8260 + }, + { + "epoch": 1.7667806755023514, + "grad_norm": 4.235377788543701, + "learning_rate": 2.5938242280285038e-06, + "loss": 0.2399, + "step": 8265 + }, + { + "epoch": 1.7678495083368961, + "grad_norm": 5.997225761413574, + "learning_rate": 2.5819477434679335e-06, + "loss": 0.2725, + "step": 8270 + }, + { + "epoch": 1.7689183411714406, + "grad_norm": 3.9668803215026855, + "learning_rate": 2.5700712589073636e-06, + "loss": 0.2171, + "step": 8275 + }, + { + "epoch": 1.7699871740059856, + "grad_norm": 6.379711151123047, + "learning_rate": 2.5581947743467933e-06, + "loss": 0.3037, + "step": 8280 + }, + { + "epoch": 1.77105600684053, + "grad_norm": 4.1840901374816895, + "learning_rate": 2.546318289786224e-06, + "loss": 0.1921, + "step": 8285 + }, + { + "epoch": 1.7721248396750748, + "grad_norm": 3.4607646465301514, + "learning_rate": 2.5344418052256536e-06, + "loss": 0.2519, + "step": 8290 + }, + { + "epoch": 1.7731936725096196, + "grad_norm": 4.899019241333008, + "learning_rate": 2.5225653206650837e-06, + "loss": 0.2644, + "step": 8295 + }, + { + "epoch": 1.774262505344164, + "grad_norm": 3.769134283065796, + "learning_rate": 2.5106888361045134e-06, + "loss": 0.3707, + "step": 8300 + }, + { + "epoch": 1.7753313381787088, + "grad_norm": 3.0456831455230713, + "learning_rate": 2.4988123515439435e-06, + "loss": 0.1496, + "step": 8305 + }, + { + "epoch": 1.7764001710132535, + "grad_norm": 4.198024749755859, + "learning_rate": 2.4869358669833732e-06, + "loss": 0.2251, + "step": 8310 + }, + { + "epoch": 1.777469003847798, + "grad_norm": 3.964083194732666, + "learning_rate": 2.475059382422803e-06, + "loss": 0.2426, + "step": 8315 + }, + { + "epoch": 1.778537836682343, + "grad_norm": 4.519120216369629, + "learning_rate": 2.463182897862233e-06, + "loss": 0.2853, + "step": 8320 + }, + { + "epoch": 1.7796066695168875, + "grad_norm": 4.322653293609619, + "learning_rate": 2.4513064133016627e-06, + "loss": 0.2156, + "step": 8325 + }, + { + "epoch": 1.7806755023514322, + "grad_norm": 2.6961798667907715, + "learning_rate": 2.439429928741093e-06, + "loss": 0.2293, + "step": 8330 + }, + { + "epoch": 1.781744335185977, + "grad_norm": 4.139772415161133, + "learning_rate": 2.4275534441805226e-06, + "loss": 0.2516, + "step": 8335 + }, + { + "epoch": 1.7828131680205215, + "grad_norm": 3.3040573596954346, + "learning_rate": 2.4156769596199527e-06, + "loss": 0.2272, + "step": 8340 + }, + { + "epoch": 1.7838820008550664, + "grad_norm": 4.51014518737793, + "learning_rate": 2.403800475059383e-06, + "loss": 0.2995, + "step": 8345 + }, + { + "epoch": 1.784950833689611, + "grad_norm": 3.647020101547241, + "learning_rate": 2.3919239904988125e-06, + "loss": 0.2825, + "step": 8350 + }, + { + "epoch": 1.7860196665241557, + "grad_norm": 3.456620931625366, + "learning_rate": 2.3800475059382427e-06, + "loss": 0.2604, + "step": 8355 + }, + { + "epoch": 1.7870884993587004, + "grad_norm": 5.626756191253662, + "learning_rate": 2.3681710213776724e-06, + "loss": 0.2216, + "step": 8360 + }, + { + "epoch": 1.788157332193245, + "grad_norm": 4.277560710906982, + "learning_rate": 2.356294536817102e-06, + "loss": 0.3034, + "step": 8365 + }, + { + "epoch": 1.7892261650277896, + "grad_norm": 2.8576090335845947, + "learning_rate": 2.344418052256532e-06, + "loss": 0.229, + "step": 8370 + }, + { + "epoch": 1.7902949978623344, + "grad_norm": 4.79686975479126, + "learning_rate": 2.3325415676959623e-06, + "loss": 0.271, + "step": 8375 + }, + { + "epoch": 1.7913638306968789, + "grad_norm": 5.135036945343018, + "learning_rate": 2.320665083135392e-06, + "loss": 0.2371, + "step": 8380 + }, + { + "epoch": 1.7924326635314238, + "grad_norm": 5.7761406898498535, + "learning_rate": 2.308788598574822e-06, + "loss": 0.2592, + "step": 8385 + }, + { + "epoch": 1.7935014963659683, + "grad_norm": 2.8430325984954834, + "learning_rate": 2.296912114014252e-06, + "loss": 0.206, + "step": 8390 + }, + { + "epoch": 1.794570329200513, + "grad_norm": 4.540223598480225, + "learning_rate": 2.285035629453682e-06, + "loss": 0.2167, + "step": 8395 + }, + { + "epoch": 1.7956391620350578, + "grad_norm": 4.889501094818115, + "learning_rate": 2.2731591448931117e-06, + "loss": 0.2521, + "step": 8400 + }, + { + "epoch": 1.7967079948696023, + "grad_norm": 3.3274142742156982, + "learning_rate": 2.261282660332542e-06, + "loss": 0.2283, + "step": 8405 + }, + { + "epoch": 1.797776827704147, + "grad_norm": 3.501002073287964, + "learning_rate": 2.249406175771972e-06, + "loss": 0.2161, + "step": 8410 + }, + { + "epoch": 1.7988456605386918, + "grad_norm": 2.9936413764953613, + "learning_rate": 2.2375296912114016e-06, + "loss": 0.2233, + "step": 8415 + }, + { + "epoch": 1.7999144933732363, + "grad_norm": 4.086530685424805, + "learning_rate": 2.2256532066508318e-06, + "loss": 0.2799, + "step": 8420 + }, + { + "epoch": 1.8009833262077812, + "grad_norm": 4.791090965270996, + "learning_rate": 2.2137767220902615e-06, + "loss": 0.2558, + "step": 8425 + }, + { + "epoch": 1.8020521590423257, + "grad_norm": 4.07485294342041, + "learning_rate": 2.201900237529691e-06, + "loss": 0.3093, + "step": 8430 + }, + { + "epoch": 1.8031209918768705, + "grad_norm": 4.454413414001465, + "learning_rate": 2.1900237529691213e-06, + "loss": 0.2751, + "step": 8435 + }, + { + "epoch": 1.8041898247114152, + "grad_norm": 4.849613666534424, + "learning_rate": 2.178147268408551e-06, + "loss": 0.268, + "step": 8440 + }, + { + "epoch": 1.8052586575459597, + "grad_norm": 4.424874782562256, + "learning_rate": 2.166270783847981e-06, + "loss": 0.2473, + "step": 8445 + }, + { + "epoch": 1.8063274903805044, + "grad_norm": 5.070244789123535, + "learning_rate": 2.1543942992874112e-06, + "loss": 0.3218, + "step": 8450 + }, + { + "epoch": 1.8073963232150492, + "grad_norm": 4.407561302185059, + "learning_rate": 2.142517814726841e-06, + "loss": 0.2602, + "step": 8455 + }, + { + "epoch": 1.8084651560495937, + "grad_norm": 3.2732160091400146, + "learning_rate": 2.130641330166271e-06, + "loss": 0.2118, + "step": 8460 + }, + { + "epoch": 1.8095339888841386, + "grad_norm": 6.757079124450684, + "learning_rate": 2.1187648456057008e-06, + "loss": 0.2526, + "step": 8465 + }, + { + "epoch": 1.8106028217186831, + "grad_norm": 3.9517734050750732, + "learning_rate": 2.106888361045131e-06, + "loss": 0.2797, + "step": 8470 + }, + { + "epoch": 1.8116716545532279, + "grad_norm": 3.6137807369232178, + "learning_rate": 2.0950118764845606e-06, + "loss": 0.2177, + "step": 8475 + }, + { + "epoch": 1.8127404873877726, + "grad_norm": 3.5731587409973145, + "learning_rate": 2.0831353919239907e-06, + "loss": 0.2264, + "step": 8480 + }, + { + "epoch": 1.8138093202223171, + "grad_norm": 4.859638690948486, + "learning_rate": 2.071258907363421e-06, + "loss": 0.253, + "step": 8485 + }, + { + "epoch": 1.814878153056862, + "grad_norm": 4.231696605682373, + "learning_rate": 2.0593824228028506e-06, + "loss": 0.199, + "step": 8490 + }, + { + "epoch": 1.8159469858914066, + "grad_norm": 3.7343459129333496, + "learning_rate": 2.0475059382422803e-06, + "loss": 0.2484, + "step": 8495 + }, + { + "epoch": 1.8170158187259513, + "grad_norm": 4.6749958992004395, + "learning_rate": 2.0356294536817104e-06, + "loss": 0.2666, + "step": 8500 + }, + { + "epoch": 1.818084651560496, + "grad_norm": 3.5160164833068848, + "learning_rate": 2.02375296912114e-06, + "loss": 0.2423, + "step": 8505 + }, + { + "epoch": 1.8191534843950405, + "grad_norm": 5.324501037597656, + "learning_rate": 2.01187648456057e-06, + "loss": 0.3206, + "step": 8510 + }, + { + "epoch": 1.8202223172295853, + "grad_norm": 4.3562092781066895, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.1885, + "step": 8515 + }, + { + "epoch": 1.82129115006413, + "grad_norm": 3.188398838043213, + "learning_rate": 1.98812351543943e-06, + "loss": 0.3103, + "step": 8520 + }, + { + "epoch": 1.8223599828986745, + "grad_norm": 3.9081082344055176, + "learning_rate": 1.97624703087886e-06, + "loss": 0.2204, + "step": 8525 + }, + { + "epoch": 1.8234288157332195, + "grad_norm": 4.220818519592285, + "learning_rate": 1.96437054631829e-06, + "loss": 0.2269, + "step": 8530 + }, + { + "epoch": 1.824497648567764, + "grad_norm": 4.2256035804748535, + "learning_rate": 1.95249406175772e-06, + "loss": 0.3602, + "step": 8535 + }, + { + "epoch": 1.8255664814023087, + "grad_norm": 3.092357635498047, + "learning_rate": 1.9406175771971497e-06, + "loss": 0.1735, + "step": 8540 + }, + { + "epoch": 1.8266353142368534, + "grad_norm": 5.8758649826049805, + "learning_rate": 1.9287410926365794e-06, + "loss": 0.3107, + "step": 8545 + }, + { + "epoch": 1.827704147071398, + "grad_norm": 4.43316650390625, + "learning_rate": 1.91686460807601e-06, + "loss": 0.2346, + "step": 8550 + }, + { + "epoch": 1.8287729799059427, + "grad_norm": 4.877310276031494, + "learning_rate": 1.9049881235154396e-06, + "loss": 0.2372, + "step": 8555 + }, + { + "epoch": 1.8298418127404874, + "grad_norm": 5.378355026245117, + "learning_rate": 1.8931116389548696e-06, + "loss": 0.2665, + "step": 8560 + }, + { + "epoch": 1.830910645575032, + "grad_norm": 4.576028347015381, + "learning_rate": 1.8812351543942995e-06, + "loss": 0.2722, + "step": 8565 + }, + { + "epoch": 1.8319794784095769, + "grad_norm": 2.452864646911621, + "learning_rate": 1.8693586698337294e-06, + "loss": 0.1879, + "step": 8570 + }, + { + "epoch": 1.8330483112441214, + "grad_norm": 4.013648509979248, + "learning_rate": 1.8574821852731593e-06, + "loss": 0.2434, + "step": 8575 + }, + { + "epoch": 1.834117144078666, + "grad_norm": 4.295891761779785, + "learning_rate": 1.845605700712589e-06, + "loss": 0.3408, + "step": 8580 + }, + { + "epoch": 1.8351859769132108, + "grad_norm": 5.399013042449951, + "learning_rate": 1.8337292161520193e-06, + "loss": 0.2819, + "step": 8585 + }, + { + "epoch": 1.8362548097477553, + "grad_norm": 5.267197608947754, + "learning_rate": 1.8218527315914493e-06, + "loss": 0.2828, + "step": 8590 + }, + { + "epoch": 1.8373236425823, + "grad_norm": 4.1791672706604, + "learning_rate": 1.809976247030879e-06, + "loss": 0.1821, + "step": 8595 + }, + { + "epoch": 1.8383924754168448, + "grad_norm": 4.158424377441406, + "learning_rate": 1.7980997624703089e-06, + "loss": 0.2193, + "step": 8600 + }, + { + "epoch": 1.8394613082513895, + "grad_norm": 3.101128101348877, + "learning_rate": 1.7862232779097388e-06, + "loss": 0.3155, + "step": 8605 + }, + { + "epoch": 1.8405301410859343, + "grad_norm": 3.661057233810425, + "learning_rate": 1.7743467933491687e-06, + "loss": 0.2129, + "step": 8610 + }, + { + "epoch": 1.8415989739204788, + "grad_norm": 3.7547378540039062, + "learning_rate": 1.7624703087885986e-06, + "loss": 0.2321, + "step": 8615 + }, + { + "epoch": 1.8426678067550235, + "grad_norm": 4.53202486038208, + "learning_rate": 1.7505938242280287e-06, + "loss": 0.1748, + "step": 8620 + }, + { + "epoch": 1.8437366395895682, + "grad_norm": 3.7189040184020996, + "learning_rate": 1.7387173396674587e-06, + "loss": 0.2355, + "step": 8625 + }, + { + "epoch": 1.8448054724241127, + "grad_norm": 5.827390670776367, + "learning_rate": 1.7268408551068886e-06, + "loss": 0.2226, + "step": 8630 + }, + { + "epoch": 1.8458743052586577, + "grad_norm": 4.365615367889404, + "learning_rate": 1.7149643705463185e-06, + "loss": 0.2812, + "step": 8635 + }, + { + "epoch": 1.8469431380932022, + "grad_norm": 4.593905925750732, + "learning_rate": 1.7030878859857484e-06, + "loss": 0.2542, + "step": 8640 + }, + { + "epoch": 1.848011970927747, + "grad_norm": 4.3599419593811035, + "learning_rate": 1.691211401425178e-06, + "loss": 0.214, + "step": 8645 + }, + { + "epoch": 1.8490808037622917, + "grad_norm": 5.342328071594238, + "learning_rate": 1.679334916864608e-06, + "loss": 0.2157, + "step": 8650 + }, + { + "epoch": 1.8501496365968362, + "grad_norm": 3.1678943634033203, + "learning_rate": 1.6674584323040384e-06, + "loss": 0.2336, + "step": 8655 + }, + { + "epoch": 1.851218469431381, + "grad_norm": 4.464089870452881, + "learning_rate": 1.655581947743468e-06, + "loss": 0.3409, + "step": 8660 + }, + { + "epoch": 1.8522873022659256, + "grad_norm": 4.1919755935668945, + "learning_rate": 1.643705463182898e-06, + "loss": 0.2902, + "step": 8665 + }, + { + "epoch": 1.8533561351004701, + "grad_norm": 3.814858913421631, + "learning_rate": 1.6318289786223279e-06, + "loss": 0.2329, + "step": 8670 + }, + { + "epoch": 1.854424967935015, + "grad_norm": 3.2706382274627686, + "learning_rate": 1.6199524940617578e-06, + "loss": 0.1849, + "step": 8675 + }, + { + "epoch": 1.8554938007695596, + "grad_norm": 3.6442952156066895, + "learning_rate": 1.6080760095011877e-06, + "loss": 0.2919, + "step": 8680 + }, + { + "epoch": 1.8565626336041043, + "grad_norm": 3.179872512817383, + "learning_rate": 1.5961995249406176e-06, + "loss": 0.2157, + "step": 8685 + }, + { + "epoch": 1.857631466438649, + "grad_norm": 3.71156644821167, + "learning_rate": 1.5843230403800478e-06, + "loss": 0.2286, + "step": 8690 + }, + { + "epoch": 1.8587002992731936, + "grad_norm": 5.000162124633789, + "learning_rate": 1.5724465558194777e-06, + "loss": 0.1957, + "step": 8695 + }, + { + "epoch": 1.8597691321077383, + "grad_norm": 3.7217514514923096, + "learning_rate": 1.5605700712589076e-06, + "loss": 0.2102, + "step": 8700 + }, + { + "epoch": 1.860837964942283, + "grad_norm": 5.23848295211792, + "learning_rate": 1.5486935866983375e-06, + "loss": 0.3172, + "step": 8705 + }, + { + "epoch": 1.8619067977768275, + "grad_norm": 3.95940899848938, + "learning_rate": 1.5368171021377672e-06, + "loss": 0.2619, + "step": 8710 + }, + { + "epoch": 1.8629756306113725, + "grad_norm": 4.389864921569824, + "learning_rate": 1.5249406175771971e-06, + "loss": 0.2898, + "step": 8715 + }, + { + "epoch": 1.864044463445917, + "grad_norm": 4.196899890899658, + "learning_rate": 1.513064133016627e-06, + "loss": 0.2523, + "step": 8720 + }, + { + "epoch": 1.8651132962804617, + "grad_norm": 4.35107946395874, + "learning_rate": 1.5011876484560572e-06, + "loss": 0.2534, + "step": 8725 + }, + { + "epoch": 1.8661821291150065, + "grad_norm": 5.233465194702148, + "learning_rate": 1.489311163895487e-06, + "loss": 0.2546, + "step": 8730 + }, + { + "epoch": 1.867250961949551, + "grad_norm": 4.285619735717773, + "learning_rate": 1.477434679334917e-06, + "loss": 0.2171, + "step": 8735 + }, + { + "epoch": 1.868319794784096, + "grad_norm": 5.0237579345703125, + "learning_rate": 1.465558194774347e-06, + "loss": 0.2617, + "step": 8740 + }, + { + "epoch": 1.8693886276186404, + "grad_norm": 3.848062753677368, + "learning_rate": 1.4536817102137768e-06, + "loss": 0.1917, + "step": 8745 + }, + { + "epoch": 1.8704574604531852, + "grad_norm": 3.6329150199890137, + "learning_rate": 1.4418052256532067e-06, + "loss": 0.2256, + "step": 8750 + }, + { + "epoch": 1.87152629328773, + "grad_norm": 4.504333019256592, + "learning_rate": 1.4299287410926366e-06, + "loss": 0.2319, + "step": 8755 + }, + { + "epoch": 1.8725951261222744, + "grad_norm": 6.011372089385986, + "learning_rate": 1.4180522565320668e-06, + "loss": 0.2783, + "step": 8760 + }, + { + "epoch": 1.8736639589568191, + "grad_norm": 4.750868320465088, + "learning_rate": 1.4061757719714967e-06, + "loss": 0.2885, + "step": 8765 + }, + { + "epoch": 1.8747327917913639, + "grad_norm": 3.2728309631347656, + "learning_rate": 1.3942992874109266e-06, + "loss": 0.2586, + "step": 8770 + }, + { + "epoch": 1.8758016246259084, + "grad_norm": 3.3371262550354004, + "learning_rate": 1.3824228028503565e-06, + "loss": 0.2009, + "step": 8775 + }, + { + "epoch": 1.8768704574604533, + "grad_norm": 3.7395825386047363, + "learning_rate": 1.3705463182897862e-06, + "loss": 0.273, + "step": 8780 + }, + { + "epoch": 1.8779392902949978, + "grad_norm": 4.672481060028076, + "learning_rate": 1.3586698337292161e-06, + "loss": 0.2502, + "step": 8785 + }, + { + "epoch": 1.8790081231295426, + "grad_norm": 2.957099676132202, + "learning_rate": 1.346793349168646e-06, + "loss": 0.2174, + "step": 8790 + }, + { + "epoch": 1.8800769559640873, + "grad_norm": 4.8943915367126465, + "learning_rate": 1.3349168646080762e-06, + "loss": 0.2723, + "step": 8795 + }, + { + "epoch": 1.8811457887986318, + "grad_norm": 4.067677021026611, + "learning_rate": 1.323040380047506e-06, + "loss": 0.2633, + "step": 8800 + }, + { + "epoch": 1.8822146216331765, + "grad_norm": 4.314869403839111, + "learning_rate": 1.311163895486936e-06, + "loss": 0.2794, + "step": 8805 + }, + { + "epoch": 1.8832834544677213, + "grad_norm": 4.225076675415039, + "learning_rate": 1.299287410926366e-06, + "loss": 0.2961, + "step": 8810 + }, + { + "epoch": 1.8843522873022658, + "grad_norm": 3.992135763168335, + "learning_rate": 1.2874109263657958e-06, + "loss": 0.2598, + "step": 8815 + }, + { + "epoch": 1.8854211201368107, + "grad_norm": 4.5158586502075195, + "learning_rate": 1.2755344418052257e-06, + "loss": 0.2794, + "step": 8820 + }, + { + "epoch": 1.8864899529713552, + "grad_norm": 4.226551055908203, + "learning_rate": 1.2636579572446556e-06, + "loss": 0.2447, + "step": 8825 + }, + { + "epoch": 1.8875587858059, + "grad_norm": 3.2052338123321533, + "learning_rate": 1.2517814726840858e-06, + "loss": 0.2741, + "step": 8830 + }, + { + "epoch": 1.8886276186404447, + "grad_norm": 3.315537929534912, + "learning_rate": 1.2399049881235155e-06, + "loss": 0.2192, + "step": 8835 + }, + { + "epoch": 1.8896964514749892, + "grad_norm": 4.095473289489746, + "learning_rate": 1.2280285035629456e-06, + "loss": 0.3188, + "step": 8840 + }, + { + "epoch": 1.890765284309534, + "grad_norm": 4.654134273529053, + "learning_rate": 1.2161520190023753e-06, + "loss": 0.294, + "step": 8845 + }, + { + "epoch": 1.8918341171440787, + "grad_norm": 3.982452154159546, + "learning_rate": 1.2042755344418052e-06, + "loss": 0.2961, + "step": 8850 + }, + { + "epoch": 1.8929029499786232, + "grad_norm": 3.594325542449951, + "learning_rate": 1.1923990498812353e-06, + "loss": 0.2288, + "step": 8855 + }, + { + "epoch": 1.8939717828131681, + "grad_norm": 4.437509059906006, + "learning_rate": 1.1805225653206653e-06, + "loss": 0.2796, + "step": 8860 + }, + { + "epoch": 1.8950406156477126, + "grad_norm": 4.6788716316223145, + "learning_rate": 1.1686460807600952e-06, + "loss": 0.2464, + "step": 8865 + }, + { + "epoch": 1.8961094484822574, + "grad_norm": 4.381009578704834, + "learning_rate": 1.1567695961995249e-06, + "loss": 0.2435, + "step": 8870 + }, + { + "epoch": 1.897178281316802, + "grad_norm": 4.203982353210449, + "learning_rate": 1.144893111638955e-06, + "loss": 0.2993, + "step": 8875 + }, + { + "epoch": 1.8982471141513466, + "grad_norm": 3.9560775756835938, + "learning_rate": 1.133016627078385e-06, + "loss": 0.2049, + "step": 8880 + }, + { + "epoch": 1.8993159469858916, + "grad_norm": 4.908998012542725, + "learning_rate": 1.1211401425178148e-06, + "loss": 0.2588, + "step": 8885 + }, + { + "epoch": 1.900384779820436, + "grad_norm": 2.399383544921875, + "learning_rate": 1.1092636579572447e-06, + "loss": 0.2559, + "step": 8890 + }, + { + "epoch": 1.9014536126549808, + "grad_norm": 5.100274085998535, + "learning_rate": 1.0973871733966747e-06, + "loss": 0.261, + "step": 8895 + }, + { + "epoch": 1.9025224454895255, + "grad_norm": 1.9479761123657227, + "learning_rate": 1.0855106888361046e-06, + "loss": 0.2132, + "step": 8900 + }, + { + "epoch": 1.90359127832407, + "grad_norm": 4.266331195831299, + "learning_rate": 1.0736342042755345e-06, + "loss": 0.2184, + "step": 8905 + }, + { + "epoch": 1.9046601111586148, + "grad_norm": 3.761469841003418, + "learning_rate": 1.0617577197149644e-06, + "loss": 0.2551, + "step": 8910 + }, + { + "epoch": 1.9057289439931595, + "grad_norm": 5.301465034484863, + "learning_rate": 1.0498812351543943e-06, + "loss": 0.2302, + "step": 8915 + }, + { + "epoch": 1.906797776827704, + "grad_norm": 4.8627095222473145, + "learning_rate": 1.0380047505938242e-06, + "loss": 0.2441, + "step": 8920 + }, + { + "epoch": 1.907866609662249, + "grad_norm": 3.7152163982391357, + "learning_rate": 1.0261282660332544e-06, + "loss": 0.2176, + "step": 8925 + }, + { + "epoch": 1.9089354424967935, + "grad_norm": 4.612980365753174, + "learning_rate": 1.0142517814726843e-06, + "loss": 0.3041, + "step": 8930 + }, + { + "epoch": 1.9100042753313382, + "grad_norm": 3.9601426124572754, + "learning_rate": 1.002375296912114e-06, + "loss": 0.2325, + "step": 8935 + }, + { + "epoch": 1.911073108165883, + "grad_norm": 3.773958921432495, + "learning_rate": 9.904988123515439e-07, + "loss": 0.2463, + "step": 8940 + }, + { + "epoch": 1.9121419410004274, + "grad_norm": 5.172873020172119, + "learning_rate": 9.78622327790974e-07, + "loss": 0.2997, + "step": 8945 + }, + { + "epoch": 1.9132107738349722, + "grad_norm": 3.382683038711548, + "learning_rate": 9.66745843230404e-07, + "loss": 0.2202, + "step": 8950 + }, + { + "epoch": 1.914279606669517, + "grad_norm": 5.699649333953857, + "learning_rate": 9.548693586698338e-07, + "loss": 0.2745, + "step": 8955 + }, + { + "epoch": 1.9153484395040614, + "grad_norm": 4.574731349945068, + "learning_rate": 9.429928741092638e-07, + "loss": 0.2642, + "step": 8960 + }, + { + "epoch": 1.9164172723386064, + "grad_norm": 7.173608303070068, + "learning_rate": 9.311163895486937e-07, + "loss": 0.2782, + "step": 8965 + }, + { + "epoch": 1.9174861051731509, + "grad_norm": 3.9324846267700195, + "learning_rate": 9.192399049881236e-07, + "loss": 0.2435, + "step": 8970 + }, + { + "epoch": 1.9185549380076956, + "grad_norm": 3.742494583129883, + "learning_rate": 9.073634204275535e-07, + "loss": 0.2492, + "step": 8975 + }, + { + "epoch": 1.9196237708422403, + "grad_norm": 5.236582279205322, + "learning_rate": 8.954869358669835e-07, + "loss": 0.2161, + "step": 8980 + }, + { + "epoch": 1.9206926036767848, + "grad_norm": 3.473259449005127, + "learning_rate": 8.836104513064133e-07, + "loss": 0.2549, + "step": 8985 + }, + { + "epoch": 1.9217614365113296, + "grad_norm": 3.2006514072418213, + "learning_rate": 8.717339667458432e-07, + "loss": 0.2217, + "step": 8990 + }, + { + "epoch": 1.9228302693458743, + "grad_norm": 3.0505008697509766, + "learning_rate": 8.598574821852733e-07, + "loss": 0.266, + "step": 8995 + }, + { + "epoch": 1.9238991021804188, + "grad_norm": 3.8124094009399414, + "learning_rate": 8.479809976247032e-07, + "loss": 0.2909, + "step": 9000 + }, + { + "epoch": 1.9249679350149638, + "grad_norm": 3.0390665531158447, + "learning_rate": 8.361045130641331e-07, + "loss": 0.2149, + "step": 9005 + }, + { + "epoch": 1.9260367678495083, + "grad_norm": 3.928755521774292, + "learning_rate": 8.24228028503563e-07, + "loss": 0.3099, + "step": 9010 + }, + { + "epoch": 1.927105600684053, + "grad_norm": 4.092939376831055, + "learning_rate": 8.12351543942993e-07, + "loss": 0.1929, + "step": 9015 + }, + { + "epoch": 1.9281744335185977, + "grad_norm": 4.7592573165893555, + "learning_rate": 8.004750593824228e-07, + "loss": 0.2854, + "step": 9020 + }, + { + "epoch": 1.9292432663531422, + "grad_norm": 3.904730796813965, + "learning_rate": 7.885985748218527e-07, + "loss": 0.1857, + "step": 9025 + }, + { + "epoch": 1.9303120991876872, + "grad_norm": 4.656405925750732, + "learning_rate": 7.767220902612828e-07, + "loss": 0.2445, + "step": 9030 + }, + { + "epoch": 1.9313809320222317, + "grad_norm": 3.890486240386963, + "learning_rate": 7.648456057007127e-07, + "loss": 0.226, + "step": 9035 + }, + { + "epoch": 1.9324497648567764, + "grad_norm": 4.5724334716796875, + "learning_rate": 7.529691211401426e-07, + "loss": 0.2822, + "step": 9040 + }, + { + "epoch": 1.9335185976913212, + "grad_norm": 4.720613479614258, + "learning_rate": 7.410926365795724e-07, + "loss": 0.2541, + "step": 9045 + }, + { + "epoch": 1.9345874305258657, + "grad_norm": 3.9262373447418213, + "learning_rate": 7.292161520190025e-07, + "loss": 0.2442, + "step": 9050 + }, + { + "epoch": 1.9356562633604104, + "grad_norm": 3.6456849575042725, + "learning_rate": 7.173396674584323e-07, + "loss": 0.2504, + "step": 9055 + }, + { + "epoch": 1.9367250961949551, + "grad_norm": 3.021383762359619, + "learning_rate": 7.054631828978623e-07, + "loss": 0.2073, + "step": 9060 + }, + { + "epoch": 1.9377939290294997, + "grad_norm": 4.671846389770508, + "learning_rate": 6.935866983372923e-07, + "loss": 0.2245, + "step": 9065 + }, + { + "epoch": 1.9388627618640446, + "grad_norm": 4.805634021759033, + "learning_rate": 6.817102137767222e-07, + "loss": 0.2442, + "step": 9070 + }, + { + "epoch": 1.9399315946985891, + "grad_norm": 3.9393720626831055, + "learning_rate": 6.698337292161521e-07, + "loss": 0.2382, + "step": 9075 + }, + { + "epoch": 1.9410004275331338, + "grad_norm": 5.1551408767700195, + "learning_rate": 6.579572446555819e-07, + "loss": 0.2482, + "step": 9080 + }, + { + "epoch": 1.9420692603676786, + "grad_norm": 5.381765365600586, + "learning_rate": 6.460807600950119e-07, + "loss": 0.2849, + "step": 9085 + }, + { + "epoch": 1.943138093202223, + "grad_norm": 3.842059850692749, + "learning_rate": 6.342042755344418e-07, + "loss": 0.2666, + "step": 9090 + }, + { + "epoch": 1.9442069260367678, + "grad_norm": 4.254835605621338, + "learning_rate": 6.223277909738719e-07, + "loss": 0.224, + "step": 9095 + }, + { + "epoch": 1.9452757588713125, + "grad_norm": 5.467522144317627, + "learning_rate": 6.104513064133017e-07, + "loss": 0.2961, + "step": 9100 + }, + { + "epoch": 1.946344591705857, + "grad_norm": 4.110438823699951, + "learning_rate": 5.985748218527317e-07, + "loss": 0.217, + "step": 9105 + }, + { + "epoch": 1.947413424540402, + "grad_norm": 4.675514221191406, + "learning_rate": 5.866983372921616e-07, + "loss": 0.2384, + "step": 9110 + }, + { + "epoch": 1.9484822573749465, + "grad_norm": 4.90285062789917, + "learning_rate": 5.748218527315915e-07, + "loss": 0.2205, + "step": 9115 + }, + { + "epoch": 1.9495510902094912, + "grad_norm": 4.838087558746338, + "learning_rate": 5.629453681710214e-07, + "loss": 0.2807, + "step": 9120 + }, + { + "epoch": 1.950619923044036, + "grad_norm": 4.49014949798584, + "learning_rate": 5.510688836104513e-07, + "loss": 0.2577, + "step": 9125 + }, + { + "epoch": 1.9516887558785805, + "grad_norm": 6.248046398162842, + "learning_rate": 5.391923990498813e-07, + "loss": 0.3212, + "step": 9130 + }, + { + "epoch": 1.9527575887131252, + "grad_norm": 2.6727161407470703, + "learning_rate": 5.273159144893112e-07, + "loss": 0.239, + "step": 9135 + }, + { + "epoch": 1.95382642154767, + "grad_norm": 5.567617416381836, + "learning_rate": 5.154394299287412e-07, + "loss": 0.2283, + "step": 9140 + }, + { + "epoch": 1.9548952543822147, + "grad_norm": 4.877483367919922, + "learning_rate": 5.03562945368171e-07, + "loss": 0.2606, + "step": 9145 + }, + { + "epoch": 1.9559640872167594, + "grad_norm": 4.150485515594482, + "learning_rate": 4.91686460807601e-07, + "loss": 0.2684, + "step": 9150 + }, + { + "epoch": 1.957032920051304, + "grad_norm": 4.878507614135742, + "learning_rate": 4.798099762470309e-07, + "loss": 0.2857, + "step": 9155 + }, + { + "epoch": 1.9581017528858486, + "grad_norm": 5.343387126922607, + "learning_rate": 4.6793349168646085e-07, + "loss": 0.2962, + "step": 9160 + }, + { + "epoch": 1.9591705857203934, + "grad_norm": 4.346437454223633, + "learning_rate": 4.560570071258908e-07, + "loss": 0.2639, + "step": 9165 + }, + { + "epoch": 1.9602394185549379, + "grad_norm": 5.331128120422363, + "learning_rate": 4.441805225653207e-07, + "loss": 0.2549, + "step": 9170 + }, + { + "epoch": 1.9613082513894828, + "grad_norm": 4.075921535491943, + "learning_rate": 4.3230403800475065e-07, + "loss": 0.2561, + "step": 9175 + }, + { + "epoch": 1.9623770842240273, + "grad_norm": 4.879267692565918, + "learning_rate": 4.2042755344418056e-07, + "loss": 0.2324, + "step": 9180 + }, + { + "epoch": 1.963445917058572, + "grad_norm": 4.325537204742432, + "learning_rate": 4.085510688836105e-07, + "loss": 0.3142, + "step": 9185 + }, + { + "epoch": 1.9645147498931168, + "grad_norm": 3.530134439468384, + "learning_rate": 3.966745843230404e-07, + "loss": 0.2778, + "step": 9190 + }, + { + "epoch": 1.9655835827276613, + "grad_norm": 2.6315903663635254, + "learning_rate": 3.8479809976247036e-07, + "loss": 0.2603, + "step": 9195 + }, + { + "epoch": 1.966652415562206, + "grad_norm": 4.100142002105713, + "learning_rate": 3.729216152019002e-07, + "loss": 0.2191, + "step": 9200 + }, + { + "epoch": 1.9677212483967508, + "grad_norm": 3.4908711910247803, + "learning_rate": 3.610451306413302e-07, + "loss": 0.2658, + "step": 9205 + }, + { + "epoch": 1.9687900812312953, + "grad_norm": 4.331186771392822, + "learning_rate": 3.4916864608076015e-07, + "loss": 0.2701, + "step": 9210 + }, + { + "epoch": 1.9698589140658402, + "grad_norm": 6.090305805206299, + "learning_rate": 3.3729216152019e-07, + "loss": 0.3, + "step": 9215 + }, + { + "epoch": 1.9709277469003847, + "grad_norm": 3.7345423698425293, + "learning_rate": 3.2541567695962e-07, + "loss": 0.2728, + "step": 9220 + }, + { + "epoch": 1.9719965797349295, + "grad_norm": 6.370054244995117, + "learning_rate": 3.135391923990499e-07, + "loss": 0.2724, + "step": 9225 + }, + { + "epoch": 1.9730654125694742, + "grad_norm": 3.2030200958251953, + "learning_rate": 3.0166270783847986e-07, + "loss": 0.1735, + "step": 9230 + }, + { + "epoch": 1.9741342454040187, + "grad_norm": 3.904633045196533, + "learning_rate": 2.897862232779098e-07, + "loss": 0.25, + "step": 9235 + }, + { + "epoch": 1.9752030782385634, + "grad_norm": 5.196364402770996, + "learning_rate": 2.779097387173397e-07, + "loss": 0.276, + "step": 9240 + }, + { + "epoch": 1.9762719110731082, + "grad_norm": 5.8324785232543945, + "learning_rate": 2.660332541567696e-07, + "loss": 0.2581, + "step": 9245 + }, + { + "epoch": 1.9773407439076527, + "grad_norm": 3.4866878986358643, + "learning_rate": 2.541567695961995e-07, + "loss": 0.2545, + "step": 9250 + }, + { + "epoch": 1.9784095767421976, + "grad_norm": 5.080046653747559, + "learning_rate": 2.422802850356295e-07, + "loss": 0.2251, + "step": 9255 + }, + { + "epoch": 1.9794784095767421, + "grad_norm": 4.654627799987793, + "learning_rate": 2.304038004750594e-07, + "loss": 0.269, + "step": 9260 + }, + { + "epoch": 1.9805472424112869, + "grad_norm": 4.3756327629089355, + "learning_rate": 2.1852731591448934e-07, + "loss": 0.2026, + "step": 9265 + }, + { + "epoch": 1.9816160752458316, + "grad_norm": 4.612358093261719, + "learning_rate": 2.0665083135391925e-07, + "loss": 0.2297, + "step": 9270 + }, + { + "epoch": 1.9826849080803761, + "grad_norm": 4.363190174102783, + "learning_rate": 1.9477434679334917e-07, + "loss": 0.2414, + "step": 9275 + }, + { + "epoch": 1.983753740914921, + "grad_norm": 4.239806175231934, + "learning_rate": 1.828978622327791e-07, + "loss": 0.2724, + "step": 9280 + }, + { + "epoch": 1.9848225737494656, + "grad_norm": 3.087779998779297, + "learning_rate": 1.7102137767220902e-07, + "loss": 0.2338, + "step": 9285 + }, + { + "epoch": 1.9858914065840103, + "grad_norm": 5.1465277671813965, + "learning_rate": 1.59144893111639e-07, + "loss": 0.2606, + "step": 9290 + }, + { + "epoch": 1.986960239418555, + "grad_norm": 3.789433240890503, + "learning_rate": 1.4726840855106888e-07, + "loss": 0.2747, + "step": 9295 + }, + { + "epoch": 1.9880290722530995, + "grad_norm": 3.880868673324585, + "learning_rate": 1.3539192399049882e-07, + "loss": 0.1792, + "step": 9300 + }, + { + "epoch": 1.9890979050876443, + "grad_norm": 4.200949668884277, + "learning_rate": 1.2351543942992876e-07, + "loss": 0.2479, + "step": 9305 + }, + { + "epoch": 1.990166737922189, + "grad_norm": 4.372617721557617, + "learning_rate": 1.1163895486935867e-07, + "loss": 0.2554, + "step": 9310 + }, + { + "epoch": 1.9912355707567335, + "grad_norm": 3.7008919715881348, + "learning_rate": 9.97624703087886e-08, + "loss": 0.25, + "step": 9315 + }, + { + "epoch": 1.9923044035912785, + "grad_norm": 3.9479458332061768, + "learning_rate": 8.788598574821854e-08, + "loss": 0.2814, + "step": 9320 + }, + { + "epoch": 1.993373236425823, + "grad_norm": 4.310093402862549, + "learning_rate": 7.600950118764846e-08, + "loss": 0.2102, + "step": 9325 + }, + { + "epoch": 1.9944420692603677, + "grad_norm": 3.808363199234009, + "learning_rate": 6.41330166270784e-08, + "loss": 0.2466, + "step": 9330 + }, + { + "epoch": 1.9955109020949124, + "grad_norm": 4.076649188995361, + "learning_rate": 5.225653206650832e-08, + "loss": 0.2547, + "step": 9335 + }, + { + "epoch": 1.996579734929457, + "grad_norm": 3.773390531539917, + "learning_rate": 4.0380047505938245e-08, + "loss": 0.2216, + "step": 9340 + }, + { + "epoch": 1.9976485677640017, + "grad_norm": 3.149965286254883, + "learning_rate": 2.8503562945368176e-08, + "loss": 0.2521, + "step": 9345 + }, + { + "epoch": 1.9987174005985464, + "grad_norm": 3.375763177871704, + "learning_rate": 1.66270783847981e-08, + "loss": 0.2558, + "step": 9350 + }, + { + "epoch": 1.999786233433091, + "grad_norm": 5.134764194488525, + "learning_rate": 4.7505938242280285e-09, + "loss": 0.2345, + "step": 9355 + }, + { + "epoch": 2.0, + "eval_loss": 0.12192188948392868, + "eval_mrr": 0.9798825256975033, + "eval_runtime": 315.6223, + "eval_samples_per_second": 7.192, + "eval_steps_per_second": 0.9, + "step": 9356 + } + ], + "logging_steps": 5, + "max_steps": 9356, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 1, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}