diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5734 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1626, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024622960911049553, + "grad_norm": 1.5390625, + "learning_rate": 2.040816326530612e-10, + "loss": 1.3865270614624023, + "step": 2 + }, + { + "epoch": 0.0049245921822099106, + "grad_norm": 4.375, + "learning_rate": 6.122448979591837e-10, + "loss": 1.8760377168655396, + "step": 4 + }, + { + "epoch": 0.007386888273314866, + "grad_norm": 2.359375, + "learning_rate": 1.020408163265306e-09, + "loss": 1.1314038038253784, + "step": 6 + }, + { + "epoch": 0.009849184364419821, + "grad_norm": 5.71875, + "learning_rate": 1.4285714285714286e-09, + "loss": 1.8253700733184814, + "step": 8 + }, + { + "epoch": 0.012311480455524777, + "grad_norm": 12.625, + "learning_rate": 1.8367346938775511e-09, + "loss": 2.2051210403442383, + "step": 10 + }, + { + "epoch": 0.014773776546629732, + "grad_norm": 20.375, + "learning_rate": 2.2448979591836736e-09, + "loss": 2.4439101219177246, + "step": 12 + }, + { + "epoch": 0.017236072637734686, + "grad_norm": 3.578125, + "learning_rate": 2.653061224489796e-09, + "loss": 1.3878843784332275, + "step": 14 + }, + { + "epoch": 0.019698368728839642, + "grad_norm": 1.765625, + "learning_rate": 3.0612244897959187e-09, + "loss": 1.1822748184204102, + "step": 16 + }, + { + "epoch": 0.0221606648199446, + "grad_norm": 2.53125, + "learning_rate": 3.4693877551020408e-09, + "loss": 1.1794735193252563, + "step": 18 + }, + { + "epoch": 0.024622960911049555, + "grad_norm": 14.625, + "learning_rate": 3.877551020408163e-09, + "loss": 2.3212547302246094, + "step": 20 + }, + { + "epoch": 0.02708525700215451, + "grad_norm": 5.625, + "learning_rate": 4.285714285714286e-09, + "loss": 1.7700073719024658, + "step": 22 + }, + { + "epoch": 0.029547553093259463, + "grad_norm": 14.25, + "learning_rate": 4.693877551020409e-09, + "loss": 2.191647529602051, + "step": 24 + }, + { + "epoch": 0.03200984918436442, + "grad_norm": 4.15625, + "learning_rate": 5.102040816326531e-09, + "loss": 1.7301385402679443, + "step": 26 + }, + { + "epoch": 0.03447214527546937, + "grad_norm": 14.1875, + "learning_rate": 5.510204081632653e-09, + "loss": 2.343463659286499, + "step": 28 + }, + { + "epoch": 0.03693444136657433, + "grad_norm": 5.90625, + "learning_rate": 5.918367346938776e-09, + "loss": 1.2581849098205566, + "step": 30 + }, + { + "epoch": 0.039396737457679284, + "grad_norm": 5.1875, + "learning_rate": 6.326530612244899e-09, + "loss": 1.9037660360336304, + "step": 32 + }, + { + "epoch": 0.041859033548784244, + "grad_norm": 6.25, + "learning_rate": 6.73469387755102e-09, + "loss": 1.8926417827606201, + "step": 34 + }, + { + "epoch": 0.0443213296398892, + "grad_norm": 4.15625, + "learning_rate": 7.142857142857143e-09, + "loss": 1.494161605834961, + "step": 36 + }, + { + "epoch": 0.04678362573099415, + "grad_norm": 72.5, + "learning_rate": 7.551020408163264e-09, + "loss": 2.4310765266418457, + "step": 38 + }, + { + "epoch": 0.04924592182209911, + "grad_norm": 13.1875, + "learning_rate": 7.959183673469387e-09, + "loss": 2.401200294494629, + "step": 40 + }, + { + "epoch": 0.05170821791320406, + "grad_norm": 17.875, + "learning_rate": 8.36734693877551e-09, + "loss": 2.269543170928955, + "step": 42 + }, + { + "epoch": 0.05417051400430902, + "grad_norm": 6.375, + "learning_rate": 8.775510204081633e-09, + "loss": 1.880392074584961, + "step": 44 + }, + { + "epoch": 0.056632810095413974, + "grad_norm": 10.9375, + "learning_rate": 9.183673469387756e-09, + "loss": 2.2891359329223633, + "step": 46 + }, + { + "epoch": 0.05909510618651893, + "grad_norm": 2.953125, + "learning_rate": 9.591836734693877e-09, + "loss": 1.245388150215149, + "step": 48 + }, + { + "epoch": 0.061557402277623886, + "grad_norm": 14.0625, + "learning_rate": 1e-08, + "loss": 1.8519728183746338, + "step": 50 + }, + { + "epoch": 0.06401969836872884, + "grad_norm": 12.3125, + "learning_rate": 9.99996825131286e-09, + "loss": 2.678940773010254, + "step": 52 + }, + { + "epoch": 0.0664819944598338, + "grad_norm": 13.4375, + "learning_rate": 9.999873005755431e-09, + "loss": 2.3168435096740723, + "step": 54 + }, + { + "epoch": 0.06894429055093874, + "grad_norm": 23.125, + "learning_rate": 9.999714264839672e-09, + "loss": 2.218395233154297, + "step": 56 + }, + { + "epoch": 0.0714065866420437, + "grad_norm": 3.265625, + "learning_rate": 9.999492031085492e-09, + "loss": 1.2967658042907715, + "step": 58 + }, + { + "epoch": 0.07386888273314866, + "grad_norm": 8.4375, + "learning_rate": 9.999206308020707e-09, + "loss": 2.0597116947174072, + "step": 60 + }, + { + "epoch": 0.07633117882425362, + "grad_norm": 3.984375, + "learning_rate": 9.99885710018098e-09, + "loss": 1.6437733173370361, + "step": 62 + }, + { + "epoch": 0.07879347491535857, + "grad_norm": 6.9375, + "learning_rate": 9.99844441310976e-09, + "loss": 1.878865122795105, + "step": 64 + }, + { + "epoch": 0.08125577100646353, + "grad_norm": 5.34375, + "learning_rate": 9.997968253358178e-09, + "loss": 1.8909335136413574, + "step": 66 + }, + { + "epoch": 0.08371806709756849, + "grad_norm": 15.9375, + "learning_rate": 9.997428628484963e-09, + "loss": 2.290242910385132, + "step": 68 + }, + { + "epoch": 0.08618036318867343, + "grad_norm": 8.9375, + "learning_rate": 9.996825547056302e-09, + "loss": 2.0678482055664062, + "step": 70 + }, + { + "epoch": 0.0886426592797784, + "grad_norm": 5.75, + "learning_rate": 9.996159018645721e-09, + "loss": 1.8928303718566895, + "step": 72 + }, + { + "epoch": 0.09110495537088335, + "grad_norm": 7.53125, + "learning_rate": 9.995429053833917e-09, + "loss": 1.9023447036743164, + "step": 74 + }, + { + "epoch": 0.0935672514619883, + "grad_norm": 7.59375, + "learning_rate": 9.994635664208602e-09, + "loss": 1.914489507675171, + "step": 76 + }, + { + "epoch": 0.09602954755309326, + "grad_norm": 11.125, + "learning_rate": 9.99377886236432e-09, + "loss": 2.057431221008301, + "step": 78 + }, + { + "epoch": 0.09849184364419822, + "grad_norm": 7.21875, + "learning_rate": 9.992858661902233e-09, + "loss": 1.9636759757995605, + "step": 80 + }, + { + "epoch": 0.10095413973530316, + "grad_norm": 4.15625, + "learning_rate": 9.99187507742992e-09, + "loss": 1.298654317855835, + "step": 82 + }, + { + "epoch": 0.10341643582640812, + "grad_norm": 2.953125, + "learning_rate": 9.990828124561143e-09, + "loss": 1.1845377683639526, + "step": 84 + }, + { + "epoch": 0.10587873191751308, + "grad_norm": 12.0625, + "learning_rate": 9.989717819915584e-09, + "loss": 2.3120527267456055, + "step": 86 + }, + { + "epoch": 0.10834102800861804, + "grad_norm": 6.75, + "learning_rate": 9.988544181118608e-09, + "loss": 1.792182445526123, + "step": 88 + }, + { + "epoch": 0.11080332409972299, + "grad_norm": 4.03125, + "learning_rate": 9.987307226800957e-09, + "loss": 1.4169440269470215, + "step": 90 + }, + { + "epoch": 0.11326562019082795, + "grad_norm": 19.375, + "learning_rate": 9.98600697659847e-09, + "loss": 2.2629003524780273, + "step": 92 + }, + { + "epoch": 0.11572791628193291, + "grad_norm": 4.65625, + "learning_rate": 9.984643451151764e-09, + "loss": 1.8561232089996338, + "step": 94 + }, + { + "epoch": 0.11819021237303785, + "grad_norm": 5.03125, + "learning_rate": 9.98321667210591e-09, + "loss": 1.8327598571777344, + "step": 96 + }, + { + "epoch": 0.12065250846414281, + "grad_norm": 3.34375, + "learning_rate": 9.98172666211009e-09, + "loss": 1.2463821172714233, + "step": 98 + }, + { + "epoch": 0.12311480455524777, + "grad_norm": 6.375, + "learning_rate": 9.980173444817238e-09, + "loss": 1.351346731185913, + "step": 100 + }, + { + "epoch": 0.12557710064635272, + "grad_norm": 4.0625, + "learning_rate": 9.978557044883651e-09, + "loss": 1.2666093111038208, + "step": 102 + }, + { + "epoch": 0.12803939673745768, + "grad_norm": 4.78125, + "learning_rate": 9.976877487968623e-09, + "loss": 1.905246615409851, + "step": 104 + }, + { + "epoch": 0.13050169282856264, + "grad_norm": 3.0, + "learning_rate": 9.975134800734015e-09, + "loss": 1.1379789113998413, + "step": 106 + }, + { + "epoch": 0.1329639889196676, + "grad_norm": 4.65625, + "learning_rate": 9.973329010843847e-09, + "loss": 1.8731987476348877, + "step": 108 + }, + { + "epoch": 0.13542628501077256, + "grad_norm": 10.3125, + "learning_rate": 9.97146014696384e-09, + "loss": 1.897504448890686, + "step": 110 + }, + { + "epoch": 0.1378885811018775, + "grad_norm": 2.375, + "learning_rate": 9.96952823876099e-09, + "loss": 1.1055809259414673, + "step": 112 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 14.5625, + "learning_rate": 9.967533316903066e-09, + "loss": 2.4285759925842285, + "step": 114 + }, + { + "epoch": 0.1428131732840874, + "grad_norm": 6.0625, + "learning_rate": 9.965475413058142e-09, + "loss": 1.8401623964309692, + "step": 116 + }, + { + "epoch": 0.14527546937519237, + "grad_norm": 3.625, + "learning_rate": 9.963354559894099e-09, + "loss": 1.2698298692703247, + "step": 118 + }, + { + "epoch": 0.14773776546629733, + "grad_norm": 1.6875, + "learning_rate": 9.961170791078078e-09, + "loss": 1.1040065288543701, + "step": 120 + }, + { + "epoch": 0.1502000615574023, + "grad_norm": 16.375, + "learning_rate": 9.958924141275982e-09, + "loss": 1.8983745574951172, + "step": 122 + }, + { + "epoch": 0.15266235764850725, + "grad_norm": 5.125, + "learning_rate": 9.956614646151903e-09, + "loss": 1.9957232475280762, + "step": 124 + }, + { + "epoch": 0.15512465373961218, + "grad_norm": 12.75, + "learning_rate": 9.954242342367555e-09, + "loss": 2.3904964923858643, + "step": 126 + }, + { + "epoch": 0.15758694983071714, + "grad_norm": 5.0625, + "learning_rate": 9.951807267581707e-09, + "loss": 1.8866188526153564, + "step": 128 + }, + { + "epoch": 0.1600492459218221, + "grad_norm": 1.984375, + "learning_rate": 9.94930946044957e-09, + "loss": 1.2808419466018677, + "step": 130 + }, + { + "epoch": 0.16251154201292706, + "grad_norm": 2.484375, + "learning_rate": 9.946748960622197e-09, + "loss": 1.3167526721954346, + "step": 132 + }, + { + "epoch": 0.16497383810403202, + "grad_norm": 2.4375, + "learning_rate": 9.944125808745837e-09, + "loss": 1.2127764225006104, + "step": 134 + }, + { + "epoch": 0.16743613419513698, + "grad_norm": 4.5, + "learning_rate": 9.941440046461305e-09, + "loss": 1.9335191249847412, + "step": 136 + }, + { + "epoch": 0.1698984302862419, + "grad_norm": 9.0, + "learning_rate": 9.938691716403316e-09, + "loss": 1.9803462028503418, + "step": 138 + }, + { + "epoch": 0.17236072637734687, + "grad_norm": 4.65625, + "learning_rate": 9.935880862199809e-09, + "loss": 1.820433259010315, + "step": 140 + }, + { + "epoch": 0.17482302246845183, + "grad_norm": 5.78125, + "learning_rate": 9.93300752847124e-09, + "loss": 1.9337809085845947, + "step": 142 + }, + { + "epoch": 0.1772853185595568, + "grad_norm": 5.28125, + "learning_rate": 9.930071760829904e-09, + "loss": 1.8973931074142456, + "step": 144 + }, + { + "epoch": 0.17974761465066175, + "grad_norm": 5.40625, + "learning_rate": 9.927073605879185e-09, + "loss": 1.9531124830245972, + "step": 146 + }, + { + "epoch": 0.1822099107417667, + "grad_norm": 5.75, + "learning_rate": 9.924013111212818e-09, + "loss": 1.9310762882232666, + "step": 148 + }, + { + "epoch": 0.18467220683287167, + "grad_norm": 9.375, + "learning_rate": 9.920890325414153e-09, + "loss": 2.008820056915283, + "step": 150 + }, + { + "epoch": 0.1871345029239766, + "grad_norm": 82.5, + "learning_rate": 9.917705298055361e-09, + "loss": 3.0185141563415527, + "step": 152 + }, + { + "epoch": 0.18959679901508156, + "grad_norm": 8.625, + "learning_rate": 9.914458079696664e-09, + "loss": 2.008962631225586, + "step": 154 + }, + { + "epoch": 0.19205909510618652, + "grad_norm": 9.25, + "learning_rate": 9.91114872188552e-09, + "loss": 1.6197317838668823, + "step": 156 + }, + { + "epoch": 0.19452139119729148, + "grad_norm": 4.53125, + "learning_rate": 9.907777277155811e-09, + "loss": 1.8305246829986572, + "step": 158 + }, + { + "epoch": 0.19698368728839644, + "grad_norm": 9.75, + "learning_rate": 9.904343799027012e-09, + "loss": 1.9033877849578857, + "step": 160 + }, + { + "epoch": 0.1994459833795014, + "grad_norm": 8.5, + "learning_rate": 9.90084834200333e-09, + "loss": 1.9224884510040283, + "step": 162 + }, + { + "epoch": 0.20190827947060633, + "grad_norm": 5.5, + "learning_rate": 9.897290961572854e-09, + "loss": 1.5109963417053223, + "step": 164 + }, + { + "epoch": 0.2043705755617113, + "grad_norm": 6.0625, + "learning_rate": 9.893671714206662e-09, + "loss": 1.9377520084381104, + "step": 166 + }, + { + "epoch": 0.20683287165281625, + "grad_norm": 5.03125, + "learning_rate": 9.889990657357933e-09, + "loss": 1.6958491802215576, + "step": 168 + }, + { + "epoch": 0.2092951677439212, + "grad_norm": 5.1875, + "learning_rate": 9.886247849461023e-09, + "loss": 1.320851445198059, + "step": 170 + }, + { + "epoch": 0.21175746383502617, + "grad_norm": 17.375, + "learning_rate": 9.882443349930552e-09, + "loss": 2.529175281524658, + "step": 172 + }, + { + "epoch": 0.21421975992613113, + "grad_norm": 5.53125, + "learning_rate": 9.878577219160456e-09, + "loss": 1.9636085033416748, + "step": 174 + }, + { + "epoch": 0.21668205601723609, + "grad_norm": 5.84375, + "learning_rate": 9.87464951852302e-09, + "loss": 1.9693580865859985, + "step": 176 + }, + { + "epoch": 0.21914435210834102, + "grad_norm": 8.5, + "learning_rate": 9.870660310367915e-09, + "loss": 1.955024242401123, + "step": 178 + }, + { + "epoch": 0.22160664819944598, + "grad_norm": 11.5, + "learning_rate": 9.866609658021202e-09, + "loss": 2.3577377796173096, + "step": 180 + }, + { + "epoch": 0.22406894429055094, + "grad_norm": 14.1875, + "learning_rate": 9.862497625784324e-09, + "loss": 2.3302321434020996, + "step": 182 + }, + { + "epoch": 0.2265312403816559, + "grad_norm": 5.40625, + "learning_rate": 9.8583242789331e-09, + "loss": 1.872032642364502, + "step": 184 + }, + { + "epoch": 0.22899353647276086, + "grad_norm": 8.1875, + "learning_rate": 9.854089683716666e-09, + "loss": 1.9843339920043945, + "step": 186 + }, + { + "epoch": 0.23145583256386582, + "grad_norm": 6.375, + "learning_rate": 9.849793907356444e-09, + "loss": 1.8600096702575684, + "step": 188 + }, + { + "epoch": 0.23391812865497075, + "grad_norm": 11.0, + "learning_rate": 9.845437018045063e-09, + "loss": 2.281198024749756, + "step": 190 + }, + { + "epoch": 0.2363804247460757, + "grad_norm": 4.34375, + "learning_rate": 9.841019084945281e-09, + "loss": 1.8489793539047241, + "step": 192 + }, + { + "epoch": 0.23884272083718067, + "grad_norm": 4.40625, + "learning_rate": 9.836540178188888e-09, + "loss": 1.8184915781021118, + "step": 194 + }, + { + "epoch": 0.24130501692828563, + "grad_norm": 39.5, + "learning_rate": 9.832000368875586e-09, + "loss": 2.5119130611419678, + "step": 196 + }, + { + "epoch": 0.24376731301939059, + "grad_norm": 42.25, + "learning_rate": 9.82739972907187e-09, + "loss": 1.7983183860778809, + "step": 198 + }, + { + "epoch": 0.24622960911049555, + "grad_norm": 7.9375, + "learning_rate": 9.822738331809873e-09, + "loss": 1.8701186180114746, + "step": 200 + }, + { + "epoch": 0.2486919052016005, + "grad_norm": 10.8125, + "learning_rate": 9.818016251086222e-09, + "loss": 2.0227789878845215, + "step": 202 + }, + { + "epoch": 0.25115420129270544, + "grad_norm": 9.625, + "learning_rate": 9.813233561860844e-09, + "loss": 2.185953140258789, + "step": 204 + }, + { + "epoch": 0.2536164973838104, + "grad_norm": 5.0, + "learning_rate": 9.808390340055792e-09, + "loss": 1.850534439086914, + "step": 206 + }, + { + "epoch": 0.25607879347491536, + "grad_norm": 5.125, + "learning_rate": 9.803486662554038e-09, + "loss": 1.9469786882400513, + "step": 208 + }, + { + "epoch": 0.2585410895660203, + "grad_norm": 4.03125, + "learning_rate": 9.798522607198235e-09, + "loss": 1.7527638673782349, + "step": 210 + }, + { + "epoch": 0.2610033856571253, + "grad_norm": 4.75, + "learning_rate": 9.79349825278951e-09, + "loss": 1.9203780889511108, + "step": 212 + }, + { + "epoch": 0.2634656817482302, + "grad_norm": 4.53125, + "learning_rate": 9.788413679086188e-09, + "loss": 1.8700388669967651, + "step": 214 + }, + { + "epoch": 0.2659279778393352, + "grad_norm": 5.78125, + "learning_rate": 9.783268966802539e-09, + "loss": 2.030698299407959, + "step": 216 + }, + { + "epoch": 0.2683902739304401, + "grad_norm": 7.375, + "learning_rate": 9.778064197607495e-09, + "loss": 1.936469316482544, + "step": 218 + }, + { + "epoch": 0.2708525700215451, + "grad_norm": 18.875, + "learning_rate": 9.772799454123349e-09, + "loss": 2.471208095550537, + "step": 220 + }, + { + "epoch": 0.27331486611265005, + "grad_norm": 13.25, + "learning_rate": 9.767474819924447e-09, + "loss": 2.437526226043701, + "step": 222 + }, + { + "epoch": 0.275777162203755, + "grad_norm": 6.5625, + "learning_rate": 9.762090379535862e-09, + "loss": 2.013521909713745, + "step": 224 + }, + { + "epoch": 0.27823945829485996, + "grad_norm": 14.0625, + "learning_rate": 9.756646218432053e-09, + "loss": 2.0168678760528564, + "step": 226 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 5.8125, + "learning_rate": 9.751142423035501e-09, + "loss": 1.995202660560608, + "step": 228 + }, + { + "epoch": 0.2831640504770699, + "grad_norm": 42.5, + "learning_rate": 9.74557908071535e-09, + "loss": 1.953993320465088, + "step": 230 + }, + { + "epoch": 0.2856263465681748, + "grad_norm": 2.46875, + "learning_rate": 9.739956279786e-09, + "loss": 1.149980068206787, + "step": 232 + }, + { + "epoch": 0.2880886426592798, + "grad_norm": 4.21875, + "learning_rate": 9.734274109505729e-09, + "loss": 1.7589616775512695, + "step": 234 + }, + { + "epoch": 0.29055093875038474, + "grad_norm": 5.0625, + "learning_rate": 9.72853266007526e-09, + "loss": 1.9171326160430908, + "step": 236 + }, + { + "epoch": 0.29301323484148967, + "grad_norm": 11.4375, + "learning_rate": 9.722732022636333e-09, + "loss": 1.6742775440216064, + "step": 238 + }, + { + "epoch": 0.29547553093259465, + "grad_norm": 4.78125, + "learning_rate": 9.716872289270262e-09, + "loss": 1.7873895168304443, + "step": 240 + }, + { + "epoch": 0.2979378270236996, + "grad_norm": 4.40625, + "learning_rate": 9.710953552996464e-09, + "loss": 1.9001209735870361, + "step": 242 + }, + { + "epoch": 0.3004001231148046, + "grad_norm": 4.78125, + "learning_rate": 9.704975907770995e-09, + "loss": 1.869600534439087, + "step": 244 + }, + { + "epoch": 0.3028624192059095, + "grad_norm": 3.46875, + "learning_rate": 9.69893944848505e-09, + "loss": 1.5148907899856567, + "step": 246 + }, + { + "epoch": 0.3053247152970145, + "grad_norm": 14.6875, + "learning_rate": 9.69284427096345e-09, + "loss": 1.914973497390747, + "step": 248 + }, + { + "epoch": 0.3077870113881194, + "grad_norm": 13.125, + "learning_rate": 9.686690471963147e-09, + "loss": 2.230684757232666, + "step": 250 + }, + { + "epoch": 0.31024930747922436, + "grad_norm": 7.34375, + "learning_rate": 9.680478149171657e-09, + "loss": 2.0974578857421875, + "step": 252 + }, + { + "epoch": 0.31271160357032934, + "grad_norm": 13.5625, + "learning_rate": 9.674207401205524e-09, + "loss": 2.2117700576782227, + "step": 254 + }, + { + "epoch": 0.3151738996614343, + "grad_norm": 5.25, + "learning_rate": 9.667878327608756e-09, + "loss": 1.8505613803863525, + "step": 256 + }, + { + "epoch": 0.31763619575253926, + "grad_norm": 14.25, + "learning_rate": 9.661491028851246e-09, + "loss": 1.7967166900634766, + "step": 258 + }, + { + "epoch": 0.3200984918436442, + "grad_norm": 4.0625, + "learning_rate": 9.655045606327165e-09, + "loss": 1.869051456451416, + "step": 260 + }, + { + "epoch": 0.3225607879347491, + "grad_norm": 9.0625, + "learning_rate": 9.648542162353366e-09, + "loss": 1.876924753189087, + "step": 262 + }, + { + "epoch": 0.3250230840258541, + "grad_norm": 5.21875, + "learning_rate": 9.64198080016775e-09, + "loss": 2.0315141677856445, + "step": 264 + }, + { + "epoch": 0.32748538011695905, + "grad_norm": 8.5625, + "learning_rate": 9.635361623927643e-09, + "loss": 2.1542179584503174, + "step": 266 + }, + { + "epoch": 0.32994767620806403, + "grad_norm": 3.140625, + "learning_rate": 9.62868473870811e-09, + "loss": 1.1597316265106201, + "step": 268 + }, + { + "epoch": 0.33240997229916897, + "grad_norm": 10.5, + "learning_rate": 9.621950250500333e-09, + "loss": 2.637326717376709, + "step": 270 + }, + { + "epoch": 0.33487226839027395, + "grad_norm": 2.859375, + "learning_rate": 9.615158266209887e-09, + "loss": 1.283077597618103, + "step": 272 + }, + { + "epoch": 0.3373345644813789, + "grad_norm": 7.125, + "learning_rate": 9.608308893655061e-09, + "loss": 2.046065092086792, + "step": 274 + }, + { + "epoch": 0.3397968605724838, + "grad_norm": 2.953125, + "learning_rate": 9.601402241565154e-09, + "loss": 1.1603574752807617, + "step": 276 + }, + { + "epoch": 0.3422591566635888, + "grad_norm": 5.34375, + "learning_rate": 9.59443841957873e-09, + "loss": 1.7637038230895996, + "step": 278 + }, + { + "epoch": 0.34472145275469374, + "grad_norm": 5.21875, + "learning_rate": 9.587417538241892e-09, + "loss": 1.938485860824585, + "step": 280 + }, + { + "epoch": 0.3471837488457987, + "grad_norm": 29.0, + "learning_rate": 9.580339709006524e-09, + "loss": 2.3233187198638916, + "step": 282 + }, + { + "epoch": 0.34964604493690365, + "grad_norm": 6.0, + "learning_rate": 9.573205044228518e-09, + "loss": 1.4073760509490967, + "step": 284 + }, + { + "epoch": 0.35210834102800864, + "grad_norm": 6.375, + "learning_rate": 9.566013657165994e-09, + "loss": 1.3963334560394287, + "step": 286 + }, + { + "epoch": 0.3545706371191136, + "grad_norm": 6.8125, + "learning_rate": 9.558765661977503e-09, + "loss": 1.9514954090118408, + "step": 288 + }, + { + "epoch": 0.3570329332102185, + "grad_norm": 5.75, + "learning_rate": 9.551461173720208e-09, + "loss": 2.0840539932250977, + "step": 290 + }, + { + "epoch": 0.3594952293013235, + "grad_norm": 11.9375, + "learning_rate": 9.544100308348067e-09, + "loss": 2.2709197998046875, + "step": 292 + }, + { + "epoch": 0.3619575253924284, + "grad_norm": 12.3125, + "learning_rate": 9.536683182709986e-09, + "loss": 2.443535327911377, + "step": 294 + }, + { + "epoch": 0.3644198214835334, + "grad_norm": 18.875, + "learning_rate": 9.529209914547962e-09, + "loss": 2.240347385406494, + "step": 296 + }, + { + "epoch": 0.36688211757463834, + "grad_norm": 12.375, + "learning_rate": 9.521680622495228e-09, + "loss": 2.1307570934295654, + "step": 298 + }, + { + "epoch": 0.36934441366574333, + "grad_norm": 11.8125, + "learning_rate": 9.514095426074347e-09, + "loss": 2.510369062423706, + "step": 300 + }, + { + "epoch": 0.37180670975684826, + "grad_norm": 5.03125, + "learning_rate": 9.506454445695337e-09, + "loss": 1.9031611680984497, + "step": 302 + }, + { + "epoch": 0.3742690058479532, + "grad_norm": 2.484375, + "learning_rate": 9.498757802653741e-09, + "loss": 1.2329223155975342, + "step": 304 + }, + { + "epoch": 0.3767313019390582, + "grad_norm": 5.28125, + "learning_rate": 9.491005619128721e-09, + "loss": 1.8155068159103394, + "step": 306 + }, + { + "epoch": 0.3791935980301631, + "grad_norm": 7.625, + "learning_rate": 9.483198018181099e-09, + "loss": 1.736093282699585, + "step": 308 + }, + { + "epoch": 0.3816558941212681, + "grad_norm": 13.5625, + "learning_rate": 9.475335123751412e-09, + "loss": 1.9234977960586548, + "step": 310 + }, + { + "epoch": 0.38411819021237303, + "grad_norm": 8.5, + "learning_rate": 9.467417060657952e-09, + "loss": 1.9270076751708984, + "step": 312 + }, + { + "epoch": 0.38658048630347797, + "grad_norm": 4.0625, + "learning_rate": 9.459443954594769e-09, + "loss": 1.350337028503418, + "step": 314 + }, + { + "epoch": 0.38904278239458295, + "grad_norm": 2.609375, + "learning_rate": 9.451415932129692e-09, + "loss": 1.1429853439331055, + "step": 316 + }, + { + "epoch": 0.3915050784856879, + "grad_norm": 4.90625, + "learning_rate": 9.443333120702307e-09, + "loss": 1.8531888723373413, + "step": 318 + }, + { + "epoch": 0.3939673745767929, + "grad_norm": 3.0625, + "learning_rate": 9.435195648621935e-09, + "loss": 1.3913381099700928, + "step": 320 + }, + { + "epoch": 0.3964296706678978, + "grad_norm": 5.15625, + "learning_rate": 9.42700364506561e-09, + "loss": 1.8761987686157227, + "step": 322 + }, + { + "epoch": 0.3988919667590028, + "grad_norm": 4.4375, + "learning_rate": 9.418757240076008e-09, + "loss": 1.9191958904266357, + "step": 324 + }, + { + "epoch": 0.4013542628501077, + "grad_norm": 2.75, + "learning_rate": 9.410456564559393e-09, + "loss": 1.175315260887146, + "step": 326 + }, + { + "epoch": 0.40381655894121266, + "grad_norm": 12.375, + "learning_rate": 9.402101750283545e-09, + "loss": 2.3216049671173096, + "step": 328 + }, + { + "epoch": 0.40627885503231764, + "grad_norm": 2.265625, + "learning_rate": 9.39369292987565e-09, + "loss": 1.1453694105148315, + "step": 330 + }, + { + "epoch": 0.4087411511234226, + "grad_norm": 5.71875, + "learning_rate": 9.38523023682022e-09, + "loss": 1.9262512922286987, + "step": 332 + }, + { + "epoch": 0.41120344721452756, + "grad_norm": 10.375, + "learning_rate": 9.376713805456945e-09, + "loss": 2.126582622528076, + "step": 334 + }, + { + "epoch": 0.4136657433056325, + "grad_norm": 2.609375, + "learning_rate": 9.368143770978586e-09, + "loss": 1.1786751747131348, + "step": 336 + }, + { + "epoch": 0.4161280393967375, + "grad_norm": 8.125, + "learning_rate": 9.359520269428812e-09, + "loss": 2.126143217086792, + "step": 338 + }, + { + "epoch": 0.4185903354878424, + "grad_norm": 2.390625, + "learning_rate": 9.350843437700052e-09, + "loss": 1.245577335357666, + "step": 340 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 29.625, + "learning_rate": 9.342113413531315e-09, + "loss": 2.009819507598877, + "step": 342 + }, + { + "epoch": 0.42351492767005233, + "grad_norm": 3.875, + "learning_rate": 9.333330335506001e-09, + "loss": 1.1387863159179688, + "step": 344 + }, + { + "epoch": 0.42597722376115726, + "grad_norm": 12.5625, + "learning_rate": 9.324494343049707e-09, + "loss": 2.2192680835723877, + "step": 346 + }, + { + "epoch": 0.42843951985226225, + "grad_norm": 11.0, + "learning_rate": 9.315605576428018e-09, + "loss": 1.939860463142395, + "step": 348 + }, + { + "epoch": 0.4309018159433672, + "grad_norm": 13.8125, + "learning_rate": 9.306664176744266e-09, + "loss": 2.318619728088379, + "step": 350 + }, + { + "epoch": 0.43336411203447217, + "grad_norm": 3.15625, + "learning_rate": 9.297670285937303e-09, + "loss": 1.0619254112243652, + "step": 352 + }, + { + "epoch": 0.4358264081255771, + "grad_norm": 5.4375, + "learning_rate": 9.288624046779241e-09, + "loss": 1.834202766418457, + "step": 354 + }, + { + "epoch": 0.43828870421668203, + "grad_norm": 9.8125, + "learning_rate": 9.279525602873189e-09, + "loss": 1.9926815032958984, + "step": 356 + }, + { + "epoch": 0.440751000307787, + "grad_norm": 4.8125, + "learning_rate": 9.27037509865097e-09, + "loss": 1.9792507886886597, + "step": 358 + }, + { + "epoch": 0.44321329639889195, + "grad_norm": 9.125, + "learning_rate": 9.26117267937083e-09, + "loss": 1.5881253480911255, + "step": 360 + }, + { + "epoch": 0.44567559248999694, + "grad_norm": 15.5, + "learning_rate": 9.251918491115142e-09, + "loss": 2.488168239593506, + "step": 362 + }, + { + "epoch": 0.4481378885811019, + "grad_norm": 4.8125, + "learning_rate": 9.242612680788061e-09, + "loss": 1.9684348106384277, + "step": 364 + }, + { + "epoch": 0.45060018467220686, + "grad_norm": 9.375, + "learning_rate": 9.233255396113223e-09, + "loss": 2.305130958557129, + "step": 366 + }, + { + "epoch": 0.4530624807633118, + "grad_norm": 16.75, + "learning_rate": 9.223846785631378e-09, + "loss": 2.335341215133667, + "step": 368 + }, + { + "epoch": 0.4555247768544167, + "grad_norm": 5.25, + "learning_rate": 9.214386998698039e-09, + "loss": 1.7638440132141113, + "step": 370 + }, + { + "epoch": 0.4579870729455217, + "grad_norm": 5.53125, + "learning_rate": 9.20487618548112e-09, + "loss": 1.7996431589126587, + "step": 372 + }, + { + "epoch": 0.46044936903662664, + "grad_norm": 4.6875, + "learning_rate": 9.195314496958531e-09, + "loss": 1.7842280864715576, + "step": 374 + }, + { + "epoch": 0.46291166512773163, + "grad_norm": 38.0, + "learning_rate": 9.185702084915805e-09, + "loss": 2.152765989303589, + "step": 376 + }, + { + "epoch": 0.46537396121883656, + "grad_norm": 5.125, + "learning_rate": 9.176039101943672e-09, + "loss": 1.7519220113754272, + "step": 378 + }, + { + "epoch": 0.4678362573099415, + "grad_norm": 45.0, + "learning_rate": 9.166325701435644e-09, + "loss": 2.9101526737213135, + "step": 380 + }, + { + "epoch": 0.4702985534010465, + "grad_norm": 12.9375, + "learning_rate": 9.156562037585576e-09, + "loss": 2.2048463821411133, + "step": 382 + }, + { + "epoch": 0.4727608494921514, + "grad_norm": 5.4375, + "learning_rate": 9.146748265385223e-09, + "loss": 1.8226771354675293, + "step": 384 + }, + { + "epoch": 0.4752231455832564, + "grad_norm": 16.0, + "learning_rate": 9.13688454062178e-09, + "loss": 2.297773838043213, + "step": 386 + }, + { + "epoch": 0.47768544167436133, + "grad_norm": 16.375, + "learning_rate": 9.126971019875397e-09, + "loss": 2.2794573307037354, + "step": 388 + }, + { + "epoch": 0.4801477377654663, + "grad_norm": 8.1875, + "learning_rate": 9.117007860516713e-09, + "loss": 1.2689777612686157, + "step": 390 + }, + { + "epoch": 0.48261003385657125, + "grad_norm": 10.4375, + "learning_rate": 9.106995220704344e-09, + "loss": 2.273574113845825, + "step": 392 + }, + { + "epoch": 0.4850723299476762, + "grad_norm": 4.3125, + "learning_rate": 9.09693325938237e-09, + "loss": 1.7581639289855957, + "step": 394 + }, + { + "epoch": 0.48753462603878117, + "grad_norm": 4.25, + "learning_rate": 9.08682213627782e-09, + "loss": 1.8824234008789062, + "step": 396 + }, + { + "epoch": 0.4899969221298861, + "grad_norm": 40.0, + "learning_rate": 9.076662011898145e-09, + "loss": 2.692976951599121, + "step": 398 + }, + { + "epoch": 0.4924592182209911, + "grad_norm": 5.0625, + "learning_rate": 9.066453047528642e-09, + "loss": 1.951959490776062, + "step": 400 + }, + { + "epoch": 0.494921514312096, + "grad_norm": 19.125, + "learning_rate": 9.056195405229922e-09, + "loss": 2.419041156768799, + "step": 402 + }, + { + "epoch": 0.497383810403201, + "grad_norm": 4.3125, + "learning_rate": 9.045889247835322e-09, + "loss": 1.7131880521774292, + "step": 404 + }, + { + "epoch": 0.49984610649430594, + "grad_norm": 2.875, + "learning_rate": 9.035534738948328e-09, + "loss": 1.2638614177703857, + "step": 406 + }, + { + "epoch": 0.5023084025854109, + "grad_norm": 6.90625, + "learning_rate": 9.02513204293997e-09, + "loss": 1.8727983236312866, + "step": 408 + }, + { + "epoch": 0.5047706986765158, + "grad_norm": 2.203125, + "learning_rate": 9.014681324946216e-09, + "loss": 1.1091878414154053, + "step": 410 + }, + { + "epoch": 0.5072329947676208, + "grad_norm": 5.5625, + "learning_rate": 9.004182750865357e-09, + "loss": 2.032684326171875, + "step": 412 + }, + { + "epoch": 0.5096952908587258, + "grad_norm": 3.25, + "learning_rate": 8.993636487355366e-09, + "loss": 1.4393967390060425, + "step": 414 + }, + { + "epoch": 0.5121575869498307, + "grad_norm": 15.4375, + "learning_rate": 8.98304270183125e-09, + "loss": 2.364288806915283, + "step": 416 + }, + { + "epoch": 0.5146198830409356, + "grad_norm": 6.84375, + "learning_rate": 8.9724015624624e-09, + "loss": 1.4677906036376953, + "step": 418 + }, + { + "epoch": 0.5170821791320406, + "grad_norm": 4.53125, + "learning_rate": 8.961713238169922e-09, + "loss": 1.9610824584960938, + "step": 420 + }, + { + "epoch": 0.5195444752231456, + "grad_norm": 6.1875, + "learning_rate": 8.950977898623947e-09, + "loss": 1.8107311725616455, + "step": 422 + }, + { + "epoch": 0.5220067713142506, + "grad_norm": 2.859375, + "learning_rate": 8.940195714240937e-09, + "loss": 1.2439892292022705, + "step": 424 + }, + { + "epoch": 0.5244690674053555, + "grad_norm": 9.75, + "learning_rate": 8.929366856181003e-09, + "loss": 1.985514521598816, + "step": 426 + }, + { + "epoch": 0.5269313634964604, + "grad_norm": 3.703125, + "learning_rate": 8.918491496345149e-09, + "loss": 1.8395881652832031, + "step": 428 + }, + { + "epoch": 0.5293936595875655, + "grad_norm": 3.421875, + "learning_rate": 8.907569807372576e-09, + "loss": 1.2282559871673584, + "step": 430 + }, + { + "epoch": 0.5318559556786704, + "grad_norm": 4.75, + "learning_rate": 8.896601962637927e-09, + "loss": 1.9522662162780762, + "step": 432 + }, + { + "epoch": 0.5343182517697753, + "grad_norm": 6.4375, + "learning_rate": 8.885588136248539e-09, + "loss": 1.831364631652832, + "step": 434 + }, + { + "epoch": 0.5367805478608803, + "grad_norm": 3.21875, + "learning_rate": 8.874528503041674e-09, + "loss": 1.3392367362976074, + "step": 436 + }, + { + "epoch": 0.5392428439519852, + "grad_norm": 2.03125, + "learning_rate": 8.86342323858175e-09, + "loss": 1.154931664466858, + "step": 438 + }, + { + "epoch": 0.5417051400430902, + "grad_norm": 2.84375, + "learning_rate": 8.852272519157554e-09, + "loss": 1.1106712818145752, + "step": 440 + }, + { + "epoch": 0.5441674361341952, + "grad_norm": 12.6875, + "learning_rate": 8.841076521779431e-09, + "loss": 2.266367197036743, + "step": 442 + }, + { + "epoch": 0.5466297322253001, + "grad_norm": 6.78125, + "learning_rate": 8.829835424176495e-09, + "loss": 1.9257324934005737, + "step": 444 + }, + { + "epoch": 0.549092028316405, + "grad_norm": 7.6875, + "learning_rate": 8.81854940479379e-09, + "loss": 1.2584561109542847, + "step": 446 + }, + { + "epoch": 0.55155432440751, + "grad_norm": 8.3125, + "learning_rate": 8.807218642789463e-09, + "loss": 2.150424003601074, + "step": 448 + }, + { + "epoch": 0.554016620498615, + "grad_norm": 3.6875, + "learning_rate": 8.795843318031926e-09, + "loss": 1.100125789642334, + "step": 450 + }, + { + "epoch": 0.5564789165897199, + "grad_norm": 4.71875, + "learning_rate": 8.78442361109699e-09, + "loss": 1.8502240180969238, + "step": 452 + }, + { + "epoch": 0.5589412126808249, + "grad_norm": 4.625, + "learning_rate": 8.772959703265008e-09, + "loss": 1.7188208103179932, + "step": 454 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 2.25, + "learning_rate": 8.76145177651799e-09, + "loss": 1.1569561958312988, + "step": 456 + }, + { + "epoch": 0.5638658048630347, + "grad_norm": 13.5, + "learning_rate": 8.74990001353672e-09, + "loss": 2.2237837314605713, + "step": 458 + }, + { + "epoch": 0.5663281009541398, + "grad_norm": 2.625, + "learning_rate": 8.738304597697855e-09, + "loss": 1.2278821468353271, + "step": 460 + }, + { + "epoch": 0.5687903970452447, + "grad_norm": 2.984375, + "learning_rate": 8.726665713071004e-09, + "loss": 1.4073512554168701, + "step": 462 + }, + { + "epoch": 0.5712526931363496, + "grad_norm": 12.375, + "learning_rate": 8.714983544415824e-09, + "loss": 2.3128976821899414, + "step": 464 + }, + { + "epoch": 0.5737149892274546, + "grad_norm": 13.3125, + "learning_rate": 8.703258277179076e-09, + "loss": 2.249760627746582, + "step": 466 + }, + { + "epoch": 0.5761772853185596, + "grad_norm": 5.75, + "learning_rate": 8.691490097491676e-09, + "loss": 1.949746012687683, + "step": 468 + }, + { + "epoch": 0.5786395814096645, + "grad_norm": 8.0625, + "learning_rate": 8.679679192165755e-09, + "loss": 2.0255026817321777, + "step": 470 + }, + { + "epoch": 0.5811018775007695, + "grad_norm": 2.953125, + "learning_rate": 8.667825748691678e-09, + "loss": 1.172034502029419, + "step": 472 + }, + { + "epoch": 0.5835641735918744, + "grad_norm": 13.4375, + "learning_rate": 8.655929955235084e-09, + "loss": 1.7464905977249146, + "step": 474 + }, + { + "epoch": 0.5860264696829793, + "grad_norm": 4.875, + "learning_rate": 8.643992000633882e-09, + "loss": 1.7516231536865234, + "step": 476 + }, + { + "epoch": 0.5884887657740844, + "grad_norm": 13.6875, + "learning_rate": 8.632012074395267e-09, + "loss": 1.9086973667144775, + "step": 478 + }, + { + "epoch": 0.5909510618651893, + "grad_norm": 20.375, + "learning_rate": 8.619990366692703e-09, + "loss": 1.120478630065918, + "step": 480 + }, + { + "epoch": 0.5934133579562942, + "grad_norm": 5.40625, + "learning_rate": 8.607927068362909e-09, + "loss": 1.8365321159362793, + "step": 482 + }, + { + "epoch": 0.5958756540473992, + "grad_norm": 4.21875, + "learning_rate": 8.595822370902824e-09, + "loss": 1.8781213760375977, + "step": 484 + }, + { + "epoch": 0.5983379501385041, + "grad_norm": 5.09375, + "learning_rate": 8.583676466466578e-09, + "loss": 1.8990083932876587, + "step": 486 + }, + { + "epoch": 0.6008002462296091, + "grad_norm": 9.25, + "learning_rate": 8.571489547862432e-09, + "loss": 2.005687713623047, + "step": 488 + }, + { + "epoch": 0.6032625423207141, + "grad_norm": 11.75, + "learning_rate": 8.559261808549717e-09, + "loss": 2.288544178009033, + "step": 490 + }, + { + "epoch": 0.605724838411819, + "grad_norm": 12.0625, + "learning_rate": 8.546993442635767e-09, + "loss": 1.9239308834075928, + "step": 492 + }, + { + "epoch": 0.6081871345029239, + "grad_norm": 3.203125, + "learning_rate": 8.534684644872836e-09, + "loss": 1.2520358562469482, + "step": 494 + }, + { + "epoch": 0.610649430594029, + "grad_norm": 7.65625, + "learning_rate": 8.522335610655014e-09, + "loss": 2.1090569496154785, + "step": 496 + }, + { + "epoch": 0.6131117266851339, + "grad_norm": 10.3125, + "learning_rate": 8.509946536015109e-09, + "loss": 2.2030882835388184, + "step": 498 + }, + { + "epoch": 0.6155740227762388, + "grad_norm": 18.75, + "learning_rate": 8.497517617621549e-09, + "loss": 2.205538034439087, + "step": 500 + }, + { + "epoch": 0.6180363188673438, + "grad_norm": 3.484375, + "learning_rate": 8.485049052775255e-09, + "loss": 1.5225834846496582, + "step": 502 + }, + { + "epoch": 0.6204986149584487, + "grad_norm": 4.8125, + "learning_rate": 8.472541039406509e-09, + "loss": 1.8662419319152832, + "step": 504 + }, + { + "epoch": 0.6229609110495538, + "grad_norm": 3.3125, + "learning_rate": 8.459993776071815e-09, + "loss": 1.5459778308868408, + "step": 506 + }, + { + "epoch": 0.6254232071406587, + "grad_norm": 2.359375, + "learning_rate": 8.44740746195074e-09, + "loss": 1.2113550901412964, + "step": 508 + }, + { + "epoch": 0.6278855032317636, + "grad_norm": 3.078125, + "learning_rate": 8.434782296842755e-09, + "loss": 1.2501018047332764, + "step": 510 + }, + { + "epoch": 0.6303477993228686, + "grad_norm": 5.46875, + "learning_rate": 8.422118481164076e-09, + "loss": 1.3121228218078613, + "step": 512 + }, + { + "epoch": 0.6328100954139735, + "grad_norm": 8.875, + "learning_rate": 8.409416215944459e-09, + "loss": 2.0257339477539062, + "step": 514 + }, + { + "epoch": 0.6352723915050785, + "grad_norm": 2.828125, + "learning_rate": 8.396675702824026e-09, + "loss": 1.249032974243164, + "step": 516 + }, + { + "epoch": 0.6377346875961835, + "grad_norm": 2.40625, + "learning_rate": 8.38389714405006e-09, + "loss": 1.089784026145935, + "step": 518 + }, + { + "epoch": 0.6401969836872884, + "grad_norm": 3.078125, + "learning_rate": 8.371080742473797e-09, + "loss": 1.107433795928955, + "step": 520 + }, + { + "epoch": 0.6426592797783933, + "grad_norm": 24.25, + "learning_rate": 8.358226701547196e-09, + "loss": 2.397225856781006, + "step": 522 + }, + { + "epoch": 0.6451215758694983, + "grad_norm": 30.625, + "learning_rate": 8.345335225319716e-09, + "loss": 2.917544364929199, + "step": 524 + }, + { + "epoch": 0.6475838719606033, + "grad_norm": 5.3125, + "learning_rate": 8.332406518435087e-09, + "loss": 1.9733543395996094, + "step": 526 + }, + { + "epoch": 0.6500461680517082, + "grad_norm": 11.5625, + "learning_rate": 8.319440786128039e-09, + "loss": 2.30487060546875, + "step": 528 + }, + { + "epoch": 0.6525084641428132, + "grad_norm": 14.125, + "learning_rate": 8.306438234221058e-09, + "loss": 2.489694118499756, + "step": 530 + }, + { + "epoch": 0.6549707602339181, + "grad_norm": 4.90625, + "learning_rate": 8.293399069121128e-09, + "loss": 1.7912418842315674, + "step": 532 + }, + { + "epoch": 0.6574330563250231, + "grad_norm": 9.4375, + "learning_rate": 8.280323497816431e-09, + "loss": 1.935392141342163, + "step": 534 + }, + { + "epoch": 0.6598953524161281, + "grad_norm": 6.09375, + "learning_rate": 8.267211727873078e-09, + "loss": 1.9411722421646118, + "step": 536 + }, + { + "epoch": 0.662357648507233, + "grad_norm": 3.765625, + "learning_rate": 8.254063967431816e-09, + "loss": 1.7723370790481567, + "step": 538 + }, + { + "epoch": 0.6648199445983379, + "grad_norm": 10.75, + "learning_rate": 8.240880425204702e-09, + "loss": 2.3154473304748535, + "step": 540 + }, + { + "epoch": 0.6672822406894429, + "grad_norm": 6.25, + "learning_rate": 8.22766131047182e-09, + "loss": 1.941293716430664, + "step": 542 + }, + { + "epoch": 0.6697445367805479, + "grad_norm": 5.15625, + "learning_rate": 8.21440683307794e-09, + "loss": 1.8273173570632935, + "step": 544 + }, + { + "epoch": 0.6722068328716528, + "grad_norm": 5.75, + "learning_rate": 8.201117203429187e-09, + "loss": 1.917323112487793, + "step": 546 + }, + { + "epoch": 0.6746691289627578, + "grad_norm": 4.53125, + "learning_rate": 8.18779263248971e-09, + "loss": 1.5516306161880493, + "step": 548 + }, + { + "epoch": 0.6771314250538627, + "grad_norm": 5.71875, + "learning_rate": 8.174433331778322e-09, + "loss": 2.0121002197265625, + "step": 550 + }, + { + "epoch": 0.6795937211449676, + "grad_norm": 4.34375, + "learning_rate": 8.161039513365158e-09, + "loss": 1.2636222839355469, + "step": 552 + }, + { + "epoch": 0.6820560172360727, + "grad_norm": 3.8125, + "learning_rate": 8.147611389868293e-09, + "loss": 1.3448388576507568, + "step": 554 + }, + { + "epoch": 0.6845183133271776, + "grad_norm": 8.0625, + "learning_rate": 8.13414917445037e-09, + "loss": 2.0951576232910156, + "step": 556 + }, + { + "epoch": 0.6869806094182825, + "grad_norm": 10.875, + "learning_rate": 8.120653080815219e-09, + "loss": 2.3154006004333496, + "step": 558 + }, + { + "epoch": 0.6894429055093875, + "grad_norm": 2.96875, + "learning_rate": 8.107123323204473e-09, + "loss": 1.1850239038467407, + "step": 560 + }, + { + "epoch": 0.6919052016004925, + "grad_norm": 4.4375, + "learning_rate": 8.093560116394149e-09, + "loss": 1.9023423194885254, + "step": 562 + }, + { + "epoch": 0.6943674976915974, + "grad_norm": 5.5, + "learning_rate": 8.079963675691255e-09, + "loss": 1.9364053010940552, + "step": 564 + }, + { + "epoch": 0.6968297937827024, + "grad_norm": 8.1875, + "learning_rate": 8.06633421693036e-09, + "loss": 1.8559212684631348, + "step": 566 + }, + { + "epoch": 0.6992920898738073, + "grad_norm": 12.0, + "learning_rate": 8.052671956470177e-09, + "loss": 1.9172155857086182, + "step": 568 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 5.1875, + "learning_rate": 8.038977111190119e-09, + "loss": 1.7878023386001587, + "step": 570 + }, + { + "epoch": 0.7042166820560173, + "grad_norm": 5.28125, + "learning_rate": 8.025249898486866e-09, + "loss": 1.9518636465072632, + "step": 572 + }, + { + "epoch": 0.7066789781471222, + "grad_norm": 4.875, + "learning_rate": 8.011490536270911e-09, + "loss": 1.7933154106140137, + "step": 574 + }, + { + "epoch": 0.7091412742382271, + "grad_norm": 4.75, + "learning_rate": 7.997699242963094e-09, + "loss": 1.7392499446868896, + "step": 576 + }, + { + "epoch": 0.7116035703293321, + "grad_norm": 3.734375, + "learning_rate": 7.983876237491148e-09, + "loss": 1.403039813041687, + "step": 578 + }, + { + "epoch": 0.714065866420437, + "grad_norm": 2.921875, + "learning_rate": 7.970021739286207e-09, + "loss": 1.1680914163589478, + "step": 580 + }, + { + "epoch": 0.716528162511542, + "grad_norm": 2.234375, + "learning_rate": 7.956135968279335e-09, + "loss": 1.1165484189987183, + "step": 582 + }, + { + "epoch": 0.718990458602647, + "grad_norm": 10.9375, + "learning_rate": 7.942219144898033e-09, + "loss": 2.342836856842041, + "step": 584 + }, + { + "epoch": 0.7214527546937519, + "grad_norm": 3.953125, + "learning_rate": 7.928271490062737e-09, + "loss": 1.8495182991027832, + "step": 586 + }, + { + "epoch": 0.7239150507848569, + "grad_norm": 5.875, + "learning_rate": 7.914293225183313e-09, + "loss": 1.9028046131134033, + "step": 588 + }, + { + "epoch": 0.7263773468759618, + "grad_norm": 10.0625, + "learning_rate": 7.900284572155538e-09, + "loss": 1.9208589792251587, + "step": 590 + }, + { + "epoch": 0.7288396429670668, + "grad_norm": 4.59375, + "learning_rate": 7.886245753357586e-09, + "loss": 1.8670642375946045, + "step": 592 + }, + { + "epoch": 0.7313019390581718, + "grad_norm": 65.0, + "learning_rate": 7.872176991646488e-09, + "loss": 1.555503487586975, + "step": 594 + }, + { + "epoch": 0.7337642351492767, + "grad_norm": 5.46875, + "learning_rate": 7.858078510354597e-09, + "loss": 1.9539310932159424, + "step": 596 + }, + { + "epoch": 0.7362265312403816, + "grad_norm": 2.703125, + "learning_rate": 7.843950533286057e-09, + "loss": 1.2128690481185913, + "step": 598 + }, + { + "epoch": 0.7386888273314867, + "grad_norm": 4.46875, + "learning_rate": 7.829793284713224e-09, + "loss": 1.873086929321289, + "step": 600 + }, + { + "epoch": 0.7411511234225916, + "grad_norm": 2.578125, + "learning_rate": 7.81560698937313e-09, + "loss": 1.1673393249511719, + "step": 602 + }, + { + "epoch": 0.7436134195136965, + "grad_norm": 12.8125, + "learning_rate": 7.801391872463896e-09, + "loss": 2.315310001373291, + "step": 604 + }, + { + "epoch": 0.7460757156048015, + "grad_norm": 11.3125, + "learning_rate": 7.787148159641176e-09, + "loss": 2.4388017654418945, + "step": 606 + }, + { + "epoch": 0.7485380116959064, + "grad_norm": 8.75, + "learning_rate": 7.77287607701456e-09, + "loss": 2.1161627769470215, + "step": 608 + }, + { + "epoch": 0.7510003077870114, + "grad_norm": 3.921875, + "learning_rate": 7.758575851143987e-09, + "loss": 1.1796162128448486, + "step": 610 + }, + { + "epoch": 0.7534626038781164, + "grad_norm": 4.90625, + "learning_rate": 7.744247709036165e-09, + "loss": 1.3470849990844727, + "step": 612 + }, + { + "epoch": 0.7559248999692213, + "grad_norm": 12.3125, + "learning_rate": 7.729891878140936e-09, + "loss": 2.33459734916687, + "step": 614 + }, + { + "epoch": 0.7583871960603262, + "grad_norm": 6.96875, + "learning_rate": 7.715508586347695e-09, + "loss": 1.9637078046798706, + "step": 616 + }, + { + "epoch": 0.7608494921514312, + "grad_norm": 4.34375, + "learning_rate": 7.701098061981757e-09, + "loss": 1.9413955211639404, + "step": 618 + }, + { + "epoch": 0.7633117882425362, + "grad_norm": 9.6875, + "learning_rate": 7.686660533800736e-09, + "loss": 1.9719551801681519, + "step": 620 + }, + { + "epoch": 0.7657740843336411, + "grad_norm": 3.71875, + "learning_rate": 7.672196230990918e-09, + "loss": 1.3401029109954834, + "step": 622 + }, + { + "epoch": 0.7682363804247461, + "grad_norm": 6.59375, + "learning_rate": 7.65770538316361e-09, + "loss": 1.7963333129882812, + "step": 624 + }, + { + "epoch": 0.770698676515851, + "grad_norm": 7.96875, + "learning_rate": 7.643188220351516e-09, + "loss": 2.0712432861328125, + "step": 626 + }, + { + "epoch": 0.7731609726069559, + "grad_norm": 11.125, + "learning_rate": 7.628644973005061e-09, + "loss": 2.3805270195007324, + "step": 628 + }, + { + "epoch": 0.775623268698061, + "grad_norm": 7.34375, + "learning_rate": 7.61407587198875e-09, + "loss": 1.2845838069915771, + "step": 630 + }, + { + "epoch": 0.7780855647891659, + "grad_norm": 20.875, + "learning_rate": 7.5994811485775e-09, + "loss": 2.2516846656799316, + "step": 632 + }, + { + "epoch": 0.7805478608802708, + "grad_norm": 5.0, + "learning_rate": 7.584861034452963e-09, + "loss": 1.964002251625061, + "step": 634 + }, + { + "epoch": 0.7830101569713758, + "grad_norm": 3.046875, + "learning_rate": 7.570215761699855e-09, + "loss": 1.3124688863754272, + "step": 636 + }, + { + "epoch": 0.7854724530624808, + "grad_norm": 11.6875, + "learning_rate": 7.55554556280227e-09, + "loss": 2.2107834815979004, + "step": 638 + }, + { + "epoch": 0.7879347491535857, + "grad_norm": 5.6875, + "learning_rate": 7.540850670639978e-09, + "loss": 1.9630699157714844, + "step": 640 + }, + { + "epoch": 0.7903970452446907, + "grad_norm": 5.75, + "learning_rate": 7.526131318484753e-09, + "loss": 1.9335198402404785, + "step": 642 + }, + { + "epoch": 0.7928593413357956, + "grad_norm": 3.765625, + "learning_rate": 7.511387739996644e-09, + "loss": 1.2916162014007568, + "step": 644 + }, + { + "epoch": 0.7953216374269005, + "grad_norm": 14.5625, + "learning_rate": 7.496620169220286e-09, + "loss": 2.1263046264648438, + "step": 646 + }, + { + "epoch": 0.7977839335180056, + "grad_norm": 5.78125, + "learning_rate": 7.481828840581164e-09, + "loss": 1.8862347602844238, + "step": 648 + }, + { + "epoch": 0.8002462296091105, + "grad_norm": 11.75, + "learning_rate": 7.46701398888192e-09, + "loss": 2.1435751914978027, + "step": 650 + }, + { + "epoch": 0.8027085257002154, + "grad_norm": 36.25, + "learning_rate": 7.45217584929859e-09, + "loss": 1.8985021114349365, + "step": 652 + }, + { + "epoch": 0.8051708217913204, + "grad_norm": 3.96875, + "learning_rate": 7.437314657376906e-09, + "loss": 1.255218267440796, + "step": 654 + }, + { + "epoch": 0.8076331178824253, + "grad_norm": 6.71875, + "learning_rate": 7.422430649028533e-09, + "loss": 1.8039145469665527, + "step": 656 + }, + { + "epoch": 0.8100954139735304, + "grad_norm": 2.828125, + "learning_rate": 7.407524060527333e-09, + "loss": 1.2014645338058472, + "step": 658 + }, + { + "epoch": 0.8125577100646353, + "grad_norm": 9.5625, + "learning_rate": 7.3925951285056146e-09, + "loss": 2.114205837249756, + "step": 660 + }, + { + "epoch": 0.8150200061557402, + "grad_norm": 18.0, + "learning_rate": 7.377644089950371e-09, + "loss": 2.3271141052246094, + "step": 662 + }, + { + "epoch": 0.8174823022468451, + "grad_norm": 4.59375, + "learning_rate": 7.362671182199527e-09, + "loss": 1.9512523412704468, + "step": 664 + }, + { + "epoch": 0.8199445983379502, + "grad_norm": 4.875, + "learning_rate": 7.347676642938163e-09, + "loss": 1.875675082206726, + "step": 666 + }, + { + "epoch": 0.8224068944290551, + "grad_norm": 7.28125, + "learning_rate": 7.332660710194749e-09, + "loss": 2.120806932449341, + "step": 668 + }, + { + "epoch": 0.8248691905201601, + "grad_norm": 12.1875, + "learning_rate": 7.3176236223373595e-09, + "loss": 2.482332229614258, + "step": 670 + }, + { + "epoch": 0.827331486611265, + "grad_norm": 5.34375, + "learning_rate": 7.302565618069894e-09, + "loss": 1.932433843612671, + "step": 672 + }, + { + "epoch": 0.8297937827023699, + "grad_norm": 2.296875, + "learning_rate": 7.287486936428282e-09, + "loss": 1.1869601011276245, + "step": 674 + }, + { + "epoch": 0.832256078793475, + "grad_norm": 2.40625, + "learning_rate": 7.272387816776704e-09, + "loss": 1.2416247129440308, + "step": 676 + }, + { + "epoch": 0.8347183748845799, + "grad_norm": 6.34375, + "learning_rate": 7.257268498803767e-09, + "loss": 1.4887652397155762, + "step": 678 + }, + { + "epoch": 0.8371806709756848, + "grad_norm": 5.34375, + "learning_rate": 7.2421292225187186e-09, + "loss": 1.833484411239624, + "step": 680 + }, + { + "epoch": 0.8396429670667898, + "grad_norm": 13.8125, + "learning_rate": 7.2269702282476335e-09, + "loss": 2.041853904724121, + "step": 682 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 14.625, + "learning_rate": 7.211791756629598e-09, + "loss": 2.366133689880371, + "step": 684 + }, + { + "epoch": 0.8445675592489997, + "grad_norm": 10.875, + "learning_rate": 7.196594048612881e-09, + "loss": 1.9250491857528687, + "step": 686 + }, + { + "epoch": 0.8470298553401047, + "grad_norm": 10.3125, + "learning_rate": 7.1813773454511215e-09, + "loss": 2.2896928787231445, + "step": 688 + }, + { + "epoch": 0.8494921514312096, + "grad_norm": 5.40625, + "learning_rate": 7.166141888699495e-09, + "loss": 1.9879870414733887, + "step": 690 + }, + { + "epoch": 0.8519544475223145, + "grad_norm": 11.625, + "learning_rate": 7.150887920210878e-09, + "loss": 2.2236876487731934, + "step": 692 + }, + { + "epoch": 0.8544167436134195, + "grad_norm": 10.0, + "learning_rate": 7.135615682132004e-09, + "loss": 1.4050698280334473, + "step": 694 + }, + { + "epoch": 0.8568790397045245, + "grad_norm": 22.25, + "learning_rate": 7.120325416899629e-09, + "loss": 2.2749319076538086, + "step": 696 + }, + { + "epoch": 0.8593413357956294, + "grad_norm": 15.75, + "learning_rate": 7.105017367236675e-09, + "loss": 2.3958988189697266, + "step": 698 + }, + { + "epoch": 0.8618036318867344, + "grad_norm": 11.0, + "learning_rate": 7.089691776148384e-09, + "loss": 2.313142776489258, + "step": 700 + }, + { + "epoch": 0.8642659279778393, + "grad_norm": 11.625, + "learning_rate": 7.0743488869184535e-09, + "loss": 2.3592798709869385, + "step": 702 + }, + { + "epoch": 0.8667282240689443, + "grad_norm": 8.5625, + "learning_rate": 7.058988943105175e-09, + "loss": 2.11894154548645, + "step": 704 + }, + { + "epoch": 0.8691905201600493, + "grad_norm": 2.34375, + "learning_rate": 7.04361218853758e-09, + "loss": 1.3712561130523682, + "step": 706 + }, + { + "epoch": 0.8716528162511542, + "grad_norm": 13.4375, + "learning_rate": 7.0282188673115514e-09, + "loss": 2.092770576477051, + "step": 708 + }, + { + "epoch": 0.8741151123422591, + "grad_norm": 15.0625, + "learning_rate": 7.012809223785957e-09, + "loss": 1.9357192516326904, + "step": 710 + }, + { + "epoch": 0.8765774084333641, + "grad_norm": 2.953125, + "learning_rate": 6.9973835025787715e-09, + "loss": 1.2680325508117676, + "step": 712 + }, + { + "epoch": 0.8790397045244691, + "grad_norm": 7.125, + "learning_rate": 6.981941948563198e-09, + "loss": 1.7719722986221313, + "step": 714 + }, + { + "epoch": 0.881502000615574, + "grad_norm": 5.0625, + "learning_rate": 6.966484806863764e-09, + "loss": 1.8633275032043457, + "step": 716 + }, + { + "epoch": 0.883964296706679, + "grad_norm": 3.296875, + "learning_rate": 6.9510123228524545e-09, + "loss": 1.4539438486099243, + "step": 718 + }, + { + "epoch": 0.8864265927977839, + "grad_norm": 13.25, + "learning_rate": 6.935524742144792e-09, + "loss": 2.2359728813171387, + "step": 720 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 6.78125, + "learning_rate": 6.920022310595953e-09, + "loss": 1.8414530754089355, + "step": 722 + }, + { + "epoch": 0.8913511849799939, + "grad_norm": 3.84375, + "learning_rate": 6.904505274296864e-09, + "loss": 1.2079766988754272, + "step": 724 + }, + { + "epoch": 0.8938134810710988, + "grad_norm": 8.625, + "learning_rate": 6.88897387957029e-09, + "loss": 1.9165315628051758, + "step": 726 + }, + { + "epoch": 0.8962757771622037, + "grad_norm": 3.34375, + "learning_rate": 6.87342837296693e-09, + "loss": 1.2759442329406738, + "step": 728 + }, + { + "epoch": 0.8987380732533087, + "grad_norm": 5.34375, + "learning_rate": 6.857869001261491e-09, + "loss": 1.2644639015197754, + "step": 730 + }, + { + "epoch": 0.9012003693444137, + "grad_norm": 12.75, + "learning_rate": 6.842296011448788e-09, + "loss": 2.2167718410491943, + "step": 732 + }, + { + "epoch": 0.9036626654355187, + "grad_norm": 7.1875, + "learning_rate": 6.826709650739812e-09, + "loss": 1.402853012084961, + "step": 734 + }, + { + "epoch": 0.9061249615266236, + "grad_norm": 9.25, + "learning_rate": 6.811110166557809e-09, + "loss": 2.0942487716674805, + "step": 736 + }, + { + "epoch": 0.9085872576177285, + "grad_norm": 4.40625, + "learning_rate": 6.795497806534348e-09, + "loss": 1.8234786987304688, + "step": 738 + }, + { + "epoch": 0.9110495537088334, + "grad_norm": 16.5, + "learning_rate": 6.779872818505397e-09, + "loss": 1.8784126043319702, + "step": 740 + }, + { + "epoch": 0.9135118497999385, + "grad_norm": 9.5, + "learning_rate": 6.7642354505073835e-09, + "loss": 2.2190794944763184, + "step": 742 + }, + { + "epoch": 0.9159741458910434, + "grad_norm": 4.8125, + "learning_rate": 6.748585950773263e-09, + "loss": 1.9413115978240967, + "step": 744 + }, + { + "epoch": 0.9184364419821484, + "grad_norm": 3.109375, + "learning_rate": 6.732924567728566e-09, + "loss": 1.3823771476745605, + "step": 746 + }, + { + "epoch": 0.9208987380732533, + "grad_norm": 5.03125, + "learning_rate": 6.7172515499874705e-09, + "loss": 1.9463045597076416, + "step": 748 + }, + { + "epoch": 0.9233610341643582, + "grad_norm": 6.71875, + "learning_rate": 6.701567146348843e-09, + "loss": 2.0039689540863037, + "step": 750 + }, + { + "epoch": 0.9258233302554633, + "grad_norm": 3.828125, + "learning_rate": 6.685871605792301e-09, + "loss": 1.438122272491455, + "step": 752 + }, + { + "epoch": 0.9282856263465682, + "grad_norm": 34.25, + "learning_rate": 6.670165177474241e-09, + "loss": 1.7374298572540283, + "step": 754 + }, + { + "epoch": 0.9307479224376731, + "grad_norm": 2.796875, + "learning_rate": 6.6544481107239054e-09, + "loss": 1.4571634531021118, + "step": 756 + }, + { + "epoch": 0.9332102185287781, + "grad_norm": 4.78125, + "learning_rate": 6.638720655039412e-09, + "loss": 1.7221906185150146, + "step": 758 + }, + { + "epoch": 0.935672514619883, + "grad_norm": 22.25, + "learning_rate": 6.622983060083796e-09, + "loss": 1.344387173652649, + "step": 760 + }, + { + "epoch": 0.938134810710988, + "grad_norm": 2.4375, + "learning_rate": 6.607235575681045e-09, + "loss": 1.2809216976165771, + "step": 762 + }, + { + "epoch": 0.940597106802093, + "grad_norm": 2.609375, + "learning_rate": 6.591478451812138e-09, + "loss": 1.1766109466552734, + "step": 764 + }, + { + "epoch": 0.9430594028931979, + "grad_norm": 3.765625, + "learning_rate": 6.575711938611073e-09, + "loss": 1.3128526210784912, + "step": 766 + }, + { + "epoch": 0.9455216989843028, + "grad_norm": 5.625, + "learning_rate": 6.559936286360897e-09, + "loss": 1.8674499988555908, + "step": 768 + }, + { + "epoch": 0.9479839950754079, + "grad_norm": 5.28125, + "learning_rate": 6.544151745489735e-09, + "loss": 1.934564471244812, + "step": 770 + }, + { + "epoch": 0.9504462911665128, + "grad_norm": 7.625, + "learning_rate": 6.52835856656681e-09, + "loss": 2.1300408840179443, + "step": 772 + }, + { + "epoch": 0.9529085872576177, + "grad_norm": 10.3125, + "learning_rate": 6.512557000298471e-09, + "loss": 2.284024715423584, + "step": 774 + }, + { + "epoch": 0.9553708833487227, + "grad_norm": 5.15625, + "learning_rate": 6.49674729752421e-09, + "loss": 1.9190423488616943, + "step": 776 + }, + { + "epoch": 0.9578331794398276, + "grad_norm": 9.0, + "learning_rate": 6.480929709212682e-09, + "loss": 2.2223734855651855, + "step": 778 + }, + { + "epoch": 0.9602954755309326, + "grad_norm": 5.5, + "learning_rate": 6.465104486457718e-09, + "loss": 1.9598147869110107, + "step": 780 + }, + { + "epoch": 0.9627577716220376, + "grad_norm": 6.59375, + "learning_rate": 6.4492718804743365e-09, + "loss": 2.041882276535034, + "step": 782 + }, + { + "epoch": 0.9652200677131425, + "grad_norm": 2.125, + "learning_rate": 6.433432142594771e-09, + "loss": 1.2188262939453125, + "step": 784 + }, + { + "epoch": 0.9676823638042474, + "grad_norm": 11.375, + "learning_rate": 6.4175855242644575e-09, + "loss": 2.208829879760742, + "step": 786 + }, + { + "epoch": 0.9701446598953524, + "grad_norm": 5.0, + "learning_rate": 6.401732277038063e-09, + "loss": 2.0125837326049805, + "step": 788 + }, + { + "epoch": 0.9726069559864574, + "grad_norm": 8.75, + "learning_rate": 6.3858726525754814e-09, + "loss": 2.2643885612487793, + "step": 790 + }, + { + "epoch": 0.9750692520775623, + "grad_norm": 7.0625, + "learning_rate": 6.370006902637836e-09, + "loss": 1.9207779169082642, + "step": 792 + }, + { + "epoch": 0.9775315481686673, + "grad_norm": 2.59375, + "learning_rate": 6.354135279083497e-09, + "loss": 1.2121376991271973, + "step": 794 + }, + { + "epoch": 0.9799938442597722, + "grad_norm": 10.9375, + "learning_rate": 6.338258033864067e-09, + "loss": 2.1134583950042725, + "step": 796 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 18.125, + "learning_rate": 6.3223754190203895e-09, + "loss": 2.3652374744415283, + "step": 798 + }, + { + "epoch": 0.9849184364419822, + "grad_norm": 11.6875, + "learning_rate": 6.306487686678556e-09, + "loss": 1.956110954284668, + "step": 800 + }, + { + "epoch": 0.9873807325330871, + "grad_norm": 5.21875, + "learning_rate": 6.290595089045882e-09, + "loss": 1.993713140487671, + "step": 802 + }, + { + "epoch": 0.989843028624192, + "grad_norm": 19.25, + "learning_rate": 6.274697878406925e-09, + "loss": 1.3555768728256226, + "step": 804 + }, + { + "epoch": 0.992305324715297, + "grad_norm": 14.9375, + "learning_rate": 6.2587963071194695e-09, + "loss": 1.7694034576416016, + "step": 806 + }, + { + "epoch": 0.994767620806402, + "grad_norm": 14.0, + "learning_rate": 6.242890627610518e-09, + "loss": 2.2126145362854004, + "step": 808 + }, + { + "epoch": 0.997229916897507, + "grad_norm": 5.46875, + "learning_rate": 6.226981092372297e-09, + "loss": 1.7438420057296753, + "step": 810 + }, + { + "epoch": 0.9996922129886119, + "grad_norm": 3.671875, + "learning_rate": 6.211067953958229e-09, + "loss": 1.237831711769104, + "step": 812 + }, + { + "epoch": 1.0012311480455525, + "grad_norm": 2.15625, + "learning_rate": 6.195151464978945e-09, + "loss": 1.2776278257369995, + "step": 814 + }, + { + "epoch": 1.0036934441366574, + "grad_norm": 5.625, + "learning_rate": 6.179231878098257e-09, + "loss": 1.6098976135253906, + "step": 816 + }, + { + "epoch": 1.0061557402277623, + "grad_norm": 1.875, + "learning_rate": 6.163309446029157e-09, + "loss": 1.5421602725982666, + "step": 818 + }, + { + "epoch": 1.0086180363188673, + "grad_norm": 5.3125, + "learning_rate": 6.1473844215298045e-09, + "loss": 1.4228730201721191, + "step": 820 + }, + { + "epoch": 1.0110803324099722, + "grad_norm": 14.75, + "learning_rate": 6.131457057399506e-09, + "loss": 2.0147526264190674, + "step": 822 + }, + { + "epoch": 1.0135426285010773, + "grad_norm": 13.9375, + "learning_rate": 6.115527606474713e-09, + "loss": 2.301534652709961, + "step": 824 + }, + { + "epoch": 1.0160049245921823, + "grad_norm": 6.75, + "learning_rate": 6.099596321625005e-09, + "loss": 1.9000599384307861, + "step": 826 + }, + { + "epoch": 1.0184672206832872, + "grad_norm": 1.4140625, + "learning_rate": 6.083663455749068e-09, + "loss": 1.2694454193115234, + "step": 828 + }, + { + "epoch": 1.0209295167743921, + "grad_norm": 2.453125, + "learning_rate": 6.0677292617706915e-09, + "loss": 1.1476200819015503, + "step": 830 + }, + { + "epoch": 1.023391812865497, + "grad_norm": 15.125, + "learning_rate": 6.051793992634741e-09, + "loss": 1.685870885848999, + "step": 832 + }, + { + "epoch": 1.025854108956602, + "grad_norm": 5.15625, + "learning_rate": 6.035857901303159e-09, + "loss": 2.1021130084991455, + "step": 834 + }, + { + "epoch": 1.028316405047707, + "grad_norm": 9.25, + "learning_rate": 6.019921240750932e-09, + "loss": 1.9393489360809326, + "step": 836 + }, + { + "epoch": 1.0307787011388119, + "grad_norm": 3.640625, + "learning_rate": 6.0039842639620844e-09, + "loss": 1.9408633708953857, + "step": 838 + }, + { + "epoch": 1.0332409972299168, + "grad_norm": 16.875, + "learning_rate": 5.988047223925661e-09, + "loss": 2.042579174041748, + "step": 840 + }, + { + "epoch": 1.035703293321022, + "grad_norm": 2.328125, + "learning_rate": 5.9721103736317114e-09, + "loss": 1.7358704805374146, + "step": 842 + }, + { + "epoch": 1.0381655894121269, + "grad_norm": 7.53125, + "learning_rate": 5.956173966067275e-09, + "loss": 1.5867335796356201, + "step": 844 + }, + { + "epoch": 1.0406278855032318, + "grad_norm": 4.34375, + "learning_rate": 5.940238254212358e-09, + "loss": 1.8849399089813232, + "step": 846 + }, + { + "epoch": 1.0430901815943368, + "grad_norm": 4.84375, + "learning_rate": 5.924303491035925e-09, + "loss": 1.643231987953186, + "step": 848 + }, + { + "epoch": 1.0455524776854417, + "grad_norm": 14.0625, + "learning_rate": 5.9083699294918835e-09, + "loss": 2.0420408248901367, + "step": 850 + }, + { + "epoch": 1.0480147737765466, + "grad_norm": 10.5, + "learning_rate": 5.89243782251506e-09, + "loss": 2.353334903717041, + "step": 852 + }, + { + "epoch": 1.0504770698676515, + "grad_norm": 12.625, + "learning_rate": 5.876507423017199e-09, + "loss": 2.2866880893707275, + "step": 854 + }, + { + "epoch": 1.0529393659587565, + "grad_norm": 5.09375, + "learning_rate": 5.8605789838829335e-09, + "loss": 2.091262102127075, + "step": 856 + }, + { + "epoch": 1.0554016620498614, + "grad_norm": 15.1875, + "learning_rate": 5.844652757965778e-09, + "loss": 2.1091365814208984, + "step": 858 + }, + { + "epoch": 1.0578639581409663, + "grad_norm": 2.4375, + "learning_rate": 5.828728998084117e-09, + "loss": 1.6677895784378052, + "step": 860 + }, + { + "epoch": 1.0603262542320715, + "grad_norm": 4.4375, + "learning_rate": 5.812807957017181e-09, + "loss": 1.5235992670059204, + "step": 862 + }, + { + "epoch": 1.0627885503231764, + "grad_norm": 12.1875, + "learning_rate": 5.796889887501051e-09, + "loss": 2.279834270477295, + "step": 864 + }, + { + "epoch": 1.0652508464142814, + "grad_norm": 9.125, + "learning_rate": 5.780975042224629e-09, + "loss": 2.450547456741333, + "step": 866 + }, + { + "epoch": 1.0677131425053863, + "grad_norm": 61.25, + "learning_rate": 5.765063673825634e-09, + "loss": 2.2601470947265625, + "step": 868 + }, + { + "epoch": 1.0701754385964912, + "grad_norm": 3.140625, + "learning_rate": 5.749156034886602e-09, + "loss": 1.6974682807922363, + "step": 870 + }, + { + "epoch": 1.0726377346875962, + "grad_norm": 5.75, + "learning_rate": 5.733252377930853e-09, + "loss": 1.7504122257232666, + "step": 872 + }, + { + "epoch": 1.075100030778701, + "grad_norm": 3.640625, + "learning_rate": 5.7173529554185045e-09, + "loss": 1.7744596004486084, + "step": 874 + }, + { + "epoch": 1.077562326869806, + "grad_norm": 5.0625, + "learning_rate": 5.701458019742448e-09, + "loss": 1.8063809871673584, + "step": 876 + }, + { + "epoch": 1.080024622960911, + "grad_norm": 4.75, + "learning_rate": 5.685567823224358e-09, + "loss": 1.8798420429229736, + "step": 878 + }, + { + "epoch": 1.082486919052016, + "grad_norm": 12.75, + "learning_rate": 5.669682618110672e-09, + "loss": 2.0758848190307617, + "step": 880 + }, + { + "epoch": 1.084949215143121, + "grad_norm": 12.5, + "learning_rate": 5.653802656568592e-09, + "loss": 2.1326591968536377, + "step": 882 + }, + { + "epoch": 1.087411511234226, + "grad_norm": 4.8125, + "learning_rate": 5.637928190682084e-09, + "loss": 1.9486507177352905, + "step": 884 + }, + { + "epoch": 1.089873807325331, + "grad_norm": 6.75, + "learning_rate": 5.622059472447876e-09, + "loss": 1.9365224838256836, + "step": 886 + }, + { + "epoch": 1.0923361034164358, + "grad_norm": 6.4375, + "learning_rate": 5.606196753771449e-09, + "loss": 1.8881072998046875, + "step": 888 + }, + { + "epoch": 1.0947983995075408, + "grad_norm": 7.21875, + "learning_rate": 5.590340286463054e-09, + "loss": 1.9489333629608154, + "step": 890 + }, + { + "epoch": 1.0972606955986457, + "grad_norm": 8.5, + "learning_rate": 5.574490322233697e-09, + "loss": 1.9946143627166748, + "step": 892 + }, + { + "epoch": 1.0997229916897506, + "grad_norm": 3.484375, + "learning_rate": 5.558647112691158e-09, + "loss": 1.6062787771224976, + "step": 894 + }, + { + "epoch": 1.1021852877808556, + "grad_norm": 2.859375, + "learning_rate": 5.542810909335987e-09, + "loss": 1.2802103757858276, + "step": 896 + }, + { + "epoch": 1.1046475838719605, + "grad_norm": 17.0, + "learning_rate": 5.526981963557518e-09, + "loss": 1.7315878868103027, + "step": 898 + }, + { + "epoch": 1.1071098799630656, + "grad_norm": 9.0, + "learning_rate": 5.511160526629875e-09, + "loss": 1.9750934839248657, + "step": 900 + }, + { + "epoch": 1.1095721760541706, + "grad_norm": 3.515625, + "learning_rate": 5.495346849707981e-09, + "loss": 1.6797375679016113, + "step": 902 + }, + { + "epoch": 1.1120344721452755, + "grad_norm": 10.75, + "learning_rate": 5.479541183823578e-09, + "loss": 1.8305199146270752, + "step": 904 + }, + { + "epoch": 1.1144967682363804, + "grad_norm": 4.84375, + "learning_rate": 5.463743779881238e-09, + "loss": 1.9975595474243164, + "step": 906 + }, + { + "epoch": 1.1169590643274854, + "grad_norm": 4.65625, + "learning_rate": 5.447954888654378e-09, + "loss": 1.7815577983856201, + "step": 908 + }, + { + "epoch": 1.1194213604185903, + "grad_norm": 3.109375, + "learning_rate": 5.432174760781281e-09, + "loss": 1.5837122201919556, + "step": 910 + }, + { + "epoch": 1.1218836565096952, + "grad_norm": 2.25, + "learning_rate": 5.416403646761119e-09, + "loss": 1.2701913118362427, + "step": 912 + }, + { + "epoch": 1.1243459526008002, + "grad_norm": 2.890625, + "learning_rate": 5.400641796949976e-09, + "loss": 1.3599649667739868, + "step": 914 + }, + { + "epoch": 1.1268082486919053, + "grad_norm": 6.34375, + "learning_rate": 5.384889461556868e-09, + "loss": 1.5575028657913208, + "step": 916 + }, + { + "epoch": 1.1292705447830103, + "grad_norm": 3.34375, + "learning_rate": 5.36914689063978e-09, + "loss": 1.4743753671646118, + "step": 918 + }, + { + "epoch": 1.1317328408741152, + "grad_norm": 5.25, + "learning_rate": 5.353414334101692e-09, + "loss": 1.5236045122146606, + "step": 920 + }, + { + "epoch": 1.1341951369652201, + "grad_norm": 4.4375, + "learning_rate": 5.337692041686615e-09, + "loss": 1.891930341720581, + "step": 922 + }, + { + "epoch": 1.136657433056325, + "grad_norm": 2.046875, + "learning_rate": 5.321980262975614e-09, + "loss": 1.522653341293335, + "step": 924 + }, + { + "epoch": 1.13911972914743, + "grad_norm": 15.625, + "learning_rate": 5.306279247382867e-09, + "loss": 1.66744065284729, + "step": 926 + }, + { + "epoch": 1.141582025238535, + "grad_norm": 16.875, + "learning_rate": 5.290589244151689e-09, + "loss": 2.157740592956543, + "step": 928 + }, + { + "epoch": 1.1440443213296398, + "grad_norm": 2.390625, + "learning_rate": 5.274910502350581e-09, + "loss": 1.5675222873687744, + "step": 930 + }, + { + "epoch": 1.1465066174207448, + "grad_norm": 4.84375, + "learning_rate": 5.259243270869276e-09, + "loss": 1.1499652862548828, + "step": 932 + }, + { + "epoch": 1.1489689135118497, + "grad_norm": 12.75, + "learning_rate": 5.243587798414792e-09, + "loss": 1.5367200374603271, + "step": 934 + }, + { + "epoch": 1.1514312096029546, + "grad_norm": 5.34375, + "learning_rate": 5.227944333507477e-09, + "loss": 1.9310216903686523, + "step": 936 + }, + { + "epoch": 1.1538935056940598, + "grad_norm": 11.5, + "learning_rate": 5.212313124477067e-09, + "loss": 2.123908519744873, + "step": 938 + }, + { + "epoch": 1.1563558017851647, + "grad_norm": 7.28125, + "learning_rate": 5.196694419458744e-09, + "loss": 2.1816015243530273, + "step": 940 + }, + { + "epoch": 1.1588180978762697, + "grad_norm": 1.84375, + "learning_rate": 5.1810884663891986e-09, + "loss": 1.5526807308197021, + "step": 942 + }, + { + "epoch": 1.1612803939673746, + "grad_norm": 1.8671875, + "learning_rate": 5.165495513002691e-09, + "loss": 1.3024842739105225, + "step": 944 + }, + { + "epoch": 1.1637426900584795, + "grad_norm": 2.796875, + "learning_rate": 5.149915806827121e-09, + "loss": 1.2783153057098389, + "step": 946 + }, + { + "epoch": 1.1662049861495845, + "grad_norm": 5.125, + "learning_rate": 5.134349595180094e-09, + "loss": 1.5641247034072876, + "step": 948 + }, + { + "epoch": 1.1686672822406894, + "grad_norm": 7.0, + "learning_rate": 5.1187971251650065e-09, + "loss": 1.9546620845794678, + "step": 950 + }, + { + "epoch": 1.1711295783317943, + "grad_norm": 4.4375, + "learning_rate": 5.10325864366711e-09, + "loss": 1.87162446975708, + "step": 952 + }, + { + "epoch": 1.1735918744228995, + "grad_norm": 11.5, + "learning_rate": 5.087734397349596e-09, + "loss": 1.8723485469818115, + "step": 954 + }, + { + "epoch": 1.1760541705140044, + "grad_norm": 5.21875, + "learning_rate": 5.072224632649684e-09, + "loss": 1.91074538230896, + "step": 956 + }, + { + "epoch": 1.1785164666051093, + "grad_norm": 5.25, + "learning_rate": 5.056729595774712e-09, + "loss": 1.9009315967559814, + "step": 958 + }, + { + "epoch": 1.1809787626962143, + "grad_norm": 7.3125, + "learning_rate": 5.041249532698214e-09, + "loss": 1.9836119413375854, + "step": 960 + }, + { + "epoch": 1.1834410587873192, + "grad_norm": 9.375, + "learning_rate": 5.025784689156032e-09, + "loss": 1.9037981033325195, + "step": 962 + }, + { + "epoch": 1.1859033548784241, + "grad_norm": 27.875, + "learning_rate": 5.0103353106424065e-09, + "loss": 2.551020622253418, + "step": 964 + }, + { + "epoch": 1.188365650969529, + "grad_norm": 12.75, + "learning_rate": 4.994901642406078e-09, + "loss": 2.474264144897461, + "step": 966 + }, + { + "epoch": 1.190827947060634, + "grad_norm": 11.5625, + "learning_rate": 4.979483929446398e-09, + "loss": 1.7837506532669067, + "step": 968 + }, + { + "epoch": 1.193290243151739, + "grad_norm": 3.65625, + "learning_rate": 4.964082416509442e-09, + "loss": 1.760176181793213, + "step": 970 + }, + { + "epoch": 1.1957525392428439, + "grad_norm": 17.75, + "learning_rate": 4.948697348084115e-09, + "loss": 1.9721624851226807, + "step": 972 + }, + { + "epoch": 1.1982148353339488, + "grad_norm": 6.6875, + "learning_rate": 4.933328968398283e-09, + "loss": 1.8035709857940674, + "step": 974 + }, + { + "epoch": 1.200677131425054, + "grad_norm": 5.21875, + "learning_rate": 4.9179775214148806e-09, + "loss": 1.6362351179122925, + "step": 976 + }, + { + "epoch": 1.2031394275161589, + "grad_norm": 5.90625, + "learning_rate": 4.902643250828055e-09, + "loss": 1.7732539176940918, + "step": 978 + }, + { + "epoch": 1.2056017236072638, + "grad_norm": 4.875, + "learning_rate": 4.887326400059283e-09, + "loss": 1.7590731382369995, + "step": 980 + }, + { + "epoch": 1.2080640196983687, + "grad_norm": 2.421875, + "learning_rate": 4.8720272122535195e-09, + "loss": 1.590978980064392, + "step": 982 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 22.875, + "learning_rate": 4.8567459302753234e-09, + "loss": 1.8453547954559326, + "step": 984 + }, + { + "epoch": 1.2129886118805786, + "grad_norm": 6.71875, + "learning_rate": 4.841482796705019e-09, + "loss": 2.2472167015075684, + "step": 986 + }, + { + "epoch": 1.2154509079716835, + "grad_norm": 5.0625, + "learning_rate": 4.826238053834831e-09, + "loss": 1.9840574264526367, + "step": 988 + }, + { + "epoch": 1.2179132040627885, + "grad_norm": 9.3125, + "learning_rate": 4.811011943665047e-09, + "loss": 1.930182695388794, + "step": 990 + }, + { + "epoch": 1.2203755001538936, + "grad_norm": 15.875, + "learning_rate": 4.795804707900169e-09, + "loss": 2.222364664077759, + "step": 992 + }, + { + "epoch": 1.2228377962449986, + "grad_norm": 10.9375, + "learning_rate": 4.780616587945083e-09, + "loss": 2.241105079650879, + "step": 994 + }, + { + "epoch": 1.2253000923361035, + "grad_norm": 6.09375, + "learning_rate": 4.765447824901222e-09, + "loss": 2.1059789657592773, + "step": 996 + }, + { + "epoch": 1.2277623884272084, + "grad_norm": 5.0625, + "learning_rate": 4.750298659562745e-09, + "loss": 1.9286503791809082, + "step": 998 + }, + { + "epoch": 1.2302246845183133, + "grad_norm": 4.84375, + "learning_rate": 4.735169332412704e-09, + "loss": 1.8667454719543457, + "step": 1000 + }, + { + "epoch": 1.2326869806094183, + "grad_norm": 9.4375, + "learning_rate": 4.720060083619239e-09, + "loss": 2.0463290214538574, + "step": 1002 + }, + { + "epoch": 1.2351492767005232, + "grad_norm": 6.28125, + "learning_rate": 4.7049711530317564e-09, + "loss": 2.106719970703125, + "step": 1004 + }, + { + "epoch": 1.2376115727916281, + "grad_norm": 3.8125, + "learning_rate": 4.6899027801771234e-09, + "loss": 1.829174518585205, + "step": 1006 + }, + { + "epoch": 1.240073868882733, + "grad_norm": 47.5, + "learning_rate": 4.6748552042558664e-09, + "loss": 2.110135555267334, + "step": 1008 + }, + { + "epoch": 1.242536164973838, + "grad_norm": 15.9375, + "learning_rate": 4.659828664138378e-09, + "loss": 2.152853012084961, + "step": 1010 + }, + { + "epoch": 1.244998461064943, + "grad_norm": 10.75, + "learning_rate": 4.6448233983611165e-09, + "loss": 1.862748622894287, + "step": 1012 + }, + { + "epoch": 1.247460757156048, + "grad_norm": 20.375, + "learning_rate": 4.629839645122828e-09, + "loss": 2.054180860519409, + "step": 1014 + }, + { + "epoch": 1.249923053247153, + "grad_norm": 10.5, + "learning_rate": 4.614877642280759e-09, + "loss": 2.0183398723602295, + "step": 1016 + }, + { + "epoch": 1.252385349338258, + "grad_norm": 4.3125, + "learning_rate": 4.59993762734688e-09, + "loss": 1.9448716640472412, + "step": 1018 + }, + { + "epoch": 1.254847645429363, + "grad_norm": 5.3125, + "learning_rate": 4.585019837484127e-09, + "loss": 1.909618854522705, + "step": 1020 + }, + { + "epoch": 1.2573099415204678, + "grad_norm": 4.4375, + "learning_rate": 4.5701245095026175e-09, + "loss": 1.8093581199645996, + "step": 1022 + }, + { + "epoch": 1.2597722376115728, + "grad_norm": 4.375, + "learning_rate": 4.555251879855905e-09, + "loss": 1.8561820983886719, + "step": 1024 + }, + { + "epoch": 1.2622345337026777, + "grad_norm": 5.71875, + "learning_rate": 4.540402184637225e-09, + "loss": 1.9136399030685425, + "step": 1026 + }, + { + "epoch": 1.2646968297937828, + "grad_norm": 6.1875, + "learning_rate": 4.525575659575739e-09, + "loss": 1.922465443611145, + "step": 1028 + }, + { + "epoch": 1.2671591258848878, + "grad_norm": 6.125, + "learning_rate": 4.510772540032801e-09, + "loss": 1.945884346961975, + "step": 1030 + }, + { + "epoch": 1.2696214219759927, + "grad_norm": 11.6875, + "learning_rate": 4.495993060998216e-09, + "loss": 2.1394665241241455, + "step": 1032 + }, + { + "epoch": 1.2720837180670976, + "grad_norm": 12.875, + "learning_rate": 4.481237457086511e-09, + "loss": 2.548738479614258, + "step": 1034 + }, + { + "epoch": 1.2745460141582026, + "grad_norm": 6.65625, + "learning_rate": 4.466505962533216e-09, + "loss": 2.148568868637085, + "step": 1036 + }, + { + "epoch": 1.2770083102493075, + "grad_norm": 143.0, + "learning_rate": 4.451798811191132e-09, + "loss": 2.0206987857818604, + "step": 1038 + }, + { + "epoch": 1.2794706063404124, + "grad_norm": 4.78125, + "learning_rate": 4.437116236526635e-09, + "loss": 2.025409698486328, + "step": 1040 + }, + { + "epoch": 1.2819329024315174, + "grad_norm": 14.875, + "learning_rate": 4.42245847161596e-09, + "loss": 1.8983882665634155, + "step": 1042 + }, + { + "epoch": 1.2843951985226223, + "grad_norm": 1.8515625, + "learning_rate": 4.4078257491415e-09, + "loss": 1.594254732131958, + "step": 1044 + }, + { + "epoch": 1.2868574946137272, + "grad_norm": 3.75, + "learning_rate": 4.393218301388123e-09, + "loss": 1.4578649997711182, + "step": 1046 + }, + { + "epoch": 1.2893197907048322, + "grad_norm": 6.0625, + "learning_rate": 4.378636360239471e-09, + "loss": 1.8163200616836548, + "step": 1048 + }, + { + "epoch": 1.291782086795937, + "grad_norm": 21.625, + "learning_rate": 4.364080157174287e-09, + "loss": 1.811424732208252, + "step": 1050 + }, + { + "epoch": 1.2942443828870422, + "grad_norm": 6.46875, + "learning_rate": 4.349549923262743e-09, + "loss": 1.6952979564666748, + "step": 1052 + }, + { + "epoch": 1.2967066789781472, + "grad_norm": 8.9375, + "learning_rate": 4.33504588916276e-09, + "loss": 1.85584557056427, + "step": 1054 + }, + { + "epoch": 1.299168975069252, + "grad_norm": 6.25, + "learning_rate": 4.320568285116362e-09, + "loss": 1.8780372142791748, + "step": 1056 + }, + { + "epoch": 1.301631271160357, + "grad_norm": 3.265625, + "learning_rate": 4.306117340946008e-09, + "loss": 1.694900393486023, + "step": 1058 + }, + { + "epoch": 1.304093567251462, + "grad_norm": 5.40625, + "learning_rate": 4.291693286050951e-09, + "loss": 1.7237621545791626, + "step": 1060 + }, + { + "epoch": 1.306555863342567, + "grad_norm": 7.8125, + "learning_rate": 4.277296349403592e-09, + "loss": 1.9782402515411377, + "step": 1062 + }, + { + "epoch": 1.3090181594336718, + "grad_norm": 11.625, + "learning_rate": 4.262926759545853e-09, + "loss": 2.2806496620178223, + "step": 1064 + }, + { + "epoch": 1.311480455524777, + "grad_norm": 14.9375, + "learning_rate": 4.2485847445855384e-09, + "loss": 2.0329091548919678, + "step": 1066 + }, + { + "epoch": 1.313942751615882, + "grad_norm": 7.8125, + "learning_rate": 4.234270532192722e-09, + "loss": 1.996172308921814, + "step": 1068 + }, + { + "epoch": 1.3164050477069869, + "grad_norm": 5.4375, + "learning_rate": 4.219984349596131e-09, + "loss": 1.7426702976226807, + "step": 1070 + }, + { + "epoch": 1.3188673437980918, + "grad_norm": 4.09375, + "learning_rate": 4.205726423579531e-09, + "loss": 1.9689075946807861, + "step": 1072 + }, + { + "epoch": 1.3213296398891967, + "grad_norm": 4.375, + "learning_rate": 4.1914969804781435e-09, + "loss": 1.851407766342163, + "step": 1074 + }, + { + "epoch": 1.3237919359803016, + "grad_norm": 4.5625, + "learning_rate": 4.177296246175035e-09, + "loss": 1.9321177005767822, + "step": 1076 + }, + { + "epoch": 1.3262542320714066, + "grad_norm": 10.75, + "learning_rate": 4.1631244460975395e-09, + "loss": 2.1217970848083496, + "step": 1078 + }, + { + "epoch": 1.3287165281625115, + "grad_norm": 2.34375, + "learning_rate": 4.148981805213683e-09, + "loss": 1.6175642013549805, + "step": 1080 + }, + { + "epoch": 1.3311788242536164, + "grad_norm": 9.9375, + "learning_rate": 4.134868548028603e-09, + "loss": 1.8694862127304077, + "step": 1082 + }, + { + "epoch": 1.3336411203447214, + "grad_norm": 3.9375, + "learning_rate": 4.120784898580994e-09, + "loss": 1.9671717882156372, + "step": 1084 + }, + { + "epoch": 1.3361034164358263, + "grad_norm": 5.9375, + "learning_rate": 4.106731080439549e-09, + "loss": 1.6825287342071533, + "step": 1086 + }, + { + "epoch": 1.3385657125269312, + "grad_norm": 3.03125, + "learning_rate": 4.092707316699403e-09, + "loss": 1.5507920980453491, + "step": 1088 + }, + { + "epoch": 1.3410280086180364, + "grad_norm": 6.03125, + "learning_rate": 4.078713829978599e-09, + "loss": 1.4552762508392334, + "step": 1090 + }, + { + "epoch": 1.3434903047091413, + "grad_norm": 7.09375, + "learning_rate": 4.064750842414555e-09, + "loss": 1.8754684925079346, + "step": 1092 + }, + { + "epoch": 1.3459526008002463, + "grad_norm": 94.5, + "learning_rate": 4.050818575660528e-09, + "loss": 2.175379753112793, + "step": 1094 + }, + { + "epoch": 1.3484148968913512, + "grad_norm": 2.921875, + "learning_rate": 4.0369172508821154e-09, + "loss": 1.8554493188858032, + "step": 1096 + }, + { + "epoch": 1.3508771929824561, + "grad_norm": 4.5625, + "learning_rate": 4.023047088753718e-09, + "loss": 1.2790199518203735, + "step": 1098 + }, + { + "epoch": 1.353339489073561, + "grad_norm": 4.75, + "learning_rate": 4.009208309455052e-09, + "loss": 1.7523287534713745, + "step": 1100 + }, + { + "epoch": 1.355801785164666, + "grad_norm": 7.9375, + "learning_rate": 3.9954011326676595e-09, + "loss": 2.061239242553711, + "step": 1102 + }, + { + "epoch": 1.3582640812557711, + "grad_norm": 8.875, + "learning_rate": 3.981625777571407e-09, + "loss": 2.029423713684082, + "step": 1104 + }, + { + "epoch": 1.360726377346876, + "grad_norm": 22.125, + "learning_rate": 3.967882462841013e-09, + "loss": 2.4487719535827637, + "step": 1106 + }, + { + "epoch": 1.363188673437981, + "grad_norm": 13.4375, + "learning_rate": 3.954171406642579e-09, + "loss": 2.2747087478637695, + "step": 1108 + }, + { + "epoch": 1.365650969529086, + "grad_norm": 8.875, + "learning_rate": 3.940492826630122e-09, + "loss": 2.142123222351074, + "step": 1110 + }, + { + "epoch": 1.3681132656201909, + "grad_norm": 13.3125, + "learning_rate": 3.926846939942119e-09, + "loss": 2.411155939102173, + "step": 1112 + }, + { + "epoch": 1.3705755617112958, + "grad_norm": 6.96875, + "learning_rate": 3.913233963198062e-09, + "loss": 2.1852264404296875, + "step": 1114 + }, + { + "epoch": 1.3730378578024007, + "grad_norm": 2.71875, + "learning_rate": 3.899654112495024e-09, + "loss": 1.5160444974899292, + "step": 1116 + }, + { + "epoch": 1.3755001538935057, + "grad_norm": 4.59375, + "learning_rate": 3.886107603404221e-09, + "loss": 1.5113252401351929, + "step": 1118 + }, + { + "epoch": 1.3779624499846106, + "grad_norm": 4.71875, + "learning_rate": 3.872594650967591e-09, + "loss": 1.700373649597168, + "step": 1120 + }, + { + "epoch": 1.3804247460757155, + "grad_norm": 9.5625, + "learning_rate": 3.859115469694385e-09, + "loss": 1.9584300518035889, + "step": 1122 + }, + { + "epoch": 1.3828870421668205, + "grad_norm": 5.5, + "learning_rate": 3.845670273557754e-09, + "loss": 1.8532516956329346, + "step": 1124 + }, + { + "epoch": 1.3853493382579254, + "grad_norm": 4.21875, + "learning_rate": 3.832259275991365e-09, + "loss": 1.640071988105774, + "step": 1126 + }, + { + "epoch": 1.3878116343490305, + "grad_norm": 3.390625, + "learning_rate": 3.818882689885998e-09, + "loss": 1.2326576709747314, + "step": 1128 + }, + { + "epoch": 1.3902739304401355, + "grad_norm": 4.375, + "learning_rate": 3.80554072758618e-09, + "loss": 1.5156090259552002, + "step": 1130 + }, + { + "epoch": 1.3927362265312404, + "grad_norm": 2.625, + "learning_rate": 3.7922336008868e-09, + "loss": 1.5685241222381592, + "step": 1132 + }, + { + "epoch": 1.3951985226223453, + "grad_norm": 5.09375, + "learning_rate": 3.778961521029762e-09, + "loss": 1.6617923974990845, + "step": 1134 + }, + { + "epoch": 1.3976608187134503, + "grad_norm": 6.46875, + "learning_rate": 3.765724698700621e-09, + "loss": 1.8906147480010986, + "step": 1136 + }, + { + "epoch": 1.4001231148045552, + "grad_norm": 2.875, + "learning_rate": 3.752523344025243e-09, + "loss": 1.545287847518921, + "step": 1138 + }, + { + "epoch": 1.4025854108956601, + "grad_norm": 7.78125, + "learning_rate": 3.7393576665664675e-09, + "loss": 1.732557773590088, + "step": 1140 + }, + { + "epoch": 1.4050477069867653, + "grad_norm": 2.25, + "learning_rate": 3.7262278753207815e-09, + "loss": 1.72062087059021, + "step": 1142 + }, + { + "epoch": 1.4075100030778702, + "grad_norm": 8.75, + "learning_rate": 3.7131341787150018e-09, + "loss": 1.5638048648834229, + "step": 1144 + }, + { + "epoch": 1.4099722991689752, + "grad_norm": 25.0, + "learning_rate": 3.7000767846029665e-09, + "loss": 2.013415575027466, + "step": 1146 + }, + { + "epoch": 1.41243459526008, + "grad_norm": 2.46875, + "learning_rate": 3.687055900262238e-09, + "loss": 1.5985221862792969, + "step": 1148 + }, + { + "epoch": 1.414896891351185, + "grad_norm": 12.1875, + "learning_rate": 3.6740717323908046e-09, + "loss": 1.7952547073364258, + "step": 1150 + }, + { + "epoch": 1.41735918744229, + "grad_norm": 2.9375, + "learning_rate": 3.6611244871038118e-09, + "loss": 1.5459375381469727, + "step": 1152 + }, + { + "epoch": 1.4198214835333949, + "grad_norm": 6.84375, + "learning_rate": 3.648214369930278e-09, + "loss": 1.641556739807129, + "step": 1154 + }, + { + "epoch": 1.4222837796244998, + "grad_norm": 2.109375, + "learning_rate": 3.635341585809837e-09, + "loss": 1.5961995124816895, + "step": 1156 + }, + { + "epoch": 1.4247460757156047, + "grad_norm": 9.125, + "learning_rate": 3.6225063390894896e-09, + "loss": 1.6079602241516113, + "step": 1158 + }, + { + "epoch": 1.4272083718067097, + "grad_norm": 4.84375, + "learning_rate": 3.609708833520351e-09, + "loss": 2.1076085567474365, + "step": 1160 + }, + { + "epoch": 1.4296706678978146, + "grad_norm": 19.125, + "learning_rate": 3.5969492722544207e-09, + "loss": 2.1435282230377197, + "step": 1162 + }, + { + "epoch": 1.4321329639889195, + "grad_norm": 1.796875, + "learning_rate": 3.5842278578413577e-09, + "loss": 1.6422967910766602, + "step": 1164 + }, + { + "epoch": 1.4345952600800247, + "grad_norm": 4.1875, + "learning_rate": 3.5715447922252655e-09, + "loss": 1.4160196781158447, + "step": 1166 + }, + { + "epoch": 1.4370575561711296, + "grad_norm": 7.78125, + "learning_rate": 3.558900276741485e-09, + "loss": 1.9306385517120361, + "step": 1168 + }, + { + "epoch": 1.4395198522622346, + "grad_norm": 6.625, + "learning_rate": 3.5462945121134016e-09, + "loss": 2.028043508529663, + "step": 1170 + }, + { + "epoch": 1.4419821483533395, + "grad_norm": 18.125, + "learning_rate": 3.533727698449252e-09, + "loss": 1.7561140060424805, + "step": 1172 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 11.6875, + "learning_rate": 3.521200035238954e-09, + "loss": 1.9722295999526978, + "step": 1174 + }, + { + "epoch": 1.4469067405355494, + "grad_norm": 5.40625, + "learning_rate": 3.5087117213509367e-09, + "loss": 2.2334213256835938, + "step": 1176 + }, + { + "epoch": 1.4493690366266543, + "grad_norm": 10.1875, + "learning_rate": 3.4962629550289858e-09, + "loss": 2.2049357891082764, + "step": 1178 + }, + { + "epoch": 1.4518313327177594, + "grad_norm": 11.0625, + "learning_rate": 3.4838539338890964e-09, + "loss": 2.2469396591186523, + "step": 1180 + }, + { + "epoch": 1.4542936288088644, + "grad_norm": 5.59375, + "learning_rate": 3.4714848549163314e-09, + "loss": 2.023268938064575, + "step": 1182 + }, + { + "epoch": 1.4567559248999693, + "grad_norm": 3.671875, + "learning_rate": 3.4591559144617014e-09, + "loss": 1.8120558261871338, + "step": 1184 + }, + { + "epoch": 1.4592182209910742, + "grad_norm": 5.65625, + "learning_rate": 3.4468673082390432e-09, + "loss": 1.7612297534942627, + "step": 1186 + }, + { + "epoch": 1.4616805170821792, + "grad_norm": 23.5, + "learning_rate": 3.434619231321912e-09, + "loss": 1.9972333908081055, + "step": 1188 + }, + { + "epoch": 1.464142813173284, + "grad_norm": 4.3125, + "learning_rate": 3.4224118781404923e-09, + "loss": 1.8834655284881592, + "step": 1190 + }, + { + "epoch": 1.466605109264389, + "grad_norm": 35.25, + "learning_rate": 3.4102454424784997e-09, + "loss": 2.4007821083068848, + "step": 1192 + }, + { + "epoch": 1.469067405355494, + "grad_norm": 9.0, + "learning_rate": 3.398120117470115e-09, + "loss": 2.477167844772339, + "step": 1194 + }, + { + "epoch": 1.471529701446599, + "grad_norm": 8.625, + "learning_rate": 3.3860360955969127e-09, + "loss": 2.0541319847106934, + "step": 1196 + }, + { + "epoch": 1.4739919975377038, + "grad_norm": 11.3125, + "learning_rate": 3.373993568684808e-09, + "loss": 2.007800579071045, + "step": 1198 + }, + { + "epoch": 1.4764542936288088, + "grad_norm": 13.125, + "learning_rate": 3.36199272790101e-09, + "loss": 2.2932679653167725, + "step": 1200 + }, + { + "epoch": 1.4789165897199137, + "grad_norm": 2.8125, + "learning_rate": 3.350033763750989e-09, + "loss": 1.7902061939239502, + "step": 1202 + }, + { + "epoch": 1.4813788858110188, + "grad_norm": 15.0625, + "learning_rate": 3.3381168660754523e-09, + "loss": 1.8084830045700073, + "step": 1204 + }, + { + "epoch": 1.4838411819021238, + "grad_norm": 5.46875, + "learning_rate": 3.3262422240473268e-09, + "loss": 1.930219054222107, + "step": 1206 + }, + { + "epoch": 1.4863034779932287, + "grad_norm": 4.65625, + "learning_rate": 3.314410026168757e-09, + "loss": 1.8515759706497192, + "step": 1208 + }, + { + "epoch": 1.4887657740843336, + "grad_norm": 20.875, + "learning_rate": 3.30262046026812e-09, + "loss": 2.1966378688812256, + "step": 1210 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 5.0, + "learning_rate": 3.2908737134970367e-09, + "loss": 2.388540744781494, + "step": 1212 + }, + { + "epoch": 1.4936903662665435, + "grad_norm": 10.375, + "learning_rate": 3.2791699723273984e-09, + "loss": 2.1200718879699707, + "step": 1214 + }, + { + "epoch": 1.4961526623576484, + "grad_norm": 3.515625, + "learning_rate": 3.2675094225484135e-09, + "loss": 2.037621021270752, + "step": 1216 + }, + { + "epoch": 1.4986149584487536, + "grad_norm": 3.234375, + "learning_rate": 3.2558922492636578e-09, + "loss": 1.5640082359313965, + "step": 1218 + }, + { + "epoch": 1.5010772545398585, + "grad_norm": 6.59375, + "learning_rate": 3.2443186368881287e-09, + "loss": 1.5967392921447754, + "step": 1220 + }, + { + "epoch": 1.5035395506309635, + "grad_norm": 1.1875, + "learning_rate": 3.2327887691453277e-09, + "loss": 1.4248828887939453, + "step": 1222 + }, + { + "epoch": 1.5060018467220684, + "grad_norm": 5.84375, + "learning_rate": 3.2213028290643363e-09, + "loss": 1.5917315483093262, + "step": 1224 + }, + { + "epoch": 1.5084641428131733, + "grad_norm": 5.59375, + "learning_rate": 3.2098609989769122e-09, + "loss": 1.761174201965332, + "step": 1226 + }, + { + "epoch": 1.5109264389042782, + "grad_norm": 13.8125, + "learning_rate": 3.198463460514598e-09, + "loss": 1.7805390357971191, + "step": 1228 + }, + { + "epoch": 1.5133887349953832, + "grad_norm": 3.125, + "learning_rate": 3.1871103946058343e-09, + "loss": 2.06949782371521, + "step": 1230 + }, + { + "epoch": 1.515851031086488, + "grad_norm": 8.0625, + "learning_rate": 3.1758019814730902e-09, + "loss": 1.6458537578582764, + "step": 1232 + }, + { + "epoch": 1.518313327177593, + "grad_norm": 5.90625, + "learning_rate": 3.1645384006300033e-09, + "loss": 1.8969038724899292, + "step": 1234 + }, + { + "epoch": 1.520775623268698, + "grad_norm": 2.53125, + "learning_rate": 3.153319830878523e-09, + "loss": 1.5056371688842773, + "step": 1236 + }, + { + "epoch": 1.523237919359803, + "grad_norm": 25.5, + "learning_rate": 3.142146450306082e-09, + "loss": 1.7204036712646484, + "step": 1238 + }, + { + "epoch": 1.5257002154509078, + "grad_norm": 5.5625, + "learning_rate": 3.1310184362827594e-09, + "loss": 1.7970688343048096, + "step": 1240 + }, + { + "epoch": 1.5281625115420128, + "grad_norm": 2.75, + "learning_rate": 3.1199359654584756e-09, + "loss": 1.5522937774658203, + "step": 1242 + }, + { + "epoch": 1.530624807633118, + "grad_norm": 5.46875, + "learning_rate": 3.1088992137601797e-09, + "loss": 1.5566771030426025, + "step": 1244 + }, + { + "epoch": 1.5330871037242229, + "grad_norm": 4.875, + "learning_rate": 3.097908356389059e-09, + "loss": 1.8924975395202637, + "step": 1246 + }, + { + "epoch": 1.5355493998153278, + "grad_norm": 2.234375, + "learning_rate": 3.08696356781776e-09, + "loss": 1.5438798666000366, + "step": 1248 + }, + { + "epoch": 1.5380116959064327, + "grad_norm": 1.8515625, + "learning_rate": 3.0760650217876174e-09, + "loss": 1.286960482597351, + "step": 1250 + }, + { + "epoch": 1.5404739919975377, + "grad_norm": 3.140625, + "learning_rate": 3.0652128913058935e-09, + "loss": 1.1232177019119263, + "step": 1252 + }, + { + "epoch": 1.5429362880886428, + "grad_norm": 10.0625, + "learning_rate": 3.0544073486430396e-09, + "loss": 1.7119476795196533, + "step": 1254 + }, + { + "epoch": 1.5453985841797477, + "grad_norm": 4.84375, + "learning_rate": 3.0436485653299487e-09, + "loss": 2.0494632720947266, + "step": 1256 + }, + { + "epoch": 1.5478608802708527, + "grad_norm": 3.1875, + "learning_rate": 3.032936712155246e-09, + "loss": 1.5645394325256348, + "step": 1258 + }, + { + "epoch": 1.5503231763619576, + "grad_norm": 11.1875, + "learning_rate": 3.022271959162567e-09, + "loss": 1.7430448532104492, + "step": 1260 + }, + { + "epoch": 1.5527854724530625, + "grad_norm": 3.25, + "learning_rate": 3.0116544756478663e-09, + "loss": 1.6215105056762695, + "step": 1262 + }, + { + "epoch": 1.5552477685441675, + "grad_norm": 5.40625, + "learning_rate": 3.001084430156724e-09, + "loss": 1.4022070169448853, + "step": 1264 + }, + { + "epoch": 1.5577100646352724, + "grad_norm": 4.3125, + "learning_rate": 2.990561990481675e-09, + "loss": 1.7849698066711426, + "step": 1266 + }, + { + "epoch": 1.5601723607263773, + "grad_norm": 2.90625, + "learning_rate": 2.9800873236595416e-09, + "loss": 1.514677882194519, + "step": 1268 + }, + { + "epoch": 1.5626346568174823, + "grad_norm": 10.0, + "learning_rate": 2.9696605959687833e-09, + "loss": 1.529390573501587, + "step": 1270 + }, + { + "epoch": 1.5650969529085872, + "grad_norm": 2.5625, + "learning_rate": 2.9592819729268566e-09, + "loss": 1.8093581199645996, + "step": 1272 + }, + { + "epoch": 1.5675592489996921, + "grad_norm": 10.0625, + "learning_rate": 2.948951619287592e-09, + "loss": 1.3842357397079468, + "step": 1274 + }, + { + "epoch": 1.570021545090797, + "grad_norm": 14.5, + "learning_rate": 2.938669699038571e-09, + "loss": 1.85842764377594, + "step": 1276 + }, + { + "epoch": 1.572483841181902, + "grad_norm": 29.0, + "learning_rate": 2.928436375398528e-09, + "loss": 2.2186334133148193, + "step": 1278 + }, + { + "epoch": 1.574946137273007, + "grad_norm": 7.625, + "learning_rate": 2.9182518108147588e-09, + "loss": 2.11116361618042, + "step": 1280 + }, + { + "epoch": 1.577408433364112, + "grad_norm": 10.5625, + "learning_rate": 2.9081161669605395e-09, + "loss": 2.039137363433838, + "step": 1282 + }, + { + "epoch": 1.579870729455217, + "grad_norm": 1.7578125, + "learning_rate": 2.8980296047325638e-09, + "loss": 1.548026204109192, + "step": 1284 + }, + { + "epoch": 1.582333025546322, + "grad_norm": 6.34375, + "learning_rate": 2.8879922842483867e-09, + "loss": 1.4916882514953613, + "step": 1286 + }, + { + "epoch": 1.5847953216374269, + "grad_norm": 4.5, + "learning_rate": 2.8780043648438818e-09, + "loss": 1.6858062744140625, + "step": 1288 + }, + { + "epoch": 1.587257617728532, + "grad_norm": 6.84375, + "learning_rate": 2.868066005070713e-09, + "loss": 1.8366402387619019, + "step": 1290 + }, + { + "epoch": 1.589719913819637, + "grad_norm": 3.15625, + "learning_rate": 2.8581773626938166e-09, + "loss": 1.4952478408813477, + "step": 1292 + }, + { + "epoch": 1.5921822099107419, + "grad_norm": 4.3125, + "learning_rate": 2.8483385946889017e-09, + "loss": 1.4701340198516846, + "step": 1294 + }, + { + "epoch": 1.5946445060018468, + "grad_norm": 5.25, + "learning_rate": 2.8385498572399503e-09, + "loss": 1.8555335998535156, + "step": 1296 + }, + { + "epoch": 1.5971068020929517, + "grad_norm": 5.0, + "learning_rate": 2.828811305736743e-09, + "loss": 1.8610620498657227, + "step": 1298 + }, + { + "epoch": 1.5995690981840567, + "grad_norm": 7.09375, + "learning_rate": 2.8191230947723945e-09, + "loss": 1.883762240409851, + "step": 1300 + }, + { + "epoch": 1.6020313942751616, + "grad_norm": 14.5625, + "learning_rate": 2.809485378140893e-09, + "loss": 2.238772392272949, + "step": 1302 + }, + { + "epoch": 1.6044936903662665, + "grad_norm": 6.25, + "learning_rate": 2.7998983088346625e-09, + "loss": 2.1114282608032227, + "step": 1304 + }, + { + "epoch": 1.6069559864573715, + "grad_norm": 1.9140625, + "learning_rate": 2.7903620390421363e-09, + "loss": 1.6002395153045654, + "step": 1306 + }, + { + "epoch": 1.6094182825484764, + "grad_norm": 9.4375, + "learning_rate": 2.7808767201453376e-09, + "loss": 1.6772760152816772, + "step": 1308 + }, + { + "epoch": 1.6118805786395813, + "grad_norm": 10.4375, + "learning_rate": 2.771442502717478e-09, + "loss": 2.111185073852539, + "step": 1310 + }, + { + "epoch": 1.6143428747306863, + "grad_norm": 14.125, + "learning_rate": 2.7620595365205627e-09, + "loss": 2.0705718994140625, + "step": 1312 + }, + { + "epoch": 1.6168051708217912, + "grad_norm": 4.46875, + "learning_rate": 2.752727970503024e-09, + "loss": 1.95082426071167, + "step": 1314 + }, + { + "epoch": 1.6192674669128961, + "grad_norm": 5.03125, + "learning_rate": 2.7434479527973477e-09, + "loss": 1.7210240364074707, + "step": 1316 + }, + { + "epoch": 1.621729763004001, + "grad_norm": 3.515625, + "learning_rate": 2.7342196307177214e-09, + "loss": 1.6697207689285278, + "step": 1318 + }, + { + "epoch": 1.6241920590951062, + "grad_norm": 2.65625, + "learning_rate": 2.7250431507577004e-09, + "loss": 1.4422950744628906, + "step": 1320 + }, + { + "epoch": 1.6266543551862112, + "grad_norm": 2.84375, + "learning_rate": 2.7159186585878816e-09, + "loss": 1.1386830806732178, + "step": 1322 + }, + { + "epoch": 1.629116651277316, + "grad_norm": 3.015625, + "learning_rate": 2.7068462990535863e-09, + "loss": 1.2971214056015015, + "step": 1324 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 19.875, + "learning_rate": 2.697826216172569e-09, + "loss": 1.638606309890747, + "step": 1326 + }, + { + "epoch": 1.6340412434595262, + "grad_norm": 3.109375, + "learning_rate": 2.688858553132723e-09, + "loss": 1.6914677619934082, + "step": 1328 + }, + { + "epoch": 1.636503539550631, + "grad_norm": 2.28125, + "learning_rate": 2.6799434522898126e-09, + "loss": 1.1819281578063965, + "step": 1330 + }, + { + "epoch": 1.638965835641736, + "grad_norm": 2.140625, + "learning_rate": 2.6710810551652133e-09, + "loss": 1.1034936904907227, + "step": 1332 + }, + { + "epoch": 1.641428131732841, + "grad_norm": 34.5, + "learning_rate": 2.66227150244366e-09, + "loss": 1.6707381010055542, + "step": 1334 + }, + { + "epoch": 1.643890427823946, + "grad_norm": 25.5, + "learning_rate": 2.6535149339710184e-09, + "loss": 2.70631742477417, + "step": 1336 + }, + { + "epoch": 1.6463527239150508, + "grad_norm": 30.75, + "learning_rate": 2.644811488752068e-09, + "loss": 2.4394781589508057, + "step": 1338 + }, + { + "epoch": 1.6488150200061558, + "grad_norm": 13.625, + "learning_rate": 2.636161304948286e-09, + "loss": 2.2337255477905273, + "step": 1340 + }, + { + "epoch": 1.6512773160972607, + "grad_norm": 13.0, + "learning_rate": 2.627564519875663e-09, + "loss": 2.295048236846924, + "step": 1342 + }, + { + "epoch": 1.6537396121883656, + "grad_norm": 20.0, + "learning_rate": 2.6190212700025183e-09, + "loss": 2.110807418823242, + "step": 1344 + }, + { + "epoch": 1.6562019082794706, + "grad_norm": 4.84375, + "learning_rate": 2.6105316909473364e-09, + "loss": 1.8732104301452637, + "step": 1346 + }, + { + "epoch": 1.6586642043705755, + "grad_norm": 8.125, + "learning_rate": 2.6020959174766106e-09, + "loss": 1.9254186153411865, + "step": 1348 + }, + { + "epoch": 1.6611265004616804, + "grad_norm": 6.15625, + "learning_rate": 2.5937140835027097e-09, + "loss": 1.8715019226074219, + "step": 1350 + }, + { + "epoch": 1.6635887965527854, + "grad_norm": 9.8125, + "learning_rate": 2.5853863220817436e-09, + "loss": 1.9434764385223389, + "step": 1352 + }, + { + "epoch": 1.6660510926438903, + "grad_norm": 5.25, + "learning_rate": 2.577112765411459e-09, + "loss": 2.207705497741699, + "step": 1354 + }, + { + "epoch": 1.6685133887349952, + "grad_norm": 12.625, + "learning_rate": 2.568893544829136e-09, + "loss": 1.880719780921936, + "step": 1356 + }, + { + "epoch": 1.6709756848261004, + "grad_norm": 9.5625, + "learning_rate": 2.560728790809509e-09, + "loss": 1.8875178098678589, + "step": 1358 + }, + { + "epoch": 1.6734379809172053, + "grad_norm": 5.4375, + "learning_rate": 2.5526186329626865e-09, + "loss": 1.6963284015655518, + "step": 1360 + }, + { + "epoch": 1.6759002770083102, + "grad_norm": 5.90625, + "learning_rate": 2.5445632000320995e-09, + "loss": 1.791224718093872, + "step": 1362 + }, + { + "epoch": 1.6783625730994152, + "grad_norm": 3.890625, + "learning_rate": 2.5365626198924598e-09, + "loss": 1.6278963088989258, + "step": 1364 + }, + { + "epoch": 1.6808248691905203, + "grad_norm": 3.375, + "learning_rate": 2.528617019547723e-09, + "loss": 1.3288359642028809, + "step": 1366 + }, + { + "epoch": 1.6832871652816253, + "grad_norm": 9.0625, + "learning_rate": 2.5207265251290823e-09, + "loss": 1.6888291835784912, + "step": 1368 + }, + { + "epoch": 1.6857494613727302, + "grad_norm": 13.375, + "learning_rate": 2.512891261892955e-09, + "loss": 2.285770893096924, + "step": 1370 + }, + { + "epoch": 1.6882117574638351, + "grad_norm": 3.1875, + "learning_rate": 2.505111354219002e-09, + "loss": 1.671492099761963, + "step": 1372 + }, + { + "epoch": 1.69067405355494, + "grad_norm": 6.25, + "learning_rate": 2.49738692560815e-09, + "loss": 1.5187859535217285, + "step": 1374 + }, + { + "epoch": 1.693136349646045, + "grad_norm": 7.0625, + "learning_rate": 2.4897180986806322e-09, + "loss": 1.9461727142333984, + "step": 1376 + }, + { + "epoch": 1.69559864573715, + "grad_norm": 7.53125, + "learning_rate": 2.482104995174044e-09, + "loss": 1.8825700283050537, + "step": 1378 + }, + { + "epoch": 1.6980609418282548, + "grad_norm": 5.28125, + "learning_rate": 2.474547735941405e-09, + "loss": 1.8659740686416626, + "step": 1380 + }, + { + "epoch": 1.7005232379193598, + "grad_norm": 5.59375, + "learning_rate": 2.4670464409492447e-09, + "loss": 1.7924315929412842, + "step": 1382 + }, + { + "epoch": 1.7029855340104647, + "grad_norm": 13.4375, + "learning_rate": 2.459601229275697e-09, + "loss": 1.9610867500305176, + "step": 1384 + }, + { + "epoch": 1.7054478301015696, + "grad_norm": 8.5, + "learning_rate": 2.4522122191086104e-09, + "loss": 1.836552381515503, + "step": 1386 + }, + { + "epoch": 1.7079101261926746, + "grad_norm": 8.8125, + "learning_rate": 2.4448795277436698e-09, + "loss": 1.7403874397277832, + "step": 1388 + }, + { + "epoch": 1.7103724222837795, + "grad_norm": 4.625, + "learning_rate": 2.4376032715825386e-09, + "loss": 1.5626749992370605, + "step": 1390 + }, + { + "epoch": 1.7128347183748844, + "grad_norm": 3.625, + "learning_rate": 2.4303835661310066e-09, + "loss": 1.3395249843597412, + "step": 1392 + }, + { + "epoch": 1.7152970144659896, + "grad_norm": 13.125, + "learning_rate": 2.4232205259971584e-09, + "loss": 1.0826705694198608, + "step": 1394 + }, + { + "epoch": 1.7177593105570945, + "grad_norm": 12.875, + "learning_rate": 2.4161142648895533e-09, + "loss": 1.810969352722168, + "step": 1396 + }, + { + "epoch": 1.7202216066481995, + "grad_norm": 9.0, + "learning_rate": 2.4090648956154223e-09, + "loss": 2.039994239807129, + "step": 1398 + }, + { + "epoch": 1.7226839027393044, + "grad_norm": 7.625, + "learning_rate": 2.402072530078876e-09, + "loss": 1.8878741264343262, + "step": 1400 + }, + { + "epoch": 1.7251461988304093, + "grad_norm": 4.5625, + "learning_rate": 2.395137279279127e-09, + "loss": 1.8724961280822754, + "step": 1402 + }, + { + "epoch": 1.7276084949215145, + "grad_norm": 4.0, + "learning_rate": 2.3882592533087286e-09, + "loss": 1.9301607608795166, + "step": 1404 + }, + { + "epoch": 1.7300707910126194, + "grad_norm": 24.125, + "learning_rate": 2.3814385613518284e-09, + "loss": 1.6868252754211426, + "step": 1406 + }, + { + "epoch": 1.7325330871037243, + "grad_norm": 6.78125, + "learning_rate": 2.374675311682433e-09, + "loss": 1.7913291454315186, + "step": 1408 + }, + { + "epoch": 1.7349953831948293, + "grad_norm": 2.59375, + "learning_rate": 2.3679696116626936e-09, + "loss": 1.5577332973480225, + "step": 1410 + }, + { + "epoch": 1.7374576792859342, + "grad_norm": 4.875, + "learning_rate": 2.3613215677411944e-09, + "loss": 1.5362656116485596, + "step": 1412 + }, + { + "epoch": 1.7399199753770391, + "grad_norm": 1.75, + "learning_rate": 2.354731285451268e-09, + "loss": 1.5279173851013184, + "step": 1414 + }, + { + "epoch": 1.742382271468144, + "grad_norm": 10.6875, + "learning_rate": 2.348198869409322e-09, + "loss": 1.696439504623413, + "step": 1416 + }, + { + "epoch": 1.744844567559249, + "grad_norm": 18.5, + "learning_rate": 2.341724423313171e-09, + "loss": 2.554849147796631, + "step": 1418 + }, + { + "epoch": 1.747306863650354, + "grad_norm": 13.0625, + "learning_rate": 2.335308049940398e-09, + "loss": 2.1925854682922363, + "step": 1420 + }, + { + "epoch": 1.7497691597414589, + "grad_norm": 3.46875, + "learning_rate": 2.328949851146718e-09, + "loss": 1.593017816543579, + "step": 1422 + }, + { + "epoch": 1.7522314558325638, + "grad_norm": 4.0, + "learning_rate": 2.322649927864363e-09, + "loss": 1.229564905166626, + "step": 1424 + }, + { + "epoch": 1.7546937519236687, + "grad_norm": 15.6875, + "learning_rate": 2.3164083801004798e-09, + "loss": 1.9423973560333252, + "step": 1426 + }, + { + "epoch": 1.7571560480147737, + "grad_norm": 5.75, + "learning_rate": 2.3102253069355413e-09, + "loss": 2.0594370365142822, + "step": 1428 + }, + { + "epoch": 1.7596183441058786, + "grad_norm": 6.53125, + "learning_rate": 2.3041008065217754e-09, + "loss": 1.9393881559371948, + "step": 1430 + }, + { + "epoch": 1.7620806401969837, + "grad_norm": 7.90625, + "learning_rate": 2.298034976081607e-09, + "loss": 1.8895037174224854, + "step": 1432 + }, + { + "epoch": 1.7645429362880887, + "grad_norm": 8.125, + "learning_rate": 2.292027911906112e-09, + "loss": 1.7276127338409424, + "step": 1434 + }, + { + "epoch": 1.7670052323791936, + "grad_norm": 6.125, + "learning_rate": 2.286079709353491e-09, + "loss": 1.5182913541793823, + "step": 1436 + }, + { + "epoch": 1.7694675284702985, + "grad_norm": 8.6875, + "learning_rate": 2.2801904628475545e-09, + "loss": 1.845018982887268, + "step": 1438 + }, + { + "epoch": 1.7719298245614035, + "grad_norm": 13.0625, + "learning_rate": 2.274360265876225e-09, + "loss": 2.4570071697235107, + "step": 1440 + }, + { + "epoch": 1.7743921206525086, + "grad_norm": 6.53125, + "learning_rate": 2.268589210990052e-09, + "loss": 1.779624342918396, + "step": 1442 + }, + { + "epoch": 1.7768544167436136, + "grad_norm": 11.0, + "learning_rate": 2.262877389800745e-09, + "loss": 1.5919256210327148, + "step": 1444 + }, + { + "epoch": 1.7793167128347185, + "grad_norm": 5.96875, + "learning_rate": 2.257224892979714e-09, + "loss": 2.230924129486084, + "step": 1446 + }, + { + "epoch": 1.7817790089258234, + "grad_norm": 2.296875, + "learning_rate": 2.2516318102566373e-09, + "loss": 1.6709070205688477, + "step": 1448 + }, + { + "epoch": 1.7842413050169283, + "grad_norm": 7.1875, + "learning_rate": 2.24609823041803e-09, + "loss": 1.5729997158050537, + "step": 1450 + }, + { + "epoch": 1.7867036011080333, + "grad_norm": 23.125, + "learning_rate": 2.240624241305841e-09, + "loss": 2.22371768951416, + "step": 1452 + }, + { + "epoch": 1.7891658971991382, + "grad_norm": 9.0, + "learning_rate": 2.2352099298160545e-09, + "loss": 1.9387813806533813, + "step": 1454 + }, + { + "epoch": 1.7916281932902431, + "grad_norm": 6.96875, + "learning_rate": 2.2298553818973096e-09, + "loss": 1.6565120220184326, + "step": 1456 + }, + { + "epoch": 1.794090489381348, + "grad_norm": 24.0, + "learning_rate": 2.2245606825495408e-09, + "loss": 1.6322071552276611, + "step": 1458 + }, + { + "epoch": 1.796552785472453, + "grad_norm": 6.0625, + "learning_rate": 2.219325915822624e-09, + "loss": 2.004333257675171, + "step": 1460 + }, + { + "epoch": 1.799015081563558, + "grad_norm": 11.625, + "learning_rate": 2.214151164815044e-09, + "loss": 2.2140424251556396, + "step": 1462 + }, + { + "epoch": 1.8014773776546629, + "grad_norm": 5.90625, + "learning_rate": 2.2090365116725787e-09, + "loss": 1.876783847808838, + "step": 1464 + }, + { + "epoch": 1.8039396737457678, + "grad_norm": 2.921875, + "learning_rate": 2.203982037586988e-09, + "loss": 1.5903770923614502, + "step": 1466 + }, + { + "epoch": 1.8064019698368727, + "grad_norm": 5.78125, + "learning_rate": 2.1989878227947297e-09, + "loss": 1.4093436002731323, + "step": 1468 + }, + { + "epoch": 1.8088642659279779, + "grad_norm": 5.4375, + "learning_rate": 2.1940539465756848e-09, + "loss": 1.5252522230148315, + "step": 1470 + }, + { + "epoch": 1.8113265620190828, + "grad_norm": 11.1875, + "learning_rate": 2.1891804872519013e-09, + "loss": 1.6333411931991577, + "step": 1472 + }, + { + "epoch": 1.8137888581101878, + "grad_norm": 12.125, + "learning_rate": 2.1843675221863456e-09, + "loss": 2.395686626434326, + "step": 1474 + }, + { + "epoch": 1.8162511542012927, + "grad_norm": 6.40625, + "learning_rate": 2.179615127781678e-09, + "loss": 2.011446475982666, + "step": 1476 + }, + { + "epoch": 1.8187134502923976, + "grad_norm": 27.75, + "learning_rate": 2.1749233794790424e-09, + "loss": 1.9201209545135498, + "step": 1478 + }, + { + "epoch": 1.8211757463835028, + "grad_norm": 8.75, + "learning_rate": 2.1702923517568608e-09, + "loss": 1.9654639959335327, + "step": 1480 + }, + { + "epoch": 1.8236380424746077, + "grad_norm": 14.4375, + "learning_rate": 2.1657221181296596e-09, + "loss": 2.4255740642547607, + "step": 1482 + }, + { + "epoch": 1.8261003385657126, + "grad_norm": 5.46875, + "learning_rate": 2.161212751146898e-09, + "loss": 2.1441259384155273, + "step": 1484 + }, + { + "epoch": 1.8285626346568176, + "grad_norm": 3.03125, + "learning_rate": 2.1567643223918164e-09, + "loss": 1.5081210136413574, + "step": 1486 + }, + { + "epoch": 1.8310249307479225, + "grad_norm": 3.15625, + "learning_rate": 2.1523769024803013e-09, + "loss": 1.219706416130066, + "step": 1488 + }, + { + "epoch": 1.8334872268390274, + "grad_norm": 3.296875, + "learning_rate": 2.148050561059763e-09, + "loss": 1.3154406547546387, + "step": 1490 + }, + { + "epoch": 1.8359495229301324, + "grad_norm": 4.84375, + "learning_rate": 2.1437853668080316e-09, + "loss": 1.663912057876587, + "step": 1492 + }, + { + "epoch": 1.8384118190212373, + "grad_norm": 5.5, + "learning_rate": 2.139581387432267e-09, + "loss": 1.9996685981750488, + "step": 1494 + }, + { + "epoch": 1.8408741151123422, + "grad_norm": 9.125, + "learning_rate": 2.135438689667882e-09, + "loss": 2.1527910232543945, + "step": 1496 + }, + { + "epoch": 1.8433364112034472, + "grad_norm": 5.4375, + "learning_rate": 2.1313573392774835e-09, + "loss": 2.181238889694214, + "step": 1498 + }, + { + "epoch": 1.845798707294552, + "grad_norm": 26.625, + "learning_rate": 2.1273374010498306e-09, + "loss": 2.07470965385437, + "step": 1500 + }, + { + "epoch": 1.848261003385657, + "grad_norm": 7.375, + "learning_rate": 2.123378938798803e-09, + "loss": 2.180095672607422, + "step": 1502 + }, + { + "epoch": 1.850723299476762, + "grad_norm": 10.25, + "learning_rate": 2.119482015362392e-09, + "loss": 2.023428440093994, + "step": 1504 + }, + { + "epoch": 1.8531855955678669, + "grad_norm": 6.03125, + "learning_rate": 2.1156466926016974e-09, + "loss": 1.9310382604599, + "step": 1506 + }, + { + "epoch": 1.855647891658972, + "grad_norm": 10.9375, + "learning_rate": 2.1118730313999516e-09, + "loss": 1.7410407066345215, + "step": 1508 + }, + { + "epoch": 1.858110187750077, + "grad_norm": 14.9375, + "learning_rate": 2.108161091661548e-09, + "loss": 2.463320732116699, + "step": 1510 + }, + { + "epoch": 1.860572483841182, + "grad_norm": 10.1875, + "learning_rate": 2.1045109323110943e-09, + "loss": 2.164478302001953, + "step": 1512 + }, + { + "epoch": 1.8630347799322868, + "grad_norm": 11.0, + "learning_rate": 2.1009226112924727e-09, + "loss": 2.304097890853882, + "step": 1514 + }, + { + "epoch": 1.8654970760233918, + "grad_norm": 11.4375, + "learning_rate": 2.097396185567926e-09, + "loss": 2.384671688079834, + "step": 1516 + }, + { + "epoch": 1.867959372114497, + "grad_norm": 11.875, + "learning_rate": 2.0939317111171467e-09, + "loss": 1.752406358718872, + "step": 1518 + }, + { + "epoch": 1.8704216682056019, + "grad_norm": 19.875, + "learning_rate": 2.090529242936392e-09, + "loss": 1.5490081310272217, + "step": 1520 + }, + { + "epoch": 1.8728839642967068, + "grad_norm": 5.90625, + "learning_rate": 2.087188835037611e-09, + "loss": 2.0984854698181152, + "step": 1522 + }, + { + "epoch": 1.8753462603878117, + "grad_norm": 2.890625, + "learning_rate": 2.0839105404475866e-09, + "loss": 1.6633992195129395, + "step": 1524 + }, + { + "epoch": 1.8778085564789166, + "grad_norm": 3.6875, + "learning_rate": 2.080694411207094e-09, + "loss": 1.4255918264389038, + "step": 1526 + }, + { + "epoch": 1.8802708525700216, + "grad_norm": 4.84375, + "learning_rate": 2.0775404983700724e-09, + "loss": 1.845369577407837, + "step": 1528 + }, + { + "epoch": 1.8827331486611265, + "grad_norm": 4.40625, + "learning_rate": 2.074448852002819e-09, + "loss": 1.7371915578842163, + "step": 1530 + }, + { + "epoch": 1.8851954447522314, + "grad_norm": 13.3125, + "learning_rate": 2.07141952118319e-09, + "loss": 1.805029034614563, + "step": 1532 + }, + { + "epoch": 1.8876577408433364, + "grad_norm": 6.65625, + "learning_rate": 2.068452553999822e-09, + "loss": 2.060267448425293, + "step": 1534 + }, + { + "epoch": 1.8901200369344413, + "grad_norm": 3.625, + "learning_rate": 2.065547997551375e-09, + "loss": 1.525952935218811, + "step": 1536 + }, + { + "epoch": 1.8925823330255462, + "grad_norm": 7.46875, + "learning_rate": 2.062705897945773e-09, + "loss": 1.4751570224761963, + "step": 1538 + }, + { + "epoch": 1.8950446291166512, + "grad_norm": 5.0625, + "learning_rate": 2.059926300299483e-09, + "loss": 1.6626102924346924, + "step": 1540 + }, + { + "epoch": 1.897506925207756, + "grad_norm": 5.65625, + "learning_rate": 2.057209248736792e-09, + "loss": 1.2773092985153198, + "step": 1542 + }, + { + "epoch": 1.899969221298861, + "grad_norm": 13.0625, + "learning_rate": 2.054554786389111e-09, + "loss": 1.6589457988739014, + "step": 1544 + }, + { + "epoch": 1.9024315173899662, + "grad_norm": 6.25, + "learning_rate": 2.051962955394286e-09, + "loss": 1.9413405656814575, + "step": 1546 + }, + { + "epoch": 1.9048938134810711, + "grad_norm": 10.25, + "learning_rate": 2.0494337968959344e-09, + "loss": 1.6395326852798462, + "step": 1548 + }, + { + "epoch": 1.907356109572176, + "grad_norm": 5.21875, + "learning_rate": 2.0469673510427865e-09, + "loss": 1.9667985439300537, + "step": 1550 + }, + { + "epoch": 1.909818405663281, + "grad_norm": 4.90625, + "learning_rate": 2.0445636569880505e-09, + "loss": 1.8468351364135742, + "step": 1552 + }, + { + "epoch": 1.912280701754386, + "grad_norm": 11.25, + "learning_rate": 2.0422227528887923e-09, + "loss": 2.118504524230957, + "step": 1554 + }, + { + "epoch": 1.914742997845491, + "grad_norm": 10.375, + "learning_rate": 2.0399446759053274e-09, + "loss": 2.0504517555236816, + "step": 1556 + }, + { + "epoch": 1.917205293936596, + "grad_norm": 5.25, + "learning_rate": 2.037729462200633e-09, + "loss": 1.661136507987976, + "step": 1558 + }, + { + "epoch": 1.919667590027701, + "grad_norm": 6.03125, + "learning_rate": 2.0355771469397726e-09, + "loss": 1.5671418905258179, + "step": 1560 + }, + { + "epoch": 1.9221298861188059, + "grad_norm": 5.34375, + "learning_rate": 2.0334877642893373e-09, + "loss": 2.0463449954986572, + "step": 1562 + }, + { + "epoch": 1.9245921822099108, + "grad_norm": 3.96875, + "learning_rate": 2.0314613474169064e-09, + "loss": 1.7543866634368896, + "step": 1564 + }, + { + "epoch": 1.9270544783010157, + "grad_norm": 23.375, + "learning_rate": 2.029497928490516e-09, + "loss": 1.5825181007385254, + "step": 1566 + }, + { + "epoch": 1.9295167743921207, + "grad_norm": 8.6875, + "learning_rate": 2.027597538678154e-09, + "loss": 1.5585989952087402, + "step": 1568 + }, + { + "epoch": 1.9319790704832256, + "grad_norm": 10.0625, + "learning_rate": 2.0257602081472603e-09, + "loss": 1.5373648405075073, + "step": 1570 + }, + { + "epoch": 1.9344413665743305, + "grad_norm": 3.296875, + "learning_rate": 2.023985966064252e-09, + "loss": 1.638904333114624, + "step": 1572 + }, + { + "epoch": 1.9369036626654355, + "grad_norm": 2.71875, + "learning_rate": 2.0222748405940567e-09, + "loss": 1.3301455974578857, + "step": 1574 + }, + { + "epoch": 1.9393659587565404, + "grad_norm": 2.734375, + "learning_rate": 2.0206268588996686e-09, + "loss": 1.1727893352508545, + "step": 1576 + }, + { + "epoch": 1.9418282548476453, + "grad_norm": 4.46875, + "learning_rate": 2.019042047141714e-09, + "loss": 1.2285372018814087, + "step": 1578 + }, + { + "epoch": 1.9442905509387503, + "grad_norm": 5.0625, + "learning_rate": 2.0175204304780413e-09, + "loss": 1.5906985998153687, + "step": 1580 + }, + { + "epoch": 1.9467528470298552, + "grad_norm": 18.875, + "learning_rate": 2.016062033063314e-09, + "loss": 1.8927161693572998, + "step": 1582 + }, + { + "epoch": 1.9492151431209603, + "grad_norm": 11.4375, + "learning_rate": 2.0146668780486356e-09, + "loss": 2.0817370414733887, + "step": 1584 + }, + { + "epoch": 1.9516774392120653, + "grad_norm": 8.4375, + "learning_rate": 2.0133349875811752e-09, + "loss": 2.1541638374328613, + "step": 1586 + }, + { + "epoch": 1.9541397353031702, + "grad_norm": 6.03125, + "learning_rate": 2.0120663828038197e-09, + "loss": 2.136171340942383, + "step": 1588 + }, + { + "epoch": 1.9566020313942751, + "grad_norm": 8.8125, + "learning_rate": 2.010861083854838e-09, + "loss": 2.047274112701416, + "step": 1590 + }, + { + "epoch": 1.95906432748538, + "grad_norm": 5.4375, + "learning_rate": 2.009719109867558e-09, + "loss": 2.093939781188965, + "step": 1592 + }, + { + "epoch": 1.9615266235764852, + "grad_norm": 8.0625, + "learning_rate": 2.0086404789700686e-09, + "loss": 1.9545447826385498, + "step": 1594 + }, + { + "epoch": 1.9639889196675901, + "grad_norm": 4.03125, + "learning_rate": 2.0076252082849266e-09, + "loss": 1.710350751876831, + "step": 1596 + }, + { + "epoch": 1.966451215758695, + "grad_norm": 9.8125, + "learning_rate": 2.006673313928888e-09, + "loss": 1.6602602005004883, + "step": 1598 + }, + { + "epoch": 1.9689135118498, + "grad_norm": 6.96875, + "learning_rate": 2.0057848110126513e-09, + "loss": 2.073413848876953, + "step": 1600 + }, + { + "epoch": 1.971375807940905, + "grad_norm": 18.75, + "learning_rate": 2.0049597136406157e-09, + "loss": 2.155198574066162, + "step": 1602 + }, + { + "epoch": 1.9738381040320099, + "grad_norm": 7.4375, + "learning_rate": 2.004198034910662e-09, + "loss": 2.1142520904541016, + "step": 1604 + }, + { + "epoch": 1.9763004001231148, + "grad_norm": 2.6875, + "learning_rate": 2.003499786913938e-09, + "loss": 1.6299633979797363, + "step": 1606 + }, + { + "epoch": 1.9787626962142197, + "grad_norm": 11.3125, + "learning_rate": 2.0028649807346742e-09, + "loss": 1.5626764297485352, + "step": 1608 + }, + { + "epoch": 1.9812249923053247, + "grad_norm": 16.875, + "learning_rate": 2.0022936264500017e-09, + "loss": 2.2909412384033203, + "step": 1610 + }, + { + "epoch": 1.9836872883964296, + "grad_norm": 11.25, + "learning_rate": 2.0017857331297935e-09, + "loss": 2.1796622276306152, + "step": 1612 + }, + { + "epoch": 1.9861495844875345, + "grad_norm": 5.375, + "learning_rate": 2.001341308836524e-09, + "loss": 1.9472308158874512, + "step": 1614 + }, + { + "epoch": 1.9886118805786395, + "grad_norm": 8.5625, + "learning_rate": 2.000960360625136e-09, + "loss": 1.743130087852478, + "step": 1616 + }, + { + "epoch": 1.9910741766697444, + "grad_norm": 10.1875, + "learning_rate": 2.0006428945429335e-09, + "loss": 1.43598210811615, + "step": 1618 + }, + { + "epoch": 1.9935364727608493, + "grad_norm": 12.8125, + "learning_rate": 2.0003889156294813e-09, + "loss": 1.9119551181793213, + "step": 1620 + }, + { + "epoch": 1.9959987688519545, + "grad_norm": 5.71875, + "learning_rate": 2.0001984279165285e-09, + "loss": 2.036318302154541, + "step": 1622 + }, + { + "epoch": 1.9984610649430594, + "grad_norm": 5.28125, + "learning_rate": 2.0000714344279417e-09, + "loss": 1.577465295791626, + "step": 1624 + }, + { + "epoch": 2.0, + "grad_norm": 3.578125, + "learning_rate": 2.00000793717966e-09, + "loss": 1.1681241989135742, + "step": 1626 + }, + { + "epoch": 2.0, + "step": 1626, + "total_flos": 2.5753569883429274e+18, + "train_loss": 1.8335715001506265, + "train_runtime": 15477.0683, + "train_samples_per_second": 1.679, + "train_steps_per_second": 0.105 + } + ], + "logging_steps": 2, + "max_steps": 1626, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.5753569883429274e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}