diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10396 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9974657881398885, + "eval_steps": 500, + "global_step": 2958, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020273694880892043, + "grad_norm": 57.71497237604614, + "learning_rate": 3.3783783783783786e-08, + "loss": 1.05, + "step": 2 + }, + { + "epoch": 0.0040547389761784085, + "grad_norm": 175.24371842801014, + "learning_rate": 1.0135135135135137e-07, + "loss": 2.1391, + "step": 4 + }, + { + "epoch": 0.006082108464267613, + "grad_norm": 66.9169242516532, + "learning_rate": 1.6891891891891894e-07, + "loss": 0.8946, + "step": 6 + }, + { + "epoch": 0.008109477952356817, + "grad_norm": 50.83318029761317, + "learning_rate": 2.3648648648648652e-07, + "loss": 1.7983, + "step": 8 + }, + { + "epoch": 0.01013684744044602, + "grad_norm": 34.5945384928232, + "learning_rate": 3.040540540540541e-07, + "loss": 0.9894, + "step": 10 + }, + { + "epoch": 0.012164216928535226, + "grad_norm": 60.896121116332374, + "learning_rate": 3.716216216216217e-07, + "loss": 1.2491, + "step": 12 + }, + { + "epoch": 0.01419158641662443, + "grad_norm": 50.97590219551692, + "learning_rate": 4.3918918918918923e-07, + "loss": 1.5961, + "step": 14 + }, + { + "epoch": 0.016218955904713634, + "grad_norm": 88.6208233741677, + "learning_rate": 5.067567567567568e-07, + "loss": 1.0017, + "step": 16 + }, + { + "epoch": 0.01824632539280284, + "grad_norm": 69.50908421063882, + "learning_rate": 5.743243243243245e-07, + "loss": 2.0422, + "step": 18 + }, + { + "epoch": 0.02027369488089204, + "grad_norm": 34.39797495781003, + "learning_rate": 6.418918918918919e-07, + "loss": 1.2094, + "step": 20 + }, + { + "epoch": 0.022301064368981247, + "grad_norm": 73.77608109153212, + "learning_rate": 7.094594594594595e-07, + "loss": 1.3302, + "step": 22 + }, + { + "epoch": 0.024328433857070453, + "grad_norm": 36.63064315321084, + "learning_rate": 7.770270270270271e-07, + "loss": 1.7936, + "step": 24 + }, + { + "epoch": 0.026355803345159655, + "grad_norm": 54.42741841422907, + "learning_rate": 8.445945945945947e-07, + "loss": 1.4706, + "step": 26 + }, + { + "epoch": 0.02838317283324886, + "grad_norm": 40.63459373955391, + "learning_rate": 9.121621621621622e-07, + "loss": 1.5822, + "step": 28 + }, + { + "epoch": 0.030410542321338063, + "grad_norm": 23.914568439682927, + "learning_rate": 9.797297297297298e-07, + "loss": 0.9685, + "step": 30 + }, + { + "epoch": 0.03243791180942727, + "grad_norm": 19.107237912680137, + "learning_rate": 1.0472972972972973e-06, + "loss": 0.7838, + "step": 32 + }, + { + "epoch": 0.03446528129751647, + "grad_norm": 21.414976018923056, + "learning_rate": 1.114864864864865e-06, + "loss": 0.7055, + "step": 34 + }, + { + "epoch": 0.03649265078560568, + "grad_norm": 23.189665605286294, + "learning_rate": 1.1824324324324326e-06, + "loss": 1.013, + "step": 36 + }, + { + "epoch": 0.03852002027369488, + "grad_norm": 23.931398594512604, + "learning_rate": 1.25e-06, + "loss": 0.809, + "step": 38 + }, + { + "epoch": 0.04054738976178408, + "grad_norm": 35.98120953689529, + "learning_rate": 1.3175675675675676e-06, + "loss": 1.1313, + "step": 40 + }, + { + "epoch": 0.04257475924987329, + "grad_norm": 42.21015444243768, + "learning_rate": 1.3851351351351352e-06, + "loss": 1.4629, + "step": 42 + }, + { + "epoch": 0.044602128737962494, + "grad_norm": 21.629546263354126, + "learning_rate": 1.4527027027027027e-06, + "loss": 0.9537, + "step": 44 + }, + { + "epoch": 0.0466294982260517, + "grad_norm": 15.512485004660848, + "learning_rate": 1.5202702702702704e-06, + "loss": 0.4527, + "step": 46 + }, + { + "epoch": 0.048656867714140906, + "grad_norm": 22.61560290672792, + "learning_rate": 1.5878378378378378e-06, + "loss": 0.859, + "step": 48 + }, + { + "epoch": 0.05068423720223011, + "grad_norm": 12.69707250020947, + "learning_rate": 1.6554054054054055e-06, + "loss": 0.511, + "step": 50 + }, + { + "epoch": 0.05271160669031931, + "grad_norm": 7.777471076349586, + "learning_rate": 1.722972972972973e-06, + "loss": 0.7791, + "step": 52 + }, + { + "epoch": 0.05473897617840851, + "grad_norm": 33.636962824047394, + "learning_rate": 1.7905405405405408e-06, + "loss": 1.122, + "step": 54 + }, + { + "epoch": 0.05676634566649772, + "grad_norm": 24.948721741451337, + "learning_rate": 1.8581081081081081e-06, + "loss": 1.2054, + "step": 56 + }, + { + "epoch": 0.05879371515458692, + "grad_norm": 8.9857750702695, + "learning_rate": 1.925675675675676e-06, + "loss": 0.7201, + "step": 58 + }, + { + "epoch": 0.060821084642676125, + "grad_norm": 9.163282743582531, + "learning_rate": 1.9932432432432434e-06, + "loss": 0.7312, + "step": 60 + }, + { + "epoch": 0.06284845413076533, + "grad_norm": 10.303878102479521, + "learning_rate": 2.060810810810811e-06, + "loss": 0.2859, + "step": 62 + }, + { + "epoch": 0.06487582361885454, + "grad_norm": 9.796640671725454, + "learning_rate": 2.1283783783783785e-06, + "loss": 0.3271, + "step": 64 + }, + { + "epoch": 0.06690319310694375, + "grad_norm": 31.671556835861093, + "learning_rate": 2.195945945945946e-06, + "loss": 1.034, + "step": 66 + }, + { + "epoch": 0.06893056259503294, + "grad_norm": 17.524840034617448, + "learning_rate": 2.2635135135135135e-06, + "loss": 0.7687, + "step": 68 + }, + { + "epoch": 0.07095793208312215, + "grad_norm": 18.707726155430603, + "learning_rate": 2.3310810810810813e-06, + "loss": 0.6931, + "step": 70 + }, + { + "epoch": 0.07298530157121136, + "grad_norm": 14.420997396271337, + "learning_rate": 2.3986486486486486e-06, + "loss": 0.5355, + "step": 72 + }, + { + "epoch": 0.07501267105930055, + "grad_norm": 10.442083302495975, + "learning_rate": 2.4662162162162163e-06, + "loss": 0.4725, + "step": 74 + }, + { + "epoch": 0.07704004054738976, + "grad_norm": 21.53703033422474, + "learning_rate": 2.533783783783784e-06, + "loss": 0.4804, + "step": 76 + }, + { + "epoch": 0.07906741003547897, + "grad_norm": 12.755864157131738, + "learning_rate": 2.601351351351352e-06, + "loss": 0.4393, + "step": 78 + }, + { + "epoch": 0.08109477952356817, + "grad_norm": 19.628294480468362, + "learning_rate": 2.668918918918919e-06, + "loss": 0.5181, + "step": 80 + }, + { + "epoch": 0.08312214901165738, + "grad_norm": 10.522835485393575, + "learning_rate": 2.7364864864864865e-06, + "loss": 0.8953, + "step": 82 + }, + { + "epoch": 0.08514951849974658, + "grad_norm": 10.862823641859785, + "learning_rate": 2.8040540540540546e-06, + "loss": 0.2828, + "step": 84 + }, + { + "epoch": 0.08717688798783578, + "grad_norm": 19.45044510794311, + "learning_rate": 2.871621621621622e-06, + "loss": 0.6368, + "step": 86 + }, + { + "epoch": 0.08920425747592499, + "grad_norm": 12.31282166811145, + "learning_rate": 2.9391891891891893e-06, + "loss": 0.8691, + "step": 88 + }, + { + "epoch": 0.0912316269640142, + "grad_norm": 13.782186754890805, + "learning_rate": 3.006756756756757e-06, + "loss": 0.3037, + "step": 90 + }, + { + "epoch": 0.0932589964521034, + "grad_norm": 82.80155628738072, + "learning_rate": 3.0743243243243248e-06, + "loss": 0.7177, + "step": 92 + }, + { + "epoch": 0.0952863659401926, + "grad_norm": 15.465344135737018, + "learning_rate": 3.141891891891892e-06, + "loss": 0.4884, + "step": 94 + }, + { + "epoch": 0.09731373542828181, + "grad_norm": 11.4271789744156, + "learning_rate": 3.20945945945946e-06, + "loss": 0.8032, + "step": 96 + }, + { + "epoch": 0.099341104916371, + "grad_norm": 7.445598123447847, + "learning_rate": 3.277027027027027e-06, + "loss": 0.4503, + "step": 98 + }, + { + "epoch": 0.10136847440446022, + "grad_norm": 16.604764698991488, + "learning_rate": 3.3445945945945953e-06, + "loss": 0.5667, + "step": 100 + }, + { + "epoch": 0.10339584389254941, + "grad_norm": 18.29474930758402, + "learning_rate": 3.4121621621621626e-06, + "loss": 0.2678, + "step": 102 + }, + { + "epoch": 0.10542321338063862, + "grad_norm": 9.060684262029227, + "learning_rate": 3.47972972972973e-06, + "loss": 0.3155, + "step": 104 + }, + { + "epoch": 0.10745058286872783, + "grad_norm": 29.62331074104899, + "learning_rate": 3.5472972972972973e-06, + "loss": 0.5877, + "step": 106 + }, + { + "epoch": 0.10947795235681702, + "grad_norm": 9.176872734399126, + "learning_rate": 3.6148648648648655e-06, + "loss": 0.5299, + "step": 108 + }, + { + "epoch": 0.11150532184490623, + "grad_norm": 18.32007471655931, + "learning_rate": 3.6824324324324328e-06, + "loss": 0.4141, + "step": 110 + }, + { + "epoch": 0.11353269133299544, + "grad_norm": 9.537461584330389, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.5565, + "step": 112 + }, + { + "epoch": 0.11556006082108464, + "grad_norm": 8.30387704512277, + "learning_rate": 3.817567567567567e-06, + "loss": 0.31, + "step": 114 + }, + { + "epoch": 0.11758743030917385, + "grad_norm": 12.153880533710504, + "learning_rate": 3.885135135135135e-06, + "loss": 0.3692, + "step": 116 + }, + { + "epoch": 0.11961479979726306, + "grad_norm": 8.481300273022914, + "learning_rate": 3.952702702702703e-06, + "loss": 0.4338, + "step": 118 + }, + { + "epoch": 0.12164216928535225, + "grad_norm": 10.972265785356592, + "learning_rate": 4.020270270270271e-06, + "loss": 0.8845, + "step": 120 + }, + { + "epoch": 0.12366953877344146, + "grad_norm": 10.477896866173003, + "learning_rate": 4.087837837837838e-06, + "loss": 0.4021, + "step": 122 + }, + { + "epoch": 0.12569690826153065, + "grad_norm": 15.036665161478169, + "learning_rate": 4.155405405405405e-06, + "loss": 0.2777, + "step": 124 + }, + { + "epoch": 0.12772427774961986, + "grad_norm": 10.007480274763308, + "learning_rate": 4.222972972972974e-06, + "loss": 0.3663, + "step": 126 + }, + { + "epoch": 0.12975164723770907, + "grad_norm": 7.467396272962942, + "learning_rate": 4.290540540540541e-06, + "loss": 0.4724, + "step": 128 + }, + { + "epoch": 0.13177901672579828, + "grad_norm": 8.735369836535625, + "learning_rate": 4.3581081081081085e-06, + "loss": 0.346, + "step": 130 + }, + { + "epoch": 0.1338063862138875, + "grad_norm": 7.5602002396944465, + "learning_rate": 4.4256756756756754e-06, + "loss": 0.3469, + "step": 132 + }, + { + "epoch": 0.13583375570197667, + "grad_norm": 6.0860045978113035, + "learning_rate": 4.493243243243244e-06, + "loss": 0.3361, + "step": 134 + }, + { + "epoch": 0.13786112519006588, + "grad_norm": 9.873761660319566, + "learning_rate": 4.560810810810811e-06, + "loss": 0.5408, + "step": 136 + }, + { + "epoch": 0.1398884946781551, + "grad_norm": 11.87936299932976, + "learning_rate": 4.628378378378379e-06, + "loss": 0.7303, + "step": 138 + }, + { + "epoch": 0.1419158641662443, + "grad_norm": 11.109081249992263, + "learning_rate": 4.695945945945946e-06, + "loss": 0.5593, + "step": 140 + }, + { + "epoch": 0.1439432336543335, + "grad_norm": 7.053873109551752, + "learning_rate": 4.763513513513514e-06, + "loss": 0.2442, + "step": 142 + }, + { + "epoch": 0.14597060314242272, + "grad_norm": 8.749218146671545, + "learning_rate": 4.831081081081082e-06, + "loss": 0.4316, + "step": 144 + }, + { + "epoch": 0.1479979726305119, + "grad_norm": 12.918148840696446, + "learning_rate": 4.898648648648649e-06, + "loss": 0.4492, + "step": 146 + }, + { + "epoch": 0.1500253421186011, + "grad_norm": 11.698373515062539, + "learning_rate": 4.9662162162162165e-06, + "loss": 0.6551, + "step": 148 + }, + { + "epoch": 0.15205271160669032, + "grad_norm": 15.867918118919567, + "learning_rate": 5.033783783783784e-06, + "loss": 0.5306, + "step": 150 + }, + { + "epoch": 0.15408008109477953, + "grad_norm": 10.286368336802065, + "learning_rate": 5.101351351351351e-06, + "loss": 0.4458, + "step": 152 + }, + { + "epoch": 0.15610745058286873, + "grad_norm": 7.260183509093253, + "learning_rate": 5.168918918918919e-06, + "loss": 0.7211, + "step": 154 + }, + { + "epoch": 0.15813482007095794, + "grad_norm": 4.8429993824130895, + "learning_rate": 5.2364864864864875e-06, + "loss": 0.4526, + "step": 156 + }, + { + "epoch": 0.16016218955904712, + "grad_norm": 8.413102756284689, + "learning_rate": 5.304054054054054e-06, + "loss": 0.5674, + "step": 158 + }, + { + "epoch": 0.16218955904713633, + "grad_norm": 8.198344954671482, + "learning_rate": 5.371621621621622e-06, + "loss": 0.6994, + "step": 160 + }, + { + "epoch": 0.16421692853522554, + "grad_norm": 11.599839114881892, + "learning_rate": 5.43918918918919e-06, + "loss": 0.3241, + "step": 162 + }, + { + "epoch": 0.16624429802331475, + "grad_norm": 11.804379760575786, + "learning_rate": 5.506756756756757e-06, + "loss": 0.508, + "step": 164 + }, + { + "epoch": 0.16827166751140396, + "grad_norm": 5.671985754253216, + "learning_rate": 5.574324324324325e-06, + "loss": 0.1251, + "step": 166 + }, + { + "epoch": 0.17029903699949317, + "grad_norm": 7.697744452078716, + "learning_rate": 5.641891891891892e-06, + "loss": 0.285, + "step": 168 + }, + { + "epoch": 0.17232640648758235, + "grad_norm": 12.132580219019582, + "learning_rate": 5.70945945945946e-06, + "loss": 0.3558, + "step": 170 + }, + { + "epoch": 0.17435377597567156, + "grad_norm": 9.602694879929555, + "learning_rate": 5.777027027027028e-06, + "loss": 0.4231, + "step": 172 + }, + { + "epoch": 0.17638114546376077, + "grad_norm": 8.067827204275957, + "learning_rate": 5.844594594594595e-06, + "loss": 0.267, + "step": 174 + }, + { + "epoch": 0.17840851495184998, + "grad_norm": 7.3208657541856486, + "learning_rate": 5.912162162162162e-06, + "loss": 0.4358, + "step": 176 + }, + { + "epoch": 0.1804358844399392, + "grad_norm": 7.152959464499114, + "learning_rate": 5.979729729729731e-06, + "loss": 0.4197, + "step": 178 + }, + { + "epoch": 0.1824632539280284, + "grad_norm": 14.42080104663241, + "learning_rate": 6.047297297297298e-06, + "loss": 0.1406, + "step": 180 + }, + { + "epoch": 0.18449062341611758, + "grad_norm": 13.929770683283657, + "learning_rate": 6.114864864864866e-06, + "loss": 0.6484, + "step": 182 + }, + { + "epoch": 0.1865179929042068, + "grad_norm": 8.077144614856605, + "learning_rate": 6.1824324324324326e-06, + "loss": 0.5692, + "step": 184 + }, + { + "epoch": 0.188545362392296, + "grad_norm": 13.007593281958583, + "learning_rate": 6.25e-06, + "loss": 0.6547, + "step": 186 + }, + { + "epoch": 0.1905727318803852, + "grad_norm": 13.828871727729418, + "learning_rate": 6.317567567567569e-06, + "loss": 0.1902, + "step": 188 + }, + { + "epoch": 0.1926001013684744, + "grad_norm": 11.050185189206331, + "learning_rate": 6.385135135135136e-06, + "loss": 0.2152, + "step": 190 + }, + { + "epoch": 0.19462747085656362, + "grad_norm": 7.122036658135403, + "learning_rate": 6.4527027027027035e-06, + "loss": 0.2845, + "step": 192 + }, + { + "epoch": 0.1966548403446528, + "grad_norm": 8.583369508956544, + "learning_rate": 6.5202702702702704e-06, + "loss": 0.5849, + "step": 194 + }, + { + "epoch": 0.198682209832742, + "grad_norm": 12.695588340669676, + "learning_rate": 6.587837837837838e-06, + "loss": 0.5304, + "step": 196 + }, + { + "epoch": 0.20070957932083122, + "grad_norm": 16.62364734511988, + "learning_rate": 6.655405405405406e-06, + "loss": 0.4073, + "step": 198 + }, + { + "epoch": 0.20273694880892043, + "grad_norm": 15.267556110735752, + "learning_rate": 6.722972972972973e-06, + "loss": 0.4915, + "step": 200 + }, + { + "epoch": 0.20476431829700964, + "grad_norm": 15.776817443050984, + "learning_rate": 6.790540540540541e-06, + "loss": 0.7814, + "step": 202 + }, + { + "epoch": 0.20679168778509882, + "grad_norm": 20.930471132477148, + "learning_rate": 6.858108108108109e-06, + "loss": 0.3762, + "step": 204 + }, + { + "epoch": 0.20881905727318803, + "grad_norm": 6.949013031864635, + "learning_rate": 6.925675675675676e-06, + "loss": 0.477, + "step": 206 + }, + { + "epoch": 0.21084642676127724, + "grad_norm": 27.22475284477259, + "learning_rate": 6.993243243243244e-06, + "loss": 0.3902, + "step": 208 + }, + { + "epoch": 0.21287379624936645, + "grad_norm": 12.704110783602678, + "learning_rate": 7.060810810810811e-06, + "loss": 0.4046, + "step": 210 + }, + { + "epoch": 0.21490116573745566, + "grad_norm": 6.54581310466234, + "learning_rate": 7.128378378378379e-06, + "loss": 0.3561, + "step": 212 + }, + { + "epoch": 0.21692853522554487, + "grad_norm": 16.178963449355916, + "learning_rate": 7.195945945945947e-06, + "loss": 0.3577, + "step": 214 + }, + { + "epoch": 0.21895590471363405, + "grad_norm": 10.627411144699728, + "learning_rate": 7.263513513513514e-06, + "loss": 0.4152, + "step": 216 + }, + { + "epoch": 0.22098327420172326, + "grad_norm": 12.002772519811227, + "learning_rate": 7.331081081081082e-06, + "loss": 0.2347, + "step": 218 + }, + { + "epoch": 0.22301064368981247, + "grad_norm": 11.60387481979945, + "learning_rate": 7.398648648648649e-06, + "loss": 0.3964, + "step": 220 + }, + { + "epoch": 0.22503801317790167, + "grad_norm": 17.81562803296209, + "learning_rate": 7.466216216216216e-06, + "loss": 0.4558, + "step": 222 + }, + { + "epoch": 0.22706538266599088, + "grad_norm": 9.253490057828477, + "learning_rate": 7.533783783783785e-06, + "loss": 0.3625, + "step": 224 + }, + { + "epoch": 0.2290927521540801, + "grad_norm": 5.375790938934924, + "learning_rate": 7.601351351351352e-06, + "loss": 0.3614, + "step": 226 + }, + { + "epoch": 0.23112012164216927, + "grad_norm": 14.864796311104984, + "learning_rate": 7.668918918918919e-06, + "loss": 0.2304, + "step": 228 + }, + { + "epoch": 0.23314749113025848, + "grad_norm": 6.950764187611686, + "learning_rate": 7.736486486486488e-06, + "loss": 0.2817, + "step": 230 + }, + { + "epoch": 0.2351748606183477, + "grad_norm": 11.848505193532034, + "learning_rate": 7.804054054054054e-06, + "loss": 0.419, + "step": 232 + }, + { + "epoch": 0.2372022301064369, + "grad_norm": 6.86075137588588, + "learning_rate": 7.871621621621622e-06, + "loss": 0.4031, + "step": 234 + }, + { + "epoch": 0.2392295995945261, + "grad_norm": 16.660315670931247, + "learning_rate": 7.93918918918919e-06, + "loss": 0.473, + "step": 236 + }, + { + "epoch": 0.24125696908261532, + "grad_norm": 25.14800141772283, + "learning_rate": 8.006756756756757e-06, + "loss": 1.3515, + "step": 238 + }, + { + "epoch": 0.2432843385707045, + "grad_norm": 10.074407996931646, + "learning_rate": 8.074324324324325e-06, + "loss": 0.3331, + "step": 240 + }, + { + "epoch": 0.2453117080587937, + "grad_norm": 8.021781254637734, + "learning_rate": 8.141891891891893e-06, + "loss": 0.4851, + "step": 242 + }, + { + "epoch": 0.24733907754688292, + "grad_norm": 7.9836896246915545, + "learning_rate": 8.20945945945946e-06, + "loss": 0.2329, + "step": 244 + }, + { + "epoch": 0.24936644703497213, + "grad_norm": 13.45620008377975, + "learning_rate": 8.277027027027028e-06, + "loss": 0.7057, + "step": 246 + }, + { + "epoch": 0.2513938165230613, + "grad_norm": 9.218163980059352, + "learning_rate": 8.344594594594594e-06, + "loss": 0.2595, + "step": 248 + }, + { + "epoch": 0.25342118601115055, + "grad_norm": 13.713094226787678, + "learning_rate": 8.412162162162162e-06, + "loss": 0.5982, + "step": 250 + }, + { + "epoch": 0.2554485554992397, + "grad_norm": 6.064760779934345, + "learning_rate": 8.479729729729732e-06, + "loss": 0.3845, + "step": 252 + }, + { + "epoch": 0.25747592498732896, + "grad_norm": 15.110918094443782, + "learning_rate": 8.547297297297298e-06, + "loss": 0.6082, + "step": 254 + }, + { + "epoch": 0.25950329447541814, + "grad_norm": 8.734534003252048, + "learning_rate": 8.614864864864865e-06, + "loss": 0.6625, + "step": 256 + }, + { + "epoch": 0.2615306639635073, + "grad_norm": 10.156737299506661, + "learning_rate": 8.682432432432433e-06, + "loss": 0.412, + "step": 258 + }, + { + "epoch": 0.26355803345159656, + "grad_norm": 15.392924208220645, + "learning_rate": 8.750000000000001e-06, + "loss": 0.2686, + "step": 260 + }, + { + "epoch": 0.26558540293968574, + "grad_norm": 13.693476420982156, + "learning_rate": 8.817567567567569e-06, + "loss": 0.5997, + "step": 262 + }, + { + "epoch": 0.267612772427775, + "grad_norm": 7.1956128265664585, + "learning_rate": 8.885135135135136e-06, + "loss": 0.1728, + "step": 264 + }, + { + "epoch": 0.26964014191586416, + "grad_norm": 6.240817841427259, + "learning_rate": 8.952702702702704e-06, + "loss": 0.5506, + "step": 266 + }, + { + "epoch": 0.27166751140395334, + "grad_norm": 10.914919246312364, + "learning_rate": 9.02027027027027e-06, + "loss": 0.3971, + "step": 268 + }, + { + "epoch": 0.2736948808920426, + "grad_norm": 6.713794304832443, + "learning_rate": 9.087837837837838e-06, + "loss": 0.2773, + "step": 270 + }, + { + "epoch": 0.27572225038013176, + "grad_norm": 8.823189617411034, + "learning_rate": 9.155405405405406e-06, + "loss": 0.309, + "step": 272 + }, + { + "epoch": 0.277749619868221, + "grad_norm": 16.166918341097727, + "learning_rate": 9.222972972972973e-06, + "loss": 0.3743, + "step": 274 + }, + { + "epoch": 0.2797769893563102, + "grad_norm": 7.1165594243821495, + "learning_rate": 9.290540540540541e-06, + "loss": 0.368, + "step": 276 + }, + { + "epoch": 0.2818043588443994, + "grad_norm": 9.613418626661307, + "learning_rate": 9.358108108108109e-06, + "loss": 0.2808, + "step": 278 + }, + { + "epoch": 0.2838317283324886, + "grad_norm": 12.207284877069775, + "learning_rate": 9.425675675675677e-06, + "loss": 0.4245, + "step": 280 + }, + { + "epoch": 0.2858590978205778, + "grad_norm": 11.098403576690426, + "learning_rate": 9.493243243243244e-06, + "loss": 0.8999, + "step": 282 + }, + { + "epoch": 0.287886467308667, + "grad_norm": 4.053393188876914, + "learning_rate": 9.56081081081081e-06, + "loss": 0.4571, + "step": 284 + }, + { + "epoch": 0.2899138367967562, + "grad_norm": 22.82049806852992, + "learning_rate": 9.62837837837838e-06, + "loss": 0.4142, + "step": 286 + }, + { + "epoch": 0.29194120628484543, + "grad_norm": 6.724368123275483, + "learning_rate": 9.695945945945948e-06, + "loss": 0.2728, + "step": 288 + }, + { + "epoch": 0.2939685757729346, + "grad_norm": 9.43185837281203, + "learning_rate": 9.763513513513514e-06, + "loss": 0.3808, + "step": 290 + }, + { + "epoch": 0.2959959452610238, + "grad_norm": 13.325553692636865, + "learning_rate": 9.831081081081081e-06, + "loss": 0.2381, + "step": 292 + }, + { + "epoch": 0.29802331474911303, + "grad_norm": 8.238093234814245, + "learning_rate": 9.89864864864865e-06, + "loss": 0.5037, + "step": 294 + }, + { + "epoch": 0.3000506842372022, + "grad_norm": 9.737624239551796, + "learning_rate": 9.966216216216217e-06, + "loss": 0.6162, + "step": 296 + }, + { + "epoch": 0.30207805372529145, + "grad_norm": 7.898430939018367, + "learning_rate": 9.999996518041415e-06, + "loss": 0.1968, + "step": 298 + }, + { + "epoch": 0.30410542321338063, + "grad_norm": 10.877842805311195, + "learning_rate": 9.999968662401825e-06, + "loss": 0.5337, + "step": 300 + }, + { + "epoch": 0.30613279270146987, + "grad_norm": 11.58939696502365, + "learning_rate": 9.99991295127783e-06, + "loss": 0.5111, + "step": 302 + }, + { + "epoch": 0.30816016218955905, + "grad_norm": 10.39506967485703, + "learning_rate": 9.999829384979805e-06, + "loss": 0.5569, + "step": 304 + }, + { + "epoch": 0.31018753167764823, + "grad_norm": 11.323587558133639, + "learning_rate": 9.99971796397331e-06, + "loss": 0.2636, + "step": 306 + }, + { + "epoch": 0.31221490116573747, + "grad_norm": 11.799719502702628, + "learning_rate": 9.999578688879085e-06, + "loss": 0.4167, + "step": 308 + }, + { + "epoch": 0.31424227065382665, + "grad_norm": 7.1794391218768725, + "learning_rate": 9.999411560473051e-06, + "loss": 0.7276, + "step": 310 + }, + { + "epoch": 0.3162696401419159, + "grad_norm": 6.875067214365454, + "learning_rate": 9.9992165796863e-06, + "loss": 0.1851, + "step": 312 + }, + { + "epoch": 0.31829700963000507, + "grad_norm": 15.290205188440392, + "learning_rate": 9.9989937476051e-06, + "loss": 0.6901, + "step": 314 + }, + { + "epoch": 0.32032437911809425, + "grad_norm": 7.23060917268924, + "learning_rate": 9.998743065470874e-06, + "loss": 0.4944, + "step": 316 + }, + { + "epoch": 0.3223517486061835, + "grad_norm": 30.336102771746205, + "learning_rate": 9.998464534680206e-06, + "loss": 0.5218, + "step": 318 + }, + { + "epoch": 0.32437911809427267, + "grad_norm": 8.263738885889484, + "learning_rate": 9.99815815678483e-06, + "loss": 0.2897, + "step": 320 + }, + { + "epoch": 0.3264064875823619, + "grad_norm": 7.595127861155284, + "learning_rate": 9.997823933491613e-06, + "loss": 0.5737, + "step": 322 + }, + { + "epoch": 0.3284338570704511, + "grad_norm": 17.859959376851265, + "learning_rate": 9.997461866662562e-06, + "loss": 0.4749, + "step": 324 + }, + { + "epoch": 0.33046122655854027, + "grad_norm": 10.172331447889686, + "learning_rate": 9.997071958314796e-06, + "loss": 0.3419, + "step": 326 + }, + { + "epoch": 0.3324885960466295, + "grad_norm": 9.602278352001752, + "learning_rate": 9.996654210620547e-06, + "loss": 0.4016, + "step": 328 + }, + { + "epoch": 0.3345159655347187, + "grad_norm": 10.341316948477157, + "learning_rate": 9.99620862590714e-06, + "loss": 0.5868, + "step": 330 + }, + { + "epoch": 0.3365433350228079, + "grad_norm": 9.049191734306008, + "learning_rate": 9.995735206656992e-06, + "loss": 0.5219, + "step": 332 + }, + { + "epoch": 0.3385707045108971, + "grad_norm": 8.553566429511752, + "learning_rate": 9.995233955507577e-06, + "loss": 0.2685, + "step": 334 + }, + { + "epoch": 0.34059807399898634, + "grad_norm": 8.080764210851674, + "learning_rate": 9.994704875251437e-06, + "loss": 0.4775, + "step": 336 + }, + { + "epoch": 0.3426254434870755, + "grad_norm": 6.795645893746906, + "learning_rate": 9.994147968836144e-06, + "loss": 0.4429, + "step": 338 + }, + { + "epoch": 0.3446528129751647, + "grad_norm": 4.609312280436338, + "learning_rate": 9.993563239364298e-06, + "loss": 0.0988, + "step": 340 + }, + { + "epoch": 0.34668018246325394, + "grad_norm": 6.997530695789441, + "learning_rate": 9.992950690093506e-06, + "loss": 0.4347, + "step": 342 + }, + { + "epoch": 0.3487075519513431, + "grad_norm": 6.852720843138753, + "learning_rate": 9.992310324436358e-06, + "loss": 0.5367, + "step": 344 + }, + { + "epoch": 0.35073492143943236, + "grad_norm": 10.751003167835915, + "learning_rate": 9.991642145960417e-06, + "loss": 0.5198, + "step": 346 + }, + { + "epoch": 0.35276229092752154, + "grad_norm": 6.297341296208258, + "learning_rate": 9.990946158388192e-06, + "loss": 0.5235, + "step": 348 + }, + { + "epoch": 0.3547896604156107, + "grad_norm": 13.038244406955128, + "learning_rate": 9.990222365597125e-06, + "loss": 0.4189, + "step": 350 + }, + { + "epoch": 0.35681702990369996, + "grad_norm": 7.311157867034756, + "learning_rate": 9.989470771619555e-06, + "loss": 0.4288, + "step": 352 + }, + { + "epoch": 0.35884439939178914, + "grad_norm": 8.425882778203892, + "learning_rate": 9.988691380642717e-06, + "loss": 0.3915, + "step": 354 + }, + { + "epoch": 0.3608717688798784, + "grad_norm": 8.606087226960257, + "learning_rate": 9.987884197008697e-06, + "loss": 0.6008, + "step": 356 + }, + { + "epoch": 0.36289913836796756, + "grad_norm": 9.70843094018484, + "learning_rate": 9.987049225214423e-06, + "loss": 0.4785, + "step": 358 + }, + { + "epoch": 0.3649265078560568, + "grad_norm": 6.670301753653439, + "learning_rate": 9.986186469911632e-06, + "loss": 0.4274, + "step": 360 + }, + { + "epoch": 0.366953877344146, + "grad_norm": 6.309109233121169, + "learning_rate": 9.98529593590685e-06, + "loss": 0.2969, + "step": 362 + }, + { + "epoch": 0.36898124683223515, + "grad_norm": 4.201929869684632, + "learning_rate": 9.984377628161357e-06, + "loss": 0.3564, + "step": 364 + }, + { + "epoch": 0.3710086163203244, + "grad_norm": 10.740038600613573, + "learning_rate": 9.983431551791167e-06, + "loss": 0.7475, + "step": 366 + }, + { + "epoch": 0.3730359858084136, + "grad_norm": 5.713832512329265, + "learning_rate": 9.982457712066996e-06, + "loss": 0.3629, + "step": 368 + }, + { + "epoch": 0.3750633552965028, + "grad_norm": 6.6332553784059805, + "learning_rate": 9.981456114414233e-06, + "loss": 0.4439, + "step": 370 + }, + { + "epoch": 0.377090724784592, + "grad_norm": 13.166555904266172, + "learning_rate": 9.980426764412911e-06, + "loss": 0.6747, + "step": 372 + }, + { + "epoch": 0.37911809427268117, + "grad_norm": 10.118580975699498, + "learning_rate": 9.979369667797675e-06, + "loss": 0.5599, + "step": 374 + }, + { + "epoch": 0.3811454637607704, + "grad_norm": 12.013937532567617, + "learning_rate": 9.978284830457751e-06, + "loss": 0.7503, + "step": 376 + }, + { + "epoch": 0.3831728332488596, + "grad_norm": 6.784237455597211, + "learning_rate": 9.977172258436906e-06, + "loss": 0.3179, + "step": 378 + }, + { + "epoch": 0.3852002027369488, + "grad_norm": 9.627166313656874, + "learning_rate": 9.976031957933432e-06, + "loss": 0.373, + "step": 380 + }, + { + "epoch": 0.387227572225038, + "grad_norm": 10.114407078816528, + "learning_rate": 9.974863935300088e-06, + "loss": 0.4224, + "step": 382 + }, + { + "epoch": 0.38925494171312724, + "grad_norm": 4.564691667943321, + "learning_rate": 9.973668197044083e-06, + "loss": 0.3036, + "step": 384 + }, + { + "epoch": 0.3912823112012164, + "grad_norm": 26.571931673673035, + "learning_rate": 9.972444749827035e-06, + "loss": 0.6019, + "step": 386 + }, + { + "epoch": 0.3933096806893056, + "grad_norm": 7.691704434930906, + "learning_rate": 9.971193600464925e-06, + "loss": 0.3535, + "step": 388 + }, + { + "epoch": 0.39533705017739484, + "grad_norm": 6.7164909430569475, + "learning_rate": 9.969914755928077e-06, + "loss": 0.3939, + "step": 390 + }, + { + "epoch": 0.397364419665484, + "grad_norm": 6.060161022648931, + "learning_rate": 9.968608223341099e-06, + "loss": 0.2892, + "step": 392 + }, + { + "epoch": 0.39939178915357326, + "grad_norm": 39.91864528003413, + "learning_rate": 9.967274009982855e-06, + "loss": 0.4304, + "step": 394 + }, + { + "epoch": 0.40141915864166244, + "grad_norm": 7.38896891685949, + "learning_rate": 9.965912123286425e-06, + "loss": 0.3296, + "step": 396 + }, + { + "epoch": 0.4034465281297516, + "grad_norm": 9.743461874192485, + "learning_rate": 9.964522570839062e-06, + "loss": 0.4718, + "step": 398 + }, + { + "epoch": 0.40547389761784086, + "grad_norm": 9.18464122960155, + "learning_rate": 9.963105360382141e-06, + "loss": 0.625, + "step": 400 + }, + { + "epoch": 0.40750126710593004, + "grad_norm": 7.84964550468933, + "learning_rate": 9.96166049981113e-06, + "loss": 0.463, + "step": 402 + }, + { + "epoch": 0.4095286365940193, + "grad_norm": 16.01632444798913, + "learning_rate": 9.96018799717554e-06, + "loss": 0.6034, + "step": 404 + }, + { + "epoch": 0.41155600608210846, + "grad_norm": 10.784319202698658, + "learning_rate": 9.958687860678875e-06, + "loss": 0.3927, + "step": 406 + }, + { + "epoch": 0.41358337557019764, + "grad_norm": 9.272910408880783, + "learning_rate": 9.957160098678594e-06, + "loss": 0.3758, + "step": 408 + }, + { + "epoch": 0.4156107450582869, + "grad_norm": 7.866907890663986, + "learning_rate": 9.955604719686058e-06, + "loss": 0.5563, + "step": 410 + }, + { + "epoch": 0.41763811454637606, + "grad_norm": 10.77031854630572, + "learning_rate": 9.954021732366493e-06, + "loss": 0.7904, + "step": 412 + }, + { + "epoch": 0.4196654840344653, + "grad_norm": 7.4589088504795376, + "learning_rate": 9.952411145538925e-06, + "loss": 0.5441, + "step": 414 + }, + { + "epoch": 0.4216928535225545, + "grad_norm": 7.355655623043487, + "learning_rate": 9.950772968176148e-06, + "loss": 0.6102, + "step": 416 + }, + { + "epoch": 0.4237202230106437, + "grad_norm": 6.007671993727586, + "learning_rate": 9.949107209404664e-06, + "loss": 0.2423, + "step": 418 + }, + { + "epoch": 0.4257475924987329, + "grad_norm": 5.887458010471832, + "learning_rate": 9.947413878504636e-06, + "loss": 0.2569, + "step": 420 + }, + { + "epoch": 0.4277749619868221, + "grad_norm": 6.8258559482873915, + "learning_rate": 9.94569298490983e-06, + "loss": 0.2821, + "step": 422 + }, + { + "epoch": 0.4298023314749113, + "grad_norm": 9.83357881460339, + "learning_rate": 9.943944538207576e-06, + "loss": 0.3614, + "step": 424 + }, + { + "epoch": 0.4318297009630005, + "grad_norm": 12.182320397883984, + "learning_rate": 9.942168548138695e-06, + "loss": 0.5339, + "step": 426 + }, + { + "epoch": 0.43385707045108973, + "grad_norm": 14.972001819073878, + "learning_rate": 9.940365024597466e-06, + "loss": 0.4054, + "step": 428 + }, + { + "epoch": 0.4358844399391789, + "grad_norm": 7.007528181599826, + "learning_rate": 9.938533977631555e-06, + "loss": 0.423, + "step": 430 + }, + { + "epoch": 0.4379118094272681, + "grad_norm": 6.063921052712655, + "learning_rate": 9.936675417441968e-06, + "loss": 0.5234, + "step": 432 + }, + { + "epoch": 0.43993917891535733, + "grad_norm": 4.214791762393752, + "learning_rate": 9.934789354382986e-06, + "loss": 0.1305, + "step": 434 + }, + { + "epoch": 0.4419665484034465, + "grad_norm": 9.112037124986996, + "learning_rate": 9.932875798962114e-06, + "loss": 0.6811, + "step": 436 + }, + { + "epoch": 0.44399391789153575, + "grad_norm": 6.68329840007449, + "learning_rate": 9.930934761840025e-06, + "loss": 0.3149, + "step": 438 + }, + { + "epoch": 0.44602128737962493, + "grad_norm": 46.461533038670865, + "learning_rate": 9.928966253830492e-06, + "loss": 0.7627, + "step": 440 + }, + { + "epoch": 0.44804865686771417, + "grad_norm": 5.825914397441116, + "learning_rate": 9.92697028590033e-06, + "loss": 0.5408, + "step": 442 + }, + { + "epoch": 0.45007602635580335, + "grad_norm": 6.830909605269686, + "learning_rate": 9.924946869169341e-06, + "loss": 0.4658, + "step": 444 + }, + { + "epoch": 0.45210339584389253, + "grad_norm": 12.327126883727596, + "learning_rate": 9.922896014910247e-06, + "loss": 0.4858, + "step": 446 + }, + { + "epoch": 0.45413076533198177, + "grad_norm": 8.569104854849055, + "learning_rate": 9.920817734548625e-06, + "loss": 0.4965, + "step": 448 + }, + { + "epoch": 0.45615813482007095, + "grad_norm": 16.482269060349918, + "learning_rate": 9.918712039662851e-06, + "loss": 0.2549, + "step": 450 + }, + { + "epoch": 0.4581855043081602, + "grad_norm": 8.670004582749254, + "learning_rate": 9.916578941984028e-06, + "loss": 0.4285, + "step": 452 + }, + { + "epoch": 0.46021287379624937, + "grad_norm": 7.195718461609781, + "learning_rate": 9.914418453395927e-06, + "loss": 0.3742, + "step": 454 + }, + { + "epoch": 0.46224024328433855, + "grad_norm": 7.65644135798332, + "learning_rate": 9.91223058593491e-06, + "loss": 0.3204, + "step": 456 + }, + { + "epoch": 0.4642676127724278, + "grad_norm": 11.13844094433063, + "learning_rate": 9.910015351789877e-06, + "loss": 0.4946, + "step": 458 + }, + { + "epoch": 0.46629498226051697, + "grad_norm": 6.739471851418288, + "learning_rate": 9.907772763302187e-06, + "loss": 0.4296, + "step": 460 + }, + { + "epoch": 0.4683223517486062, + "grad_norm": 9.215359164818484, + "learning_rate": 9.905502832965603e-06, + "loss": 0.7534, + "step": 462 + }, + { + "epoch": 0.4703497212366954, + "grad_norm": 5.859864821003579, + "learning_rate": 9.9032055734262e-06, + "loss": 0.3624, + "step": 464 + }, + { + "epoch": 0.47237709072478457, + "grad_norm": 11.782879427962982, + "learning_rate": 9.900880997482313e-06, + "loss": 0.702, + "step": 466 + }, + { + "epoch": 0.4744044602128738, + "grad_norm": 22.033764844233332, + "learning_rate": 9.898529118084466e-06, + "loss": 0.4921, + "step": 468 + }, + { + "epoch": 0.476431829700963, + "grad_norm": 8.499482982580286, + "learning_rate": 9.896149948335286e-06, + "loss": 0.541, + "step": 470 + }, + { + "epoch": 0.4784591991890522, + "grad_norm": 9.191586219586245, + "learning_rate": 9.893743501489442e-06, + "loss": 0.4273, + "step": 472 + }, + { + "epoch": 0.4804865686771414, + "grad_norm": 8.645457557450287, + "learning_rate": 9.891309790953565e-06, + "loss": 0.3449, + "step": 474 + }, + { + "epoch": 0.48251393816523064, + "grad_norm": 3.8739372907776084, + "learning_rate": 9.888848830286178e-06, + "loss": 0.3533, + "step": 476 + }, + { + "epoch": 0.4845413076533198, + "grad_norm": 7.798167149903781, + "learning_rate": 9.886360633197618e-06, + "loss": 0.6998, + "step": 478 + }, + { + "epoch": 0.486568677141409, + "grad_norm": 8.141028421126585, + "learning_rate": 9.883845213549959e-06, + "loss": 0.1728, + "step": 480 + }, + { + "epoch": 0.48859604662949824, + "grad_norm": 6.665713016846487, + "learning_rate": 9.881302585356933e-06, + "loss": 0.4541, + "step": 482 + }, + { + "epoch": 0.4906234161175874, + "grad_norm": 6.613213170615581, + "learning_rate": 9.87873276278386e-06, + "loss": 0.4765, + "step": 484 + }, + { + "epoch": 0.49265078560567666, + "grad_norm": 4.038644704797649, + "learning_rate": 9.876135760147558e-06, + "loss": 0.4642, + "step": 486 + }, + { + "epoch": 0.49467815509376584, + "grad_norm": 6.880965985698831, + "learning_rate": 9.873511591916273e-06, + "loss": 0.5359, + "step": 488 + }, + { + "epoch": 0.496705524581855, + "grad_norm": 27.660081295662476, + "learning_rate": 9.87086027270959e-06, + "loss": 0.4595, + "step": 490 + }, + { + "epoch": 0.49873289406994425, + "grad_norm": 4.823215925379002, + "learning_rate": 9.868181817298358e-06, + "loss": 0.4567, + "step": 492 + }, + { + "epoch": 0.5007602635580335, + "grad_norm": 8.426433348502462, + "learning_rate": 9.865476240604606e-06, + "loss": 0.5365, + "step": 494 + }, + { + "epoch": 0.5027876330461226, + "grad_norm": 5.158537757513293, + "learning_rate": 9.86274355770146e-06, + "loss": 0.1893, + "step": 496 + }, + { + "epoch": 0.5048150025342119, + "grad_norm": 6.352603664984908, + "learning_rate": 9.859983783813051e-06, + "loss": 0.1587, + "step": 498 + }, + { + "epoch": 0.5068423720223011, + "grad_norm": 6.494111246819541, + "learning_rate": 9.85719693431445e-06, + "loss": 0.276, + "step": 500 + }, + { + "epoch": 0.5088697415103902, + "grad_norm": 7.150302222680893, + "learning_rate": 9.854383024731558e-06, + "loss": 0.3555, + "step": 502 + }, + { + "epoch": 0.5108971109984795, + "grad_norm": 11.719822675722261, + "learning_rate": 9.851542070741038e-06, + "loss": 0.3796, + "step": 504 + }, + { + "epoch": 0.5129244804865687, + "grad_norm": 8.90517990174183, + "learning_rate": 9.848674088170221e-06, + "loss": 0.3297, + "step": 506 + }, + { + "epoch": 0.5149518499746579, + "grad_norm": 10.468243825104665, + "learning_rate": 9.84577909299701e-06, + "loss": 0.3548, + "step": 508 + }, + { + "epoch": 0.516979219462747, + "grad_norm": 6.490825783286284, + "learning_rate": 9.84285710134981e-06, + "loss": 0.6709, + "step": 510 + }, + { + "epoch": 0.5190065889508363, + "grad_norm": 8.578032509685652, + "learning_rate": 9.83990812950742e-06, + "loss": 0.3401, + "step": 512 + }, + { + "epoch": 0.5210339584389255, + "grad_norm": 6.844551021415997, + "learning_rate": 9.836932193898952e-06, + "loss": 0.4493, + "step": 514 + }, + { + "epoch": 0.5230613279270147, + "grad_norm": 7.612322297400124, + "learning_rate": 9.833929311103735e-06, + "loss": 0.2997, + "step": 516 + }, + { + "epoch": 0.5250886974151039, + "grad_norm": 7.842136788255145, + "learning_rate": 9.830899497851221e-06, + "loss": 0.6955, + "step": 518 + }, + { + "epoch": 0.5271160669031931, + "grad_norm": 2.707231642251362, + "learning_rate": 9.827842771020903e-06, + "loss": 0.4374, + "step": 520 + }, + { + "epoch": 0.5291434363912824, + "grad_norm": 7.180726363066334, + "learning_rate": 9.824759147642209e-06, + "loss": 0.4919, + "step": 522 + }, + { + "epoch": 0.5311708058793715, + "grad_norm": 9.817663873895702, + "learning_rate": 9.82164864489441e-06, + "loss": 0.5083, + "step": 524 + }, + { + "epoch": 0.5331981753674607, + "grad_norm": 7.404024498196365, + "learning_rate": 9.818511280106526e-06, + "loss": 0.3729, + "step": 526 + }, + { + "epoch": 0.53522554485555, + "grad_norm": 8.343841969035678, + "learning_rate": 9.815347070757234e-06, + "loss": 0.5115, + "step": 528 + }, + { + "epoch": 0.5372529143436391, + "grad_norm": 7.810316074570741, + "learning_rate": 9.812156034474755e-06, + "loss": 0.3876, + "step": 530 + }, + { + "epoch": 0.5392802838317283, + "grad_norm": 8.211697121253763, + "learning_rate": 9.808938189036777e-06, + "loss": 0.4006, + "step": 532 + }, + { + "epoch": 0.5413076533198176, + "grad_norm": 12.064192400777866, + "learning_rate": 9.805693552370338e-06, + "loss": 0.5929, + "step": 534 + }, + { + "epoch": 0.5433350228079067, + "grad_norm": 5.492308341541983, + "learning_rate": 9.802422142551742e-06, + "loss": 0.4933, + "step": 536 + }, + { + "epoch": 0.5453623922959959, + "grad_norm": 7.712159966401665, + "learning_rate": 9.799123977806439e-06, + "loss": 0.6456, + "step": 538 + }, + { + "epoch": 0.5473897617840852, + "grad_norm": 5.8701937002689615, + "learning_rate": 9.795799076508941e-06, + "loss": 0.2484, + "step": 540 + }, + { + "epoch": 0.5494171312721744, + "grad_norm": 6.195167912973253, + "learning_rate": 9.792447457182713e-06, + "loss": 0.6318, + "step": 542 + }, + { + "epoch": 0.5514445007602635, + "grad_norm": 6.88093602400296, + "learning_rate": 9.789069138500064e-06, + "loss": 0.3182, + "step": 544 + }, + { + "epoch": 0.5534718702483528, + "grad_norm": 9.636270074160052, + "learning_rate": 9.785664139282056e-06, + "loss": 0.4464, + "step": 546 + }, + { + "epoch": 0.555499239736442, + "grad_norm": 6.900052379071231, + "learning_rate": 9.782232478498389e-06, + "loss": 0.5323, + "step": 548 + }, + { + "epoch": 0.5575266092245311, + "grad_norm": 3.9191219686290606, + "learning_rate": 9.778774175267294e-06, + "loss": 0.2739, + "step": 550 + }, + { + "epoch": 0.5595539787126204, + "grad_norm": 7.518335677245059, + "learning_rate": 9.775289248855438e-06, + "loss": 0.1516, + "step": 552 + }, + { + "epoch": 0.5615813482007096, + "grad_norm": 6.187742781536469, + "learning_rate": 9.771777718677803e-06, + "loss": 0.3271, + "step": 554 + }, + { + "epoch": 0.5636087176887988, + "grad_norm": 5.640920182354742, + "learning_rate": 9.768239604297586e-06, + "loss": 0.4409, + "step": 556 + }, + { + "epoch": 0.565636087176888, + "grad_norm": 5.332224655671003, + "learning_rate": 9.764674925426093e-06, + "loss": 0.5674, + "step": 558 + }, + { + "epoch": 0.5676634566649772, + "grad_norm": 7.5939503692803285, + "learning_rate": 9.761083701922613e-06, + "loss": 0.3737, + "step": 560 + }, + { + "epoch": 0.5696908261530664, + "grad_norm": 4.736110147121129, + "learning_rate": 9.757465953794329e-06, + "loss": 0.2678, + "step": 562 + }, + { + "epoch": 0.5717181956411556, + "grad_norm": 8.679475722637507, + "learning_rate": 9.753821701196194e-06, + "loss": 0.5864, + "step": 564 + }, + { + "epoch": 0.5737455651292448, + "grad_norm": 6.479740858056721, + "learning_rate": 9.750150964430816e-06, + "loss": 0.4279, + "step": 566 + }, + { + "epoch": 0.575772934617334, + "grad_norm": 6.1383786515028556, + "learning_rate": 9.746453763948357e-06, + "loss": 0.5355, + "step": 568 + }, + { + "epoch": 0.5778003041054233, + "grad_norm": 7.479527295586585, + "learning_rate": 9.742730120346405e-06, + "loss": 0.6914, + "step": 570 + }, + { + "epoch": 0.5798276735935124, + "grad_norm": 8.175292826083087, + "learning_rate": 9.73898005436987e-06, + "loss": 0.4967, + "step": 572 + }, + { + "epoch": 0.5818550430816016, + "grad_norm": 7.725341516682334, + "learning_rate": 9.735203586910867e-06, + "loss": 0.5052, + "step": 574 + }, + { + "epoch": 0.5838824125696909, + "grad_norm": 5.359585957466787, + "learning_rate": 9.731400739008589e-06, + "loss": 0.4603, + "step": 576 + }, + { + "epoch": 0.58590978205778, + "grad_norm": 7.747571524824843, + "learning_rate": 9.727571531849206e-06, + "loss": 0.5075, + "step": 578 + }, + { + "epoch": 0.5879371515458692, + "grad_norm": 4.719314705327792, + "learning_rate": 9.723715986765736e-06, + "loss": 0.3039, + "step": 580 + }, + { + "epoch": 0.5899645210339585, + "grad_norm": 6.7804535643528485, + "learning_rate": 9.719834125237929e-06, + "loss": 0.4223, + "step": 582 + }, + { + "epoch": 0.5919918905220476, + "grad_norm": 6.680566715157348, + "learning_rate": 9.715925968892143e-06, + "loss": 0.4342, + "step": 584 + }, + { + "epoch": 0.5940192600101368, + "grad_norm": 4.091558694632045, + "learning_rate": 9.711991539501237e-06, + "loss": 0.1728, + "step": 586 + }, + { + "epoch": 0.5960466294982261, + "grad_norm": 9.019489179878326, + "learning_rate": 9.708030858984433e-06, + "loss": 0.5648, + "step": 588 + }, + { + "epoch": 0.5980739989863153, + "grad_norm": 8.393352663911926, + "learning_rate": 9.704043949407204e-06, + "loss": 0.2846, + "step": 590 + }, + { + "epoch": 0.6001013684744044, + "grad_norm": 6.00175973377738, + "learning_rate": 9.700030832981152e-06, + "loss": 0.5592, + "step": 592 + }, + { + "epoch": 0.6021287379624937, + "grad_norm": 7.581248060720144, + "learning_rate": 9.695991532063875e-06, + "loss": 0.4116, + "step": 594 + }, + { + "epoch": 0.6041561074505829, + "grad_norm": 6.781341819213388, + "learning_rate": 9.69192606915885e-06, + "loss": 0.4828, + "step": 596 + }, + { + "epoch": 0.606183476938672, + "grad_norm": 7.7151089053267965, + "learning_rate": 9.68783446691531e-06, + "loss": 0.5524, + "step": 598 + }, + { + "epoch": 0.6082108464267613, + "grad_norm": 7.708591690469782, + "learning_rate": 9.683716748128106e-06, + "loss": 0.4718, + "step": 600 + }, + { + "epoch": 0.6102382159148505, + "grad_norm": 8.334230844559466, + "learning_rate": 9.679572935737593e-06, + "loss": 0.4543, + "step": 602 + }, + { + "epoch": 0.6122655854029397, + "grad_norm": 4.40825978835256, + "learning_rate": 9.6754030528295e-06, + "loss": 0.5914, + "step": 604 + }, + { + "epoch": 0.6142929548910289, + "grad_norm": 9.50896229297184, + "learning_rate": 9.67120712263479e-06, + "loss": 0.795, + "step": 606 + }, + { + "epoch": 0.6163203243791181, + "grad_norm": 4.4455508356847, + "learning_rate": 9.666985168529544e-06, + "loss": 0.3327, + "step": 608 + }, + { + "epoch": 0.6183476938672073, + "grad_norm": 6.542035148817838, + "learning_rate": 9.662737214034827e-06, + "loss": 0.3998, + "step": 610 + }, + { + "epoch": 0.6203750633552965, + "grad_norm": 4.385874588634816, + "learning_rate": 9.65846328281655e-06, + "loss": 0.3368, + "step": 612 + }, + { + "epoch": 0.6224024328433857, + "grad_norm": 5.061860177930386, + "learning_rate": 9.65416339868535e-06, + "loss": 0.5636, + "step": 614 + }, + { + "epoch": 0.6244298023314749, + "grad_norm": 6.954869372184894, + "learning_rate": 9.649837585596445e-06, + "loss": 0.3441, + "step": 616 + }, + { + "epoch": 0.6264571718195641, + "grad_norm": 7.191464777531764, + "learning_rate": 9.645485867649514e-06, + "loss": 0.683, + "step": 618 + }, + { + "epoch": 0.6284845413076533, + "grad_norm": 5.545660071994801, + "learning_rate": 9.641108269088549e-06, + "loss": 0.4965, + "step": 620 + }, + { + "epoch": 0.6305119107957425, + "grad_norm": 5.104187440780784, + "learning_rate": 9.636704814301727e-06, + "loss": 0.1542, + "step": 622 + }, + { + "epoch": 0.6325392802838318, + "grad_norm": 4.308028402510806, + "learning_rate": 9.632275527821277e-06, + "loss": 0.2195, + "step": 624 + }, + { + "epoch": 0.6345666497719209, + "grad_norm": 6.54099138729635, + "learning_rate": 9.627820434323338e-06, + "loss": 0.2346, + "step": 626 + }, + { + "epoch": 0.6365940192600101, + "grad_norm": 5.974766468547827, + "learning_rate": 9.623339558627822e-06, + "loss": 0.6039, + "step": 628 + }, + { + "epoch": 0.6386213887480994, + "grad_norm": 4.879561508531123, + "learning_rate": 9.618832925698279e-06, + "loss": 0.4546, + "step": 630 + }, + { + "epoch": 0.6406487582361885, + "grad_norm": 11.951161945325612, + "learning_rate": 9.614300560641754e-06, + "loss": 0.3283, + "step": 632 + }, + { + "epoch": 0.6426761277242777, + "grad_norm": 4.629438831948124, + "learning_rate": 9.60974248870865e-06, + "loss": 0.5582, + "step": 634 + }, + { + "epoch": 0.644703497212367, + "grad_norm": 4.531530286786445, + "learning_rate": 9.605158735292587e-06, + "loss": 0.228, + "step": 636 + }, + { + "epoch": 0.6467308667004562, + "grad_norm": 5.872434468311987, + "learning_rate": 9.60054932593026e-06, + "loss": 0.2157, + "step": 638 + }, + { + "epoch": 0.6487582361885453, + "grad_norm": 5.011871116549795, + "learning_rate": 9.595914286301296e-06, + "loss": 0.4113, + "step": 640 + }, + { + "epoch": 0.6507856056766346, + "grad_norm": 9.754753432883431, + "learning_rate": 9.59125364222811e-06, + "loss": 0.2078, + "step": 642 + }, + { + "epoch": 0.6528129751647238, + "grad_norm": 5.045746657058162, + "learning_rate": 9.586567419675766e-06, + "loss": 0.4861, + "step": 644 + }, + { + "epoch": 0.6548403446528129, + "grad_norm": 3.540866817691693, + "learning_rate": 9.581855644751827e-06, + "loss": 0.5821, + "step": 646 + }, + { + "epoch": 0.6568677141409022, + "grad_norm": 8.82763005841965, + "learning_rate": 9.577118343706213e-06, + "loss": 0.3022, + "step": 648 + }, + { + "epoch": 0.6588950836289914, + "grad_norm": 6.8073478160122205, + "learning_rate": 9.572355542931052e-06, + "loss": 0.3034, + "step": 650 + }, + { + "epoch": 0.6609224531170805, + "grad_norm": 8.941185828538476, + "learning_rate": 9.567567268960534e-06, + "loss": 0.5086, + "step": 652 + }, + { + "epoch": 0.6629498226051698, + "grad_norm": 7.865764561029178, + "learning_rate": 9.562753548470763e-06, + "loss": 0.4477, + "step": 654 + }, + { + "epoch": 0.664977192093259, + "grad_norm": 6.297491622439807, + "learning_rate": 9.557914408279613e-06, + "loss": 0.4884, + "step": 656 + }, + { + "epoch": 0.6670045615813482, + "grad_norm": 12.331667517959866, + "learning_rate": 9.553049875346572e-06, + "loss": 0.246, + "step": 658 + }, + { + "epoch": 0.6690319310694374, + "grad_norm": 28.533753358123192, + "learning_rate": 9.548159976772593e-06, + "loss": 0.4368, + "step": 660 + }, + { + "epoch": 0.6710593005575266, + "grad_norm": 4.884765850123049, + "learning_rate": 9.543244739799944e-06, + "loss": 0.3172, + "step": 662 + }, + { + "epoch": 0.6730866700456158, + "grad_norm": 5.880751825215974, + "learning_rate": 9.538304191812062e-06, + "loss": 0.4639, + "step": 664 + }, + { + "epoch": 0.675114039533705, + "grad_norm": 6.385803549132604, + "learning_rate": 9.533338360333385e-06, + "loss": 0.7947, + "step": 666 + }, + { + "epoch": 0.6771414090217942, + "grad_norm": 9.33712717321544, + "learning_rate": 9.52834727302922e-06, + "loss": 0.2532, + "step": 668 + }, + { + "epoch": 0.6791687785098834, + "grad_norm": 5.001601652865657, + "learning_rate": 9.523330957705572e-06, + "loss": 0.6273, + "step": 670 + }, + { + "epoch": 0.6811961479979727, + "grad_norm": 17.21477346218976, + "learning_rate": 9.51828944230899e-06, + "loss": 0.6034, + "step": 672 + }, + { + "epoch": 0.6832235174860618, + "grad_norm": 5.656111950876035, + "learning_rate": 9.513222754926426e-06, + "loss": 0.3876, + "step": 674 + }, + { + "epoch": 0.685250886974151, + "grad_norm": 9.68117645145439, + "learning_rate": 9.50813092378506e-06, + "loss": 0.5876, + "step": 676 + }, + { + "epoch": 0.6872782564622403, + "grad_norm": 7.586570607902635, + "learning_rate": 9.503013977252156e-06, + "loss": 0.4659, + "step": 678 + }, + { + "epoch": 0.6893056259503294, + "grad_norm": 5.343299258508104, + "learning_rate": 9.497871943834898e-06, + "loss": 0.3492, + "step": 680 + }, + { + "epoch": 0.6913329954384186, + "grad_norm": 5.189332768248854, + "learning_rate": 9.492704852180228e-06, + "loss": 0.3218, + "step": 682 + }, + { + "epoch": 0.6933603649265079, + "grad_norm": 6.063115149958833, + "learning_rate": 9.487512731074699e-06, + "loss": 0.5084, + "step": 684 + }, + { + "epoch": 0.6953877344145971, + "grad_norm": 3.792925723261337, + "learning_rate": 9.4822956094443e-06, + "loss": 0.1575, + "step": 686 + }, + { + "epoch": 0.6974151039026862, + "grad_norm": 5.088739238663545, + "learning_rate": 9.477053516354304e-06, + "loss": 0.4761, + "step": 688 + }, + { + "epoch": 0.6994424733907755, + "grad_norm": 3.955062864180429, + "learning_rate": 9.471786481009102e-06, + "loss": 0.2861, + "step": 690 + }, + { + "epoch": 0.7014698428788647, + "grad_norm": 7.446903888569742, + "learning_rate": 9.466494532752043e-06, + "loss": 0.4799, + "step": 692 + }, + { + "epoch": 0.7034972123669538, + "grad_norm": 7.593135209941343, + "learning_rate": 9.461177701065268e-06, + "loss": 0.3775, + "step": 694 + }, + { + "epoch": 0.7055245818550431, + "grad_norm": 6.501437242826478, + "learning_rate": 9.455836015569545e-06, + "loss": 0.6122, + "step": 696 + }, + { + "epoch": 0.7075519513431323, + "grad_norm": 9.1813520150109, + "learning_rate": 9.450469506024109e-06, + "loss": 0.3952, + "step": 698 + }, + { + "epoch": 0.7095793208312214, + "grad_norm": 5.620761606883405, + "learning_rate": 9.445078202326495e-06, + "loss": 0.2429, + "step": 700 + }, + { + "epoch": 0.7116066903193107, + "grad_norm": 5.111528480023057, + "learning_rate": 9.439662134512362e-06, + "loss": 0.6006, + "step": 702 + }, + { + "epoch": 0.7136340598073999, + "grad_norm": 5.5539943916501935, + "learning_rate": 9.43422133275534e-06, + "loss": 0.484, + "step": 704 + }, + { + "epoch": 0.7156614292954891, + "grad_norm": 6.338212183227596, + "learning_rate": 9.428755827366852e-06, + "loss": 0.6089, + "step": 706 + }, + { + "epoch": 0.7176887987835783, + "grad_norm": 2.6794454605768627, + "learning_rate": 9.423265648795947e-06, + "loss": 0.0776, + "step": 708 + }, + { + "epoch": 0.7197161682716675, + "grad_norm": 5.45938934470987, + "learning_rate": 9.417750827629137e-06, + "loss": 0.2196, + "step": 710 + }, + { + "epoch": 0.7217435377597567, + "grad_norm": 8.039074305695388, + "learning_rate": 9.412211394590217e-06, + "loss": 0.5162, + "step": 712 + }, + { + "epoch": 0.7237709072478459, + "grad_norm": 6.887794528959513, + "learning_rate": 9.406647380540096e-06, + "loss": 0.5085, + "step": 714 + }, + { + "epoch": 0.7257982767359351, + "grad_norm": 2.9283321513860963, + "learning_rate": 9.401058816476634e-06, + "loss": 0.336, + "step": 716 + }, + { + "epoch": 0.7278256462240243, + "grad_norm": 9.647894953169216, + "learning_rate": 9.395445733534452e-06, + "loss": 0.4008, + "step": 718 + }, + { + "epoch": 0.7298530157121136, + "grad_norm": 9.760294889138594, + "learning_rate": 9.38980816298478e-06, + "loss": 0.3679, + "step": 720 + }, + { + "epoch": 0.7318803852002027, + "grad_norm": 4.786177670310066, + "learning_rate": 9.384146136235263e-06, + "loss": 0.4941, + "step": 722 + }, + { + "epoch": 0.733907754688292, + "grad_norm": 7.891616034740843, + "learning_rate": 9.3784596848298e-06, + "loss": 0.3427, + "step": 724 + }, + { + "epoch": 0.7359351241763812, + "grad_norm": 12.493935080346235, + "learning_rate": 9.372748840448361e-06, + "loss": 0.6076, + "step": 726 + }, + { + "epoch": 0.7379624936644703, + "grad_norm": 14.419141825180121, + "learning_rate": 9.367013634906814e-06, + "loss": 0.8555, + "step": 728 + }, + { + "epoch": 0.7399898631525595, + "grad_norm": 10.62870019929952, + "learning_rate": 9.361254100156742e-06, + "loss": 0.5119, + "step": 730 + }, + { + "epoch": 0.7420172326406488, + "grad_norm": 5.065421920970828, + "learning_rate": 9.355470268285275e-06, + "loss": 0.2821, + "step": 732 + }, + { + "epoch": 0.7440446021287379, + "grad_norm": 8.977717124796547, + "learning_rate": 9.349662171514901e-06, + "loss": 0.5633, + "step": 734 + }, + { + "epoch": 0.7460719716168271, + "grad_norm": 6.942707874520502, + "learning_rate": 9.343829842203294e-06, + "loss": 0.3094, + "step": 736 + }, + { + "epoch": 0.7480993411049164, + "grad_norm": 6.128953415201464, + "learning_rate": 9.337973312843129e-06, + "loss": 0.6363, + "step": 738 + }, + { + "epoch": 0.7501267105930056, + "grad_norm": 5.5473571462585145, + "learning_rate": 9.3320926160619e-06, + "loss": 0.1932, + "step": 740 + }, + { + "epoch": 0.7521540800810947, + "grad_norm": 5.874890600056431, + "learning_rate": 9.326187784621747e-06, + "loss": 0.2693, + "step": 742 + }, + { + "epoch": 0.754181449569184, + "grad_norm": 3.2048130794632548, + "learning_rate": 9.320258851419265e-06, + "loss": 0.176, + "step": 744 + }, + { + "epoch": 0.7562088190572732, + "grad_norm": 5.211278622773785, + "learning_rate": 9.31430584948532e-06, + "loss": 0.467, + "step": 746 + }, + { + "epoch": 0.7582361885453623, + "grad_norm": 5.3931127900425055, + "learning_rate": 9.30832881198487e-06, + "loss": 0.4719, + "step": 748 + }, + { + "epoch": 0.7602635580334516, + "grad_norm": 4.340223144034949, + "learning_rate": 9.30232777221678e-06, + "loss": 0.2335, + "step": 750 + }, + { + "epoch": 0.7622909275215408, + "grad_norm": 7.318713438969122, + "learning_rate": 9.296302763613634e-06, + "loss": 0.388, + "step": 752 + }, + { + "epoch": 0.76431829700963, + "grad_norm": 5.341778297219776, + "learning_rate": 9.290253819741549e-06, + "loss": 0.2972, + "step": 754 + }, + { + "epoch": 0.7663456664977192, + "grad_norm": 8.763200725985504, + "learning_rate": 9.284180974299987e-06, + "loss": 0.5223, + "step": 756 + }, + { + "epoch": 0.7683730359858084, + "grad_norm": 7.012162263439357, + "learning_rate": 9.27808426112157e-06, + "loss": 0.3924, + "step": 758 + }, + { + "epoch": 0.7704004054738977, + "grad_norm": 4.0149638849926506, + "learning_rate": 9.271963714171892e-06, + "loss": 0.8123, + "step": 760 + }, + { + "epoch": 0.7724277749619868, + "grad_norm": 3.716876704505102, + "learning_rate": 9.265819367549328e-06, + "loss": 0.3749, + "step": 762 + }, + { + "epoch": 0.774455144450076, + "grad_norm": 6.470320949696539, + "learning_rate": 9.25965125548484e-06, + "loss": 0.3308, + "step": 764 + }, + { + "epoch": 0.7764825139381653, + "grad_norm": 5.404236656720703, + "learning_rate": 9.253459412341797e-06, + "loss": 0.4954, + "step": 766 + }, + { + "epoch": 0.7785098834262545, + "grad_norm": 3.3654795372936803, + "learning_rate": 9.247243872615768e-06, + "loss": 0.5004, + "step": 768 + }, + { + "epoch": 0.7805372529143436, + "grad_norm": 6.148322013880938, + "learning_rate": 9.241004670934348e-06, + "loss": 0.5069, + "step": 770 + }, + { + "epoch": 0.7825646224024329, + "grad_norm": 2.8452052480274954, + "learning_rate": 9.234741842056951e-06, + "loss": 0.2124, + "step": 772 + }, + { + "epoch": 0.7845919918905221, + "grad_norm": 4.135962860148399, + "learning_rate": 9.22845542087462e-06, + "loss": 0.2972, + "step": 774 + }, + { + "epoch": 0.7866193613786112, + "grad_norm": 5.819512666211635, + "learning_rate": 9.22214544240984e-06, + "loss": 0.2247, + "step": 776 + }, + { + "epoch": 0.7886467308667005, + "grad_norm": 4.635533960370951, + "learning_rate": 9.215811941816328e-06, + "loss": 0.2969, + "step": 778 + }, + { + "epoch": 0.7906741003547897, + "grad_norm": 5.711680970522115, + "learning_rate": 9.209454954378855e-06, + "loss": 0.1892, + "step": 780 + }, + { + "epoch": 0.7927014698428788, + "grad_norm": 4.321874497657273, + "learning_rate": 9.203074515513034e-06, + "loss": 0.6946, + "step": 782 + }, + { + "epoch": 0.794728839330968, + "grad_norm": 5.543029164645176, + "learning_rate": 9.19667066076513e-06, + "loss": 0.3274, + "step": 784 + }, + { + "epoch": 0.7967562088190573, + "grad_norm": 8.985604344904418, + "learning_rate": 9.190243425811862e-06, + "loss": 0.4325, + "step": 786 + }, + { + "epoch": 0.7987835783071465, + "grad_norm": 6.334211180639937, + "learning_rate": 9.183792846460204e-06, + "loss": 0.3118, + "step": 788 + }, + { + "epoch": 0.8008109477952357, + "grad_norm": 6.423586752149836, + "learning_rate": 9.177318958647184e-06, + "loss": 0.7043, + "step": 790 + }, + { + "epoch": 0.8028383172833249, + "grad_norm": 5.103580161538544, + "learning_rate": 9.170821798439685e-06, + "loss": 0.7762, + "step": 792 + }, + { + "epoch": 0.8048656867714141, + "grad_norm": 5.816360267352095, + "learning_rate": 9.164301402034237e-06, + "loss": 0.5856, + "step": 794 + }, + { + "epoch": 0.8068930562595032, + "grad_norm": 7.59474401328996, + "learning_rate": 9.157757805756835e-06, + "loss": 0.4727, + "step": 796 + }, + { + "epoch": 0.8089204257475925, + "grad_norm": 5.95412575618408, + "learning_rate": 9.151191046062712e-06, + "loss": 0.612, + "step": 798 + }, + { + "epoch": 0.8109477952356817, + "grad_norm": 3.1494699585780093, + "learning_rate": 9.144601159536155e-06, + "loss": 0.7439, + "step": 800 + }, + { + "epoch": 0.812975164723771, + "grad_norm": 7.4880888437426645, + "learning_rate": 9.137988182890287e-06, + "loss": 0.397, + "step": 802 + }, + { + "epoch": 0.8150025342118601, + "grad_norm": 4.810856299523186, + "learning_rate": 9.131352152966875e-06, + "loss": 0.2696, + "step": 804 + }, + { + "epoch": 0.8170299036999493, + "grad_norm": 8.713877813868931, + "learning_rate": 9.124693106736114e-06, + "loss": 0.3705, + "step": 806 + }, + { + "epoch": 0.8190572731880386, + "grad_norm": 3.6042977820318756, + "learning_rate": 9.11801108129643e-06, + "loss": 0.2589, + "step": 808 + }, + { + "epoch": 0.8210846426761277, + "grad_norm": 6.065903323692972, + "learning_rate": 9.111306113874268e-06, + "loss": 0.2386, + "step": 810 + }, + { + "epoch": 0.8231120121642169, + "grad_norm": 7.413696242034458, + "learning_rate": 9.104578241823882e-06, + "loss": 0.5752, + "step": 812 + }, + { + "epoch": 0.8251393816523062, + "grad_norm": 5.515222131927005, + "learning_rate": 9.097827502627137e-06, + "loss": 0.3691, + "step": 814 + }, + { + "epoch": 0.8271667511403953, + "grad_norm": 6.793732757527473, + "learning_rate": 9.09105393389329e-06, + "loss": 0.3146, + "step": 816 + }, + { + "epoch": 0.8291941206284845, + "grad_norm": 4.7139651256925434, + "learning_rate": 9.084257573358785e-06, + "loss": 0.4584, + "step": 818 + }, + { + "epoch": 0.8312214901165738, + "grad_norm": 5.83769063981457, + "learning_rate": 9.077438458887043e-06, + "loss": 0.3516, + "step": 820 + }, + { + "epoch": 0.833248859604663, + "grad_norm": 5.69571008174091, + "learning_rate": 9.070596628468247e-06, + "loss": 0.4922, + "step": 822 + }, + { + "epoch": 0.8352762290927521, + "grad_norm": 6.213507237636644, + "learning_rate": 9.063732120219139e-06, + "loss": 0.4684, + "step": 824 + }, + { + "epoch": 0.8373035985808414, + "grad_norm": 4.886631182693813, + "learning_rate": 9.056844972382798e-06, + "loss": 0.6716, + "step": 826 + }, + { + "epoch": 0.8393309680689306, + "grad_norm": 4.686994084256959, + "learning_rate": 9.049935223328433e-06, + "loss": 0.5062, + "step": 828 + }, + { + "epoch": 0.8413583375570197, + "grad_norm": 3.673117327551783, + "learning_rate": 9.043002911551164e-06, + "loss": 0.0987, + "step": 830 + }, + { + "epoch": 0.843385707045109, + "grad_norm": 6.454907128592606, + "learning_rate": 9.036048075671815e-06, + "loss": 0.4968, + "step": 832 + }, + { + "epoch": 0.8454130765331982, + "grad_norm": 7.300743052363863, + "learning_rate": 9.029070754436696e-06, + "loss": 0.211, + "step": 834 + }, + { + "epoch": 0.8474404460212874, + "grad_norm": 2.5789490268573227, + "learning_rate": 9.02207098671738e-06, + "loss": 0.1191, + "step": 836 + }, + { + "epoch": 0.8494678155093766, + "grad_norm": 6.348111910014334, + "learning_rate": 9.015048811510495e-06, + "loss": 0.334, + "step": 838 + }, + { + "epoch": 0.8514951849974658, + "grad_norm": 3.8881795603405602, + "learning_rate": 9.008004267937507e-06, + "loss": 0.1856, + "step": 840 + }, + { + "epoch": 0.853522554485555, + "grad_norm": 5.340753270283494, + "learning_rate": 9.000937395244498e-06, + "loss": 0.584, + "step": 842 + }, + { + "epoch": 0.8555499239736442, + "grad_norm": 3.7947361839781766, + "learning_rate": 8.993848232801944e-06, + "loss": 0.2021, + "step": 844 + }, + { + "epoch": 0.8575772934617334, + "grad_norm": 10.54844744904981, + "learning_rate": 8.986736820104501e-06, + "loss": 0.5045, + "step": 846 + }, + { + "epoch": 0.8596046629498226, + "grad_norm": 5.725957012673736, + "learning_rate": 8.979603196770793e-06, + "loss": 0.4423, + "step": 848 + }, + { + "epoch": 0.8616320324379118, + "grad_norm": 8.648353534282574, + "learning_rate": 8.972447402543171e-06, + "loss": 0.5846, + "step": 850 + }, + { + "epoch": 0.863659401926001, + "grad_norm": 5.916315355712222, + "learning_rate": 8.96526947728751e-06, + "loss": 0.6604, + "step": 852 + }, + { + "epoch": 0.8656867714140902, + "grad_norm": 3.7604027051485924, + "learning_rate": 8.958069460992977e-06, + "loss": 0.1338, + "step": 854 + }, + { + "epoch": 0.8677141409021795, + "grad_norm": 5.138819429352067, + "learning_rate": 8.950847393771812e-06, + "loss": 0.4391, + "step": 856 + }, + { + "epoch": 0.8697415103902686, + "grad_norm": 8.233466868346282, + "learning_rate": 8.943603315859101e-06, + "loss": 0.2345, + "step": 858 + }, + { + "epoch": 0.8717688798783578, + "grad_norm": 7.439711170441192, + "learning_rate": 8.93633726761256e-06, + "loss": 0.3989, + "step": 860 + }, + { + "epoch": 0.8737962493664471, + "grad_norm": 5.068007349095243, + "learning_rate": 8.9290492895123e-06, + "loss": 0.5663, + "step": 862 + }, + { + "epoch": 0.8758236188545362, + "grad_norm": 5.399841403542833, + "learning_rate": 8.921739422160607e-06, + "loss": 0.461, + "step": 864 + }, + { + "epoch": 0.8778509883426254, + "grad_norm": 3.7702433170556255, + "learning_rate": 8.914407706281718e-06, + "loss": 0.2573, + "step": 866 + }, + { + "epoch": 0.8798783578307147, + "grad_norm": 5.3083907338381495, + "learning_rate": 8.907054182721586e-06, + "loss": 0.4554, + "step": 868 + }, + { + "epoch": 0.8819057273188039, + "grad_norm": 4.348547667480799, + "learning_rate": 8.89967889244766e-06, + "loss": 0.2801, + "step": 870 + }, + { + "epoch": 0.883933096806893, + "grad_norm": 4.0015383409112015, + "learning_rate": 8.892281876548655e-06, + "loss": 0.578, + "step": 872 + }, + { + "epoch": 0.8859604662949823, + "grad_norm": 3.871301043014003, + "learning_rate": 8.88486317623432e-06, + "loss": 0.1619, + "step": 874 + }, + { + "epoch": 0.8879878357830715, + "grad_norm": 4.057651694019459, + "learning_rate": 8.877422832835214e-06, + "loss": 0.6105, + "step": 876 + }, + { + "epoch": 0.8900152052711606, + "grad_norm": 2.8291196248116246, + "learning_rate": 8.86996088780247e-06, + "loss": 0.2119, + "step": 878 + }, + { + "epoch": 0.8920425747592499, + "grad_norm": 3.670505807256192, + "learning_rate": 8.862477382707569e-06, + "loss": 0.2098, + "step": 880 + }, + { + "epoch": 0.8940699442473391, + "grad_norm": 6.529786125625772, + "learning_rate": 8.8549723592421e-06, + "loss": 0.4289, + "step": 882 + }, + { + "epoch": 0.8960973137354283, + "grad_norm": 4.371762441971309, + "learning_rate": 8.84744585921754e-06, + "loss": 0.2326, + "step": 884 + }, + { + "epoch": 0.8981246832235175, + "grad_norm": 7.8937761243821765, + "learning_rate": 8.839897924565017e-06, + "loss": 0.5234, + "step": 886 + }, + { + "epoch": 0.9001520527116067, + "grad_norm": 7.408182095675612, + "learning_rate": 8.832328597335063e-06, + "loss": 0.819, + "step": 888 + }, + { + "epoch": 0.9021794221996959, + "grad_norm": 1.965971937035006, + "learning_rate": 8.8247379196974e-06, + "loss": 0.3031, + "step": 890 + }, + { + "epoch": 0.9042067916877851, + "grad_norm": 7.688454401769998, + "learning_rate": 8.817125933940695e-06, + "loss": 0.7161, + "step": 892 + }, + { + "epoch": 0.9062341611758743, + "grad_norm": 5.766312176868324, + "learning_rate": 8.809492682472322e-06, + "loss": 0.7137, + "step": 894 + }, + { + "epoch": 0.9082615306639635, + "grad_norm": 3.2093439033560816, + "learning_rate": 8.801838207818133e-06, + "loss": 0.3615, + "step": 896 + }, + { + "epoch": 0.9102889001520527, + "grad_norm": 4.217732677691225, + "learning_rate": 8.794162552622214e-06, + "loss": 0.5088, + "step": 898 + }, + { + "epoch": 0.9123162696401419, + "grad_norm": 6.726477857693403, + "learning_rate": 8.786465759646649e-06, + "loss": 0.8383, + "step": 900 + }, + { + "epoch": 0.9143436391282311, + "grad_norm": 7.633401873987132, + "learning_rate": 8.778747871771293e-06, + "loss": 0.7326, + "step": 902 + }, + { + "epoch": 0.9163710086163204, + "grad_norm": 6.953229391724788, + "learning_rate": 8.77100893199351e-06, + "loss": 0.7874, + "step": 904 + }, + { + "epoch": 0.9183983781044095, + "grad_norm": 5.757077276503306, + "learning_rate": 8.763248983427956e-06, + "loss": 0.5707, + "step": 906 + }, + { + "epoch": 0.9204257475924987, + "grad_norm": 4.2551920163036865, + "learning_rate": 8.755468069306326e-06, + "loss": 0.4633, + "step": 908 + }, + { + "epoch": 0.922453117080588, + "grad_norm": 3.638181138305172, + "learning_rate": 8.747666232977122e-06, + "loss": 0.3175, + "step": 910 + }, + { + "epoch": 0.9244804865686771, + "grad_norm": 5.524072501104057, + "learning_rate": 8.739843517905397e-06, + "loss": 0.256, + "step": 912 + }, + { + "epoch": 0.9265078560567663, + "grad_norm": 5.541374833838674, + "learning_rate": 8.73199996767253e-06, + "loss": 0.4937, + "step": 914 + }, + { + "epoch": 0.9285352255448556, + "grad_norm": 2.2549590176792336, + "learning_rate": 8.724135625975975e-06, + "loss": 0.2302, + "step": 916 + }, + { + "epoch": 0.9305625950329448, + "grad_norm": 5.059547307274291, + "learning_rate": 8.716250536629013e-06, + "loss": 0.2826, + "step": 918 + }, + { + "epoch": 0.9325899645210339, + "grad_norm": 4.854111651813162, + "learning_rate": 8.708344743560517e-06, + "loss": 0.5314, + "step": 920 + }, + { + "epoch": 0.9346173340091232, + "grad_norm": 12.02310624587097, + "learning_rate": 8.700418290814705e-06, + "loss": 0.5996, + "step": 922 + }, + { + "epoch": 0.9366447034972124, + "grad_norm": 15.404245289006797, + "learning_rate": 8.692471222550886e-06, + "loss": 0.5981, + "step": 924 + }, + { + "epoch": 0.9386720729853015, + "grad_norm": 5.657024729761937, + "learning_rate": 8.684503583043226e-06, + "loss": 0.4123, + "step": 926 + }, + { + "epoch": 0.9406994424733908, + "grad_norm": 3.6690238702698714, + "learning_rate": 8.676515416680496e-06, + "loss": 0.2704, + "step": 928 + }, + { + "epoch": 0.94272681196148, + "grad_norm": 5.406077162724565, + "learning_rate": 8.668506767965821e-06, + "loss": 0.2685, + "step": 930 + }, + { + "epoch": 0.9447541814495691, + "grad_norm": 6.41755556109416, + "learning_rate": 8.660477681516441e-06, + "loss": 0.6968, + "step": 932 + }, + { + "epoch": 0.9467815509376584, + "grad_norm": 3.462041751936351, + "learning_rate": 8.652428202063455e-06, + "loss": 0.4423, + "step": 934 + }, + { + "epoch": 0.9488089204257476, + "grad_norm": 4.300079838808357, + "learning_rate": 8.644358374451573e-06, + "loss": 0.3834, + "step": 936 + }, + { + "epoch": 0.9508362899138368, + "grad_norm": 6.657569812152387, + "learning_rate": 8.636268243638868e-06, + "loss": 0.4608, + "step": 938 + }, + { + "epoch": 0.952863659401926, + "grad_norm": 6.707285289158868, + "learning_rate": 8.628157854696524e-06, + "loss": 0.3167, + "step": 940 + }, + { + "epoch": 0.9548910288900152, + "grad_norm": 8.172772922438094, + "learning_rate": 8.620027252808588e-06, + "loss": 0.5663, + "step": 942 + }, + { + "epoch": 0.9569183983781044, + "grad_norm": 3.6572205550403942, + "learning_rate": 8.611876483271715e-06, + "loss": 0.1763, + "step": 944 + }, + { + "epoch": 0.9589457678661936, + "grad_norm": 8.569803278885331, + "learning_rate": 8.603705591494917e-06, + "loss": 0.7493, + "step": 946 + }, + { + "epoch": 0.9609731373542828, + "grad_norm": 4.239505267706155, + "learning_rate": 8.595514622999307e-06, + "loss": 0.3379, + "step": 948 + }, + { + "epoch": 0.963000506842372, + "grad_norm": 4.7615793691494, + "learning_rate": 8.587303623417852e-06, + "loss": 0.3839, + "step": 950 + }, + { + "epoch": 0.9650278763304613, + "grad_norm": 4.418919422940225, + "learning_rate": 8.57907263849511e-06, + "loss": 0.6096, + "step": 952 + }, + { + "epoch": 0.9670552458185504, + "grad_norm": 5.4550569319774125, + "learning_rate": 8.570821714086986e-06, + "loss": 0.4183, + "step": 954 + }, + { + "epoch": 0.9690826153066396, + "grad_norm": 3.0218024170551643, + "learning_rate": 8.562550896160465e-06, + "loss": 0.1402, + "step": 956 + }, + { + "epoch": 0.9711099847947289, + "grad_norm": 7.717036379851838, + "learning_rate": 8.554260230793365e-06, + "loss": 0.6842, + "step": 958 + }, + { + "epoch": 0.973137354282818, + "grad_norm": 2.874897729568267, + "learning_rate": 8.545949764174075e-06, + "loss": 0.2962, + "step": 960 + }, + { + "epoch": 0.9751647237709072, + "grad_norm": 3.7424951152222294, + "learning_rate": 8.537619542601301e-06, + "loss": 0.2641, + "step": 962 + }, + { + "epoch": 0.9771920932589965, + "grad_norm": 5.432445332292658, + "learning_rate": 8.529269612483805e-06, + "loss": 0.4452, + "step": 964 + }, + { + "epoch": 0.9792194627470857, + "grad_norm": 6.650258723038361, + "learning_rate": 8.520900020340146e-06, + "loss": 0.4789, + "step": 966 + }, + { + "epoch": 0.9812468322351748, + "grad_norm": 8.187537644312687, + "learning_rate": 8.512510812798426e-06, + "loss": 0.6415, + "step": 968 + }, + { + "epoch": 0.9832742017232641, + "grad_norm": 5.02677356621445, + "learning_rate": 8.504102036596029e-06, + "loss": 0.4051, + "step": 970 + }, + { + "epoch": 0.9853015712113533, + "grad_norm": 7.146981070145481, + "learning_rate": 8.49567373857935e-06, + "loss": 0.4138, + "step": 972 + }, + { + "epoch": 0.9873289406994424, + "grad_norm": 4.381213114893804, + "learning_rate": 8.487225965703553e-06, + "loss": 0.2625, + "step": 974 + }, + { + "epoch": 0.9893563101875317, + "grad_norm": 5.557739352011601, + "learning_rate": 8.478758765032292e-06, + "loss": 0.5094, + "step": 976 + }, + { + "epoch": 0.9913836796756209, + "grad_norm": 4.398801212494901, + "learning_rate": 8.470272183737456e-06, + "loss": 0.3639, + "step": 978 + }, + { + "epoch": 0.99341104916371, + "grad_norm": 5.862744825557705, + "learning_rate": 8.461766269098911e-06, + "loss": 0.4918, + "step": 980 + }, + { + "epoch": 0.9954384186517993, + "grad_norm": 4.297532964632133, + "learning_rate": 8.453241068504228e-06, + "loss": 0.289, + "step": 982 + }, + { + "epoch": 0.9974657881398885, + "grad_norm": 4.721287373274696, + "learning_rate": 8.444696629448421e-06, + "loss": 0.3725, + "step": 984 + }, + { + "epoch": 0.9994931576279777, + "grad_norm": 8.562409131545987, + "learning_rate": 8.436132999533689e-06, + "loss": 0.5918, + "step": 986 + }, + { + "epoch": 1.0010136847440445, + "grad_norm": 4.872104708816113, + "learning_rate": 8.427550226469141e-06, + "loss": 0.2778, + "step": 988 + }, + { + "epoch": 1.0030410542321337, + "grad_norm": 3.5612770656604034, + "learning_rate": 8.418948358070535e-06, + "loss": 0.2037, + "step": 990 + }, + { + "epoch": 1.005068423720223, + "grad_norm": 6.0604543941906135, + "learning_rate": 8.41032744226002e-06, + "loss": 0.1771, + "step": 992 + }, + { + "epoch": 1.0070957932083122, + "grad_norm": 3.2535588343862596, + "learning_rate": 8.401687527065847e-06, + "loss": 0.1182, + "step": 994 + }, + { + "epoch": 1.0091231626964015, + "grad_norm": 4.955113525107962, + "learning_rate": 8.393028660622128e-06, + "loss": 0.098, + "step": 996 + }, + { + "epoch": 1.0111505321844907, + "grad_norm": 4.275864820089432, + "learning_rate": 8.384350891168546e-06, + "loss": 0.1984, + "step": 998 + }, + { + "epoch": 1.01317790167258, + "grad_norm": 2.5505437983542274, + "learning_rate": 8.375654267050097e-06, + "loss": 0.1007, + "step": 1000 + }, + { + "epoch": 1.015205271160669, + "grad_norm": 3.799393530609951, + "learning_rate": 8.366938836716825e-06, + "loss": 0.1686, + "step": 1002 + }, + { + "epoch": 1.0172326406487582, + "grad_norm": 2.7921555804136267, + "learning_rate": 8.358204648723535e-06, + "loss": 0.1455, + "step": 1004 + }, + { + "epoch": 1.0192600101368474, + "grad_norm": 3.9009134564963706, + "learning_rate": 8.349451751729545e-06, + "loss": 0.2087, + "step": 1006 + }, + { + "epoch": 1.0212873796249367, + "grad_norm": 2.668179681944388, + "learning_rate": 8.340680194498395e-06, + "loss": 0.3583, + "step": 1008 + }, + { + "epoch": 1.023314749113026, + "grad_norm": 5.491628925842295, + "learning_rate": 8.331890025897587e-06, + "loss": 0.1584, + "step": 1010 + }, + { + "epoch": 1.0253421186011151, + "grad_norm": 3.4167775346219127, + "learning_rate": 8.323081294898308e-06, + "loss": 0.1905, + "step": 1012 + }, + { + "epoch": 1.0273694880892044, + "grad_norm": 3.5663841670781835, + "learning_rate": 8.31425405057516e-06, + "loss": 0.2597, + "step": 1014 + }, + { + "epoch": 1.0293968575772934, + "grad_norm": 2.0304329060198043, + "learning_rate": 8.305408342105884e-06, + "loss": 0.0961, + "step": 1016 + }, + { + "epoch": 1.0314242270653826, + "grad_norm": 4.7721037284042165, + "learning_rate": 8.29654421877109e-06, + "loss": 0.1468, + "step": 1018 + }, + { + "epoch": 1.0334515965534719, + "grad_norm": 4.873349883556866, + "learning_rate": 8.287661729953975e-06, + "loss": 0.2455, + "step": 1020 + }, + { + "epoch": 1.035478966041561, + "grad_norm": 4.031703732026393, + "learning_rate": 8.278760925140054e-06, + "loss": 0.1304, + "step": 1022 + }, + { + "epoch": 1.0375063355296503, + "grad_norm": 3.5040837091606534, + "learning_rate": 8.269841853916886e-06, + "loss": 0.1338, + "step": 1024 + }, + { + "epoch": 1.0395337050177396, + "grad_norm": 1.3927342464983936, + "learning_rate": 8.260904565973793e-06, + "loss": 0.0436, + "step": 1026 + }, + { + "epoch": 1.0415610745058288, + "grad_norm": 5.542594637004402, + "learning_rate": 8.251949111101582e-06, + "loss": 0.2781, + "step": 1028 + }, + { + "epoch": 1.0435884439939178, + "grad_norm": 3.044378382937683, + "learning_rate": 8.242975539192272e-06, + "loss": 0.0725, + "step": 1030 + }, + { + "epoch": 1.045615813482007, + "grad_norm": 0.8666859895079763, + "learning_rate": 8.233983900238817e-06, + "loss": 0.1526, + "step": 1032 + }, + { + "epoch": 1.0476431829700963, + "grad_norm": 4.524544177043602, + "learning_rate": 8.22497424433482e-06, + "loss": 0.1143, + "step": 1034 + }, + { + "epoch": 1.0496705524581855, + "grad_norm": 4.243372667402034, + "learning_rate": 8.215946621674264e-06, + "loss": 0.1056, + "step": 1036 + }, + { + "epoch": 1.0516979219462748, + "grad_norm": 5.2260239859841855, + "learning_rate": 8.206901082551223e-06, + "loss": 0.2854, + "step": 1038 + }, + { + "epoch": 1.053725291434364, + "grad_norm": 3.035005623175306, + "learning_rate": 8.197837677359589e-06, + "loss": 0.11, + "step": 1040 + }, + { + "epoch": 1.055752660922453, + "grad_norm": 6.119715912917096, + "learning_rate": 8.188756456592787e-06, + "loss": 0.2419, + "step": 1042 + }, + { + "epoch": 1.0577800304105422, + "grad_norm": 8.469295128181523, + "learning_rate": 8.179657470843492e-06, + "loss": 0.298, + "step": 1044 + }, + { + "epoch": 1.0598073998986315, + "grad_norm": 2.9266966749281385, + "learning_rate": 8.170540770803355e-06, + "loss": 0.199, + "step": 1046 + }, + { + "epoch": 1.0618347693867207, + "grad_norm": 5.707663955115576, + "learning_rate": 8.161406407262714e-06, + "loss": 0.075, + "step": 1048 + }, + { + "epoch": 1.06386213887481, + "grad_norm": 2.714462608662212, + "learning_rate": 8.152254431110311e-06, + "loss": 0.1047, + "step": 1050 + }, + { + "epoch": 1.0658895083628992, + "grad_norm": 5.266324663278518, + "learning_rate": 8.143084893333011e-06, + "loss": 0.1278, + "step": 1052 + }, + { + "epoch": 1.0679168778509884, + "grad_norm": 1.782437482904759, + "learning_rate": 8.133897845015522e-06, + "loss": 0.0651, + "step": 1054 + }, + { + "epoch": 1.0699442473390774, + "grad_norm": 2.542443279011463, + "learning_rate": 8.124693337340093e-06, + "loss": 0.1306, + "step": 1056 + }, + { + "epoch": 1.0719716168271667, + "grad_norm": 4.99081029538913, + "learning_rate": 8.115471421586256e-06, + "loss": 0.1588, + "step": 1058 + }, + { + "epoch": 1.073998986315256, + "grad_norm": 2.5358568971513313, + "learning_rate": 8.106232149130516e-06, + "loss": 0.1635, + "step": 1060 + }, + { + "epoch": 1.0760263558033452, + "grad_norm": 3.681738570213383, + "learning_rate": 8.096975571446077e-06, + "loss": 0.1339, + "step": 1062 + }, + { + "epoch": 1.0780537252914344, + "grad_norm": 4.874057225101878, + "learning_rate": 8.087701740102556e-06, + "loss": 0.1243, + "step": 1064 + }, + { + "epoch": 1.0800810947795236, + "grad_norm": 3.9896046014057647, + "learning_rate": 8.07841070676569e-06, + "loss": 0.0617, + "step": 1066 + }, + { + "epoch": 1.0821084642676129, + "grad_norm": 2.1199748267443, + "learning_rate": 8.069102523197045e-06, + "loss": 0.082, + "step": 1068 + }, + { + "epoch": 1.0841358337557019, + "grad_norm": 1.5137260119199176, + "learning_rate": 8.059777241253744e-06, + "loss": 0.1474, + "step": 1070 + }, + { + "epoch": 1.0861632032437911, + "grad_norm": 4.0060173226066444, + "learning_rate": 8.05043491288816e-06, + "loss": 0.2739, + "step": 1072 + }, + { + "epoch": 1.0881905727318804, + "grad_norm": 2.7510383642974454, + "learning_rate": 8.041075590147636e-06, + "loss": 0.2321, + "step": 1074 + }, + { + "epoch": 1.0902179422199696, + "grad_norm": 4.59013744580626, + "learning_rate": 8.031699325174189e-06, + "loss": 0.1827, + "step": 1076 + }, + { + "epoch": 1.0922453117080588, + "grad_norm": 2.725792293655832, + "learning_rate": 8.022306170204233e-06, + "loss": 0.0776, + "step": 1078 + }, + { + "epoch": 1.094272681196148, + "grad_norm": 2.284479776514648, + "learning_rate": 8.012896177568268e-06, + "loss": 0.1363, + "step": 1080 + }, + { + "epoch": 1.0963000506842373, + "grad_norm": 5.032872303300701, + "learning_rate": 8.003469399690603e-06, + "loss": 0.3919, + "step": 1082 + }, + { + "epoch": 1.0983274201723263, + "grad_norm": 3.7256127839290953, + "learning_rate": 7.994025889089063e-06, + "loss": 0.2009, + "step": 1084 + }, + { + "epoch": 1.1003547896604156, + "grad_norm": 5.277704613071705, + "learning_rate": 7.984565698374688e-06, + "loss": 0.1442, + "step": 1086 + }, + { + "epoch": 1.1023821591485048, + "grad_norm": 4.402775919135939, + "learning_rate": 7.97508888025145e-06, + "loss": 0.1915, + "step": 1088 + }, + { + "epoch": 1.104409528636594, + "grad_norm": 4.2366154648656575, + "learning_rate": 7.965595487515947e-06, + "loss": 0.1027, + "step": 1090 + }, + { + "epoch": 1.1064368981246833, + "grad_norm": 2.092979742626384, + "learning_rate": 7.956085573057122e-06, + "loss": 0.0655, + "step": 1092 + }, + { + "epoch": 1.1084642676127725, + "grad_norm": 2.351788415489237, + "learning_rate": 7.946559189855966e-06, + "loss": 0.1347, + "step": 1094 + }, + { + "epoch": 1.1104916371008615, + "grad_norm": 1.484734581998334, + "learning_rate": 7.937016390985215e-06, + "loss": 0.0668, + "step": 1096 + }, + { + "epoch": 1.1125190065889508, + "grad_norm": 3.377826818059427, + "learning_rate": 7.927457229609055e-06, + "loss": 0.2252, + "step": 1098 + }, + { + "epoch": 1.11454637607704, + "grad_norm": 5.419140925452316, + "learning_rate": 7.917881758982838e-06, + "loss": 0.2313, + "step": 1100 + }, + { + "epoch": 1.1165737455651292, + "grad_norm": 6.035214325266877, + "learning_rate": 7.908290032452767e-06, + "loss": 0.2758, + "step": 1102 + }, + { + "epoch": 1.1186011150532185, + "grad_norm": 6.224793264005061, + "learning_rate": 7.898682103455624e-06, + "loss": 0.1682, + "step": 1104 + }, + { + "epoch": 1.1206284845413077, + "grad_norm": 3.608182192693733, + "learning_rate": 7.889058025518437e-06, + "loss": 0.4006, + "step": 1106 + }, + { + "epoch": 1.122655854029397, + "grad_norm": 3.0814021398651246, + "learning_rate": 7.879417852258222e-06, + "loss": 0.0732, + "step": 1108 + }, + { + "epoch": 1.124683223517486, + "grad_norm": 3.0547828613160286, + "learning_rate": 7.86976163738165e-06, + "loss": 0.1129, + "step": 1110 + }, + { + "epoch": 1.1267105930055752, + "grad_norm": 5.398703017504753, + "learning_rate": 7.860089434684767e-06, + "loss": 0.2317, + "step": 1112 + }, + { + "epoch": 1.1287379624936644, + "grad_norm": 4.297334558153495, + "learning_rate": 7.85040129805269e-06, + "loss": 0.197, + "step": 1114 + }, + { + "epoch": 1.1307653319817537, + "grad_norm": 4.168045585328467, + "learning_rate": 7.840697281459304e-06, + "loss": 0.1276, + "step": 1116 + }, + { + "epoch": 1.132792701469843, + "grad_norm": 6.793105445796965, + "learning_rate": 7.830977438966965e-06, + "loss": 0.3191, + "step": 1118 + }, + { + "epoch": 1.1348200709579321, + "grad_norm": 2.8779514841528724, + "learning_rate": 7.821241824726198e-06, + "loss": 0.1986, + "step": 1120 + }, + { + "epoch": 1.1368474404460214, + "grad_norm": 1.2405374130837183, + "learning_rate": 7.81149049297539e-06, + "loss": 0.0512, + "step": 1122 + }, + { + "epoch": 1.1388748099341104, + "grad_norm": 6.759047126966548, + "learning_rate": 7.801723498040497e-06, + "loss": 0.141, + "step": 1124 + }, + { + "epoch": 1.1409021794221996, + "grad_norm": 4.0345433364044325, + "learning_rate": 7.791940894334737e-06, + "loss": 0.1277, + "step": 1126 + }, + { + "epoch": 1.1429295489102889, + "grad_norm": 4.811326720112928, + "learning_rate": 7.782142736358282e-06, + "loss": 0.2364, + "step": 1128 + }, + { + "epoch": 1.144956918398378, + "grad_norm": 3.9019332330181067, + "learning_rate": 7.772329078697963e-06, + "loss": 0.3854, + "step": 1130 + }, + { + "epoch": 1.1469842878864673, + "grad_norm": 4.972136054012557, + "learning_rate": 7.762499976026957e-06, + "loss": 0.2212, + "step": 1132 + }, + { + "epoch": 1.1490116573745566, + "grad_norm": 6.281075694605909, + "learning_rate": 7.752655483104496e-06, + "loss": 0.2237, + "step": 1134 + }, + { + "epoch": 1.1510390268626458, + "grad_norm": 4.714773115070318, + "learning_rate": 7.742795654775541e-06, + "loss": 0.189, + "step": 1136 + }, + { + "epoch": 1.1530663963507348, + "grad_norm": 6.010801811983067, + "learning_rate": 7.7329205459705e-06, + "loss": 0.1171, + "step": 1138 + }, + { + "epoch": 1.155093765838824, + "grad_norm": 2.0590693449166193, + "learning_rate": 7.723030211704905e-06, + "loss": 0.1535, + "step": 1140 + }, + { + "epoch": 1.1571211353269133, + "grad_norm": 3.5928453686431734, + "learning_rate": 7.713124707079111e-06, + "loss": 0.1929, + "step": 1142 + }, + { + "epoch": 1.1591485048150025, + "grad_norm": 2.0189584725866716, + "learning_rate": 7.703204087277989e-06, + "loss": 0.125, + "step": 1144 + }, + { + "epoch": 1.1611758743030918, + "grad_norm": 4.033099100713104, + "learning_rate": 7.69326840757062e-06, + "loss": 0.0788, + "step": 1146 + }, + { + "epoch": 1.163203243791181, + "grad_norm": 1.3189673648071172, + "learning_rate": 7.683317723309987e-06, + "loss": 0.1145, + "step": 1148 + }, + { + "epoch": 1.1652306132792702, + "grad_norm": 8.181629996910894, + "learning_rate": 7.67335208993266e-06, + "loss": 0.1808, + "step": 1150 + }, + { + "epoch": 1.1672579827673593, + "grad_norm": 5.550433715292844, + "learning_rate": 7.663371562958498e-06, + "loss": 0.1658, + "step": 1152 + }, + { + "epoch": 1.1692853522554485, + "grad_norm": 2.3773596345805936, + "learning_rate": 7.653376197990333e-06, + "loss": 0.1875, + "step": 1154 + }, + { + "epoch": 1.1713127217435377, + "grad_norm": 4.628266699216258, + "learning_rate": 7.643366050713657e-06, + "loss": 0.114, + "step": 1156 + }, + { + "epoch": 1.173340091231627, + "grad_norm": 4.995804927155124, + "learning_rate": 7.633341176896325e-06, + "loss": 0.1889, + "step": 1158 + }, + { + "epoch": 1.1753674607197162, + "grad_norm": 8.102780033788305, + "learning_rate": 7.623301632388227e-06, + "loss": 0.2327, + "step": 1160 + }, + { + "epoch": 1.1773948302078054, + "grad_norm": 0.9133849482440713, + "learning_rate": 7.6132474731209884e-06, + "loss": 0.118, + "step": 1162 + }, + { + "epoch": 1.1794221996958947, + "grad_norm": 8.525681736671865, + "learning_rate": 7.603178755107657e-06, + "loss": 0.25, + "step": 1164 + }, + { + "epoch": 1.1814495691839837, + "grad_norm": 2.7224959844475425, + "learning_rate": 7.593095534442387e-06, + "loss": 0.1532, + "step": 1166 + }, + { + "epoch": 1.183476938672073, + "grad_norm": 3.6564040576387145, + "learning_rate": 7.582997867300132e-06, + "loss": 0.2764, + "step": 1168 + }, + { + "epoch": 1.1855043081601622, + "grad_norm": 2.0995300516142374, + "learning_rate": 7.572885809936323e-06, + "loss": 0.1104, + "step": 1170 + }, + { + "epoch": 1.1875316776482514, + "grad_norm": 6.90234776810571, + "learning_rate": 7.5627594186865674e-06, + "loss": 0.1735, + "step": 1172 + }, + { + "epoch": 1.1895590471363406, + "grad_norm": 14.504553436055314, + "learning_rate": 7.55261874996632e-06, + "loss": 0.2934, + "step": 1174 + }, + { + "epoch": 1.1915864166244299, + "grad_norm": 3.3719409250609673, + "learning_rate": 7.5424638602705914e-06, + "loss": 0.1119, + "step": 1176 + }, + { + "epoch": 1.1936137861125191, + "grad_norm": 6.837103343647477, + "learning_rate": 7.5322948061736035e-06, + "loss": 0.1985, + "step": 1178 + }, + { + "epoch": 1.1956411556006081, + "grad_norm": 3.24693543916302, + "learning_rate": 7.5221116443285e-06, + "loss": 0.1514, + "step": 1180 + }, + { + "epoch": 1.1976685250886974, + "grad_norm": 6.2238002469704385, + "learning_rate": 7.511914431467018e-06, + "loss": 0.1333, + "step": 1182 + }, + { + "epoch": 1.1996958945767866, + "grad_norm": 3.9001966091113234, + "learning_rate": 7.5017032243991706e-06, + "loss": 0.1949, + "step": 1184 + }, + { + "epoch": 1.2017232640648758, + "grad_norm": 3.45227340533288, + "learning_rate": 7.491478080012943e-06, + "loss": 0.1176, + "step": 1186 + }, + { + "epoch": 1.203750633552965, + "grad_norm": 2.4390116274360456, + "learning_rate": 7.481239055273959e-06, + "loss": 0.1579, + "step": 1188 + }, + { + "epoch": 1.2057780030410543, + "grad_norm": 2.458714012944509, + "learning_rate": 7.470986207225177e-06, + "loss": 0.0691, + "step": 1190 + }, + { + "epoch": 1.2078053725291436, + "grad_norm": 6.974450623572189, + "learning_rate": 7.460719592986562e-06, + "loss": 0.1845, + "step": 1192 + }, + { + "epoch": 1.2098327420172326, + "grad_norm": 6.90438791642449, + "learning_rate": 7.450439269754779e-06, + "loss": 0.1942, + "step": 1194 + }, + { + "epoch": 1.2118601115053218, + "grad_norm": 5.726576300835688, + "learning_rate": 7.440145294802859e-06, + "loss": 0.2449, + "step": 1196 + }, + { + "epoch": 1.213887480993411, + "grad_norm": 8.051644039321834, + "learning_rate": 7.429837725479897e-06, + "loss": 0.1784, + "step": 1198 + }, + { + "epoch": 1.2159148504815003, + "grad_norm": 2.489362227964109, + "learning_rate": 7.419516619210719e-06, + "loss": 0.0647, + "step": 1200 + }, + { + "epoch": 1.2179422199695895, + "grad_norm": 2.5261548434423737, + "learning_rate": 7.409182033495568e-06, + "loss": 0.0478, + "step": 1202 + }, + { + "epoch": 1.2199695894576787, + "grad_norm": 5.55099989011959, + "learning_rate": 7.398834025909783e-06, + "loss": 0.1975, + "step": 1204 + }, + { + "epoch": 1.221996958945768, + "grad_norm": 2.079370905609275, + "learning_rate": 7.388472654103479e-06, + "loss": 0.0702, + "step": 1206 + }, + { + "epoch": 1.224024328433857, + "grad_norm": 5.8495516340000835, + "learning_rate": 7.378097975801224e-06, + "loss": 0.3112, + "step": 1208 + }, + { + "epoch": 1.2260516979219462, + "grad_norm": 3.457566739327137, + "learning_rate": 7.367710048801715e-06, + "loss": 0.2142, + "step": 1210 + }, + { + "epoch": 1.2280790674100355, + "grad_norm": 1.129777333292035, + "learning_rate": 7.357308930977467e-06, + "loss": 0.0862, + "step": 1212 + }, + { + "epoch": 1.2301064368981247, + "grad_norm": 5.779832247255059, + "learning_rate": 7.346894680274474e-06, + "loss": 0.1765, + "step": 1214 + }, + { + "epoch": 1.232133806386214, + "grad_norm": 4.9994656722523905, + "learning_rate": 7.336467354711904e-06, + "loss": 0.2239, + "step": 1216 + }, + { + "epoch": 1.2341611758743032, + "grad_norm": 3.2244469173372923, + "learning_rate": 7.3260270123817586e-06, + "loss": 0.144, + "step": 1218 + }, + { + "epoch": 1.2361885453623922, + "grad_norm": 3.8854387200861864, + "learning_rate": 7.3155737114485615e-06, + "loss": 0.1267, + "step": 1220 + }, + { + "epoch": 1.2382159148504814, + "grad_norm": 2.3259158422727677, + "learning_rate": 7.30510751014903e-06, + "loss": 0.1359, + "step": 1222 + }, + { + "epoch": 1.2402432843385707, + "grad_norm": 3.0894026092760556, + "learning_rate": 7.2946284667917515e-06, + "loss": 0.1781, + "step": 1224 + }, + { + "epoch": 1.24227065382666, + "grad_norm": 8.347988976273328, + "learning_rate": 7.284136639756856e-06, + "loss": 0.1876, + "step": 1226 + }, + { + "epoch": 1.2442980233147491, + "grad_norm": 2.401241917659144, + "learning_rate": 7.273632087495698e-06, + "loss": 0.2119, + "step": 1228 + }, + { + "epoch": 1.2463253928028384, + "grad_norm": 2.517480005404283, + "learning_rate": 7.26311486853052e-06, + "loss": 0.0995, + "step": 1230 + }, + { + "epoch": 1.2483527622909274, + "grad_norm": 2.0008917623986333, + "learning_rate": 7.252585041454134e-06, + "loss": 0.0496, + "step": 1232 + }, + { + "epoch": 1.2503801317790169, + "grad_norm": 1.0276297495009383, + "learning_rate": 7.242042664929598e-06, + "loss": 0.1692, + "step": 1234 + }, + { + "epoch": 1.2524075012671059, + "grad_norm": 4.865928669315582, + "learning_rate": 7.231487797689879e-06, + "loss": 0.1215, + "step": 1236 + }, + { + "epoch": 1.254434870755195, + "grad_norm": 2.018226844303304, + "learning_rate": 7.220920498537533e-06, + "loss": 0.1925, + "step": 1238 + }, + { + "epoch": 1.2564622402432843, + "grad_norm": 3.223170973269808, + "learning_rate": 7.210340826344377e-06, + "loss": 0.1488, + "step": 1240 + }, + { + "epoch": 1.2584896097313736, + "grad_norm": 3.353263780396923, + "learning_rate": 7.199748840051159e-06, + "loss": 0.0643, + "step": 1242 + }, + { + "epoch": 1.2605169792194628, + "grad_norm": 3.843060093171026, + "learning_rate": 7.189144598667231e-06, + "loss": 0.1471, + "step": 1244 + }, + { + "epoch": 1.2625443487075518, + "grad_norm": 1.4437108328205905, + "learning_rate": 7.17852816127022e-06, + "loss": 0.0664, + "step": 1246 + }, + { + "epoch": 1.2645717181956413, + "grad_norm": 0.2730373321602434, + "learning_rate": 7.167899587005698e-06, + "loss": 0.0475, + "step": 1248 + }, + { + "epoch": 1.2665990876837303, + "grad_norm": 4.085866452116431, + "learning_rate": 7.1572589350868495e-06, + "loss": 0.1359, + "step": 1250 + }, + { + "epoch": 1.2686264571718195, + "grad_norm": 3.2696178215712886, + "learning_rate": 7.146606264794153e-06, + "loss": 0.1531, + "step": 1252 + }, + { + "epoch": 1.2706538266599088, + "grad_norm": 3.4115626295100636, + "learning_rate": 7.1359416354750365e-06, + "loss": 0.168, + "step": 1254 + }, + { + "epoch": 1.272681196147998, + "grad_norm": 5.270481878229893, + "learning_rate": 7.125265106543556e-06, + "loss": 0.1922, + "step": 1256 + }, + { + "epoch": 1.2747085656360873, + "grad_norm": 2.3686561344343953, + "learning_rate": 7.114576737480062e-06, + "loss": 0.2011, + "step": 1258 + }, + { + "epoch": 1.2767359351241763, + "grad_norm": 0.03475893050613353, + "learning_rate": 7.103876587830867e-06, + "loss": 0.0052, + "step": 1260 + }, + { + "epoch": 1.2787633046122655, + "grad_norm": 8.75383057419793, + "learning_rate": 7.093164717207914e-06, + "loss": 0.4584, + "step": 1262 + }, + { + "epoch": 1.2807906741003547, + "grad_norm": 4.775607453320004, + "learning_rate": 7.082441185288448e-06, + "loss": 0.2378, + "step": 1264 + }, + { + "epoch": 1.282818043588444, + "grad_norm": 4.743251998425376, + "learning_rate": 7.071706051814676e-06, + "loss": 0.137, + "step": 1266 + }, + { + "epoch": 1.2848454130765332, + "grad_norm": 2.2895806011089137, + "learning_rate": 7.060959376593444e-06, + "loss": 0.1319, + "step": 1268 + }, + { + "epoch": 1.2868727825646225, + "grad_norm": 3.4560774430215884, + "learning_rate": 7.050201219495892e-06, + "loss": 0.1602, + "step": 1270 + }, + { + "epoch": 1.2889001520527117, + "grad_norm": 3.9461420801262044, + "learning_rate": 7.039431640457137e-06, + "loss": 0.1909, + "step": 1272 + }, + { + "epoch": 1.2909275215408007, + "grad_norm": 2.7671502392111815, + "learning_rate": 7.02865069947592e-06, + "loss": 0.129, + "step": 1274 + }, + { + "epoch": 1.29295489102889, + "grad_norm": 3.852962397784081, + "learning_rate": 7.017858456614284e-06, + "loss": 0.0635, + "step": 1276 + }, + { + "epoch": 1.2949822605169792, + "grad_norm": 2.8680194298002673, + "learning_rate": 7.007054971997236e-06, + "loss": 0.1534, + "step": 1278 + }, + { + "epoch": 1.2970096300050684, + "grad_norm": 8.434469500823162, + "learning_rate": 6.996240305812414e-06, + "loss": 0.1927, + "step": 1280 + }, + { + "epoch": 1.2990369994931577, + "grad_norm": 2.3529207589639696, + "learning_rate": 6.985414518309748e-06, + "loss": 0.1508, + "step": 1282 + }, + { + "epoch": 1.3010643689812469, + "grad_norm": 1.0662907317139942, + "learning_rate": 6.974577669801126e-06, + "loss": 0.0686, + "step": 1284 + }, + { + "epoch": 1.3030917384693361, + "grad_norm": 3.1147902738532562, + "learning_rate": 6.9637298206600615e-06, + "loss": 0.1095, + "step": 1286 + }, + { + "epoch": 1.3051191079574251, + "grad_norm": 6.688419304910197, + "learning_rate": 6.952871031321351e-06, + "loss": 0.1829, + "step": 1288 + }, + { + "epoch": 1.3071464774455144, + "grad_norm": 5.243765916376045, + "learning_rate": 6.942001362280739e-06, + "loss": 0.2812, + "step": 1290 + }, + { + "epoch": 1.3091738469336036, + "grad_norm": 3.6451096604741875, + "learning_rate": 6.931120874094587e-06, + "loss": 0.1206, + "step": 1292 + }, + { + "epoch": 1.3112012164216929, + "grad_norm": 5.6803618897375925, + "learning_rate": 6.920229627379529e-06, + "loss": 0.1927, + "step": 1294 + }, + { + "epoch": 1.313228585909782, + "grad_norm": 3.243687744843865, + "learning_rate": 6.909327682812132e-06, + "loss": 0.0857, + "step": 1296 + }, + { + "epoch": 1.3152559553978713, + "grad_norm": 4.6695155223458755, + "learning_rate": 6.898415101128571e-06, + "loss": 0.2788, + "step": 1298 + }, + { + "epoch": 1.3172833248859606, + "grad_norm": 3.742146888948744, + "learning_rate": 6.887491943124272e-06, + "loss": 0.1964, + "step": 1300 + }, + { + "epoch": 1.3193106943740496, + "grad_norm": 2.733472528295039, + "learning_rate": 6.876558269653592e-06, + "loss": 0.2049, + "step": 1302 + }, + { + "epoch": 1.3213380638621388, + "grad_norm": 2.3603710377375955, + "learning_rate": 6.865614141629466e-06, + "loss": 0.1062, + "step": 1304 + }, + { + "epoch": 1.323365433350228, + "grad_norm": 2.6059707789387336, + "learning_rate": 6.8546596200230734e-06, + "loss": 0.1611, + "step": 1306 + }, + { + "epoch": 1.3253928028383173, + "grad_norm": 2.1240889540688075, + "learning_rate": 6.843694765863496e-06, + "loss": 0.1021, + "step": 1308 + }, + { + "epoch": 1.3274201723264065, + "grad_norm": 4.102276826319042, + "learning_rate": 6.832719640237383e-06, + "loss": 0.1719, + "step": 1310 + }, + { + "epoch": 1.3294475418144958, + "grad_norm": 7.328965028569066, + "learning_rate": 6.8217343042886055e-06, + "loss": 0.3271, + "step": 1312 + }, + { + "epoch": 1.331474911302585, + "grad_norm": 11.792006564400545, + "learning_rate": 6.810738819217918e-06, + "loss": 0.1557, + "step": 1314 + }, + { + "epoch": 1.333502280790674, + "grad_norm": 4.384846525975929, + "learning_rate": 6.7997332462826185e-06, + "loss": 0.0938, + "step": 1316 + }, + { + "epoch": 1.3355296502787632, + "grad_norm": 1.9821813471356953, + "learning_rate": 6.788717646796201e-06, + "loss": 0.2598, + "step": 1318 + }, + { + "epoch": 1.3375570197668525, + "grad_norm": 1.4814329078040722, + "learning_rate": 6.777692082128024e-06, + "loss": 0.1645, + "step": 1320 + }, + { + "epoch": 1.3395843892549417, + "grad_norm": 3.967120749322282, + "learning_rate": 6.766656613702963e-06, + "loss": 0.3621, + "step": 1322 + }, + { + "epoch": 1.341611758743031, + "grad_norm": 2.128016314811634, + "learning_rate": 6.755611303001066e-06, + "loss": 0.1616, + "step": 1324 + }, + { + "epoch": 1.3436391282311202, + "grad_norm": 4.591135943011197, + "learning_rate": 6.7445562115572116e-06, + "loss": 0.0953, + "step": 1326 + }, + { + "epoch": 1.3456664977192094, + "grad_norm": 5.843297611544583, + "learning_rate": 6.733491400960774e-06, + "loss": 0.0826, + "step": 1328 + }, + { + "epoch": 1.3476938672072984, + "grad_norm": 6.391128076172723, + "learning_rate": 6.722416932855272e-06, + "loss": 0.148, + "step": 1330 + }, + { + "epoch": 1.3497212366953877, + "grad_norm": 4.477822865233633, + "learning_rate": 6.711332868938028e-06, + "loss": 0.1608, + "step": 1332 + }, + { + "epoch": 1.351748606183477, + "grad_norm": 1.844046718426473, + "learning_rate": 6.700239270959818e-06, + "loss": 0.2658, + "step": 1334 + }, + { + "epoch": 1.3537759756715662, + "grad_norm": 3.615528135296258, + "learning_rate": 6.689136200724543e-06, + "loss": 0.0842, + "step": 1336 + }, + { + "epoch": 1.3558033451596554, + "grad_norm": 3.603650565462502, + "learning_rate": 6.678023720088869e-06, + "loss": 0.2725, + "step": 1338 + }, + { + "epoch": 1.3578307146477446, + "grad_norm": 4.407460619087083, + "learning_rate": 6.666901890961891e-06, + "loss": 0.1966, + "step": 1340 + }, + { + "epoch": 1.3598580841358339, + "grad_norm": 4.06893259210789, + "learning_rate": 6.655770775304783e-06, + "loss": 0.1954, + "step": 1342 + }, + { + "epoch": 1.3618854536239229, + "grad_norm": 1.9178227557356118, + "learning_rate": 6.6446304351304594e-06, + "loss": 0.0513, + "step": 1344 + }, + { + "epoch": 1.3639128231120121, + "grad_norm": 3.624327004781614, + "learning_rate": 6.633480932503222e-06, + "loss": 0.1765, + "step": 1346 + }, + { + "epoch": 1.3659401926001014, + "grad_norm": 4.221569671430079, + "learning_rate": 6.622322329538421e-06, + "loss": 0.221, + "step": 1348 + }, + { + "epoch": 1.3679675620881906, + "grad_norm": 3.5263054310758095, + "learning_rate": 6.611154688402104e-06, + "loss": 0.1959, + "step": 1350 + }, + { + "epoch": 1.3699949315762798, + "grad_norm": 3.095024031661221, + "learning_rate": 6.599978071310671e-06, + "loss": 0.1624, + "step": 1352 + }, + { + "epoch": 1.3720223010643688, + "grad_norm": 8.213065977556035, + "learning_rate": 6.588792540530529e-06, + "loss": 0.1503, + "step": 1354 + }, + { + "epoch": 1.3740496705524583, + "grad_norm": 3.571325884363285, + "learning_rate": 6.577598158377743e-06, + "loss": 0.1549, + "step": 1356 + }, + { + "epoch": 1.3760770400405473, + "grad_norm": 1.1797172289327251, + "learning_rate": 6.566394987217693e-06, + "loss": 0.1963, + "step": 1358 + }, + { + "epoch": 1.3781044095286366, + "grad_norm": 0.4720168331094363, + "learning_rate": 6.5551830894647216e-06, + "loss": 0.1852, + "step": 1360 + }, + { + "epoch": 1.3801317790167258, + "grad_norm": 3.286131600036055, + "learning_rate": 6.54396252758179e-06, + "loss": 0.1432, + "step": 1362 + }, + { + "epoch": 1.382159148504815, + "grad_norm": 3.8090679918620376, + "learning_rate": 6.532733364080126e-06, + "loss": 0.1414, + "step": 1364 + }, + { + "epoch": 1.3841865179929043, + "grad_norm": 5.282148494265528, + "learning_rate": 6.521495661518878e-06, + "loss": 0.2543, + "step": 1366 + }, + { + "epoch": 1.3862138874809933, + "grad_norm": 2.2795354557671983, + "learning_rate": 6.510249482504771e-06, + "loss": 0.2312, + "step": 1368 + }, + { + "epoch": 1.3882412569690827, + "grad_norm": 3.3811968908973453, + "learning_rate": 6.49899488969175e-06, + "loss": 0.0571, + "step": 1370 + }, + { + "epoch": 1.3902686264571718, + "grad_norm": 5.102555956028209, + "learning_rate": 6.487731945780633e-06, + "loss": 0.0696, + "step": 1372 + }, + { + "epoch": 1.392295995945261, + "grad_norm": 31.246176079306068, + "learning_rate": 6.476460713518766e-06, + "loss": 0.1323, + "step": 1374 + }, + { + "epoch": 1.3943233654333502, + "grad_norm": 2.382736366649651, + "learning_rate": 6.465181255699669e-06, + "loss": 0.138, + "step": 1376 + }, + { + "epoch": 1.3963507349214395, + "grad_norm": 9.29721138731628, + "learning_rate": 6.453893635162691e-06, + "loss": 0.2565, + "step": 1378 + }, + { + "epoch": 1.3983781044095287, + "grad_norm": 2.0237891546696405, + "learning_rate": 6.44259791479265e-06, + "loss": 0.1514, + "step": 1380 + }, + { + "epoch": 1.4004054738976177, + "grad_norm": 9.728023220260434, + "learning_rate": 6.431294157519495e-06, + "loss": 0.1972, + "step": 1382 + }, + { + "epoch": 1.4024328433857072, + "grad_norm": 4.544966119212154, + "learning_rate": 6.419982426317946e-06, + "loss": 0.3302, + "step": 1384 + }, + { + "epoch": 1.4044602128737962, + "grad_norm": 2.4581760518674076, + "learning_rate": 6.408662784207149e-06, + "loss": 0.0597, + "step": 1386 + }, + { + "epoch": 1.4064875823618854, + "grad_norm": 2.072469430551111, + "learning_rate": 6.397335294250323e-06, + "loss": 0.0855, + "step": 1388 + }, + { + "epoch": 1.4085149518499747, + "grad_norm": 4.298916021055219, + "learning_rate": 6.386000019554407e-06, + "loss": 0.0528, + "step": 1390 + }, + { + "epoch": 1.410542321338064, + "grad_norm": 6.089657822762162, + "learning_rate": 6.37465702326971e-06, + "loss": 0.1473, + "step": 1392 + }, + { + "epoch": 1.4125696908261531, + "grad_norm": 6.583706051789223, + "learning_rate": 6.3633063685895594e-06, + "loss": 0.3554, + "step": 1394 + }, + { + "epoch": 1.4145970603142421, + "grad_norm": 2.951889432993456, + "learning_rate": 6.351948118749949e-06, + "loss": 0.1072, + "step": 1396 + }, + { + "epoch": 1.4166244298023316, + "grad_norm": 2.6516006784534736, + "learning_rate": 6.340582337029184e-06, + "loss": 0.0936, + "step": 1398 + }, + { + "epoch": 1.4186517992904206, + "grad_norm": 5.166347530935865, + "learning_rate": 6.329209086747535e-06, + "loss": 0.2601, + "step": 1400 + }, + { + "epoch": 1.4206791687785099, + "grad_norm": 4.02605275866056, + "learning_rate": 6.317828431266876e-06, + "loss": 0.1756, + "step": 1402 + }, + { + "epoch": 1.422706538266599, + "grad_norm": 3.1592218026641232, + "learning_rate": 6.306440433990338e-06, + "loss": 0.1581, + "step": 1404 + }, + { + "epoch": 1.4247339077546883, + "grad_norm": 5.348431446201753, + "learning_rate": 6.295045158361958e-06, + "loss": 0.2208, + "step": 1406 + }, + { + "epoch": 1.4267612772427776, + "grad_norm": 3.2177903953601787, + "learning_rate": 6.283642667866317e-06, + "loss": 0.1003, + "step": 1408 + }, + { + "epoch": 1.4287886467308666, + "grad_norm": 4.428117807489889, + "learning_rate": 6.27223302602819e-06, + "loss": 0.1824, + "step": 1410 + }, + { + "epoch": 1.4308160162189558, + "grad_norm": 1.5265814466497352, + "learning_rate": 6.260816296412198e-06, + "loss": 0.067, + "step": 1412 + }, + { + "epoch": 1.432843385707045, + "grad_norm": 1.3805731475891885, + "learning_rate": 6.2493925426224465e-06, + "loss": 0.0255, + "step": 1414 + }, + { + "epoch": 1.4348707551951343, + "grad_norm": 4.357002644473959, + "learning_rate": 6.237961828302172e-06, + "loss": 0.2488, + "step": 1416 + }, + { + "epoch": 1.4368981246832235, + "grad_norm": 0.6022237187612426, + "learning_rate": 6.226524217133392e-06, + "loss": 0.0319, + "step": 1418 + }, + { + "epoch": 1.4389254941713128, + "grad_norm": 3.677449255076383, + "learning_rate": 6.215079772836544e-06, + "loss": 0.1413, + "step": 1420 + }, + { + "epoch": 1.440952863659402, + "grad_norm": 4.173901409368219, + "learning_rate": 6.203628559170137e-06, + "loss": 0.0695, + "step": 1422 + }, + { + "epoch": 1.442980233147491, + "grad_norm": 4.731133564900678, + "learning_rate": 6.192170639930392e-06, + "loss": 0.1063, + "step": 1424 + }, + { + "epoch": 1.4450076026355803, + "grad_norm": 6.976821857675296, + "learning_rate": 6.180706078950885e-06, + "loss": 0.1413, + "step": 1426 + }, + { + "epoch": 1.4470349721236695, + "grad_norm": 2.5056462604840273, + "learning_rate": 6.169234940102199e-06, + "loss": 0.1426, + "step": 1428 + }, + { + "epoch": 1.4490623416117587, + "grad_norm": 7.026303713718109, + "learning_rate": 6.157757287291557e-06, + "loss": 0.1624, + "step": 1430 + }, + { + "epoch": 1.451089711099848, + "grad_norm": 16.274068154351532, + "learning_rate": 6.146273184462479e-06, + "loss": 0.5465, + "step": 1432 + }, + { + "epoch": 1.4531170805879372, + "grad_norm": 3.824960829003429, + "learning_rate": 6.1347826955944135e-06, + "loss": 0.3357, + "step": 1434 + }, + { + "epoch": 1.4551444500760264, + "grad_norm": 2.2811078591996554, + "learning_rate": 6.123285884702389e-06, + "loss": 0.0751, + "step": 1436 + }, + { + "epoch": 1.4571718195641155, + "grad_norm": 4.6890573899482035, + "learning_rate": 6.111782815836654e-06, + "loss": 0.1598, + "step": 1438 + }, + { + "epoch": 1.4591991890522047, + "grad_norm": 5.2687348481294505, + "learning_rate": 6.100273553082323e-06, + "loss": 0.1637, + "step": 1440 + }, + { + "epoch": 1.461226558540294, + "grad_norm": 4.0599419427953745, + "learning_rate": 6.088758160559012e-06, + "loss": 0.1251, + "step": 1442 + }, + { + "epoch": 1.4632539280283832, + "grad_norm": 2.8038016093544806, + "learning_rate": 6.077236702420493e-06, + "loss": 0.1844, + "step": 1444 + }, + { + "epoch": 1.4652812975164724, + "grad_norm": 2.268029688590044, + "learning_rate": 6.065709242854326e-06, + "loss": 0.1364, + "step": 1446 + }, + { + "epoch": 1.4673086670045616, + "grad_norm": 4.261946090169799, + "learning_rate": 6.054175846081511e-06, + "loss": 0.1498, + "step": 1448 + }, + { + "epoch": 1.4693360364926509, + "grad_norm": 4.498201305284697, + "learning_rate": 6.042636576356119e-06, + "loss": 0.1571, + "step": 1450 + }, + { + "epoch": 1.47136340598074, + "grad_norm": 3.280641832790457, + "learning_rate": 6.031091497964941e-06, + "loss": 0.1017, + "step": 1452 + }, + { + "epoch": 1.4733907754688291, + "grad_norm": 3.1992714169380316, + "learning_rate": 6.0195406752271336e-06, + "loss": 0.0904, + "step": 1454 + }, + { + "epoch": 1.4754181449569184, + "grad_norm": 6.012996270991794, + "learning_rate": 6.007984172493851e-06, + "loss": 0.0614, + "step": 1456 + }, + { + "epoch": 1.4774455144450076, + "grad_norm": 3.82934336320927, + "learning_rate": 5.996422054147895e-06, + "loss": 0.1319, + "step": 1458 + }, + { + "epoch": 1.4794728839330968, + "grad_norm": 6.95013573516929, + "learning_rate": 5.984854384603346e-06, + "loss": 0.1224, + "step": 1460 + }, + { + "epoch": 1.481500253421186, + "grad_norm": 2.4004207763472305, + "learning_rate": 5.973281228305219e-06, + "loss": 0.0476, + "step": 1462 + }, + { + "epoch": 1.4835276229092753, + "grad_norm": 12.864218667740657, + "learning_rate": 5.961702649729094e-06, + "loss": 0.2011, + "step": 1464 + }, + { + "epoch": 1.4855549923973643, + "grad_norm": 0.824411194236003, + "learning_rate": 5.950118713380757e-06, + "loss": 0.2361, + "step": 1466 + }, + { + "epoch": 1.4875823618854536, + "grad_norm": 5.734199512127657, + "learning_rate": 5.938529483795844e-06, + "loss": 0.2088, + "step": 1468 + }, + { + "epoch": 1.4896097313735428, + "grad_norm": 7.3077348804857705, + "learning_rate": 5.92693502553948e-06, + "loss": 0.0957, + "step": 1470 + }, + { + "epoch": 1.491637100861632, + "grad_norm": 5.29793871087098, + "learning_rate": 5.915335403205921e-06, + "loss": 0.1117, + "step": 1472 + }, + { + "epoch": 1.4936644703497213, + "grad_norm": 3.370848536042073, + "learning_rate": 5.903730681418191e-06, + "loss": 0.1309, + "step": 1474 + }, + { + "epoch": 1.4956918398378105, + "grad_norm": 4.079219634055649, + "learning_rate": 5.892120924827725e-06, + "loss": 0.1224, + "step": 1476 + }, + { + "epoch": 1.4977192093258997, + "grad_norm": 3.0874295448997335, + "learning_rate": 5.880506198114008e-06, + "loss": 0.1045, + "step": 1478 + }, + { + "epoch": 1.4997465788139888, + "grad_norm": 2.8939379696698286, + "learning_rate": 5.868886565984211e-06, + "loss": 0.0659, + "step": 1480 + }, + { + "epoch": 1.501773948302078, + "grad_norm": 21.670988057494622, + "learning_rate": 5.857262093172838e-06, + "loss": 0.253, + "step": 1482 + }, + { + "epoch": 1.5038013177901672, + "grad_norm": 7.033445831075649, + "learning_rate": 5.84563284444136e-06, + "loss": 0.3429, + "step": 1484 + }, + { + "epoch": 1.5058286872782565, + "grad_norm": 4.139668355302615, + "learning_rate": 5.833998884577852e-06, + "loss": 0.2709, + "step": 1486 + }, + { + "epoch": 1.5078560567663457, + "grad_norm": 4.057929738238329, + "learning_rate": 5.822360278396639e-06, + "loss": 0.1233, + "step": 1488 + }, + { + "epoch": 1.5098834262544347, + "grad_norm": 2.180107349687981, + "learning_rate": 5.8107170907379275e-06, + "loss": 0.0933, + "step": 1490 + }, + { + "epoch": 1.5119107957425242, + "grad_norm": 8.0049779333162, + "learning_rate": 5.799069386467455e-06, + "loss": 0.3751, + "step": 1492 + }, + { + "epoch": 1.5139381652306132, + "grad_norm": 1.6185706506855442, + "learning_rate": 5.7874172304761146e-06, + "loss": 0.1527, + "step": 1494 + }, + { + "epoch": 1.5159655347187024, + "grad_norm": 5.235144634126123, + "learning_rate": 5.775760687679603e-06, + "loss": 0.13, + "step": 1496 + }, + { + "epoch": 1.5179929042067917, + "grad_norm": 4.429815792887033, + "learning_rate": 5.764099823018058e-06, + "loss": 0.1374, + "step": 1498 + }, + { + "epoch": 1.520020273694881, + "grad_norm": 7.794813901637103, + "learning_rate": 5.75243470145569e-06, + "loss": 0.4024, + "step": 1500 + }, + { + "epoch": 1.5220476431829701, + "grad_norm": 8.32342349604352, + "learning_rate": 5.740765387980432e-06, + "loss": 0.1211, + "step": 1502 + }, + { + "epoch": 1.5240750126710592, + "grad_norm": 2.2164703994040154, + "learning_rate": 5.729091947603566e-06, + "loss": 0.108, + "step": 1504 + }, + { + "epoch": 1.5261023821591486, + "grad_norm": 2.3221386310457897, + "learning_rate": 5.717414445359368e-06, + "loss": 0.08, + "step": 1506 + }, + { + "epoch": 1.5281297516472376, + "grad_norm": 4.699309622963559, + "learning_rate": 5.70573294630474e-06, + "loss": 0.3477, + "step": 1508 + }, + { + "epoch": 1.5301571211353269, + "grad_norm": 4.2106486417179285, + "learning_rate": 5.694047515518853e-06, + "loss": 0.1587, + "step": 1510 + }, + { + "epoch": 1.532184490623416, + "grad_norm": 2.8798557827030105, + "learning_rate": 5.682358218102786e-06, + "loss": 0.1672, + "step": 1512 + }, + { + "epoch": 1.5342118601115053, + "grad_norm": 2.4667679523355988, + "learning_rate": 5.670665119179151e-06, + "loss": 0.2856, + "step": 1514 + }, + { + "epoch": 1.5362392295995946, + "grad_norm": 2.0567273839375817, + "learning_rate": 5.658968283891745e-06, + "loss": 0.1565, + "step": 1516 + }, + { + "epoch": 1.5382665990876836, + "grad_norm": 2.9306379645816705, + "learning_rate": 5.647267777405178e-06, + "loss": 0.2535, + "step": 1518 + }, + { + "epoch": 1.540293968575773, + "grad_norm": 2.044676767036829, + "learning_rate": 5.6355636649045145e-06, + "loss": 0.1006, + "step": 1520 + }, + { + "epoch": 1.542321338063862, + "grad_norm": 7.050430698187679, + "learning_rate": 5.623856011594908e-06, + "loss": 0.1823, + "step": 1522 + }, + { + "epoch": 1.5443487075519513, + "grad_norm": 1.9590913324846015, + "learning_rate": 5.612144882701239e-06, + "loss": 0.0669, + "step": 1524 + }, + { + "epoch": 1.5463760770400405, + "grad_norm": 2.3064458427397323, + "learning_rate": 5.60043034346775e-06, + "loss": 0.0594, + "step": 1526 + }, + { + "epoch": 1.5484034465281298, + "grad_norm": 2.7061457219360863, + "learning_rate": 5.588712459157679e-06, + "loss": 0.1837, + "step": 1528 + }, + { + "epoch": 1.550430816016219, + "grad_norm": 3.6623304666581227, + "learning_rate": 5.5769912950529095e-06, + "loss": 0.0998, + "step": 1530 + }, + { + "epoch": 1.552458185504308, + "grad_norm": 3.535938502272535, + "learning_rate": 5.565266916453589e-06, + "loss": 0.148, + "step": 1532 + }, + { + "epoch": 1.5544855549923975, + "grad_norm": 7.4763717689327995, + "learning_rate": 5.553539388677773e-06, + "loss": 0.2007, + "step": 1534 + }, + { + "epoch": 1.5565129244804865, + "grad_norm": 2.870601762721744, + "learning_rate": 5.541808777061071e-06, + "loss": 0.1337, + "step": 1536 + }, + { + "epoch": 1.5585402939685757, + "grad_norm": 4.101207782023873, + "learning_rate": 5.53007514695626e-06, + "loss": 0.1105, + "step": 1538 + }, + { + "epoch": 1.560567663456665, + "grad_norm": 3.95428603371787, + "learning_rate": 5.518338563732945e-06, + "loss": 0.1596, + "step": 1540 + }, + { + "epoch": 1.5625950329447542, + "grad_norm": 3.8275480687633614, + "learning_rate": 5.506599092777174e-06, + "loss": 0.1643, + "step": 1542 + }, + { + "epoch": 1.5646224024328435, + "grad_norm": 1.6565110581664757, + "learning_rate": 5.494856799491089e-06, + "loss": 0.04, + "step": 1544 + }, + { + "epoch": 1.5666497719209325, + "grad_norm": 3.3181706468126606, + "learning_rate": 5.483111749292551e-06, + "loss": 0.0884, + "step": 1546 + }, + { + "epoch": 1.568677141409022, + "grad_norm": 3.1005759998005784, + "learning_rate": 5.471364007614785e-06, + "loss": 0.0999, + "step": 1548 + }, + { + "epoch": 1.570704510897111, + "grad_norm": 10.195450840459827, + "learning_rate": 5.45961363990601e-06, + "loss": 0.1216, + "step": 1550 + }, + { + "epoch": 1.5727318803852002, + "grad_norm": 8.993553643221581, + "learning_rate": 5.4478607116290705e-06, + "loss": 0.2583, + "step": 1552 + }, + { + "epoch": 1.5747592498732894, + "grad_norm": 2.772222570964442, + "learning_rate": 5.43610528826108e-06, + "loss": 0.1032, + "step": 1554 + }, + { + "epoch": 1.5767866193613786, + "grad_norm": 5.180041346413565, + "learning_rate": 5.424347435293048e-06, + "loss": 0.1029, + "step": 1556 + }, + { + "epoch": 1.5788139888494679, + "grad_norm": 3.3097164347964516, + "learning_rate": 5.412587218229528e-06, + "loss": 0.1814, + "step": 1558 + }, + { + "epoch": 1.580841358337557, + "grad_norm": 1.870941295263354, + "learning_rate": 5.400824702588237e-06, + "loss": 0.113, + "step": 1560 + }, + { + "epoch": 1.5828687278256464, + "grad_norm": 2.1414532723103203, + "learning_rate": 5.3890599538996994e-06, + "loss": 0.0773, + "step": 1562 + }, + { + "epoch": 1.5848960973137354, + "grad_norm": 3.6225621380495476, + "learning_rate": 5.377293037706882e-06, + "loss": 0.1262, + "step": 1564 + }, + { + "epoch": 1.5869234668018246, + "grad_norm": 1.2253379340979413, + "learning_rate": 5.365524019564825e-06, + "loss": 0.0793, + "step": 1566 + }, + { + "epoch": 1.5889508362899138, + "grad_norm": 3.534664792153492, + "learning_rate": 5.353752965040279e-06, + "loss": 0.156, + "step": 1568 + }, + { + "epoch": 1.590978205778003, + "grad_norm": 9.095108161751181, + "learning_rate": 5.341979939711342e-06, + "loss": 0.1075, + "step": 1570 + }, + { + "epoch": 1.5930055752660923, + "grad_norm": 6.175216370777391, + "learning_rate": 5.330205009167088e-06, + "loss": 0.1905, + "step": 1572 + }, + { + "epoch": 1.5950329447541813, + "grad_norm": 4.487152274708146, + "learning_rate": 5.3184282390072085e-06, + "loss": 0.0974, + "step": 1574 + }, + { + "epoch": 1.5970603142422708, + "grad_norm": 6.762099619770064, + "learning_rate": 5.306649694841639e-06, + "loss": 0.1797, + "step": 1576 + }, + { + "epoch": 1.5990876837303598, + "grad_norm": 8.584070177770256, + "learning_rate": 5.294869442290204e-06, + "loss": 0.2632, + "step": 1578 + }, + { + "epoch": 1.601115053218449, + "grad_norm": 2.325895461056052, + "learning_rate": 5.2830875469822455e-06, + "loss": 0.1089, + "step": 1580 + }, + { + "epoch": 1.6031424227065383, + "grad_norm": 5.923530174060992, + "learning_rate": 5.271304074556251e-06, + "loss": 0.2344, + "step": 1582 + }, + { + "epoch": 1.6051697921946275, + "grad_norm": 4.906792815744144, + "learning_rate": 5.2595190906595e-06, + "loss": 0.1997, + "step": 1584 + }, + { + "epoch": 1.6071971616827168, + "grad_norm": 3.2129524908136076, + "learning_rate": 5.247732660947689e-06, + "loss": 0.1699, + "step": 1586 + }, + { + "epoch": 1.6092245311708058, + "grad_norm": 2.7239720199002546, + "learning_rate": 5.235944851084576e-06, + "loss": 0.0706, + "step": 1588 + }, + { + "epoch": 1.6112519006588952, + "grad_norm": 3.59015811171391, + "learning_rate": 5.2241557267416e-06, + "loss": 0.108, + "step": 1590 + }, + { + "epoch": 1.6132792701469842, + "grad_norm": 5.900149082353774, + "learning_rate": 5.212365353597525e-06, + "loss": 0.2588, + "step": 1592 + }, + { + "epoch": 1.6153066396350735, + "grad_norm": 3.640293161615437, + "learning_rate": 5.2005737973380775e-06, + "loss": 0.1102, + "step": 1594 + }, + { + "epoch": 1.6173340091231627, + "grad_norm": 5.650793602436028, + "learning_rate": 5.18878112365557e-06, + "loss": 0.2715, + "step": 1596 + }, + { + "epoch": 1.619361378611252, + "grad_norm": 4.628508775429804, + "learning_rate": 5.176987398248541e-06, + "loss": 0.1024, + "step": 1598 + }, + { + "epoch": 1.6213887480993412, + "grad_norm": 2.4156447711265137, + "learning_rate": 5.165192686821391e-06, + "loss": 0.1483, + "step": 1600 + }, + { + "epoch": 1.6234161175874302, + "grad_norm": 2.839026889866633, + "learning_rate": 5.153397055084009e-06, + "loss": 0.1824, + "step": 1602 + }, + { + "epoch": 1.6254434870755197, + "grad_norm": 1.8057746103859693, + "learning_rate": 5.141600568751416e-06, + "loss": 0.1296, + "step": 1604 + }, + { + "epoch": 1.6274708565636087, + "grad_norm": 3.8511373649804437, + "learning_rate": 5.1298032935433915e-06, + "loss": 0.1408, + "step": 1606 + }, + { + "epoch": 1.629498226051698, + "grad_norm": 7.0686236116423355, + "learning_rate": 5.118005295184112e-06, + "loss": 0.1722, + "step": 1608 + }, + { + "epoch": 1.6315255955397872, + "grad_norm": 7.119914716719147, + "learning_rate": 5.10620663940178e-06, + "loss": 0.1007, + "step": 1610 + }, + { + "epoch": 1.6335529650278762, + "grad_norm": 3.3655620912237065, + "learning_rate": 5.094407391928262e-06, + "loss": 0.2385, + "step": 1612 + }, + { + "epoch": 1.6355803345159656, + "grad_norm": 1.4412971488212618, + "learning_rate": 5.082607618498721e-06, + "loss": 0.0892, + "step": 1614 + }, + { + "epoch": 1.6376077040040546, + "grad_norm": 4.519989407413056, + "learning_rate": 5.07080738485125e-06, + "loss": 0.1277, + "step": 1616 + }, + { + "epoch": 1.639635073492144, + "grad_norm": 2.391278534302831, + "learning_rate": 5.059006756726506e-06, + "loss": 0.195, + "step": 1618 + }, + { + "epoch": 1.6416624429802331, + "grad_norm": 3.807942432644943, + "learning_rate": 5.0472057998673415e-06, + "loss": 0.2373, + "step": 1620 + }, + { + "epoch": 1.6436898124683224, + "grad_norm": 3.8508452944575273, + "learning_rate": 5.035404580018446e-06, + "loss": 0.1914, + "step": 1622 + }, + { + "epoch": 1.6457171819564116, + "grad_norm": 11.587143256132578, + "learning_rate": 5.023603162925967e-06, + "loss": 0.1802, + "step": 1624 + }, + { + "epoch": 1.6477445514445006, + "grad_norm": 2.979314355479905, + "learning_rate": 5.011801614337158e-06, + "loss": 0.1178, + "step": 1626 + }, + { + "epoch": 1.64977192093259, + "grad_norm": 2.0970616838797436, + "learning_rate": 5e-06, + "loss": 0.0949, + "step": 1628 + }, + { + "epoch": 1.651799290420679, + "grad_norm": 6.62237352247369, + "learning_rate": 4.988198385662842e-06, + "loss": 0.4035, + "step": 1630 + }, + { + "epoch": 1.6538266599087685, + "grad_norm": 0.9990647539211209, + "learning_rate": 4.976396837074035e-06, + "loss": 0.2671, + "step": 1632 + }, + { + "epoch": 1.6558540293968576, + "grad_norm": 4.1256835806555765, + "learning_rate": 4.964595419981556e-06, + "loss": 0.2117, + "step": 1634 + }, + { + "epoch": 1.6578813988849468, + "grad_norm": 4.461463854455266, + "learning_rate": 4.95279420013266e-06, + "loss": 0.2944, + "step": 1636 + }, + { + "epoch": 1.659908768373036, + "grad_norm": 7.137227419111255, + "learning_rate": 4.940993243273497e-06, + "loss": 0.1521, + "step": 1638 + }, + { + "epoch": 1.661936137861125, + "grad_norm": 6.3851566114624765, + "learning_rate": 4.929192615148753e-06, + "loss": 0.1036, + "step": 1640 + }, + { + "epoch": 1.6639635073492145, + "grad_norm": 5.535536875628756, + "learning_rate": 4.917392381501281e-06, + "loss": 0.2618, + "step": 1642 + }, + { + "epoch": 1.6659908768373035, + "grad_norm": 4.863795288611755, + "learning_rate": 4.905592608071739e-06, + "loss": 0.1214, + "step": 1644 + }, + { + "epoch": 1.6680182463253928, + "grad_norm": 3.490736528183799, + "learning_rate": 4.893793360598221e-06, + "loss": 0.117, + "step": 1646 + }, + { + "epoch": 1.670045615813482, + "grad_norm": 2.033326654988042, + "learning_rate": 4.881994704815889e-06, + "loss": 0.1014, + "step": 1648 + }, + { + "epoch": 1.6720729853015712, + "grad_norm": 6.133714178297199, + "learning_rate": 4.870196706456609e-06, + "loss": 0.3596, + "step": 1650 + }, + { + "epoch": 1.6741003547896605, + "grad_norm": 6.872379463529285, + "learning_rate": 4.858399431248586e-06, + "loss": 0.1819, + "step": 1652 + }, + { + "epoch": 1.6761277242777495, + "grad_norm": 1.8342525640426168, + "learning_rate": 4.846602944915993e-06, + "loss": 0.0681, + "step": 1654 + }, + { + "epoch": 1.678155093765839, + "grad_norm": 0.6183626233965518, + "learning_rate": 4.834807313178611e-06, + "loss": 0.0341, + "step": 1656 + }, + { + "epoch": 1.680182463253928, + "grad_norm": 5.360853825141301, + "learning_rate": 4.823012601751459e-06, + "loss": 0.1679, + "step": 1658 + }, + { + "epoch": 1.6822098327420172, + "grad_norm": 3.5464897488905414, + "learning_rate": 4.8112188763444325e-06, + "loss": 0.1263, + "step": 1660 + }, + { + "epoch": 1.6842372022301064, + "grad_norm": 1.3602976926729997, + "learning_rate": 4.799426202661923e-06, + "loss": 0.154, + "step": 1662 + }, + { + "epoch": 1.6862645717181957, + "grad_norm": 4.4190148240312315, + "learning_rate": 4.7876346464024756e-06, + "loss": 0.0887, + "step": 1664 + }, + { + "epoch": 1.688291941206285, + "grad_norm": 0.9132426727246168, + "learning_rate": 4.775844273258402e-06, + "loss": 0.0961, + "step": 1666 + }, + { + "epoch": 1.690319310694374, + "grad_norm": 2.5701549416317593, + "learning_rate": 4.764055148915427e-06, + "loss": 0.0984, + "step": 1668 + }, + { + "epoch": 1.6923466801824634, + "grad_norm": 2.017714398541095, + "learning_rate": 4.7522673390523115e-06, + "loss": 0.1736, + "step": 1670 + }, + { + "epoch": 1.6943740496705524, + "grad_norm": 2.412972926248841, + "learning_rate": 4.740480909340502e-06, + "loss": 0.0606, + "step": 1672 + }, + { + "epoch": 1.6964014191586416, + "grad_norm": 0.7119620515584475, + "learning_rate": 4.728695925443751e-06, + "loss": 0.0887, + "step": 1674 + }, + { + "epoch": 1.6984287886467309, + "grad_norm": 5.366191797064768, + "learning_rate": 4.716912453017755e-06, + "loss": 0.0802, + "step": 1676 + }, + { + "epoch": 1.70045615813482, + "grad_norm": 7.265048097963709, + "learning_rate": 4.7051305577097965e-06, + "loss": 0.1643, + "step": 1678 + }, + { + "epoch": 1.7024835276229093, + "grad_norm": 2.6451643483802183, + "learning_rate": 4.693350305158362e-06, + "loss": 0.0525, + "step": 1680 + }, + { + "epoch": 1.7045108971109983, + "grad_norm": 4.093654303113327, + "learning_rate": 4.681571760992795e-06, + "loss": 0.1898, + "step": 1682 + }, + { + "epoch": 1.7065382665990878, + "grad_norm": 2.9055086562067256, + "learning_rate": 4.669794990832913e-06, + "loss": 0.0925, + "step": 1684 + }, + { + "epoch": 1.7085656360871768, + "grad_norm": 0.6569772438486178, + "learning_rate": 4.658020060288661e-06, + "loss": 0.0352, + "step": 1686 + }, + { + "epoch": 1.710593005575266, + "grad_norm": 4.442061063513772, + "learning_rate": 4.646247034959722e-06, + "loss": 0.2197, + "step": 1688 + }, + { + "epoch": 1.7126203750633553, + "grad_norm": 2.2704378861629317, + "learning_rate": 4.634475980435177e-06, + "loss": 0.1941, + "step": 1690 + }, + { + "epoch": 1.7146477445514445, + "grad_norm": 3.4018796844583172, + "learning_rate": 4.622706962293119e-06, + "loss": 0.098, + "step": 1692 + }, + { + "epoch": 1.7166751140395338, + "grad_norm": 2.7831608552806215, + "learning_rate": 4.6109400461003005e-06, + "loss": 0.1335, + "step": 1694 + }, + { + "epoch": 1.7187024835276228, + "grad_norm": 1.1726342647815469, + "learning_rate": 4.599175297411765e-06, + "loss": 0.1614, + "step": 1696 + }, + { + "epoch": 1.7207298530157122, + "grad_norm": 4.690916907206021, + "learning_rate": 4.587412781770473e-06, + "loss": 0.1316, + "step": 1698 + }, + { + "epoch": 1.7227572225038013, + "grad_norm": 2.9631677256766182, + "learning_rate": 4.575652564706953e-06, + "loss": 0.1028, + "step": 1700 + }, + { + "epoch": 1.7247845919918905, + "grad_norm": 3.4387029091329433, + "learning_rate": 4.563894711738922e-06, + "loss": 0.1434, + "step": 1702 + }, + { + "epoch": 1.7268119614799797, + "grad_norm": 2.881057298259871, + "learning_rate": 4.5521392883709295e-06, + "loss": 0.154, + "step": 1704 + }, + { + "epoch": 1.728839330968069, + "grad_norm": 3.5067272126616094, + "learning_rate": 4.54038636009399e-06, + "loss": 0.0919, + "step": 1706 + }, + { + "epoch": 1.7308667004561582, + "grad_norm": 2.351736416379893, + "learning_rate": 4.528635992385214e-06, + "loss": 0.0896, + "step": 1708 + }, + { + "epoch": 1.7328940699442472, + "grad_norm": 1.6870164000825347, + "learning_rate": 4.51688825070745e-06, + "loss": 0.0934, + "step": 1710 + }, + { + "epoch": 1.7349214394323367, + "grad_norm": 2.2603835614736036, + "learning_rate": 4.505143200508912e-06, + "loss": 0.1157, + "step": 1712 + }, + { + "epoch": 1.7369488089204257, + "grad_norm": 4.5857355044944175, + "learning_rate": 4.493400907222828e-06, + "loss": 0.1931, + "step": 1714 + }, + { + "epoch": 1.738976178408515, + "grad_norm": 1.39811510628281, + "learning_rate": 4.481661436267058e-06, + "loss": 0.1084, + "step": 1716 + }, + { + "epoch": 1.7410035478966042, + "grad_norm": 5.024705493542474, + "learning_rate": 4.46992485304374e-06, + "loss": 0.1666, + "step": 1718 + }, + { + "epoch": 1.7430309173846934, + "grad_norm": 3.476303051274454, + "learning_rate": 4.458191222938931e-06, + "loss": 0.1279, + "step": 1720 + }, + { + "epoch": 1.7450582868727826, + "grad_norm": 1.1367232156553524, + "learning_rate": 4.446460611322227e-06, + "loss": 0.0641, + "step": 1722 + }, + { + "epoch": 1.7470856563608717, + "grad_norm": 1.5976561441493298, + "learning_rate": 4.434733083546414e-06, + "loss": 0.1074, + "step": 1724 + }, + { + "epoch": 1.7491130258489611, + "grad_norm": 2.3212135890814753, + "learning_rate": 4.423008704947092e-06, + "loss": 0.0546, + "step": 1726 + }, + { + "epoch": 1.7511403953370501, + "grad_norm": 2.5934476503896113, + "learning_rate": 4.4112875408423215e-06, + "loss": 0.0707, + "step": 1728 + }, + { + "epoch": 1.7531677648251394, + "grad_norm": 2.8014607935749853, + "learning_rate": 4.399569656532252e-06, + "loss": 0.1274, + "step": 1730 + }, + { + "epoch": 1.7551951343132286, + "grad_norm": 4.136506983474316, + "learning_rate": 4.38785511729876e-06, + "loss": 0.1003, + "step": 1732 + }, + { + "epoch": 1.7572225038013178, + "grad_norm": 1.7531044427967695, + "learning_rate": 4.376143988405093e-06, + "loss": 0.1666, + "step": 1734 + }, + { + "epoch": 1.759249873289407, + "grad_norm": 3.338861568912593, + "learning_rate": 4.364436335095486e-06, + "loss": 0.0613, + "step": 1736 + }, + { + "epoch": 1.761277242777496, + "grad_norm": 4.913121887324835, + "learning_rate": 4.352732222594823e-06, + "loss": 0.1649, + "step": 1738 + }, + { + "epoch": 1.7633046122655855, + "grad_norm": 8.898952212586156, + "learning_rate": 4.341031716108257e-06, + "loss": 0.1593, + "step": 1740 + }, + { + "epoch": 1.7653319817536746, + "grad_norm": 3.1056000507619883, + "learning_rate": 4.329334880820852e-06, + "loss": 0.2495, + "step": 1742 + }, + { + "epoch": 1.7673593512417638, + "grad_norm": 4.295403824511314, + "learning_rate": 4.317641781897216e-06, + "loss": 0.1657, + "step": 1744 + }, + { + "epoch": 1.769386720729853, + "grad_norm": 0.8765967525886204, + "learning_rate": 4.305952484481147e-06, + "loss": 0.0913, + "step": 1746 + }, + { + "epoch": 1.7714140902179423, + "grad_norm": 3.3142457455507026, + "learning_rate": 4.294267053695261e-06, + "loss": 0.1216, + "step": 1748 + }, + { + "epoch": 1.7734414597060315, + "grad_norm": 7.924605766931135, + "learning_rate": 4.282585554640633e-06, + "loss": 0.1298, + "step": 1750 + }, + { + "epoch": 1.7754688291941205, + "grad_norm": 3.1597914145004173, + "learning_rate": 4.270908052396436e-06, + "loss": 0.0898, + "step": 1752 + }, + { + "epoch": 1.77749619868221, + "grad_norm": 11.651616593793646, + "learning_rate": 4.2592346120195694e-06, + "loss": 0.0927, + "step": 1754 + }, + { + "epoch": 1.779523568170299, + "grad_norm": 7.507502618354595, + "learning_rate": 4.247565298544312e-06, + "loss": 0.2197, + "step": 1756 + }, + { + "epoch": 1.7815509376583882, + "grad_norm": 2.404829972638546, + "learning_rate": 4.2359001769819435e-06, + "loss": 0.0867, + "step": 1758 + }, + { + "epoch": 1.7835783071464775, + "grad_norm": 2.3022160017718214, + "learning_rate": 4.224239312320399e-06, + "loss": 0.0527, + "step": 1760 + }, + { + "epoch": 1.7856056766345665, + "grad_norm": 3.829588214128459, + "learning_rate": 4.212582769523886e-06, + "loss": 0.1941, + "step": 1762 + }, + { + "epoch": 1.787633046122656, + "grad_norm": 5.243433712738, + "learning_rate": 4.200930613532545e-06, + "loss": 0.3959, + "step": 1764 + }, + { + "epoch": 1.789660415610745, + "grad_norm": 4.6980527853967615, + "learning_rate": 4.189282909262073e-06, + "loss": 0.1319, + "step": 1766 + }, + { + "epoch": 1.7916877850988344, + "grad_norm": 2.32222738821634, + "learning_rate": 4.177639721603362e-06, + "loss": 0.1336, + "step": 1768 + }, + { + "epoch": 1.7937151545869234, + "grad_norm": 1.3606257862067037, + "learning_rate": 4.1660011154221506e-06, + "loss": 0.0602, + "step": 1770 + }, + { + "epoch": 1.7957425240750127, + "grad_norm": 2.722394342613752, + "learning_rate": 4.154367155558642e-06, + "loss": 0.0872, + "step": 1772 + }, + { + "epoch": 1.797769893563102, + "grad_norm": 1.7099269839860964, + "learning_rate": 4.142737906827164e-06, + "loss": 0.0965, + "step": 1774 + }, + { + "epoch": 1.799797263051191, + "grad_norm": 2.6370097653797986, + "learning_rate": 4.131113434015791e-06, + "loss": 0.0677, + "step": 1776 + }, + { + "epoch": 1.8018246325392804, + "grad_norm": 6.742138201546171, + "learning_rate": 4.119493801885994e-06, + "loss": 0.223, + "step": 1778 + }, + { + "epoch": 1.8038520020273694, + "grad_norm": 3.2409300333322757, + "learning_rate": 4.107879075172276e-06, + "loss": 0.1114, + "step": 1780 + }, + { + "epoch": 1.8058793715154589, + "grad_norm": 12.080462657287338, + "learning_rate": 4.09626931858181e-06, + "loss": 0.1605, + "step": 1782 + }, + { + "epoch": 1.8079067410035479, + "grad_norm": 2.9771613557865675, + "learning_rate": 4.0846645967940815e-06, + "loss": 0.1114, + "step": 1784 + }, + { + "epoch": 1.809934110491637, + "grad_norm": 3.4860574547768315, + "learning_rate": 4.073064974460522e-06, + "loss": 0.1078, + "step": 1786 + }, + { + "epoch": 1.8119614799797263, + "grad_norm": 8.87394483922418, + "learning_rate": 4.061470516204159e-06, + "loss": 0.3639, + "step": 1788 + }, + { + "epoch": 1.8139888494678154, + "grad_norm": 6.197208649220755, + "learning_rate": 4.049881286619245e-06, + "loss": 0.1699, + "step": 1790 + }, + { + "epoch": 1.8160162189559048, + "grad_norm": 2.0029987113086256, + "learning_rate": 4.038297350270906e-06, + "loss": 0.099, + "step": 1792 + }, + { + "epoch": 1.8180435884439938, + "grad_norm": 3.2423028904804236, + "learning_rate": 4.0267187716947825e-06, + "loss": 0.1783, + "step": 1794 + }, + { + "epoch": 1.820070957932083, + "grad_norm": 2.3468753247814162, + "learning_rate": 4.015145615396655e-06, + "loss": 0.1183, + "step": 1796 + }, + { + "epoch": 1.8220983274201723, + "grad_norm": 3.0123109373738712, + "learning_rate": 4.003577945852108e-06, + "loss": 0.1436, + "step": 1798 + }, + { + "epoch": 1.8241256969082615, + "grad_norm": 2.0251259378612736, + "learning_rate": 3.99201582750615e-06, + "loss": 0.0692, + "step": 1800 + }, + { + "epoch": 1.8261530663963508, + "grad_norm": 3.5207897049469468, + "learning_rate": 3.980459324772868e-06, + "loss": 0.1132, + "step": 1802 + }, + { + "epoch": 1.8281804358844398, + "grad_norm": 2.33024543276425, + "learning_rate": 3.96890850203506e-06, + "loss": 0.1802, + "step": 1804 + }, + { + "epoch": 1.8302078053725293, + "grad_norm": 3.199827159184038, + "learning_rate": 3.957363423643883e-06, + "loss": 0.0851, + "step": 1806 + }, + { + "epoch": 1.8322351748606183, + "grad_norm": 6.33556992465872, + "learning_rate": 3.945824153918491e-06, + "loss": 0.2742, + "step": 1808 + }, + { + "epoch": 1.8342625443487075, + "grad_norm": 0.8233596780827115, + "learning_rate": 3.9342907571456736e-06, + "loss": 0.0517, + "step": 1810 + }, + { + "epoch": 1.8362899138367967, + "grad_norm": 4.519439874531771, + "learning_rate": 3.9227632975795096e-06, + "loss": 0.1398, + "step": 1812 + }, + { + "epoch": 1.838317283324886, + "grad_norm": 2.4495037603105314, + "learning_rate": 3.9112418394409905e-06, + "loss": 0.1117, + "step": 1814 + }, + { + "epoch": 1.8403446528129752, + "grad_norm": 2.1543010005130703, + "learning_rate": 3.89972644691768e-06, + "loss": 0.2112, + "step": 1816 + }, + { + "epoch": 1.8423720223010642, + "grad_norm": 5.319600608236796, + "learning_rate": 3.888217184163347e-06, + "loss": 0.2579, + "step": 1818 + }, + { + "epoch": 1.8443993917891537, + "grad_norm": 3.024209024138264, + "learning_rate": 3.876714115297611e-06, + "loss": 0.2176, + "step": 1820 + }, + { + "epoch": 1.8464267612772427, + "grad_norm": 3.0338440475795214, + "learning_rate": 3.865217304405588e-06, + "loss": 0.169, + "step": 1822 + }, + { + "epoch": 1.848454130765332, + "grad_norm": 1.198423420304229, + "learning_rate": 3.853726815537522e-06, + "loss": 0.1089, + "step": 1824 + }, + { + "epoch": 1.8504815002534212, + "grad_norm": 2.8433984075173653, + "learning_rate": 3.842242712708444e-06, + "loss": 0.1277, + "step": 1826 + }, + { + "epoch": 1.8525088697415104, + "grad_norm": 3.9765532516660973, + "learning_rate": 3.830765059897803e-06, + "loss": 0.1177, + "step": 1828 + }, + { + "epoch": 1.8545362392295996, + "grad_norm": 10.734161506581456, + "learning_rate": 3.819293921049117e-06, + "loss": 0.222, + "step": 1830 + }, + { + "epoch": 1.8565636087176887, + "grad_norm": 6.3061844435389744, + "learning_rate": 3.8078293600696104e-06, + "loss": 0.0991, + "step": 1832 + }, + { + "epoch": 1.8585909782057781, + "grad_norm": 1.997594625410279, + "learning_rate": 3.7963714408298635e-06, + "loss": 0.1778, + "step": 1834 + }, + { + "epoch": 1.8606183476938671, + "grad_norm": 8.719623570499353, + "learning_rate": 3.784920227163457e-06, + "loss": 0.1057, + "step": 1836 + }, + { + "epoch": 1.8626457171819564, + "grad_norm": 6.940659312683173, + "learning_rate": 3.773475782866609e-06, + "loss": 0.2331, + "step": 1838 + }, + { + "epoch": 1.8646730866700456, + "grad_norm": 5.228295852021764, + "learning_rate": 3.76203817169783e-06, + "loss": 0.2471, + "step": 1840 + }, + { + "epoch": 1.8667004561581348, + "grad_norm": 3.5824400125965346, + "learning_rate": 3.750607457377555e-06, + "loss": 0.1377, + "step": 1842 + }, + { + "epoch": 1.868727825646224, + "grad_norm": 2.512815796021839, + "learning_rate": 3.739183703587803e-06, + "loss": 0.1274, + "step": 1844 + }, + { + "epoch": 1.870755195134313, + "grad_norm": 2.4164172770180246, + "learning_rate": 3.7277669739718107e-06, + "loss": 0.0695, + "step": 1846 + }, + { + "epoch": 1.8727825646224026, + "grad_norm": 1.7289282772850523, + "learning_rate": 3.7163573321336867e-06, + "loss": 0.1491, + "step": 1848 + }, + { + "epoch": 1.8748099341104916, + "grad_norm": 4.256924065521329, + "learning_rate": 3.7049548416380433e-06, + "loss": 0.1559, + "step": 1850 + }, + { + "epoch": 1.8768373035985808, + "grad_norm": 3.1757171772048687, + "learning_rate": 3.6935595660096623e-06, + "loss": 0.1455, + "step": 1852 + }, + { + "epoch": 1.87886467308667, + "grad_norm": 11.115012256097001, + "learning_rate": 3.682171568733126e-06, + "loss": 0.1053, + "step": 1854 + }, + { + "epoch": 1.8808920425747593, + "grad_norm": 66.81918640415712, + "learning_rate": 3.6707909132524663e-06, + "loss": 0.0847, + "step": 1856 + }, + { + "epoch": 1.8829194120628485, + "grad_norm": 2.7469117046897393, + "learning_rate": 3.659417662970818e-06, + "loss": 0.0987, + "step": 1858 + }, + { + "epoch": 1.8849467815509375, + "grad_norm": 4.184771173052866, + "learning_rate": 3.6480518812500527e-06, + "loss": 0.0771, + "step": 1860 + }, + { + "epoch": 1.886974151039027, + "grad_norm": 2.147515295097432, + "learning_rate": 3.6366936314104422e-06, + "loss": 0.201, + "step": 1862 + }, + { + "epoch": 1.889001520527116, + "grad_norm": 4.099786613052847, + "learning_rate": 3.625342976730291e-06, + "loss": 0.1796, + "step": 1864 + }, + { + "epoch": 1.8910288900152052, + "grad_norm": 3.0049704545960005, + "learning_rate": 3.6139999804455935e-06, + "loss": 0.1378, + "step": 1866 + }, + { + "epoch": 1.8930562595032945, + "grad_norm": 5.3685613224160065, + "learning_rate": 3.602664705749679e-06, + "loss": 0.1372, + "step": 1868 + }, + { + "epoch": 1.8950836289913837, + "grad_norm": 6.213208079743277, + "learning_rate": 3.5913372157928515e-06, + "loss": 0.2646, + "step": 1870 + }, + { + "epoch": 1.897110998479473, + "grad_norm": 5.8750942322172195, + "learning_rate": 3.5800175736820556e-06, + "loss": 0.1644, + "step": 1872 + }, + { + "epoch": 1.899138367967562, + "grad_norm": 8.43032082462515, + "learning_rate": 3.5687058424805066e-06, + "loss": 0.1357, + "step": 1874 + }, + { + "epoch": 1.9011657374556514, + "grad_norm": 4.61991316542933, + "learning_rate": 3.5574020852073523e-06, + "loss": 0.1376, + "step": 1876 + }, + { + "epoch": 1.9031931069437404, + "grad_norm": 2.065563542367868, + "learning_rate": 3.5461063648373107e-06, + "loss": 0.279, + "step": 1878 + }, + { + "epoch": 1.9052204764318297, + "grad_norm": 2.6313044830166294, + "learning_rate": 3.5348187443003306e-06, + "loss": 0.2156, + "step": 1880 + }, + { + "epoch": 1.907247845919919, + "grad_norm": 5.778073682683293, + "learning_rate": 3.5235392864812352e-06, + "loss": 0.1949, + "step": 1882 + }, + { + "epoch": 1.9092752154080082, + "grad_norm": 3.773388601080814, + "learning_rate": 3.5122680542193677e-06, + "loss": 0.1683, + "step": 1884 + }, + { + "epoch": 1.9113025848960974, + "grad_norm": 7.450613490309948, + "learning_rate": 3.501005110308252e-06, + "loss": 0.2302, + "step": 1886 + }, + { + "epoch": 1.9133299543841864, + "grad_norm": 4.244998976489211, + "learning_rate": 3.48975051749523e-06, + "loss": 0.0971, + "step": 1888 + }, + { + "epoch": 1.9153573238722759, + "grad_norm": 3.527956048265601, + "learning_rate": 3.4785043384811233e-06, + "loss": 0.0885, + "step": 1890 + }, + { + "epoch": 1.9173846933603649, + "grad_norm": 3.743769865584654, + "learning_rate": 3.4672666359198757e-06, + "loss": 0.093, + "step": 1892 + }, + { + "epoch": 1.9194120628484541, + "grad_norm": 3.7673519371929185, + "learning_rate": 3.4560374724182107e-06, + "loss": 0.1292, + "step": 1894 + }, + { + "epoch": 1.9214394323365434, + "grad_norm": 3.250186735430982, + "learning_rate": 3.4448169105352797e-06, + "loss": 0.087, + "step": 1896 + }, + { + "epoch": 1.9234668018246326, + "grad_norm": 3.261888369568477, + "learning_rate": 3.4336050127823078e-06, + "loss": 0.0889, + "step": 1898 + }, + { + "epoch": 1.9254941713127218, + "grad_norm": 0.7446545239076585, + "learning_rate": 3.422401841622258e-06, + "loss": 0.2078, + "step": 1900 + }, + { + "epoch": 1.9275215408008108, + "grad_norm": 5.139601436981795, + "learning_rate": 3.4112074594694724e-06, + "loss": 0.2088, + "step": 1902 + }, + { + "epoch": 1.9295489102889003, + "grad_norm": 4.33756578046134, + "learning_rate": 3.4000219286893315e-06, + "loss": 0.172, + "step": 1904 + }, + { + "epoch": 1.9315762797769893, + "grad_norm": 4.699073653339103, + "learning_rate": 3.388845311597897e-06, + "loss": 0.1266, + "step": 1906 + }, + { + "epoch": 1.9336036492650786, + "grad_norm": 4.211991004117766, + "learning_rate": 3.3776776704615787e-06, + "loss": 0.181, + "step": 1908 + }, + { + "epoch": 1.9356310187531678, + "grad_norm": 1.3323849021590488, + "learning_rate": 3.3665190674967794e-06, + "loss": 0.0541, + "step": 1910 + }, + { + "epoch": 1.937658388241257, + "grad_norm": 7.133921535512532, + "learning_rate": 3.355369564869541e-06, + "loss": 0.2435, + "step": 1912 + }, + { + "epoch": 1.9396857577293463, + "grad_norm": 2.6093792363167077, + "learning_rate": 3.344229224695219e-06, + "loss": 0.16, + "step": 1914 + }, + { + "epoch": 1.9417131272174353, + "grad_norm": 8.526126551099985, + "learning_rate": 3.333098109038111e-06, + "loss": 0.2337, + "step": 1916 + }, + { + "epoch": 1.9437404967055247, + "grad_norm": 6.381557918150761, + "learning_rate": 3.321976279911133e-06, + "loss": 0.279, + "step": 1918 + }, + { + "epoch": 1.9457678661936137, + "grad_norm": 2.265493082153073, + "learning_rate": 3.310863799275459e-06, + "loss": 0.1362, + "step": 1920 + }, + { + "epoch": 1.947795235681703, + "grad_norm": 1.3120979238135244, + "learning_rate": 3.2997607290401836e-06, + "loss": 0.0552, + "step": 1922 + }, + { + "epoch": 1.9498226051697922, + "grad_norm": 5.786764357464595, + "learning_rate": 3.2886671310619743e-06, + "loss": 0.1257, + "step": 1924 + }, + { + "epoch": 1.9518499746578812, + "grad_norm": 2.832753586181679, + "learning_rate": 3.2775830671447277e-06, + "loss": 0.1155, + "step": 1926 + }, + { + "epoch": 1.9538773441459707, + "grad_norm": 2.118829696340857, + "learning_rate": 3.2665085990392277e-06, + "loss": 0.1275, + "step": 1928 + }, + { + "epoch": 1.9559047136340597, + "grad_norm": 1.2251467175596302, + "learning_rate": 3.2554437884427897e-06, + "loss": 0.0982, + "step": 1930 + }, + { + "epoch": 1.9579320831221492, + "grad_norm": 4.210827030898681, + "learning_rate": 3.2443886969989368e-06, + "loss": 0.1226, + "step": 1932 + }, + { + "epoch": 1.9599594526102382, + "grad_norm": 7.460362447706585, + "learning_rate": 3.2333433862970376e-06, + "loss": 0.1115, + "step": 1934 + }, + { + "epoch": 1.9619868220983274, + "grad_norm": 6.226845317755583, + "learning_rate": 3.2223079178719775e-06, + "loss": 0.1877, + "step": 1936 + }, + { + "epoch": 1.9640141915864167, + "grad_norm": 5.075552763312393, + "learning_rate": 3.2112823532038007e-06, + "loss": 0.1899, + "step": 1938 + }, + { + "epoch": 1.9660415610745057, + "grad_norm": 5.096046193219227, + "learning_rate": 3.200266753717383e-06, + "loss": 0.1944, + "step": 1940 + }, + { + "epoch": 1.9680689305625951, + "grad_norm": 1.3537335771751087, + "learning_rate": 3.1892611807820828e-06, + "loss": 0.0705, + "step": 1942 + }, + { + "epoch": 1.9700963000506841, + "grad_norm": 0.9097387073344629, + "learning_rate": 3.1782656957113953e-06, + "loss": 0.0578, + "step": 1944 + }, + { + "epoch": 1.9721236695387736, + "grad_norm": 3.162892499232933, + "learning_rate": 3.1672803597626195e-06, + "loss": 0.1225, + "step": 1946 + }, + { + "epoch": 1.9741510390268626, + "grad_norm": 0.5299243554838863, + "learning_rate": 3.1563052341365064e-06, + "loss": 0.1845, + "step": 1948 + }, + { + "epoch": 1.9761784085149519, + "grad_norm": 5.273771374285927, + "learning_rate": 3.145340379976929e-06, + "loss": 0.1057, + "step": 1950 + }, + { + "epoch": 1.978205778003041, + "grad_norm": 2.3626069285688573, + "learning_rate": 3.1343858583705343e-06, + "loss": 0.0647, + "step": 1952 + }, + { + "epoch": 1.98023314749113, + "grad_norm": 3.9157263666605577, + "learning_rate": 3.1234417303464073e-06, + "loss": 0.1765, + "step": 1954 + }, + { + "epoch": 1.9822605169792196, + "grad_norm": 3.7056901158593045, + "learning_rate": 3.1125080568757286e-06, + "loss": 0.1412, + "step": 1956 + }, + { + "epoch": 1.9842878864673086, + "grad_norm": 4.34841501687879, + "learning_rate": 3.101584898871431e-06, + "loss": 0.2712, + "step": 1958 + }, + { + "epoch": 1.9863152559553978, + "grad_norm": 4.015874960553819, + "learning_rate": 3.0906723171878695e-06, + "loss": 0.2306, + "step": 1960 + }, + { + "epoch": 1.988342625443487, + "grad_norm": 4.347933685499096, + "learning_rate": 3.079770372620473e-06, + "loss": 0.0765, + "step": 1962 + }, + { + "epoch": 1.9903699949315763, + "grad_norm": 3.8112873775457916, + "learning_rate": 3.068879125905415e-06, + "loss": 0.1992, + "step": 1964 + }, + { + "epoch": 1.9923973644196655, + "grad_norm": 4.275102760803538, + "learning_rate": 3.057998637719263e-06, + "loss": 0.2459, + "step": 1966 + }, + { + "epoch": 1.9944247339077545, + "grad_norm": 4.371914785716363, + "learning_rate": 3.047128968678651e-06, + "loss": 0.1119, + "step": 1968 + }, + { + "epoch": 1.996452103395844, + "grad_norm": 3.706397034091306, + "learning_rate": 3.03627017933994e-06, + "loss": 0.0681, + "step": 1970 + }, + { + "epoch": 1.998479472883933, + "grad_norm": 2.505488665422743, + "learning_rate": 3.025422330198875e-06, + "loss": 0.0784, + "step": 1972 + }, + { + "epoch": 2.0, + "grad_norm": 3.2271040322932327, + "learning_rate": 3.014585481690255e-06, + "loss": 0.0902, + "step": 1974 + }, + { + "epoch": 2.002027369488089, + "grad_norm": 1.2545898729942557, + "learning_rate": 3.0037596941875878e-06, + "loss": 0.0291, + "step": 1976 + }, + { + "epoch": 2.0040547389761785, + "grad_norm": 0.8096852130320285, + "learning_rate": 2.992945028002766e-06, + "loss": 0.0269, + "step": 1978 + }, + { + "epoch": 2.0060821084642675, + "grad_norm": 0.19553016191859063, + "learning_rate": 2.9821415433857174e-06, + "loss": 0.0463, + "step": 1980 + }, + { + "epoch": 2.008109477952357, + "grad_norm": 1.1750959434511437, + "learning_rate": 2.971349300524081e-06, + "loss": 0.0588, + "step": 1982 + }, + { + "epoch": 2.010136847440446, + "grad_norm": 1.261266223132541, + "learning_rate": 2.960568359542865e-06, + "loss": 0.0242, + "step": 1984 + }, + { + "epoch": 2.0121642169285354, + "grad_norm": 0.5736063090654061, + "learning_rate": 2.9497987805041078e-06, + "loss": 0.0155, + "step": 1986 + }, + { + "epoch": 2.0141915864166244, + "grad_norm": 4.287161151747154, + "learning_rate": 2.9390406234065583e-06, + "loss": 0.0947, + "step": 1988 + }, + { + "epoch": 2.0162189559047135, + "grad_norm": 0.6611202002379905, + "learning_rate": 2.928293948185325e-06, + "loss": 0.0142, + "step": 1990 + }, + { + "epoch": 2.018246325392803, + "grad_norm": 3.6603886545558, + "learning_rate": 2.917558814711555e-06, + "loss": 0.0544, + "step": 1992 + }, + { + "epoch": 2.020273694880892, + "grad_norm": 1.13984669650971, + "learning_rate": 2.906835282792087e-06, + "loss": 0.0088, + "step": 1994 + }, + { + "epoch": 2.0223010643689814, + "grad_norm": 2.098026894335781, + "learning_rate": 2.8961234121691338e-06, + "loss": 0.0254, + "step": 1996 + }, + { + "epoch": 2.0243284338570704, + "grad_norm": 0.6087675954668671, + "learning_rate": 2.8854232625199396e-06, + "loss": 0.0306, + "step": 1998 + }, + { + "epoch": 2.02635580334516, + "grad_norm": 1.5118461400682832, + "learning_rate": 2.874734893456445e-06, + "loss": 0.0426, + "step": 2000 + }, + { + "epoch": 2.028383172833249, + "grad_norm": 1.2953574702258215, + "learning_rate": 2.8640583645249643e-06, + "loss": 0.0146, + "step": 2002 + }, + { + "epoch": 2.030410542321338, + "grad_norm": 2.349919396449281, + "learning_rate": 2.853393735205847e-06, + "loss": 0.0868, + "step": 2004 + }, + { + "epoch": 2.0324379118094273, + "grad_norm": 1.6690667288528214, + "learning_rate": 2.842741064913152e-06, + "loss": 0.0294, + "step": 2006 + }, + { + "epoch": 2.0344652812975164, + "grad_norm": 2.699658918176054, + "learning_rate": 2.832100412994304e-06, + "loss": 0.0505, + "step": 2008 + }, + { + "epoch": 2.036492650785606, + "grad_norm": 5.66820386614982, + "learning_rate": 2.8214718387297817e-06, + "loss": 0.0595, + "step": 2010 + }, + { + "epoch": 2.038520020273695, + "grad_norm": 2.0005644445789534, + "learning_rate": 2.8108554013327696e-06, + "loss": 0.0307, + "step": 2012 + }, + { + "epoch": 2.0405473897617843, + "grad_norm": 1.4268040568329692, + "learning_rate": 2.8002511599488413e-06, + "loss": 0.0089, + "step": 2014 + }, + { + "epoch": 2.0425747592498733, + "grad_norm": 1.735341212478133, + "learning_rate": 2.789659173655625e-06, + "loss": 0.111, + "step": 2016 + }, + { + "epoch": 2.0446021287379623, + "grad_norm": 2.766812843221022, + "learning_rate": 2.7790795014624684e-06, + "loss": 0.0844, + "step": 2018 + }, + { + "epoch": 2.046629498226052, + "grad_norm": 0.6775963079085132, + "learning_rate": 2.768512202310122e-06, + "loss": 0.0255, + "step": 2020 + }, + { + "epoch": 2.048656867714141, + "grad_norm": 0.6953099282588642, + "learning_rate": 2.757957335070402e-06, + "loss": 0.0077, + "step": 2022 + }, + { + "epoch": 2.0506842372022303, + "grad_norm": 0.3007767757866453, + "learning_rate": 2.7474149585458666e-06, + "loss": 0.0121, + "step": 2024 + }, + { + "epoch": 2.0527116066903193, + "grad_norm": 1.8778269909083947, + "learning_rate": 2.7368851314694815e-06, + "loss": 0.0281, + "step": 2026 + }, + { + "epoch": 2.0547389761784087, + "grad_norm": 1.4456912013134422, + "learning_rate": 2.726367912504303e-06, + "loss": 0.0495, + "step": 2028 + }, + { + "epoch": 2.0567663456664977, + "grad_norm": 0.1201602169177323, + "learning_rate": 2.715863360243145e-06, + "loss": 0.0064, + "step": 2030 + }, + { + "epoch": 2.0587937151545868, + "grad_norm": 0.7170262067323958, + "learning_rate": 2.7053715332082498e-06, + "loss": 0.0377, + "step": 2032 + }, + { + "epoch": 2.060821084642676, + "grad_norm": 3.1660163655997318, + "learning_rate": 2.6948924898509725e-06, + "loss": 0.038, + "step": 2034 + }, + { + "epoch": 2.0628484541307652, + "grad_norm": 2.6797744239943553, + "learning_rate": 2.6844262885514406e-06, + "loss": 0.0245, + "step": 2036 + }, + { + "epoch": 2.0648758236188547, + "grad_norm": 1.2518226589177488, + "learning_rate": 2.673972987618243e-06, + "loss": 0.0126, + "step": 2038 + }, + { + "epoch": 2.0669031931069437, + "grad_norm": 2.2937031374771375, + "learning_rate": 2.663532645288096e-06, + "loss": 0.0623, + "step": 2040 + }, + { + "epoch": 2.0689305625950327, + "grad_norm": 0.24138524901610361, + "learning_rate": 2.6531053197255248e-06, + "loss": 0.0538, + "step": 2042 + }, + { + "epoch": 2.070957932083122, + "grad_norm": 1.107568655165299, + "learning_rate": 2.6426910690225343e-06, + "loss": 0.0112, + "step": 2044 + }, + { + "epoch": 2.072985301571211, + "grad_norm": 2.898536544546573, + "learning_rate": 2.632289951198285e-06, + "loss": 0.0736, + "step": 2046 + }, + { + "epoch": 2.0750126710593007, + "grad_norm": 0.32854355116740536, + "learning_rate": 2.621902024198779e-06, + "loss": 0.077, + "step": 2048 + }, + { + "epoch": 2.0770400405473897, + "grad_norm": 4.692424520600409, + "learning_rate": 2.611527345896522e-06, + "loss": 0.0276, + "step": 2050 + }, + { + "epoch": 2.079067410035479, + "grad_norm": 1.8460477202453436, + "learning_rate": 2.601165974090219e-06, + "loss": 0.0472, + "step": 2052 + }, + { + "epoch": 2.081094779523568, + "grad_norm": 4.176283619243015, + "learning_rate": 2.5908179665044335e-06, + "loss": 0.0793, + "step": 2054 + }, + { + "epoch": 2.0831221490116576, + "grad_norm": 0.8293180323678827, + "learning_rate": 2.580483380789282e-06, + "loss": 0.0484, + "step": 2056 + }, + { + "epoch": 2.0851495184997466, + "grad_norm": 1.7584944134668925, + "learning_rate": 2.570162274520105e-06, + "loss": 0.0299, + "step": 2058 + }, + { + "epoch": 2.0871768879878356, + "grad_norm": 0.6555726160830381, + "learning_rate": 2.5598547051971424e-06, + "loss": 0.0399, + "step": 2060 + }, + { + "epoch": 2.089204257475925, + "grad_norm": 0.8084467036241524, + "learning_rate": 2.549560730245223e-06, + "loss": 0.0091, + "step": 2062 + }, + { + "epoch": 2.091231626964014, + "grad_norm": 1.0829181692084384, + "learning_rate": 2.539280407013438e-06, + "loss": 0.0479, + "step": 2064 + }, + { + "epoch": 2.0932589964521036, + "grad_norm": 0.8316132573362073, + "learning_rate": 2.529013792774826e-06, + "loss": 0.0051, + "step": 2066 + }, + { + "epoch": 2.0952863659401926, + "grad_norm": 1.4669408840618552, + "learning_rate": 2.518760944726042e-06, + "loss": 0.0166, + "step": 2068 + }, + { + "epoch": 2.0973137354282816, + "grad_norm": 1.8105494476629649, + "learning_rate": 2.5085219199870583e-06, + "loss": 0.0189, + "step": 2070 + }, + { + "epoch": 2.099341104916371, + "grad_norm": 3.4384384769750955, + "learning_rate": 2.4982967756008307e-06, + "loss": 0.1276, + "step": 2072 + }, + { + "epoch": 2.10136847440446, + "grad_norm": 0.11808016840635753, + "learning_rate": 2.488085568532984e-06, + "loss": 0.0049, + "step": 2074 + }, + { + "epoch": 2.1033958438925495, + "grad_norm": 1.094620587168751, + "learning_rate": 2.477888355671502e-06, + "loss": 0.0512, + "step": 2076 + }, + { + "epoch": 2.1054232133806385, + "grad_norm": 1.7042238532570069, + "learning_rate": 2.467705193826398e-06, + "loss": 0.0094, + "step": 2078 + }, + { + "epoch": 2.107450582868728, + "grad_norm": 2.7008085742648684, + "learning_rate": 2.4575361397294102e-06, + "loss": 0.0483, + "step": 2080 + }, + { + "epoch": 2.109477952356817, + "grad_norm": 2.5107575488991745, + "learning_rate": 2.447381250033679e-06, + "loss": 0.0371, + "step": 2082 + }, + { + "epoch": 2.111505321844906, + "grad_norm": 0.6518485380516679, + "learning_rate": 2.437240581313434e-06, + "loss": 0.0234, + "step": 2084 + }, + { + "epoch": 2.1135326913329955, + "grad_norm": 1.3967376147365607, + "learning_rate": 2.4271141900636787e-06, + "loss": 0.0443, + "step": 2086 + }, + { + "epoch": 2.1155600608210845, + "grad_norm": 0.48890531662598274, + "learning_rate": 2.417002132699869e-06, + "loss": 0.0141, + "step": 2088 + }, + { + "epoch": 2.117587430309174, + "grad_norm": 2.701735504719542, + "learning_rate": 2.406904465557614e-06, + "loss": 0.0448, + "step": 2090 + }, + { + "epoch": 2.119614799797263, + "grad_norm": 1.8497208678610335, + "learning_rate": 2.3968212448923446e-06, + "loss": 0.0379, + "step": 2092 + }, + { + "epoch": 2.1216421692853524, + "grad_norm": 2.5910675657469393, + "learning_rate": 2.386752526879014e-06, + "loss": 0.0254, + "step": 2094 + }, + { + "epoch": 2.1236695387734414, + "grad_norm": 0.9977781616514558, + "learning_rate": 2.376698367611776e-06, + "loss": 0.0465, + "step": 2096 + }, + { + "epoch": 2.1256969082615305, + "grad_norm": 0.43830427054823984, + "learning_rate": 2.366658823103677e-06, + "loss": 0.0123, + "step": 2098 + }, + { + "epoch": 2.12772427774962, + "grad_norm": 1.3070137169464595, + "learning_rate": 2.356633949286344e-06, + "loss": 0.0092, + "step": 2100 + }, + { + "epoch": 2.129751647237709, + "grad_norm": 0.7160709264855414, + "learning_rate": 2.3466238020096687e-06, + "loss": 0.0061, + "step": 2102 + }, + { + "epoch": 2.1317790167257984, + "grad_norm": 1.237037732177027, + "learning_rate": 2.3366284370415045e-06, + "loss": 0.0144, + "step": 2104 + }, + { + "epoch": 2.1338063862138874, + "grad_norm": 2.863676984163931, + "learning_rate": 2.326647910067342e-06, + "loss": 0.0243, + "step": 2106 + }, + { + "epoch": 2.135833755701977, + "grad_norm": 2.777734041904085, + "learning_rate": 2.3166822766900166e-06, + "loss": 0.0479, + "step": 2108 + }, + { + "epoch": 2.137861125190066, + "grad_norm": 0.6145328536101038, + "learning_rate": 2.3067315924293814e-06, + "loss": 0.0077, + "step": 2110 + }, + { + "epoch": 2.139888494678155, + "grad_norm": 2.1562916470565585, + "learning_rate": 2.296795912722014e-06, + "loss": 0.0743, + "step": 2112 + }, + { + "epoch": 2.1419158641662444, + "grad_norm": 14.845971548399614, + "learning_rate": 2.2868752929208915e-06, + "loss": 0.0237, + "step": 2114 + }, + { + "epoch": 2.1439432336543334, + "grad_norm": 2.6335091119122973, + "learning_rate": 2.2769697882950966e-06, + "loss": 0.026, + "step": 2116 + }, + { + "epoch": 2.145970603142423, + "grad_norm": 0.9853415571865557, + "learning_rate": 2.2670794540295e-06, + "loss": 0.0169, + "step": 2118 + }, + { + "epoch": 2.147997972630512, + "grad_norm": 0.8278243125824383, + "learning_rate": 2.2572043452244584e-06, + "loss": 0.0174, + "step": 2120 + }, + { + "epoch": 2.1500253421186013, + "grad_norm": 2.9623733736650735, + "learning_rate": 2.2473445168955067e-06, + "loss": 0.0468, + "step": 2122 + }, + { + "epoch": 2.1520527116066903, + "grad_norm": 4.58089354644119, + "learning_rate": 2.237500023973043e-06, + "loss": 0.1041, + "step": 2124 + }, + { + "epoch": 2.1540800810947793, + "grad_norm": 2.632762940859112, + "learning_rate": 2.2276709213020402e-06, + "loss": 0.0186, + "step": 2126 + }, + { + "epoch": 2.156107450582869, + "grad_norm": 3.0940654449140523, + "learning_rate": 2.2178572636417194e-06, + "loss": 0.0362, + "step": 2128 + }, + { + "epoch": 2.158134820070958, + "grad_norm": 0.9199663329239907, + "learning_rate": 2.2080591056652634e-06, + "loss": 0.0175, + "step": 2130 + }, + { + "epoch": 2.1601621895590473, + "grad_norm": 2.573612501106163, + "learning_rate": 2.1982765019595037e-06, + "loss": 0.0235, + "step": 2132 + }, + { + "epoch": 2.1621895590471363, + "grad_norm": 0.8086092177384893, + "learning_rate": 2.1885095070246116e-06, + "loss": 0.0286, + "step": 2134 + }, + { + "epoch": 2.1642169285352257, + "grad_norm": 1.7001258061601896, + "learning_rate": 2.1787581752738036e-06, + "loss": 0.0217, + "step": 2136 + }, + { + "epoch": 2.1662442980233148, + "grad_norm": 0.6551641967034103, + "learning_rate": 2.169022561033035e-06, + "loss": 0.0164, + "step": 2138 + }, + { + "epoch": 2.1682716675114038, + "grad_norm": 1.3123149183294127, + "learning_rate": 2.1593027185406977e-06, + "loss": 0.0109, + "step": 2140 + }, + { + "epoch": 2.1702990369994932, + "grad_norm": 1.9412148890513439, + "learning_rate": 2.1495987019473115e-06, + "loss": 0.035, + "step": 2142 + }, + { + "epoch": 2.1723264064875822, + "grad_norm": 2.6860975973135224, + "learning_rate": 2.1399105653152336e-06, + "loss": 0.0192, + "step": 2144 + }, + { + "epoch": 2.1743537759756717, + "grad_norm": 0.07322079776797014, + "learning_rate": 2.1302383626183522e-06, + "loss": 0.0128, + "step": 2146 + }, + { + "epoch": 2.1763811454637607, + "grad_norm": 1.0164376654610556, + "learning_rate": 2.120582147741779e-06, + "loss": 0.0879, + "step": 2148 + }, + { + "epoch": 2.17840851495185, + "grad_norm": 8.717424879022653, + "learning_rate": 2.110941974481564e-06, + "loss": 0.0484, + "step": 2150 + }, + { + "epoch": 2.180435884439939, + "grad_norm": 0.18348550285415738, + "learning_rate": 2.1013178965443794e-06, + "loss": 0.0092, + "step": 2152 + }, + { + "epoch": 2.182463253928028, + "grad_norm": 5.809199018918309, + "learning_rate": 2.091709967547233e-06, + "loss": 0.123, + "step": 2154 + }, + { + "epoch": 2.1844906234161177, + "grad_norm": 0.017197397202061433, + "learning_rate": 2.0821182410171638e-06, + "loss": 0.0041, + "step": 2156 + }, + { + "epoch": 2.1865179929042067, + "grad_norm": 0.027621308619167367, + "learning_rate": 2.0725427703909447e-06, + "loss": 0.0008, + "step": 2158 + }, + { + "epoch": 2.188545362392296, + "grad_norm": 6.507727421766117, + "learning_rate": 2.0629836090147864e-06, + "loss": 0.0448, + "step": 2160 + }, + { + "epoch": 2.190572731880385, + "grad_norm": 1.027004537599466, + "learning_rate": 2.053440810144033e-06, + "loss": 0.0599, + "step": 2162 + }, + { + "epoch": 2.1926001013684746, + "grad_norm": 3.6332571958272806, + "learning_rate": 2.0439144269428786e-06, + "loss": 0.0213, + "step": 2164 + }, + { + "epoch": 2.1946274708565636, + "grad_norm": 1.3093193548636357, + "learning_rate": 2.034404512484055e-06, + "loss": 0.0865, + "step": 2166 + }, + { + "epoch": 2.1966548403446526, + "grad_norm": 4.034514027573968, + "learning_rate": 2.0249111197485544e-06, + "loss": 0.1148, + "step": 2168 + }, + { + "epoch": 2.198682209832742, + "grad_norm": 1.392165063727535, + "learning_rate": 2.0154343016253136e-06, + "loss": 0.0504, + "step": 2170 + }, + { + "epoch": 2.200709579320831, + "grad_norm": 3.476749866468253, + "learning_rate": 2.005974110910938e-06, + "loss": 0.0444, + "step": 2172 + }, + { + "epoch": 2.2027369488089206, + "grad_norm": 2.409748977599809, + "learning_rate": 1.996530600309397e-06, + "loss": 0.0368, + "step": 2174 + }, + { + "epoch": 2.2047643182970096, + "grad_norm": 3.254618519301346, + "learning_rate": 1.987103822431733e-06, + "loss": 0.0242, + "step": 2176 + }, + { + "epoch": 2.206791687785099, + "grad_norm": 1.1331882104916515, + "learning_rate": 1.977693829795769e-06, + "loss": 0.1875, + "step": 2178 + }, + { + "epoch": 2.208819057273188, + "grad_norm": 1.9141134866013358, + "learning_rate": 1.968300674825811e-06, + "loss": 0.0286, + "step": 2180 + }, + { + "epoch": 2.210846426761277, + "grad_norm": 1.9962475372515425, + "learning_rate": 1.958924409852367e-06, + "loss": 0.0451, + "step": 2182 + }, + { + "epoch": 2.2128737962493665, + "grad_norm": 2.5283430858936002, + "learning_rate": 1.9495650871118414e-06, + "loss": 0.0261, + "step": 2184 + }, + { + "epoch": 2.2149011657374555, + "grad_norm": 2.505539397058421, + "learning_rate": 1.940222758746258e-06, + "loss": 0.0537, + "step": 2186 + }, + { + "epoch": 2.216928535225545, + "grad_norm": 1.3814978507507447, + "learning_rate": 1.9308974768029565e-06, + "loss": 0.018, + "step": 2188 + }, + { + "epoch": 2.218955904713634, + "grad_norm": 1.3341389574174953, + "learning_rate": 1.921589293234312e-06, + "loss": 0.0184, + "step": 2190 + }, + { + "epoch": 2.220983274201723, + "grad_norm": 0.09223503786406502, + "learning_rate": 1.9122982598974454e-06, + "loss": 0.0032, + "step": 2192 + }, + { + "epoch": 2.2230106436898125, + "grad_norm": 1.0087694384248855, + "learning_rate": 1.9030244285539239e-06, + "loss": 0.0149, + "step": 2194 + }, + { + "epoch": 2.2250380131779015, + "grad_norm": 0.9105412973534254, + "learning_rate": 1.8937678508694857e-06, + "loss": 0.0314, + "step": 2196 + }, + { + "epoch": 2.227065382665991, + "grad_norm": 2.0488694747117586, + "learning_rate": 1.8845285784137451e-06, + "loss": 0.0475, + "step": 2198 + }, + { + "epoch": 2.22909275215408, + "grad_norm": 4.27924419198399, + "learning_rate": 1.8753066626599086e-06, + "loss": 0.0771, + "step": 2200 + }, + { + "epoch": 2.2311201216421694, + "grad_norm": 3.143412573747134, + "learning_rate": 1.8661021549844803e-06, + "loss": 0.0504, + "step": 2202 + }, + { + "epoch": 2.2331474911302585, + "grad_norm": 0.9143501441802937, + "learning_rate": 1.8569151066669882e-06, + "loss": 0.0154, + "step": 2204 + }, + { + "epoch": 2.235174860618348, + "grad_norm": 0.4193601163536151, + "learning_rate": 1.8477455688896906e-06, + "loss": 0.0078, + "step": 2206 + }, + { + "epoch": 2.237202230106437, + "grad_norm": 3.335919902727271, + "learning_rate": 1.8385935927372872e-06, + "loss": 0.055, + "step": 2208 + }, + { + "epoch": 2.239229599594526, + "grad_norm": 2.157034873972847, + "learning_rate": 1.8294592291966468e-06, + "loss": 0.0253, + "step": 2210 + }, + { + "epoch": 2.2412569690826154, + "grad_norm": 0.4660008431300884, + "learning_rate": 1.8203425291565103e-06, + "loss": 0.0139, + "step": 2212 + }, + { + "epoch": 2.2432843385707044, + "grad_norm": 1.5703823791381284, + "learning_rate": 1.8112435434072151e-06, + "loss": 0.0551, + "step": 2214 + }, + { + "epoch": 2.245311708058794, + "grad_norm": 2.2003591363177524, + "learning_rate": 1.8021623226404117e-06, + "loss": 0.0349, + "step": 2216 + }, + { + "epoch": 2.247339077546883, + "grad_norm": 0.052628035087169275, + "learning_rate": 1.7930989174487767e-06, + "loss": 0.025, + "step": 2218 + }, + { + "epoch": 2.249366447034972, + "grad_norm": 0.29114399355072246, + "learning_rate": 1.7840533783257374e-06, + "loss": 0.0559, + "step": 2220 + }, + { + "epoch": 2.2513938165230614, + "grad_norm": 0.06640128490739176, + "learning_rate": 1.7750257556651806e-06, + "loss": 0.0054, + "step": 2222 + }, + { + "epoch": 2.2534211860111504, + "grad_norm": 0.8614016512183624, + "learning_rate": 1.7660160997611853e-06, + "loss": 0.0138, + "step": 2224 + }, + { + "epoch": 2.25544855549924, + "grad_norm": 0.39796798830979846, + "learning_rate": 1.7570244608077292e-06, + "loss": 0.0147, + "step": 2226 + }, + { + "epoch": 2.257475924987329, + "grad_norm": 0.7817254604635483, + "learning_rate": 1.748050888898421e-06, + "loss": 0.0848, + "step": 2228 + }, + { + "epoch": 2.2595032944754183, + "grad_norm": 1.6678805798197638, + "learning_rate": 1.7390954340262088e-06, + "loss": 0.0141, + "step": 2230 + }, + { + "epoch": 2.2615306639635073, + "grad_norm": 1.7300932393477455, + "learning_rate": 1.7301581460831147e-06, + "loss": 0.0388, + "step": 2232 + }, + { + "epoch": 2.263558033451597, + "grad_norm": 2.2543388356552376, + "learning_rate": 1.7212390748599462e-06, + "loss": 0.0456, + "step": 2234 + }, + { + "epoch": 2.265585402939686, + "grad_norm": 3.124615845486436, + "learning_rate": 1.7123382700460257e-06, + "loss": 0.0297, + "step": 2236 + }, + { + "epoch": 2.267612772427775, + "grad_norm": 0.4997070196394514, + "learning_rate": 1.7034557812289116e-06, + "loss": 0.039, + "step": 2238 + }, + { + "epoch": 2.2696401419158643, + "grad_norm": 0.12806234913417047, + "learning_rate": 1.6945916578941157e-06, + "loss": 0.0027, + "step": 2240 + }, + { + "epoch": 2.2716675114039533, + "grad_norm": 3.1638351052507603, + "learning_rate": 1.6857459494248418e-06, + "loss": 0.0402, + "step": 2242 + }, + { + "epoch": 2.2736948808920427, + "grad_norm": 0.02478286005955515, + "learning_rate": 1.6769187051016933e-06, + "loss": 0.0018, + "step": 2244 + }, + { + "epoch": 2.2757222503801318, + "grad_norm": 0.8343846493206177, + "learning_rate": 1.6681099741024143e-06, + "loss": 0.0189, + "step": 2246 + }, + { + "epoch": 2.2777496198682208, + "grad_norm": 0.8389679628484346, + "learning_rate": 1.6593198055016069e-06, + "loss": 0.0184, + "step": 2248 + }, + { + "epoch": 2.2797769893563102, + "grad_norm": 0.30574695520909645, + "learning_rate": 1.650548248270456e-06, + "loss": 0.0058, + "step": 2250 + }, + { + "epoch": 2.2818043588443992, + "grad_norm": 0.8789483832687746, + "learning_rate": 1.641795351276465e-06, + "loss": 0.0156, + "step": 2252 + }, + { + "epoch": 2.2838317283324887, + "grad_norm": 5.223603856468796, + "learning_rate": 1.6330611632831762e-06, + "loss": 0.0453, + "step": 2254 + }, + { + "epoch": 2.2858590978205777, + "grad_norm": 2.4707360930348194, + "learning_rate": 1.6243457329499035e-06, + "loss": 0.0207, + "step": 2256 + }, + { + "epoch": 2.287886467308667, + "grad_norm": 4.3079861882309745, + "learning_rate": 1.6156491088314557e-06, + "loss": 0.0315, + "step": 2258 + }, + { + "epoch": 2.289913836796756, + "grad_norm": 1.2763999034943083, + "learning_rate": 1.606971339377874e-06, + "loss": 0.0238, + "step": 2260 + }, + { + "epoch": 2.2919412062848457, + "grad_norm": 1.4605266812552578, + "learning_rate": 1.5983124729341532e-06, + "loss": 0.0276, + "step": 2262 + }, + { + "epoch": 2.2939685757729347, + "grad_norm": 0.13748449245935768, + "learning_rate": 1.5896725577399814e-06, + "loss": 0.0156, + "step": 2264 + }, + { + "epoch": 2.2959959452610237, + "grad_norm": 0.011717505929201287, + "learning_rate": 1.5810516419294654e-06, + "loss": 0.0013, + "step": 2266 + }, + { + "epoch": 2.298023314749113, + "grad_norm": 0.664268469772948, + "learning_rate": 1.5724497735308614e-06, + "loss": 0.019, + "step": 2268 + }, + { + "epoch": 2.300050684237202, + "grad_norm": 1.806638340634537, + "learning_rate": 1.5638670004663125e-06, + "loss": 0.0313, + "step": 2270 + }, + { + "epoch": 2.3020780537252916, + "grad_norm": 0.5778038432554351, + "learning_rate": 1.555303370551579e-06, + "loss": 0.0328, + "step": 2272 + }, + { + "epoch": 2.3041054232133806, + "grad_norm": 3.7544246421988596, + "learning_rate": 1.5467589314957736e-06, + "loss": 0.031, + "step": 2274 + }, + { + "epoch": 2.3061327927014696, + "grad_norm": 8.063544562077327, + "learning_rate": 1.5382337309010896e-06, + "loss": 0.1035, + "step": 2276 + }, + { + "epoch": 2.308160162189559, + "grad_norm": 2.48277507581714, + "learning_rate": 1.529727816262544e-06, + "loss": 0.0569, + "step": 2278 + }, + { + "epoch": 2.310187531677648, + "grad_norm": 0.06882664216673445, + "learning_rate": 1.5212412349677102e-06, + "loss": 0.0039, + "step": 2280 + }, + { + "epoch": 2.3122149011657376, + "grad_norm": 0.0407098860582503, + "learning_rate": 1.5127740342964475e-06, + "loss": 0.0193, + "step": 2282 + }, + { + "epoch": 2.3142422706538266, + "grad_norm": 1.148647029994564, + "learning_rate": 1.504326261420651e-06, + "loss": 0.0291, + "step": 2284 + }, + { + "epoch": 2.316269640141916, + "grad_norm": 0.06603736350631888, + "learning_rate": 1.4958979634039727e-06, + "loss": 0.0066, + "step": 2286 + }, + { + "epoch": 2.318297009630005, + "grad_norm": 2.2780368524782295, + "learning_rate": 1.4874891872015735e-06, + "loss": 0.0581, + "step": 2288 + }, + { + "epoch": 2.320324379118094, + "grad_norm": 3.411053705725042, + "learning_rate": 1.4790999796598543e-06, + "loss": 0.0698, + "step": 2290 + }, + { + "epoch": 2.3223517486061835, + "grad_norm": 1.5454946452315625, + "learning_rate": 1.4707303875161954e-06, + "loss": 0.0182, + "step": 2292 + }, + { + "epoch": 2.3243791180942726, + "grad_norm": 0.19787507641607074, + "learning_rate": 1.4623804573986994e-06, + "loss": 0.004, + "step": 2294 + }, + { + "epoch": 2.326406487582362, + "grad_norm": 1.4367953408429277, + "learning_rate": 1.4540502358259244e-06, + "loss": 0.0259, + "step": 2296 + }, + { + "epoch": 2.328433857070451, + "grad_norm": 3.308232740955085, + "learning_rate": 1.4457397692066362e-06, + "loss": 0.0829, + "step": 2298 + }, + { + "epoch": 2.3304612265585405, + "grad_norm": 2.777383997343979, + "learning_rate": 1.437449103839536e-06, + "loss": 0.0524, + "step": 2300 + }, + { + "epoch": 2.3324885960466295, + "grad_norm": 0.019341685910251982, + "learning_rate": 1.4291782859130171e-06, + "loss": 0.0044, + "step": 2302 + }, + { + "epoch": 2.3345159655347185, + "grad_norm": 0.5043636611086127, + "learning_rate": 1.4209273615048918e-06, + "loss": 0.021, + "step": 2304 + }, + { + "epoch": 2.336543335022808, + "grad_norm": 1.550957245905949, + "learning_rate": 1.4126963765821505e-06, + "loss": 0.0122, + "step": 2306 + }, + { + "epoch": 2.338570704510897, + "grad_norm": 2.041625238621192, + "learning_rate": 1.4044853770006938e-06, + "loss": 0.0381, + "step": 2308 + }, + { + "epoch": 2.3405980739989865, + "grad_norm": 4.116258601603008, + "learning_rate": 1.3962944085050833e-06, + "loss": 0.0601, + "step": 2310 + }, + { + "epoch": 2.3426254434870755, + "grad_norm": 0.47036939837806174, + "learning_rate": 1.3881235167282858e-06, + "loss": 0.0294, + "step": 2312 + }, + { + "epoch": 2.3446528129751645, + "grad_norm": 1.8591102156364567, + "learning_rate": 1.3799727471914121e-06, + "loss": 0.0338, + "step": 2314 + }, + { + "epoch": 2.346680182463254, + "grad_norm": 0.989864975800206, + "learning_rate": 1.371842145303478e-06, + "loss": 0.0364, + "step": 2316 + }, + { + "epoch": 2.348707551951343, + "grad_norm": 0.9836026271923362, + "learning_rate": 1.3637317563611342e-06, + "loss": 0.0193, + "step": 2318 + }, + { + "epoch": 2.3507349214394324, + "grad_norm": 1.6349221081837528, + "learning_rate": 1.3556416255484279e-06, + "loss": 0.0292, + "step": 2320 + }, + { + "epoch": 2.3527622909275214, + "grad_norm": 2.801946658773727, + "learning_rate": 1.347571797936546e-06, + "loss": 0.0319, + "step": 2322 + }, + { + "epoch": 2.354789660415611, + "grad_norm": 0.8637080962310459, + "learning_rate": 1.3395223184835587e-06, + "loss": 0.0548, + "step": 2324 + }, + { + "epoch": 2.3568170299037, + "grad_norm": 0.9977239148593361, + "learning_rate": 1.33149323203418e-06, + "loss": 0.0102, + "step": 2326 + }, + { + "epoch": 2.3588443993917894, + "grad_norm": 4.882910770615109, + "learning_rate": 1.3234845833195042e-06, + "loss": 0.0267, + "step": 2328 + }, + { + "epoch": 2.3608717688798784, + "grad_norm": 1.8073882798339331, + "learning_rate": 1.3154964169567746e-06, + "loss": 0.0205, + "step": 2330 + }, + { + "epoch": 2.3628991383679674, + "grad_norm": 1.5938374062058343, + "learning_rate": 1.3075287774491147e-06, + "loss": 0.04, + "step": 2332 + }, + { + "epoch": 2.364926507856057, + "grad_norm": 0.6702534112225408, + "learning_rate": 1.2995817091852957e-06, + "loss": 0.0392, + "step": 2334 + }, + { + "epoch": 2.366953877344146, + "grad_norm": 1.432325918161525, + "learning_rate": 1.291655256439483e-06, + "loss": 0.0359, + "step": 2336 + }, + { + "epoch": 2.3689812468322353, + "grad_norm": 0.02558162682153793, + "learning_rate": 1.283749463370988e-06, + "loss": 0.0115, + "step": 2338 + }, + { + "epoch": 2.3710086163203243, + "grad_norm": 2.15996456708974, + "learning_rate": 1.2758643740240272e-06, + "loss": 0.0181, + "step": 2340 + }, + { + "epoch": 2.3730359858084134, + "grad_norm": 1.4008479822975906, + "learning_rate": 1.268000032327471e-06, + "loss": 0.0324, + "step": 2342 + }, + { + "epoch": 2.375063355296503, + "grad_norm": 1.8075392876737175, + "learning_rate": 1.260156482094606e-06, + "loss": 0.0407, + "step": 2344 + }, + { + "epoch": 2.377090724784592, + "grad_norm": 2.1524476777215367, + "learning_rate": 1.2523337670228814e-06, + "loss": 0.0485, + "step": 2346 + }, + { + "epoch": 2.3791180942726813, + "grad_norm": 0.32476627825576465, + "learning_rate": 1.2445319306936748e-06, + "loss": 0.0095, + "step": 2348 + }, + { + "epoch": 2.3811454637607703, + "grad_norm": 1.3543998880235177, + "learning_rate": 1.2367510165720453e-06, + "loss": 0.0915, + "step": 2350 + }, + { + "epoch": 2.3831728332488598, + "grad_norm": 1.3220848976032176, + "learning_rate": 1.228991068006491e-06, + "loss": 0.0117, + "step": 2352 + }, + { + "epoch": 2.3852002027369488, + "grad_norm": 1.165514989508775, + "learning_rate": 1.2212521282287093e-06, + "loss": 0.022, + "step": 2354 + }, + { + "epoch": 2.3872275722250382, + "grad_norm": 0.02167425081076016, + "learning_rate": 1.2135342403533506e-06, + "loss": 0.0085, + "step": 2356 + }, + { + "epoch": 2.3892549417131272, + "grad_norm": 1.4990608583116172, + "learning_rate": 1.2058374473777885e-06, + "loss": 0.0428, + "step": 2358 + }, + { + "epoch": 2.3912823112012163, + "grad_norm": 0.630489826205074, + "learning_rate": 1.1981617921818683e-06, + "loss": 0.008, + "step": 2360 + }, + { + "epoch": 2.3933096806893057, + "grad_norm": 1.6310709076980605, + "learning_rate": 1.19050731752768e-06, + "loss": 0.0517, + "step": 2362 + }, + { + "epoch": 2.3953370501773947, + "grad_norm": 1.737878240822615, + "learning_rate": 1.1828740660593068e-06, + "loss": 0.0461, + "step": 2364 + }, + { + "epoch": 2.397364419665484, + "grad_norm": 2.3542439699533113, + "learning_rate": 1.1752620803026011e-06, + "loss": 0.0241, + "step": 2366 + }, + { + "epoch": 2.399391789153573, + "grad_norm": 0.8180699309170294, + "learning_rate": 1.1676714026649383e-06, + "loss": 0.0106, + "step": 2368 + }, + { + "epoch": 2.401419158641662, + "grad_norm": 1.291780222593968, + "learning_rate": 1.1601020754349846e-06, + "loss": 0.008, + "step": 2370 + }, + { + "epoch": 2.4034465281297517, + "grad_norm": 1.486237848595865, + "learning_rate": 1.1525541407824597e-06, + "loss": 0.0269, + "step": 2372 + }, + { + "epoch": 2.4054738976178407, + "grad_norm": 2.267767701063458, + "learning_rate": 1.1450276407579009e-06, + "loss": 0.0482, + "step": 2374 + }, + { + "epoch": 2.40750126710593, + "grad_norm": 0.9538563668727964, + "learning_rate": 1.1375226172924342e-06, + "loss": 0.0336, + "step": 2376 + }, + { + "epoch": 2.409528636594019, + "grad_norm": 0.7592800922138768, + "learning_rate": 1.1300391121975312e-06, + "loss": 0.0247, + "step": 2378 + }, + { + "epoch": 2.4115560060821086, + "grad_norm": 1.4996426221338082, + "learning_rate": 1.1225771671647872e-06, + "loss": 0.0322, + "step": 2380 + }, + { + "epoch": 2.4135833755701976, + "grad_norm": 2.2236534435555995, + "learning_rate": 1.1151368237656824e-06, + "loss": 0.0859, + "step": 2382 + }, + { + "epoch": 2.415610745058287, + "grad_norm": 3.4664842605923694, + "learning_rate": 1.1077181234513478e-06, + "loss": 0.0716, + "step": 2384 + }, + { + "epoch": 2.417638114546376, + "grad_norm": 2.0347418616291346, + "learning_rate": 1.1003211075523418e-06, + "loss": 0.0416, + "step": 2386 + }, + { + "epoch": 2.419665484034465, + "grad_norm": 0.92500674409509, + "learning_rate": 1.0929458172784157e-06, + "loss": 0.0085, + "step": 2388 + }, + { + "epoch": 2.4216928535225546, + "grad_norm": 1.5055737377966874, + "learning_rate": 1.0855922937182839e-06, + "loss": 0.0326, + "step": 2390 + }, + { + "epoch": 2.4237202230106436, + "grad_norm": 0.9369428380715171, + "learning_rate": 1.0782605778393935e-06, + "loss": 0.0132, + "step": 2392 + }, + { + "epoch": 2.425747592498733, + "grad_norm": 1.26949874309513, + "learning_rate": 1.0709507104877003e-06, + "loss": 0.0254, + "step": 2394 + }, + { + "epoch": 2.427774961986822, + "grad_norm": 4.384984148743625, + "learning_rate": 1.0636627323874415e-06, + "loss": 0.0217, + "step": 2396 + }, + { + "epoch": 2.429802331474911, + "grad_norm": 0.24233399930030006, + "learning_rate": 1.0563966841408995e-06, + "loss": 0.021, + "step": 2398 + }, + { + "epoch": 2.4318297009630006, + "grad_norm": 1.1881740679104273, + "learning_rate": 1.049152606228191e-06, + "loss": 0.0063, + "step": 2400 + }, + { + "epoch": 2.4338570704510896, + "grad_norm": 0.36938607570509774, + "learning_rate": 1.0419305390070245e-06, + "loss": 0.0414, + "step": 2402 + }, + { + "epoch": 2.435884439939179, + "grad_norm": 4.157731257894877, + "learning_rate": 1.034730522712491e-06, + "loss": 0.054, + "step": 2404 + }, + { + "epoch": 2.437911809427268, + "grad_norm": 1.147958308545569, + "learning_rate": 1.027552597456829e-06, + "loss": 0.0279, + "step": 2406 + }, + { + "epoch": 2.4399391789153575, + "grad_norm": 4.007444699243869, + "learning_rate": 1.0203968032292073e-06, + "loss": 0.0835, + "step": 2408 + }, + { + "epoch": 2.4419665484034465, + "grad_norm": 0.202205622478639, + "learning_rate": 1.0132631798954996e-06, + "loss": 0.0042, + "step": 2410 + }, + { + "epoch": 2.443993917891536, + "grad_norm": 3.540753251231049, + "learning_rate": 1.0061517671980582e-06, + "loss": 0.0401, + "step": 2412 + }, + { + "epoch": 2.446021287379625, + "grad_norm": 1.7286023139378153, + "learning_rate": 9.990626047555047e-07, + "loss": 0.0278, + "step": 2414 + }, + { + "epoch": 2.448048656867714, + "grad_norm": 0.775201810703293, + "learning_rate": 9.919957320624934e-07, + "loss": 0.0332, + "step": 2416 + }, + { + "epoch": 2.4500760263558035, + "grad_norm": 1.040539548095705, + "learning_rate": 9.849511884895063e-07, + "loss": 0.0241, + "step": 2418 + }, + { + "epoch": 2.4521033958438925, + "grad_norm": 3.0717693690308656, + "learning_rate": 9.779290132826224e-07, + "loss": 0.0334, + "step": 2420 + }, + { + "epoch": 2.454130765331982, + "grad_norm": 0.83865928401894, + "learning_rate": 9.709292455633057e-07, + "loss": 0.049, + "step": 2422 + }, + { + "epoch": 2.456158134820071, + "grad_norm": 0.3675452248242664, + "learning_rate": 9.639519243281853e-07, + "loss": 0.0233, + "step": 2424 + }, + { + "epoch": 2.45818550430816, + "grad_norm": 1.0804544096832684, + "learning_rate": 9.569970884488372e-07, + "loss": 0.0125, + "step": 2426 + }, + { + "epoch": 2.4602128737962494, + "grad_norm": 3.791802618423996, + "learning_rate": 9.500647766715698e-07, + "loss": 0.0401, + "step": 2428 + }, + { + "epoch": 2.4622402432843384, + "grad_norm": 3.0701213724774705, + "learning_rate": 9.43155027617203e-07, + "loss": 0.0818, + "step": 2430 + }, + { + "epoch": 2.464267612772428, + "grad_norm": 0.7047256813019569, + "learning_rate": 9.362678797808622e-07, + "loss": 0.0052, + "step": 2432 + }, + { + "epoch": 2.466294982260517, + "grad_norm": 1.3031670788822178, + "learning_rate": 9.294033715317535e-07, + "loss": 0.0189, + "step": 2434 + }, + { + "epoch": 2.4683223517486064, + "grad_norm": 3.28408988402408, + "learning_rate": 9.225615411129596e-07, + "loss": 0.0491, + "step": 2436 + }, + { + "epoch": 2.4703497212366954, + "grad_norm": 1.671450530971342, + "learning_rate": 9.157424266412163e-07, + "loss": 0.0074, + "step": 2438 + }, + { + "epoch": 2.4723770907247844, + "grad_norm": 2.455968330246284, + "learning_rate": 9.089460661067106e-07, + "loss": 0.0499, + "step": 2440 + }, + { + "epoch": 2.474404460212874, + "grad_norm": 0.9028006684524077, + "learning_rate": 9.021724973728635e-07, + "loss": 0.0043, + "step": 2442 + }, + { + "epoch": 2.476431829700963, + "grad_norm": 0.46545385271379974, + "learning_rate": 8.954217581761182e-07, + "loss": 0.0184, + "step": 2444 + }, + { + "epoch": 2.4784591991890523, + "grad_norm": 2.1107681146592236, + "learning_rate": 8.886938861257338e-07, + "loss": 0.0228, + "step": 2446 + }, + { + "epoch": 2.4804865686771413, + "grad_norm": 3.5982055968657094, + "learning_rate": 8.819889187035707e-07, + "loss": 0.0478, + "step": 2448 + }, + { + "epoch": 2.482513938165231, + "grad_norm": 2.1081044405849525, + "learning_rate": 8.753068932638875e-07, + "loss": 0.0113, + "step": 2450 + }, + { + "epoch": 2.48454130765332, + "grad_norm": 1.2235366450195277, + "learning_rate": 8.686478470331267e-07, + "loss": 0.0076, + "step": 2452 + }, + { + "epoch": 2.486568677141409, + "grad_norm": 0.19808779551866598, + "learning_rate": 8.620118171097136e-07, + "loss": 0.0248, + "step": 2454 + }, + { + "epoch": 2.4885960466294983, + "grad_norm": 1.7443723821033361, + "learning_rate": 8.553988404638469e-07, + "loss": 0.0228, + "step": 2456 + }, + { + "epoch": 2.4906234161175873, + "grad_norm": 1.1729988059885263, + "learning_rate": 8.488089539372884e-07, + "loss": 0.0399, + "step": 2458 + }, + { + "epoch": 2.4926507856056768, + "grad_norm": 7.609443681920641, + "learning_rate": 8.422421942431658e-07, + "loss": 0.0366, + "step": 2460 + }, + { + "epoch": 2.494678155093766, + "grad_norm": 1.4258284200984621, + "learning_rate": 8.356985979657628e-07, + "loss": 0.0451, + "step": 2462 + }, + { + "epoch": 2.496705524581855, + "grad_norm": 0.09955041127715249, + "learning_rate": 8.291782015603179e-07, + "loss": 0.0037, + "step": 2464 + }, + { + "epoch": 2.4987328940699443, + "grad_norm": 1.6426211874967138, + "learning_rate": 8.226810413528164e-07, + "loss": 0.0129, + "step": 2466 + }, + { + "epoch": 2.5007602635580337, + "grad_norm": 1.149397154492099, + "learning_rate": 8.162071535397953e-07, + "loss": 0.0127, + "step": 2468 + }, + { + "epoch": 2.5027876330461227, + "grad_norm": 0.08703033190800176, + "learning_rate": 8.097565741881386e-07, + "loss": 0.018, + "step": 2470 + }, + { + "epoch": 2.5048150025342117, + "grad_norm": 1.7536552844524458, + "learning_rate": 8.033293392348707e-07, + "loss": 0.0053, + "step": 2472 + }, + { + "epoch": 2.506842372022301, + "grad_norm": 0.5259504839126988, + "learning_rate": 7.969254844869672e-07, + "loss": 0.0221, + "step": 2474 + }, + { + "epoch": 2.50886974151039, + "grad_norm": 0.8449257730573404, + "learning_rate": 7.905450456211456e-07, + "loss": 0.0156, + "step": 2476 + }, + { + "epoch": 2.5108971109984797, + "grad_norm": 2.191234921410503, + "learning_rate": 7.841880581836731e-07, + "loss": 0.056, + "step": 2478 + }, + { + "epoch": 2.5129244804865687, + "grad_norm": 0.03464975015515305, + "learning_rate": 7.77854557590162e-07, + "loss": 0.0814, + "step": 2480 + }, + { + "epoch": 2.5149518499746577, + "grad_norm": 5.250772560483029, + "learning_rate": 7.715445791253806e-07, + "loss": 0.1354, + "step": 2482 + }, + { + "epoch": 2.516979219462747, + "grad_norm": 1.4927168512568465, + "learning_rate": 7.652581579430507e-07, + "loss": 0.0197, + "step": 2484 + }, + { + "epoch": 2.519006588950836, + "grad_norm": 2.865661933476632, + "learning_rate": 7.589953290656532e-07, + "loss": 0.0163, + "step": 2486 + }, + { + "epoch": 2.5210339584389256, + "grad_norm": 1.0054963618090185, + "learning_rate": 7.527561273842337e-07, + "loss": 0.0304, + "step": 2488 + }, + { + "epoch": 2.5230613279270147, + "grad_norm": 2.13564293274132, + "learning_rate": 7.465405876582049e-07, + "loss": 0.0214, + "step": 2490 + }, + { + "epoch": 2.5250886974151037, + "grad_norm": 1.4239922142772365, + "learning_rate": 7.403487445151613e-07, + "loss": 0.0283, + "step": 2492 + }, + { + "epoch": 2.527116066903193, + "grad_norm": 0.7647445096927465, + "learning_rate": 7.341806324506733e-07, + "loss": 0.0222, + "step": 2494 + }, + { + "epoch": 2.5291434363912826, + "grad_norm": 0.05843024035511473, + "learning_rate": 7.280362858281082e-07, + "loss": 0.0033, + "step": 2496 + }, + { + "epoch": 2.5311708058793716, + "grad_norm": 2.9677434942118706, + "learning_rate": 7.219157388784314e-07, + "loss": 0.0413, + "step": 2498 + }, + { + "epoch": 2.5331981753674606, + "grad_norm": 4.557903670151507, + "learning_rate": 7.158190257000147e-07, + "loss": 0.0395, + "step": 2500 + }, + { + "epoch": 2.53522554485555, + "grad_norm": 1.8410909731952476, + "learning_rate": 7.097461802584521e-07, + "loss": 0.0209, + "step": 2502 + }, + { + "epoch": 2.537252914343639, + "grad_norm": 0.3280034054925044, + "learning_rate": 7.036972363863659e-07, + "loss": 0.0116, + "step": 2504 + }, + { + "epoch": 2.5392802838317285, + "grad_norm": 1.9990685438119067, + "learning_rate": 6.976722277832204e-07, + "loss": 0.0494, + "step": 2506 + }, + { + "epoch": 2.5413076533198176, + "grad_norm": 1.0024180919202337, + "learning_rate": 6.916711880151305e-07, + "loss": 0.0072, + "step": 2508 + }, + { + "epoch": 2.5433350228079066, + "grad_norm": 0.022678107701746258, + "learning_rate": 6.856941505146819e-07, + "loss": 0.01, + "step": 2510 + }, + { + "epoch": 2.545362392295996, + "grad_norm": 1.5160603691712968, + "learning_rate": 6.797411485807365e-07, + "loss": 0.04, + "step": 2512 + }, + { + "epoch": 2.547389761784085, + "grad_norm": 2.5399065288855844, + "learning_rate": 6.738122153782528e-07, + "loss": 0.0181, + "step": 2514 + }, + { + "epoch": 2.5494171312721745, + "grad_norm": 1.1143754758350937, + "learning_rate": 6.679073839381012e-07, + "loss": 0.0079, + "step": 2516 + }, + { + "epoch": 2.5514445007602635, + "grad_norm": 1.902680619448219, + "learning_rate": 6.620266871568732e-07, + "loss": 0.0172, + "step": 2518 + }, + { + "epoch": 2.5534718702483525, + "grad_norm": 0.2450852351058602, + "learning_rate": 6.561701577967067e-07, + "loss": 0.0028, + "step": 2520 + }, + { + "epoch": 2.555499239736442, + "grad_norm": 0.9909337140158995, + "learning_rate": 6.503378284850992e-07, + "loss": 0.0156, + "step": 2522 + }, + { + "epoch": 2.557526609224531, + "grad_norm": 4.060016465231984, + "learning_rate": 6.445297317147259e-07, + "loss": 0.0558, + "step": 2524 + }, + { + "epoch": 2.5595539787126205, + "grad_norm": 0.13545159334736182, + "learning_rate": 6.387458998432583e-07, + "loss": 0.1003, + "step": 2526 + }, + { + "epoch": 2.5615813482007095, + "grad_norm": 1.5481603928784122, + "learning_rate": 6.329863650931872e-07, + "loss": 0.0368, + "step": 2528 + }, + { + "epoch": 2.563608717688799, + "grad_norm": 1.695161398983882, + "learning_rate": 6.2725115955164e-07, + "loss": 0.0441, + "step": 2530 + }, + { + "epoch": 2.565636087176888, + "grad_norm": 1.384480834353014, + "learning_rate": 6.215403151702009e-07, + "loss": 0.0524, + "step": 2532 + }, + { + "epoch": 2.5676634566649774, + "grad_norm": 1.546629707684405, + "learning_rate": 6.158538637647393e-07, + "loss": 0.0068, + "step": 2534 + }, + { + "epoch": 2.5696908261530664, + "grad_norm": 0.8985192301254908, + "learning_rate": 6.101918370152221e-07, + "loss": 0.014, + "step": 2536 + }, + { + "epoch": 2.5717181956411554, + "grad_norm": 3.619572606354654, + "learning_rate": 6.04554266465549e-07, + "loss": 0.1263, + "step": 2538 + }, + { + "epoch": 2.573745565129245, + "grad_norm": 0.8673435272672887, + "learning_rate": 5.989411835233677e-07, + "loss": 0.0098, + "step": 2540 + }, + { + "epoch": 2.575772934617334, + "grad_norm": 9.976581416312325, + "learning_rate": 5.933526194599037e-07, + "loss": 0.1655, + "step": 2542 + }, + { + "epoch": 2.5778003041054234, + "grad_norm": 3.1011274699805416, + "learning_rate": 5.877886054097842e-07, + "loss": 0.0242, + "step": 2544 + }, + { + "epoch": 2.5798276735935124, + "grad_norm": 6.900772141097595, + "learning_rate": 5.822491723708629e-07, + "loss": 0.0511, + "step": 2546 + }, + { + "epoch": 2.5818550430816014, + "grad_norm": 2.2341361114441756, + "learning_rate": 5.767343512040541e-07, + "loss": 0.0305, + "step": 2548 + }, + { + "epoch": 2.583882412569691, + "grad_norm": 0.5315359301629249, + "learning_rate": 5.712441726331502e-07, + "loss": 0.0035, + "step": 2550 + }, + { + "epoch": 2.58590978205778, + "grad_norm": 0.7617955232369547, + "learning_rate": 5.65778667244663e-07, + "loss": 0.0043, + "step": 2552 + }, + { + "epoch": 2.5879371515458693, + "grad_norm": 1.7176363609632552, + "learning_rate": 5.603378654876401e-07, + "loss": 0.0227, + "step": 2554 + }, + { + "epoch": 2.5899645210339584, + "grad_norm": 0.13241648866339098, + "learning_rate": 5.549217976735072e-07, + "loss": 0.0098, + "step": 2556 + }, + { + "epoch": 2.5919918905220474, + "grad_norm": 0.11181201818917086, + "learning_rate": 5.495304939758911e-07, + "loss": 0.0016, + "step": 2558 + }, + { + "epoch": 2.594019260010137, + "grad_norm": 5.274079452760565, + "learning_rate": 5.441639844304558e-07, + "loss": 0.073, + "step": 2560 + }, + { + "epoch": 2.5960466294982263, + "grad_norm": 2.5437583461996716, + "learning_rate": 5.388222989347336e-07, + "loss": 0.0213, + "step": 2562 + }, + { + "epoch": 2.5980739989863153, + "grad_norm": 1.0706351009834476, + "learning_rate": 5.335054672479572e-07, + "loss": 0.0154, + "step": 2564 + }, + { + "epoch": 2.6001013684744043, + "grad_norm": 1.2222633545837618, + "learning_rate": 5.282135189908982e-07, + "loss": 0.0449, + "step": 2566 + }, + { + "epoch": 2.6021287379624938, + "grad_norm": 5.506925034796873, + "learning_rate": 5.22946483645696e-07, + "loss": 0.0393, + "step": 2568 + }, + { + "epoch": 2.604156107450583, + "grad_norm": 1.892532017494142, + "learning_rate": 5.177043905557e-07, + "loss": 0.0377, + "step": 2570 + }, + { + "epoch": 2.6061834769386722, + "grad_norm": 2.183200543242684, + "learning_rate": 5.124872689253019e-07, + "loss": 0.0281, + "step": 2572 + }, + { + "epoch": 2.6082108464267613, + "grad_norm": 2.0988849900336697, + "learning_rate": 5.072951478197724e-07, + "loss": 0.0579, + "step": 2574 + }, + { + "epoch": 2.6102382159148503, + "grad_norm": 3.7497123982353484, + "learning_rate": 5.021280561651037e-07, + "loss": 0.0635, + "step": 2576 + }, + { + "epoch": 2.6122655854029397, + "grad_norm": 4.872497965374125, + "learning_rate": 4.96986022747844e-07, + "loss": 0.0205, + "step": 2578 + }, + { + "epoch": 2.6142929548910288, + "grad_norm": 0.823543499832657, + "learning_rate": 4.918690762149408e-07, + "loss": 0.0874, + "step": 2580 + }, + { + "epoch": 2.616320324379118, + "grad_norm": 3.5702120937829336, + "learning_rate": 4.867772450735747e-07, + "loss": 0.0218, + "step": 2582 + }, + { + "epoch": 2.6183476938672072, + "grad_norm": 0.1864275324621419, + "learning_rate": 4.817105576910097e-07, + "loss": 0.0227, + "step": 2584 + }, + { + "epoch": 2.6203750633552962, + "grad_norm": 0.23205739031319422, + "learning_rate": 4.766690422944298e-07, + "loss": 0.016, + "step": 2586 + }, + { + "epoch": 2.6224024328433857, + "grad_norm": 1.4973435240115818, + "learning_rate": 4.716527269707799e-07, + "loss": 0.0232, + "step": 2588 + }, + { + "epoch": 2.624429802331475, + "grad_norm": 0.8807559140120197, + "learning_rate": 4.666616396666157e-07, + "loss": 0.0384, + "step": 2590 + }, + { + "epoch": 2.626457171819564, + "grad_norm": 0.9222423167191565, + "learning_rate": 4.6169580818794025e-07, + "loss": 0.01, + "step": 2592 + }, + { + "epoch": 2.628484541307653, + "grad_norm": 5.830541536186348, + "learning_rate": 4.5675526020005635e-07, + "loss": 0.0699, + "step": 2594 + }, + { + "epoch": 2.6305119107957426, + "grad_norm": 0.35865094231509714, + "learning_rate": 4.5184002322740784e-07, + "loss": 0.0201, + "step": 2596 + }, + { + "epoch": 2.6325392802838317, + "grad_norm": 3.2225054907222384, + "learning_rate": 4.469501246534291e-07, + "loss": 0.0815, + "step": 2598 + }, + { + "epoch": 2.634566649771921, + "grad_norm": 0.4137849881741042, + "learning_rate": 4.42085591720387e-07, + "loss": 0.0104, + "step": 2600 + }, + { + "epoch": 2.63659401926001, + "grad_norm": 2.480895585242386, + "learning_rate": 4.372464515292374e-07, + "loss": 0.057, + "step": 2602 + }, + { + "epoch": 2.638621388748099, + "grad_norm": 2.8820500850669935, + "learning_rate": 4.3243273103946826e-07, + "loss": 0.0372, + "step": 2604 + }, + { + "epoch": 2.6406487582361886, + "grad_norm": 2.403745903508521, + "learning_rate": 4.2764445706894976e-07, + "loss": 0.0899, + "step": 2606 + }, + { + "epoch": 2.6426761277242776, + "grad_norm": 1.0379471378863707, + "learning_rate": 4.228816562937882e-07, + "loss": 0.013, + "step": 2608 + }, + { + "epoch": 2.644703497212367, + "grad_norm": 1.7226087235673784, + "learning_rate": 4.1814435524817343e-07, + "loss": 0.0115, + "step": 2610 + }, + { + "epoch": 2.646730866700456, + "grad_norm": 1.7608472364602148, + "learning_rate": 4.134325803242356e-07, + "loss": 0.0228, + "step": 2612 + }, + { + "epoch": 2.648758236188545, + "grad_norm": 4.155119404836121, + "learning_rate": 4.087463577718914e-07, + "loss": 0.0959, + "step": 2614 + }, + { + "epoch": 2.6507856056766346, + "grad_norm": 1.708308893559062, + "learning_rate": 4.0408571369870475e-07, + "loss": 0.0397, + "step": 2616 + }, + { + "epoch": 2.652812975164724, + "grad_norm": 2.382592817291534, + "learning_rate": 3.9945067406974067e-07, + "loss": 0.0209, + "step": 2618 + }, + { + "epoch": 2.654840344652813, + "grad_norm": 5.427259169799123, + "learning_rate": 3.948412647074135e-07, + "loss": 0.0636, + "step": 2620 + }, + { + "epoch": 2.656867714140902, + "grad_norm": 1.372412434731099, + "learning_rate": 3.9025751129135146e-07, + "loss": 0.011, + "step": 2622 + }, + { + "epoch": 2.6588950836289915, + "grad_norm": 3.1253796906352447, + "learning_rate": 3.8569943935824803e-07, + "loss": 0.0993, + "step": 2624 + }, + { + "epoch": 2.6609224531170805, + "grad_norm": 3.327470811648209, + "learning_rate": 3.811670743017232e-07, + "loss": 0.061, + "step": 2626 + }, + { + "epoch": 2.66294982260517, + "grad_norm": 2.6194939851837247, + "learning_rate": 3.766604413721797e-07, + "loss": 0.0218, + "step": 2628 + }, + { + "epoch": 2.664977192093259, + "grad_norm": 1.5865020702931334, + "learning_rate": 3.721795656766636e-07, + "loss": 0.0099, + "step": 2630 + }, + { + "epoch": 2.667004561581348, + "grad_norm": 3.388088519289982, + "learning_rate": 3.677244721787249e-07, + "loss": 0.0308, + "step": 2632 + }, + { + "epoch": 2.6690319310694375, + "grad_norm": 1.0808903372098169, + "learning_rate": 3.632951856982747e-07, + "loss": 0.0072, + "step": 2634 + }, + { + "epoch": 2.6710593005575265, + "grad_norm": 0.9512020000019997, + "learning_rate": 3.588917309114531e-07, + "loss": 0.0198, + "step": 2636 + }, + { + "epoch": 2.673086670045616, + "grad_norm": 3.9039794129679057, + "learning_rate": 3.545141323504869e-07, + "loss": 0.0595, + "step": 2638 + }, + { + "epoch": 2.675114039533705, + "grad_norm": 0.33567651027910306, + "learning_rate": 3.501624144035559e-07, + "loss": 0.0188, + "step": 2640 + }, + { + "epoch": 2.677141409021794, + "grad_norm": 2.4667958860248307, + "learning_rate": 3.458366013146519e-07, + "loss": 0.0378, + "step": 2642 + }, + { + "epoch": 2.6791687785098834, + "grad_norm": 1.7006063849431838, + "learning_rate": 3.415367171834505e-07, + "loss": 0.0178, + "step": 2644 + }, + { + "epoch": 2.681196147997973, + "grad_norm": 0.11113886315431107, + "learning_rate": 3.37262785965175e-07, + "loss": 0.0154, + "step": 2646 + }, + { + "epoch": 2.683223517486062, + "grad_norm": 2.210746968801356, + "learning_rate": 3.330148314704562e-07, + "loss": 0.0418, + "step": 2648 + }, + { + "epoch": 2.685250886974151, + "grad_norm": 0.9729082307868689, + "learning_rate": 3.2879287736521157e-07, + "loss": 0.0138, + "step": 2650 + }, + { + "epoch": 2.6872782564622404, + "grad_norm": 0.9089197135525665, + "learning_rate": 3.245969471705013e-07, + "loss": 0.0016, + "step": 2652 + }, + { + "epoch": 2.6893056259503294, + "grad_norm": 2.257843119269901, + "learning_rate": 3.204270642624069e-07, + "loss": 0.0433, + "step": 2654 + }, + { + "epoch": 2.691332995438419, + "grad_norm": 3.723802011295843, + "learning_rate": 3.16283251871895e-07, + "loss": 0.04, + "step": 2656 + }, + { + "epoch": 2.693360364926508, + "grad_norm": 1.552565505315235, + "learning_rate": 3.1216553308469124e-07, + "loss": 0.0231, + "step": 2658 + }, + { + "epoch": 2.695387734414597, + "grad_norm": 0.4924981655010352, + "learning_rate": 3.0807393084115014e-07, + "loss": 0.0018, + "step": 2660 + }, + { + "epoch": 2.6974151039026864, + "grad_norm": 2.1427434786649835, + "learning_rate": 3.040084679361255e-07, + "loss": 0.0403, + "step": 2662 + }, + { + "epoch": 2.6994424733907754, + "grad_norm": 4.721114342306017, + "learning_rate": 2.999691670188487e-07, + "loss": 0.0163, + "step": 2664 + }, + { + "epoch": 2.701469842878865, + "grad_norm": 0.1629418720842212, + "learning_rate": 2.959560505927955e-07, + "loss": 0.0102, + "step": 2666 + }, + { + "epoch": 2.703497212366954, + "grad_norm": 1.4097184690633107, + "learning_rate": 2.919691410155684e-07, + "loss": 0.0053, + "step": 2668 + }, + { + "epoch": 2.705524581855043, + "grad_norm": 0.8322422807376985, + "learning_rate": 2.8800846049876484e-07, + "loss": 0.0117, + "step": 2670 + }, + { + "epoch": 2.7075519513431323, + "grad_norm": 8.398354096975819, + "learning_rate": 2.8407403110785825e-07, + "loss": 0.0481, + "step": 2672 + }, + { + "epoch": 2.7095793208312213, + "grad_norm": 1.0128602735938765, + "learning_rate": 2.8016587476207333e-07, + "loss": 0.0071, + "step": 2674 + }, + { + "epoch": 2.711606690319311, + "grad_norm": 5.301195315910917, + "learning_rate": 2.762840132342648e-07, + "loss": 0.0809, + "step": 2676 + }, + { + "epoch": 2.7136340598074, + "grad_norm": 0.3870534239138227, + "learning_rate": 2.724284681507944e-07, + "loss": 0.0404, + "step": 2678 + }, + { + "epoch": 2.7156614292954893, + "grad_norm": 3.0705752873162306, + "learning_rate": 2.6859926099141175e-07, + "loss": 0.0444, + "step": 2680 + }, + { + "epoch": 2.7176887987835783, + "grad_norm": 0.9645810694685607, + "learning_rate": 2.647964130891351e-07, + "loss": 0.0094, + "step": 2682 + }, + { + "epoch": 2.7197161682716677, + "grad_norm": 1.8629826192928138, + "learning_rate": 2.61019945630131e-07, + "loss": 0.0086, + "step": 2684 + }, + { + "epoch": 2.7217435377597567, + "grad_norm": 0.284845388974145, + "learning_rate": 2.5726987965359716e-07, + "loss": 0.0117, + "step": 2686 + }, + { + "epoch": 2.7237709072478458, + "grad_norm": 1.8580987898674064, + "learning_rate": 2.535462360516455e-07, + "loss": 0.0132, + "step": 2688 + }, + { + "epoch": 2.725798276735935, + "grad_norm": 0.9728057770167216, + "learning_rate": 2.4984903556918573e-07, + "loss": 0.009, + "step": 2690 + }, + { + "epoch": 2.7278256462240242, + "grad_norm": 0.8413048849970278, + "learning_rate": 2.461782988038081e-07, + "loss": 0.0401, + "step": 2692 + }, + { + "epoch": 2.7298530157121137, + "grad_norm": 0.9277818240953681, + "learning_rate": 2.425340462056719e-07, + "loss": 0.0351, + "step": 2694 + }, + { + "epoch": 2.7318803852002027, + "grad_norm": 0.8004340507334959, + "learning_rate": 2.3891629807738847e-07, + "loss": 0.0294, + "step": 2696 + }, + { + "epoch": 2.7339077546882917, + "grad_norm": 1.859041061650387, + "learning_rate": 2.3532507457390884e-07, + "loss": 0.0332, + "step": 2698 + }, + { + "epoch": 2.735935124176381, + "grad_norm": 2.819128430612138, + "learning_rate": 2.3176039570241394e-07, + "loss": 0.1279, + "step": 2700 + }, + { + "epoch": 2.73796249366447, + "grad_norm": 4.465246424897435, + "learning_rate": 2.2822228132219792e-07, + "loss": 0.0533, + "step": 2702 + }, + { + "epoch": 2.7399898631525597, + "grad_norm": 1.103928515449916, + "learning_rate": 2.2471075114456232e-07, + "loss": 0.0129, + "step": 2704 + }, + { + "epoch": 2.7420172326406487, + "grad_norm": 7.032493640342869, + "learning_rate": 2.2122582473270594e-07, + "loss": 0.1243, + "step": 2706 + }, + { + "epoch": 2.7440446021287377, + "grad_norm": 0.860031301154004, + "learning_rate": 2.1776752150161228e-07, + "loss": 0.0194, + "step": 2708 + }, + { + "epoch": 2.746071971616827, + "grad_norm": 1.4493139838888847, + "learning_rate": 2.143358607179441e-07, + "loss": 0.0184, + "step": 2710 + }, + { + "epoch": 2.7480993411049166, + "grad_norm": 0.21974064822510286, + "learning_rate": 2.1093086149993613e-07, + "loss": 0.0753, + "step": 2712 + }, + { + "epoch": 2.7501267105930056, + "grad_norm": 1.2979787663162798, + "learning_rate": 2.075525428172892e-07, + "loss": 0.0957, + "step": 2714 + }, + { + "epoch": 2.7521540800810946, + "grad_norm": 0.03419502175687808, + "learning_rate": 2.0420092349105968e-07, + "loss": 0.0126, + "step": 2716 + }, + { + "epoch": 2.754181449569184, + "grad_norm": 1.4149161048970205, + "learning_rate": 2.0087602219356183e-07, + "loss": 0.0095, + "step": 2718 + }, + { + "epoch": 2.756208819057273, + "grad_norm": 0.6770256703090225, + "learning_rate": 1.9757785744825953e-07, + "loss": 0.0078, + "step": 2720 + }, + { + "epoch": 2.7582361885453626, + "grad_norm": 2.015377002849536, + "learning_rate": 1.9430644762966134e-07, + "loss": 0.0256, + "step": 2722 + }, + { + "epoch": 2.7602635580334516, + "grad_norm": 2.488332723579476, + "learning_rate": 1.9106181096322508e-07, + "loss": 0.053, + "step": 2724 + }, + { + "epoch": 2.7622909275215406, + "grad_norm": 2.6453541263701323, + "learning_rate": 1.8784396552524675e-07, + "loss": 0.0545, + "step": 2726 + }, + { + "epoch": 2.76431829700963, + "grad_norm": 1.6905099825307215, + "learning_rate": 1.8465292924276844e-07, + "loss": 0.0132, + "step": 2728 + }, + { + "epoch": 2.766345666497719, + "grad_norm": 0.3841988342923428, + "learning_rate": 1.814887198934745e-07, + "loss": 0.0501, + "step": 2730 + }, + { + "epoch": 2.7683730359858085, + "grad_norm": 2.4967302899072195, + "learning_rate": 1.78351355105591e-07, + "loss": 0.0132, + "step": 2732 + }, + { + "epoch": 2.7704004054738975, + "grad_norm": 0.5770246940961246, + "learning_rate": 1.7524085235779253e-07, + "loss": 0.0167, + "step": 2734 + }, + { + "epoch": 2.7724277749619866, + "grad_norm": 1.4370219184715807, + "learning_rate": 1.7215722897909793e-07, + "loss": 0.0527, + "step": 2736 + }, + { + "epoch": 2.774455144450076, + "grad_norm": 1.8537840494545188, + "learning_rate": 1.6910050214878072e-07, + "loss": 0.0233, + "step": 2738 + }, + { + "epoch": 2.7764825139381655, + "grad_norm": 0.21711958352250033, + "learning_rate": 1.6607068889626765e-07, + "loss": 0.0102, + "step": 2740 + }, + { + "epoch": 2.7785098834262545, + "grad_norm": 0.8590250861685046, + "learning_rate": 1.6306780610104934e-07, + "loss": 0.0045, + "step": 2742 + }, + { + "epoch": 2.7805372529143435, + "grad_norm": 2.3150532531665005, + "learning_rate": 1.6009187049258024e-07, + "loss": 0.038, + "step": 2744 + }, + { + "epoch": 2.782564622402433, + "grad_norm": 0.9033329473570512, + "learning_rate": 1.571428986501905e-07, + "loss": 0.0173, + "step": 2746 + }, + { + "epoch": 2.784591991890522, + "grad_norm": 1.7044260853289488, + "learning_rate": 1.5422090700299097e-07, + "loss": 0.0181, + "step": 2748 + }, + { + "epoch": 2.7866193613786114, + "grad_norm": 0.27487776319356677, + "learning_rate": 1.5132591182978107e-07, + "loss": 0.0016, + "step": 2750 + }, + { + "epoch": 2.7886467308667005, + "grad_norm": 2.549873049539023, + "learning_rate": 1.484579292589622e-07, + "loss": 0.0778, + "step": 2752 + }, + { + "epoch": 2.7906741003547895, + "grad_norm": 0.27185899226219806, + "learning_rate": 1.456169752684422e-07, + "loss": 0.0643, + "step": 2754 + }, + { + "epoch": 2.792701469842879, + "grad_norm": 1.9101815881440711, + "learning_rate": 1.4280306568555113e-07, + "loss": 0.0175, + "step": 2756 + }, + { + "epoch": 2.794728839330968, + "grad_norm": 0.5796717339476736, + "learning_rate": 1.400162161869484e-07, + "loss": 0.0127, + "step": 2758 + }, + { + "epoch": 2.7967562088190574, + "grad_norm": 2.7244676918049477, + "learning_rate": 1.372564422985423e-07, + "loss": 0.0173, + "step": 2760 + }, + { + "epoch": 2.7987835783071464, + "grad_norm": 0.10447799987439593, + "learning_rate": 1.3452375939539407e-07, + "loss": 0.011, + "step": 2762 + }, + { + "epoch": 2.8008109477952354, + "grad_norm": 1.2634879015106038, + "learning_rate": 1.3181818270164238e-07, + "loss": 0.0404, + "step": 2764 + }, + { + "epoch": 2.802838317283325, + "grad_norm": 4.028425822641425, + "learning_rate": 1.2913972729041156e-07, + "loss": 0.0257, + "step": 2766 + }, + { + "epoch": 2.8048656867714143, + "grad_norm": 1.3553616766758172, + "learning_rate": 1.2648840808372864e-07, + "loss": 0.0382, + "step": 2768 + }, + { + "epoch": 2.8068930562595034, + "grad_norm": 2.708194583259918, + "learning_rate": 1.2386423985244255e-07, + "loss": 0.0221, + "step": 2770 + }, + { + "epoch": 2.8089204257475924, + "grad_norm": 2.5789729672209, + "learning_rate": 1.2126723721614053e-07, + "loss": 0.066, + "step": 2772 + }, + { + "epoch": 2.810947795235682, + "grad_norm": 0.18273048228862399, + "learning_rate": 1.1869741464306749e-07, + "loss": 0.0217, + "step": 2774 + }, + { + "epoch": 2.812975164723771, + "grad_norm": 0.0707510423528695, + "learning_rate": 1.1615478645004286e-07, + "loss": 0.002, + "step": 2776 + }, + { + "epoch": 2.8150025342118603, + "grad_norm": 1.1894310452158783, + "learning_rate": 1.1363936680238275e-07, + "loss": 0.0346, + "step": 2778 + }, + { + "epoch": 2.8170299036999493, + "grad_norm": 3.405556135218041, + "learning_rate": 1.111511697138229e-07, + "loss": 0.0259, + "step": 2780 + }, + { + "epoch": 2.8190572731880383, + "grad_norm": 1.2717604332399925, + "learning_rate": 1.0869020904643646e-07, + "loss": 0.0085, + "step": 2782 + }, + { + "epoch": 2.821084642676128, + "grad_norm": 1.8892606639142935, + "learning_rate": 1.0625649851055963e-07, + "loss": 0.0428, + "step": 2784 + }, + { + "epoch": 2.823112012164217, + "grad_norm": 0.39324430978823577, + "learning_rate": 1.0385005166471451e-07, + "loss": 0.0273, + "step": 2786 + }, + { + "epoch": 2.8251393816523063, + "grad_norm": 1.3421997379901087, + "learning_rate": 1.0147088191553412e-07, + "loss": 0.0079, + "step": 2788 + }, + { + "epoch": 2.8271667511403953, + "grad_norm": 4.251220604108327, + "learning_rate": 9.911900251768636e-08, + "loss": 0.067, + "step": 2790 + }, + { + "epoch": 2.8291941206284843, + "grad_norm": 3.8261234757551033, + "learning_rate": 9.679442657380079e-08, + "loss": 0.0939, + "step": 2792 + }, + { + "epoch": 2.8312214901165738, + "grad_norm": 0.6456806695627316, + "learning_rate": 9.449716703439805e-08, + "loss": 0.0342, + "step": 2794 + }, + { + "epoch": 2.833248859604663, + "grad_norm": 0.3200952424888686, + "learning_rate": 9.222723669781219e-08, + "loss": 0.0244, + "step": 2796 + }, + { + "epoch": 2.8352762290927522, + "grad_norm": 1.829739843819025, + "learning_rate": 8.99846482101252e-08, + "loss": 0.026, + "step": 2798 + }, + { + "epoch": 2.8373035985808412, + "grad_norm": 1.3651062397654237, + "learning_rate": 8.776941406509198e-08, + "loss": 0.0242, + "step": 2800 + }, + { + "epoch": 2.8393309680689307, + "grad_norm": 2.2001621020910225, + "learning_rate": 8.558154660407547e-08, + "loss": 0.0479, + "step": 2802 + }, + { + "epoch": 2.8413583375570197, + "grad_norm": 2.2975032213175655, + "learning_rate": 8.342105801597222e-08, + "loss": 0.0277, + "step": 2804 + }, + { + "epoch": 2.843385707045109, + "grad_norm": 1.1711566075875772, + "learning_rate": 8.128796033714969e-08, + "loss": 0.0417, + "step": 2806 + }, + { + "epoch": 2.845413076533198, + "grad_norm": 0.4645326007177909, + "learning_rate": 7.918226545137575e-08, + "loss": 0.0411, + "step": 2808 + }, + { + "epoch": 2.847440446021287, + "grad_norm": 4.483371508513941, + "learning_rate": 7.710398508975481e-08, + "loss": 0.0806, + "step": 2810 + }, + { + "epoch": 2.8494678155093767, + "grad_norm": 5.769356681332059, + "learning_rate": 7.505313083066069e-08, + "loss": 0.0471, + "step": 2812 + }, + { + "epoch": 2.8514951849974657, + "grad_norm": 3.1296082807665577, + "learning_rate": 7.302971409967163e-08, + "loss": 0.0405, + "step": 2814 + }, + { + "epoch": 2.853522554485555, + "grad_norm": 1.2775705398324217, + "learning_rate": 7.103374616951042e-08, + "loss": 0.006, + "step": 2816 + }, + { + "epoch": 2.855549923973644, + "grad_norm": 0.6578004962366253, + "learning_rate": 6.906523815997601e-08, + "loss": 0.0139, + "step": 2818 + }, + { + "epoch": 2.857577293461733, + "grad_norm": 2.0149454563756204, + "learning_rate": 6.712420103788642e-08, + "loss": 0.0632, + "step": 2820 + }, + { + "epoch": 2.8596046629498226, + "grad_norm": 2.5626873890875412, + "learning_rate": 6.521064561701651e-08, + "loss": 0.0223, + "step": 2822 + }, + { + "epoch": 2.8616320324379116, + "grad_norm": 3.3825746815698494, + "learning_rate": 6.332458255803364e-08, + "loss": 0.0383, + "step": 2824 + }, + { + "epoch": 2.863659401926001, + "grad_norm": 0.20506778822835042, + "learning_rate": 6.146602236844545e-08, + "loss": 0.0501, + "step": 2826 + }, + { + "epoch": 2.86568677141409, + "grad_norm": 2.7751412477853954, + "learning_rate": 5.963497540253493e-08, + "loss": 0.0895, + "step": 2828 + }, + { + "epoch": 2.8677141409021796, + "grad_norm": 0.7901490730158314, + "learning_rate": 5.7831451861306007e-08, + "loss": 0.0074, + "step": 2830 + }, + { + "epoch": 2.8697415103902686, + "grad_norm": 4.244172968420652, + "learning_rate": 5.605546179242638e-08, + "loss": 0.0543, + "step": 2832 + }, + { + "epoch": 2.871768879878358, + "grad_norm": 1.3314542874591544, + "learning_rate": 5.4307015090170336e-08, + "loss": 0.0136, + "step": 2834 + }, + { + "epoch": 2.873796249366447, + "grad_norm": 0.0995581926630153, + "learning_rate": 5.258612149536546e-08, + "loss": 0.0087, + "step": 2836 + }, + { + "epoch": 2.875823618854536, + "grad_norm": 0.5600129610077456, + "learning_rate": 5.089279059533658e-08, + "loss": 0.0192, + "step": 2838 + }, + { + "epoch": 2.8778509883426255, + "grad_norm": 0.7138119125315527, + "learning_rate": 4.9227031823853025e-08, + "loss": 0.0113, + "step": 2840 + }, + { + "epoch": 2.8798783578307146, + "grad_norm": 3.200005925635518, + "learning_rate": 4.758885446107642e-08, + "loss": 0.033, + "step": 2842 + }, + { + "epoch": 2.881905727318804, + "grad_norm": 2.1636767518035476, + "learning_rate": 4.59782676335091e-08, + "loss": 0.0172, + "step": 2844 + }, + { + "epoch": 2.883933096806893, + "grad_norm": 1.9554277872943162, + "learning_rate": 4.439528031394247e-08, + "loss": 0.0342, + "step": 2846 + }, + { + "epoch": 2.885960466294982, + "grad_norm": 0.8784821616393415, + "learning_rate": 4.283990132140814e-08, + "loss": 0.0378, + "step": 2848 + }, + { + "epoch": 2.8879878357830715, + "grad_norm": 1.979065166846928, + "learning_rate": 4.131213932112577e-08, + "loss": 0.0285, + "step": 2850 + }, + { + "epoch": 2.8900152052711605, + "grad_norm": 1.976068431496675, + "learning_rate": 3.9812002824460294e-08, + "loss": 0.0257, + "step": 2852 + }, + { + "epoch": 2.89204257475925, + "grad_norm": 3.9503086444785116, + "learning_rate": 3.833950018886978e-08, + "loss": 0.0885, + "step": 2854 + }, + { + "epoch": 2.894069944247339, + "grad_norm": 2.1936543415869774, + "learning_rate": 3.6894639617859886e-08, + "loss": 0.0493, + "step": 2856 + }, + { + "epoch": 2.8960973137354284, + "grad_norm": 1.7024455025730059, + "learning_rate": 3.547742916093944e-08, + "loss": 0.0357, + "step": 2858 + }, + { + "epoch": 2.8981246832235175, + "grad_norm": 1.2128570058992791, + "learning_rate": 3.408787671357494e-08, + "loss": 0.0388, + "step": 2860 + }, + { + "epoch": 2.900152052711607, + "grad_norm": 0.7442449420857135, + "learning_rate": 3.272599001714616e-08, + "loss": 0.0022, + "step": 2862 + }, + { + "epoch": 2.902179422199696, + "grad_norm": 0.5619109159568635, + "learning_rate": 3.139177665890281e-08, + "loss": 0.1396, + "step": 2864 + }, + { + "epoch": 2.904206791687785, + "grad_norm": 1.1488479732813968, + "learning_rate": 3.0085244071924035e-08, + "loss": 0.0191, + "step": 2866 + }, + { + "epoch": 2.9062341611758744, + "grad_norm": 0.9616394935905191, + "learning_rate": 2.8806399535075135e-08, + "loss": 0.0177, + "step": 2868 + }, + { + "epoch": 2.9082615306639634, + "grad_norm": 0.12068421735851538, + "learning_rate": 2.7555250172967008e-08, + "loss": 0.0017, + "step": 2870 + }, + { + "epoch": 2.910288900152053, + "grad_norm": 1.338380388590219, + "learning_rate": 2.633180295591786e-08, + "loss": 0.0399, + "step": 2872 + }, + { + "epoch": 2.912316269640142, + "grad_norm": 1.2688342076638035, + "learning_rate": 2.5136064699913808e-08, + "loss": 0.0079, + "step": 2874 + }, + { + "epoch": 2.914343639128231, + "grad_norm": 4.232392829115634, + "learning_rate": 2.396804206656944e-08, + "loss": 0.0259, + "step": 2876 + }, + { + "epoch": 2.9163710086163204, + "grad_norm": 5.067939002510044, + "learning_rate": 2.282774156309342e-08, + "loss": 0.0191, + "step": 2878 + }, + { + "epoch": 2.9183983781044094, + "grad_norm": 1.3817267976451182, + "learning_rate": 2.171516954225017e-08, + "loss": 0.0251, + "step": 2880 + }, + { + "epoch": 2.920425747592499, + "grad_norm": 0.7024468788688714, + "learning_rate": 2.063033220232491e-08, + "loss": 0.0283, + "step": 2882 + }, + { + "epoch": 2.922453117080588, + "grad_norm": 1.5451615790501059, + "learning_rate": 1.9573235587089234e-08, + "loss": 0.0397, + "step": 2884 + }, + { + "epoch": 2.924480486568677, + "grad_norm": 1.4086809374295963, + "learning_rate": 1.8543885585767805e-08, + "loss": 0.016, + "step": 2886 + }, + { + "epoch": 2.9265078560567663, + "grad_norm": 2.1760516754924217, + "learning_rate": 1.7542287933005608e-08, + "loss": 0.0132, + "step": 2888 + }, + { + "epoch": 2.928535225544856, + "grad_norm": 1.703443113290112, + "learning_rate": 1.656844820883463e-08, + "loss": 0.0194, + "step": 2890 + }, + { + "epoch": 2.930562595032945, + "grad_norm": 1.0051251232076397, + "learning_rate": 1.5622371838644457e-08, + "loss": 0.0902, + "step": 2892 + }, + { + "epoch": 2.932589964521034, + "grad_norm": 1.993295108047863, + "learning_rate": 1.470406409315117e-08, + "loss": 0.0131, + "step": 2894 + }, + { + "epoch": 2.9346173340091233, + "grad_norm": 2.0799952339306604, + "learning_rate": 1.3813530088368498e-08, + "loss": 0.0212, + "step": 2896 + }, + { + "epoch": 2.9366447034972123, + "grad_norm": 4.201409728651566, + "learning_rate": 1.295077478557838e-08, + "loss": 0.0375, + "step": 2898 + }, + { + "epoch": 2.9386720729853018, + "grad_norm": 2.7691839460320846, + "learning_rate": 1.2115802991304326e-08, + "loss": 0.0224, + "step": 2900 + }, + { + "epoch": 2.9406994424733908, + "grad_norm": 0.26726828381930484, + "learning_rate": 1.1308619357284779e-08, + "loss": 0.0039, + "step": 2902 + }, + { + "epoch": 2.94272681196148, + "grad_norm": 1.1579181550089885, + "learning_rate": 1.05292283804459e-08, + "loss": 0.008, + "step": 2904 + }, + { + "epoch": 2.9447541814495692, + "grad_norm": 1.9075044826817458, + "learning_rate": 9.777634402877157e-09, + "loss": 0.0272, + "step": 2906 + }, + { + "epoch": 2.9467815509376583, + "grad_norm": 6.863847231595797, + "learning_rate": 9.053841611808556e-09, + "loss": 0.1981, + "step": 2908 + }, + { + "epoch": 2.9488089204257477, + "grad_norm": 1.8606691129644746, + "learning_rate": 8.357854039584001e-09, + "loss": 0.0536, + "step": 2910 + }, + { + "epoch": 2.9508362899138367, + "grad_norm": 1.0968359384824022, + "learning_rate": 7.689675563642972e-09, + "loss": 0.0135, + "step": 2912 + }, + { + "epoch": 2.9528636594019257, + "grad_norm": 1.3250686758878478, + "learning_rate": 7.049309906494994e-09, + "loss": 0.0055, + "step": 2914 + }, + { + "epoch": 2.954891028890015, + "grad_norm": 0.40892375428920186, + "learning_rate": 6.436760635701866e-09, + "loss": 0.0291, + "step": 2916 + }, + { + "epoch": 2.9569183983781047, + "grad_norm": 5.589708396813834, + "learning_rate": 5.852031163857131e-09, + "loss": 0.0381, + "step": 2918 + }, + { + "epoch": 2.9589457678661937, + "grad_norm": 1.3775314053460428, + "learning_rate": 5.295124748564418e-09, + "loss": 0.0287, + "step": 2920 + }, + { + "epoch": 2.9609731373542827, + "grad_norm": 0.4288372195245844, + "learning_rate": 4.766044492423016e-09, + "loss": 0.0177, + "step": 2922 + }, + { + "epoch": 2.963000506842372, + "grad_norm": 1.8639885608819973, + "learning_rate": 4.2647933430095545e-09, + "loss": 0.0694, + "step": 2924 + }, + { + "epoch": 2.965027876330461, + "grad_norm": 0.7968673739396519, + "learning_rate": 3.7913740928596785e-09, + "loss": 0.0283, + "step": 2926 + }, + { + "epoch": 2.9670552458185506, + "grad_norm": 2.750200498478091, + "learning_rate": 3.3457893794541787e-09, + "loss": 0.0397, + "step": 2928 + }, + { + "epoch": 2.9690826153066396, + "grad_norm": 0.6790694909837364, + "learning_rate": 2.9280416852051074e-09, + "loss": 0.007, + "step": 2930 + }, + { + "epoch": 2.9711099847947287, + "grad_norm": 0.2504883238040136, + "learning_rate": 2.5381333374391305e-09, + "loss": 0.0048, + "step": 2932 + }, + { + "epoch": 2.973137354282818, + "grad_norm": 1.8426208036070884, + "learning_rate": 2.176066508387531e-09, + "loss": 0.0352, + "step": 2934 + }, + { + "epoch": 2.975164723770907, + "grad_norm": 8.377996224262002, + "learning_rate": 1.841843215171779e-09, + "loss": 0.0258, + "step": 2936 + }, + { + "epoch": 2.9771920932589966, + "grad_norm": 0.2866275310323938, + "learning_rate": 1.5354653197940938e-09, + "loss": 0.0075, + "step": 2938 + }, + { + "epoch": 2.9792194627470856, + "grad_norm": 1.0939168025584018, + "learning_rate": 1.2569345291268964e-09, + "loss": 0.0279, + "step": 2940 + }, + { + "epoch": 2.9812468322351746, + "grad_norm": 0.7431021586109571, + "learning_rate": 1.0062523949005976e-09, + "loss": 0.0084, + "step": 2942 + }, + { + "epoch": 2.983274201723264, + "grad_norm": 3.2199099540454488, + "learning_rate": 7.83420313699157e-10, + "loss": 0.0207, + "step": 2944 + }, + { + "epoch": 2.9853015712113535, + "grad_norm": 0.5567334456724427, + "learning_rate": 5.884395269495358e-10, + "loss": 0.0576, + "step": 2946 + }, + { + "epoch": 2.9873289406994425, + "grad_norm": 4.043462718959936, + "learning_rate": 4.213111209155907e-10, + "loss": 0.0575, + "step": 2948 + }, + { + "epoch": 2.9893563101875316, + "grad_norm": 3.0049705361322223, + "learning_rate": 2.820360266908573e-10, + "loss": 0.0266, + "step": 2950 + }, + { + "epoch": 2.991383679675621, + "grad_norm": 2.5995362111821314, + "learning_rate": 1.706150201957746e-10, + "loss": 0.0316, + "step": 2952 + }, + { + "epoch": 2.99341104916371, + "grad_norm": 0.3014010000131884, + "learning_rate": 8.704872217157878e-11, + "loss": 0.0188, + "step": 2954 + }, + { + "epoch": 2.9954384186517995, + "grad_norm": 2.6649107593693624, + "learning_rate": 3.133759817697257e-11, + "loss": 0.0184, + "step": 2956 + }, + { + "epoch": 2.9974657881398885, + "grad_norm": 1.082495821334136, + "learning_rate": 3.481958585904721e-12, + "loss": 0.0348, + "step": 2958 + }, + { + "epoch": 2.9974657881398885, + "step": 2958, + "total_flos": 89480718090240.0, + "train_loss": 0.2258277308333025, + "train_runtime": 8280.9459, + "train_samples_per_second": 5.718, + "train_steps_per_second": 0.357 + } + ], + "logging_steps": 2, + "max_steps": 2958, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 89480718090240.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}