{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0035714285714285713, "grad_norm": 0.3871030807495117, "learning_rate": 1.1904761904761906e-07, "loss": 1.9294867515563965, "step": 2 }, { "epoch": 0.007142857142857143, "grad_norm": 0.34607651829719543, "learning_rate": 3.5714285714285716e-07, "loss": 1.931689739227295, "step": 4 }, { "epoch": 0.010714285714285714, "grad_norm": 0.3077217638492584, "learning_rate": 5.952380952380953e-07, "loss": 1.859986662864685, "step": 6 }, { "epoch": 0.014285714285714285, "grad_norm": 0.26033300161361694, "learning_rate": 8.333333333333333e-07, "loss": 1.8296231031417847, "step": 8 }, { "epoch": 0.017857142857142856, "grad_norm": 0.3577536344528198, "learning_rate": 1.0714285714285714e-06, "loss": 1.840135097503662, "step": 10 }, { "epoch": 0.02142857142857143, "grad_norm": 0.35149258375167847, "learning_rate": 1.3095238095238096e-06, "loss": 1.718151330947876, "step": 12 }, { "epoch": 0.025, "grad_norm": 0.3105311691761017, "learning_rate": 1.5476190476190479e-06, "loss": 1.8123761415481567, "step": 14 }, { "epoch": 0.02857142857142857, "grad_norm": 0.3541400134563446, "learning_rate": 1.7857142857142859e-06, "loss": 1.801349401473999, "step": 16 }, { "epoch": 0.03214285714285714, "grad_norm": 0.32876938581466675, "learning_rate": 2.023809523809524e-06, "loss": 1.8854210376739502, "step": 18 }, { "epoch": 0.03571428571428571, "grad_norm": 0.9392958283424377, "learning_rate": 2.261904761904762e-06, "loss": 1.7024314403533936, "step": 20 }, { "epoch": 0.039285714285714285, "grad_norm": 0.6484195590019226, "learning_rate": 2.5e-06, "loss": 1.9459373950958252, "step": 22 }, { "epoch": 0.04285714285714286, "grad_norm": 0.36433079838752747, "learning_rate": 2.7380952380952387e-06, "loss": 1.9512709379196167, "step": 24 }, { "epoch": 0.04642857142857143, "grad_norm": 0.4358835220336914, "learning_rate": 2.9761904761904763e-06, "loss": 1.7940953969955444, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.506097137928009, "learning_rate": 3.2142857142857147e-06, "loss": 1.7999926805496216, "step": 28 }, { "epoch": 0.05357142857142857, "grad_norm": 0.5315778255462646, "learning_rate": 3.4523809523809528e-06, "loss": 1.7870306968688965, "step": 30 }, { "epoch": 0.05714285714285714, "grad_norm": 0.30482104420661926, "learning_rate": 3.690476190476191e-06, "loss": 1.8913555145263672, "step": 32 }, { "epoch": 0.060714285714285714, "grad_norm": 0.8241702318191528, "learning_rate": 3.928571428571429e-06, "loss": 1.9280858039855957, "step": 34 }, { "epoch": 0.06428571428571428, "grad_norm": 0.39840635657310486, "learning_rate": 4.166666666666667e-06, "loss": 1.9256908893585205, "step": 36 }, { "epoch": 0.06785714285714285, "grad_norm": 0.33251017332077026, "learning_rate": 4.404761904761905e-06, "loss": 1.8829214572906494, "step": 38 }, { "epoch": 0.07142857142857142, "grad_norm": 0.49388226866722107, "learning_rate": 4.642857142857144e-06, "loss": 1.8666248321533203, "step": 40 }, { "epoch": 0.075, "grad_norm": 0.28926795721054077, "learning_rate": 4.880952380952381e-06, "loss": 1.8469940423965454, "step": 42 }, { "epoch": 0.07857142857142857, "grad_norm": 0.317127525806427, "learning_rate": 5.119047619047619e-06, "loss": 1.892695426940918, "step": 44 }, { "epoch": 0.08214285714285714, "grad_norm": 0.8169130682945251, "learning_rate": 5.357142857142857e-06, "loss": 1.893534541130066, "step": 46 }, { "epoch": 0.08571428571428572, "grad_norm": 0.27684587240219116, "learning_rate": 5.595238095238096e-06, "loss": 1.5699528455734253, "step": 48 }, { "epoch": 0.08928571428571429, "grad_norm": 0.5231921076774597, "learning_rate": 5.833333333333334e-06, "loss": 1.6496429443359375, "step": 50 }, { "epoch": 0.09285714285714286, "grad_norm": 0.5755372643470764, "learning_rate": 6.071428571428571e-06, "loss": 1.6312464475631714, "step": 52 }, { "epoch": 0.09642857142857143, "grad_norm": 0.40994322299957275, "learning_rate": 6.30952380952381e-06, "loss": 1.8703556060791016, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.6402392983436584, "learning_rate": 6.547619047619048e-06, "loss": 1.7122882604599, "step": 56 }, { "epoch": 0.10357142857142858, "grad_norm": 0.4092760980129242, "learning_rate": 6.785714285714287e-06, "loss": 1.7604010105133057, "step": 58 }, { "epoch": 0.10714285714285714, "grad_norm": 0.41417962312698364, "learning_rate": 7.023809523809524e-06, "loss": 2.1781420707702637, "step": 60 }, { "epoch": 0.11071428571428571, "grad_norm": 0.6246824264526367, "learning_rate": 7.261904761904762e-06, "loss": 1.7982336282730103, "step": 62 }, { "epoch": 0.11428571428571428, "grad_norm": 0.26309502124786377, "learning_rate": 7.500000000000001e-06, "loss": 1.696463704109192, "step": 64 }, { "epoch": 0.11785714285714285, "grad_norm": 0.9458585381507874, "learning_rate": 7.738095238095238e-06, "loss": 1.7728084325790405, "step": 66 }, { "epoch": 0.12142857142857143, "grad_norm": 0.22862379252910614, "learning_rate": 7.976190476190477e-06, "loss": 1.6821340322494507, "step": 68 }, { "epoch": 0.125, "grad_norm": 0.236324280500412, "learning_rate": 8.214285714285714e-06, "loss": 1.7681533098220825, "step": 70 }, { "epoch": 0.12857142857142856, "grad_norm": 0.2597522735595703, "learning_rate": 8.452380952380953e-06, "loss": 1.8034054040908813, "step": 72 }, { "epoch": 0.13214285714285715, "grad_norm": 0.24487343430519104, "learning_rate": 8.690476190476192e-06, "loss": 1.7554086446762085, "step": 74 }, { "epoch": 0.1357142857142857, "grad_norm": 0.22543826699256897, "learning_rate": 8.92857142857143e-06, "loss": 1.7456854581832886, "step": 76 }, { "epoch": 0.1392857142857143, "grad_norm": 0.2380058914422989, "learning_rate": 9.166666666666666e-06, "loss": 1.7143663167953491, "step": 78 }, { "epoch": 0.14285714285714285, "grad_norm": 0.26500657200813293, "learning_rate": 9.404761904761905e-06, "loss": 1.7059998512268066, "step": 80 }, { "epoch": 0.14642857142857144, "grad_norm": 0.2978551387786865, "learning_rate": 9.642857142857144e-06, "loss": 1.7792344093322754, "step": 82 }, { "epoch": 0.15, "grad_norm": 0.2930593490600586, "learning_rate": 9.880952380952381e-06, "loss": 1.6987429857254028, "step": 84 }, { "epoch": 0.15357142857142858, "grad_norm": 0.4046596884727478, "learning_rate": 9.999991282010348e-06, "loss": 1.7894960641860962, "step": 86 }, { "epoch": 0.15714285714285714, "grad_norm": 0.23502953350543976, "learning_rate": 9.999921538295799e-06, "loss": 1.749454379081726, "step": 88 }, { "epoch": 0.16071428571428573, "grad_norm": 0.22283266484737396, "learning_rate": 9.999782051947632e-06, "loss": 1.686018943786621, "step": 90 }, { "epoch": 0.16428571428571428, "grad_norm": 0.24027639627456665, "learning_rate": 9.999572825127696e-06, "loss": 1.480033040046692, "step": 92 }, { "epoch": 0.16785714285714284, "grad_norm": 0.5684676766395569, "learning_rate": 9.99929386107872e-06, "loss": 1.675416350364685, "step": 94 }, { "epoch": 0.17142857142857143, "grad_norm": 1.138840675354004, "learning_rate": 9.998945164124268e-06, "loss": 1.7155344486236572, "step": 96 }, { "epoch": 0.175, "grad_norm": 0.2664114534854889, "learning_rate": 9.998526739668664e-06, "loss": 1.6043933629989624, "step": 98 }, { "epoch": 0.17857142857142858, "grad_norm": 0.28691864013671875, "learning_rate": 9.998038594196913e-06, "loss": 1.6187028884887695, "step": 100 }, { "epoch": 0.18214285714285713, "grad_norm": 0.31850922107696533, "learning_rate": 9.997480735274608e-06, "loss": 1.5820776224136353, "step": 102 }, { "epoch": 0.18571428571428572, "grad_norm": 0.23401758074760437, "learning_rate": 9.996853171547794e-06, "loss": 1.5967426300048828, "step": 104 }, { "epoch": 0.18928571428571428, "grad_norm": 0.23440219461917877, "learning_rate": 9.996155912742856e-06, "loss": 1.6334154605865479, "step": 106 }, { "epoch": 0.19285714285714287, "grad_norm": 0.7341821193695068, "learning_rate": 9.995388969666348e-06, "loss": 1.598835825920105, "step": 108 }, { "epoch": 0.19642857142857142, "grad_norm": 0.6320663094520569, "learning_rate": 9.994552354204844e-06, "loss": 1.6243830919265747, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.5404586791992188, "learning_rate": 9.993646079324738e-06, "loss": 1.566571831703186, "step": 112 }, { "epoch": 0.20357142857142857, "grad_norm": 0.5022917985916138, "learning_rate": 9.992670159072052e-06, "loss": 1.6408634185791016, "step": 114 }, { "epoch": 0.20714285714285716, "grad_norm": 1.1218639612197876, "learning_rate": 9.991624608572215e-06, "loss": 1.7009669542312622, "step": 116 }, { "epoch": 0.21071428571428572, "grad_norm": 0.5181306004524231, "learning_rate": 9.990509444029833e-06, "loss": 1.7996366024017334, "step": 118 }, { "epoch": 0.21428571428571427, "grad_norm": 0.43997907638549805, "learning_rate": 9.98932468272843e-06, "loss": 1.7554632425308228, "step": 120 }, { "epoch": 0.21785714285714286, "grad_norm": 0.3227292597293854, "learning_rate": 9.98807034303019e-06, "loss": 1.473575472831726, "step": 122 }, { "epoch": 0.22142857142857142, "grad_norm": 0.3611178398132324, "learning_rate": 9.98674644437566e-06, "loss": 1.594710350036621, "step": 124 }, { "epoch": 0.225, "grad_norm": 0.9151387214660645, "learning_rate": 9.985353007283464e-06, "loss": 1.6291745901107788, "step": 126 }, { "epoch": 0.22857142857142856, "grad_norm": 0.2581241726875305, "learning_rate": 9.983890053349969e-06, "loss": 1.2893997430801392, "step": 128 }, { "epoch": 0.23214285714285715, "grad_norm": 0.5861591100692749, "learning_rate": 9.982357605248963e-06, "loss": 1.0495647192001343, "step": 130 }, { "epoch": 0.2357142857142857, "grad_norm": 0.46270960569381714, "learning_rate": 9.980755686731296e-06, "loss": 1.3306972980499268, "step": 132 }, { "epoch": 0.2392857142857143, "grad_norm": 0.36067521572113037, "learning_rate": 9.979084322624518e-06, "loss": 1.5336247682571411, "step": 134 }, { "epoch": 0.24285714285714285, "grad_norm": 0.9336586594581604, "learning_rate": 9.977343538832486e-06, "loss": 1.7042999267578125, "step": 136 }, { "epoch": 0.24642857142857144, "grad_norm": 0.387260377407074, "learning_rate": 9.97553336233497e-06, "loss": 1.396690011024475, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.4640398919582367, "learning_rate": 9.973653821187233e-06, "loss": 1.3623416423797607, "step": 140 }, { "epoch": 0.25357142857142856, "grad_norm": 0.33568593859672546, "learning_rate": 9.971704944519593e-06, "loss": 1.3992865085601807, "step": 142 }, { "epoch": 0.2571428571428571, "grad_norm": 0.24492622911930084, "learning_rate": 9.969686762536973e-06, "loss": 1.444324016571045, "step": 144 }, { "epoch": 0.26071428571428573, "grad_norm": 0.8304792642593384, "learning_rate": 9.967599306518438e-06, "loss": 1.3990877866744995, "step": 146 }, { "epoch": 0.2642857142857143, "grad_norm": 0.623303234577179, "learning_rate": 9.965442608816704e-06, "loss": 1.4563076496124268, "step": 148 }, { "epoch": 0.26785714285714285, "grad_norm": 0.31338322162628174, "learning_rate": 9.963216702857635e-06, "loss": 1.6392706632614136, "step": 150 }, { "epoch": 0.2714285714285714, "grad_norm": 0.3007861375808716, "learning_rate": 9.96092162313973e-06, "loss": 1.5057697296142578, "step": 152 }, { "epoch": 0.275, "grad_norm": 0.15701599419116974, "learning_rate": 9.958557405233593e-06, "loss": 1.4597502946853638, "step": 154 }, { "epoch": 0.2785714285714286, "grad_norm": 0.2882039248943329, "learning_rate": 9.956124085781366e-06, "loss": 1.3839119672775269, "step": 156 }, { "epoch": 0.28214285714285714, "grad_norm": 0.3199823498725891, "learning_rate": 9.953621702496178e-06, "loss": 1.6068451404571533, "step": 158 }, { "epoch": 0.2857142857142857, "grad_norm": 0.24365948140621185, "learning_rate": 9.951050294161548e-06, "loss": 1.7299036979675293, "step": 160 }, { "epoch": 0.2892857142857143, "grad_norm": 0.46145617961883545, "learning_rate": 9.948409900630787e-06, "loss": 1.3489717245101929, "step": 162 }, { "epoch": 0.29285714285714287, "grad_norm": 0.15912453830242157, "learning_rate": 9.945700562826394e-06, "loss": 1.5043880939483643, "step": 164 }, { "epoch": 0.29642857142857143, "grad_norm": 0.1797444075345993, "learning_rate": 9.942922322739395e-06, "loss": 1.1060163974761963, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.2711468040943146, "learning_rate": 9.940075223428718e-06, "loss": 1.8668510913848877, "step": 168 }, { "epoch": 0.30357142857142855, "grad_norm": 0.1544935554265976, "learning_rate": 9.93715930902051e-06, "loss": 1.2808830738067627, "step": 170 }, { "epoch": 0.30714285714285716, "grad_norm": 0.37536972761154175, "learning_rate": 9.934174624707459e-06, "loss": 1.4175796508789062, "step": 172 }, { "epoch": 0.3107142857142857, "grad_norm": 0.22495543956756592, "learning_rate": 9.931121216748092e-06, "loss": 1.6282312870025635, "step": 174 }, { "epoch": 0.3142857142857143, "grad_norm": 0.3511154055595398, "learning_rate": 9.927999132466059e-06, "loss": 1.635170340538025, "step": 176 }, { "epoch": 0.31785714285714284, "grad_norm": 0.47558754682540894, "learning_rate": 9.924808420249404e-06, "loss": 1.563542127609253, "step": 178 }, { "epoch": 0.32142857142857145, "grad_norm": 0.6490837335586548, "learning_rate": 9.921549129549799e-06, "loss": 2.066225290298462, "step": 180 }, { "epoch": 0.325, "grad_norm": 0.37414857745170593, "learning_rate": 9.918221310881797e-06, "loss": 1.2984635829925537, "step": 182 }, { "epoch": 0.32857142857142857, "grad_norm": 0.24293118715286255, "learning_rate": 9.91482501582204e-06, "loss": 1.560595989227295, "step": 184 }, { "epoch": 0.33214285714285713, "grad_norm": 1.0313069820404053, "learning_rate": 9.91136029700846e-06, "loss": 1.6615456342697144, "step": 186 }, { "epoch": 0.3357142857142857, "grad_norm": 0.2365736961364746, "learning_rate": 9.907827208139462e-06, "loss": 1.4550660848617554, "step": 188 }, { "epoch": 0.3392857142857143, "grad_norm": 0.48783159255981445, "learning_rate": 9.904225803973095e-06, "loss": 1.1695599555969238, "step": 190 }, { "epoch": 0.34285714285714286, "grad_norm": 0.18840378522872925, "learning_rate": 9.900556140326203e-06, "loss": 1.175657033920288, "step": 192 }, { "epoch": 0.3464285714285714, "grad_norm": 0.33193379640579224, "learning_rate": 9.896818274073555e-06, "loss": 1.5140769481658936, "step": 194 }, { "epoch": 0.35, "grad_norm": 0.23811382055282593, "learning_rate": 9.893012263146971e-06, "loss": 1.5519834756851196, "step": 196 }, { "epoch": 0.3535714285714286, "grad_norm": 0.3118128478527069, "learning_rate": 9.889138166534416e-06, "loss": 1.6215221881866455, "step": 198 }, { "epoch": 0.35714285714285715, "grad_norm": 0.20358364284038544, "learning_rate": 9.885196044279095e-06, "loss": 1.4951940774917603, "step": 200 }, { "epoch": 0.3607142857142857, "grad_norm": 0.33988499641418457, "learning_rate": 9.881185957478514e-06, "loss": 1.5101016759872437, "step": 202 }, { "epoch": 0.36428571428571427, "grad_norm": 0.2685701251029968, "learning_rate": 9.877107968283538e-06, "loss": 1.3352025747299194, "step": 204 }, { "epoch": 0.3678571428571429, "grad_norm": 0.24243606626987457, "learning_rate": 9.872962139897426e-06, "loss": 1.6032525300979614, "step": 206 }, { "epoch": 0.37142857142857144, "grad_norm": 0.2574315369129181, "learning_rate": 9.86874853657485e-06, "loss": 1.4210426807403564, "step": 208 }, { "epoch": 0.375, "grad_norm": 0.30428075790405273, "learning_rate": 9.864467223620908e-06, "loss": 1.4541680812835693, "step": 210 }, { "epoch": 0.37857142857142856, "grad_norm": 0.27129194140434265, "learning_rate": 9.860118267390092e-06, "loss": 1.5869474411010742, "step": 212 }, { "epoch": 0.3821428571428571, "grad_norm": 0.2757408022880554, "learning_rate": 9.855701735285285e-06, "loss": 1.4132391214370728, "step": 214 }, { "epoch": 0.38571428571428573, "grad_norm": 0.8043122291564941, "learning_rate": 9.851217695756694e-06, "loss": 1.3423351049423218, "step": 216 }, { "epoch": 0.3892857142857143, "grad_norm": 0.3778972327709198, "learning_rate": 9.846666218300808e-06, "loss": 1.4252076148986816, "step": 218 }, { "epoch": 0.39285714285714285, "grad_norm": 0.30076470971107483, "learning_rate": 9.842047373459305e-06, "loss": 1.477191686630249, "step": 220 }, { "epoch": 0.3964285714285714, "grad_norm": 0.2551064193248749, "learning_rate": 9.837361232817964e-06, "loss": 1.4160501956939697, "step": 222 }, { "epoch": 0.4, "grad_norm": 0.3507143557071686, "learning_rate": 9.832607869005565e-06, "loss": 1.3853830099105835, "step": 224 }, { "epoch": 0.4035714285714286, "grad_norm": 0.2576788663864136, "learning_rate": 9.827787355692749e-06, "loss": 1.5642895698547363, "step": 226 }, { "epoch": 0.40714285714285714, "grad_norm": 0.6864569187164307, "learning_rate": 9.822899767590884e-06, "loss": 1.5939396619796753, "step": 228 }, { "epoch": 0.4107142857142857, "grad_norm": 0.290575236082077, "learning_rate": 9.817945180450902e-06, "loss": 1.692050814628601, "step": 230 }, { "epoch": 0.4142857142857143, "grad_norm": 0.2678094506263733, "learning_rate": 9.812923671062139e-06, "loss": 1.5078585147857666, "step": 232 }, { "epoch": 0.41785714285714287, "grad_norm": 0.2580035924911499, "learning_rate": 9.80783531725112e-06, "loss": 1.5001014471054077, "step": 234 }, { "epoch": 0.42142857142857143, "grad_norm": 0.48962509632110596, "learning_rate": 9.80268019788038e-06, "loss": 1.30159592628479, "step": 236 }, { "epoch": 0.425, "grad_norm": 0.3317374587059021, "learning_rate": 9.79745839284722e-06, "loss": 1.3308159112930298, "step": 238 }, { "epoch": 0.42857142857142855, "grad_norm": 0.40648195147514343, "learning_rate": 9.792169983082484e-06, "loss": 1.2273372411727905, "step": 240 }, { "epoch": 0.43214285714285716, "grad_norm": 0.23908843100070953, "learning_rate": 9.786815050549295e-06, "loss": 1.3610113859176636, "step": 242 }, { "epoch": 0.4357142857142857, "grad_norm": 0.30702945590019226, "learning_rate": 9.781393678241787e-06, "loss": 1.526265025138855, "step": 244 }, { "epoch": 0.4392857142857143, "grad_norm": 0.3442508578300476, "learning_rate": 9.775905950183821e-06, "loss": 1.4831691980361938, "step": 246 }, { "epoch": 0.44285714285714284, "grad_norm": 0.2286010980606079, "learning_rate": 9.770351951427684e-06, "loss": 1.5686728954315186, "step": 248 }, { "epoch": 0.44642857142857145, "grad_norm": 0.42278486490249634, "learning_rate": 9.764731768052762e-06, "loss": 1.583655834197998, "step": 250 }, { "epoch": 0.45, "grad_norm": 0.2032240778207779, "learning_rate": 9.75904548716422e-06, "loss": 1.7357392311096191, "step": 252 }, { "epoch": 0.45357142857142857, "grad_norm": 0.6682279706001282, "learning_rate": 9.753293196891639e-06, "loss": 1.4943958520889282, "step": 254 }, { "epoch": 0.45714285714285713, "grad_norm": 0.3326359689235687, "learning_rate": 9.747474986387655e-06, "loss": 1.4537054300308228, "step": 256 }, { "epoch": 0.4607142857142857, "grad_norm": 0.31363770365715027, "learning_rate": 9.74159094582658e-06, "loss": 1.4956998825073242, "step": 258 }, { "epoch": 0.4642857142857143, "grad_norm": 0.6148827075958252, "learning_rate": 9.735641166402998e-06, "loss": 1.3277488946914673, "step": 260 }, { "epoch": 0.46785714285714286, "grad_norm": 0.31903398036956787, "learning_rate": 9.729625740330363e-06, "loss": 1.49782133102417, "step": 262 }, { "epoch": 0.4714285714285714, "grad_norm": 0.8235952258110046, "learning_rate": 9.723544760839555e-06, "loss": 1.460282325744629, "step": 264 }, { "epoch": 0.475, "grad_norm": 0.3165692090988159, "learning_rate": 9.717398322177442e-06, "loss": 1.5261378288269043, "step": 266 }, { "epoch": 0.4785714285714286, "grad_norm": 1.6001724004745483, "learning_rate": 9.71118651960543e-06, "loss": 1.7769297361373901, "step": 268 }, { "epoch": 0.48214285714285715, "grad_norm": 0.26702550053596497, "learning_rate": 9.704909449397962e-06, "loss": 1.3878670930862427, "step": 270 }, { "epoch": 0.4857142857142857, "grad_norm": 0.18522176146507263, "learning_rate": 9.69856720884105e-06, "loss": 1.2690881490707397, "step": 272 }, { "epoch": 0.48928571428571427, "grad_norm": 0.40137895941734314, "learning_rate": 9.692159896230757e-06, "loss": 1.3622859716415405, "step": 274 }, { "epoch": 0.4928571428571429, "grad_norm": 0.34499719738960266, "learning_rate": 9.685687610871666e-06, "loss": 1.6427959203720093, "step": 276 }, { "epoch": 0.49642857142857144, "grad_norm": 0.3400484621524811, "learning_rate": 9.679150453075357e-06, "loss": 1.3161296844482422, "step": 278 }, { "epoch": 0.5, "grad_norm": 0.3539294898509979, "learning_rate": 9.67254852415884e-06, "loss": 1.2691534757614136, "step": 280 }, { "epoch": 0.5035714285714286, "grad_norm": 0.2683607339859009, "learning_rate": 9.665881926442994e-06, "loss": 1.5461015701293945, "step": 282 }, { "epoch": 0.5071428571428571, "grad_norm": 0.3605668246746063, "learning_rate": 9.659150763250966e-06, "loss": 1.6314688920974731, "step": 284 }, { "epoch": 0.5107142857142857, "grad_norm": 0.3184402585029602, "learning_rate": 9.652355138906591e-06, "loss": 1.518629789352417, "step": 286 }, { "epoch": 0.5142857142857142, "grad_norm": 0.870186984539032, "learning_rate": 9.645495158732755e-06, "loss": 1.143850564956665, "step": 288 }, { "epoch": 0.5178571428571429, "grad_norm": 0.33421170711517334, "learning_rate": 9.638570929049776e-06, "loss": 1.0234707593917847, "step": 290 }, { "epoch": 0.5214285714285715, "grad_norm": 0.1717844307422638, "learning_rate": 9.631582557173751e-06, "loss": 1.4672911167144775, "step": 292 }, { "epoch": 0.525, "grad_norm": 0.9301527738571167, "learning_rate": 9.624530151414894e-06, "loss": 1.2730239629745483, "step": 294 }, { "epoch": 0.5285714285714286, "grad_norm": 0.38581225275993347, "learning_rate": 9.617413821075852e-06, "loss": 1.2601397037506104, "step": 296 }, { "epoch": 0.5321428571428571, "grad_norm": 0.23976172506809235, "learning_rate": 9.61023367645002e-06, "loss": 1.2101945877075195, "step": 298 }, { "epoch": 0.5357142857142857, "grad_norm": 0.22152353823184967, "learning_rate": 9.602989828819829e-06, "loss": 1.542162537574768, "step": 300 }, { "epoch": 0.5392857142857143, "grad_norm": 0.4408532381057739, "learning_rate": 9.595682390455015e-06, "loss": 1.3136895895004272, "step": 302 }, { "epoch": 0.5428571428571428, "grad_norm": 0.4480395019054413, "learning_rate": 9.588311474610888e-06, "loss": 1.1242649555206299, "step": 304 }, { "epoch": 0.5464285714285714, "grad_norm": 0.7074999213218689, "learning_rate": 9.580877195526564e-06, "loss": 1.6407079696655273, "step": 306 }, { "epoch": 0.55, "grad_norm": 0.3410518765449524, "learning_rate": 9.573379668423209e-06, "loss": 1.3072420358657837, "step": 308 }, { "epoch": 0.5535714285714286, "grad_norm": 0.47393250465393066, "learning_rate": 9.56581900950225e-06, "loss": 1.4017832279205322, "step": 310 }, { "epoch": 0.5571428571428572, "grad_norm": 0.6089979410171509, "learning_rate": 9.558195335943566e-06, "loss": 1.5297354459762573, "step": 312 }, { "epoch": 0.5607142857142857, "grad_norm": 0.2799089848995209, "learning_rate": 9.550508765903672e-06, "loss": 1.043546199798584, "step": 314 }, { "epoch": 0.5642857142857143, "grad_norm": 0.2620464563369751, "learning_rate": 9.542759418513906e-06, "loss": 1.6538763046264648, "step": 316 }, { "epoch": 0.5678571428571428, "grad_norm": 0.5144315958023071, "learning_rate": 9.534947413878556e-06, "loss": 1.5541188716888428, "step": 318 }, { "epoch": 0.5714285714285714, "grad_norm": 0.29509297013282776, "learning_rate": 9.52707287307302e-06, "loss": 1.1971598863601685, "step": 320 }, { "epoch": 0.575, "grad_norm": 0.1929909884929657, "learning_rate": 9.519135918141913e-06, "loss": 1.1823662519454956, "step": 322 }, { "epoch": 0.5785714285714286, "grad_norm": 0.48544377088546753, "learning_rate": 9.511136672097194e-06, "loss": 1.3313523530960083, "step": 324 }, { "epoch": 0.5821428571428572, "grad_norm": 0.3510501980781555, "learning_rate": 9.503075258916241e-06, "loss": 1.3195650577545166, "step": 326 }, { "epoch": 0.5857142857142857, "grad_norm": 0.2727429270744324, "learning_rate": 9.494951803539942e-06, "loss": 1.2425987720489502, "step": 328 }, { "epoch": 0.5892857142857143, "grad_norm": 15.424638748168945, "learning_rate": 9.486766431870752e-06, "loss": 1.2101187705993652, "step": 330 }, { "epoch": 0.5928571428571429, "grad_norm": 0.2866066098213196, "learning_rate": 9.478519270770746e-06, "loss": 1.2784419059753418, "step": 332 }, { "epoch": 0.5964285714285714, "grad_norm": 0.4156343638896942, "learning_rate": 9.470210448059645e-06, "loss": 1.583785057067871, "step": 334 }, { "epoch": 0.6, "grad_norm": 0.45487159490585327, "learning_rate": 9.46184009251285e-06, "loss": 1.3652830123901367, "step": 336 }, { "epoch": 0.6035714285714285, "grad_norm": 0.32525262236595154, "learning_rate": 9.453408333859427e-06, "loss": 1.4697949886322021, "step": 338 }, { "epoch": 0.6071428571428571, "grad_norm": 0.5784197449684143, "learning_rate": 9.444915302780117e-06, "loss": 1.3824127912521362, "step": 340 }, { "epoch": 0.6107142857142858, "grad_norm": 0.26421067118644714, "learning_rate": 9.436361130905288e-06, "loss": 1.42073655128479, "step": 342 }, { "epoch": 0.6142857142857143, "grad_norm": 0.2380143105983734, "learning_rate": 9.427745950812917e-06, "loss": 1.3658424615859985, "step": 344 }, { "epoch": 0.6178571428571429, "grad_norm": 0.43499693274497986, "learning_rate": 9.41906989602652e-06, "loss": 1.461742639541626, "step": 346 }, { "epoch": 0.6214285714285714, "grad_norm": 0.400419145822525, "learning_rate": 9.410333101013086e-06, "loss": 1.3119421005249023, "step": 348 }, { "epoch": 0.625, "grad_norm": 0.4901754856109619, "learning_rate": 9.401535701180998e-06, "loss": 1.2844195365905762, "step": 350 }, { "epoch": 0.6285714285714286, "grad_norm": 0.48951858282089233, "learning_rate": 9.392677832877932e-06, "loss": 1.568238615989685, "step": 352 }, { "epoch": 0.6321428571428571, "grad_norm": 0.2112666368484497, "learning_rate": 9.383759633388737e-06, "loss": 1.5015143156051636, "step": 354 }, { "epoch": 0.6357142857142857, "grad_norm": 0.2615770101547241, "learning_rate": 9.374781240933316e-06, "loss": 1.5211448669433594, "step": 356 }, { "epoch": 0.6392857142857142, "grad_norm": 0.2031329870223999, "learning_rate": 9.365742794664484e-06, "loss": 1.3461060523986816, "step": 358 }, { "epoch": 0.6428571428571429, "grad_norm": 0.36589643359184265, "learning_rate": 9.356644434665804e-06, "loss": 1.3849568367004395, "step": 360 }, { "epoch": 0.6464285714285715, "grad_norm": 0.3387724757194519, "learning_rate": 9.347486301949417e-06, "loss": 1.558565616607666, "step": 362 }, { "epoch": 0.65, "grad_norm": 0.29686594009399414, "learning_rate": 9.33826853845387e-06, "loss": 1.3272876739501953, "step": 364 }, { "epoch": 0.6535714285714286, "grad_norm": 0.8538780808448792, "learning_rate": 9.328991287041892e-06, "loss": 1.3049917221069336, "step": 366 }, { "epoch": 0.6571428571428571, "grad_norm": 0.2655990719795227, "learning_rate": 9.319654691498205e-06, "loss": 1.2668689489364624, "step": 368 }, { "epoch": 0.6607142857142857, "grad_norm": 0.28246378898620605, "learning_rate": 9.31025889652728e-06, "loss": 1.4307278394699097, "step": 370 }, { "epoch": 0.6642857142857143, "grad_norm": 0.33245643973350525, "learning_rate": 9.300804047751093e-06, "loss": 1.3824753761291504, "step": 372 }, { "epoch": 0.6678571428571428, "grad_norm": 0.21639856696128845, "learning_rate": 9.291290291706881e-06, "loss": 1.3373095989227295, "step": 374 }, { "epoch": 0.6714285714285714, "grad_norm": 0.42471569776535034, "learning_rate": 9.281717775844857e-06, "loss": 1.2794650793075562, "step": 376 }, { "epoch": 0.675, "grad_norm": 0.5068492293357849, "learning_rate": 9.272086648525937e-06, "loss": 1.3947125673294067, "step": 378 }, { "epoch": 0.6785714285714286, "grad_norm": 0.42456403374671936, "learning_rate": 9.26239705901943e-06, "loss": 1.4322527647018433, "step": 380 }, { "epoch": 0.6821428571428572, "grad_norm": 0.7322901487350464, "learning_rate": 9.25264915750073e-06, "loss": 1.427004337310791, "step": 382 }, { "epoch": 0.6857142857142857, "grad_norm": 0.2247505635023117, "learning_rate": 9.242843095048987e-06, "loss": 1.2980873584747314, "step": 384 }, { "epoch": 0.6892857142857143, "grad_norm": 0.28208163380622864, "learning_rate": 9.232979023644768e-06, "loss": 1.466817855834961, "step": 386 }, { "epoch": 0.6928571428571428, "grad_norm": 0.4910048544406891, "learning_rate": 9.223057096167696e-06, "loss": 1.4608205556869507, "step": 388 }, { "epoch": 0.6964285714285714, "grad_norm": 0.5288735032081604, "learning_rate": 9.213077466394088e-06, "loss": 1.3513166904449463, "step": 390 }, { "epoch": 0.7, "grad_norm": 0.304855078458786, "learning_rate": 9.203040288994566e-06, "loss": 1.464281678199768, "step": 392 }, { "epoch": 0.7035714285714286, "grad_norm": 0.28660398721694946, "learning_rate": 9.192945719531662e-06, "loss": 1.3084968328475952, "step": 394 }, { "epoch": 0.7071428571428572, "grad_norm": 0.5636733770370483, "learning_rate": 9.182793914457402e-06, "loss": 1.2844712734222412, "step": 396 }, { "epoch": 0.7107142857142857, "grad_norm": 0.5751602649688721, "learning_rate": 9.172585031110895e-06, "loss": 1.5046448707580566, "step": 398 }, { "epoch": 0.7142857142857143, "grad_norm": 0.34700506925582886, "learning_rate": 9.162319227715877e-06, "loss": 1.3449612855911255, "step": 400 }, { "epoch": 0.7178571428571429, "grad_norm": 0.34046903252601624, "learning_rate": 9.151996663378271e-06, "loss": 1.3594465255737305, "step": 402 }, { "epoch": 0.7214285714285714, "grad_norm": 0.48511913418769836, "learning_rate": 9.141617498083717e-06, "loss": 1.5169265270233154, "step": 404 }, { "epoch": 0.725, "grad_norm": 0.26317858695983887, "learning_rate": 9.131181892695089e-06, "loss": 1.4639661312103271, "step": 406 }, { "epoch": 0.7285714285714285, "grad_norm": 0.4234665632247925, "learning_rate": 9.120690008950008e-06, "loss": 1.4238711595535278, "step": 408 }, { "epoch": 0.7321428571428571, "grad_norm": 0.683773934841156, "learning_rate": 9.110142009458333e-06, "loss": 1.2991688251495361, "step": 410 }, { "epoch": 0.7357142857142858, "grad_norm": 0.9090404510498047, "learning_rate": 9.099538057699643e-06, "loss": 1.4411964416503906, "step": 412 }, { "epoch": 0.7392857142857143, "grad_norm": 0.42502301931381226, "learning_rate": 9.08887831802069e-06, "loss": 1.3963665962219238, "step": 414 }, { "epoch": 0.7428571428571429, "grad_norm": 0.25458428263664246, "learning_rate": 9.078162955632878e-06, "loss": 1.3666608333587646, "step": 416 }, { "epoch": 0.7464285714285714, "grad_norm": 0.2778110206127167, "learning_rate": 9.067392136609672e-06, "loss": 1.4295861721038818, "step": 418 }, { "epoch": 0.75, "grad_norm": 0.3574320673942566, "learning_rate": 9.056566027884051e-06, "loss": 1.4124993085861206, "step": 420 }, { "epoch": 0.7535714285714286, "grad_norm": 0.2570479214191437, "learning_rate": 9.045684797245902e-06, "loss": 1.3560070991516113, "step": 422 }, { "epoch": 0.7571428571428571, "grad_norm": 0.3374227285385132, "learning_rate": 9.034748613339427e-06, "loss": 1.360439658164978, "step": 424 }, { "epoch": 0.7607142857142857, "grad_norm": 0.25365766882896423, "learning_rate": 9.023757645660531e-06, "loss": 1.3708235025405884, "step": 426 }, { "epoch": 0.7642857142857142, "grad_norm": 0.2227737158536911, "learning_rate": 9.01271206455419e-06, "loss": 1.3818211555480957, "step": 428 }, { "epoch": 0.7678571428571429, "grad_norm": 0.21550701558589935, "learning_rate": 9.001612041211817e-06, "loss": 1.3254315853118896, "step": 430 }, { "epoch": 0.7714285714285715, "grad_norm": 0.5434844493865967, "learning_rate": 8.9904577476686e-06, "loss": 1.3340120315551758, "step": 432 }, { "epoch": 0.775, "grad_norm": 0.2289412021636963, "learning_rate": 8.979249356800846e-06, "loss": 1.2770015001296997, "step": 434 }, { "epoch": 0.7785714285714286, "grad_norm": 0.25175049901008606, "learning_rate": 8.967987042323293e-06, "loss": 1.3385746479034424, "step": 436 }, { "epoch": 0.7821428571428571, "grad_norm": 0.27297094464302063, "learning_rate": 8.956670978786423e-06, "loss": 1.2522022724151611, "step": 438 }, { "epoch": 0.7857142857142857, "grad_norm": 0.2057066559791565, "learning_rate": 8.945301341573757e-06, "loss": 1.3175703287124634, "step": 440 }, { "epoch": 0.7892857142857143, "grad_norm": 0.16934043169021606, "learning_rate": 8.93387830689913e-06, "loss": 1.2785143852233887, "step": 442 }, { "epoch": 0.7928571428571428, "grad_norm": 0.17673851549625397, "learning_rate": 8.922402051803968e-06, "loss": 1.311404824256897, "step": 444 }, { "epoch": 0.7964285714285714, "grad_norm": 0.36772605776786804, "learning_rate": 8.91087275415454e-06, "loss": 1.27708101272583, "step": 446 }, { "epoch": 0.8, "grad_norm": 0.1414009928703308, "learning_rate": 8.8992905926392e-06, "loss": 1.247365117073059, "step": 448 }, { "epoch": 0.8035714285714286, "grad_norm": 0.16844603419303894, "learning_rate": 8.887655746765625e-06, "loss": 1.3339194059371948, "step": 450 }, { "epoch": 0.8071428571428572, "grad_norm": 0.4043944180011749, "learning_rate": 8.875968396858023e-06, "loss": 1.3012686967849731, "step": 452 }, { "epoch": 0.8107142857142857, "grad_norm": 0.19886070489883423, "learning_rate": 8.864228724054342e-06, "loss": 1.2051547765731812, "step": 454 }, { "epoch": 0.8142857142857143, "grad_norm": 0.18143871426582336, "learning_rate": 8.852436910303466e-06, "loss": 1.264425277709961, "step": 456 }, { "epoch": 0.8178571428571428, "grad_norm": 0.30469146370887756, "learning_rate": 8.840593138362395e-06, "loss": 1.2156575918197632, "step": 458 }, { "epoch": 0.8214285714285714, "grad_norm": 0.19490455090999603, "learning_rate": 8.828697591793405e-06, "loss": 1.2579315900802612, "step": 460 }, { "epoch": 0.825, "grad_norm": 0.22966210544109344, "learning_rate": 8.816750454961206e-06, "loss": 1.2265636920928955, "step": 462 }, { "epoch": 0.8285714285714286, "grad_norm": 0.4836777448654175, "learning_rate": 8.804751913030095e-06, "loss": 1.2515498399734497, "step": 464 }, { "epoch": 0.8321428571428572, "grad_norm": 0.22509177029132843, "learning_rate": 8.792702151961074e-06, "loss": 1.2572628259658813, "step": 466 }, { "epoch": 0.8357142857142857, "grad_norm": 0.4269544184207916, "learning_rate": 8.780601358508966e-06, "loss": 1.2433445453643799, "step": 468 }, { "epoch": 0.8392857142857143, "grad_norm": 0.19438913464546204, "learning_rate": 8.768449720219533e-06, "loss": 1.2479232549667358, "step": 470 }, { "epoch": 0.8428571428571429, "grad_norm": 0.695250391960144, "learning_rate": 8.75624742542656e-06, "loss": 1.300042748451233, "step": 472 }, { "epoch": 0.8464285714285714, "grad_norm": 0.35800135135650635, "learning_rate": 8.743994663248939e-06, "loss": 1.2871143817901611, "step": 474 }, { "epoch": 0.85, "grad_norm": 0.20253418385982513, "learning_rate": 8.73169162358774e-06, "loss": 1.2776912450790405, "step": 476 }, { "epoch": 0.8535714285714285, "grad_norm": 0.203902930021286, "learning_rate": 8.719338497123258e-06, "loss": 1.3039164543151855, "step": 478 }, { "epoch": 0.8571428571428571, "grad_norm": 0.24306446313858032, "learning_rate": 8.706935475312073e-06, "loss": 1.30210542678833, "step": 480 }, { "epoch": 0.8607142857142858, "grad_norm": 0.2822311520576477, "learning_rate": 8.694482750384069e-06, "loss": 1.2630928754806519, "step": 482 }, { "epoch": 0.8642857142857143, "grad_norm": 0.2177450954914093, "learning_rate": 8.681980515339464e-06, "loss": 1.2841533422470093, "step": 484 }, { "epoch": 0.8678571428571429, "grad_norm": 0.19454443454742432, "learning_rate": 8.669428963945815e-06, "loss": 1.2446175813674927, "step": 486 }, { "epoch": 0.8714285714285714, "grad_norm": 0.161905437707901, "learning_rate": 8.656828290735013e-06, "loss": 1.2695343494415283, "step": 488 }, { "epoch": 0.875, "grad_norm": 0.19021154940128326, "learning_rate": 8.644178691000272e-06, "loss": 1.2780508995056152, "step": 490 }, { "epoch": 0.8785714285714286, "grad_norm": 0.3725239038467407, "learning_rate": 8.631480360793095e-06, "loss": 1.2979791164398193, "step": 492 }, { "epoch": 0.8821428571428571, "grad_norm": 0.5264632701873779, "learning_rate": 8.61873349692025e-06, "loss": 1.2810431718826294, "step": 494 }, { "epoch": 0.8857142857142857, "grad_norm": 0.26536062359809875, "learning_rate": 8.605938296940702e-06, "loss": 1.2166625261306763, "step": 496 }, { "epoch": 0.8892857142857142, "grad_norm": 0.4096132516860962, "learning_rate": 8.593094959162565e-06, "loss": 1.2420190572738647, "step": 498 }, { "epoch": 0.8928571428571429, "grad_norm": 0.4396449327468872, "learning_rate": 8.58020368264002e-06, "loss": 1.2754027843475342, "step": 500 }, { "epoch": 0.8964285714285715, "grad_norm": 0.15545235574245453, "learning_rate": 8.567264667170232e-06, "loss": 1.3059731721878052, "step": 502 }, { "epoch": 0.9, "grad_norm": 0.25121352076530457, "learning_rate": 8.554278113290262e-06, "loss": 1.2766114473342896, "step": 504 }, { "epoch": 0.9035714285714286, "grad_norm": 0.21137557923793793, "learning_rate": 8.541244222273942e-06, "loss": 1.258975863456726, "step": 506 }, { "epoch": 0.9071428571428571, "grad_norm": 0.16647249460220337, "learning_rate": 8.528163196128767e-06, "loss": 1.2222638130187988, "step": 508 }, { "epoch": 0.9107142857142857, "grad_norm": 0.3039259910583496, "learning_rate": 8.51503523759277e-06, "loss": 1.257559895515442, "step": 510 }, { "epoch": 0.9142857142857143, "grad_norm": 0.32180115580558777, "learning_rate": 8.501860550131361e-06, "loss": 1.280539631843567, "step": 512 }, { "epoch": 0.9178571428571428, "grad_norm": 0.2822877764701843, "learning_rate": 8.488639337934188e-06, "loss": 1.225077509880066, "step": 514 }, { "epoch": 0.9214285714285714, "grad_norm": 0.22444438934326172, "learning_rate": 8.475371805911975e-06, "loss": 1.259244441986084, "step": 516 }, { "epoch": 0.925, "grad_norm": 0.17102967202663422, "learning_rate": 8.462058159693332e-06, "loss": 1.2512003183364868, "step": 518 }, { "epoch": 0.9285714285714286, "grad_norm": 0.9442085027694702, "learning_rate": 8.44869860562158e-06, "loss": 1.2956591844558716, "step": 520 }, { "epoch": 0.9321428571428572, "grad_norm": 0.31264039874076843, "learning_rate": 8.435293350751545e-06, "loss": 1.3134222030639648, "step": 522 }, { "epoch": 0.9357142857142857, "grad_norm": 0.20593850314617157, "learning_rate": 8.421842602846362e-06, "loss": 1.269896149635315, "step": 524 }, { "epoch": 0.9392857142857143, "grad_norm": 0.24257254600524902, "learning_rate": 8.408346570374234e-06, "loss": 1.2887259721755981, "step": 526 }, { "epoch": 0.9428571428571428, "grad_norm": 0.18374580144882202, "learning_rate": 8.394805462505224e-06, "loss": 1.2653754949569702, "step": 528 }, { "epoch": 0.9464285714285714, "grad_norm": 0.7440497875213623, "learning_rate": 8.381219489107992e-06, "loss": 1.2136163711547852, "step": 530 }, { "epoch": 0.95, "grad_norm": 0.3250195384025574, "learning_rate": 8.36758886074656e-06, "loss": 1.233951449394226, "step": 532 }, { "epoch": 0.9535714285714286, "grad_norm": 0.2864832878112793, "learning_rate": 8.353913788677036e-06, "loss": 1.2546851634979248, "step": 534 }, { "epoch": 0.9571428571428572, "grad_norm": 0.22155587375164032, "learning_rate": 8.34019448484435e-06, "loss": 1.2355575561523438, "step": 536 }, { "epoch": 0.9607142857142857, "grad_norm": 0.19411461055278778, "learning_rate": 8.326431161878957e-06, "loss": 1.2437915802001953, "step": 538 }, { "epoch": 0.9642857142857143, "grad_norm": 0.26431798934936523, "learning_rate": 8.312624033093555e-06, "loss": 1.2899754047393799, "step": 540 }, { "epoch": 0.9678571428571429, "grad_norm": 0.3181489109992981, "learning_rate": 8.298773312479767e-06, "loss": 1.2769360542297363, "step": 542 }, { "epoch": 0.9714285714285714, "grad_norm": 0.2669861912727356, "learning_rate": 8.284879214704834e-06, "loss": 1.2913857698440552, "step": 544 }, { "epoch": 0.975, "grad_norm": 0.2932322919368744, "learning_rate": 8.270941955108281e-06, "loss": 1.2430675029754639, "step": 546 }, { "epoch": 0.9785714285714285, "grad_norm": 0.3006272614002228, "learning_rate": 8.256961749698583e-06, "loss": 1.2453312873840332, "step": 548 }, { "epoch": 0.9821428571428571, "grad_norm": 0.2196272611618042, "learning_rate": 8.242938815149817e-06, "loss": 1.2648967504501343, "step": 550 }, { "epoch": 0.9857142857142858, "grad_norm": 0.2562142014503479, "learning_rate": 8.228873368798304e-06, "loss": 1.3159946203231812, "step": 552 }, { "epoch": 0.9892857142857143, "grad_norm": 0.26237812638282776, "learning_rate": 8.214765628639235e-06, "loss": 1.3476945161819458, "step": 554 }, { "epoch": 0.9928571428571429, "grad_norm": 0.38732582330703735, "learning_rate": 8.200615813323306e-06, "loss": 1.9057130813598633, "step": 556 }, { "epoch": 0.9964285714285714, "grad_norm": 0.33351263403892517, "learning_rate": 8.18642414215331e-06, "loss": 1.8800382614135742, "step": 558 }, { "epoch": 1.0, "grad_norm": 0.6058505773544312, "learning_rate": 8.172190835080757e-06, "loss": 1.8019236326217651, "step": 560 }, { "epoch": 1.0035714285714286, "grad_norm": 0.31470683217048645, "learning_rate": 8.157916112702452e-06, "loss": 1.384263277053833, "step": 562 }, { "epoch": 1.0071428571428571, "grad_norm": 0.310624897480011, "learning_rate": 8.143600196257086e-06, "loss": 1.3995013236999512, "step": 564 }, { "epoch": 1.0107142857142857, "grad_norm": 0.20878104865550995, "learning_rate": 8.129243307621791e-06, "loss": 1.3525418043136597, "step": 566 }, { "epoch": 1.0142857142857142, "grad_norm": 0.2683800160884857, "learning_rate": 8.114845669308723e-06, "loss": 1.3207361698150635, "step": 568 }, { "epoch": 1.0178571428571428, "grad_norm": 0.27859288454055786, "learning_rate": 8.100407504461595e-06, "loss": 1.3501830101013184, "step": 570 }, { "epoch": 1.0214285714285714, "grad_norm": 0.32225877046585083, "learning_rate": 8.085929036852236e-06, "loss": 1.1840941905975342, "step": 572 }, { "epoch": 1.025, "grad_norm": 0.23283155262470245, "learning_rate": 8.071410490877097e-06, "loss": 1.2650562524795532, "step": 574 }, { "epoch": 1.0285714285714285, "grad_norm": 0.1705978810787201, "learning_rate": 8.0568520915538e-06, "loss": 1.2940489053726196, "step": 576 }, { "epoch": 1.032142857142857, "grad_norm": 0.23754863440990448, "learning_rate": 8.042254064517642e-06, "loss": 1.3267643451690674, "step": 578 }, { "epoch": 1.0357142857142858, "grad_norm": 0.46769577264785767, "learning_rate": 8.027616636018085e-06, "loss": 1.2288154363632202, "step": 580 }, { "epoch": 1.0392857142857144, "grad_norm": 0.233358234167099, "learning_rate": 8.012940032915263e-06, "loss": 1.3615669012069702, "step": 582 }, { "epoch": 1.042857142857143, "grad_norm": 0.2691819369792938, "learning_rate": 7.998224482676473e-06, "loss": 1.3021140098571777, "step": 584 }, { "epoch": 1.0464285714285715, "grad_norm": 0.24730414152145386, "learning_rate": 7.983470213372624e-06, "loss": 1.2602746486663818, "step": 586 }, { "epoch": 1.05, "grad_norm": 0.2731882929801941, "learning_rate": 7.96867745367473e-06, "loss": 1.2430776357650757, "step": 588 }, { "epoch": 1.0535714285714286, "grad_norm": 0.22160141170024872, "learning_rate": 7.953846432850346e-06, "loss": 1.2589969635009766, "step": 590 }, { "epoch": 1.0571428571428572, "grad_norm": 0.2917991280555725, "learning_rate": 7.938977380760024e-06, "loss": 1.408372402191162, "step": 592 }, { "epoch": 1.0607142857142857, "grad_norm": 0.23420438170433044, "learning_rate": 7.92407052785375e-06, "loss": 1.3381731510162354, "step": 594 }, { "epoch": 1.0642857142857143, "grad_norm": 0.19835133850574493, "learning_rate": 7.909126105167373e-06, "loss": 1.3641246557235718, "step": 596 }, { "epoch": 1.0678571428571428, "grad_norm": 0.21805885434150696, "learning_rate": 7.894144344319015e-06, "loss": 1.2766021490097046, "step": 598 }, { "epoch": 1.0714285714285714, "grad_norm": 0.3379668593406677, "learning_rate": 7.879125477505495e-06, "loss": 1.2909208536148071, "step": 600 }, { "epoch": 1.075, "grad_norm": 0.3864686191082001, "learning_rate": 7.864069737498722e-06, "loss": 1.259904146194458, "step": 602 }, { "epoch": 1.0785714285714285, "grad_norm": 0.3104611933231354, "learning_rate": 7.848977357642089e-06, "loss": 1.3227314949035645, "step": 604 }, { "epoch": 1.082142857142857, "grad_norm": 0.244283065199852, "learning_rate": 7.833848571846855e-06, "loss": 1.3027191162109375, "step": 606 }, { "epoch": 1.0857142857142856, "grad_norm": 0.19385835528373718, "learning_rate": 7.818683614588523e-06, "loss": 1.0396664142608643, "step": 608 }, { "epoch": 1.0892857142857142, "grad_norm": 0.2750968933105469, "learning_rate": 7.803482720903206e-06, "loss": 1.1102863550186157, "step": 610 }, { "epoch": 1.092857142857143, "grad_norm": 0.3333893418312073, "learning_rate": 7.788246126383977e-06, "loss": 1.1634554862976074, "step": 612 }, { "epoch": 1.0964285714285715, "grad_norm": 0.28989356756210327, "learning_rate": 7.77297406717723e-06, "loss": 1.3986788988113403, "step": 614 }, { "epoch": 1.1, "grad_norm": 0.27835774421691895, "learning_rate": 7.757666779979008e-06, "loss": 1.2263062000274658, "step": 616 }, { "epoch": 1.1035714285714286, "grad_norm": 0.2572242021560669, "learning_rate": 7.74232450203134e-06, "loss": 1.2180155515670776, "step": 618 }, { "epoch": 1.1071428571428572, "grad_norm": 0.3894072473049164, "learning_rate": 7.72694747111857e-06, "loss": 1.478975534439087, "step": 620 }, { "epoch": 1.1107142857142858, "grad_norm": 0.4212060868740082, "learning_rate": 7.711535925563655e-06, "loss": 1.3129830360412598, "step": 622 }, { "epoch": 1.1142857142857143, "grad_norm": 0.23659296333789825, "learning_rate": 7.696090104224492e-06, "loss": 1.229081392288208, "step": 624 }, { "epoch": 1.1178571428571429, "grad_norm": 0.254404217004776, "learning_rate": 7.680610246490199e-06, "loss": 1.2878901958465576, "step": 626 }, { "epoch": 1.1214285714285714, "grad_norm": 0.3570263981819153, "learning_rate": 7.665096592277415e-06, "loss": 1.218833088874817, "step": 628 }, { "epoch": 1.125, "grad_norm": 0.27803489565849304, "learning_rate": 7.649549382026575e-06, "loss": 1.274793028831482, "step": 630 }, { "epoch": 1.1285714285714286, "grad_norm": 0.2562004327774048, "learning_rate": 7.633968856698192e-06, "loss": 1.3318731784820557, "step": 632 }, { "epoch": 1.1321428571428571, "grad_norm": 0.19307534396648407, "learning_rate": 7.618355257769111e-06, "loss": 1.2363682985305786, "step": 634 }, { "epoch": 1.1357142857142857, "grad_norm": 0.5484210848808289, "learning_rate": 7.602708827228779e-06, "loss": 1.259455680847168, "step": 636 }, { "epoch": 1.1392857142857142, "grad_norm": 0.2351217418909073, "learning_rate": 7.587029807575482e-06, "loss": 1.2625541687011719, "step": 638 }, { "epoch": 1.1428571428571428, "grad_norm": 0.5461699962615967, "learning_rate": 7.571318441812599e-06, "loss": 1.1984379291534424, "step": 640 }, { "epoch": 1.1464285714285714, "grad_norm": 0.30940407514572144, "learning_rate": 7.55557497344482e-06, "loss": 1.3161015510559082, "step": 642 }, { "epoch": 1.15, "grad_norm": 0.32747605443000793, "learning_rate": 7.539799646474393e-06, "loss": 1.234968900680542, "step": 644 }, { "epoch": 1.1535714285714285, "grad_norm": 0.2250605821609497, "learning_rate": 7.523992705397321e-06, "loss": 1.3490346670150757, "step": 646 }, { "epoch": 1.157142857142857, "grad_norm": 0.3528631925582886, "learning_rate": 7.508154395199592e-06, "loss": 1.350324034690857, "step": 648 }, { "epoch": 1.1607142857142858, "grad_norm": 0.247028186917305, "learning_rate": 7.492284961353361e-06, "loss": 1.285825252532959, "step": 650 }, { "epoch": 1.1642857142857144, "grad_norm": 0.26968345046043396, "learning_rate": 7.4763846498131675e-06, "loss": 1.123679518699646, "step": 652 }, { "epoch": 1.167857142857143, "grad_norm": 0.23967714607715607, "learning_rate": 7.460453707012107e-06, "loss": 1.2702839374542236, "step": 654 }, { "epoch": 1.1714285714285715, "grad_norm": 0.9412787556648254, "learning_rate": 7.444492379858021e-06, "loss": 1.3307619094848633, "step": 656 }, { "epoch": 1.175, "grad_norm": 0.60057133436203, "learning_rate": 7.428500915729663e-06, "loss": 1.218625783920288, "step": 658 }, { "epoch": 1.1785714285714286, "grad_norm": 0.2611408829689026, "learning_rate": 7.412479562472873e-06, "loss": 1.1818389892578125, "step": 660 }, { "epoch": 1.1821428571428572, "grad_norm": 0.21901297569274902, "learning_rate": 7.3964285683967285e-06, "loss": 1.2105083465576172, "step": 662 }, { "epoch": 1.1857142857142857, "grad_norm": 0.9242513179779053, "learning_rate": 7.380348182269701e-06, "loss": 1.2359505891799927, "step": 664 }, { "epoch": 1.1892857142857143, "grad_norm": 0.24152880907058716, "learning_rate": 7.364238653315795e-06, "loss": 1.268753170967102, "step": 666 }, { "epoch": 1.1928571428571428, "grad_norm": 2.834768533706665, "learning_rate": 7.348100231210697e-06, "loss": 1.2450233697891235, "step": 668 }, { "epoch": 1.1964285714285714, "grad_norm": 0.7332023978233337, "learning_rate": 7.331933166077886e-06, "loss": 1.2236673831939697, "step": 670 }, { "epoch": 1.2, "grad_norm": 0.3339300751686096, "learning_rate": 7.31573770848478e-06, "loss": 1.1605288982391357, "step": 672 }, { "epoch": 1.2035714285714285, "grad_norm": 0.7548586130142212, "learning_rate": 7.299514109438835e-06, "loss": 1.276812195777893, "step": 674 }, { "epoch": 1.207142857142857, "grad_norm": 0.32066163420677185, "learning_rate": 7.283262620383664e-06, "loss": 1.2277733087539673, "step": 676 }, { "epoch": 1.2107142857142856, "grad_norm": 0.3439161777496338, "learning_rate": 7.266983493195133e-06, "loss": 1.443245768547058, "step": 678 }, { "epoch": 1.2142857142857142, "grad_norm": 0.28881630301475525, "learning_rate": 7.250676980177468e-06, "loss": 1.3642569780349731, "step": 680 }, { "epoch": 1.217857142857143, "grad_norm": 0.3376900553703308, "learning_rate": 7.2343433340593315e-06, "loss": 1.1232848167419434, "step": 682 }, { "epoch": 1.2214285714285715, "grad_norm": 0.5144054293632507, "learning_rate": 7.217982807989915e-06, "loss": 1.2558438777923584, "step": 684 }, { "epoch": 1.225, "grad_norm": 0.25952062010765076, "learning_rate": 7.201595655535011e-06, "loss": 1.3395494222640991, "step": 686 }, { "epoch": 1.2285714285714286, "grad_norm": 0.3723627030849457, "learning_rate": 7.1851821306730876e-06, "loss": 0.9402600526809692, "step": 688 }, { "epoch": 1.2321428571428572, "grad_norm": 0.3420025110244751, "learning_rate": 7.168742487791345e-06, "loss": 0.7468339204788208, "step": 690 }, { "epoch": 1.2357142857142858, "grad_norm": 0.34970328211784363, "learning_rate": 7.152276981681781e-06, "loss": 1.0327891111373901, "step": 692 }, { "epoch": 1.2392857142857143, "grad_norm": 0.3740408420562744, "learning_rate": 7.135785867537235e-06, "loss": 1.267980694770813, "step": 694 }, { "epoch": 1.2428571428571429, "grad_norm": 0.5309215188026428, "learning_rate": 7.119269400947437e-06, "loss": 1.4097453355789185, "step": 696 }, { "epoch": 1.2464285714285714, "grad_norm": 0.3004949390888214, "learning_rate": 7.1027278378950486e-06, "loss": 1.2045501470565796, "step": 698 }, { "epoch": 1.25, "grad_norm": 0.6161743402481079, "learning_rate": 7.086161434751684e-06, "loss": 1.0838185548782349, "step": 700 }, { "epoch": 1.2535714285714286, "grad_norm": 0.22701780498027802, "learning_rate": 7.069570448273951e-06, "loss": 1.1616631746292114, "step": 702 }, { "epoch": 1.2571428571428571, "grad_norm": 0.3208640515804291, "learning_rate": 7.0529551355994686e-06, "loss": 1.2447824478149414, "step": 704 }, { "epoch": 1.2607142857142857, "grad_norm": 0.7384056448936462, "learning_rate": 7.03631575424287e-06, "loss": 1.1175577640533447, "step": 706 }, { "epoch": 1.2642857142857142, "grad_norm": 0.5497505068778992, "learning_rate": 7.019652562091826e-06, "loss": 1.141535758972168, "step": 708 }, { "epoch": 1.2678571428571428, "grad_norm": 0.3330208361148834, "learning_rate": 7.0029658174030425e-06, "loss": 1.3164706230163574, "step": 710 }, { "epoch": 1.2714285714285714, "grad_norm": 0.4105195701122284, "learning_rate": 6.986255778798253e-06, "loss": 1.234831690788269, "step": 712 }, { "epoch": 1.275, "grad_norm": 0.28338423371315, "learning_rate": 6.9695227052602174e-06, "loss": 1.1415457725524902, "step": 714 }, { "epoch": 1.2785714285714285, "grad_norm": 0.3706303536891937, "learning_rate": 6.952766856128709e-06, "loss": 1.199047565460205, "step": 716 }, { "epoch": 1.282142857142857, "grad_norm": 0.3346574902534485, "learning_rate": 6.9359884910964856e-06, "loss": 1.4197050333023071, "step": 718 }, { "epoch": 1.2857142857142856, "grad_norm": 0.3120553195476532, "learning_rate": 6.919187870205275e-06, "loss": 1.5487772226333618, "step": 720 }, { "epoch": 1.2892857142857144, "grad_norm": 0.2753259837627411, "learning_rate": 6.902365253841737e-06, "loss": 1.177211880683899, "step": 722 }, { "epoch": 1.292857142857143, "grad_norm": 0.2185521274805069, "learning_rate": 6.885520902733435e-06, "loss": 1.2806293964385986, "step": 724 }, { "epoch": 1.2964285714285715, "grad_norm": 0.14865590631961823, "learning_rate": 6.868655077944788e-06, "loss": 0.9303812980651855, "step": 726 }, { "epoch": 1.3, "grad_norm": 0.39503300189971924, "learning_rate": 6.85176804087303e-06, "loss": 1.5363171100616455, "step": 728 }, { "epoch": 1.3035714285714286, "grad_norm": 0.510991632938385, "learning_rate": 6.834860053244154e-06, "loss": 1.1531927585601807, "step": 730 }, { "epoch": 1.3071428571428572, "grad_norm": 0.28777721524238586, "learning_rate": 6.8179313771088626e-06, "loss": 1.2121974229812622, "step": 732 }, { "epoch": 1.3107142857142857, "grad_norm": 0.30707836151123047, "learning_rate": 6.800982274838495e-06, "loss": 1.4065004587173462, "step": 734 }, { "epoch": 1.3142857142857143, "grad_norm": 0.23764309287071228, "learning_rate": 6.784013009120975e-06, "loss": 1.4308959245681763, "step": 736 }, { "epoch": 1.3178571428571428, "grad_norm": 0.6906368136405945, "learning_rate": 6.767023842956725e-06, "loss": 1.1925731897354126, "step": 738 }, { "epoch": 1.3214285714285714, "grad_norm": 0.4775388538837433, "learning_rate": 6.750015039654603e-06, "loss": 1.6403999328613281, "step": 740 }, { "epoch": 1.325, "grad_norm": 0.2565818727016449, "learning_rate": 6.732986862827813e-06, "loss": 1.0603913068771362, "step": 742 }, { "epoch": 1.3285714285714285, "grad_norm": 0.47122514247894287, "learning_rate": 6.7159395763898214e-06, "loss": 1.3830267190933228, "step": 744 }, { "epoch": 1.332142857142857, "grad_norm": 0.5306914448738098, "learning_rate": 6.698873444550271e-06, "loss": 1.2981680631637573, "step": 746 }, { "epoch": 1.3357142857142856, "grad_norm": 0.408100426197052, "learning_rate": 6.68178873181088e-06, "loss": 1.2487084865570068, "step": 748 }, { "epoch": 1.3392857142857144, "grad_norm": 0.33308205008506775, "learning_rate": 6.664685702961344e-06, "loss": 0.9980481266975403, "step": 750 }, { "epoch": 1.342857142857143, "grad_norm": 0.20474325120449066, "learning_rate": 6.647564623075236e-06, "loss": 0.9687408804893494, "step": 752 }, { "epoch": 1.3464285714285715, "grad_norm": 0.8245405554771423, "learning_rate": 6.630425757505894e-06, "loss": 1.33769953250885, "step": 754 }, { "epoch": 1.35, "grad_norm": 0.2982644736766815, "learning_rate": 6.613269371882308e-06, "loss": 1.3833491802215576, "step": 756 }, { "epoch": 1.3535714285714286, "grad_norm": 0.45085495710372925, "learning_rate": 6.596095732105011e-06, "loss": 1.2755907773971558, "step": 758 }, { "epoch": 1.3571428571428572, "grad_norm": 0.29945558309555054, "learning_rate": 6.5789051043419435e-06, "loss": 1.2956531047821045, "step": 760 }, { "epoch": 1.3607142857142858, "grad_norm": 0.5544592142105103, "learning_rate": 6.5616977550243435e-06, "loss": 1.2718784809112549, "step": 762 }, { "epoch": 1.3642857142857143, "grad_norm": 0.7638172507286072, "learning_rate": 6.544473950842606e-06, "loss": 1.126919150352478, "step": 764 }, { "epoch": 1.3678571428571429, "grad_norm": 0.4192071557044983, "learning_rate": 6.527233958742154e-06, "loss": 1.4331161975860596, "step": 766 }, { "epoch": 1.3714285714285714, "grad_norm": 0.2737813889980316, "learning_rate": 6.509978045919307e-06, "loss": 1.2379997968673706, "step": 768 }, { "epoch": 1.375, "grad_norm": 0.7987821102142334, "learning_rate": 6.492706479817125e-06, "loss": 1.278856873512268, "step": 770 }, { "epoch": 1.3785714285714286, "grad_norm": 0.30944374203681946, "learning_rate": 6.475419528121279e-06, "loss": 1.3922899961471558, "step": 772 }, { "epoch": 1.3821428571428571, "grad_norm": 0.29533934593200684, "learning_rate": 6.45811745875589e-06, "loss": 1.235024094581604, "step": 774 }, { "epoch": 1.3857142857142857, "grad_norm": 0.788487434387207, "learning_rate": 6.440800539879392e-06, "loss": 1.1024410724639893, "step": 776 }, { "epoch": 1.3892857142857142, "grad_norm": 0.3519847095012665, "learning_rate": 6.423469039880355e-06, "loss": 1.233741283416748, "step": 778 }, { "epoch": 1.3928571428571428, "grad_norm": 0.18675316870212555, "learning_rate": 6.406123227373343e-06, "loss": 1.3022193908691406, "step": 780 }, { "epoch": 1.3964285714285714, "grad_norm": 0.263254314661026, "learning_rate": 6.388763371194741e-06, "loss": 1.2517147064208984, "step": 782 }, { "epoch": 1.4, "grad_norm": 0.35091346502304077, "learning_rate": 6.371389740398597e-06, "loss": 1.1601366996765137, "step": 784 }, { "epoch": 1.4035714285714285, "grad_norm": 0.34103208780288696, "learning_rate": 6.35400260425244e-06, "loss": 1.3991872072219849, "step": 786 }, { "epoch": 1.407142857142857, "grad_norm": 1.0600661039352417, "learning_rate": 6.336602232233116e-06, "loss": 1.4128477573394775, "step": 788 }, { "epoch": 1.4107142857142856, "grad_norm": 0.6274294257164001, "learning_rate": 6.319188894022612e-06, "loss": 1.5149511098861694, "step": 790 }, { "epoch": 1.4142857142857144, "grad_norm": 0.25083670020103455, "learning_rate": 6.301762859503869e-06, "loss": 1.3468106985092163, "step": 792 }, { "epoch": 1.417857142857143, "grad_norm": 0.4435229003429413, "learning_rate": 6.284324398756606e-06, "loss": 1.3005448579788208, "step": 794 }, { "epoch": 1.4214285714285715, "grad_norm": 0.5059611201286316, "learning_rate": 6.266873782053131e-06, "loss": 1.0667213201522827, "step": 796 }, { "epoch": 1.425, "grad_norm": 0.2751584053039551, "learning_rate": 6.249411279854152e-06, "loss": 1.1674690246582031, "step": 798 }, { "epoch": 1.4285714285714286, "grad_norm": 0.2168678641319275, "learning_rate": 6.231937162804584e-06, "loss": 1.0654405355453491, "step": 800 }, { "epoch": 1.4321428571428572, "grad_norm": 0.6201224327087402, "learning_rate": 6.214451701729363e-06, "loss": 1.1552761793136597, "step": 802 }, { "epoch": 1.4357142857142857, "grad_norm": 0.4682956635951996, "learning_rate": 6.196955167629236e-06, "loss": 1.3353182077407837, "step": 804 }, { "epoch": 1.4392857142857143, "grad_norm": 0.3534834384918213, "learning_rate": 6.179447831676566e-06, "loss": 1.3080209493637085, "step": 806 }, { "epoch": 1.4428571428571428, "grad_norm": 0.4813729226589203, "learning_rate": 6.161929965211135e-06, "loss": 1.3717149496078491, "step": 808 }, { "epoch": 1.4464285714285714, "grad_norm": 0.26942121982574463, "learning_rate": 6.144401839735931e-06, "loss": 1.4133044481277466, "step": 810 }, { "epoch": 1.45, "grad_norm": 0.30204319953918457, "learning_rate": 6.12686372691294e-06, "loss": 1.581753134727478, "step": 812 }, { "epoch": 1.4535714285714285, "grad_norm": 1.1933614015579224, "learning_rate": 6.109315898558943e-06, "loss": 1.1946600675582886, "step": 814 }, { "epoch": 1.457142857142857, "grad_norm": 0.651054322719574, "learning_rate": 6.091758626641296e-06, "loss": 1.2849314212799072, "step": 816 }, { "epoch": 1.4607142857142856, "grad_norm": 0.41265299916267395, "learning_rate": 6.074192183273714e-06, "loss": 1.2870151996612549, "step": 818 }, { "epoch": 1.4642857142857144, "grad_norm": 0.2880115807056427, "learning_rate": 6.056616840712065e-06, "loss": 1.156186580657959, "step": 820 }, { "epoch": 1.467857142857143, "grad_norm": 0.31380829215049744, "learning_rate": 6.039032871350136e-06, "loss": 1.3075363636016846, "step": 822 }, { "epoch": 1.4714285714285715, "grad_norm": 0.735464334487915, "learning_rate": 6.021440547715418e-06, "loss": 1.2372568845748901, "step": 824 }, { "epoch": 1.475, "grad_norm": 0.3404405117034912, "learning_rate": 6.0038401424648866e-06, "loss": 1.3656535148620605, "step": 826 }, { "epoch": 1.4785714285714286, "grad_norm": 1.0161242485046387, "learning_rate": 5.986231928380764e-06, "loss": 1.4575047492980957, "step": 828 }, { "epoch": 1.4821428571428572, "grad_norm": 0.32120001316070557, "learning_rate": 5.968616178366304e-06, "loss": 1.1328424215316772, "step": 830 }, { "epoch": 1.4857142857142858, "grad_norm": 0.24318258464336395, "learning_rate": 5.95099316544156e-06, "loss": 1.1171592473983765, "step": 832 }, { "epoch": 1.4892857142857143, "grad_norm": 0.2471759170293808, "learning_rate": 5.9333631627391385e-06, "loss": 1.1361713409423828, "step": 834 }, { "epoch": 1.4928571428571429, "grad_norm": 0.31643709540367126, "learning_rate": 5.915726443499992e-06, "loss": 1.4550275802612305, "step": 836 }, { "epoch": 1.4964285714285714, "grad_norm": 0.2178327739238739, "learning_rate": 5.89808328106916e-06, "loss": 1.1423126459121704, "step": 838 }, { "epoch": 1.5, "grad_norm": 0.38720911741256714, "learning_rate": 5.880433948891548e-06, "loss": 1.0535848140716553, "step": 840 }, { "epoch": 1.5035714285714286, "grad_norm": 0.4030672013759613, "learning_rate": 5.862778720507684e-06, "loss": 1.3946490287780762, "step": 842 }, { "epoch": 1.5071428571428571, "grad_norm": 0.35578665137290955, "learning_rate": 5.845117869549477e-06, "loss": 1.5173096656799316, "step": 844 }, { "epoch": 1.5107142857142857, "grad_norm": 0.3867500126361847, "learning_rate": 5.827451669735977e-06, "loss": 1.352368712425232, "step": 846 }, { "epoch": 1.5142857142857142, "grad_norm": 0.9219626188278198, "learning_rate": 5.80978039486914e-06, "loss": 0.9382961988449097, "step": 848 }, { "epoch": 1.5178571428571428, "grad_norm": 0.21979399025440216, "learning_rate": 5.79210431882957e-06, "loss": 0.8432712554931641, "step": 850 }, { "epoch": 1.5214285714285714, "grad_norm": 0.28859761357307434, "learning_rate": 5.774423715572289e-06, "loss": 1.296618938446045, "step": 852 }, { "epoch": 1.525, "grad_norm": 0.4942507743835449, "learning_rate": 5.756738859122483e-06, "loss": 1.0648285150527954, "step": 854 }, { "epoch": 1.5285714285714285, "grad_norm": 0.5750854015350342, "learning_rate": 5.739050023571258e-06, "loss": 1.0088112354278564, "step": 856 }, { "epoch": 1.532142857142857, "grad_norm": 0.20957696437835693, "learning_rate": 5.721357483071386e-06, "loss": 1.0590897798538208, "step": 858 }, { "epoch": 1.5357142857142856, "grad_norm": 0.8381152153015137, "learning_rate": 5.703661511833064e-06, "loss": 1.3163901567459106, "step": 860 }, { "epoch": 1.5392857142857141, "grad_norm": 0.4364100992679596, "learning_rate": 5.68596238411966e-06, "loss": 1.1863445043563843, "step": 862 }, { "epoch": 1.5428571428571427, "grad_norm": 0.3657117784023285, "learning_rate": 5.668260374243467e-06, "loss": 0.98140949010849, "step": 864 }, { "epoch": 1.5464285714285713, "grad_norm": 0.6113946437835693, "learning_rate": 5.650555756561439e-06, "loss": 1.3584340810775757, "step": 866 }, { "epoch": 1.55, "grad_norm": 0.7465829849243164, "learning_rate": 5.6328488054709575e-06, "loss": 1.149134874343872, "step": 868 }, { "epoch": 1.5535714285714286, "grad_norm": 0.9023903608322144, "learning_rate": 5.615139795405559e-06, "loss": 1.2276476621627808, "step": 870 }, { "epoch": 1.5571428571428572, "grad_norm": 0.5961250066757202, "learning_rate": 5.5974290008307e-06, "loss": 1.3803772926330566, "step": 872 }, { "epoch": 1.5607142857142857, "grad_norm": 0.31303706765174866, "learning_rate": 5.579716696239486e-06, "loss": 0.8974480628967285, "step": 874 }, { "epoch": 1.5642857142857143, "grad_norm": 0.49465271830558777, "learning_rate": 5.562003156148434e-06, "loss": 1.500373125076294, "step": 876 }, { "epoch": 1.5678571428571428, "grad_norm": 0.4547047019004822, "learning_rate": 5.544288655093203e-06, "loss": 1.3437693119049072, "step": 878 }, { "epoch": 1.5714285714285714, "grad_norm": 0.2680365741252899, "learning_rate": 5.526573467624351e-06, "loss": 1.0480762720108032, "step": 880 }, { "epoch": 1.575, "grad_norm": 0.2553335130214691, "learning_rate": 5.508857868303068e-06, "loss": 1.078729271888733, "step": 882 }, { "epoch": 1.5785714285714287, "grad_norm": 0.2632956802845001, "learning_rate": 5.491142131696934e-06, "loss": 1.16781485080719, "step": 884 }, { "epoch": 1.5821428571428573, "grad_norm": 0.42439237236976624, "learning_rate": 5.473426532375651e-06, "loss": 1.0907145738601685, "step": 886 }, { "epoch": 1.5857142857142859, "grad_norm": 0.4016067087650299, "learning_rate": 5.455711344906797e-06, "loss": 1.0479315519332886, "step": 888 }, { "epoch": 1.5892857142857144, "grad_norm": 0.787295401096344, "learning_rate": 5.437996843851567e-06, "loss": 1.1056879758834839, "step": 890 }, { "epoch": 1.592857142857143, "grad_norm": 0.24893441796302795, "learning_rate": 5.420283303760515e-06, "loss": 1.086808443069458, "step": 892 }, { "epoch": 1.5964285714285715, "grad_norm": 1.0016993284225464, "learning_rate": 5.402570999169303e-06, "loss": 1.4259756803512573, "step": 894 }, { "epoch": 1.6, "grad_norm": 0.5658416748046875, "learning_rate": 5.384860204594442e-06, "loss": 1.175308346748352, "step": 896 }, { "epoch": 1.6035714285714286, "grad_norm": 0.32960644364356995, "learning_rate": 5.367151194529045e-06, "loss": 1.3044936656951904, "step": 898 }, { "epoch": 1.6071428571428572, "grad_norm": 1.566615343093872, "learning_rate": 5.349444243438563e-06, "loss": 1.1787108182907104, "step": 900 }, { "epoch": 1.6107142857142858, "grad_norm": 0.3008659780025482, "learning_rate": 5.331739625756535e-06, "loss": 1.2578707933425903, "step": 902 }, { "epoch": 1.6142857142857143, "grad_norm": 0.3048568367958069, "learning_rate": 5.314037615880341e-06, "loss": 1.214415192604065, "step": 904 }, { "epoch": 1.6178571428571429, "grad_norm": 0.30796509981155396, "learning_rate": 5.296338488166939e-06, "loss": 1.2612226009368896, "step": 906 }, { "epoch": 1.6214285714285714, "grad_norm": 0.3856910467147827, "learning_rate": 5.278642516928617e-06, "loss": 1.1769757270812988, "step": 908 }, { "epoch": 1.625, "grad_norm": 0.4512476921081543, "learning_rate": 5.260949976428745e-06, "loss": 1.058244228363037, "step": 910 }, { "epoch": 1.6285714285714286, "grad_norm": 0.5113015174865723, "learning_rate": 5.243261140877517e-06, "loss": 1.3994414806365967, "step": 912 }, { "epoch": 1.6321428571428571, "grad_norm": 0.24723981320858002, "learning_rate": 5.225576284427712e-06, "loss": 1.29803466796875, "step": 914 }, { "epoch": 1.6357142857142857, "grad_norm": 0.2900439202785492, "learning_rate": 5.207895681170432e-06, "loss": 1.341897964477539, "step": 916 }, { "epoch": 1.6392857142857142, "grad_norm": 0.2555374205112457, "learning_rate": 5.190219605130863e-06, "loss": 1.1864595413208008, "step": 918 }, { "epoch": 1.6428571428571428, "grad_norm": 0.31760746240615845, "learning_rate": 5.172548330264023e-06, "loss": 1.2025091648101807, "step": 920 }, { "epoch": 1.6464285714285714, "grad_norm": 0.28426891565322876, "learning_rate": 5.154882130450525e-06, "loss": 1.3937333822250366, "step": 922 }, { "epoch": 1.65, "grad_norm": 0.26754945516586304, "learning_rate": 5.137221279492317e-06, "loss": 1.1592669486999512, "step": 924 }, { "epoch": 1.6535714285714285, "grad_norm": 0.398725301027298, "learning_rate": 5.119566051108453e-06, "loss": 1.041808009147644, "step": 926 }, { "epoch": 1.657142857142857, "grad_norm": 0.24082130193710327, "learning_rate": 5.10191671893084e-06, "loss": 1.1113499402999878, "step": 928 }, { "epoch": 1.6607142857142856, "grad_norm": 0.32985880970954895, "learning_rate": 5.08427355650001e-06, "loss": 1.243566632270813, "step": 930 }, { "epoch": 1.6642857142857141, "grad_norm": 0.22729991376399994, "learning_rate": 5.066636837260863e-06, "loss": 1.218003511428833, "step": 932 }, { "epoch": 1.6678571428571427, "grad_norm": 0.20701321959495544, "learning_rate": 5.049006834558443e-06, "loss": 1.1665146350860596, "step": 934 }, { "epoch": 1.6714285714285713, "grad_norm": 0.7482126355171204, "learning_rate": 5.031383821633695e-06, "loss": 1.0261443853378296, "step": 936 }, { "epoch": 1.675, "grad_norm": 0.6510646939277649, "learning_rate": 5.013768071619237e-06, "loss": 1.1913405656814575, "step": 938 }, { "epoch": 1.6785714285714286, "grad_norm": 0.3893536329269409, "learning_rate": 4.996159857535116e-06, "loss": 1.2498658895492554, "step": 940 }, { "epoch": 1.6821428571428572, "grad_norm": 0.33820265531539917, "learning_rate": 4.9785594522845835e-06, "loss": 1.2645461559295654, "step": 942 }, { "epoch": 1.6857142857142857, "grad_norm": 0.2857305407524109, "learning_rate": 4.9609671286498655e-06, "loss": 1.1648997068405151, "step": 944 }, { "epoch": 1.6892857142857143, "grad_norm": 0.35911425948143005, "learning_rate": 4.943383159287936e-06, "loss": 1.3200312852859497, "step": 946 }, { "epoch": 1.6928571428571428, "grad_norm": 0.28003281354904175, "learning_rate": 4.925807816726288e-06, "loss": 1.2886927127838135, "step": 948 }, { "epoch": 1.6964285714285714, "grad_norm": 0.3707423210144043, "learning_rate": 4.908241373358707e-06, "loss": 1.2256838083267212, "step": 950 }, { "epoch": 1.7, "grad_norm": 0.37781476974487305, "learning_rate": 4.890684101441059e-06, "loss": 1.261880874633789, "step": 952 }, { "epoch": 1.7035714285714287, "grad_norm": 0.455138623714447, "learning_rate": 4.873136273087061e-06, "loss": 1.1675777435302734, "step": 954 }, { "epoch": 1.7071428571428573, "grad_norm": 0.3084830641746521, "learning_rate": 4.855598160264071e-06, "loss": 1.0751243829727173, "step": 956 }, { "epoch": 1.7107142857142859, "grad_norm": 0.33484798669815063, "learning_rate": 4.838070034788865e-06, "loss": 1.2969300746917725, "step": 958 }, { "epoch": 1.7142857142857144, "grad_norm": 0.45519745349884033, "learning_rate": 4.820552168323434e-06, "loss": 1.1682568788528442, "step": 960 }, { "epoch": 1.717857142857143, "grad_norm": 0.3936917185783386, "learning_rate": 4.803044832370765e-06, "loss": 1.2029849290847778, "step": 962 }, { "epoch": 1.7214285714285715, "grad_norm": 0.2847800850868225, "learning_rate": 4.7855482982706396e-06, "loss": 1.308813452720642, "step": 964 }, { "epoch": 1.725, "grad_norm": 0.2914465069770813, "learning_rate": 4.768062837195417e-06, "loss": 1.2900055646896362, "step": 966 }, { "epoch": 1.7285714285714286, "grad_norm": 0.5518858432769775, "learning_rate": 4.7505887201458485e-06, "loss": 1.2404606342315674, "step": 968 }, { "epoch": 1.7321428571428572, "grad_norm": 0.34736767411231995, "learning_rate": 4.73312621794687e-06, "loss": 1.1192835569381714, "step": 970 }, { "epoch": 1.7357142857142858, "grad_norm": 0.27989068627357483, "learning_rate": 4.715675601243396e-06, "loss": 1.2646175622940063, "step": 972 }, { "epoch": 1.7392857142857143, "grad_norm": 0.2832848131656647, "learning_rate": 4.698237140496132e-06, "loss": 1.2004600763320923, "step": 974 }, { "epoch": 1.7428571428571429, "grad_norm": 1.7877376079559326, "learning_rate": 4.68081110597739e-06, "loss": 1.2224751710891724, "step": 976 }, { "epoch": 1.7464285714285714, "grad_norm": 0.2644546627998352, "learning_rate": 4.663397767766885e-06, "loss": 1.2846026420593262, "step": 978 }, { "epoch": 1.75, "grad_norm": 0.23440435528755188, "learning_rate": 4.6459973957475625e-06, "loss": 1.2761108875274658, "step": 980 }, { "epoch": 1.7535714285714286, "grad_norm": 0.29541414976119995, "learning_rate": 4.628610259601406e-06, "loss": 1.2253004312515259, "step": 982 }, { "epoch": 1.7571428571428571, "grad_norm": 0.3721539378166199, "learning_rate": 4.611236628805259e-06, "loss": 1.217316746711731, "step": 984 }, { "epoch": 1.7607142857142857, "grad_norm": 0.23486927151679993, "learning_rate": 4.593876772626659e-06, "loss": 1.238864779472351, "step": 986 }, { "epoch": 1.7642857142857142, "grad_norm": 0.35403114557266235, "learning_rate": 4.576530960119646e-06, "loss": 1.2506440877914429, "step": 988 }, { "epoch": 1.7678571428571428, "grad_norm": 0.24216312170028687, "learning_rate": 4.55919946012061e-06, "loss": 1.203848123550415, "step": 990 }, { "epoch": 1.7714285714285714, "grad_norm": 0.5742025971412659, "learning_rate": 4.54188254124411e-06, "loss": 1.2036422491073608, "step": 992 }, { "epoch": 1.775, "grad_norm": 0.4332943260669708, "learning_rate": 4.524580471878724e-06, "loss": 1.1484333276748657, "step": 994 }, { "epoch": 1.7785714285714285, "grad_norm": 0.2262076586484909, "learning_rate": 4.507293520182877e-06, "loss": 1.2005127668380737, "step": 996 }, { "epoch": 1.782142857142857, "grad_norm": 0.18153786659240723, "learning_rate": 4.490021954080695e-06, "loss": 1.1209759712219238, "step": 998 }, { "epoch": 1.7857142857142856, "grad_norm": 0.2752821147441864, "learning_rate": 4.472766041257846e-06, "loss": 1.1912975311279297, "step": 1000 }, { "epoch": 1.7892857142857141, "grad_norm": 0.37398797273635864, "learning_rate": 4.4555260491573956e-06, "loss": 1.1634106636047363, "step": 1002 }, { "epoch": 1.7928571428571427, "grad_norm": 0.4885188341140747, "learning_rate": 4.438302244975659e-06, "loss": 1.19752037525177, "step": 1004 }, { "epoch": 1.7964285714285713, "grad_norm": 0.20963414013385773, "learning_rate": 4.421094895658058e-06, "loss": 1.1573578119277954, "step": 1006 }, { "epoch": 1.8, "grad_norm": 0.19768458604812622, "learning_rate": 4.403904267894991e-06, "loss": 1.1309683322906494, "step": 1008 }, { "epoch": 1.8035714285714286, "grad_norm": 0.3265831470489502, "learning_rate": 4.386730628117692e-06, "loss": 1.210740566253662, "step": 1010 }, { "epoch": 1.8071428571428572, "grad_norm": 0.24831008911132812, "learning_rate": 4.369574242494108e-06, "loss": 1.1857199668884277, "step": 1012 }, { "epoch": 1.8107142857142857, "grad_norm": 0.24806837737560272, "learning_rate": 4.3524353769247665e-06, "loss": 1.0957400798797607, "step": 1014 }, { "epoch": 1.8142857142857143, "grad_norm": 0.21978451311588287, "learning_rate": 4.335314297038656e-06, "loss": 1.1512374877929688, "step": 1016 }, { "epoch": 1.8178571428571428, "grad_norm": 0.18174096941947937, "learning_rate": 4.318211268189121e-06, "loss": 1.1074084043502808, "step": 1018 }, { "epoch": 1.8214285714285714, "grad_norm": 0.1807389110326767, "learning_rate": 4.3011265554497305e-06, "loss": 1.1385325193405151, "step": 1020 }, { "epoch": 1.825, "grad_norm": 0.3229348063468933, "learning_rate": 4.28406042361018e-06, "loss": 1.119950771331787, "step": 1022 }, { "epoch": 1.8285714285714287, "grad_norm": 0.21613694727420807, "learning_rate": 4.267013137172189e-06, "loss": 1.1364243030548096, "step": 1024 }, { "epoch": 1.8321428571428573, "grad_norm": 0.5674333572387695, "learning_rate": 4.249984960345399e-06, "loss": 1.1446290016174316, "step": 1026 }, { "epoch": 1.8357142857142859, "grad_norm": 0.19522684812545776, "learning_rate": 4.232976157043277e-06, "loss": 1.1350977420806885, "step": 1028 }, { "epoch": 1.8392857142857144, "grad_norm": 0.22652848064899445, "learning_rate": 4.2159869908790275e-06, "loss": 1.1374115943908691, "step": 1030 }, { "epoch": 1.842857142857143, "grad_norm": 0.20917841792106628, "learning_rate": 4.199017725161505e-06, "loss": 1.1824545860290527, "step": 1032 }, { "epoch": 1.8464285714285715, "grad_norm": 0.2631721496582031, "learning_rate": 4.182068622891139e-06, "loss": 1.1770212650299072, "step": 1034 }, { "epoch": 1.85, "grad_norm": 0.24983558058738708, "learning_rate": 4.165139946755847e-06, "loss": 1.161262035369873, "step": 1036 }, { "epoch": 1.8535714285714286, "grad_norm": 0.31537604331970215, "learning_rate": 4.148231959126973e-06, "loss": 1.1958869695663452, "step": 1038 }, { "epoch": 1.8571428571428572, "grad_norm": 0.3142789900302887, "learning_rate": 4.131344922055213e-06, "loss": 1.1789402961730957, "step": 1040 }, { "epoch": 1.8607142857142858, "grad_norm": 0.42967483401298523, "learning_rate": 4.114479097266567e-06, "loss": 1.1411830186843872, "step": 1042 }, { "epoch": 1.8642857142857143, "grad_norm": 0.21074344217777252, "learning_rate": 4.0976347461582656e-06, "loss": 1.17338228225708, "step": 1044 }, { "epoch": 1.8678571428571429, "grad_norm": 0.33415719866752625, "learning_rate": 4.080812129794728e-06, "loss": 1.1420398950576782, "step": 1046 }, { "epoch": 1.8714285714285714, "grad_norm": 0.16336952149868011, "learning_rate": 4.064011508903516e-06, "loss": 1.1628490686416626, "step": 1048 }, { "epoch": 1.875, "grad_norm": 0.2252008020877838, "learning_rate": 4.047233143871292e-06, "loss": 1.173589825630188, "step": 1050 }, { "epoch": 1.8785714285714286, "grad_norm": 0.33176442980766296, "learning_rate": 4.030477294739783e-06, "loss": 1.194374918937683, "step": 1052 }, { "epoch": 1.8821428571428571, "grad_norm": 0.29097726941108704, "learning_rate": 4.013744221201749e-06, "loss": 1.1737301349639893, "step": 1054 }, { "epoch": 1.8857142857142857, "grad_norm": 0.1832679808139801, "learning_rate": 3.997034182596958e-06, "loss": 1.110135793685913, "step": 1056 }, { "epoch": 1.8892857142857142, "grad_norm": 0.2953426241874695, "learning_rate": 3.980347437908175e-06, "loss": 1.1428486108779907, "step": 1058 }, { "epoch": 1.8928571428571428, "grad_norm": 0.20754416286945343, "learning_rate": 3.963684245757132e-06, "loss": 1.17241632938385, "step": 1060 }, { "epoch": 1.8964285714285714, "grad_norm": 0.29985517263412476, "learning_rate": 3.9470448644005345e-06, "loss": 1.2037956714630127, "step": 1062 }, { "epoch": 1.9, "grad_norm": 0.24180017411708832, "learning_rate": 3.930429551726049e-06, "loss": 1.1744909286499023, "step": 1064 }, { "epoch": 1.9035714285714285, "grad_norm": 0.1725412905216217, "learning_rate": 3.913838565248318e-06, "loss": 1.1504842042922974, "step": 1066 }, { "epoch": 1.907142857142857, "grad_norm": 0.19483552873134613, "learning_rate": 3.8972721621049545e-06, "loss": 1.1242973804473877, "step": 1068 }, { "epoch": 1.9107142857142856, "grad_norm": 0.2150045484304428, "learning_rate": 3.880730599052565e-06, "loss": 1.1571553945541382, "step": 1070 }, { "epoch": 1.9142857142857141, "grad_norm": 0.26055601239204407, "learning_rate": 3.864214132462766e-06, "loss": 1.1744543313980103, "step": 1072 }, { "epoch": 1.9178571428571427, "grad_norm": 0.20224107801914215, "learning_rate": 3.84772301831822e-06, "loss": 1.129955768585205, "step": 1074 }, { "epoch": 1.9214285714285713, "grad_norm": 0.21899673342704773, "learning_rate": 3.831257512208657e-06, "loss": 1.1564751863479614, "step": 1076 }, { "epoch": 1.925, "grad_norm": 0.24604743719100952, "learning_rate": 3.814817869326915e-06, "loss": 1.1490484476089478, "step": 1078 }, { "epoch": 1.9285714285714286, "grad_norm": 0.1920636147260666, "learning_rate": 3.7984043444649898e-06, "loss": 1.1944819688796997, "step": 1080 }, { "epoch": 1.9321428571428572, "grad_norm": 0.2951393723487854, "learning_rate": 3.782017192010087e-06, "loss": 1.2130813598632812, "step": 1082 }, { "epoch": 1.9357142857142857, "grad_norm": 0.38370734453201294, "learning_rate": 3.76565666594067e-06, "loss": 1.1711630821228027, "step": 1084 }, { "epoch": 1.9392857142857143, "grad_norm": 0.7297260165214539, "learning_rate": 3.749323019822534e-06, "loss": 1.1901503801345825, "step": 1086 }, { "epoch": 1.9428571428571428, "grad_norm": 0.22041039168834686, "learning_rate": 3.7330165068048673e-06, "loss": 1.1663475036621094, "step": 1088 }, { "epoch": 1.9464285714285714, "grad_norm": 0.2529982626438141, "learning_rate": 3.7167373796163377e-06, "loss": 1.1222208738327026, "step": 1090 }, { "epoch": 1.95, "grad_norm": 0.22839988768100739, "learning_rate": 3.700485890561167e-06, "loss": 1.1396700143814087, "step": 1092 }, { "epoch": 1.9535714285714287, "grad_norm": 0.32207345962524414, "learning_rate": 3.6842622915152228e-06, "loss": 1.1646703481674194, "step": 1094 }, { "epoch": 1.9571428571428573, "grad_norm": 0.2876273989677429, "learning_rate": 3.668066833922116e-06, "loss": 1.148516058921814, "step": 1096 }, { "epoch": 1.9607142857142859, "grad_norm": 0.2196146845817566, "learning_rate": 3.6518997687893053e-06, "loss": 1.1533443927764893, "step": 1098 }, { "epoch": 1.9642857142857144, "grad_norm": 0.46365395188331604, "learning_rate": 3.635761346684206e-06, "loss": 1.1947966814041138, "step": 1100 }, { "epoch": 1.967857142857143, "grad_norm": 0.2954294681549072, "learning_rate": 3.619651817730302e-06, "loss": 1.1832884550094604, "step": 1102 }, { "epoch": 1.9714285714285715, "grad_norm": 0.2565920650959015, "learning_rate": 3.603571431603272e-06, "loss": 1.1965795755386353, "step": 1104 }, { "epoch": 1.975, "grad_norm": 0.2640427350997925, "learning_rate": 3.587520437527128e-06, "loss": 1.140123963356018, "step": 1106 }, { "epoch": 1.9785714285714286, "grad_norm": 0.26683422923088074, "learning_rate": 3.571499084270338e-06, "loss": 1.1581156253814697, "step": 1108 }, { "epoch": 1.9821428571428572, "grad_norm": 0.2290692776441574, "learning_rate": 3.5555076201419816e-06, "loss": 1.174959421157837, "step": 1110 }, { "epoch": 1.9857142857142858, "grad_norm": 0.2061983048915863, "learning_rate": 3.5395462929878945e-06, "loss": 1.220007061958313, "step": 1112 }, { "epoch": 1.9892857142857143, "grad_norm": 0.20125523209571838, "learning_rate": 3.5236153501868343e-06, "loss": 1.2462403774261475, "step": 1114 }, { "epoch": 1.9928571428571429, "grad_norm": 0.29600805044174194, "learning_rate": 3.5077150386466406e-06, "loss": 1.2024950981140137, "step": 1116 }, { "epoch": 1.9964285714285714, "grad_norm": 0.2931258976459503, "learning_rate": 3.4918456048004106e-06, "loss": 1.1237006187438965, "step": 1118 }, { "epoch": 2.0, "grad_norm": 0.4734819829463959, "learning_rate": 3.4760072946026786e-06, "loss": 1.1085011959075928, "step": 1120 }, { "epoch": 2.0035714285714286, "grad_norm": 0.20331430435180664, "learning_rate": 3.46020035352561e-06, "loss": 1.2824596166610718, "step": 1122 }, { "epoch": 2.007142857142857, "grad_norm": 0.46622058749198914, "learning_rate": 3.444425026555182e-06, "loss": 1.2747101783752441, "step": 1124 }, { "epoch": 2.0107142857142857, "grad_norm": 0.19980192184448242, "learning_rate": 3.4286815581874045e-06, "loss": 1.2517393827438354, "step": 1126 }, { "epoch": 2.0142857142857142, "grad_norm": 0.32897406816482544, "learning_rate": 3.4129701924245173e-06, "loss": 1.2301400899887085, "step": 1128 }, { "epoch": 2.017857142857143, "grad_norm": 0.17299680411815643, "learning_rate": 3.397291172771221e-06, "loss": 1.2544574737548828, "step": 1130 }, { "epoch": 2.0214285714285714, "grad_norm": 0.2090325653553009, "learning_rate": 3.3816447422308883e-06, "loss": 1.0791321992874146, "step": 1132 }, { "epoch": 2.025, "grad_norm": 0.2806832790374756, "learning_rate": 3.366031143301811e-06, "loss": 1.1756961345672607, "step": 1134 }, { "epoch": 2.0285714285714285, "grad_norm": 0.4019312858581543, "learning_rate": 3.3504506179734254e-06, "loss": 1.1622370481491089, "step": 1136 }, { "epoch": 2.032142857142857, "grad_norm": 0.22266216576099396, "learning_rate": 3.334903407722587e-06, "loss": 1.234253168106079, "step": 1138 }, { "epoch": 2.0357142857142856, "grad_norm": 0.29923903942108154, "learning_rate": 3.319389753509803e-06, "loss": 1.1241004467010498, "step": 1140 }, { "epoch": 2.039285714285714, "grad_norm": 0.3284701704978943, "learning_rate": 3.30390989577551e-06, "loss": 1.260522723197937, "step": 1142 }, { "epoch": 2.0428571428571427, "grad_norm": 0.4323379099369049, "learning_rate": 3.288464074436346e-06, "loss": 1.1753382682800293, "step": 1144 }, { "epoch": 2.0464285714285713, "grad_norm": 0.3188895285129547, "learning_rate": 3.273052528881433e-06, "loss": 1.1759196519851685, "step": 1146 }, { "epoch": 2.05, "grad_norm": 0.754629373550415, "learning_rate": 3.257675497968661e-06, "loss": 1.0839532613754272, "step": 1148 }, { "epoch": 2.0535714285714284, "grad_norm": 0.261398047208786, "learning_rate": 3.2423332200209946e-06, "loss": 1.1668034791946411, "step": 1150 }, { "epoch": 2.057142857142857, "grad_norm": 0.3192571699619293, "learning_rate": 3.2270259328227703e-06, "loss": 1.312312364578247, "step": 1152 }, { "epoch": 2.0607142857142855, "grad_norm": 0.3842572271823883, "learning_rate": 3.2117538736160235e-06, "loss": 1.241450548171997, "step": 1154 }, { "epoch": 2.064285714285714, "grad_norm": 0.3109821677207947, "learning_rate": 3.1965172790967967e-06, "loss": 1.2660008668899536, "step": 1156 }, { "epoch": 2.067857142857143, "grad_norm": 0.30365416407585144, "learning_rate": 3.1813163854114793e-06, "loss": 1.1892515420913696, "step": 1158 }, { "epoch": 2.0714285714285716, "grad_norm": 0.26805219054222107, "learning_rate": 3.1661514281531464e-06, "loss": 1.2073129415512085, "step": 1160 }, { "epoch": 2.075, "grad_norm": 0.26900723576545715, "learning_rate": 3.1510226423579127e-06, "loss": 1.1416363716125488, "step": 1162 }, { "epoch": 2.0785714285714287, "grad_norm": 0.3996395468711853, "learning_rate": 3.135930262501279e-06, "loss": 1.2287384271621704, "step": 1164 }, { "epoch": 2.0821428571428573, "grad_norm": 0.3018134832382202, "learning_rate": 3.120874522494506e-06, "loss": 1.2006416320800781, "step": 1166 }, { "epoch": 2.085714285714286, "grad_norm": 0.16339807212352753, "learning_rate": 3.105855655680986e-06, "loss": 0.9185248017311096, "step": 1168 }, { "epoch": 2.0892857142857144, "grad_norm": 0.3090437352657318, "learning_rate": 3.090873894832628e-06, "loss": 0.9894356727600098, "step": 1170 }, { "epoch": 2.092857142857143, "grad_norm": 0.30770227313041687, "learning_rate": 3.07592947214625e-06, "loss": 1.0587633848190308, "step": 1172 }, { "epoch": 2.0964285714285715, "grad_norm": 0.32658347487449646, "learning_rate": 3.0610226192399767e-06, "loss": 1.2783530950546265, "step": 1174 }, { "epoch": 2.1, "grad_norm": 0.3846922218799591, "learning_rate": 3.0461535671496537e-06, "loss": 1.0930966138839722, "step": 1176 }, { "epoch": 2.1035714285714286, "grad_norm": 0.44550713896751404, "learning_rate": 3.0313225463252716e-06, "loss": 1.0916811227798462, "step": 1178 }, { "epoch": 2.107142857142857, "grad_norm": 0.9442609548568726, "learning_rate": 3.0165297866273766e-06, "loss": 1.2753980159759521, "step": 1180 }, { "epoch": 2.1107142857142858, "grad_norm": 0.2832079529762268, "learning_rate": 3.0017755173235295e-06, "loss": 1.195408821105957, "step": 1182 }, { "epoch": 2.1142857142857143, "grad_norm": 0.27624693512916565, "learning_rate": 2.9870599670847366e-06, "loss": 1.137044072151184, "step": 1184 }, { "epoch": 2.117857142857143, "grad_norm": 0.5313391089439392, "learning_rate": 2.972383363981917e-06, "loss": 1.1940035820007324, "step": 1186 }, { "epoch": 2.1214285714285714, "grad_norm": 1.0065633058547974, "learning_rate": 2.9577459354823602e-06, "loss": 1.1326301097869873, "step": 1188 }, { "epoch": 2.125, "grad_norm": 0.19776014983654022, "learning_rate": 2.9431479084462013e-06, "loss": 1.18599534034729, "step": 1190 }, { "epoch": 2.1285714285714286, "grad_norm": 0.2414723038673401, "learning_rate": 2.9285895091229042e-06, "loss": 1.2466977834701538, "step": 1192 }, { "epoch": 2.132142857142857, "grad_norm": 0.2931707799434662, "learning_rate": 2.9140709631477666e-06, "loss": 1.155306339263916, "step": 1194 }, { "epoch": 2.1357142857142857, "grad_norm": 0.26033467054367065, "learning_rate": 2.8995924955384048e-06, "loss": 1.1785553693771362, "step": 1196 }, { "epoch": 2.1392857142857142, "grad_norm": 0.24594391882419586, "learning_rate": 2.885154330691278e-06, "loss": 1.1734336614608765, "step": 1198 }, { "epoch": 2.142857142857143, "grad_norm": 0.5041958093643188, "learning_rate": 2.8707566923782105e-06, "loss": 1.0410226583480835, "step": 1200 }, { "epoch": 2.1464285714285714, "grad_norm": 0.25237134099006653, "learning_rate": 2.856399803742916e-06, "loss": 1.2042694091796875, "step": 1202 }, { "epoch": 2.15, "grad_norm": 0.4853833019733429, "learning_rate": 2.8420838872975482e-06, "loss": 1.150026559829712, "step": 1204 }, { "epoch": 2.1535714285714285, "grad_norm": 0.3172329366207123, "learning_rate": 2.8278091649192443e-06, "loss": 1.2379705905914307, "step": 1206 }, { "epoch": 2.157142857142857, "grad_norm": 0.24962536990642548, "learning_rate": 2.81357585784669e-06, "loss": 1.2625255584716797, "step": 1208 }, { "epoch": 2.1607142857142856, "grad_norm": 0.5905876755714417, "learning_rate": 2.799384186676696e-06, "loss": 1.1990773677825928, "step": 1210 }, { "epoch": 2.164285714285714, "grad_norm": 0.2595714032649994, "learning_rate": 2.785234371360766e-06, "loss": 1.0102604627609253, "step": 1212 }, { "epoch": 2.1678571428571427, "grad_norm": 0.2449759989976883, "learning_rate": 2.7711266312016986e-06, "loss": 1.1595333814620972, "step": 1214 }, { "epoch": 2.1714285714285713, "grad_norm": 0.38237428665161133, "learning_rate": 2.757061184850183e-06, "loss": 1.2344083786010742, "step": 1216 }, { "epoch": 2.175, "grad_norm": 0.24876584112644196, "learning_rate": 2.743038250301418e-06, "loss": 1.124006748199463, "step": 1218 }, { "epoch": 2.1785714285714284, "grad_norm": 0.34139466285705566, "learning_rate": 2.7290580448917204e-06, "loss": 1.090733528137207, "step": 1220 }, { "epoch": 2.182142857142857, "grad_norm": 0.22050592303276062, "learning_rate": 2.7151207852951677e-06, "loss": 1.1178282499313354, "step": 1222 }, { "epoch": 2.185714285714286, "grad_norm": 0.26262110471725464, "learning_rate": 2.701226687520235e-06, "loss": 1.1468334197998047, "step": 1224 }, { "epoch": 2.189285714285714, "grad_norm": 0.2389093041419983, "learning_rate": 2.6873759669064474e-06, "loss": 1.1655080318450928, "step": 1226 }, { "epoch": 2.192857142857143, "grad_norm": 0.22899575531482697, "learning_rate": 2.673568838121045e-06, "loss": 1.169728398323059, "step": 1228 }, { "epoch": 2.1964285714285716, "grad_norm": 0.7747792601585388, "learning_rate": 2.659805515155653e-06, "loss": 1.0896999835968018, "step": 1230 }, { "epoch": 2.2, "grad_norm": 0.35865241289138794, "learning_rate": 2.6460862113229656e-06, "loss": 1.0157350301742554, "step": 1232 }, { "epoch": 2.2035714285714287, "grad_norm": 0.9577608108520508, "learning_rate": 2.6324111392534423e-06, "loss": 1.1235113143920898, "step": 1234 }, { "epoch": 2.2071428571428573, "grad_norm": 0.3065534234046936, "learning_rate": 2.6187805108920104e-06, "loss": 1.071955680847168, "step": 1236 }, { "epoch": 2.210714285714286, "grad_norm": 0.33233603835105896, "learning_rate": 2.605194537494779e-06, "loss": 1.3001371622085571, "step": 1238 }, { "epoch": 2.2142857142857144, "grad_norm": 0.8232606649398804, "learning_rate": 2.5916534296257655e-06, "loss": 1.2073559761047363, "step": 1240 }, { "epoch": 2.217857142857143, "grad_norm": 0.3004189431667328, "learning_rate": 2.5781573971536387e-06, "loss": 0.9778292179107666, "step": 1242 }, { "epoch": 2.2214285714285715, "grad_norm": 0.5353025794029236, "learning_rate": 2.5647066492484564e-06, "loss": 1.106062889099121, "step": 1244 }, { "epoch": 2.225, "grad_norm": 0.2562118172645569, "learning_rate": 2.5513013943784236e-06, "loss": 1.187153935432434, "step": 1246 }, { "epoch": 2.2285714285714286, "grad_norm": 0.3913024067878723, "learning_rate": 2.537941840306669e-06, "loss": 0.8193651437759399, "step": 1248 }, { "epoch": 2.232142857142857, "grad_norm": 0.29852673411369324, "learning_rate": 2.524628194088027e-06, "loss": 0.5965661406517029, "step": 1250 }, { "epoch": 2.2357142857142858, "grad_norm": 0.2190428078174591, "learning_rate": 2.511360662065813e-06, "loss": 0.9129496812820435, "step": 1252 }, { "epoch": 2.2392857142857143, "grad_norm": 0.3540997803211212, "learning_rate": 2.4981394498686413e-06, "loss": 1.138474702835083, "step": 1254 }, { "epoch": 2.242857142857143, "grad_norm": 0.9036802053451538, "learning_rate": 2.484964762407232e-06, "loss": 1.2528407573699951, "step": 1256 }, { "epoch": 2.2464285714285714, "grad_norm": 0.4152211844921112, "learning_rate": 2.471836803871233e-06, "loss": 1.105533480644226, "step": 1258 }, { "epoch": 2.25, "grad_norm": 0.48458918929100037, "learning_rate": 2.45875577772606e-06, "loss": 0.9600842595100403, "step": 1260 }, { "epoch": 2.2535714285714286, "grad_norm": 0.3086172044277191, "learning_rate": 2.4457218867097396e-06, "loss": 1.0594391822814941, "step": 1262 }, { "epoch": 2.257142857142857, "grad_norm": 0.24558311700820923, "learning_rate": 2.4327353328297673e-06, "loss": 1.1570055484771729, "step": 1264 }, { "epoch": 2.2607142857142857, "grad_norm": 1.6706045866012573, "learning_rate": 2.419796317359983e-06, "loss": 0.9727555513381958, "step": 1266 }, { "epoch": 2.2642857142857142, "grad_norm": 0.37175774574279785, "learning_rate": 2.4069050408374376e-06, "loss": 1.0557781457901, "step": 1268 }, { "epoch": 2.267857142857143, "grad_norm": 0.2886607050895691, "learning_rate": 2.3940617030593e-06, "loss": 1.1356130838394165, "step": 1270 }, { "epoch": 2.2714285714285714, "grad_norm": 0.2709295451641083, "learning_rate": 2.3812665030797512e-06, "loss": 1.0775344371795654, "step": 1272 }, { "epoch": 2.275, "grad_norm": 0.17263904213905334, "learning_rate": 2.368519639206905e-06, "loss": 0.9881319999694824, "step": 1274 }, { "epoch": 2.2785714285714285, "grad_norm": 0.3276418149471283, "learning_rate": 2.3558213089997303e-06, "loss": 1.1184488534927368, "step": 1276 }, { "epoch": 2.282142857142857, "grad_norm": 0.9172634482383728, "learning_rate": 2.3431717092649892e-06, "loss": 1.3341600894927979, "step": 1278 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5521453022956848, "learning_rate": 2.3305710360541857e-06, "loss": 1.4648536443710327, "step": 1280 }, { "epoch": 2.289285714285714, "grad_norm": 0.27971673011779785, "learning_rate": 2.3180194846605367e-06, "loss": 1.0912892818450928, "step": 1282 }, { "epoch": 2.2928571428571427, "grad_norm": 0.38737377524375916, "learning_rate": 2.3055172496159327e-06, "loss": 1.1721148490905762, "step": 1284 }, { "epoch": 2.2964285714285713, "grad_norm": 0.32838499546051025, "learning_rate": 2.2930645246879286e-06, "loss": 0.8287088871002197, "step": 1286 }, { "epoch": 2.3, "grad_norm": 0.9804138541221619, "learning_rate": 2.2806615028767447e-06, "loss": 1.3678312301635742, "step": 1288 }, { "epoch": 2.3035714285714284, "grad_norm": 0.22584359347820282, "learning_rate": 2.2683083764122626e-06, "loss": 1.076238989830017, "step": 1290 }, { "epoch": 2.307142857142857, "grad_norm": 0.3474865257740021, "learning_rate": 2.2560053367510624e-06, "loss": 1.0969926118850708, "step": 1292 }, { "epoch": 2.310714285714286, "grad_norm": 0.5705395340919495, "learning_rate": 2.24375257457344e-06, "loss": 1.2809841632843018, "step": 1294 }, { "epoch": 2.314285714285714, "grad_norm": 0.4103868007659912, "learning_rate": 2.2315502797804677e-06, "loss": 1.329990029335022, "step": 1296 }, { "epoch": 2.317857142857143, "grad_norm": 0.4318333566188812, "learning_rate": 2.2193986414910347e-06, "loss": 0.9738024473190308, "step": 1298 }, { "epoch": 2.3214285714285716, "grad_norm": 0.6811454892158508, "learning_rate": 2.2072978480389286e-06, "loss": 1.3944941759109497, "step": 1300 }, { "epoch": 2.325, "grad_norm": 0.2388792783021927, "learning_rate": 2.195248086969904e-06, "loss": 0.9329886436462402, "step": 1302 }, { "epoch": 2.3285714285714287, "grad_norm": 0.316201388835907, "learning_rate": 2.1832495450387934e-06, "loss": 1.300463080406189, "step": 1304 }, { "epoch": 2.3321428571428573, "grad_norm": 1.2377450466156006, "learning_rate": 2.1713024082065965e-06, "loss": 1.1311689615249634, "step": 1306 }, { "epoch": 2.335714285714286, "grad_norm": 0.2693905234336853, "learning_rate": 2.1594068616376056e-06, "loss": 1.1664714813232422, "step": 1308 }, { "epoch": 2.3392857142857144, "grad_norm": 0.2321355789899826, "learning_rate": 2.1475630896965336e-06, "loss": 0.9228266477584839, "step": 1310 }, { "epoch": 2.342857142857143, "grad_norm": 0.2707984149456024, "learning_rate": 2.1357712759456594e-06, "loss": 0.8861098289489746, "step": 1312 }, { "epoch": 2.3464285714285715, "grad_norm": 0.41806021332740784, "learning_rate": 2.1240316031419795e-06, "loss": 1.2544275522232056, "step": 1314 }, { "epoch": 2.35, "grad_norm": 0.232350692152977, "learning_rate": 2.112344253234377e-06, "loss": 1.2989314794540405, "step": 1316 }, { "epoch": 2.3535714285714286, "grad_norm": 0.32811442017555237, "learning_rate": 2.1007094073607996e-06, "loss": 1.0640029907226562, "step": 1318 }, { "epoch": 2.357142857142857, "grad_norm": 0.4184323847293854, "learning_rate": 2.0891272458454614e-06, "loss": 1.1912882328033447, "step": 1320 }, { "epoch": 2.3607142857142858, "grad_norm": 0.4829447865486145, "learning_rate": 2.0775979481960343e-06, "loss": 1.1298654079437256, "step": 1322 }, { "epoch": 2.3642857142857143, "grad_norm": 0.24751894176006317, "learning_rate": 2.0661216931008717e-06, "loss": 1.0057674646377563, "step": 1324 }, { "epoch": 2.367857142857143, "grad_norm": 0.3193625807762146, "learning_rate": 2.054698658426244e-06, "loss": 1.3430179357528687, "step": 1326 }, { "epoch": 2.3714285714285714, "grad_norm": 1.9639981985092163, "learning_rate": 2.043329021213577e-06, "loss": 1.1388099193572998, "step": 1328 }, { "epoch": 2.375, "grad_norm": 0.4315277636051178, "learning_rate": 2.0320129576767083e-06, "loss": 1.1900275945663452, "step": 1330 }, { "epoch": 2.3785714285714286, "grad_norm": 0.34250909090042114, "learning_rate": 2.0207506431991556e-06, "loss": 1.29435396194458, "step": 1332 }, { "epoch": 2.382142857142857, "grad_norm": 0.6402963399887085, "learning_rate": 2.0095422523314016e-06, "loss": 1.1449788808822632, "step": 1334 }, { "epoch": 2.3857142857142857, "grad_norm": 0.24777400493621826, "learning_rate": 1.998387958788185e-06, "loss": 0.9889009594917297, "step": 1336 }, { "epoch": 2.3892857142857142, "grad_norm": 0.27742165327072144, "learning_rate": 1.987287935445811e-06, "loss": 1.13013756275177, "step": 1338 }, { "epoch": 2.392857142857143, "grad_norm": 0.6482072472572327, "learning_rate": 1.976242354339471e-06, "loss": 1.2214878797531128, "step": 1340 }, { "epoch": 2.3964285714285714, "grad_norm": 0.41687601804733276, "learning_rate": 1.965251386660575e-06, "loss": 1.180694818496704, "step": 1342 }, { "epoch": 2.4, "grad_norm": 0.4969882071018219, "learning_rate": 1.9543152027541003e-06, "loss": 1.0276660919189453, "step": 1344 }, { "epoch": 2.4035714285714285, "grad_norm": 0.25297048687934875, "learning_rate": 1.9434339721159506e-06, "loss": 1.3130789995193481, "step": 1346 }, { "epoch": 2.407142857142857, "grad_norm": 0.3224523961544037, "learning_rate": 1.932607863390329e-06, "loss": 1.323912262916565, "step": 1348 }, { "epoch": 2.4107142857142856, "grad_norm": 0.3630305528640747, "learning_rate": 1.9218370443671232e-06, "loss": 1.420185923576355, "step": 1350 }, { "epoch": 2.414285714285714, "grad_norm": 0.33518993854522705, "learning_rate": 1.91112168197931e-06, "loss": 1.2631648778915405, "step": 1352 }, { "epoch": 2.4178571428571427, "grad_norm": 0.2684813439846039, "learning_rate": 1.900461942300359e-06, "loss": 1.2116239070892334, "step": 1354 }, { "epoch": 2.4214285714285713, "grad_norm": 1.0438412427902222, "learning_rate": 1.8898579905416678e-06, "loss": 0.9407988786697388, "step": 1356 }, { "epoch": 2.425, "grad_norm": 0.2759835124015808, "learning_rate": 1.8793099910499926e-06, "loss": 1.089248776435852, "step": 1358 }, { "epoch": 2.4285714285714284, "grad_norm": 0.2118200808763504, "learning_rate": 1.8688181073049125e-06, "loss": 0.9922888278961182, "step": 1360 }, { "epoch": 2.432142857142857, "grad_norm": 0.29328909516334534, "learning_rate": 1.8583825019162843e-06, "loss": 1.0572453737258911, "step": 1362 }, { "epoch": 2.435714285714286, "grad_norm": 0.34180185198783875, "learning_rate": 1.848003336621729e-06, "loss": 1.2007834911346436, "step": 1364 }, { "epoch": 2.439285714285714, "grad_norm": 2.949885368347168, "learning_rate": 1.8376807722841231e-06, "loss": 1.2154308557510376, "step": 1366 }, { "epoch": 2.442857142857143, "grad_norm": 0.29990777373313904, "learning_rate": 1.8274149688891057e-06, "loss": 1.2820924520492554, "step": 1368 }, { "epoch": 2.4464285714285716, "grad_norm": 0.2850666046142578, "learning_rate": 1.8172060855425986e-06, "loss": 1.3318397998809814, "step": 1370 }, { "epoch": 2.45, "grad_norm": 0.4406229555606842, "learning_rate": 1.8070542804683406e-06, "loss": 1.490922212600708, "step": 1372 }, { "epoch": 2.4535714285714287, "grad_norm": 0.5301911234855652, "learning_rate": 1.7969597110054343e-06, "loss": 1.04641854763031, "step": 1374 }, { "epoch": 2.4571428571428573, "grad_norm": 0.4790363013744354, "learning_rate": 1.7869225336059133e-06, "loss": 1.2003765106201172, "step": 1376 }, { "epoch": 2.460714285714286, "grad_norm": 0.3002559542655945, "learning_rate": 1.7769429038323058e-06, "loss": 1.1743593215942383, "step": 1378 }, { "epoch": 2.4642857142857144, "grad_norm": 0.47378110885620117, "learning_rate": 1.7670209763552342e-06, "loss": 1.0753716230392456, "step": 1380 }, { "epoch": 2.467857142857143, "grad_norm": 0.4303780198097229, "learning_rate": 1.757156904951014e-06, "loss": 1.195298194885254, "step": 1382 }, { "epoch": 2.4714285714285715, "grad_norm": 0.40849828720092773, "learning_rate": 1.747350842499271e-06, "loss": 1.0725401639938354, "step": 1384 }, { "epoch": 2.475, "grad_norm": 0.4191647469997406, "learning_rate": 1.7376029409805708e-06, "loss": 1.2902517318725586, "step": 1386 }, { "epoch": 2.4785714285714286, "grad_norm": 0.5962879657745361, "learning_rate": 1.7279133514740645e-06, "loss": 1.2889909744262695, "step": 1388 }, { "epoch": 2.482142857142857, "grad_norm": 0.2635829448699951, "learning_rate": 1.7182822241551434e-06, "loss": 0.9972074627876282, "step": 1390 }, { "epoch": 2.4857142857142858, "grad_norm": 0.27476590871810913, "learning_rate": 1.708709708293121e-06, "loss": 1.0351589918136597, "step": 1392 }, { "epoch": 2.4892857142857143, "grad_norm": 0.3098399341106415, "learning_rate": 1.6991959522489082e-06, "loss": 1.030190110206604, "step": 1394 }, { "epoch": 2.492857142857143, "grad_norm": 0.37093329429626465, "learning_rate": 1.6897411034727217e-06, "loss": 1.3557082414627075, "step": 1396 }, { "epoch": 2.4964285714285714, "grad_norm": 0.4083240032196045, "learning_rate": 1.680345308501795e-06, "loss": 1.0274466276168823, "step": 1398 }, { "epoch": 2.5, "grad_norm": 0.34320634603500366, "learning_rate": 1.6710087129581086e-06, "loss": 0.9457365274429321, "step": 1400 }, { "epoch": 2.5035714285714286, "grad_norm": 0.5619872808456421, "learning_rate": 1.6617314615461325e-06, "loss": 1.3013941049575806, "step": 1402 }, { "epoch": 2.507142857142857, "grad_norm": 0.9764664769172668, "learning_rate": 1.6525136980505835e-06, "loss": 1.4310553073883057, "step": 1404 }, { "epoch": 2.5107142857142857, "grad_norm": 0.548743724822998, "learning_rate": 1.6433555653341976e-06, "loss": 1.255396842956543, "step": 1406 }, { "epoch": 2.5142857142857142, "grad_norm": 0.8014435172080994, "learning_rate": 1.6342572053355166e-06, "loss": 0.830237865447998, "step": 1408 }, { "epoch": 2.517857142857143, "grad_norm": 0.21949461102485657, "learning_rate": 1.625218759066685e-06, "loss": 0.7343713641166687, "step": 1410 }, { "epoch": 2.5214285714285714, "grad_norm": 0.6966763734817505, "learning_rate": 1.6162403666112653e-06, "loss": 1.1919779777526855, "step": 1412 }, { "epoch": 2.525, "grad_norm": 0.30908581614494324, "learning_rate": 1.6073221671220692e-06, "loss": 0.9375178813934326, "step": 1414 }, { "epoch": 2.5285714285714285, "grad_norm": 0.34836652874946594, "learning_rate": 1.5984642988190022e-06, "loss": 0.8665962219238281, "step": 1416 }, { "epoch": 2.532142857142857, "grad_norm": 0.18187429010868073, "learning_rate": 1.5896668989869151e-06, "loss": 0.9749317765235901, "step": 1418 }, { "epoch": 2.5357142857142856, "grad_norm": 0.2711097002029419, "learning_rate": 1.5809301039734814e-06, "loss": 1.1920053958892822, "step": 1420 }, { "epoch": 2.539285714285714, "grad_norm": 0.35151663422584534, "learning_rate": 1.5722540491870838e-06, "loss": 1.1063796281814575, "step": 1422 }, { "epoch": 2.5428571428571427, "grad_norm": 0.46157142519950867, "learning_rate": 1.5636388690947125e-06, "loss": 0.9042350649833679, "step": 1424 }, { "epoch": 2.5464285714285713, "grad_norm": 0.44619572162628174, "learning_rate": 1.5550846972198851e-06, "loss": 1.1896483898162842, "step": 1426 }, { "epoch": 2.55, "grad_norm": 0.5084243416786194, "learning_rate": 1.5465916661405734e-06, "loss": 1.0787028074264526, "step": 1428 }, { "epoch": 2.553571428571429, "grad_norm": 0.2909405529499054, "learning_rate": 1.5381599074871512e-06, "loss": 1.1317380666732788, "step": 1430 }, { "epoch": 2.557142857142857, "grad_norm": 0.7613154053688049, "learning_rate": 1.5297895519403563e-06, "loss": 1.3027656078338623, "step": 1432 }, { "epoch": 2.560714285714286, "grad_norm": 0.38280853629112244, "learning_rate": 1.5214807292292567e-06, "loss": 0.8128288984298706, "step": 1434 }, { "epoch": 2.564285714285714, "grad_norm": 0.33587777614593506, "learning_rate": 1.5132335681292492e-06, "loss": 1.4057202339172363, "step": 1436 }, { "epoch": 2.567857142857143, "grad_norm": 0.4974580407142639, "learning_rate": 1.5050481964600582e-06, "loss": 1.2144535779953003, "step": 1438 }, { "epoch": 2.571428571428571, "grad_norm": 0.21717508137226105, "learning_rate": 1.496924741083759e-06, "loss": 0.9632461667060852, "step": 1440 }, { "epoch": 2.575, "grad_norm": 0.18900008499622345, "learning_rate": 1.4888633279028068e-06, "loss": 1.021627426147461, "step": 1442 }, { "epoch": 2.5785714285714287, "grad_norm": 0.41346102952957153, "learning_rate": 1.4808640818580885e-06, "loss": 1.0733561515808105, "step": 1444 }, { "epoch": 2.5821428571428573, "grad_norm": 0.3450411558151245, "learning_rate": 1.4729271269269823e-06, "loss": 1.0130958557128906, "step": 1446 }, { "epoch": 2.585714285714286, "grad_norm": 0.4527641832828522, "learning_rate": 1.4650525861214454e-06, "loss": 0.9112399220466614, "step": 1448 }, { "epoch": 2.5892857142857144, "grad_norm": 0.43975669145584106, "learning_rate": 1.4572405814860954e-06, "loss": 1.0099694728851318, "step": 1450 }, { "epoch": 2.592857142857143, "grad_norm": 2.6724021434783936, "learning_rate": 1.4494912340963286e-06, "loss": 0.9879626035690308, "step": 1452 }, { "epoch": 2.5964285714285715, "grad_norm": 0.33726853132247925, "learning_rate": 1.441804664056437e-06, "loss": 1.3339985609054565, "step": 1454 }, { "epoch": 2.6, "grad_norm": 0.5543254017829895, "learning_rate": 1.4341809904977511e-06, "loss": 1.0636701583862305, "step": 1456 }, { "epoch": 2.6035714285714286, "grad_norm": 0.35016801953315735, "learning_rate": 1.4266203315767917e-06, "loss": 1.2073761224746704, "step": 1458 }, { "epoch": 2.607142857142857, "grad_norm": 0.37314754724502563, "learning_rate": 1.4191228044734387e-06, "loss": 1.067349910736084, "step": 1460 }, { "epoch": 2.6107142857142858, "grad_norm": 0.27696406841278076, "learning_rate": 1.4116885253891142e-06, "loss": 1.1596084833145142, "step": 1462 }, { "epoch": 2.6142857142857143, "grad_norm": 0.23734059929847717, "learning_rate": 1.4043176095449843e-06, "loss": 1.130849003791809, "step": 1464 }, { "epoch": 2.617857142857143, "grad_norm": 0.451869934797287, "learning_rate": 1.3970101711801712e-06, "loss": 1.1519298553466797, "step": 1466 }, { "epoch": 2.6214285714285714, "grad_norm": 0.367313027381897, "learning_rate": 1.3897663235499797e-06, "loss": 1.081532597541809, "step": 1468 }, { "epoch": 2.625, "grad_norm": 1.2766571044921875, "learning_rate": 1.382586178924149e-06, "loss": 0.9227726459503174, "step": 1470 }, { "epoch": 2.6285714285714286, "grad_norm": 1.6380170583724976, "learning_rate": 1.3754698485851074e-06, "loss": 1.3057407140731812, "step": 1472 }, { "epoch": 2.632142857142857, "grad_norm": 0.3816126585006714, "learning_rate": 1.368417442826249e-06, "loss": 1.1892451047897339, "step": 1474 }, { "epoch": 2.6357142857142857, "grad_norm": 0.3007228672504425, "learning_rate": 1.3614290709502242e-06, "loss": 1.2595423460006714, "step": 1476 }, { "epoch": 2.6392857142857142, "grad_norm": 0.2307678908109665, "learning_rate": 1.3545048412672459e-06, "loss": 1.10439932346344, "step": 1478 }, { "epoch": 2.642857142857143, "grad_norm": 0.34183934330940247, "learning_rate": 1.3476448610934104e-06, "loss": 1.1247930526733398, "step": 1480 }, { "epoch": 2.6464285714285714, "grad_norm": 0.50603187084198, "learning_rate": 1.3408492367490344e-06, "loss": 1.308542013168335, "step": 1482 }, { "epoch": 2.65, "grad_norm": 0.5772185921669006, "learning_rate": 1.3341180735570081e-06, "loss": 1.086531639099121, "step": 1484 }, { "epoch": 2.6535714285714285, "grad_norm": 0.2957296073436737, "learning_rate": 1.3274514758411595e-06, "loss": 0.9083548784255981, "step": 1486 }, { "epoch": 2.657142857142857, "grad_norm": 0.2126568704843521, "learning_rate": 1.3208495469246445e-06, "loss": 1.0338191986083984, "step": 1488 }, { "epoch": 2.6607142857142856, "grad_norm": 0.23187443614006042, "learning_rate": 1.3143123891283354e-06, "loss": 1.1434146165847778, "step": 1490 }, { "epoch": 2.664285714285714, "grad_norm": 0.2083001434803009, "learning_rate": 1.3078401037692451e-06, "loss": 1.148645281791687, "step": 1492 }, { "epoch": 2.6678571428571427, "grad_norm": 0.24332857131958008, "learning_rate": 1.3014327911589495e-06, "loss": 1.0858982801437378, "step": 1494 }, { "epoch": 2.6714285714285713, "grad_norm": 0.44840723276138306, "learning_rate": 1.2950905506020383e-06, "loss": 0.8910313844680786, "step": 1496 }, { "epoch": 2.675, "grad_norm": 0.6759834885597229, "learning_rate": 1.2888134803945713e-06, "loss": 1.0723787546157837, "step": 1498 }, { "epoch": 2.678571428571429, "grad_norm": 0.3571532964706421, "learning_rate": 1.2826016778225578e-06, "loss": 1.1453263759613037, "step": 1500 }, { "epoch": 2.682142857142857, "grad_norm": 0.3260257840156555, "learning_rate": 1.2764552391604468e-06, "loss": 1.1897282600402832, "step": 1502 }, { "epoch": 2.685714285714286, "grad_norm": 0.21461273729801178, "learning_rate": 1.2703742596696383e-06, "loss": 1.114097237586975, "step": 1504 }, { "epoch": 2.689285714285714, "grad_norm": 0.39265140891075134, "learning_rate": 1.2643588335970021e-06, "loss": 1.2430890798568726, "step": 1506 }, { "epoch": 2.692857142857143, "grad_norm": 0.26661592721939087, "learning_rate": 1.2584090541734216e-06, "loss": 1.2044790983200073, "step": 1508 }, { "epoch": 2.696428571428571, "grad_norm": 0.4279651641845703, "learning_rate": 1.252525013612346e-06, "loss": 1.1148457527160645, "step": 1510 }, { "epoch": 2.7, "grad_norm": 0.26563382148742676, "learning_rate": 1.2467068031083623e-06, "loss": 1.151499629020691, "step": 1512 }, { "epoch": 2.7035714285714287, "grad_norm": 0.28036361932754517, "learning_rate": 1.2409545128357806e-06, "loss": 1.112971544265747, "step": 1514 }, { "epoch": 2.7071428571428573, "grad_norm": 0.3321837782859802, "learning_rate": 1.235268231947238e-06, "loss": 0.9679718613624573, "step": 1516 }, { "epoch": 2.710714285714286, "grad_norm": 0.7659473419189453, "learning_rate": 1.229648048572317e-06, "loss": 1.18712317943573, "step": 1518 }, { "epoch": 2.7142857142857144, "grad_norm": 0.5400887131690979, "learning_rate": 1.2240940498161797e-06, "loss": 1.0840147733688354, "step": 1520 }, { "epoch": 2.717857142857143, "grad_norm": 0.3426344096660614, "learning_rate": 1.2186063217582144e-06, "loss": 1.1307204961776733, "step": 1522 }, { "epoch": 2.7214285714285715, "grad_norm": 0.39970487356185913, "learning_rate": 1.213184949450706e-06, "loss": 1.1921186447143555, "step": 1524 }, { "epoch": 2.725, "grad_norm": 0.31394848227500916, "learning_rate": 1.2078300169175158e-06, "loss": 1.1872678995132446, "step": 1526 }, { "epoch": 2.7285714285714286, "grad_norm": 0.7688894271850586, "learning_rate": 1.20254160715278e-06, "loss": 1.1403369903564453, "step": 1528 }, { "epoch": 2.732142857142857, "grad_norm": 0.3478771448135376, "learning_rate": 1.1973198021196207e-06, "loss": 1.0353933572769165, "step": 1530 }, { "epoch": 2.7357142857142858, "grad_norm": 1.663916826248169, "learning_rate": 1.1921646827488807e-06, "loss": 1.1801190376281738, "step": 1532 }, { "epoch": 2.7392857142857143, "grad_norm": 0.9486533999443054, "learning_rate": 1.187076328937863e-06, "loss": 1.118172287940979, "step": 1534 }, { "epoch": 2.742857142857143, "grad_norm": 0.3661729693412781, "learning_rate": 1.182054819549098e-06, "loss": 1.166612982749939, "step": 1536 }, { "epoch": 2.7464285714285714, "grad_norm": 0.273942768573761, "learning_rate": 1.1771002324091183e-06, "loss": 1.219356656074524, "step": 1538 }, { "epoch": 2.75, "grad_norm": 0.2943507730960846, "learning_rate": 1.172212644307252e-06, "loss": 1.2092581987380981, "step": 1540 }, { "epoch": 2.7535714285714286, "grad_norm": 0.22103095054626465, "learning_rate": 1.1673921309944356e-06, "loss": 1.1635977029800415, "step": 1542 }, { "epoch": 2.757142857142857, "grad_norm": 0.27992480993270874, "learning_rate": 1.1626387671820363e-06, "loss": 1.1578980684280396, "step": 1544 }, { "epoch": 2.7607142857142857, "grad_norm": 0.1873656064271927, "learning_rate": 1.1579526265406972e-06, "loss": 1.1813486814498901, "step": 1546 }, { "epoch": 2.7642857142857142, "grad_norm": 0.3528795838356018, "learning_rate": 1.1533337816991932e-06, "loss": 1.1933683156967163, "step": 1548 }, { "epoch": 2.767857142857143, "grad_norm": 0.31167811155319214, "learning_rate": 1.1487823042433063e-06, "loss": 1.1475173234939575, "step": 1550 }, { "epoch": 2.7714285714285714, "grad_norm": 1.7408783435821533, "learning_rate": 1.1442982647147167e-06, "loss": 1.148131251335144, "step": 1552 }, { "epoch": 2.775, "grad_norm": 0.3031138777732849, "learning_rate": 1.1398817326099094e-06, "loss": 1.0997506380081177, "step": 1554 }, { "epoch": 2.7785714285714285, "grad_norm": 0.21349631249904633, "learning_rate": 1.1355327763790943e-06, "loss": 1.1433438062667847, "step": 1556 }, { "epoch": 2.782142857142857, "grad_norm": 0.16756878793239594, "learning_rate": 1.1312514634251492e-06, "loss": 1.0694825649261475, "step": 1558 }, { "epoch": 2.7857142857142856, "grad_norm": 0.19285623729228973, "learning_rate": 1.127037860102575e-06, "loss": 1.1415499448776245, "step": 1560 }, { "epoch": 2.789285714285714, "grad_norm": 0.3282257616519928, "learning_rate": 1.1228920317164625e-06, "loss": 1.1128462553024292, "step": 1562 }, { "epoch": 2.7928571428571427, "grad_norm": 0.20754434168338776, "learning_rate": 1.118814042521486e-06, "loss": 1.1504778861999512, "step": 1564 }, { "epoch": 2.7964285714285713, "grad_norm": 0.22546795010566711, "learning_rate": 1.1148039557209057e-06, "loss": 1.1107934713363647, "step": 1566 }, { "epoch": 2.8, "grad_norm": 0.16394157707691193, "learning_rate": 1.1108618334655843e-06, "loss": 1.0830016136169434, "step": 1568 }, { "epoch": 2.803571428571429, "grad_norm": 0.1953999102115631, "learning_rate": 1.1069877368530303e-06, "loss": 1.16024649143219, "step": 1570 }, { "epoch": 2.807142857142857, "grad_norm": 0.211993008852005, "learning_rate": 1.1031817259264454e-06, "loss": 1.1383813619613647, "step": 1572 }, { "epoch": 2.810714285714286, "grad_norm": 0.1844896823167801, "learning_rate": 1.0994438596737971e-06, "loss": 1.0519864559173584, "step": 1574 }, { "epoch": 2.814285714285714, "grad_norm": 0.4553788900375366, "learning_rate": 1.0957741960269049e-06, "loss": 1.1024482250213623, "step": 1576 }, { "epoch": 2.817857142857143, "grad_norm": 0.2758769989013672, "learning_rate": 1.092172791860539e-06, "loss": 1.0607486963272095, "step": 1578 }, { "epoch": 2.821428571428571, "grad_norm": 0.28464648127555847, "learning_rate": 1.0886397029915415e-06, "loss": 1.0878740549087524, "step": 1580 }, { "epoch": 2.825, "grad_norm": 0.2519758641719818, "learning_rate": 1.0851749841779609e-06, "loss": 1.0692694187164307, "step": 1582 }, { "epoch": 2.8285714285714287, "grad_norm": 0.20021863281726837, "learning_rate": 1.0817786891182041e-06, "loss": 1.0892566442489624, "step": 1584 }, { "epoch": 2.8321428571428573, "grad_norm": 0.21085211634635925, "learning_rate": 1.0784508704502029e-06, "loss": 1.0911756753921509, "step": 1586 }, { "epoch": 2.835714285714286, "grad_norm": 0.2599065899848938, "learning_rate": 1.0751915797505986e-06, "loss": 1.0842504501342773, "step": 1588 }, { "epoch": 2.8392857142857144, "grad_norm": 0.23683688044548035, "learning_rate": 1.0720008675339403e-06, "loss": 1.0852082967758179, "step": 1590 }, { "epoch": 2.842857142857143, "grad_norm": 0.19538818299770355, "learning_rate": 1.0688787832519085e-06, "loss": 1.1298590898513794, "step": 1592 }, { "epoch": 2.8464285714285715, "grad_norm": 0.5865882039070129, "learning_rate": 1.0658253752925417e-06, "loss": 1.122971773147583, "step": 1594 }, { "epoch": 2.85, "grad_norm": 0.3269581198692322, "learning_rate": 1.062840690979491e-06, "loss": 1.109829068183899, "step": 1596 }, { "epoch": 2.8535714285714286, "grad_norm": 0.5810469388961792, "learning_rate": 1.0599247765712832e-06, "loss": 1.1492294073104858, "step": 1598 }, { "epoch": 2.857142857142857, "grad_norm": 0.2330639660358429, "learning_rate": 1.0570776772606056e-06, "loss": 1.123344898223877, "step": 1600 }, { "epoch": 2.8607142857142858, "grad_norm": 0.2107606828212738, "learning_rate": 1.0542994371736076e-06, "loss": 1.0889390707015991, "step": 1602 }, { "epoch": 2.8642857142857143, "grad_norm": 0.2753591239452362, "learning_rate": 1.0515900993692128e-06, "loss": 1.1300913095474243, "step": 1604 }, { "epoch": 2.867857142857143, "grad_norm": 0.27015575766563416, "learning_rate": 1.048949705838454e-06, "loss": 1.0982666015625, "step": 1606 }, { "epoch": 2.8714285714285714, "grad_norm": 0.1620846688747406, "learning_rate": 1.0463782975038226e-06, "loss": 1.1166629791259766, "step": 1608 }, { "epoch": 2.875, "grad_norm": 0.21408753097057343, "learning_rate": 1.0438759142186336e-06, "loss": 1.127457857131958, "step": 1610 }, { "epoch": 2.8785714285714286, "grad_norm": 0.26070085167884827, "learning_rate": 1.0414425947664075e-06, "loss": 1.1438779830932617, "step": 1612 }, { "epoch": 2.882142857142857, "grad_norm": 0.1973988115787506, "learning_rate": 1.0390783768602694e-06, "loss": 1.1256788969039917, "step": 1614 }, { "epoch": 2.8857142857142857, "grad_norm": 0.1865663081407547, "learning_rate": 1.0367832971423664e-06, "loss": 1.0647690296173096, "step": 1616 }, { "epoch": 2.8892857142857142, "grad_norm": 0.40141281485557556, "learning_rate": 1.0345573911832976e-06, "loss": 1.0978182554244995, "step": 1618 }, { "epoch": 2.892857142857143, "grad_norm": 0.19470001757144928, "learning_rate": 1.0324006934815623e-06, "loss": 1.1264913082122803, "step": 1620 }, { "epoch": 2.8964285714285714, "grad_norm": 0.1923714429140091, "learning_rate": 1.0303132374630276e-06, "loss": 1.1599576473236084, "step": 1622 }, { "epoch": 2.9, "grad_norm": 0.2873956561088562, "learning_rate": 1.0282950554804084e-06, "loss": 1.1344720125198364, "step": 1624 }, { "epoch": 2.9035714285714285, "grad_norm": 0.2792896330356598, "learning_rate": 1.0263461788127682e-06, "loss": 1.1077191829681396, "step": 1626 }, { "epoch": 2.907142857142857, "grad_norm": 0.17874673008918762, "learning_rate": 1.0244666376650307e-06, "loss": 1.0769405364990234, "step": 1628 }, { "epoch": 2.9107142857142856, "grad_norm": 0.23230457305908203, "learning_rate": 1.0226564611675146e-06, "loss": 1.1149848699569702, "step": 1630 }, { "epoch": 2.914285714285714, "grad_norm": 0.2538415789604187, "learning_rate": 1.020915677375483e-06, "loss": 1.1285921335220337, "step": 1632 }, { "epoch": 2.9178571428571427, "grad_norm": 0.18281330168247223, "learning_rate": 1.0192443132687039e-06, "loss": 1.0885471105575562, "step": 1634 }, { "epoch": 2.9214285714285713, "grad_norm": 0.27069422602653503, "learning_rate": 1.0176423947510377e-06, "loss": 1.1098750829696655, "step": 1636 }, { "epoch": 2.925, "grad_norm": 0.24785873293876648, "learning_rate": 1.016109946650032e-06, "loss": 1.1053394079208374, "step": 1638 }, { "epoch": 2.928571428571429, "grad_norm": 0.2786495089530945, "learning_rate": 1.014646992716537e-06, "loss": 1.1500390768051147, "step": 1640 }, { "epoch": 2.932142857142857, "grad_norm": 0.3538748621940613, "learning_rate": 1.01325355562434e-06, "loss": 1.1664944887161255, "step": 1642 }, { "epoch": 2.935714285714286, "grad_norm": 0.3729296326637268, "learning_rate": 1.0119296569698112e-06, "loss": 1.1281384229660034, "step": 1644 }, { "epoch": 2.939285714285714, "grad_norm": 0.21035878360271454, "learning_rate": 1.01067531727157e-06, "loss": 1.1451420783996582, "step": 1646 }, { "epoch": 2.942857142857143, "grad_norm": 0.3253045380115509, "learning_rate": 1.0094905559701678e-06, "loss": 1.1268796920776367, "step": 1648 }, { "epoch": 2.946428571428571, "grad_norm": 0.20938168466091156, "learning_rate": 1.0083753914277859e-06, "loss": 1.0814552307128906, "step": 1650 }, { "epoch": 2.95, "grad_norm": 0.24861246347427368, "learning_rate": 1.007329840927949e-06, "loss": 1.1016547679901123, "step": 1652 }, { "epoch": 2.9535714285714287, "grad_norm": 0.26715606451034546, "learning_rate": 1.006353920675263e-06, "loss": 1.1287412643432617, "step": 1654 }, { "epoch": 2.9571428571428573, "grad_norm": 0.20948819816112518, "learning_rate": 1.0054476457951567e-06, "loss": 1.11174476146698, "step": 1656 }, { "epoch": 2.960714285714286, "grad_norm": 0.5076990127563477, "learning_rate": 1.0046110303336519e-06, "loss": 1.112143874168396, "step": 1658 }, { "epoch": 2.9642857142857144, "grad_norm": 0.5603309273719788, "learning_rate": 1.0038440872571456e-06, "loss": 1.1545910835266113, "step": 1660 }, { "epoch": 2.967857142857143, "grad_norm": 0.23968827724456787, "learning_rate": 1.0031468284522063e-06, "loss": 1.1435242891311646, "step": 1662 }, { "epoch": 2.9714285714285715, "grad_norm": 0.26473504304885864, "learning_rate": 1.0025192647253939e-06, "loss": 1.1580908298492432, "step": 1664 }, { "epoch": 2.975, "grad_norm": 0.6800065636634827, "learning_rate": 1.0019614058030874e-06, "loss": 1.1012563705444336, "step": 1666 }, { "epoch": 2.9785714285714286, "grad_norm": 0.23044763505458832, "learning_rate": 1.0014732603313375e-06, "loss": 1.1186460256576538, "step": 1668 }, { "epoch": 2.982142857142857, "grad_norm": 0.21679583191871643, "learning_rate": 1.0010548358757327e-06, "loss": 1.1382079124450684, "step": 1670 }, { "epoch": 2.9857142857142858, "grad_norm": 0.4521788954734802, "learning_rate": 1.0007061389212794e-06, "loss": 1.182320475578308, "step": 1672 }, { "epoch": 2.9892857142857143, "grad_norm": 0.24779334664344788, "learning_rate": 1.0004271748723043e-06, "loss": 1.2086482048034668, "step": 1674 }, { "epoch": 2.992857142857143, "grad_norm": 0.5126925706863403, "learning_rate": 1.0002179480523687e-06, "loss": 0.834091067314148, "step": 1676 }, { "epoch": 2.9964285714285714, "grad_norm": 0.3477499783039093, "learning_rate": 1.0000784617042023e-06, "loss": 0.722780168056488, "step": 1678 }, { "epoch": 3.0, "grad_norm": 0.47854718565940857, "learning_rate": 1.0000087179896533e-06, "loss": 0.7972838282585144, "step": 1680 }, { "epoch": 3.0, "step": 1680, "total_flos": 2.510120369642275e+18, "train_loss": 1.2744095386493774, "train_runtime": 14979.881, "train_samples_per_second": 1.794, "train_steps_per_second": 0.112 } ], "logging_steps": 2, "max_steps": 1680, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.510120369642275e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }