diff --git "a/checkpoint-14406/trainer_state.json" "b/checkpoint-14406/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-14406/trainer_state.json" @@ -0,0 +1,4128 @@ +{ + "best_metric": 0.2098342627286911, + "best_model_checkpoint": "date2format-v2/checkpoint-14406", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 14406, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005206164098292378, + "grad_norm": 0.16246353089809418, + "learning_rate": 8.327550312283136e-07, + "loss": 0.1697, + "step": 25 + }, + { + "epoch": 0.010412328196584756, + "grad_norm": 1.2059249877929688, + "learning_rate": 1.7002081887578072e-06, + "loss": 0.214, + "step": 50 + }, + { + "epoch": 0.015618492294877134, + "grad_norm": 0.9199744462966919, + "learning_rate": 2.5676613462873006e-06, + "loss": 0.1357, + "step": 75 + }, + { + "epoch": 0.02082465639316951, + "grad_norm": 0.04551107436418533, + "learning_rate": 3.4351145038167944e-06, + "loss": 0.0982, + "step": 100 + }, + { + "epoch": 0.02603082049146189, + "grad_norm": 1.7336697578430176, + "learning_rate": 4.2678695350451074e-06, + "loss": 0.1863, + "step": 125 + }, + { + "epoch": 0.03123698458975427, + "grad_norm": 1.3021111488342285, + "learning_rate": 5.135322692574601e-06, + "loss": 0.1166, + "step": 150 + }, + { + "epoch": 0.03644314868804665, + "grad_norm": 6.950756549835205, + "learning_rate": 6.002775850104095e-06, + "loss": 0.2385, + "step": 175 + }, + { + "epoch": 0.04164931278633902, + "grad_norm": 0.139039546251297, + "learning_rate": 6.870229007633589e-06, + "loss": 0.1147, + "step": 200 + }, + { + "epoch": 0.0468554768846314, + "grad_norm": 6.5412492752075195, + "learning_rate": 7.737682165163083e-06, + "loss": 0.1682, + "step": 225 + }, + { + "epoch": 0.05206164098292378, + "grad_norm": 1.130511999130249, + "learning_rate": 8.605135322692574e-06, + "loss": 0.2311, + "step": 250 + }, + { + "epoch": 0.05726780508121616, + "grad_norm": 14.400012969970703, + "learning_rate": 9.472588480222069e-06, + "loss": 0.1815, + "step": 275 + }, + { + "epoch": 0.06247396917950854, + "grad_norm": 0.6946442723274231, + "learning_rate": 1.0340041637751562e-05, + "loss": 0.1107, + "step": 300 + }, + { + "epoch": 0.06768013327780091, + "grad_norm": 9.926340103149414, + "learning_rate": 1.1207494795281055e-05, + "loss": 0.1139, + "step": 325 + }, + { + "epoch": 0.0728862973760933, + "grad_norm": 1.6435810327529907, + "learning_rate": 1.207494795281055e-05, + "loss": 0.1471, + "step": 350 + }, + { + "epoch": 0.07809246147438567, + "grad_norm": 1.6755973100662231, + "learning_rate": 1.294240111034004e-05, + "loss": 0.2039, + "step": 375 + }, + { + "epoch": 0.08329862557267805, + "grad_norm": 4.045156002044678, + "learning_rate": 1.3809854267869535e-05, + "loss": 0.2085, + "step": 400 + }, + { + "epoch": 0.08850478967097043, + "grad_norm": 0.2624989449977875, + "learning_rate": 1.4677307425399028e-05, + "loss": 0.182, + "step": 425 + }, + { + "epoch": 0.0937109537692628, + "grad_norm": 9.571054458618164, + "learning_rate": 1.5544760582928523e-05, + "loss": 0.1874, + "step": 450 + }, + { + "epoch": 0.09891711786755518, + "grad_norm": 0.22022797167301178, + "learning_rate": 1.6412213740458016e-05, + "loss": 0.1219, + "step": 475 + }, + { + "epoch": 0.10412328196584757, + "grad_norm": 9.949447631835938, + "learning_rate": 1.727966689798751e-05, + "loss": 0.2049, + "step": 500 + }, + { + "epoch": 0.10932944606413994, + "grad_norm": 5.697494983673096, + "learning_rate": 1.8147120055517e-05, + "loss": 0.1151, + "step": 525 + }, + { + "epoch": 0.11453561016243231, + "grad_norm": 11.763550758361816, + "learning_rate": 1.9014573213046498e-05, + "loss": 0.1546, + "step": 550 + }, + { + "epoch": 0.1197417742607247, + "grad_norm": 0.10468779504299164, + "learning_rate": 1.988202637057599e-05, + "loss": 0.2386, + "step": 575 + }, + { + "epoch": 0.12494793835901707, + "grad_norm": 1.6756809949874878, + "learning_rate": 2.0749479528105484e-05, + "loss": 0.2487, + "step": 600 + }, + { + "epoch": 0.13015410245730946, + "grad_norm": 0.19562338292598724, + "learning_rate": 2.1616932685634977e-05, + "loss": 0.2314, + "step": 625 + }, + { + "epoch": 0.13536026655560182, + "grad_norm": 8.64395809173584, + "learning_rate": 2.248438584316447e-05, + "loss": 0.113, + "step": 650 + }, + { + "epoch": 0.1405664306538942, + "grad_norm": 0.2188766896724701, + "learning_rate": 2.3351839000693966e-05, + "loss": 0.1685, + "step": 675 + }, + { + "epoch": 0.1457725947521866, + "grad_norm": 0.1437786966562271, + "learning_rate": 2.421929215822346e-05, + "loss": 0.2467, + "step": 700 + }, + { + "epoch": 0.15097875885047896, + "grad_norm": 8.023161888122559, + "learning_rate": 2.5086745315752952e-05, + "loss": 0.2007, + "step": 725 + }, + { + "epoch": 0.15618492294877134, + "grad_norm": 1.2265409231185913, + "learning_rate": 2.5954198473282442e-05, + "loss": 0.1716, + "step": 750 + }, + { + "epoch": 0.16139108704706373, + "grad_norm": 5.027709484100342, + "learning_rate": 2.6821651630811938e-05, + "loss": 0.2271, + "step": 775 + }, + { + "epoch": 0.1665972511453561, + "grad_norm": 0.11683953553438187, + "learning_rate": 2.768910478834143e-05, + "loss": 0.2198, + "step": 800 + }, + { + "epoch": 0.17180341524364848, + "grad_norm": 0.0886659324169159, + "learning_rate": 2.8556557945870927e-05, + "loss": 0.2351, + "step": 825 + }, + { + "epoch": 0.17700957934194086, + "grad_norm": 10.608562469482422, + "learning_rate": 2.9424011103400417e-05, + "loss": 0.295, + "step": 850 + }, + { + "epoch": 0.18221574344023322, + "grad_norm": 9.556928634643555, + "learning_rate": 3.0291464260929913e-05, + "loss": 0.1789, + "step": 875 + }, + { + "epoch": 0.1874219075385256, + "grad_norm": 9.168340682983398, + "learning_rate": 3.1158917418459406e-05, + "loss": 0.2548, + "step": 900 + }, + { + "epoch": 0.192628071636818, + "grad_norm": 6.174153804779053, + "learning_rate": 3.2026370575988896e-05, + "loss": 0.257, + "step": 925 + }, + { + "epoch": 0.19783423573511036, + "grad_norm": 13.663798332214355, + "learning_rate": 3.289382373351839e-05, + "loss": 0.1947, + "step": 950 + }, + { + "epoch": 0.20304039983340275, + "grad_norm": 11.622993469238281, + "learning_rate": 3.376127689104788e-05, + "loss": 0.2765, + "step": 975 + }, + { + "epoch": 0.20824656393169513, + "grad_norm": 9.067373275756836, + "learning_rate": 3.462873004857738e-05, + "loss": 0.2063, + "step": 1000 + }, + { + "epoch": 0.2134527280299875, + "grad_norm": 0.04181263968348503, + "learning_rate": 3.549618320610687e-05, + "loss": 0.3131, + "step": 1025 + }, + { + "epoch": 0.21865889212827988, + "grad_norm": 1.1748377084732056, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.2632, + "step": 1050 + }, + { + "epoch": 0.22386505622657227, + "grad_norm": 18.87828826904297, + "learning_rate": 3.723108952116586e-05, + "loss": 0.1721, + "step": 1075 + }, + { + "epoch": 0.22907122032486463, + "grad_norm": 0.07279066741466522, + "learning_rate": 3.809854267869536e-05, + "loss": 0.155, + "step": 1100 + }, + { + "epoch": 0.23427738442315701, + "grad_norm": 0.6264330148696899, + "learning_rate": 3.8965995836224846e-05, + "loss": 0.2785, + "step": 1125 + }, + { + "epoch": 0.2394835485214494, + "grad_norm": 0.07900076359510422, + "learning_rate": 3.983344899375434e-05, + "loss": 0.1977, + "step": 1150 + }, + { + "epoch": 0.24468971261974176, + "grad_norm": 0.14294634759426117, + "learning_rate": 4.070090215128383e-05, + "loss": 0.2297, + "step": 1175 + }, + { + "epoch": 0.24989587671803415, + "grad_norm": 12.401119232177734, + "learning_rate": 4.153365718251215e-05, + "loss": 0.3428, + "step": 1200 + }, + { + "epoch": 0.25510204081632654, + "grad_norm": 11.514626502990723, + "learning_rate": 4.240111034004164e-05, + "loss": 0.2381, + "step": 1225 + }, + { + "epoch": 0.2603082049146189, + "grad_norm": 0.33966752886772156, + "learning_rate": 4.3268563497571134e-05, + "loss": 0.2446, + "step": 1250 + }, + { + "epoch": 0.2655143690129113, + "grad_norm": 13.058968544006348, + "learning_rate": 4.4136016655100624e-05, + "loss": 0.2878, + "step": 1275 + }, + { + "epoch": 0.27072053311120364, + "grad_norm": 11.9553861618042, + "learning_rate": 4.500346981263012e-05, + "loss": 0.2735, + "step": 1300 + }, + { + "epoch": 0.27592669720949603, + "grad_norm": 0.3588428497314453, + "learning_rate": 4.5870922970159617e-05, + "loss": 0.2163, + "step": 1325 + }, + { + "epoch": 0.2811328613077884, + "grad_norm": 23.0421142578125, + "learning_rate": 4.673837612768911e-05, + "loss": 0.2516, + "step": 1350 + }, + { + "epoch": 0.2863390254060808, + "grad_norm": 0.3783215880393982, + "learning_rate": 4.757113115891742e-05, + "loss": 0.3394, + "step": 1375 + }, + { + "epoch": 0.2915451895043732, + "grad_norm": 0.13371586799621582, + "learning_rate": 4.843858431644692e-05, + "loss": 0.2155, + "step": 1400 + }, + { + "epoch": 0.2967513536026656, + "grad_norm": 15.4042329788208, + "learning_rate": 4.930603747397641e-05, + "loss": 0.3902, + "step": 1425 + }, + { + "epoch": 0.3019575177009579, + "grad_norm": 13.482477188110352, + "learning_rate": 4.9999981651326544e-05, + "loss": 0.3137, + "step": 1450 + }, + { + "epoch": 0.3071636817992503, + "grad_norm": 2.373297691345215, + "learning_rate": 4.9999339450583704e-05, + "loss": 0.3525, + "step": 1475 + }, + { + "epoch": 0.3123698458975427, + "grad_norm": 0.45967090129852295, + "learning_rate": 4.999777984310187e-05, + "loss": 0.3325, + "step": 1500 + }, + { + "epoch": 0.3175760099958351, + "grad_norm": 0.21776865422725677, + "learning_rate": 4.9995302886114334e-05, + "loss": 0.3339, + "step": 1525 + }, + { + "epoch": 0.32278217409412746, + "grad_norm": 4.673847198486328, + "learning_rate": 4.9991908670518586e-05, + "loss": 0.2315, + "step": 1550 + }, + { + "epoch": 0.32798833819241985, + "grad_norm": 2.8383209705352783, + "learning_rate": 4.998759732087296e-05, + "loss": 0.3757, + "step": 1575 + }, + { + "epoch": 0.3331945022907122, + "grad_norm": 6.966940402984619, + "learning_rate": 4.998236899539209e-05, + "loss": 0.2376, + "step": 1600 + }, + { + "epoch": 0.33840066638900457, + "grad_norm": 14.109524726867676, + "learning_rate": 4.997622388594109e-05, + "loss": 0.3046, + "step": 1625 + }, + { + "epoch": 0.34360683048729695, + "grad_norm": 0.9895745515823364, + "learning_rate": 4.9969162218028495e-05, + "loss": 0.2425, + "step": 1650 + }, + { + "epoch": 0.34881299458558934, + "grad_norm": 1.0948731899261475, + "learning_rate": 4.996118425079804e-05, + "loss": 0.3732, + "step": 1675 + }, + { + "epoch": 0.35401915868388173, + "grad_norm": 10.473814010620117, + "learning_rate": 4.9952290277019095e-05, + "loss": 0.3086, + "step": 1700 + }, + { + "epoch": 0.3592253227821741, + "grad_norm": 1.947543740272522, + "learning_rate": 4.994248062307593e-05, + "loss": 0.3405, + "step": 1725 + }, + { + "epoch": 0.36443148688046645, + "grad_norm": 7.7466254234313965, + "learning_rate": 4.993175564895577e-05, + "loss": 0.1969, + "step": 1750 + }, + { + "epoch": 0.36963765097875884, + "grad_norm": 11.565778732299805, + "learning_rate": 4.992011574823555e-05, + "loss": 0.3424, + "step": 1775 + }, + { + "epoch": 0.3748438150770512, + "grad_norm": 2.8535590171813965, + "learning_rate": 4.99075613480675e-05, + "loss": 0.2885, + "step": 1800 + }, + { + "epoch": 0.3800499791753436, + "grad_norm": 36.82617950439453, + "learning_rate": 4.9894092909163436e-05, + "loss": 0.3159, + "step": 1825 + }, + { + "epoch": 0.385256143273636, + "grad_norm": 0.5960198044776917, + "learning_rate": 4.98797109257779e-05, + "loss": 0.2981, + "step": 1850 + }, + { + "epoch": 0.3904623073719284, + "grad_norm": 23.564931869506836, + "learning_rate": 4.986441592568994e-05, + "loss": 0.2802, + "step": 1875 + }, + { + "epoch": 0.3956684714702207, + "grad_norm": 4.4088134765625, + "learning_rate": 4.984820847018388e-05, + "loss": 0.378, + "step": 1900 + }, + { + "epoch": 0.4008746355685131, + "grad_norm": 13.824947357177734, + "learning_rate": 4.983108915402855e-05, + "loss": 0.2409, + "step": 1925 + }, + { + "epoch": 0.4060807996668055, + "grad_norm": 14.21284294128418, + "learning_rate": 4.981305860545561e-05, + "loss": 0.3306, + "step": 1950 + }, + { + "epoch": 0.4112869637650979, + "grad_norm": 0.1888038069009781, + "learning_rate": 4.9794117486136404e-05, + "loss": 0.3458, + "step": 1975 + }, + { + "epoch": 0.41649312786339027, + "grad_norm": 0.19039294123649597, + "learning_rate": 4.977426649115773e-05, + "loss": 0.1629, + "step": 2000 + }, + { + "epoch": 0.42169929196168265, + "grad_norm": 0.42835375666618347, + "learning_rate": 4.9753506348996284e-05, + "loss": 0.2258, + "step": 2025 + }, + { + "epoch": 0.426905456059975, + "grad_norm": 4.263399124145508, + "learning_rate": 4.973183782149198e-05, + "loss": 0.3, + "step": 2050 + }, + { + "epoch": 0.4321116201582674, + "grad_norm": 0.17805464565753937, + "learning_rate": 4.970926170381995e-05, + "loss": 0.2086, + "step": 2075 + }, + { + "epoch": 0.43731778425655976, + "grad_norm": 0.9103575348854065, + "learning_rate": 4.968577882446139e-05, + "loss": 0.2435, + "step": 2100 + }, + { + "epoch": 0.44252394835485215, + "grad_norm": 10.369997024536133, + "learning_rate": 4.9661390045173126e-05, + "loss": 0.2264, + "step": 2125 + }, + { + "epoch": 0.44773011245314454, + "grad_norm": 2.6744861602783203, + "learning_rate": 4.963609626095603e-05, + "loss": 0.3012, + "step": 2150 + }, + { + "epoch": 0.4529362765514369, + "grad_norm": 12.971126556396484, + "learning_rate": 4.960989840002216e-05, + "loss": 0.2855, + "step": 2175 + }, + { + "epoch": 0.45814244064972925, + "grad_norm": 5.078539848327637, + "learning_rate": 4.958279742376068e-05, + "loss": 0.2832, + "step": 2200 + }, + { + "epoch": 0.46334860474802164, + "grad_norm": 11.258842468261719, + "learning_rate": 4.955479432670259e-05, + "loss": 0.3402, + "step": 2225 + }, + { + "epoch": 0.46855476884631403, + "grad_norm": 9.847345352172852, + "learning_rate": 4.952589013648425e-05, + "loss": 0.1914, + "step": 2250 + }, + { + "epoch": 0.4737609329446064, + "grad_norm": 0.0392930842936039, + "learning_rate": 4.949608591380963e-05, + "loss": 0.1341, + "step": 2275 + }, + { + "epoch": 0.4789670970428988, + "grad_norm": 7.803274154663086, + "learning_rate": 4.946538275241144e-05, + "loss": 0.3284, + "step": 2300 + }, + { + "epoch": 0.4841732611411912, + "grad_norm": 0.38506531715393066, + "learning_rate": 4.943378177901091e-05, + "loss": 0.3496, + "step": 2325 + }, + { + "epoch": 0.4893794252394835, + "grad_norm": 8.00683879852295, + "learning_rate": 4.940128415327654e-05, + "loss": 0.33, + "step": 2350 + }, + { + "epoch": 0.4945855893377759, + "grad_norm": 5.389349460601807, + "learning_rate": 4.9367891067781454e-05, + "loss": 0.2392, + "step": 2375 + }, + { + "epoch": 0.4997917534360683, + "grad_norm": 0.18322338163852692, + "learning_rate": 4.9333603747959725e-05, + "loss": 0.3306, + "step": 2400 + }, + { + "epoch": 0.5049979175343606, + "grad_norm": 47.38001251220703, + "learning_rate": 4.9298423452061306e-05, + "loss": 0.3651, + "step": 2425 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 1.591893196105957, + "learning_rate": 4.926235147110593e-05, + "loss": 0.1469, + "step": 2450 + }, + { + "epoch": 0.5154102457309454, + "grad_norm": 2.414854049682617, + "learning_rate": 4.922538912883573e-05, + "loss": 0.3108, + "step": 2475 + }, + { + "epoch": 0.5206164098292378, + "grad_norm": 0.7650768756866455, + "learning_rate": 4.91875377816666e-05, + "loss": 0.1384, + "step": 2500 + }, + { + "epoch": 0.5258225739275302, + "grad_norm": 12.073479652404785, + "learning_rate": 4.9148798818638486e-05, + "loss": 0.3325, + "step": 2525 + }, + { + "epoch": 0.5310287380258226, + "grad_norm": 1.0136282444000244, + "learning_rate": 4.910917366136438e-05, + "loss": 0.2324, + "step": 2550 + }, + { + "epoch": 0.536234902124115, + "grad_norm": 4.9315667152404785, + "learning_rate": 4.906866376397816e-05, + "loss": 0.3278, + "step": 2575 + }, + { + "epoch": 0.5414410662224073, + "grad_norm": 18.607717514038086, + "learning_rate": 4.902727061308121e-05, + "loss": 0.3609, + "step": 2600 + }, + { + "epoch": 0.5466472303206997, + "grad_norm": 28.146434783935547, + "learning_rate": 4.89849957276879e-05, + "loss": 0.229, + "step": 2625 + }, + { + "epoch": 0.5518533944189921, + "grad_norm": 6.487234592437744, + "learning_rate": 4.894184065916981e-05, + "loss": 0.1822, + "step": 2650 + }, + { + "epoch": 0.5570595585172845, + "grad_norm": 12.08668327331543, + "learning_rate": 4.8897806991198796e-05, + "loss": 0.1988, + "step": 2675 + }, + { + "epoch": 0.5622657226155768, + "grad_norm": 6.793110370635986, + "learning_rate": 4.885289633968891e-05, + "loss": 0.2108, + "step": 2700 + }, + { + "epoch": 0.5674718867138692, + "grad_norm": 13.04969596862793, + "learning_rate": 4.880711035273709e-05, + "loss": 0.2559, + "step": 2725 + }, + { + "epoch": 0.5726780508121616, + "grad_norm": 0.07108098268508911, + "learning_rate": 4.876045071056262e-05, + "loss": 0.2236, + "step": 2750 + }, + { + "epoch": 0.5778842149104539, + "grad_norm": 4.550630569458008, + "learning_rate": 4.871291912544558e-05, + "loss": 0.1857, + "step": 2775 + }, + { + "epoch": 0.5830903790087464, + "grad_norm": 3.2425384521484375, + "learning_rate": 4.8664517341663886e-05, + "loss": 0.1981, + "step": 2800 + }, + { + "epoch": 0.5882965431070387, + "grad_norm": 1.5111477375030518, + "learning_rate": 4.861524713542939e-05, + "loss": 0.2145, + "step": 2825 + }, + { + "epoch": 0.5935027072053312, + "grad_norm": 0.029720915481448174, + "learning_rate": 4.856511031482264e-05, + "loss": 0.1805, + "step": 2850 + }, + { + "epoch": 0.5987088713036235, + "grad_norm": 14.783000946044922, + "learning_rate": 4.8514108719726516e-05, + "loss": 0.2574, + "step": 2875 + }, + { + "epoch": 0.6039150354019158, + "grad_norm": 0.23782485723495483, + "learning_rate": 4.846224422175877e-05, + "loss": 0.272, + "step": 2900 + }, + { + "epoch": 0.6091211995002083, + "grad_norm": 0.6656458377838135, + "learning_rate": 4.840951872420327e-05, + "loss": 0.1804, + "step": 2925 + }, + { + "epoch": 0.6143273635985006, + "grad_norm": 7.324411869049072, + "learning_rate": 4.835593416194023e-05, + "loss": 0.2526, + "step": 2950 + }, + { + "epoch": 0.619533527696793, + "grad_norm": 17.861886978149414, + "learning_rate": 4.8301492501375135e-05, + "loss": 0.2352, + "step": 2975 + }, + { + "epoch": 0.6247396917950854, + "grad_norm": 16.718368530273438, + "learning_rate": 4.824619574036663e-05, + "loss": 0.394, + "step": 3000 + }, + { + "epoch": 0.6299458558933777, + "grad_norm": 0.1666000634431839, + "learning_rate": 4.819004590815317e-05, + "loss": 0.2006, + "step": 3025 + }, + { + "epoch": 0.6351520199916701, + "grad_norm": 4.644481182098389, + "learning_rate": 4.8133045065278584e-05, + "loss": 0.2226, + "step": 3050 + }, + { + "epoch": 0.6403581840899625, + "grad_norm": 23.20795249938965, + "learning_rate": 4.807519530351644e-05, + "loss": 0.2966, + "step": 3075 + }, + { + "epoch": 0.6455643481882549, + "grad_norm": 0.05183548480272293, + "learning_rate": 4.801649874579327e-05, + "loss": 0.1597, + "step": 3100 + }, + { + "epoch": 0.6507705122865473, + "grad_norm": 8.726778030395508, + "learning_rate": 4.79569575461107e-05, + "loss": 0.3323, + "step": 3125 + }, + { + "epoch": 0.6559766763848397, + "grad_norm": 0.2523828446865082, + "learning_rate": 4.789657388946637e-05, + "loss": 0.1146, + "step": 3150 + }, + { + "epoch": 0.661182840483132, + "grad_norm": 0.07221566885709763, + "learning_rate": 4.7835349991773775e-05, + "loss": 0.3426, + "step": 3175 + }, + { + "epoch": 0.6663890045814244, + "grad_norm": 0.2359621673822403, + "learning_rate": 4.7773288099780935e-05, + "loss": 0.2444, + "step": 3200 + }, + { + "epoch": 0.6715951686797168, + "grad_norm": 18.833477020263672, + "learning_rate": 4.7710390490987936e-05, + "loss": 0.3419, + "step": 3225 + }, + { + "epoch": 0.6768013327780091, + "grad_norm": 17.171281814575195, + "learning_rate": 4.764665947356337e-05, + "loss": 0.2313, + "step": 3250 + }, + { + "epoch": 0.6820074968763016, + "grad_norm": 0.4939661920070648, + "learning_rate": 4.758209738625963e-05, + "loss": 0.2724, + "step": 3275 + }, + { + "epoch": 0.6872136609745939, + "grad_norm": 0.8321401476860046, + "learning_rate": 4.751670659832708e-05, + "loss": 0.1976, + "step": 3300 + }, + { + "epoch": 0.6924198250728864, + "grad_norm": 1.6382664442062378, + "learning_rate": 4.74504895094271e-05, + "loss": 0.2768, + "step": 3325 + }, + { + "epoch": 0.6976259891711787, + "grad_norm": 0.03149043396115303, + "learning_rate": 4.738344854954404e-05, + "loss": 0.2127, + "step": 3350 + }, + { + "epoch": 0.702832153269471, + "grad_norm": 0.08274863660335541, + "learning_rate": 4.7315586178896035e-05, + "loss": 0.2105, + "step": 3375 + }, + { + "epoch": 0.7080383173677635, + "grad_norm": 2.460942268371582, + "learning_rate": 4.724690488784474e-05, + "loss": 0.3465, + "step": 3400 + }, + { + "epoch": 0.7132444814660558, + "grad_norm": 13.02415943145752, + "learning_rate": 4.717740719680391e-05, + "loss": 0.2309, + "step": 3425 + }, + { + "epoch": 0.7184506455643482, + "grad_norm": 15.129639625549316, + "learning_rate": 4.710709565614694e-05, + "loss": 0.1043, + "step": 3450 + }, + { + "epoch": 0.7236568096626406, + "grad_norm": 0.19271063804626465, + "learning_rate": 4.703597284611325e-05, + "loss": 0.312, + "step": 3475 + }, + { + "epoch": 0.7288629737609329, + "grad_norm": 16.245803833007812, + "learning_rate": 4.696404137671362e-05, + "loss": 0.3068, + "step": 3500 + }, + { + "epoch": 0.7340691378592253, + "grad_norm": 0.28644460439682007, + "learning_rate": 4.689130388763437e-05, + "loss": 0.3067, + "step": 3525 + }, + { + "epoch": 0.7392753019575177, + "grad_norm": 19.579248428344727, + "learning_rate": 4.681776304814056e-05, + "loss": 0.2592, + "step": 3550 + }, + { + "epoch": 0.7444814660558101, + "grad_norm": 16.12538719177246, + "learning_rate": 4.6743421556977934e-05, + "loss": 0.1631, + "step": 3575 + }, + { + "epoch": 0.7496876301541024, + "grad_norm": 0.7716278433799744, + "learning_rate": 4.6668282142274e-05, + "loss": 0.2962, + "step": 3600 + }, + { + "epoch": 0.7548937942523949, + "grad_norm": 6.197509288787842, + "learning_rate": 4.659234756143782e-05, + "loss": 0.2491, + "step": 3625 + }, + { + "epoch": 0.7600999583506872, + "grad_norm": 0.5237298011779785, + "learning_rate": 4.651562060105886e-05, + "loss": 0.2988, + "step": 3650 + }, + { + "epoch": 0.7653061224489796, + "grad_norm": 6.333150386810303, + "learning_rate": 4.643810407680475e-05, + "loss": 0.2558, + "step": 3675 + }, + { + "epoch": 0.770512286547272, + "grad_norm": 0.02860177680850029, + "learning_rate": 4.6359800833317915e-05, + "loss": 0.1318, + "step": 3700 + }, + { + "epoch": 0.7757184506455643, + "grad_norm": 0.030804630368947983, + "learning_rate": 4.6280713744111215e-05, + "loss": 0.2002, + "step": 3725 + }, + { + "epoch": 0.7809246147438568, + "grad_norm": 10.464898109436035, + "learning_rate": 4.620084571146247e-05, + "loss": 0.3029, + "step": 3750 + }, + { + "epoch": 0.7861307788421491, + "grad_norm": 1.6494271755218506, + "learning_rate": 4.6120199666307975e-05, + "loss": 0.377, + "step": 3775 + }, + { + "epoch": 0.7913369429404414, + "grad_norm": 10.32836627960205, + "learning_rate": 4.603877856813494e-05, + "loss": 0.1742, + "step": 3800 + }, + { + "epoch": 0.7965431070387339, + "grad_norm": 5.073435306549072, + "learning_rate": 4.5956585404872884e-05, + "loss": 0.1921, + "step": 3825 + }, + { + "epoch": 0.8017492711370262, + "grad_norm": 11.85318374633789, + "learning_rate": 4.587362319278397e-05, + "loss": 0.2957, + "step": 3850 + }, + { + "epoch": 0.8069554352353187, + "grad_norm": 3.998944044113159, + "learning_rate": 4.5789894976352344e-05, + "loss": 0.2102, + "step": 3875 + }, + { + "epoch": 0.812161599333611, + "grad_norm": 0.569960355758667, + "learning_rate": 4.570540382817239e-05, + "loss": 0.245, + "step": 3900 + }, + { + "epoch": 0.8173677634319034, + "grad_norm": 5.945977687835693, + "learning_rate": 4.562015284883597e-05, + "loss": 0.2113, + "step": 3925 + }, + { + "epoch": 0.8225739275301958, + "grad_norm": 10.898477554321289, + "learning_rate": 4.553414516681868e-05, + "loss": 0.2446, + "step": 3950 + }, + { + "epoch": 0.8277800916284881, + "grad_norm": 0.25490328669548035, + "learning_rate": 4.5447383938364974e-05, + "loss": 0.301, + "step": 3975 + }, + { + "epoch": 0.8329862557267805, + "grad_norm": 4.246921539306641, + "learning_rate": 4.535987234737242e-05, + "loss": 0.2944, + "step": 4000 + }, + { + "epoch": 0.8381924198250729, + "grad_norm": 6.019783973693848, + "learning_rate": 4.527161360527481e-05, + "loss": 0.3184, + "step": 4025 + }, + { + "epoch": 0.8433985839233653, + "grad_norm": 0.0902986004948616, + "learning_rate": 4.51826109509243e-05, + "loss": 0.1508, + "step": 4050 + }, + { + "epoch": 0.8486047480216576, + "grad_norm": 7.770606517791748, + "learning_rate": 4.509286765047258e-05, + "loss": 0.4018, + "step": 4075 + }, + { + "epoch": 0.85381091211995, + "grad_norm": 0.20602120459079742, + "learning_rate": 4.5002386997251025e-05, + "loss": 0.3172, + "step": 4100 + }, + { + "epoch": 0.8590170762182424, + "grad_norm": 9.447431564331055, + "learning_rate": 4.491117231164978e-05, + "loss": 0.2055, + "step": 4125 + }, + { + "epoch": 0.8642232403165347, + "grad_norm": 4.260950088500977, + "learning_rate": 4.481922694099602e-05, + "loss": 0.1652, + "step": 4150 + }, + { + "epoch": 0.8694294044148272, + "grad_norm": 8.093120574951172, + "learning_rate": 4.4726554259430966e-05, + "loss": 0.1701, + "step": 4175 + }, + { + "epoch": 0.8746355685131195, + "grad_norm": 9.310951232910156, + "learning_rate": 4.4633157667786194e-05, + "loss": 0.3233, + "step": 4200 + }, + { + "epoch": 0.879841732611412, + "grad_norm": 13.830717086791992, + "learning_rate": 4.453904059345877e-05, + "loss": 0.2724, + "step": 4225 + }, + { + "epoch": 0.8850478967097043, + "grad_norm": 7.378146648406982, + "learning_rate": 4.4444206490285465e-05, + "loss": 0.3583, + "step": 4250 + }, + { + "epoch": 0.8902540608079966, + "grad_norm": 0.06943502277135849, + "learning_rate": 4.4352494400727496e-05, + "loss": 0.3111, + "step": 4275 + }, + { + "epoch": 0.8954602249062891, + "grad_norm": 5.857403755187988, + "learning_rate": 4.425626504054255e-05, + "loss": 0.3536, + "step": 4300 + }, + { + "epoch": 0.9006663890045814, + "grad_norm": 12.8510160446167, + "learning_rate": 4.4159329028593835e-05, + "loss": 0.3129, + "step": 4325 + }, + { + "epoch": 0.9058725531028738, + "grad_norm": 0.7335798740386963, + "learning_rate": 4.406168992216536e-05, + "loss": 0.2536, + "step": 4350 + }, + { + "epoch": 0.9110787172011662, + "grad_norm": 0.06461632251739502, + "learning_rate": 4.396335130434277e-05, + "loss": 0.1635, + "step": 4375 + }, + { + "epoch": 0.9162848812994585, + "grad_norm": 0.6194918751716614, + "learning_rate": 4.386431678388183e-05, + "loss": 0.2273, + "step": 4400 + }, + { + "epoch": 0.921491045397751, + "grad_norm": 0.06244231015443802, + "learning_rate": 4.3764589995076045e-05, + "loss": 0.2393, + "step": 4425 + }, + { + "epoch": 0.9266972094960433, + "grad_norm": 4.408567428588867, + "learning_rate": 4.3664174597623225e-05, + "loss": 0.1189, + "step": 4450 + }, + { + "epoch": 0.9319033735943357, + "grad_norm": 0.7255458235740662, + "learning_rate": 4.3563074276491224e-05, + "loss": 0.2784, + "step": 4475 + }, + { + "epoch": 0.9371095376926281, + "grad_norm": 0.09919915348291397, + "learning_rate": 4.346129274178271e-05, + "loss": 0.1761, + "step": 4500 + }, + { + "epoch": 0.9423157017909205, + "grad_norm": 13.636004447937012, + "learning_rate": 4.335883372859901e-05, + "loss": 0.3252, + "step": 4525 + }, + { + "epoch": 0.9475218658892128, + "grad_norm": 0.1237548440694809, + "learning_rate": 4.3255700996903027e-05, + "loss": 0.1931, + "step": 4550 + }, + { + "epoch": 0.9527280299875052, + "grad_norm": 0.06882551312446594, + "learning_rate": 4.315189833138129e-05, + "loss": 0.256, + "step": 4575 + }, + { + "epoch": 0.9579341940857976, + "grad_norm": 0.21859368681907654, + "learning_rate": 4.304742954130504e-05, + "loss": 0.2087, + "step": 4600 + }, + { + "epoch": 0.9631403581840899, + "grad_norm": 4.428884983062744, + "learning_rate": 4.294229846039045e-05, + "loss": 0.1962, + "step": 4625 + }, + { + "epoch": 0.9683465222823824, + "grad_norm": 6.920009613037109, + "learning_rate": 4.2836508946657944e-05, + "loss": 0.2301, + "step": 4650 + }, + { + "epoch": 0.9735526863806747, + "grad_norm": 3.4040772914886475, + "learning_rate": 4.273006488229061e-05, + "loss": 0.287, + "step": 4675 + }, + { + "epoch": 0.978758850478967, + "grad_norm": 7.0576043128967285, + "learning_rate": 4.2622970173491734e-05, + "loss": 0.1524, + "step": 4700 + }, + { + "epoch": 0.9839650145772595, + "grad_norm": 0.026531610637903214, + "learning_rate": 4.2515228750341466e-05, + "loss": 0.1662, + "step": 4725 + }, + { + "epoch": 0.9891711786755518, + "grad_norm": 1.0448293685913086, + "learning_rate": 4.240684456665257e-05, + "loss": 0.2114, + "step": 4750 + }, + { + "epoch": 0.9943773427738443, + "grad_norm": 44.79957962036133, + "learning_rate": 4.229782159982536e-05, + "loss": 0.2157, + "step": 4775 + }, + { + "epoch": 0.9995835068721366, + "grad_norm": 10.862527847290039, + "learning_rate": 4.2188163850701734e-05, + "loss": 0.2925, + "step": 4800 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.892994923857868, + "eval_f1_macro": 0.8730666664599849, + "eval_f1_micro": 0.892994923857868, + "eval_f1_weighted": 0.8798157289522093, + "eval_loss": 0.34922197461128235, + "eval_precision_macro": 0.9003919914273503, + "eval_precision_micro": 0.892994923857868, + "eval_precision_weighted": 0.9045471108195786, + "eval_recall_macro": 0.8862461348175634, + "eval_recall_micro": 0.892994923857868, + "eval_recall_weighted": 0.892994923857868, + "eval_runtime": 16.3696, + "eval_samples_per_second": 902.587, + "eval_steps_per_second": 56.446, + "step": 4802 + }, + { + "epoch": 1.004789670970429, + "grad_norm": 10.292591094970703, + "learning_rate": 4.2077875343418325e-05, + "loss": 0.2158, + "step": 4825 + }, + { + "epoch": 1.0099958350687213, + "grad_norm": 5.3238325119018555, + "learning_rate": 4.1966960125258884e-05, + "loss": 0.2987, + "step": 4850 + }, + { + "epoch": 1.0152019991670138, + "grad_norm": 0.32381194829940796, + "learning_rate": 4.1855422266505675e-05, + "loss": 0.3283, + "step": 4875 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 7.971839904785156, + "learning_rate": 4.174326586029018e-05, + "loss": 0.19, + "step": 4900 + }, + { + "epoch": 1.0256143273635985, + "grad_norm": 0.252726286649704, + "learning_rate": 4.1630495022442884e-05, + "loss": 0.3692, + "step": 4925 + }, + { + "epoch": 1.0308204914618908, + "grad_norm": 3.9147284030914307, + "learning_rate": 4.1517113891342174e-05, + "loss": 0.2321, + "step": 4950 + }, + { + "epoch": 1.0360266555601834, + "grad_norm": 3.525928020477295, + "learning_rate": 4.1403126627762546e-05, + "loss": 0.1547, + "step": 4975 + }, + { + "epoch": 1.0412328196584757, + "grad_norm": 0.9435555934906006, + "learning_rate": 4.128853741472187e-05, + "loss": 0.1764, + "step": 5000 + }, + { + "epoch": 1.046438983756768, + "grad_norm": 9.357694625854492, + "learning_rate": 4.117335045732791e-05, + "loss": 0.2553, + "step": 5025 + }, + { + "epoch": 1.0516451478550604, + "grad_norm": 0.10351333767175674, + "learning_rate": 4.105756998262399e-05, + "loss": 0.128, + "step": 5050 + }, + { + "epoch": 1.0568513119533527, + "grad_norm": 3.0234265327453613, + "learning_rate": 4.094120023943387e-05, + "loss": 0.2764, + "step": 5075 + }, + { + "epoch": 1.0620574760516452, + "grad_norm": 2.055335283279419, + "learning_rate": 4.082424549820586e-05, + "loss": 0.1752, + "step": 5100 + }, + { + "epoch": 1.0672636401499376, + "grad_norm": 9.345911026000977, + "learning_rate": 4.070671005085607e-05, + "loss": 0.2165, + "step": 5125 + }, + { + "epoch": 1.07246980424823, + "grad_norm": 0.36394619941711426, + "learning_rate": 4.058859821061092e-05, + "loss": 0.1157, + "step": 5150 + }, + { + "epoch": 1.0776759683465222, + "grad_norm": 8.331086158752441, + "learning_rate": 4.046991431184887e-05, + "loss": 0.1545, + "step": 5175 + }, + { + "epoch": 1.0828821324448146, + "grad_norm": 9.395284652709961, + "learning_rate": 4.0350662709941325e-05, + "loss": 0.2645, + "step": 5200 + }, + { + "epoch": 1.0880882965431071, + "grad_norm": 6.326489448547363, + "learning_rate": 4.023084778109283e-05, + "loss": 0.2223, + "step": 5225 + }, + { + "epoch": 1.0932944606413995, + "grad_norm": 11.211806297302246, + "learning_rate": 4.0110473922180526e-05, + "loss": 0.2022, + "step": 5250 + }, + { + "epoch": 1.0985006247396918, + "grad_norm": 0.9454079270362854, + "learning_rate": 3.998954555059266e-05, + "loss": 0.2831, + "step": 5275 + }, + { + "epoch": 1.1037067888379841, + "grad_norm": 2.5448479652404785, + "learning_rate": 3.9868067104066646e-05, + "loss": 0.1692, + "step": 5300 + }, + { + "epoch": 1.1089129529362765, + "grad_norm": 2.1148276329040527, + "learning_rate": 3.9746043040526074e-05, + "loss": 0.3686, + "step": 5325 + }, + { + "epoch": 1.114119117034569, + "grad_norm": 18.591083526611328, + "learning_rate": 3.9623477837917167e-05, + "loss": 0.2465, + "step": 5350 + }, + { + "epoch": 1.1193252811328613, + "grad_norm": 1.9231871366500854, + "learning_rate": 3.950037599404451e-05, + "loss": 0.2383, + "step": 5375 + }, + { + "epoch": 1.1245314452311537, + "grad_norm": 13.078465461730957, + "learning_rate": 3.93767420264059e-05, + "loss": 0.2129, + "step": 5400 + }, + { + "epoch": 1.129737609329446, + "grad_norm": 0.015137123875319958, + "learning_rate": 3.9252580472026616e-05, + "loss": 0.123, + "step": 5425 + }, + { + "epoch": 1.1349437734277386, + "grad_norm": 1.9116227626800537, + "learning_rate": 3.912789588729292e-05, + "loss": 0.2135, + "step": 5450 + }, + { + "epoch": 1.1401499375260309, + "grad_norm": 0.04986105486750603, + "learning_rate": 3.9002692847784824e-05, + "loss": 0.3574, + "step": 5475 + }, + { + "epoch": 1.1453561016243232, + "grad_norm": 8.818171501159668, + "learning_rate": 3.8876975948108236e-05, + "loss": 0.1246, + "step": 5500 + }, + { + "epoch": 1.1505622657226156, + "grad_norm": 0.3172764182090759, + "learning_rate": 3.8750749801726275e-05, + "loss": 0.182, + "step": 5525 + }, + { + "epoch": 1.1557684298209079, + "grad_norm": 9.364998817443848, + "learning_rate": 3.862401904079004e-05, + "loss": 0.2951, + "step": 5550 + }, + { + "epoch": 1.1609745939192004, + "grad_norm": 7.908264636993408, + "learning_rate": 3.849678831596855e-05, + "loss": 0.1521, + "step": 5575 + }, + { + "epoch": 1.1661807580174928, + "grad_norm": 0.3125484585762024, + "learning_rate": 3.8369062296278174e-05, + "loss": 0.2956, + "step": 5600 + }, + { + "epoch": 1.171386922115785, + "grad_norm": 0.6337696313858032, + "learning_rate": 3.824084566891118e-05, + "loss": 0.2355, + "step": 5625 + }, + { + "epoch": 1.1765930862140774, + "grad_norm": 0.07673631608486176, + "learning_rate": 3.8112143139063824e-05, + "loss": 0.2098, + "step": 5650 + }, + { + "epoch": 1.1817992503123698, + "grad_norm": 0.02253473550081253, + "learning_rate": 3.798295942976362e-05, + "loss": 0.1149, + "step": 5675 + }, + { + "epoch": 1.1870054144106623, + "grad_norm": 6.865312099456787, + "learning_rate": 3.7853299281696056e-05, + "loss": 0.3913, + "step": 5700 + }, + { + "epoch": 1.1922115785089547, + "grad_norm": 6.194192409515381, + "learning_rate": 3.7723167453030586e-05, + "loss": 0.1924, + "step": 5725 + }, + { + "epoch": 1.197417742607247, + "grad_norm": 0.12381599843502045, + "learning_rate": 3.759256871924604e-05, + "loss": 0.3331, + "step": 5750 + }, + { + "epoch": 1.2026239067055393, + "grad_norm": 0.02437855489552021, + "learning_rate": 3.74615078729554e-05, + "loss": 0.1819, + "step": 5775 + }, + { + "epoch": 1.2078300708038316, + "grad_norm": 14.187339782714844, + "learning_rate": 3.732998972372991e-05, + "loss": 0.2303, + "step": 5800 + }, + { + "epoch": 1.2130362349021242, + "grad_norm": 16.453699111938477, + "learning_rate": 3.719801909792251e-05, + "loss": 0.3025, + "step": 5825 + }, + { + "epoch": 1.2182423990004165, + "grad_norm": 1.748321771621704, + "learning_rate": 3.706560083849083e-05, + "loss": 0.2085, + "step": 5850 + }, + { + "epoch": 1.2234485630987089, + "grad_norm": 0.6774524450302124, + "learning_rate": 3.693273980481942e-05, + "loss": 0.2587, + "step": 5875 + }, + { + "epoch": 1.2286547271970012, + "grad_norm": 12.350029945373535, + "learning_rate": 3.679944087254141e-05, + "loss": 0.1885, + "step": 5900 + }, + { + "epoch": 1.2338608912952935, + "grad_norm": 6.396539688110352, + "learning_rate": 3.6665708933359576e-05, + "loss": 0.2404, + "step": 5925 + }, + { + "epoch": 1.239067055393586, + "grad_norm": 5.529823303222656, + "learning_rate": 3.6531548894866896e-05, + "loss": 0.2086, + "step": 5950 + }, + { + "epoch": 1.2442732194918784, + "grad_norm": 4.696390628814697, + "learning_rate": 3.639696568036639e-05, + "loss": 0.1309, + "step": 5975 + }, + { + "epoch": 1.2494793835901707, + "grad_norm": 0.10419386625289917, + "learning_rate": 3.6261964228690454e-05, + "loss": 0.1567, + "step": 6000 + }, + { + "epoch": 1.254685547688463, + "grad_norm": 0.19173868000507355, + "learning_rate": 3.612654949401967e-05, + "loss": 0.1632, + "step": 6025 + }, + { + "epoch": 1.2598917117867554, + "grad_norm": 10.412970542907715, + "learning_rate": 3.599072644570092e-05, + "loss": 0.1771, + "step": 6050 + }, + { + "epoch": 1.265097875885048, + "grad_norm": 13.899092674255371, + "learning_rate": 3.585450006806512e-05, + "loss": 0.0964, + "step": 6075 + }, + { + "epoch": 1.2703040399833403, + "grad_norm": 11.265216827392578, + "learning_rate": 3.5717875360244214e-05, + "loss": 0.3711, + "step": 6100 + }, + { + "epoch": 1.2755102040816326, + "grad_norm": 5.9480977058410645, + "learning_rate": 3.5580857335987774e-05, + "loss": 0.2482, + "step": 6125 + }, + { + "epoch": 1.280716368179925, + "grad_norm": 7.014443874359131, + "learning_rate": 3.544345102347902e-05, + "loss": 0.258, + "step": 6150 + }, + { + "epoch": 1.2859225322782173, + "grad_norm": 0.06656356900930405, + "learning_rate": 3.530566146515028e-05, + "loss": 0.1432, + "step": 6175 + }, + { + "epoch": 1.2911286963765098, + "grad_norm": 9.881967544555664, + "learning_rate": 3.5167493717497905e-05, + "loss": 0.127, + "step": 6200 + }, + { + "epoch": 1.2963348604748022, + "grad_norm": 7.715837478637695, + "learning_rate": 3.502895285089679e-05, + "loss": 0.229, + "step": 6225 + }, + { + "epoch": 1.3015410245730945, + "grad_norm": 0.07192976772785187, + "learning_rate": 3.489004394941425e-05, + "loss": 0.1111, + "step": 6250 + }, + { + "epoch": 1.3067471886713868, + "grad_norm": 1.0921132564544678, + "learning_rate": 3.475077211062346e-05, + "loss": 0.1642, + "step": 6275 + }, + { + "epoch": 1.3119533527696792, + "grad_norm": 0.2106587141752243, + "learning_rate": 3.461114244541641e-05, + "loss": 0.2393, + "step": 6300 + }, + { + "epoch": 1.3171595168679717, + "grad_norm": 5.716639518737793, + "learning_rate": 3.4471160077816314e-05, + "loss": 0.2525, + "step": 6325 + }, + { + "epoch": 1.322365680966264, + "grad_norm": 0.30353063344955444, + "learning_rate": 3.43308301447896e-05, + "loss": 0.2426, + "step": 6350 + }, + { + "epoch": 1.3275718450645564, + "grad_norm": 0.039772335439920425, + "learning_rate": 3.419015779605737e-05, + "loss": 0.2302, + "step": 6375 + }, + { + "epoch": 1.332778009162849, + "grad_norm": 0.7781673073768616, + "learning_rate": 3.404914819390646e-05, + "loss": 0.2583, + "step": 6400 + }, + { + "epoch": 1.3379841732611413, + "grad_norm": 24.190811157226562, + "learning_rate": 3.3907806512999966e-05, + "loss": 0.1873, + "step": 6425 + }, + { + "epoch": 1.3431903373594336, + "grad_norm": 9.922744750976562, + "learning_rate": 3.3766137940187364e-05, + "loss": 0.2441, + "step": 6450 + }, + { + "epoch": 1.348396501457726, + "grad_norm": 7.696400165557861, + "learning_rate": 3.362414767431414e-05, + "loss": 0.1051, + "step": 6475 + }, + { + "epoch": 1.3536026655560183, + "grad_norm": 5.509871482849121, + "learning_rate": 3.348184092603105e-05, + "loss": 0.1288, + "step": 6500 + }, + { + "epoch": 1.3588088296543108, + "grad_norm": 12.904980659484863, + "learning_rate": 3.333922291760286e-05, + "loss": 0.2988, + "step": 6525 + }, + { + "epoch": 1.3640149937526032, + "grad_norm": 0.11204788088798523, + "learning_rate": 3.319629888271675e-05, + "loss": 0.1962, + "step": 6550 + }, + { + "epoch": 1.3692211578508955, + "grad_norm": 0.16563156247138977, + "learning_rate": 3.3053074066290214e-05, + "loss": 0.2317, + "step": 6575 + }, + { + "epoch": 1.3744273219491878, + "grad_norm": 10.849634170532227, + "learning_rate": 3.290955372427858e-05, + "loss": 0.1759, + "step": 6600 + }, + { + "epoch": 1.3796334860474802, + "grad_norm": 13.899421691894531, + "learning_rate": 3.276574312348218e-05, + "loss": 0.3169, + "step": 6625 + }, + { + "epoch": 1.3848396501457727, + "grad_norm": 0.03900701552629471, + "learning_rate": 3.2621647541353015e-05, + "loss": 0.2785, + "step": 6650 + }, + { + "epoch": 1.390045814244065, + "grad_norm": 4.0548095703125, + "learning_rate": 3.2477272265801126e-05, + "loss": 0.2163, + "step": 6675 + }, + { + "epoch": 1.3952519783423574, + "grad_norm": 0.026262715458869934, + "learning_rate": 3.233841378366721e-05, + "loss": 0.2331, + "step": 6700 + }, + { + "epoch": 1.4004581424406497, + "grad_norm": 1.4128097295761108, + "learning_rate": 3.2193505687298915e-05, + "loss": 0.138, + "step": 6725 + }, + { + "epoch": 1.405664306538942, + "grad_norm": 0.03084075264632702, + "learning_rate": 3.2048333609131926e-05, + "loss": 0.2318, + "step": 6750 + }, + { + "epoch": 1.4108704706372346, + "grad_norm": 0.08986690640449524, + "learning_rate": 3.19029028765807e-05, + "loss": 0.1265, + "step": 6775 + }, + { + "epoch": 1.416076634735527, + "grad_norm": 0.33206573128700256, + "learning_rate": 3.1757218826551634e-05, + "loss": 0.0584, + "step": 6800 + }, + { + "epoch": 1.4212827988338192, + "grad_norm": 0.31883516907691956, + "learning_rate": 3.161128680524714e-05, + "loss": 0.1517, + "step": 6825 + }, + { + "epoch": 1.4264889629321116, + "grad_norm": 0.03625530004501343, + "learning_rate": 3.1465112167969504e-05, + "loss": 0.1541, + "step": 6850 + }, + { + "epoch": 1.431695127030404, + "grad_norm": 9.222389221191406, + "learning_rate": 3.1318700278924374e-05, + "loss": 0.267, + "step": 6875 + }, + { + "epoch": 1.4369012911286965, + "grad_norm": 9.740432739257812, + "learning_rate": 3.11779266463367e-05, + "loss": 0.2008, + "step": 6900 + }, + { + "epoch": 1.4421074552269888, + "grad_norm": 0.007975614629685879, + "learning_rate": 3.1031065337471356e-05, + "loss": 0.1723, + "step": 6925 + }, + { + "epoch": 1.4473136193252811, + "grad_norm": 9.000720977783203, + "learning_rate": 3.088398270515885e-05, + "loss": 0.1206, + "step": 6950 + }, + { + "epoch": 1.4525197834235735, + "grad_norm": 9.622346878051758, + "learning_rate": 3.0736684146925715e-05, + "loss": 0.1296, + "step": 6975 + }, + { + "epoch": 1.4577259475218658, + "grad_norm": 0.9583789706230164, + "learning_rate": 3.0589175068222385e-05, + "loss": 0.144, + "step": 7000 + }, + { + "epoch": 1.4629321116201583, + "grad_norm": 37.292327880859375, + "learning_rate": 3.044146088222479e-05, + "loss": 0.253, + "step": 7025 + }, + { + "epoch": 1.4681382757184507, + "grad_norm": 6.009835243225098, + "learning_rate": 3.0293547009635758e-05, + "loss": 0.1883, + "step": 7050 + }, + { + "epoch": 1.473344439816743, + "grad_norm": 0.2210305631160736, + "learning_rate": 3.0145438878486033e-05, + "loss": 0.2264, + "step": 7075 + }, + { + "epoch": 1.4785506039150353, + "grad_norm": 6.8800764083862305, + "learning_rate": 2.9997141923935136e-05, + "loss": 0.2641, + "step": 7100 + }, + { + "epoch": 1.4837567680133277, + "grad_norm": 0.2739255130290985, + "learning_rate": 2.9848661588071876e-05, + "loss": 0.2043, + "step": 7125 + }, + { + "epoch": 1.4889629321116202, + "grad_norm": 0.016436373814940453, + "learning_rate": 2.9700003319714648e-05, + "loss": 0.1275, + "step": 7150 + }, + { + "epoch": 1.4941690962099126, + "grad_norm": 3.340723752975464, + "learning_rate": 2.9551172574211478e-05, + "loss": 0.3281, + "step": 7175 + }, + { + "epoch": 1.499375260308205, + "grad_norm": 4.6393256187438965, + "learning_rate": 2.9402174813239836e-05, + "loss": 0.1756, + "step": 7200 + }, + { + "epoch": 1.5045814244064974, + "grad_norm": 6.234640121459961, + "learning_rate": 2.9253015504606197e-05, + "loss": 0.2412, + "step": 7225 + }, + { + "epoch": 1.5097875885047896, + "grad_norm": 2.9910178184509277, + "learning_rate": 2.910370012204537e-05, + "loss": 0.2625, + "step": 7250 + }, + { + "epoch": 1.5149937526030821, + "grad_norm": 0.030267061665654182, + "learning_rate": 2.8954234145019672e-05, + "loss": 0.1825, + "step": 7275 + }, + { + "epoch": 1.5201999167013744, + "grad_norm": 12.349358558654785, + "learning_rate": 2.8804623058517805e-05, + "loss": 0.2111, + "step": 7300 + }, + { + "epoch": 1.5254060807996668, + "grad_norm": 0.03321736305952072, + "learning_rate": 2.86548723528536e-05, + "loss": 0.1627, + "step": 7325 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 12.457853317260742, + "learning_rate": 2.8504987523464515e-05, + "loss": 0.2239, + "step": 7350 + }, + { + "epoch": 1.5358184089962514, + "grad_norm": 0.0661047101020813, + "learning_rate": 2.8354974070709983e-05, + "loss": 0.2431, + "step": 7375 + }, + { + "epoch": 1.541024573094544, + "grad_norm": 0.02619466558098793, + "learning_rate": 2.820483749966955e-05, + "loss": 0.1084, + "step": 7400 + }, + { + "epoch": 1.5462307371928363, + "grad_norm": 3.5517141819000244, + "learning_rate": 2.8054583319940896e-05, + "loss": 0.116, + "step": 7425 + }, + { + "epoch": 1.5514369012911287, + "grad_norm": 8.054045677185059, + "learning_rate": 2.7904217045437564e-05, + "loss": 0.2523, + "step": 7450 + }, + { + "epoch": 1.5566430653894212, + "grad_norm": 5.730096340179443, + "learning_rate": 2.775374419418671e-05, + "loss": 0.1687, + "step": 7475 + }, + { + "epoch": 1.5618492294877133, + "grad_norm": 0.11246989667415619, + "learning_rate": 2.7603170288126546e-05, + "loss": 0.1266, + "step": 7500 + }, + { + "epoch": 1.5670553935860059, + "grad_norm": 7.816544532775879, + "learning_rate": 2.74525008529037e-05, + "loss": 0.2546, + "step": 7525 + }, + { + "epoch": 1.5722615576842982, + "grad_norm": 0.06883124262094498, + "learning_rate": 2.7301741417670485e-05, + "loss": 0.1984, + "step": 7550 + }, + { + "epoch": 1.5774677217825905, + "grad_norm": 3.755141019821167, + "learning_rate": 2.715089751488195e-05, + "loss": 0.1695, + "step": 7575 + }, + { + "epoch": 1.582673885880883, + "grad_norm": 0.14307986199855804, + "learning_rate": 2.6999974680092882e-05, + "loss": 0.1959, + "step": 7600 + }, + { + "epoch": 1.5878800499791752, + "grad_norm": 16.175771713256836, + "learning_rate": 2.684897845175463e-05, + "loss": 0.21, + "step": 7625 + }, + { + "epoch": 1.5930862140774678, + "grad_norm": 0.39594829082489014, + "learning_rate": 2.6697914371011912e-05, + "loss": 0.1769, + "step": 7650 + }, + { + "epoch": 1.59829237817576, + "grad_norm": 11.465653419494629, + "learning_rate": 2.654678798149942e-05, + "loss": 0.1581, + "step": 7675 + }, + { + "epoch": 1.6034985422740524, + "grad_norm": 0.006868015043437481, + "learning_rate": 2.639560482913843e-05, + "loss": 0.2084, + "step": 7700 + }, + { + "epoch": 1.608704706372345, + "grad_norm": 23.506370544433594, + "learning_rate": 2.6244370461933226e-05, + "loss": 0.1521, + "step": 7725 + }, + { + "epoch": 1.613910870470637, + "grad_norm": 7.843533039093018, + "learning_rate": 2.609309042976757e-05, + "loss": 0.161, + "step": 7750 + }, + { + "epoch": 1.6191170345689296, + "grad_norm": 0.6957140564918518, + "learning_rate": 2.5941770284200968e-05, + "loss": 0.1201, + "step": 7775 + }, + { + "epoch": 1.624323198667222, + "grad_norm": 0.09775109589099884, + "learning_rate": 2.5790415578265e-05, + "loss": 0.0975, + "step": 7800 + }, + { + "epoch": 1.6295293627655143, + "grad_norm": 7.59830379486084, + "learning_rate": 2.5639031866259512e-05, + "loss": 0.1495, + "step": 7825 + }, + { + "epoch": 1.6347355268638069, + "grad_norm": 6.013265132904053, + "learning_rate": 2.5487624703548783e-05, + "loss": 0.2432, + "step": 7850 + }, + { + "epoch": 1.639941690962099, + "grad_norm": 0.011226486414670944, + "learning_rate": 2.5336199646357693e-05, + "loss": 0.1438, + "step": 7875 + }, + { + "epoch": 1.6451478550603915, + "grad_norm": 12.745257377624512, + "learning_rate": 2.518476225156776e-05, + "loss": 0.1853, + "step": 7900 + }, + { + "epoch": 1.6503540191586838, + "grad_norm": 0.028018401935696602, + "learning_rate": 2.5033318076513295e-05, + "loss": 0.269, + "step": 7925 + }, + { + "epoch": 1.6555601832569762, + "grad_norm": 0.29439255595207214, + "learning_rate": 2.4881872678777408e-05, + "loss": 0.2584, + "step": 7950 + }, + { + "epoch": 1.6607663473552687, + "grad_norm": 7.931988716125488, + "learning_rate": 2.473043161598808e-05, + "loss": 0.2546, + "step": 7975 + }, + { + "epoch": 1.665972511453561, + "grad_norm": 0.24659812450408936, + "learning_rate": 2.4579000445614214e-05, + "loss": 0.0941, + "step": 8000 + }, + { + "epoch": 1.6711786755518534, + "grad_norm": 0.13519421219825745, + "learning_rate": 2.4427584724761687e-05, + "loss": 0.1369, + "step": 8025 + }, + { + "epoch": 1.6763848396501457, + "grad_norm": 2.32975435256958, + "learning_rate": 2.4276190009969428e-05, + "loss": 0.0967, + "step": 8050 + }, + { + "epoch": 1.681591003748438, + "grad_norm": 10.135041236877441, + "learning_rate": 2.412482185700548e-05, + "loss": 0.2521, + "step": 8075 + }, + { + "epoch": 1.6867971678467306, + "grad_norm": 5.711327075958252, + "learning_rate": 2.397348582066317e-05, + "loss": 0.2155, + "step": 8100 + }, + { + "epoch": 1.692003331945023, + "grad_norm": 5.974716663360596, + "learning_rate": 2.382218745455721e-05, + "loss": 0.1574, + "step": 8125 + }, + { + "epoch": 1.6972094960433153, + "grad_norm": 0.01547964382916689, + "learning_rate": 2.3670932310919928e-05, + "loss": 0.1252, + "step": 8150 + }, + { + "epoch": 1.7024156601416076, + "grad_norm": 0.8667640089988708, + "learning_rate": 2.3519725940397516e-05, + "loss": 0.1717, + "step": 8175 + }, + { + "epoch": 1.7076218242399, + "grad_norm": 0.2800462245941162, + "learning_rate": 2.3368573891846307e-05, + "loss": 0.1726, + "step": 8200 + }, + { + "epoch": 1.7128279883381925, + "grad_norm": 0.2914607524871826, + "learning_rate": 2.321748171212919e-05, + "loss": 0.1686, + "step": 8225 + }, + { + "epoch": 1.7180341524364848, + "grad_norm": 5.2183051109313965, + "learning_rate": 2.3066454945912003e-05, + "loss": 0.2877, + "step": 8250 + }, + { + "epoch": 1.7232403165347772, + "grad_norm": 5.748877048492432, + "learning_rate": 2.2915499135460123e-05, + "loss": 0.157, + "step": 8275 + }, + { + "epoch": 1.7284464806330697, + "grad_norm": 9.520060539245605, + "learning_rate": 2.276461982043503e-05, + "loss": 0.2562, + "step": 8300 + }, + { + "epoch": 1.7336526447313618, + "grad_norm": 0.03092977963387966, + "learning_rate": 2.2613822537691016e-05, + "loss": 0.1034, + "step": 8325 + }, + { + "epoch": 1.7388588088296544, + "grad_norm": 0.1688009798526764, + "learning_rate": 2.2463112821072063e-05, + "loss": 0.1484, + "step": 8350 + }, + { + "epoch": 1.7440649729279467, + "grad_norm": 0.5068601369857788, + "learning_rate": 2.2312496201208654e-05, + "loss": 0.1381, + "step": 8375 + }, + { + "epoch": 1.749271137026239, + "grad_norm": 0.58343505859375, + "learning_rate": 2.2161978205314934e-05, + "loss": 0.2426, + "step": 8400 + }, + { + "epoch": 1.7544773011245316, + "grad_norm": 5.165789604187012, + "learning_rate": 2.20115643569858e-05, + "loss": 0.2173, + "step": 8425 + }, + { + "epoch": 1.7596834652228237, + "grad_norm": 0.6824327111244202, + "learning_rate": 2.1861260175994203e-05, + "loss": 0.1443, + "step": 8450 + }, + { + "epoch": 1.7648896293211163, + "grad_norm": 0.2100004255771637, + "learning_rate": 2.1711071178088633e-05, + "loss": 0.1741, + "step": 8475 + }, + { + "epoch": 1.7700957934194086, + "grad_norm": 0.46001136302948, + "learning_rate": 2.1561002874790662e-05, + "loss": 0.2116, + "step": 8500 + }, + { + "epoch": 1.775301957517701, + "grad_norm": 7.581360816955566, + "learning_rate": 2.1411060773192704e-05, + "loss": 0.1379, + "step": 8525 + }, + { + "epoch": 1.7805081216159935, + "grad_norm": 7.0736846923828125, + "learning_rate": 2.126125037575594e-05, + "loss": 0.1403, + "step": 8550 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.7845435738563538, + "learning_rate": 2.1111577180108343e-05, + "loss": 0.1406, + "step": 8575 + }, + { + "epoch": 1.7909204498125781, + "grad_norm": 0.2352413535118103, + "learning_rate": 2.0962046678842974e-05, + "loss": 0.1425, + "step": 8600 + }, + { + "epoch": 1.7961266139108705, + "grad_norm": 0.2534882426261902, + "learning_rate": 2.08126643593164e-05, + "loss": 0.2827, + "step": 8625 + }, + { + "epoch": 1.8013327780091628, + "grad_norm": 0.015448580496013165, + "learning_rate": 2.066343570344733e-05, + "loss": 0.2161, + "step": 8650 + }, + { + "epoch": 1.8065389421074554, + "grad_norm": 11.641529083251953, + "learning_rate": 2.0514366187515444e-05, + "loss": 0.141, + "step": 8675 + }, + { + "epoch": 1.8117451062057475, + "grad_norm": 8.481783866882324, + "learning_rate": 2.036546128196041e-05, + "loss": 0.1693, + "step": 8700 + }, + { + "epoch": 1.81695127030404, + "grad_norm": 0.046340491622686386, + "learning_rate": 2.021672645118118e-05, + "loss": 0.1685, + "step": 8725 + }, + { + "epoch": 1.8221574344023324, + "grad_norm": 0.02451934851706028, + "learning_rate": 2.0068167153335388e-05, + "loss": 0.2014, + "step": 8750 + }, + { + "epoch": 1.8273635985006247, + "grad_norm": 0.008840017020702362, + "learning_rate": 1.991978884013914e-05, + "loss": 0.139, + "step": 8775 + }, + { + "epoch": 1.8325697625989172, + "grad_norm": 0.03715880587697029, + "learning_rate": 1.9771596956666876e-05, + "loss": 0.1835, + "step": 8800 + }, + { + "epoch": 1.8377759266972093, + "grad_norm": 0.052263110876083374, + "learning_rate": 1.9623596941151584e-05, + "loss": 0.1522, + "step": 8825 + }, + { + "epoch": 1.842982090795502, + "grad_norm": 0.14420367777347565, + "learning_rate": 1.9475794224785242e-05, + "loss": 0.1649, + "step": 8850 + }, + { + "epoch": 1.8481882548937942, + "grad_norm": 5.102858066558838, + "learning_rate": 1.9328194231519464e-05, + "loss": 0.1966, + "step": 8875 + }, + { + "epoch": 1.8533944189920866, + "grad_norm": 2.9605753421783447, + "learning_rate": 1.918080237786651e-05, + "loss": 0.1057, + "step": 8900 + }, + { + "epoch": 1.8586005830903791, + "grad_norm": 0.06543917208909988, + "learning_rate": 1.9033624072700505e-05, + "loss": 0.2098, + "step": 8925 + }, + { + "epoch": 1.8638067471886712, + "grad_norm": 4.984257221221924, + "learning_rate": 1.8886664717058894e-05, + "loss": 0.2006, + "step": 8950 + }, + { + "epoch": 1.8690129112869638, + "grad_norm": 0.3344804048538208, + "learning_rate": 1.8739929703944314e-05, + "loss": 0.1492, + "step": 8975 + }, + { + "epoch": 1.8742190753852561, + "grad_norm": 6.675142765045166, + "learning_rate": 1.859342441812662e-05, + "loss": 0.1568, + "step": 9000 + }, + { + "epoch": 1.8794252394835484, + "grad_norm": 13.291411399841309, + "learning_rate": 1.8447154235945336e-05, + "loss": 0.2015, + "step": 9025 + }, + { + "epoch": 1.884631403581841, + "grad_norm": 0.030354047194123268, + "learning_rate": 1.8301124525112325e-05, + "loss": 0.1201, + "step": 9050 + }, + { + "epoch": 1.8898375676801331, + "grad_norm": 11.661486625671387, + "learning_rate": 1.8155340644514798e-05, + "loss": 0.1744, + "step": 9075 + }, + { + "epoch": 1.8950437317784257, + "grad_norm": 9.253254890441895, + "learning_rate": 1.8009807944018715e-05, + "loss": 0.1305, + "step": 9100 + }, + { + "epoch": 1.900249895876718, + "grad_norm": 0.049124184995889664, + "learning_rate": 1.7864531764272356e-05, + "loss": 0.225, + "step": 9125 + }, + { + "epoch": 1.9054560599750103, + "grad_norm": 10.553680419921875, + "learning_rate": 1.7719517436510462e-05, + "loss": 0.213, + "step": 9150 + }, + { + "epoch": 1.9106622240733029, + "grad_norm": 0.042677950114011765, + "learning_rate": 1.7574770282358505e-05, + "loss": 0.1058, + "step": 9175 + }, + { + "epoch": 1.9158683881715952, + "grad_norm": 0.8776458501815796, + "learning_rate": 1.7430295613637404e-05, + "loss": 0.1907, + "step": 9200 + }, + { + "epoch": 1.9210745522698875, + "grad_norm": 9.908956527709961, + "learning_rate": 1.7286098732168644e-05, + "loss": 0.1863, + "step": 9225 + }, + { + "epoch": 1.9262807163681799, + "grad_norm": 1.294067144393921, + "learning_rate": 1.7142184929579662e-05, + "loss": 0.1053, + "step": 9250 + }, + { + "epoch": 1.9314868804664722, + "grad_norm": 5.95350980758667, + "learning_rate": 1.6998559487109693e-05, + "loss": 0.1541, + "step": 9275 + }, + { + "epoch": 1.9366930445647648, + "grad_norm": 7.54666805267334, + "learning_rate": 1.6855227675415966e-05, + "loss": 0.1696, + "step": 9300 + }, + { + "epoch": 1.941899208663057, + "grad_norm": 0.0091070756316185, + "learning_rate": 1.671219475438024e-05, + "loss": 0.212, + "step": 9325 + }, + { + "epoch": 1.9471053727613494, + "grad_norm": 0.05602679401636124, + "learning_rate": 1.656946597291584e-05, + "loss": 0.133, + "step": 9350 + }, + { + "epoch": 1.9523115368596418, + "grad_norm": 0.04069928824901581, + "learning_rate": 1.6427046568774977e-05, + "loss": 0.1599, + "step": 9375 + }, + { + "epoch": 1.957517700957934, + "grad_norm": 0.21093979477882385, + "learning_rate": 1.628494176835661e-05, + "loss": 0.1432, + "step": 9400 + }, + { + "epoch": 1.9627238650562266, + "grad_norm": 6.424279689788818, + "learning_rate": 1.614315678651457e-05, + "loss": 0.1286, + "step": 9425 + }, + { + "epoch": 1.967930029154519, + "grad_norm": 8.281681060791016, + "learning_rate": 1.6001696826366247e-05, + "loss": 0.0896, + "step": 9450 + }, + { + "epoch": 1.9731361932528113, + "grad_norm": 3.2924630641937256, + "learning_rate": 1.5860567079101657e-05, + "loss": 0.2068, + "step": 9475 + }, + { + "epoch": 1.9783423573511039, + "grad_norm": 0.10305780172348022, + "learning_rate": 1.5719772723792857e-05, + "loss": 0.133, + "step": 9500 + }, + { + "epoch": 1.983548521449396, + "grad_norm": 0.013987602666020393, + "learning_rate": 1.5579318927204003e-05, + "loss": 0.1602, + "step": 9525 + }, + { + "epoch": 1.9887546855476885, + "grad_norm": 8.741622924804688, + "learning_rate": 1.5439210843601647e-05, + "loss": 0.1248, + "step": 9550 + }, + { + "epoch": 1.9939608496459809, + "grad_norm": 7.777089595794678, + "learning_rate": 1.5299453614565646e-05, + "loss": 0.1656, + "step": 9575 + }, + { + "epoch": 1.9991670137442732, + "grad_norm": 7.620024681091309, + "learning_rate": 1.5160052368800466e-05, + "loss": 0.1867, + "step": 9600 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9082910321489002, + "eval_f1_macro": 0.8934973914545552, + "eval_f1_micro": 0.9082910321489002, + "eval_f1_weighted": 0.8990272847487176, + "eval_loss": 0.23952987790107727, + "eval_precision_macro": 0.9212318034319655, + "eval_precision_micro": 0.9082910321489002, + "eval_precision_weighted": 0.9229994939426459, + "eval_recall_macro": 0.9014322820037105, + "eval_recall_micro": 0.9082910321489002, + "eval_recall_weighted": 0.9082910321489002, + "eval_runtime": 16.0892, + "eval_samples_per_second": 918.319, + "eval_steps_per_second": 57.43, + "step": 9604 + }, + { + "epoch": 2.0043731778425657, + "grad_norm": 0.036709289997816086, + "learning_rate": 1.5021012221946956e-05, + "loss": 0.1042, + "step": 9625 + }, + { + "epoch": 2.009579341940858, + "grad_norm": 0.03700362890958786, + "learning_rate": 1.4882338276394644e-05, + "loss": 0.1846, + "step": 9650 + }, + { + "epoch": 2.0147855060391504, + "grad_norm": 0.2605753540992737, + "learning_rate": 1.4744035621094468e-05, + "loss": 0.1531, + "step": 9675 + }, + { + "epoch": 2.0199916701374425, + "grad_norm": 1.4101078510284424, + "learning_rate": 1.460610933137206e-05, + "loss": 0.1186, + "step": 9700 + }, + { + "epoch": 2.025197834235735, + "grad_norm": 7.530198097229004, + "learning_rate": 1.4468564468741464e-05, + "loss": 0.1534, + "step": 9725 + }, + { + "epoch": 2.0304039983340276, + "grad_norm": 0.06903871148824692, + "learning_rate": 1.433140608071939e-05, + "loss": 0.2258, + "step": 9750 + }, + { + "epoch": 2.0356101624323197, + "grad_norm": 0.9064328670501709, + "learning_rate": 1.4194639200640023e-05, + "loss": 0.1681, + "step": 9775 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.04811558499932289, + "learning_rate": 1.4058268847470241e-05, + "loss": 0.1343, + "step": 9800 + }, + { + "epoch": 2.0460224906289044, + "grad_norm": 5.909759998321533, + "learning_rate": 1.3922300025625539e-05, + "loss": 0.0999, + "step": 9825 + }, + { + "epoch": 2.051228654727197, + "grad_norm": 1.272133708000183, + "learning_rate": 1.37867377247863e-05, + "loss": 0.1376, + "step": 9850 + }, + { + "epoch": 2.0564348188254895, + "grad_norm": 3.806577682495117, + "learning_rate": 1.3651586919714671e-05, + "loss": 0.158, + "step": 9875 + }, + { + "epoch": 2.0616409829237816, + "grad_norm": 1.572864294052124, + "learning_rate": 1.3516852570072086e-05, + "loss": 0.1046, + "step": 9900 + }, + { + "epoch": 2.066847147022074, + "grad_norm": 0.07213819772005081, + "learning_rate": 1.33825396202372e-05, + "loss": 0.142, + "step": 9925 + }, + { + "epoch": 2.0720533111203667, + "grad_norm": 6.35291862487793, + "learning_rate": 1.324865299912445e-05, + "loss": 0.1243, + "step": 9950 + }, + { + "epoch": 2.077259475218659, + "grad_norm": 11.946340560913086, + "learning_rate": 1.31151976200032e-05, + "loss": 0.1512, + "step": 9975 + }, + { + "epoch": 2.0824656393169514, + "grad_norm": 0.20692221820354462, + "learning_rate": 1.2982178380317416e-05, + "loss": 0.137, + "step": 10000 + }, + { + "epoch": 2.0876718034152435, + "grad_norm": 0.24739764630794525, + "learning_rate": 1.2849600161505957e-05, + "loss": 0.1535, + "step": 10025 + }, + { + "epoch": 2.092877967513536, + "grad_norm": 0.6494514346122742, + "learning_rate": 1.2717467828823409e-05, + "loss": 0.1838, + "step": 10050 + }, + { + "epoch": 2.0980841316118286, + "grad_norm": 13.001143455505371, + "learning_rate": 1.25857862311616e-05, + "loss": 0.1973, + "step": 10075 + }, + { + "epoch": 2.1032902957101207, + "grad_norm": 2.27883243560791, + "learning_rate": 1.2454560200871612e-05, + "loss": 0.2116, + "step": 10100 + }, + { + "epoch": 2.1084964598084133, + "grad_norm": 14.335226058959961, + "learning_rate": 1.2323794553586462e-05, + "loss": 0.1645, + "step": 10125 + }, + { + "epoch": 2.1137026239067054, + "grad_norm": 0.5073531866073608, + "learning_rate": 1.2193494088044399e-05, + "loss": 0.2067, + "step": 10150 + }, + { + "epoch": 2.118908788004998, + "grad_norm": 10.126875877380371, + "learning_rate": 1.206366358591274e-05, + "loss": 0.2034, + "step": 10175 + }, + { + "epoch": 2.1241149521032905, + "grad_norm": 0.4860388934612274, + "learning_rate": 1.1934307811612502e-05, + "loss": 0.1298, + "step": 10200 + }, + { + "epoch": 2.1293211162015826, + "grad_norm": 11.252408981323242, + "learning_rate": 1.1805431512143473e-05, + "loss": 0.2134, + "step": 10225 + }, + { + "epoch": 2.134527280299875, + "grad_norm": 0.6547976732254028, + "learning_rate": 1.1677039416910018e-05, + "loss": 0.2518, + "step": 10250 + }, + { + "epoch": 2.1397334443981673, + "grad_norm": 0.02918720245361328, + "learning_rate": 1.1549136237547577e-05, + "loss": 0.1141, + "step": 10275 + }, + { + "epoch": 2.14493960849646, + "grad_norm": 2.0306875705718994, + "learning_rate": 1.1421726667749714e-05, + "loss": 0.2479, + "step": 10300 + }, + { + "epoch": 2.1501457725947524, + "grad_norm": 3.7051842212677, + "learning_rate": 1.1294815383095877e-05, + "loss": 0.1666, + "step": 10325 + }, + { + "epoch": 2.1553519366930445, + "grad_norm": 7.603741645812988, + "learning_rate": 1.1168407040879842e-05, + "loss": 0.1395, + "step": 10350 + }, + { + "epoch": 2.160558100791337, + "grad_norm": 0.0051424442790448666, + "learning_rate": 1.1042506279938771e-05, + "loss": 0.1033, + "step": 10375 + }, + { + "epoch": 2.165764264889629, + "grad_norm": 0.43630650639533997, + "learning_rate": 1.0917117720483014e-05, + "loss": 0.061, + "step": 10400 + }, + { + "epoch": 2.1709704289879217, + "grad_norm": 0.06688734143972397, + "learning_rate": 1.0792245963926504e-05, + "loss": 0.1547, + "step": 10425 + }, + { + "epoch": 2.1761765930862143, + "grad_norm": 0.005384028889238834, + "learning_rate": 1.0667895592717969e-05, + "loss": 0.1822, + "step": 10450 + }, + { + "epoch": 2.1813827571845064, + "grad_norm": 6.822201251983643, + "learning_rate": 1.0544071170172723e-05, + "loss": 0.0977, + "step": 10475 + }, + { + "epoch": 2.186588921282799, + "grad_norm": 0.17409928143024445, + "learning_rate": 1.0420777240305213e-05, + "loss": 0.1472, + "step": 10500 + }, + { + "epoch": 2.191795085381091, + "grad_norm": 4.65806245803833, + "learning_rate": 1.0298018327662284e-05, + "loss": 0.1499, + "step": 10525 + }, + { + "epoch": 2.1970012494793836, + "grad_norm": 0.050723303109407425, + "learning_rate": 1.0175798937157085e-05, + "loss": 0.1228, + "step": 10550 + }, + { + "epoch": 2.202207413577676, + "grad_norm": 7.599793434143066, + "learning_rate": 1.0054123553903849e-05, + "loss": 0.1122, + "step": 10575 + }, + { + "epoch": 2.2074135776759682, + "grad_norm": 0.2461252212524414, + "learning_rate": 9.93299664305322e-06, + "loss": 0.0847, + "step": 10600 + }, + { + "epoch": 2.212619741774261, + "grad_norm": 0.014030307531356812, + "learning_rate": 9.812422649628388e-06, + "loss": 0.1286, + "step": 10625 + }, + { + "epoch": 2.217825905872553, + "grad_norm": 0.010962835513055325, + "learning_rate": 9.692405998362045e-06, + "loss": 0.0591, + "step": 10650 + }, + { + "epoch": 2.2230320699708455, + "grad_norm": 0.009474786929786205, + "learning_rate": 9.572951093533946e-06, + "loss": 0.1037, + "step": 10675 + }, + { + "epoch": 2.228238234069138, + "grad_norm": 0.02288031578063965, + "learning_rate": 9.454062318809313e-06, + "loss": 0.1174, + "step": 10700 + }, + { + "epoch": 2.23344439816743, + "grad_norm": 10.316886901855469, + "learning_rate": 9.335744037077918e-06, + "loss": 0.1019, + "step": 10725 + }, + { + "epoch": 2.2386505622657227, + "grad_norm": 3.191359758377075, + "learning_rate": 9.218000590294072e-06, + "loss": 0.0714, + "step": 10750 + }, + { + "epoch": 2.243856726364015, + "grad_norm": 0.9904782176017761, + "learning_rate": 9.100836299317201e-06, + "loss": 0.1085, + "step": 10775 + }, + { + "epoch": 2.2490628904623073, + "grad_norm": 0.00978647731244564, + "learning_rate": 8.9842554637533e-06, + "loss": 0.1005, + "step": 10800 + }, + { + "epoch": 2.2542690545606, + "grad_norm": 0.5314901471138, + "learning_rate": 8.868262361797181e-06, + "loss": 0.1389, + "step": 10825 + }, + { + "epoch": 2.259475218658892, + "grad_norm": 2.777710437774658, + "learning_rate": 8.75286125007545e-06, + "loss": 0.1211, + "step": 10850 + }, + { + "epoch": 2.2646813827571846, + "grad_norm": 9.188045501708984, + "learning_rate": 8.638056363490302e-06, + "loss": 0.1837, + "step": 10875 + }, + { + "epoch": 2.269887546855477, + "grad_norm": 8.704696655273438, + "learning_rate": 8.523851915064129e-06, + "loss": 0.1512, + "step": 10900 + }, + { + "epoch": 2.275093710953769, + "grad_norm": 11.265883445739746, + "learning_rate": 8.410252095784866e-06, + "loss": 0.1485, + "step": 10925 + }, + { + "epoch": 2.2802998750520618, + "grad_norm": 8.006060600280762, + "learning_rate": 8.297261074452281e-06, + "loss": 0.1927, + "step": 10950 + }, + { + "epoch": 2.285506039150354, + "grad_norm": 6.185884475708008, + "learning_rate": 8.184882997524884e-06, + "loss": 0.0961, + "step": 10975 + }, + { + "epoch": 2.2907122032486464, + "grad_norm": 8.097982406616211, + "learning_rate": 8.073121988967849e-06, + "loss": 0.1119, + "step": 11000 + }, + { + "epoch": 2.295918367346939, + "grad_norm": 6.152439117431641, + "learning_rate": 7.961982150101643e-06, + "loss": 0.116, + "step": 11025 + }, + { + "epoch": 2.301124531445231, + "grad_norm": 4.259856224060059, + "learning_rate": 7.851467559451508e-06, + "loss": 0.1936, + "step": 11050 + }, + { + "epoch": 2.3063306955435237, + "grad_norm": 3.0448789596557617, + "learning_rate": 7.741582272597823e-06, + "loss": 0.1005, + "step": 11075 + }, + { + "epoch": 2.3115368596418158, + "grad_norm": 0.018132351338863373, + "learning_rate": 7.632330322027212e-06, + "loss": 0.1356, + "step": 11100 + }, + { + "epoch": 2.3167430237401083, + "grad_norm": 12.663254737854004, + "learning_rate": 7.523715716984661e-06, + "loss": 0.1243, + "step": 11125 + }, + { + "epoch": 2.321949187838401, + "grad_norm": 0.21571685373783112, + "learning_rate": 7.415742443326309e-06, + "loss": 0.1938, + "step": 11150 + }, + { + "epoch": 2.327155351936693, + "grad_norm": 0.023954235017299652, + "learning_rate": 7.308414463373189e-06, + "loss": 0.1548, + "step": 11175 + }, + { + "epoch": 2.3323615160349855, + "grad_norm": 0.010295159183442593, + "learning_rate": 7.2017357157658585e-06, + "loss": 0.145, + "step": 11200 + }, + { + "epoch": 2.3375676801332776, + "grad_norm": 0.6988667845726013, + "learning_rate": 7.095710115319831e-06, + "loss": 0.0901, + "step": 11225 + }, + { + "epoch": 2.34277384423157, + "grad_norm": 4.98328161239624, + "learning_rate": 6.99034155288193e-06, + "loss": 0.1512, + "step": 11250 + }, + { + "epoch": 2.3479800083298628, + "grad_norm": 1.8333989381790161, + "learning_rate": 6.885633895187493e-06, + "loss": 0.152, + "step": 11275 + }, + { + "epoch": 2.353186172428155, + "grad_norm": 0.01870441809296608, + "learning_rate": 6.781590984718461e-06, + "loss": 0.2451, + "step": 11300 + }, + { + "epoch": 2.3583923365264474, + "grad_norm": 0.03275001421570778, + "learning_rate": 6.678216639562429e-06, + "loss": 0.1564, + "step": 11325 + }, + { + "epoch": 2.3635985006247395, + "grad_norm": 0.021069686859846115, + "learning_rate": 6.5755146532724495e-06, + "loss": 0.1644, + "step": 11350 + }, + { + "epoch": 2.368804664723032, + "grad_norm": 5.0085577964782715, + "learning_rate": 6.473488794727878e-06, + "loss": 0.1459, + "step": 11375 + }, + { + "epoch": 2.3740108288213246, + "grad_norm": 8.764829635620117, + "learning_rate": 6.372142807996051e-06, + "loss": 0.1538, + "step": 11400 + }, + { + "epoch": 2.3792169929196167, + "grad_norm": 10.779679298400879, + "learning_rate": 6.27148041219488e-06, + "loss": 0.163, + "step": 11425 + }, + { + "epoch": 2.3844231570179093, + "grad_norm": 0.08707818388938904, + "learning_rate": 6.17150530135639e-06, + "loss": 0.1641, + "step": 11450 + }, + { + "epoch": 2.3896293211162014, + "grad_norm": 0.5683345198631287, + "learning_rate": 6.07222114429111e-06, + "loss": 0.1532, + "step": 11475 + }, + { + "epoch": 2.394835485214494, + "grad_norm": 6.6075639724731445, + "learning_rate": 5.9736315844535235e-06, + "loss": 0.167, + "step": 11500 + }, + { + "epoch": 2.4000416493127865, + "grad_norm": 0.18501617014408112, + "learning_rate": 5.875740239808283e-06, + "loss": 0.1036, + "step": 11525 + }, + { + "epoch": 2.4052478134110786, + "grad_norm": 0.05444851890206337, + "learning_rate": 5.778550702697463e-06, + "loss": 0.1271, + "step": 11550 + }, + { + "epoch": 2.410453977509371, + "grad_norm": 4.797399044036865, + "learning_rate": 5.682066539708763e-06, + "loss": 0.1147, + "step": 11575 + }, + { + "epoch": 2.4156601416076633, + "grad_norm": 1.1078044176101685, + "learning_rate": 5.586291291544585e-06, + "loss": 0.1421, + "step": 11600 + }, + { + "epoch": 2.420866305705956, + "grad_norm": 0.07829868793487549, + "learning_rate": 5.491228472892118e-06, + "loss": 0.1598, + "step": 11625 + }, + { + "epoch": 2.4260724698042484, + "grad_norm": 3.3450565338134766, + "learning_rate": 5.396881572294363e-06, + "loss": 0.1359, + "step": 11650 + }, + { + "epoch": 2.4312786339025405, + "grad_norm": 0.017295390367507935, + "learning_rate": 5.303254052022075e-06, + "loss": 0.0646, + "step": 11675 + }, + { + "epoch": 2.436484798000833, + "grad_norm": 13.109850883483887, + "learning_rate": 5.210349347946783e-06, + "loss": 0.2058, + "step": 11700 + }, + { + "epoch": 2.441690962099125, + "grad_norm": 0.01693105697631836, + "learning_rate": 5.118170869414618e-06, + "loss": 0.1465, + "step": 11725 + }, + { + "epoch": 2.4468971261974177, + "grad_norm": 0.01354676578193903, + "learning_rate": 5.026721999121256e-06, + "loss": 0.0848, + "step": 11750 + }, + { + "epoch": 2.4521032902957103, + "grad_norm": 0.5930526852607727, + "learning_rate": 4.9360060929877636e-06, + "loss": 0.1185, + "step": 11775 + }, + { + "epoch": 2.4573094543940024, + "grad_norm": 12.73507308959961, + "learning_rate": 4.846026480037444e-06, + "loss": 0.1434, + "step": 11800 + }, + { + "epoch": 2.462515618492295, + "grad_norm": 6.715247631072998, + "learning_rate": 4.7567864622736824e-06, + "loss": 0.1317, + "step": 11825 + }, + { + "epoch": 2.467721782590587, + "grad_norm": 0.002648524707183242, + "learning_rate": 4.668289314558735e-06, + "loss": 0.1621, + "step": 11850 + }, + { + "epoch": 2.4729279466888796, + "grad_norm": 8.284907341003418, + "learning_rate": 4.580538284493616e-06, + "loss": 0.1486, + "step": 11875 + }, + { + "epoch": 2.478134110787172, + "grad_norm": 0.4824686348438263, + "learning_rate": 4.493536592298864e-06, + "loss": 0.192, + "step": 11900 + }, + { + "epoch": 2.4833402748854643, + "grad_norm": 6.810537338256836, + "learning_rate": 4.407287430696372e-06, + "loss": 0.1059, + "step": 11925 + }, + { + "epoch": 2.488546438983757, + "grad_norm": 5.896641731262207, + "learning_rate": 4.321793964792262e-06, + "loss": 0.0996, + "step": 11950 + }, + { + "epoch": 2.493752603082049, + "grad_norm": 10.901542663574219, + "learning_rate": 4.237059331960694e-06, + "loss": 0.1538, + "step": 11975 + }, + { + "epoch": 2.4989587671803415, + "grad_norm": 5.155703067779541, + "learning_rate": 4.153086641728765e-06, + "loss": 0.1304, + "step": 12000 + }, + { + "epoch": 2.504164931278634, + "grad_norm": 0.00832182727754116, + "learning_rate": 4.069878975662358e-06, + "loss": 0.1498, + "step": 12025 + }, + { + "epoch": 2.509371095376926, + "grad_norm": 8.369385719299316, + "learning_rate": 3.9874393872531e-06, + "loss": 0.2222, + "step": 12050 + }, + { + "epoch": 2.5145772594752187, + "grad_norm": 3.448890209197998, + "learning_rate": 3.905770901806299e-06, + "loss": 0.1876, + "step": 12075 + }, + { + "epoch": 2.519783423573511, + "grad_norm": 1.8173445463180542, + "learning_rate": 3.824876516329881e-06, + "loss": 0.0964, + "step": 12100 + }, + { + "epoch": 2.5249895876718034, + "grad_norm": 8.964319229125977, + "learning_rate": 3.7447591994244635e-06, + "loss": 0.1555, + "step": 12125 + }, + { + "epoch": 2.530195751770096, + "grad_norm": 1.7448962926864624, + "learning_rate": 3.6654218911743892e-06, + "loss": 0.1877, + "step": 12150 + }, + { + "epoch": 2.535401915868388, + "grad_norm": 0.03552517667412758, + "learning_rate": 3.5868675030398286e-06, + "loss": 0.1345, + "step": 12175 + }, + { + "epoch": 2.5406080799666806, + "grad_norm": 0.007262797094881535, + "learning_rate": 3.509098917749962e-06, + "loss": 0.1211, + "step": 12200 + }, + { + "epoch": 2.5458142440649727, + "grad_norm": 6.443802833557129, + "learning_rate": 3.43211898919715e-06, + "loss": 0.0961, + "step": 12225 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 3.5260281562805176, + "learning_rate": 3.3559305423322503e-06, + "loss": 0.1721, + "step": 12250 + }, + { + "epoch": 2.556226572261558, + "grad_norm": 2.5505025386810303, + "learning_rate": 3.280536373060919e-06, + "loss": 0.1488, + "step": 12275 + }, + { + "epoch": 2.56143273635985, + "grad_norm": 12.06968879699707, + "learning_rate": 3.2059392481410155e-06, + "loss": 0.145, + "step": 12300 + }, + { + "epoch": 2.5666389004581425, + "grad_norm": 0.0032713667023926973, + "learning_rate": 3.132141905081076e-06, + "loss": 0.1333, + "step": 12325 + }, + { + "epoch": 2.5718450645564346, + "grad_norm": 10.196958541870117, + "learning_rate": 3.0591470520398513e-06, + "loss": 0.1732, + "step": 12350 + }, + { + "epoch": 2.577051228654727, + "grad_norm": 0.015713948756456375, + "learning_rate": 2.9869573677269254e-06, + "loss": 0.1367, + "step": 12375 + }, + { + "epoch": 2.5822573927530197, + "grad_norm": 0.21436667442321777, + "learning_rate": 2.915575501304396e-06, + "loss": 0.235, + "step": 12400 + }, + { + "epoch": 2.587463556851312, + "grad_norm": 0.0038804244250059128, + "learning_rate": 2.845004072289695e-06, + "loss": 0.0992, + "step": 12425 + }, + { + "epoch": 2.5926697209496044, + "grad_norm": 6.616683483123779, + "learning_rate": 2.775245670459439e-06, + "loss": 0.1481, + "step": 12450 + }, + { + "epoch": 2.5978758850478965, + "grad_norm": 0.006720269098877907, + "learning_rate": 2.7063028557543753e-06, + "loss": 0.12, + "step": 12475 + }, + { + "epoch": 2.603082049146189, + "grad_norm": 1.3217498064041138, + "learning_rate": 2.638178158185467e-06, + "loss": 0.0997, + "step": 12500 + }, + { + "epoch": 2.6082882132444816, + "grad_norm": 13.17971420288086, + "learning_rate": 2.570874077741034e-06, + "loss": 0.2045, + "step": 12525 + }, + { + "epoch": 2.6134943773427737, + "grad_norm": 15.589527130126953, + "learning_rate": 2.504393084295015e-06, + "loss": 0.1074, + "step": 12550 + }, + { + "epoch": 2.6187005414410662, + "grad_norm": 1.773633360862732, + "learning_rate": 2.438737617516332e-06, + "loss": 0.1416, + "step": 12575 + }, + { + "epoch": 2.6239067055393583, + "grad_norm": 7.001961708068848, + "learning_rate": 2.373910086779338e-06, + "loss": 0.1821, + "step": 12600 + }, + { + "epoch": 2.629112869637651, + "grad_norm": 7.044321060180664, + "learning_rate": 2.309912871075445e-06, + "loss": 0.1201, + "step": 12625 + }, + { + "epoch": 2.6343190337359434, + "grad_norm": 8.048465728759766, + "learning_rate": 2.246748318925779e-06, + "loss": 0.167, + "step": 12650 + }, + { + "epoch": 2.639525197834236, + "grad_norm": 6.150434494018555, + "learning_rate": 2.1844187482950225e-06, + "loss": 0.1569, + "step": 12675 + }, + { + "epoch": 2.644731361932528, + "grad_norm": 1.490867018699646, + "learning_rate": 2.122926446506332e-06, + "loss": 0.0887, + "step": 12700 + }, + { + "epoch": 2.6499375260308202, + "grad_norm": 6.9760003089904785, + "learning_rate": 2.0622736701574136e-06, + "loss": 0.0619, + "step": 12725 + }, + { + "epoch": 2.6551436901291128, + "grad_norm": 4.531062602996826, + "learning_rate": 2.0024626450377144e-06, + "loss": 0.1153, + "step": 12750 + }, + { + "epoch": 2.6603498542274053, + "grad_norm": 0.011586461216211319, + "learning_rate": 1.9434955660467184e-06, + "loss": 0.1071, + "step": 12775 + }, + { + "epoch": 2.665556018325698, + "grad_norm": 0.5682029128074646, + "learning_rate": 1.885374597113429e-06, + "loss": 0.0935, + "step": 12800 + }, + { + "epoch": 2.67076218242399, + "grad_norm": 0.14094886183738708, + "learning_rate": 1.8281018711169522e-06, + "loss": 0.0884, + "step": 12825 + }, + { + "epoch": 2.6759683465222825, + "grad_norm": 0.025353549048304558, + "learning_rate": 1.7716794898082034e-06, + "loss": 0.1083, + "step": 12850 + }, + { + "epoch": 2.6811745106205747, + "grad_norm": 10.48271656036377, + "learning_rate": 1.716109523732809e-06, + "loss": 0.1437, + "step": 12875 + }, + { + "epoch": 2.686380674718867, + "grad_norm": 3.716670513153076, + "learning_rate": 1.6613940121551014e-06, + "loss": 0.177, + "step": 12900 + }, + { + "epoch": 2.6915868388171598, + "grad_norm": 2.8707427978515625, + "learning_rate": 1.6075349629832954e-06, + "loss": 0.1107, + "step": 12925 + }, + { + "epoch": 2.696793002915452, + "grad_norm": 3.083693742752075, + "learning_rate": 1.554534352695808e-06, + "loss": 0.1544, + "step": 12950 + }, + { + "epoch": 2.7019991670137444, + "grad_norm": 0.012157919816672802, + "learning_rate": 1.5023941262686997e-06, + "loss": 0.1065, + "step": 12975 + }, + { + "epoch": 2.7072053311120365, + "grad_norm": 6.145320415496826, + "learning_rate": 1.4511161971043351e-06, + "loss": 0.0982, + "step": 13000 + }, + { + "epoch": 2.712411495210329, + "grad_norm": 0.1649736911058426, + "learning_rate": 1.4007024469611441e-06, + "loss": 0.1541, + "step": 13025 + }, + { + "epoch": 2.7176176593086216, + "grad_norm": 0.16660048067569733, + "learning_rate": 1.3511547258845763e-06, + "loss": 0.0752, + "step": 13050 + }, + { + "epoch": 2.7228238234069138, + "grad_norm": 5.491020679473877, + "learning_rate": 1.3024748521392017e-06, + "loss": 0.0893, + "step": 13075 + }, + { + "epoch": 2.7280299875052063, + "grad_norm": 6.0252790451049805, + "learning_rate": 1.2546646121419868e-06, + "loss": 0.1347, + "step": 13100 + }, + { + "epoch": 2.7332361516034984, + "grad_norm": 2.7115368843078613, + "learning_rate": 1.2077257603967523e-06, + "loss": 0.1261, + "step": 13125 + }, + { + "epoch": 2.738442315701791, + "grad_norm": 14.104537963867188, + "learning_rate": 1.1616600194297555e-06, + "loss": 0.1585, + "step": 13150 + }, + { + "epoch": 2.7436484798000835, + "grad_norm": 0.006517982110381126, + "learning_rate": 1.1164690797265148e-06, + "loss": 0.1143, + "step": 13175 + }, + { + "epoch": 2.7488546438983756, + "grad_norm": 0.00706452364102006, + "learning_rate": 1.0721545996697585e-06, + "loss": 0.1482, + "step": 13200 + }, + { + "epoch": 2.754060807996668, + "grad_norm": 0.16163934767246246, + "learning_rate": 1.0287182054785544e-06, + "loss": 0.1259, + "step": 13225 + }, + { + "epoch": 2.7592669720949603, + "grad_norm": 0.0032609994523227215, + "learning_rate": 9.861614911486462e-07, + "loss": 0.1311, + "step": 13250 + }, + { + "epoch": 2.764473136193253, + "grad_norm": 1.620920181274414, + "learning_rate": 9.444860183939669e-07, + "loss": 0.1162, + "step": 13275 + }, + { + "epoch": 2.7696793002915454, + "grad_norm": 5.577692031860352, + "learning_rate": 9.036933165893019e-07, + "loss": 0.1143, + "step": 13300 + }, + { + "epoch": 2.7748854643898375, + "grad_norm": 3.6756861209869385, + "learning_rate": 8.637848827141859e-07, + "loss": 0.1157, + "step": 13325 + }, + { + "epoch": 2.78009162848813, + "grad_norm": 8.014864921569824, + "learning_rate": 8.247621812979639e-07, + "loss": 0.1125, + "step": 13350 + }, + { + "epoch": 2.785297792586422, + "grad_norm": 0.027799520641565323, + "learning_rate": 7.866266443660397e-07, + "loss": 0.1147, + "step": 13375 + }, + { + "epoch": 2.7905039566847147, + "grad_norm": 0.20379126071929932, + "learning_rate": 7.493796713873346e-07, + "loss": 0.0776, + "step": 13400 + }, + { + "epoch": 2.7957101207830073, + "grad_norm": 5.621152400970459, + "learning_rate": 7.130226292229314e-07, + "loss": 0.1281, + "step": 13425 + }, + { + "epoch": 2.8009162848812994, + "grad_norm": 9.663355827331543, + "learning_rate": 6.775568520758863e-07, + "loss": 0.1409, + "step": 13450 + }, + { + "epoch": 2.806122448979592, + "grad_norm": 7.826466083526611, + "learning_rate": 6.429836414423212e-07, + "loss": 0.13, + "step": 13475 + }, + { + "epoch": 2.811328613077884, + "grad_norm": 0.012044396251440048, + "learning_rate": 6.093042660636095e-07, + "loss": 0.2328, + "step": 13500 + }, + { + "epoch": 2.8165347771761766, + "grad_norm": 2.4445865154266357, + "learning_rate": 5.765199618798456e-07, + "loss": 0.1251, + "step": 13525 + }, + { + "epoch": 2.821740941274469, + "grad_norm": 4.6207451820373535, + "learning_rate": 5.446319319844794e-07, + "loss": 0.1069, + "step": 13550 + }, + { + "epoch": 2.8269471053727613, + "grad_norm": 0.5208550691604614, + "learning_rate": 5.13641346580171e-07, + "loss": 0.0791, + "step": 13575 + }, + { + "epoch": 2.832153269471054, + "grad_norm": 0.03781859204173088, + "learning_rate": 4.835493429358462e-07, + "loss": 0.1112, + "step": 13600 + }, + { + "epoch": 2.837359433569346, + "grad_norm": 1.6827815771102905, + "learning_rate": 4.5435702534495915e-07, + "loss": 0.1123, + "step": 13625 + }, + { + "epoch": 2.8425655976676385, + "grad_norm": 2.481886863708496, + "learning_rate": 4.2606546508497103e-07, + "loss": 0.1228, + "step": 13650 + }, + { + "epoch": 2.847771761765931, + "grad_norm": 7.118264198303223, + "learning_rate": 3.9867570037803725e-07, + "loss": 0.1172, + "step": 13675 + }, + { + "epoch": 2.852977925864223, + "grad_norm": 8.65257453918457, + "learning_rate": 3.7218873635290195e-07, + "loss": 0.111, + "step": 13700 + }, + { + "epoch": 2.8581840899625157, + "grad_norm": 0.8659380674362183, + "learning_rate": 3.466055450080191e-07, + "loss": 0.0999, + "step": 13725 + }, + { + "epoch": 2.863390254060808, + "grad_norm": 5.575057506561279, + "learning_rate": 3.219270651758782e-07, + "loss": 0.1201, + "step": 13750 + }, + { + "epoch": 2.8685964181591004, + "grad_norm": 4.377348899841309, + "learning_rate": 2.9815420248855977e-07, + "loss": 0.1469, + "step": 13775 + }, + { + "epoch": 2.873802582257393, + "grad_norm": 3.4709956645965576, + "learning_rate": 2.7528782934449215e-07, + "loss": 0.1214, + "step": 13800 + }, + { + "epoch": 2.879008746355685, + "grad_norm": 0.008634321391582489, + "learning_rate": 2.533287848764332e-07, + "loss": 0.1104, + "step": 13825 + }, + { + "epoch": 2.8842149104539776, + "grad_norm": 5.85574197769165, + "learning_rate": 2.3227787492069718e-07, + "loss": 0.1284, + "step": 13850 + }, + { + "epoch": 2.8894210745522697, + "grad_norm": 2.434326648712158, + "learning_rate": 2.1213587198755936e-07, + "loss": 0.0968, + "step": 13875 + }, + { + "epoch": 2.8946272386505623, + "grad_norm": 3.5275282859802246, + "learning_rate": 1.929035152329145e-07, + "loss": 0.1454, + "step": 13900 + }, + { + "epoch": 2.899833402748855, + "grad_norm": 3.965625762939453, + "learning_rate": 1.7458151043116544e-07, + "loss": 0.1299, + "step": 13925 + }, + { + "epoch": 2.905039566847147, + "grad_norm": 0.10151717066764832, + "learning_rate": 1.5717052994929927e-07, + "loss": 0.1459, + "step": 13950 + }, + { + "epoch": 2.9102457309454395, + "grad_norm": 0.03408972918987274, + "learning_rate": 1.4067121272223204e-07, + "loss": 0.11, + "step": 13975 + }, + { + "epoch": 2.9154518950437316, + "grad_norm": 0.0022567359264940023, + "learning_rate": 1.2508416422935542e-07, + "loss": 0.1083, + "step": 14000 + }, + { + "epoch": 2.920658059142024, + "grad_norm": 7.95089054107666, + "learning_rate": 1.1040995647231545e-07, + "loss": 0.1088, + "step": 14025 + }, + { + "epoch": 2.9258642232403167, + "grad_norm": 12.033370018005371, + "learning_rate": 9.664912795402381e-08, + "loss": 0.1305, + "step": 14050 + }, + { + "epoch": 2.931070387338609, + "grad_norm": 0.22287705540657043, + "learning_rate": 8.380218365889592e-08, + "loss": 0.1694, + "step": 14075 + }, + { + "epoch": 2.9362765514369014, + "grad_norm": 0.011814435012638569, + "learning_rate": 7.186959503431845e-08, + "loss": 0.083, + "step": 14100 + }, + { + "epoch": 2.9414827155351935, + "grad_norm": 0.22026073932647705, + "learning_rate": 6.08517999733521e-08, + "loss": 0.0816, + "step": 14125 + }, + { + "epoch": 2.946688879633486, + "grad_norm": 6.017068386077881, + "learning_rate": 5.0749202798652806e-08, + "loss": 0.1286, + "step": 14150 + }, + { + "epoch": 2.9518950437317786, + "grad_norm": 9.714811325073242, + "learning_rate": 4.156217424765296e-08, + "loss": 0.1439, + "step": 14175 + }, + { + "epoch": 2.9571012078300707, + "grad_norm": 0.37943577766418457, + "learning_rate": 3.3291051458927966e-08, + "loss": 0.0811, + "step": 14200 + }, + { + "epoch": 2.9623073719283632, + "grad_norm": 0.007510739378631115, + "learning_rate": 2.5936137959856054e-08, + "loss": 0.1679, + "step": 14225 + }, + { + "epoch": 2.9675135360266554, + "grad_norm": 4.679372310638428, + "learning_rate": 1.9497703655455e-08, + "loss": 0.0982, + "step": 14250 + }, + { + "epoch": 2.972719700124948, + "grad_norm": 5.683784484863281, + "learning_rate": 1.39759848184845e-08, + "loss": 0.1231, + "step": 14275 + }, + { + "epoch": 2.9779258642232405, + "grad_norm": 13.094088554382324, + "learning_rate": 9.37118408078641e-09, + "loss": 0.118, + "step": 14300 + }, + { + "epoch": 2.9831320283215326, + "grad_norm": 7.190179347991943, + "learning_rate": 5.683470425832394e-09, + "loss": 0.1788, + "step": 14325 + }, + { + "epoch": 2.988338192419825, + "grad_norm": 8.636795997619629, + "learning_rate": 2.9129791825344145e-09, + "loss": 0.1482, + "step": 14350 + }, + { + "epoch": 2.9935443565181172, + "grad_norm": 5.166715145111084, + "learning_rate": 1.0598120202681695e-09, + "loss": 0.106, + "step": 14375 + }, + { + "epoch": 2.99875052061641, + "grad_norm": 6.336511611938477, + "learning_rate": 1.2403694515661368e-10, + "loss": 0.1234, + "step": 14400 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9105245346869713, + "eval_f1_macro": 0.8981344155061894, + "eval_f1_micro": 0.9105245346869713, + "eval_f1_weighted": 0.9032382559745838, + "eval_loss": 0.2098342627286911, + "eval_precision_macro": 0.9192533404916904, + "eval_precision_micro": 0.9105245346869713, + "eval_precision_weighted": 0.920957800948379, + "eval_recall_macro": 0.9038812615955473, + "eval_recall_micro": 0.9105245346869713, + "eval_recall_weighted": 0.9105245346869713, + "eval_runtime": 16.1158, + "eval_samples_per_second": 916.801, + "eval_steps_per_second": 57.335, + "step": 14406 + } + ], + "logging_steps": 25, + "max_steps": 14406, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3853101049724160.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}