{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.3869625520110958,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006934812760055479,
      "grad_norm": 61.60813903808594,
      "learning_rate": 8.650519031141869e-07,
      "loss": 2.7928,
      "mean_token_accuracy": 0.6783367753028869,
      "step": 5
    },
    {
      "epoch": 0.013869625520110958,
      "grad_norm": 35.546016693115234,
      "learning_rate": 1.7301038062283738e-06,
      "loss": 2.3942,
      "mean_token_accuracy": 0.6943186521530151,
      "step": 10
    },
    {
      "epoch": 0.020804438280166437,
      "grad_norm": 2.246945858001709,
      "learning_rate": 2.5951557093425604e-06,
      "loss": 1.202,
      "mean_token_accuracy": 0.7397322177886962,
      "step": 15
    },
    {
      "epoch": 0.027739251040221916,
      "grad_norm": 1.1429805755615234,
      "learning_rate": 3.4602076124567477e-06,
      "loss": 0.918,
      "mean_token_accuracy": 0.7564186692237854,
      "step": 20
    },
    {
      "epoch": 0.03467406380027739,
      "grad_norm": 0.9538511633872986,
      "learning_rate": 4.325259515570934e-06,
      "loss": 0.8104,
      "mean_token_accuracy": 0.7724308490753173,
      "step": 25
    },
    {
      "epoch": 0.04160887656033287,
      "grad_norm": 0.7069241404533386,
      "learning_rate": 5.190311418685121e-06,
      "loss": 0.7364,
      "mean_token_accuracy": 0.7827559828758239,
      "step": 30
    },
    {
      "epoch": 0.04854368932038835,
      "grad_norm": 0.4030636250972748,
      "learning_rate": 6.055363321799308e-06,
      "loss": 0.6835,
      "mean_token_accuracy": 0.7935511350631714,
      "step": 35
    },
    {
      "epoch": 0.05547850208044383,
      "grad_norm": 0.8142576217651367,
      "learning_rate": 6.920415224913495e-06,
      "loss": 0.6478,
      "mean_token_accuracy": 0.8010085463523865,
      "step": 40
    },
    {
      "epoch": 0.06241331484049931,
      "grad_norm": 0.2626665532588959,
      "learning_rate": 7.785467128027681e-06,
      "loss": 0.6267,
      "mean_token_accuracy": 0.8053073883056641,
      "step": 45
    },
    {
      "epoch": 0.06934812760055478,
      "grad_norm": 0.23942551016807556,
      "learning_rate": 8.650519031141868e-06,
      "loss": 0.6013,
      "mean_token_accuracy": 0.8112802267074585,
      "step": 50
    },
    {
      "epoch": 0.07628294036061026,
      "grad_norm": 0.20308136940002441,
      "learning_rate": 9.515570934256055e-06,
      "loss": 0.5769,
      "mean_token_accuracy": 0.8168688178062439,
      "step": 55
    },
    {
      "epoch": 0.08321775312066575,
      "grad_norm": 0.1854431927204132,
      "learning_rate": 1.0380622837370241e-05,
      "loss": 0.5805,
      "mean_token_accuracy": 0.815436840057373,
      "step": 60
    },
    {
      "epoch": 0.09015256588072122,
      "grad_norm": 0.1700541228055954,
      "learning_rate": 1.124567474048443e-05,
      "loss": 0.5652,
      "mean_token_accuracy": 0.8188095331192017,
      "step": 65
    },
    {
      "epoch": 0.0970873786407767,
      "grad_norm": 0.18573108315467834,
      "learning_rate": 1.2110726643598615e-05,
      "loss": 0.5524,
      "mean_token_accuracy": 0.8222507953643798,
      "step": 70
    },
    {
      "epoch": 0.10402219140083217,
      "grad_norm": 0.18843185901641846,
      "learning_rate": 1.2975778546712803e-05,
      "loss": 0.542,
      "mean_token_accuracy": 0.8249342203140259,
      "step": 75
    },
    {
      "epoch": 0.11095700416088766,
      "grad_norm": 0.21635942161083221,
      "learning_rate": 1.384083044982699e-05,
      "loss": 0.5401,
      "mean_token_accuracy": 0.8251730322837829,
      "step": 80
    },
    {
      "epoch": 0.11789181692094314,
      "grad_norm": 0.21325534582138062,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 0.5404,
      "mean_token_accuracy": 0.8243620276451111,
      "step": 85
    },
    {
      "epoch": 0.12482662968099861,
      "grad_norm": 0.22691610455513,
      "learning_rate": 1.5570934256055363e-05,
      "loss": 0.5278,
      "mean_token_accuracy": 0.8283108592033386,
      "step": 90
    },
    {
      "epoch": 0.1317614424410541,
      "grad_norm": 0.2375083714723587,
      "learning_rate": 1.643598615916955e-05,
      "loss": 0.5198,
      "mean_token_accuracy": 0.8296842217445374,
      "step": 95
    },
    {
      "epoch": 0.13869625520110956,
      "grad_norm": 0.27802157402038574,
      "learning_rate": 1.7301038062283735e-05,
      "loss": 0.5346,
      "mean_token_accuracy": 0.825625765323639,
      "step": 100
    },
    {
      "epoch": 0.14563106796116504,
      "grad_norm": 0.3691716194152832,
      "learning_rate": 1.8166089965397926e-05,
      "loss": 0.5249,
      "mean_token_accuracy": 0.828523588180542,
      "step": 105
    },
    {
      "epoch": 0.15256588072122051,
      "grad_norm": 0.30235254764556885,
      "learning_rate": 1.903114186851211e-05,
      "loss": 0.514,
      "mean_token_accuracy": 0.8320501446723938,
      "step": 110
    },
    {
      "epoch": 0.15950069348127602,
      "grad_norm": 0.3447076380252838,
      "learning_rate": 1.98961937716263e-05,
      "loss": 0.5203,
      "mean_token_accuracy": 0.8298335313796997,
      "step": 115
    },
    {
      "epoch": 0.1664355062413315,
      "grad_norm": 0.28185489773750305,
      "learning_rate": 2.0761245674740483e-05,
      "loss": 0.5305,
      "mean_token_accuracy": 0.8264262914657593,
      "step": 120
    },
    {
      "epoch": 0.17337031900138697,
      "grad_norm": 0.2629449963569641,
      "learning_rate": 2.1626297577854674e-05,
      "loss": 0.5167,
      "mean_token_accuracy": 0.8303680658340454,
      "step": 125
    },
    {
      "epoch": 0.18030513176144244,
      "grad_norm": 0.2788124680519104,
      "learning_rate": 2.249134948096886e-05,
      "loss": 0.5275,
      "mean_token_accuracy": 0.8274267673492431,
      "step": 130
    },
    {
      "epoch": 0.18723994452149792,
      "grad_norm": 0.24550440907478333,
      "learning_rate": 2.3356401384083046e-05,
      "loss": 0.5073,
      "mean_token_accuracy": 0.8328269720077515,
      "step": 135
    },
    {
      "epoch": 0.1941747572815534,
      "grad_norm": 0.7636565566062927,
      "learning_rate": 2.422145328719723e-05,
      "loss": 0.5138,
      "mean_token_accuracy": 0.8308726906776428,
      "step": 140
    },
    {
      "epoch": 0.20110957004160887,
      "grad_norm": 0.6163385510444641,
      "learning_rate": 2.508650519031142e-05,
      "loss": 0.5123,
      "mean_token_accuracy": 0.8311893105506897,
      "step": 145
    },
    {
      "epoch": 0.20804438280166435,
      "grad_norm": 0.3808706998825073,
      "learning_rate": 2.5951557093425606e-05,
      "loss": 0.5018,
      "mean_token_accuracy": 0.8343647360801697,
      "step": 150
    },
    {
      "epoch": 0.21497919556171982,
      "grad_norm": 0.2565021216869354,
      "learning_rate": 2.6816608996539794e-05,
      "loss": 0.4943,
      "mean_token_accuracy": 0.8362817883491516,
      "step": 155
    },
    {
      "epoch": 0.22191400832177532,
      "grad_norm": 0.3511087894439697,
      "learning_rate": 2.768166089965398e-05,
      "loss": 0.4933,
      "mean_token_accuracy": 0.8366880297660828,
      "step": 160
    },
    {
      "epoch": 0.2288488210818308,
      "grad_norm": 0.4006827175617218,
      "learning_rate": 2.8546712802768166e-05,
      "loss": 0.4906,
      "mean_token_accuracy": 0.837351131439209,
      "step": 165
    },
    {
      "epoch": 0.23578363384188628,
      "grad_norm": 0.4149070978164673,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 0.4988,
      "mean_token_accuracy": 0.8351827621459961,
      "step": 170
    },
    {
      "epoch": 0.24271844660194175,
      "grad_norm": 0.32881560921669006,
      "learning_rate": 3.0276816608996538e-05,
      "loss": 0.5085,
      "mean_token_accuracy": 0.831884253025055,
      "step": 175
    },
    {
      "epoch": 0.24965325936199723,
      "grad_norm": 0.46366971731185913,
      "learning_rate": 3.1141868512110726e-05,
      "loss": 0.4964,
      "mean_token_accuracy": 0.8355090618133545,
      "step": 180
    },
    {
      "epoch": 0.2565880721220527,
      "grad_norm": 0.838777482509613,
      "learning_rate": 3.200692041522492e-05,
      "loss": 0.5078,
      "mean_token_accuracy": 0.8320568442344666,
      "step": 185
    },
    {
      "epoch": 0.2635228848821082,
      "grad_norm": 30.57767677307129,
      "learning_rate": 3.28719723183391e-05,
      "loss": 0.4978,
      "mean_token_accuracy": 0.8360116839408874,
      "step": 190
    },
    {
      "epoch": 0.27045769764216365,
      "grad_norm": 0.5885879993438721,
      "learning_rate": 3.373702422145329e-05,
      "loss": 0.497,
      "mean_token_accuracy": 0.8356186389923096,
      "step": 195
    },
    {
      "epoch": 0.27739251040221913,
      "grad_norm": 0.3610420525074005,
      "learning_rate": 3.460207612456747e-05,
      "loss": 0.4989,
      "mean_token_accuracy": 0.8350513100624084,
      "step": 200
    },
    {
      "epoch": 0.2843273231622746,
      "grad_norm": 0.3954995572566986,
      "learning_rate": 3.546712802768166e-05,
      "loss": 0.5011,
      "mean_token_accuracy": 0.83415766954422,
      "step": 205
    },
    {
      "epoch": 0.2912621359223301,
      "grad_norm": 0.3071337342262268,
      "learning_rate": 3.633217993079585e-05,
      "loss": 0.5255,
      "mean_token_accuracy": 0.8285403490066529,
      "step": 210
    },
    {
      "epoch": 0.29819694868238555,
      "grad_norm": 0.31758391857147217,
      "learning_rate": 3.719723183391004e-05,
      "loss": 0.4954,
      "mean_token_accuracy": 0.835390031337738,
      "step": 215
    },
    {
      "epoch": 0.30513176144244103,
      "grad_norm": 0.3296087086200714,
      "learning_rate": 3.806228373702422e-05,
      "loss": 0.4923,
      "mean_token_accuracy": 0.8361375451087951,
      "step": 220
    },
    {
      "epoch": 0.3120665742024965,
      "grad_norm": 0.2980894446372986,
      "learning_rate": 3.892733564013841e-05,
      "loss": 0.4996,
      "mean_token_accuracy": 0.8341476917266846,
      "step": 225
    },
    {
      "epoch": 0.31900138696255204,
      "grad_norm": 0.2892495095729828,
      "learning_rate": 3.97923875432526e-05,
      "loss": 0.4855,
      "mean_token_accuracy": 0.8382086515426636,
      "step": 230
    },
    {
      "epoch": 0.3259361997226075,
      "grad_norm": 0.29287102818489075,
      "learning_rate": 4.065743944636679e-05,
      "loss": 0.4944,
      "mean_token_accuracy": 0.8353524923324585,
      "step": 235
    },
    {
      "epoch": 0.332871012482663,
      "grad_norm": 0.28245487809181213,
      "learning_rate": 4.1522491349480966e-05,
      "loss": 0.4887,
      "mean_token_accuracy": 0.8373544692993165,
      "step": 240
    },
    {
      "epoch": 0.33980582524271846,
      "grad_norm": 0.23551802337169647,
      "learning_rate": 4.238754325259516e-05,
      "loss": 0.4925,
      "mean_token_accuracy": 0.8361364006996155,
      "step": 245
    },
    {
      "epoch": 0.34674063800277394,
      "grad_norm": 0.24266427755355835,
      "learning_rate": 4.325259515570935e-05,
      "loss": 0.4759,
      "mean_token_accuracy": 0.8410738468170166,
      "step": 250
    },
    {
      "epoch": 0.3536754507628294,
      "grad_norm": 0.33316895365715027,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.4902,
      "mean_token_accuracy": 0.8370885252952576,
      "step": 255
    },
    {
      "epoch": 0.3606102635228849,
      "grad_norm": 0.5113539099693298,
      "learning_rate": 4.498269896193772e-05,
      "loss": 0.4918,
      "mean_token_accuracy": 0.8364068984985351,
      "step": 260
    },
    {
      "epoch": 0.36754507628294036,
      "grad_norm": 0.3733905851840973,
      "learning_rate": 4.58477508650519e-05,
      "loss": 0.49,
      "mean_token_accuracy": 0.8370036244392395,
      "step": 265
    },
    {
      "epoch": 0.37447988904299584,
      "grad_norm": 0.4112997353076935,
      "learning_rate": 4.671280276816609e-05,
      "loss": 0.4932,
      "mean_token_accuracy": 0.8356328129768371,
      "step": 270
    },
    {
      "epoch": 0.3814147018030513,
      "grad_norm": 0.5121487379074097,
      "learning_rate": 4.7577854671280283e-05,
      "loss": 0.479,
      "mean_token_accuracy": 0.839626955986023,
      "step": 275
    },
    {
      "epoch": 0.3883495145631068,
      "grad_norm": 0.36294957995414734,
      "learning_rate": 4.844290657439446e-05,
      "loss": 0.4829,
      "mean_token_accuracy": 0.8391167283058166,
      "step": 280
    },
    {
      "epoch": 0.39528432732316227,
      "grad_norm": 0.3162820339202881,
      "learning_rate": 4.930795847750865e-05,
      "loss": 0.4899,
      "mean_token_accuracy": 0.8368083834648132,
      "step": 285
    },
    {
      "epoch": 0.40221914008321774,
      "grad_norm": 0.3973437547683716,
      "learning_rate": 4.9980732177263974e-05,
      "loss": 0.4864,
      "mean_token_accuracy": 0.8374906539916992,
      "step": 290
    },
    {
      "epoch": 0.4091539528432732,
      "grad_norm": 0.5423433184623718,
      "learning_rate": 4.9884393063583816e-05,
      "loss": 0.4907,
      "mean_token_accuracy": 0.8373413920402527,
      "step": 295
    },
    {
      "epoch": 0.4160887656033287,
      "grad_norm": 0.39722123742103577,
      "learning_rate": 4.9788053949903666e-05,
      "loss": 0.4961,
      "mean_token_accuracy": 0.8351489901542664,
      "step": 300
    },
    {
      "epoch": 0.42302357836338417,
      "grad_norm": 0.34169071912765503,
      "learning_rate": 4.969171483622351e-05,
      "loss": 0.4891,
      "mean_token_accuracy": 0.8370493412017822,
      "step": 305
    },
    {
      "epoch": 0.42995839112343964,
      "grad_norm": 0.3429335951805115,
      "learning_rate": 4.959537572254335e-05,
      "loss": 0.4794,
      "mean_token_accuracy": 0.8396916627883911,
      "step": 310
    },
    {
      "epoch": 0.4368932038834951,
      "grad_norm": 0.3266272246837616,
      "learning_rate": 4.94990366088632e-05,
      "loss": 0.4757,
      "mean_token_accuracy": 0.8405494570732117,
      "step": 315
    },
    {
      "epoch": 0.44382801664355065,
      "grad_norm": 0.2874930202960968,
      "learning_rate": 4.940269749518305e-05,
      "loss": 0.4978,
      "mean_token_accuracy": 0.8344841122627258,
      "step": 320
    },
    {
      "epoch": 0.4507628294036061,
      "grad_norm": 0.2812349498271942,
      "learning_rate": 4.930635838150289e-05,
      "loss": 0.4839,
      "mean_token_accuracy": 0.8383953332901001,
      "step": 325
    },
    {
      "epoch": 0.4576976421636616,
      "grad_norm": 0.25296345353126526,
      "learning_rate": 4.921001926782274e-05,
      "loss": 0.4738,
      "mean_token_accuracy": 0.8412886261940002,
      "step": 330
    },
    {
      "epoch": 0.4646324549237171,
      "grad_norm": 0.22165291011333466,
      "learning_rate": 4.9113680154142584e-05,
      "loss": 0.4867,
      "mean_token_accuracy": 0.8379201173782349,
      "step": 335
    },
    {
      "epoch": 0.47156726768377255,
      "grad_norm": 0.2551758289337158,
      "learning_rate": 4.9017341040462426e-05,
      "loss": 0.4786,
      "mean_token_accuracy": 0.8399594306945801,
      "step": 340
    },
    {
      "epoch": 0.478502080443828,
      "grad_norm": 0.25708919763565063,
      "learning_rate": 4.8921001926782276e-05,
      "loss": 0.48,
      "mean_token_accuracy": 0.8395551085472107,
      "step": 345
    },
    {
      "epoch": 0.4854368932038835,
      "grad_norm": 0.1992408186197281,
      "learning_rate": 4.8824662813102125e-05,
      "loss": 0.4714,
      "mean_token_accuracy": 0.8418668508529663,
      "step": 350
    },
    {
      "epoch": 0.492371705963939,
      "grad_norm": 0.23445720970630646,
      "learning_rate": 4.872832369942197e-05,
      "loss": 0.471,
      "mean_token_accuracy": 0.8421580553054809,
      "step": 355
    },
    {
      "epoch": 0.49930651872399445,
      "grad_norm": 0.31462928652763367,
      "learning_rate": 4.863198458574181e-05,
      "loss": 0.4711,
      "mean_token_accuracy": 0.842027747631073,
      "step": 360
    },
    {
      "epoch": 0.5062413314840499,
      "grad_norm": 0.24767646193504333,
      "learning_rate": 4.853564547206166e-05,
      "loss": 0.4717,
      "mean_token_accuracy": 0.8417503118515015,
      "step": 365
    },
    {
      "epoch": 0.5131761442441054,
      "grad_norm": 0.2389938235282898,
      "learning_rate": 4.84393063583815e-05,
      "loss": 0.4677,
      "mean_token_accuracy": 0.8431912064552307,
      "step": 370
    },
    {
      "epoch": 0.5201109570041609,
      "grad_norm": 0.29998722672462463,
      "learning_rate": 4.834296724470135e-05,
      "loss": 0.4877,
      "mean_token_accuracy": 0.8374402284622192,
      "step": 375
    },
    {
      "epoch": 0.5270457697642164,
      "grad_norm": 0.2877121865749359,
      "learning_rate": 4.82466281310212e-05,
      "loss": 0.4863,
      "mean_token_accuracy": 0.8380719065666199,
      "step": 380
    },
    {
      "epoch": 0.5339805825242718,
      "grad_norm": 0.24628062546253204,
      "learning_rate": 4.815028901734104e-05,
      "loss": 0.4665,
      "mean_token_accuracy": 0.8434135437011718,
      "step": 385
    },
    {
      "epoch": 0.5409153952843273,
      "grad_norm": 0.24347947537899017,
      "learning_rate": 4.8053949903660886e-05,
      "loss": 0.476,
      "mean_token_accuracy": 0.8404138565063477,
      "step": 390
    },
    {
      "epoch": 0.5478502080443828,
      "grad_norm": 0.20724909007549286,
      "learning_rate": 4.7957610789980735e-05,
      "loss": 0.4881,
      "mean_token_accuracy": 0.8372583389282227,
      "step": 395
    },
    {
      "epoch": 0.5547850208044383,
      "grad_norm": 0.2162594497203827,
      "learning_rate": 4.786127167630058e-05,
      "loss": 0.4726,
      "mean_token_accuracy": 0.842011570930481,
      "step": 400
    },
    {
      "epoch": 0.5617198335644937,
      "grad_norm": 0.34494099020957947,
      "learning_rate": 4.776493256262042e-05,
      "loss": 0.4783,
      "mean_token_accuracy": 0.8399308085441589,
      "step": 405
    },
    {
      "epoch": 0.5686546463245492,
      "grad_norm": 0.24402566254138947,
      "learning_rate": 4.7668593448940276e-05,
      "loss": 0.4953,
      "mean_token_accuracy": 0.8352864623069763,
      "step": 410
    },
    {
      "epoch": 0.5755894590846047,
      "grad_norm": 0.2124612033367157,
      "learning_rate": 4.757225433526012e-05,
      "loss": 0.4849,
      "mean_token_accuracy": 0.8380748987197876,
      "step": 415
    },
    {
      "epoch": 0.5825242718446602,
      "grad_norm": 0.20577934384346008,
      "learning_rate": 4.747591522157996e-05,
      "loss": 0.4591,
      "mean_token_accuracy": 0.845665693283081,
      "step": 420
    },
    {
      "epoch": 0.5894590846047156,
      "grad_norm": 0.2838655710220337,
      "learning_rate": 4.737957610789981e-05,
      "loss": 0.4709,
      "mean_token_accuracy": 0.8418583750724793,
      "step": 425
    },
    {
      "epoch": 0.5963938973647711,
      "grad_norm": 0.2222902923822403,
      "learning_rate": 4.7283236994219653e-05,
      "loss": 0.4817,
      "mean_token_accuracy": 0.8388337612152099,
      "step": 430
    },
    {
      "epoch": 0.6033287101248266,
      "grad_norm": 0.25565460324287415,
      "learning_rate": 4.7186897880539496e-05,
      "loss": 0.4724,
      "mean_token_accuracy": 0.8415215969085693,
      "step": 435
    },
    {
      "epoch": 0.6102635228848821,
      "grad_norm": 0.680081844329834,
      "learning_rate": 4.709055876685935e-05,
      "loss": 0.4777,
      "mean_token_accuracy": 0.8402902245521545,
      "step": 440
    },
    {
      "epoch": 0.6171983356449375,
      "grad_norm": 0.3035682141780853,
      "learning_rate": 4.6994219653179195e-05,
      "loss": 0.4749,
      "mean_token_accuracy": 0.8405117988586426,
      "step": 445
    },
    {
      "epoch": 0.624133148404993,
      "grad_norm": 0.22393807768821716,
      "learning_rate": 4.689788053949904e-05,
      "loss": 0.4735,
      "mean_token_accuracy": 0.8410566568374633,
      "step": 450
    },
    {
      "epoch": 0.6310679611650486,
      "grad_norm": 0.23452860116958618,
      "learning_rate": 4.6801541425818887e-05,
      "loss": 0.4798,
      "mean_token_accuracy": 0.8394344925880433,
      "step": 455
    },
    {
      "epoch": 0.6380027739251041,
      "grad_norm": 0.21135355532169342,
      "learning_rate": 4.670520231213873e-05,
      "loss": 0.4783,
      "mean_token_accuracy": 0.8398800015449523,
      "step": 460
    },
    {
      "epoch": 0.6449375866851595,
      "grad_norm": 0.2495516985654831,
      "learning_rate": 4.660886319845857e-05,
      "loss": 0.4769,
      "mean_token_accuracy": 0.8407980084419251,
      "step": 465
    },
    {
      "epoch": 0.651872399445215,
      "grad_norm": 0.25724372267723083,
      "learning_rate": 4.651252408477843e-05,
      "loss": 0.4764,
      "mean_token_accuracy": 0.8402070879936219,
      "step": 470
    },
    {
      "epoch": 0.6588072122052705,
      "grad_norm": 0.28974995017051697,
      "learning_rate": 4.641618497109827e-05,
      "loss": 0.468,
      "mean_token_accuracy": 0.8425545215606689,
      "step": 475
    },
    {
      "epoch": 0.665742024965326,
      "grad_norm": 0.26298555731773376,
      "learning_rate": 4.631984585741811e-05,
      "loss": 0.4752,
      "mean_token_accuracy": 0.8405273199081421,
      "step": 480
    },
    {
      "epoch": 0.6726768377253814,
      "grad_norm": 0.3188522756099701,
      "learning_rate": 4.622350674373796e-05,
      "loss": 0.4683,
      "mean_token_accuracy": 0.8426392555236817,
      "step": 485
    },
    {
      "epoch": 0.6796116504854369,
      "grad_norm": 0.2528276741504669,
      "learning_rate": 4.6127167630057805e-05,
      "loss": 0.4753,
      "mean_token_accuracy": 0.840662169456482,
      "step": 490
    },
    {
      "epoch": 0.6865464632454924,
      "grad_norm": 0.3695737421512604,
      "learning_rate": 4.603082851637765e-05,
      "loss": 0.501,
      "mean_token_accuracy": 0.8371694445610046,
      "step": 495
    },
    {
      "epoch": 0.6934812760055479,
      "grad_norm": 0.31206727027893066,
      "learning_rate": 4.59344894026975e-05,
      "loss": 0.478,
      "mean_token_accuracy": 0.8401562452316285,
      "step": 500
    },
    {
      "epoch": 0.7004160887656034,
      "grad_norm": 3.478522539138794,
      "learning_rate": 4.5838150289017346e-05,
      "loss": 0.49,
      "mean_token_accuracy": 0.8365014433860779,
      "step": 505
    },
    {
      "epoch": 0.7073509015256588,
      "grad_norm": 0.4430016875267029,
      "learning_rate": 4.574181117533719e-05,
      "loss": 0.47,
      "mean_token_accuracy": 0.8422938823699951,
      "step": 510
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.28713470697402954,
      "learning_rate": 4.564547206165704e-05,
      "loss": 0.4786,
      "mean_token_accuracy": 0.8401166200637817,
      "step": 515
    },
    {
      "epoch": 0.7212205270457698,
      "grad_norm": 0.2158370316028595,
      "learning_rate": 4.554913294797688e-05,
      "loss": 0.4703,
      "mean_token_accuracy": 0.8421276092529297,
      "step": 520
    },
    {
      "epoch": 0.7281553398058253,
      "grad_norm": 0.2426484376192093,
      "learning_rate": 4.545279383429672e-05,
      "loss": 0.469,
      "mean_token_accuracy": 0.8426563143730164,
      "step": 525
    },
    {
      "epoch": 0.7350901525658807,
      "grad_norm": 0.27153995633125305,
      "learning_rate": 4.535645472061657e-05,
      "loss": 0.4754,
      "mean_token_accuracy": 0.8406094431877136,
      "step": 530
    },
    {
      "epoch": 0.7420249653259362,
      "grad_norm": 0.1991535872220993,
      "learning_rate": 4.526011560693642e-05,
      "loss": 0.4782,
      "mean_token_accuracy": 0.8397158980369568,
      "step": 535
    },
    {
      "epoch": 0.7489597780859917,
      "grad_norm": 0.15923242270946503,
      "learning_rate": 4.5163776493256264e-05,
      "loss": 0.4563,
      "mean_token_accuracy": 0.8461790800094604,
      "step": 540
    },
    {
      "epoch": 0.7558945908460472,
      "grad_norm": 0.18306083977222443,
      "learning_rate": 4.5067437379576114e-05,
      "loss": 0.4791,
      "mean_token_accuracy": 0.8393635034561158,
      "step": 545
    },
    {
      "epoch": 0.7628294036061026,
      "grad_norm": 0.24309256672859192,
      "learning_rate": 4.4971098265895956e-05,
      "loss": 0.4777,
      "mean_token_accuracy": 0.8401144862174987,
      "step": 550
    },
    {
      "epoch": 0.7697642163661581,
      "grad_norm": 0.20910784602165222,
      "learning_rate": 4.48747591522158e-05,
      "loss": 0.4728,
      "mean_token_accuracy": 0.8417426466941833,
      "step": 555
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.1896984726190567,
      "learning_rate": 4.477842003853565e-05,
      "loss": 0.4557,
      "mean_token_accuracy": 0.8461586833000183,
      "step": 560
    },
    {
      "epoch": 0.7836338418862691,
      "grad_norm": 0.18798613548278809,
      "learning_rate": 4.46820809248555e-05,
      "loss": 0.457,
      "mean_token_accuracy": 0.8459754705429077,
      "step": 565
    },
    {
      "epoch": 0.7905686546463245,
      "grad_norm": 0.18959036469459534,
      "learning_rate": 4.458574181117534e-05,
      "loss": 0.4633,
      "mean_token_accuracy": 0.8437102913856507,
      "step": 570
    },
    {
      "epoch": 0.79750346740638,
      "grad_norm": 0.16292130947113037,
      "learning_rate": 4.448940269749519e-05,
      "loss": 0.4749,
      "mean_token_accuracy": 0.8404599308967591,
      "step": 575
    },
    {
      "epoch": 0.8044382801664355,
      "grad_norm": 0.17686040699481964,
      "learning_rate": 4.439306358381503e-05,
      "loss": 0.4601,
      "mean_token_accuracy": 0.844899308681488,
      "step": 580
    },
    {
      "epoch": 0.811373092926491,
      "grad_norm": 0.1865614652633667,
      "learning_rate": 4.4296724470134875e-05,
      "loss": 0.4533,
      "mean_token_accuracy": 0.846677553653717,
      "step": 585
    },
    {
      "epoch": 0.8183079056865464,
      "grad_norm": 0.2037810981273651,
      "learning_rate": 4.4200385356454724e-05,
      "loss": 0.4575,
      "mean_token_accuracy": 0.8457266449928283,
      "step": 590
    },
    {
      "epoch": 0.8252427184466019,
      "grad_norm": 0.16701985895633698,
      "learning_rate": 4.4104046242774566e-05,
      "loss": 0.466,
      "mean_token_accuracy": 0.8428797006607056,
      "step": 595
    },
    {
      "epoch": 0.8321775312066574,
      "grad_norm": 0.19714096188545227,
      "learning_rate": 4.4007707129094416e-05,
      "loss": 0.4696,
      "mean_token_accuracy": 0.8422728657722474,
      "step": 600
    },
    {
      "epoch": 0.8391123439667129,
      "grad_norm": 0.20772860944271088,
      "learning_rate": 4.391136801541426e-05,
      "loss": 0.4635,
      "mean_token_accuracy": 0.8438523054122925,
      "step": 605
    },
    {
      "epoch": 0.8460471567267683,
      "grad_norm": 0.35546374320983887,
      "learning_rate": 4.381502890173411e-05,
      "loss": 0.4665,
      "mean_token_accuracy": 0.8430918097496033,
      "step": 610
    },
    {
      "epoch": 0.8529819694868238,
      "grad_norm": 0.19986563920974731,
      "learning_rate": 4.371868978805395e-05,
      "loss": 0.4742,
      "mean_token_accuracy": 0.8409379243850708,
      "step": 615
    },
    {
      "epoch": 0.8599167822468793,
      "grad_norm": 0.4013294279575348,
      "learning_rate": 4.36223506743738e-05,
      "loss": 0.4673,
      "mean_token_accuracy": 0.8426662087440491,
      "step": 620
    },
    {
      "epoch": 0.8668515950069348,
      "grad_norm": 0.29566317796707153,
      "learning_rate": 4.352601156069364e-05,
      "loss": 0.4837,
      "mean_token_accuracy": 0.8380556702613831,
      "step": 625
    },
    {
      "epoch": 0.8737864077669902,
      "grad_norm": 0.24461045861244202,
      "learning_rate": 4.342967244701349e-05,
      "loss": 0.4648,
      "mean_token_accuracy": 0.8434231281280518,
      "step": 630
    },
    {
      "epoch": 0.8807212205270458,
      "grad_norm": 0.2197730541229248,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.4585,
      "mean_token_accuracy": 0.8448979973793029,
      "step": 635
    },
    {
      "epoch": 0.8876560332871013,
      "grad_norm": 0.22158759832382202,
      "learning_rate": 4.3236994219653183e-05,
      "loss": 0.4678,
      "mean_token_accuracy": 0.8427410125732422,
      "step": 640
    },
    {
      "epoch": 0.8945908460471568,
      "grad_norm": 0.17014814913272858,
      "learning_rate": 4.3140655105973026e-05,
      "loss": 0.4706,
      "mean_token_accuracy": 0.8416074395179749,
      "step": 645
    },
    {
      "epoch": 0.9015256588072122,
      "grad_norm": 0.22929687798023224,
      "learning_rate": 4.304431599229287e-05,
      "loss": 0.4753,
      "mean_token_accuracy": 0.8403880834579468,
      "step": 650
    },
    {
      "epoch": 0.9084604715672677,
      "grad_norm": 0.20894835889339447,
      "learning_rate": 4.294797687861272e-05,
      "loss": 0.4734,
      "mean_token_accuracy": 0.8410162568092346,
      "step": 655
    },
    {
      "epoch": 0.9153952843273232,
      "grad_norm": 0.18031327426433563,
      "learning_rate": 4.285163776493257e-05,
      "loss": 0.4536,
      "mean_token_accuracy": 0.8469532251358032,
      "step": 660
    },
    {
      "epoch": 0.9223300970873787,
      "grad_norm": 0.17288991808891296,
      "learning_rate": 4.275529865125241e-05,
      "loss": 0.4611,
      "mean_token_accuracy": 0.8443895936012268,
      "step": 665
    },
    {
      "epoch": 0.9292649098474342,
      "grad_norm": 0.1980760544538498,
      "learning_rate": 4.265895953757226e-05,
      "loss": 0.484,
      "mean_token_accuracy": 0.8379009962081909,
      "step": 670
    },
    {
      "epoch": 0.9361997226074896,
      "grad_norm": 0.20848602056503296,
      "learning_rate": 4.25626204238921e-05,
      "loss": 0.4771,
      "mean_token_accuracy": 0.8398370265960693,
      "step": 675
    },
    {
      "epoch": 0.9431345353675451,
      "grad_norm": 0.1636408418416977,
      "learning_rate": 4.2466281310211944e-05,
      "loss": 0.4578,
      "mean_token_accuracy": 0.845670223236084,
      "step": 680
    },
    {
      "epoch": 0.9500693481276006,
      "grad_norm": 0.22376923263072968,
      "learning_rate": 4.2369942196531794e-05,
      "loss": 0.4652,
      "mean_token_accuracy": 0.8431706428527832,
      "step": 685
    },
    {
      "epoch": 0.957004160887656,
      "grad_norm": 0.21399416029453278,
      "learning_rate": 4.2273603082851636e-05,
      "loss": 0.4537,
      "mean_token_accuracy": 0.8464810252189636,
      "step": 690
    },
    {
      "epoch": 0.9639389736477115,
      "grad_norm": 2.5790159702301025,
      "learning_rate": 4.2177263969171485e-05,
      "loss": 0.4754,
      "mean_token_accuracy": 0.8421392440795898,
      "step": 695
    },
    {
      "epoch": 0.970873786407767,
      "grad_norm": 0.2648729085922241,
      "learning_rate": 4.2080924855491335e-05,
      "loss": 0.469,
      "mean_token_accuracy": 0.8423485517501831,
      "step": 700
    },
    {
      "epoch": 0.9778085991678225,
      "grad_norm": 0.20691435039043427,
      "learning_rate": 4.198458574181118e-05,
      "loss": 0.4534,
      "mean_token_accuracy": 0.8466127276420593,
      "step": 705
    },
    {
      "epoch": 0.984743411927878,
      "grad_norm": 0.2122969925403595,
      "learning_rate": 4.188824662813102e-05,
      "loss": 0.4744,
      "mean_token_accuracy": 0.843373692035675,
      "step": 710
    },
    {
      "epoch": 0.9916782246879334,
      "grad_norm": 0.18356889486312866,
      "learning_rate": 4.179190751445087e-05,
      "loss": 0.4735,
      "mean_token_accuracy": 0.840711236000061,
      "step": 715
    },
    {
      "epoch": 0.9986130374479889,
      "grad_norm": 0.2710322141647339,
      "learning_rate": 4.169556840077071e-05,
      "loss": 0.4893,
      "mean_token_accuracy": 0.8400939464569092,
      "step": 720
    },
    {
      "epoch": 1.0055478502080444,
      "grad_norm": 0.28685542941093445,
      "learning_rate": 4.159922928709056e-05,
      "loss": 0.4413,
      "mean_token_accuracy": 0.8504527807235718,
      "step": 725
    },
    {
      "epoch": 1.0124826629680999,
      "grad_norm": 0.24674533307552338,
      "learning_rate": 4.150289017341041e-05,
      "loss": 0.4098,
      "mean_token_accuracy": 0.8587909460067749,
      "step": 730
    },
    {
      "epoch": 1.0194174757281553,
      "grad_norm": 0.21753250062465668,
      "learning_rate": 4.140655105973025e-05,
      "loss": 0.3935,
      "mean_token_accuracy": 0.8634741544723511,
      "step": 735
    },
    {
      "epoch": 1.0263522884882108,
      "grad_norm": 0.20492789149284363,
      "learning_rate": 4.1310211946050096e-05,
      "loss": 0.4068,
      "mean_token_accuracy": 0.8603113770484925,
      "step": 740
    },
    {
      "epoch": 1.0332871012482663,
      "grad_norm": 0.16923396289348602,
      "learning_rate": 4.1213872832369945e-05,
      "loss": 0.4185,
      "mean_token_accuracy": 0.8562816023826599,
      "step": 745
    },
    {
      "epoch": 1.0402219140083218,
      "grad_norm": 0.18504321575164795,
      "learning_rate": 4.111753371868979e-05,
      "loss": 0.4083,
      "mean_token_accuracy": 0.8588919401168823,
      "step": 750
    },
    {
      "epoch": 1.0471567267683772,
      "grad_norm": 0.15754340589046478,
      "learning_rate": 4.102119460500964e-05,
      "loss": 0.4001,
      "mean_token_accuracy": 0.8617303729057312,
      "step": 755
    },
    {
      "epoch": 1.0540915395284327,
      "grad_norm": 0.16705656051635742,
      "learning_rate": 4.0924855491329486e-05,
      "loss": 0.4124,
      "mean_token_accuracy": 0.8577338337898255,
      "step": 760
    },
    {
      "epoch": 1.0610263522884882,
      "grad_norm": 0.1913621723651886,
      "learning_rate": 4.082851637764933e-05,
      "loss": 0.4027,
      "mean_token_accuracy": 0.860604989528656,
      "step": 765
    },
    {
      "epoch": 1.0679611650485437,
      "grad_norm": 0.1807246059179306,
      "learning_rate": 4.073217726396917e-05,
      "loss": 0.4154,
      "mean_token_accuracy": 0.8569133520126343,
      "step": 770
    },
    {
      "epoch": 1.0748959778085991,
      "grad_norm": 0.16904115676879883,
      "learning_rate": 4.063583815028902e-05,
      "loss": 0.4043,
      "mean_token_accuracy": 0.8603330969810485,
      "step": 775
    },
    {
      "epoch": 1.0818307905686546,
      "grad_norm": 0.13820037245750427,
      "learning_rate": 4.053949903660886e-05,
      "loss": 0.4032,
      "mean_token_accuracy": 0.8607820630073547,
      "step": 780
    },
    {
      "epoch": 1.08876560332871,
      "grad_norm": 0.15458045899868011,
      "learning_rate": 4.0443159922928706e-05,
      "loss": 0.4126,
      "mean_token_accuracy": 0.8577965140342713,
      "step": 785
    },
    {
      "epoch": 1.0957004160887656,
      "grad_norm": 0.14621621370315552,
      "learning_rate": 4.034682080924856e-05,
      "loss": 0.4164,
      "mean_token_accuracy": 0.8568703651428222,
      "step": 790
    },
    {
      "epoch": 1.102635228848821,
      "grad_norm": 0.22418245673179626,
      "learning_rate": 4.0250481695568404e-05,
      "loss": 0.4151,
      "mean_token_accuracy": 0.8572113513946533,
      "step": 795
    },
    {
      "epoch": 1.1095700416088765,
      "grad_norm": 0.18166805803775787,
      "learning_rate": 4.015414258188825e-05,
      "loss": 0.4236,
      "mean_token_accuracy": 0.8545473575592041,
      "step": 800
    },
    {
      "epoch": 1.116504854368932,
      "grad_norm": 0.19410911202430725,
      "learning_rate": 4.0057803468208096e-05,
      "loss": 0.4081,
      "mean_token_accuracy": 0.8594057202339173,
      "step": 805
    },
    {
      "epoch": 1.1234396671289875,
      "grad_norm": 0.15663549304008484,
      "learning_rate": 3.996146435452794e-05,
      "loss": 0.41,
      "mean_token_accuracy": 0.8585174441337585,
      "step": 810
    },
    {
      "epoch": 1.130374479889043,
      "grad_norm": 0.2926901578903198,
      "learning_rate": 3.986512524084778e-05,
      "loss": 0.4088,
      "mean_token_accuracy": 0.8590117692947388,
      "step": 815
    },
    {
      "epoch": 1.1373092926490984,
      "grad_norm": 0.14440152049064636,
      "learning_rate": 3.976878612716764e-05,
      "loss": 0.4029,
      "mean_token_accuracy": 0.8609296917915344,
      "step": 820
    },
    {
      "epoch": 1.1442441054091539,
      "grad_norm": 0.18435537815093994,
      "learning_rate": 3.967244701348748e-05,
      "loss": 0.406,
      "mean_token_accuracy": 0.8599510669708252,
      "step": 825
    },
    {
      "epoch": 1.1511789181692094,
      "grad_norm": 0.16614344716072083,
      "learning_rate": 3.957610789980732e-05,
      "loss": 0.4165,
      "mean_token_accuracy": 0.8565038800239563,
      "step": 830
    },
    {
      "epoch": 1.1581137309292648,
      "grad_norm": 0.180514857172966,
      "learning_rate": 3.947976878612717e-05,
      "loss": 0.4233,
      "mean_token_accuracy": 0.854881489276886,
      "step": 835
    },
    {
      "epoch": 1.1650485436893203,
      "grad_norm": 0.17873796820640564,
      "learning_rate": 3.9383429672447015e-05,
      "loss": 0.4142,
      "mean_token_accuracy": 0.8575613379478455,
      "step": 840
    },
    {
      "epoch": 1.1719833564493758,
      "grad_norm": 0.17171607911586761,
      "learning_rate": 3.928709055876686e-05,
      "loss": 0.4199,
      "mean_token_accuracy": 0.8555831432342529,
      "step": 845
    },
    {
      "epoch": 1.1789181692094313,
      "grad_norm": 0.2052180916070938,
      "learning_rate": 3.9190751445086707e-05,
      "loss": 0.4084,
      "mean_token_accuracy": 0.8590785980224609,
      "step": 850
    },
    {
      "epoch": 1.1858529819694867,
      "grad_norm": 0.16104774177074432,
      "learning_rate": 3.9094412331406556e-05,
      "loss": 0.4064,
      "mean_token_accuracy": 0.8598842978477478,
      "step": 855
    },
    {
      "epoch": 1.1927877947295422,
      "grad_norm": 0.16743043065071106,
      "learning_rate": 3.89980732177264e-05,
      "loss": 0.4136,
      "mean_token_accuracy": 0.857661247253418,
      "step": 860
    },
    {
      "epoch": 1.1997226074895977,
      "grad_norm": 0.15085460245609283,
      "learning_rate": 3.890173410404625e-05,
      "loss": 0.4143,
      "mean_token_accuracy": 0.8573906064033509,
      "step": 865
    },
    {
      "epoch": 1.2066574202496532,
      "grad_norm": 0.14390355348587036,
      "learning_rate": 3.880539499036609e-05,
      "loss": 0.4221,
      "mean_token_accuracy": 0.8549758553504944,
      "step": 870
    },
    {
      "epoch": 1.2135922330097086,
      "grad_norm": 0.170955091714859,
      "learning_rate": 3.870905587668593e-05,
      "loss": 0.417,
      "mean_token_accuracy": 0.8564952373504638,
      "step": 875
    },
    {
      "epoch": 1.2205270457697641,
      "grad_norm": 0.17432747781276703,
      "learning_rate": 3.861271676300578e-05,
      "loss": 0.4065,
      "mean_token_accuracy": 0.8594078302383423,
      "step": 880
    },
    {
      "epoch": 1.2274618585298196,
      "grad_norm": 0.15886807441711426,
      "learning_rate": 3.851637764932563e-05,
      "loss": 0.4191,
      "mean_token_accuracy": 0.8560209155082703,
      "step": 885
    },
    {
      "epoch": 1.234396671289875,
      "grad_norm": 0.22739961743354797,
      "learning_rate": 3.8420038535645474e-05,
      "loss": 0.4136,
      "mean_token_accuracy": 0.8580429792404175,
      "step": 890
    },
    {
      "epoch": 1.2413314840499305,
      "grad_norm": 0.1761876940727234,
      "learning_rate": 3.832369942196532e-05,
      "loss": 0.4058,
      "mean_token_accuracy": 0.8598258137702942,
      "step": 895
    },
    {
      "epoch": 1.248266296809986,
      "grad_norm": 0.8043875098228455,
      "learning_rate": 3.8227360308285166e-05,
      "loss": 0.4146,
      "mean_token_accuracy": 0.8571544885635376,
      "step": 900
    },
    {
      "epoch": 1.2552011095700415,
      "grad_norm": 0.18043817579746246,
      "learning_rate": 3.813102119460501e-05,
      "loss": 0.4032,
      "mean_token_accuracy": 0.8605299115180969,
      "step": 905
    },
    {
      "epoch": 1.262135922330097,
      "grad_norm": 0.16484476625919342,
      "learning_rate": 3.803468208092486e-05,
      "loss": 0.3987,
      "mean_token_accuracy": 0.8620869636535644,
      "step": 910
    },
    {
      "epoch": 1.2690707350901524,
      "grad_norm": 0.15530748665332794,
      "learning_rate": 3.793834296724471e-05,
      "loss": 0.4019,
      "mean_token_accuracy": 0.8608452916145325,
      "step": 915
    },
    {
      "epoch": 1.276005547850208,
      "grad_norm": 0.16284696757793427,
      "learning_rate": 3.784200385356455e-05,
      "loss": 0.4056,
      "mean_token_accuracy": 0.8598546504974365,
      "step": 920
    },
    {
      "epoch": 1.2829403606102634,
      "grad_norm": 0.15156075358390808,
      "learning_rate": 3.774566473988439e-05,
      "loss": 0.4148,
      "mean_token_accuracy": 0.8573850750923157,
      "step": 925
    },
    {
      "epoch": 1.2898751733703189,
      "grad_norm": 0.18044961988925934,
      "learning_rate": 3.764932562620424e-05,
      "loss": 0.4165,
      "mean_token_accuracy": 0.8561815977096557,
      "step": 930
    },
    {
      "epoch": 1.2968099861303743,
      "grad_norm": 0.1658436506986618,
      "learning_rate": 3.7552986512524084e-05,
      "loss": 0.4056,
      "mean_token_accuracy": 0.8600709080696106,
      "step": 935
    },
    {
      "epoch": 1.3037447988904298,
      "grad_norm": 0.16520382463932037,
      "learning_rate": 3.7456647398843934e-05,
      "loss": 0.4148,
      "mean_token_accuracy": 0.8568984508514405,
      "step": 940
    },
    {
      "epoch": 1.3106796116504853,
      "grad_norm": 0.1799880713224411,
      "learning_rate": 3.736030828516378e-05,
      "loss": 0.4188,
      "mean_token_accuracy": 0.856162166595459,
      "step": 945
    },
    {
      "epoch": 1.317614424410541,
      "grad_norm": 0.16812920570373535,
      "learning_rate": 3.7263969171483626e-05,
      "loss": 0.4111,
      "mean_token_accuracy": 0.8584610104560852,
      "step": 950
    },
    {
      "epoch": 1.3245492371705965,
      "grad_norm": 0.15165302157402039,
      "learning_rate": 3.716763005780347e-05,
      "loss": 0.404,
      "mean_token_accuracy": 0.860293960571289,
      "step": 955
    },
    {
      "epoch": 1.331484049930652,
      "grad_norm": 0.13624367117881775,
      "learning_rate": 3.707129094412332e-05,
      "loss": 0.4136,
      "mean_token_accuracy": 0.8572997689247132,
      "step": 960
    },
    {
      "epoch": 1.3384188626907074,
      "grad_norm": 0.6036350131034851,
      "learning_rate": 3.697495183044316e-05,
      "loss": 0.4139,
      "mean_token_accuracy": 0.8579724669456482,
      "step": 965
    },
    {
      "epoch": 1.345353675450763,
      "grad_norm": 0.16172119975090027,
      "learning_rate": 3.6878612716763e-05,
      "loss": 0.4194,
      "mean_token_accuracy": 0.8556413412094116,
      "step": 970
    },
    {
      "epoch": 1.3522884882108184,
      "grad_norm": 0.13519282639026642,
      "learning_rate": 3.678227360308285e-05,
      "loss": 0.4053,
      "mean_token_accuracy": 0.8597145199775695,
      "step": 975
    },
    {
      "epoch": 1.3592233009708738,
      "grad_norm": 0.14305779337882996,
      "learning_rate": 3.66859344894027e-05,
      "loss": 0.4066,
      "mean_token_accuracy": 0.8596665501594544,
      "step": 980
    },
    {
      "epoch": 1.3661581137309293,
      "grad_norm": 0.18043436110019684,
      "learning_rate": 3.6589595375722544e-05,
      "loss": 0.4061,
      "mean_token_accuracy": 0.8598026871681214,
      "step": 985
    },
    {
      "epoch": 1.3730929264909848,
      "grad_norm": 0.12696458399295807,
      "learning_rate": 3.649325626204239e-05,
      "loss": 0.4035,
      "mean_token_accuracy": 0.860295832157135,
      "step": 990
    },
    {
      "epoch": 1.3800277392510403,
      "grad_norm": 0.15299014747142792,
      "learning_rate": 3.6396917148362236e-05,
      "loss": 0.4045,
      "mean_token_accuracy": 0.8601055145263672,
      "step": 995
    },
    {
      "epoch": 1.3869625520110958,
      "grad_norm": 0.14797343313694,
      "learning_rate": 3.630057803468208e-05,
      "loss": 0.4064,
      "mean_token_accuracy": 0.8599012613296508,
      "step": 1000
    }
  ],
  "logging_steps": 5,
  "max_steps": 2884,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.2953357891076096e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|