diff --git "a/cost_to_push_frequency_2128/checkpoint-40000/trainer_state.json" "b/cost_to_push_frequency_2128/checkpoint-40000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_push_frequency_2128/checkpoint-40000/trainer_state.json" @@ -0,0 +1,6003 @@ +{ + "best_global_step": 40000, + "best_metric": 3.5569655895233154, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_push_frequency_2128/checkpoint-40000", + "epoch": 11.648261401362921, + "eval_steps": 1000, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014561127613722406, + "grad_norm": 1.0827219486236572, + "learning_rate": 0.000294, + "loss": 8.4429, + "step": 50 + }, + { + "epoch": 0.029122255227444813, + "grad_norm": 0.9950307011604309, + "learning_rate": 0.0005939999999999999, + "loss": 6.7538, + "step": 100 + }, + { + "epoch": 0.04368338284116722, + "grad_norm": 0.40593206882476807, + "learning_rate": 0.0005998286713286713, + "loss": 6.3529, + "step": 150 + }, + { + "epoch": 0.058244510454889625, + "grad_norm": 0.5188880562782288, + "learning_rate": 0.0005996538461538461, + "loss": 6.1387, + "step": 200 + }, + { + "epoch": 0.07280563806861204, + "grad_norm": 0.5219882726669312, + "learning_rate": 0.0005994790209790209, + "loss": 5.9936, + "step": 250 + }, + { + "epoch": 0.08736676568233444, + "grad_norm": 0.4981943368911743, + "learning_rate": 0.0005993041958041958, + "loss": 5.8475, + "step": 300 + }, + { + "epoch": 0.10192789329605685, + "grad_norm": 0.419317364692688, + "learning_rate": 0.0005991293706293705, + "loss": 5.731, + "step": 350 + }, + { + "epoch": 0.11648902090977925, + "grad_norm": 0.4203638732433319, + "learning_rate": 0.0005989545454545454, + "loss": 5.6254, + "step": 400 + }, + { + "epoch": 0.13105014852350166, + "grad_norm": 0.5592066645622253, + "learning_rate": 0.0005987797202797202, + "loss": 5.5068, + "step": 450 + }, + { + "epoch": 0.14561127613722408, + "grad_norm": 0.465763658285141, + "learning_rate": 0.000598604895104895, + "loss": 5.4002, + "step": 500 + }, + { + "epoch": 0.16017240375094646, + "grad_norm": 0.4648076295852661, + "learning_rate": 0.0005984300699300698, + "loss": 5.3308, + "step": 550 + }, + { + "epoch": 0.17473353136466888, + "grad_norm": 0.49379315972328186, + "learning_rate": 0.0005982552447552447, + "loss": 5.2609, + "step": 600 + }, + { + "epoch": 0.1892946589783913, + "grad_norm": 0.4596584439277649, + "learning_rate": 0.0005980804195804195, + "loss": 5.1905, + "step": 650 + }, + { + "epoch": 0.2038557865921137, + "grad_norm": 0.40508726239204407, + "learning_rate": 0.0005979055944055943, + "loss": 5.1331, + "step": 700 + }, + { + "epoch": 0.2184169142058361, + "grad_norm": 0.3763967752456665, + "learning_rate": 0.0005977307692307691, + "loss": 5.0753, + "step": 750 + }, + { + "epoch": 0.2329780418195585, + "grad_norm": 0.4820829927921295, + "learning_rate": 0.000597555944055944, + "loss": 5.0201, + "step": 800 + }, + { + "epoch": 0.24753916943328091, + "grad_norm": 0.4278320074081421, + "learning_rate": 0.0005973811188811188, + "loss": 4.9577, + "step": 850 + }, + { + "epoch": 0.2621002970470033, + "grad_norm": 0.4293597340583801, + "learning_rate": 0.0005972062937062936, + "loss": 4.9213, + "step": 900 + }, + { + "epoch": 0.27666142466072574, + "grad_norm": 0.43348240852355957, + "learning_rate": 0.0005970314685314685, + "loss": 4.8786, + "step": 950 + }, + { + "epoch": 0.29122255227444815, + "grad_norm": 0.4520808160305023, + "learning_rate": 0.0005968566433566433, + "loss": 4.8181, + "step": 1000 + }, + { + "epoch": 0.29122255227444815, + "eval_accuracy": 0.2556966724844482, + "eval_loss": 4.745121955871582, + "eval_runtime": 179.4217, + "eval_samples_per_second": 92.776, + "eval_steps_per_second": 5.802, + "step": 1000 + }, + { + "epoch": 0.30578367988817057, + "grad_norm": 0.45122841000556946, + "learning_rate": 0.0005966818181818181, + "loss": 4.7877, + "step": 1050 + }, + { + "epoch": 0.3203448075018929, + "grad_norm": 0.47879835963249207, + "learning_rate": 0.0005965069930069929, + "loss": 4.7428, + "step": 1100 + }, + { + "epoch": 0.33490593511561534, + "grad_norm": 0.4831642210483551, + "learning_rate": 0.0005963321678321677, + "loss": 4.6996, + "step": 1150 + }, + { + "epoch": 0.34946706272933775, + "grad_norm": 0.4561481177806854, + "learning_rate": 0.0005961573426573425, + "loss": 4.6659, + "step": 1200 + }, + { + "epoch": 0.36402819034306017, + "grad_norm": 0.4561339020729065, + "learning_rate": 0.0005959825174825174, + "loss": 4.6367, + "step": 1250 + }, + { + "epoch": 0.3785893179567826, + "grad_norm": 0.4436923861503601, + "learning_rate": 0.0005958076923076922, + "loss": 4.6064, + "step": 1300 + }, + { + "epoch": 0.393150445570505, + "grad_norm": 0.46087032556533813, + "learning_rate": 0.000595632867132867, + "loss": 4.5797, + "step": 1350 + }, + { + "epoch": 0.4077115731842274, + "grad_norm": 0.47251585125923157, + "learning_rate": 0.0005954580419580418, + "loss": 4.547, + "step": 1400 + }, + { + "epoch": 0.4222727007979498, + "grad_norm": 0.43149644136428833, + "learning_rate": 0.0005952832167832168, + "loss": 4.5216, + "step": 1450 + }, + { + "epoch": 0.4368338284116722, + "grad_norm": 0.3600349724292755, + "learning_rate": 0.0005951083916083916, + "loss": 4.5129, + "step": 1500 + }, + { + "epoch": 0.4513949560253946, + "grad_norm": 0.42545634508132935, + "learning_rate": 0.0005949335664335664, + "loss": 4.478, + "step": 1550 + }, + { + "epoch": 0.465956083639117, + "grad_norm": 0.4261489808559418, + "learning_rate": 0.0005947587412587413, + "loss": 4.466, + "step": 1600 + }, + { + "epoch": 0.4805172112528394, + "grad_norm": 0.382684588432312, + "learning_rate": 0.0005945839160839161, + "loss": 4.4463, + "step": 1650 + }, + { + "epoch": 0.49507833886656183, + "grad_norm": 0.4798526465892792, + "learning_rate": 0.0005944090909090909, + "loss": 4.419, + "step": 1700 + }, + { + "epoch": 0.5096394664802842, + "grad_norm": 0.4271828830242157, + "learning_rate": 0.0005942342657342657, + "loss": 4.4065, + "step": 1750 + }, + { + "epoch": 0.5242005940940067, + "grad_norm": 0.4648028016090393, + "learning_rate": 0.0005940594405594406, + "loss": 4.389, + "step": 1800 + }, + { + "epoch": 0.5387617217077291, + "grad_norm": 0.46727654337882996, + "learning_rate": 0.0005938846153846153, + "loss": 4.3739, + "step": 1850 + }, + { + "epoch": 0.5533228493214515, + "grad_norm": 0.4359632432460785, + "learning_rate": 0.0005937097902097902, + "loss": 4.3727, + "step": 1900 + }, + { + "epoch": 0.5678839769351739, + "grad_norm": 0.39883190393447876, + "learning_rate": 0.000593534965034965, + "loss": 4.3559, + "step": 1950 + }, + { + "epoch": 0.5824451045488963, + "grad_norm": 0.4254516661167145, + "learning_rate": 0.0005933601398601398, + "loss": 4.3438, + "step": 2000 + }, + { + "epoch": 0.5824451045488963, + "eval_accuracy": 0.29953294727340574, + "eval_loss": 4.282804489135742, + "eval_runtime": 179.6292, + "eval_samples_per_second": 92.669, + "eval_steps_per_second": 5.795, + "step": 2000 + }, + { + "epoch": 0.5970062321626187, + "grad_norm": 0.39681392908096313, + "learning_rate": 0.0005931853146853146, + "loss": 4.3252, + "step": 2050 + }, + { + "epoch": 0.6115673597763411, + "grad_norm": 0.36488792300224304, + "learning_rate": 0.0005930104895104895, + "loss": 4.3158, + "step": 2100 + }, + { + "epoch": 0.6261284873900634, + "grad_norm": 0.4375183582305908, + "learning_rate": 0.0005928356643356643, + "loss": 4.299, + "step": 2150 + }, + { + "epoch": 0.6406896150037859, + "grad_norm": 0.38287097215652466, + "learning_rate": 0.0005926608391608391, + "loss": 4.2941, + "step": 2200 + }, + { + "epoch": 0.6552507426175083, + "grad_norm": 0.3945271968841553, + "learning_rate": 0.000592486013986014, + "loss": 4.2685, + "step": 2250 + }, + { + "epoch": 0.6698118702312307, + "grad_norm": 0.3807995617389679, + "learning_rate": 0.0005923111888111888, + "loss": 4.2773, + "step": 2300 + }, + { + "epoch": 0.6843729978449531, + "grad_norm": 0.3736141324043274, + "learning_rate": 0.0005921363636363636, + "loss": 4.2439, + "step": 2350 + }, + { + "epoch": 0.6989341254586755, + "grad_norm": 0.37925609946250916, + "learning_rate": 0.0005919615384615384, + "loss": 4.2377, + "step": 2400 + }, + { + "epoch": 0.7134952530723979, + "grad_norm": 0.40228238701820374, + "learning_rate": 0.0005917867132867133, + "loss": 4.2397, + "step": 2450 + }, + { + "epoch": 0.7280563806861203, + "grad_norm": 0.3505542278289795, + "learning_rate": 0.0005916118881118881, + "loss": 4.2359, + "step": 2500 + }, + { + "epoch": 0.7426175082998427, + "grad_norm": 0.40058302879333496, + "learning_rate": 0.0005914370629370629, + "loss": 4.2241, + "step": 2550 + }, + { + "epoch": 0.7571786359135652, + "grad_norm": 0.3788367509841919, + "learning_rate": 0.0005912622377622377, + "loss": 4.2107, + "step": 2600 + }, + { + "epoch": 0.7717397635272876, + "grad_norm": 0.3747999668121338, + "learning_rate": 0.0005910874125874125, + "loss": 4.2, + "step": 2650 + }, + { + "epoch": 0.78630089114101, + "grad_norm": 0.40086600184440613, + "learning_rate": 0.0005909125874125873, + "loss": 4.1915, + "step": 2700 + }, + { + "epoch": 0.8008620187547324, + "grad_norm": 0.36495792865753174, + "learning_rate": 0.0005907377622377622, + "loss": 4.1941, + "step": 2750 + }, + { + "epoch": 0.8154231463684548, + "grad_norm": 0.3766659200191498, + "learning_rate": 0.000590562937062937, + "loss": 4.1739, + "step": 2800 + }, + { + "epoch": 0.8299842739821772, + "grad_norm": 0.3640320301055908, + "learning_rate": 0.0005903881118881118, + "loss": 4.1626, + "step": 2850 + }, + { + "epoch": 0.8445454015958996, + "grad_norm": 0.3703969717025757, + "learning_rate": 0.0005902132867132867, + "loss": 4.1557, + "step": 2900 + }, + { + "epoch": 0.8591065292096219, + "grad_norm": 0.3352505564689636, + "learning_rate": 0.0005900384615384615, + "loss": 4.1426, + "step": 2950 + }, + { + "epoch": 0.8736676568233444, + "grad_norm": 0.3644249141216278, + "learning_rate": 0.0005898636363636363, + "loss": 4.1483, + "step": 3000 + }, + { + "epoch": 0.8736676568233444, + "eval_accuracy": 0.31578797630784283, + "eval_loss": 4.0948615074157715, + "eval_runtime": 179.7252, + "eval_samples_per_second": 92.619, + "eval_steps_per_second": 5.792, + "step": 3000 + }, + { + "epoch": 0.8882287844370668, + "grad_norm": 0.3400101065635681, + "learning_rate": 0.0005896888111888111, + "loss": 4.1436, + "step": 3050 + }, + { + "epoch": 0.9027899120507892, + "grad_norm": 0.3571796417236328, + "learning_rate": 0.000589513986013986, + "loss": 4.1302, + "step": 3100 + }, + { + "epoch": 0.9173510396645116, + "grad_norm": 0.34732452034950256, + "learning_rate": 0.0005893391608391608, + "loss": 4.1203, + "step": 3150 + }, + { + "epoch": 0.931912167278234, + "grad_norm": 0.36288565397262573, + "learning_rate": 0.0005891643356643356, + "loss": 4.1241, + "step": 3200 + }, + { + "epoch": 0.9464732948919564, + "grad_norm": 0.34131136536598206, + "learning_rate": 0.0005889895104895104, + "loss": 4.1136, + "step": 3250 + }, + { + "epoch": 0.9610344225056788, + "grad_norm": 0.35798367857933044, + "learning_rate": 0.0005888146853146853, + "loss": 4.1029, + "step": 3300 + }, + { + "epoch": 0.9755955501194012, + "grad_norm": 0.3709186613559723, + "learning_rate": 0.00058863986013986, + "loss": 4.0891, + "step": 3350 + }, + { + "epoch": 0.9901566777331237, + "grad_norm": 0.3378744423389435, + "learning_rate": 0.0005884650349650349, + "loss": 4.0959, + "step": 3400 + }, + { + "epoch": 1.004659560836391, + "grad_norm": 0.3469085097312927, + "learning_rate": 0.0005882902097902097, + "loss": 4.0733, + "step": 3450 + }, + { + "epoch": 1.0192206884501136, + "grad_norm": 0.3355250954627991, + "learning_rate": 0.0005881153846153845, + "loss": 4.0135, + "step": 3500 + }, + { + "epoch": 1.033781816063836, + "grad_norm": 0.34765860438346863, + "learning_rate": 0.0005879405594405594, + "loss": 4.0131, + "step": 3550 + }, + { + "epoch": 1.0483429436775584, + "grad_norm": 0.3484998941421509, + "learning_rate": 0.0005877657342657342, + "loss": 4.0352, + "step": 3600 + }, + { + "epoch": 1.0629040712912807, + "grad_norm": 0.34341979026794434, + "learning_rate": 0.000587590909090909, + "loss": 4.0047, + "step": 3650 + }, + { + "epoch": 1.0774651989050033, + "grad_norm": 0.36538752913475037, + "learning_rate": 0.0005874160839160838, + "loss": 4.0016, + "step": 3700 + }, + { + "epoch": 1.0920263265187256, + "grad_norm": 0.3458220064640045, + "learning_rate": 0.0005872412587412587, + "loss": 4.0163, + "step": 3750 + }, + { + "epoch": 1.106587454132448, + "grad_norm": 0.3493204414844513, + "learning_rate": 0.0005870664335664335, + "loss": 4.0035, + "step": 3800 + }, + { + "epoch": 1.1211485817461704, + "grad_norm": 0.3274590075016022, + "learning_rate": 0.0005868916083916083, + "loss": 4.0167, + "step": 3850 + }, + { + "epoch": 1.135709709359893, + "grad_norm": 0.3461831510066986, + "learning_rate": 0.0005867167832167831, + "loss": 3.99, + "step": 3900 + }, + { + "epoch": 1.1502708369736152, + "grad_norm": 0.3442121148109436, + "learning_rate": 0.000586541958041958, + "loss": 3.9825, + "step": 3950 + }, + { + "epoch": 1.1648319645873377, + "grad_norm": 0.3337996006011963, + "learning_rate": 0.0005863671328671328, + "loss": 3.9794, + "step": 4000 + }, + { + "epoch": 1.1648319645873377, + "eval_accuracy": 0.32491283320475906, + "eval_loss": 3.989677906036377, + "eval_runtime": 179.7843, + "eval_samples_per_second": 92.589, + "eval_steps_per_second": 5.79, + "step": 4000 + }, + { + "epoch": 1.17939309220106, + "grad_norm": 0.33036714792251587, + "learning_rate": 0.0005861923076923076, + "loss": 3.9821, + "step": 4050 + }, + { + "epoch": 1.1939542198147826, + "grad_norm": 0.33033114671707153, + "learning_rate": 0.0005860174825174824, + "loss": 3.9925, + "step": 4100 + }, + { + "epoch": 1.2085153474285049, + "grad_norm": 0.3445809781551361, + "learning_rate": 0.0005858426573426573, + "loss": 3.9873, + "step": 4150 + }, + { + "epoch": 1.2230764750422272, + "grad_norm": 0.32692384719848633, + "learning_rate": 0.000585667832167832, + "loss": 3.9814, + "step": 4200 + }, + { + "epoch": 1.2376376026559497, + "grad_norm": 0.3487424850463867, + "learning_rate": 0.000585493006993007, + "loss": 3.9712, + "step": 4250 + }, + { + "epoch": 1.2521987302696722, + "grad_norm": 0.345749169588089, + "learning_rate": 0.0005853181818181817, + "loss": 3.9784, + "step": 4300 + }, + { + "epoch": 1.2667598578833945, + "grad_norm": 0.36335498094558716, + "learning_rate": 0.0005851433566433565, + "loss": 3.9808, + "step": 4350 + }, + { + "epoch": 1.2813209854971168, + "grad_norm": 0.31872642040252686, + "learning_rate": 0.0005849685314685315, + "loss": 3.9746, + "step": 4400 + }, + { + "epoch": 1.2958821131108393, + "grad_norm": 0.357146680355072, + "learning_rate": 0.0005847937062937063, + "loss": 3.9645, + "step": 4450 + }, + { + "epoch": 1.3104432407245616, + "grad_norm": 0.325870543718338, + "learning_rate": 0.0005846188811188811, + "loss": 3.9639, + "step": 4500 + }, + { + "epoch": 1.3250043683382842, + "grad_norm": 0.3136429488658905, + "learning_rate": 0.0005844440559440559, + "loss": 3.9582, + "step": 4550 + }, + { + "epoch": 1.3395654959520065, + "grad_norm": 0.35432639718055725, + "learning_rate": 0.0005842692307692308, + "loss": 3.9456, + "step": 4600 + }, + { + "epoch": 1.354126623565729, + "grad_norm": 0.3514183759689331, + "learning_rate": 0.0005840944055944056, + "loss": 3.9475, + "step": 4650 + }, + { + "epoch": 1.3686877511794513, + "grad_norm": 0.33868497610092163, + "learning_rate": 0.0005839195804195804, + "loss": 3.9486, + "step": 4700 + }, + { + "epoch": 1.3832488787931738, + "grad_norm": 0.3391216993331909, + "learning_rate": 0.0005837447552447552, + "loss": 3.9525, + "step": 4750 + }, + { + "epoch": 1.3978100064068961, + "grad_norm": 0.34010815620422363, + "learning_rate": 0.0005835699300699301, + "loss": 3.947, + "step": 4800 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 0.3243875205516815, + "learning_rate": 0.0005833951048951048, + "loss": 3.9515, + "step": 4850 + }, + { + "epoch": 1.426932261634341, + "grad_norm": 0.35085731744766235, + "learning_rate": 0.0005832202797202797, + "loss": 3.9402, + "step": 4900 + }, + { + "epoch": 1.4414933892480635, + "grad_norm": 0.34375637769699097, + "learning_rate": 0.0005830454545454546, + "loss": 3.9424, + "step": 4950 + }, + { + "epoch": 1.4560545168617858, + "grad_norm": 0.3360918164253235, + "learning_rate": 0.0005828706293706293, + "loss": 3.946, + "step": 5000 + }, + { + "epoch": 1.4560545168617858, + "eval_accuracy": 0.3315629972163526, + "eval_loss": 3.9168527126312256, + "eval_runtime": 179.6234, + "eval_samples_per_second": 92.672, + "eval_steps_per_second": 5.795, + "step": 5000 + }, + { + "epoch": 1.470615644475508, + "grad_norm": 0.3384229838848114, + "learning_rate": 0.0005826958041958042, + "loss": 3.9282, + "step": 5050 + }, + { + "epoch": 1.4851767720892306, + "grad_norm": 0.3160015642642975, + "learning_rate": 0.000582520979020979, + "loss": 3.9223, + "step": 5100 + }, + { + "epoch": 1.4997378997029531, + "grad_norm": 0.31337279081344604, + "learning_rate": 0.0005823461538461538, + "loss": 3.9139, + "step": 5150 + }, + { + "epoch": 1.5142990273166754, + "grad_norm": 0.3430108428001404, + "learning_rate": 0.0005821713286713286, + "loss": 3.9192, + "step": 5200 + }, + { + "epoch": 1.5288601549303977, + "grad_norm": 0.32244783639907837, + "learning_rate": 0.0005819965034965035, + "loss": 3.9181, + "step": 5250 + }, + { + "epoch": 1.5434212825441203, + "grad_norm": 0.32754674553871155, + "learning_rate": 0.0005818216783216783, + "loss": 3.9076, + "step": 5300 + }, + { + "epoch": 1.5579824101578428, + "grad_norm": 0.3257962167263031, + "learning_rate": 0.0005816468531468531, + "loss": 3.9091, + "step": 5350 + }, + { + "epoch": 1.572543537771565, + "grad_norm": 0.319021999835968, + "learning_rate": 0.0005814720279720279, + "loss": 3.8997, + "step": 5400 + }, + { + "epoch": 1.5871046653852874, + "grad_norm": 0.34583571553230286, + "learning_rate": 0.0005812972027972028, + "loss": 3.9082, + "step": 5450 + }, + { + "epoch": 1.6016657929990097, + "grad_norm": 0.31768912076950073, + "learning_rate": 0.0005811223776223776, + "loss": 3.9119, + "step": 5500 + }, + { + "epoch": 1.6162269206127322, + "grad_norm": 0.30981358885765076, + "learning_rate": 0.0005809475524475524, + "loss": 3.8991, + "step": 5550 + }, + { + "epoch": 1.6307880482264547, + "grad_norm": 0.3583605885505676, + "learning_rate": 0.0005807727272727272, + "loss": 3.8898, + "step": 5600 + }, + { + "epoch": 1.645349175840177, + "grad_norm": 0.35432425141334534, + "learning_rate": 0.0005805979020979021, + "loss": 3.9067, + "step": 5650 + }, + { + "epoch": 1.6599103034538993, + "grad_norm": 0.32656440138816833, + "learning_rate": 0.0005804230769230769, + "loss": 3.8878, + "step": 5700 + }, + { + "epoch": 1.6744714310676219, + "grad_norm": 0.32895249128341675, + "learning_rate": 0.0005802482517482517, + "loss": 3.8858, + "step": 5750 + }, + { + "epoch": 1.6890325586813444, + "grad_norm": 0.3573879897594452, + "learning_rate": 0.0005800734265734265, + "loss": 3.8995, + "step": 5800 + }, + { + "epoch": 1.7035936862950667, + "grad_norm": 0.3116515278816223, + "learning_rate": 0.0005798986013986013, + "loss": 3.8855, + "step": 5850 + }, + { + "epoch": 1.718154813908789, + "grad_norm": 0.32921165227890015, + "learning_rate": 0.0005797237762237762, + "loss": 3.8858, + "step": 5900 + }, + { + "epoch": 1.7327159415225115, + "grad_norm": 0.32322996854782104, + "learning_rate": 0.000579548951048951, + "loss": 3.8747, + "step": 5950 + }, + { + "epoch": 1.747277069136234, + "grad_norm": 0.3198484778404236, + "learning_rate": 0.0005793741258741258, + "loss": 3.8796, + "step": 6000 + }, + { + "epoch": 1.747277069136234, + "eval_accuracy": 0.33665428105410394, + "eval_loss": 3.859868049621582, + "eval_runtime": 179.7598, + "eval_samples_per_second": 92.601, + "eval_steps_per_second": 5.791, + "step": 6000 + }, + { + "epoch": 1.7618381967499563, + "grad_norm": 0.32858818769454956, + "learning_rate": 0.0005791993006993006, + "loss": 3.8737, + "step": 6050 + }, + { + "epoch": 1.7763993243636786, + "grad_norm": 0.31307506561279297, + "learning_rate": 0.0005790244755244755, + "loss": 3.8731, + "step": 6100 + }, + { + "epoch": 1.7909604519774012, + "grad_norm": 0.32378000020980835, + "learning_rate": 0.0005788496503496503, + "loss": 3.8751, + "step": 6150 + }, + { + "epoch": 1.8055215795911237, + "grad_norm": 0.3218482434749603, + "learning_rate": 0.0005786748251748251, + "loss": 3.8731, + "step": 6200 + }, + { + "epoch": 1.820082707204846, + "grad_norm": 0.3510587215423584, + "learning_rate": 0.0005784999999999999, + "loss": 3.8621, + "step": 6250 + }, + { + "epoch": 1.8346438348185683, + "grad_norm": 0.32646113634109497, + "learning_rate": 0.0005783251748251748, + "loss": 3.8652, + "step": 6300 + }, + { + "epoch": 1.8492049624322906, + "grad_norm": 0.34067031741142273, + "learning_rate": 0.0005781503496503496, + "loss": 3.8638, + "step": 6350 + }, + { + "epoch": 1.8637660900460131, + "grad_norm": 0.327680766582489, + "learning_rate": 0.0005779755244755244, + "loss": 3.8617, + "step": 6400 + }, + { + "epoch": 1.8783272176597356, + "grad_norm": 0.31625163555145264, + "learning_rate": 0.0005778006993006993, + "loss": 3.8561, + "step": 6450 + }, + { + "epoch": 1.892888345273458, + "grad_norm": 0.312741219997406, + "learning_rate": 0.000577625874125874, + "loss": 3.842, + "step": 6500 + }, + { + "epoch": 1.9074494728871803, + "grad_norm": 0.32632362842559814, + "learning_rate": 0.0005774510489510489, + "loss": 3.8528, + "step": 6550 + }, + { + "epoch": 1.9220106005009028, + "grad_norm": 0.32156306505203247, + "learning_rate": 0.0005772762237762237, + "loss": 3.8587, + "step": 6600 + }, + { + "epoch": 1.9365717281146253, + "grad_norm": 0.3177630305290222, + "learning_rate": 0.0005771013986013985, + "loss": 3.8592, + "step": 6650 + }, + { + "epoch": 1.9511328557283476, + "grad_norm": 0.3381432890892029, + "learning_rate": 0.0005769265734265733, + "loss": 3.8487, + "step": 6700 + }, + { + "epoch": 1.96569398334207, + "grad_norm": 0.31193795800209045, + "learning_rate": 0.0005767517482517482, + "loss": 3.8599, + "step": 6750 + }, + { + "epoch": 1.9802551109557924, + "grad_norm": 0.33586713671684265, + "learning_rate": 0.000576576923076923, + "loss": 3.8439, + "step": 6800 + }, + { + "epoch": 1.994816238569515, + "grad_norm": 0.3259575068950653, + "learning_rate": 0.0005764020979020978, + "loss": 3.8482, + "step": 6850 + }, + { + "epoch": 2.009319121672782, + "grad_norm": 0.3125501275062561, + "learning_rate": 0.0005762272727272726, + "loss": 3.7815, + "step": 6900 + }, + { + "epoch": 2.023880249286505, + "grad_norm": 0.3336809575557709, + "learning_rate": 0.0005760524475524475, + "loss": 3.7473, + "step": 6950 + }, + { + "epoch": 2.038441376900227, + "grad_norm": 0.3166639804840088, + "learning_rate": 0.0005758776223776223, + "loss": 3.7474, + "step": 7000 + }, + { + "epoch": 2.038441376900227, + "eval_accuracy": 0.34135384628406934, + "eval_loss": 3.8145618438720703, + "eval_runtime": 179.8334, + "eval_samples_per_second": 92.563, + "eval_steps_per_second": 5.789, + "step": 7000 + }, + { + "epoch": 2.0530025045139495, + "grad_norm": 0.3304164409637451, + "learning_rate": 0.0005757027972027971, + "loss": 3.7559, + "step": 7050 + }, + { + "epoch": 2.067563632127672, + "grad_norm": 0.35328182578086853, + "learning_rate": 0.000575527972027972, + "loss": 3.741, + "step": 7100 + }, + { + "epoch": 2.0821247597413945, + "grad_norm": 0.3486672043800354, + "learning_rate": 0.0005753531468531468, + "loss": 3.751, + "step": 7150 + }, + { + "epoch": 2.096685887355117, + "grad_norm": 0.32075631618499756, + "learning_rate": 0.0005751783216783216, + "loss": 3.7516, + "step": 7200 + }, + { + "epoch": 2.111247014968839, + "grad_norm": 0.3235573172569275, + "learning_rate": 0.0005750034965034964, + "loss": 3.7561, + "step": 7250 + }, + { + "epoch": 2.1258081425825615, + "grad_norm": 0.32960283756256104, + "learning_rate": 0.0005748286713286712, + "loss": 3.7471, + "step": 7300 + }, + { + "epoch": 2.140369270196284, + "grad_norm": 0.3249431848526001, + "learning_rate": 0.000574653846153846, + "loss": 3.7479, + "step": 7350 + }, + { + "epoch": 2.1549303978100065, + "grad_norm": 0.32068416476249695, + "learning_rate": 0.000574479020979021, + "loss": 3.7515, + "step": 7400 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 0.35874906182289124, + "learning_rate": 0.0005743041958041958, + "loss": 3.7665, + "step": 7450 + }, + { + "epoch": 2.184052653037451, + "grad_norm": 0.34327706694602966, + "learning_rate": 0.0005741293706293706, + "loss": 3.7511, + "step": 7500 + }, + { + "epoch": 2.198613780651174, + "grad_norm": 0.3151525855064392, + "learning_rate": 0.0005739545454545454, + "loss": 3.7454, + "step": 7550 + }, + { + "epoch": 2.213174908264896, + "grad_norm": 0.3023368716239929, + "learning_rate": 0.0005737797202797203, + "loss": 3.7624, + "step": 7600 + }, + { + "epoch": 2.2277360358786185, + "grad_norm": 0.3228301703929901, + "learning_rate": 0.0005736048951048951, + "loss": 3.7529, + "step": 7650 + }, + { + "epoch": 2.2422971634923408, + "grad_norm": 0.33145347237586975, + "learning_rate": 0.0005734300699300699, + "loss": 3.76, + "step": 7700 + }, + { + "epoch": 2.256858291106063, + "grad_norm": 0.31790366768836975, + "learning_rate": 0.0005732552447552448, + "loss": 3.7657, + "step": 7750 + }, + { + "epoch": 2.271419418719786, + "grad_norm": 0.32009178400039673, + "learning_rate": 0.0005730804195804196, + "loss": 3.7592, + "step": 7800 + }, + { + "epoch": 2.285980546333508, + "grad_norm": 0.31966885924339294, + "learning_rate": 0.0005729055944055944, + "loss": 3.7606, + "step": 7850 + }, + { + "epoch": 2.3005416739472304, + "grad_norm": 0.3291054368019104, + "learning_rate": 0.0005727307692307692, + "loss": 3.7479, + "step": 7900 + }, + { + "epoch": 2.3151028015609527, + "grad_norm": 0.33194002509117126, + "learning_rate": 0.0005725559440559441, + "loss": 3.757, + "step": 7950 + }, + { + "epoch": 2.3296639291746755, + "grad_norm": 0.30678218603134155, + "learning_rate": 0.0005723811188811188, + "loss": 3.7545, + "step": 8000 + }, + { + "epoch": 2.3296639291746755, + "eval_accuracy": 0.34440224469340025, + "eval_loss": 3.782811164855957, + "eval_runtime": 179.7459, + "eval_samples_per_second": 92.609, + "eval_steps_per_second": 5.792, + "step": 8000 + }, + { + "epoch": 2.3442250567883978, + "grad_norm": 0.31450313329696655, + "learning_rate": 0.0005722062937062937, + "loss": 3.7648, + "step": 8050 + }, + { + "epoch": 2.35878618440212, + "grad_norm": 0.3125315308570862, + "learning_rate": 0.0005720314685314685, + "loss": 3.7461, + "step": 8100 + }, + { + "epoch": 2.3733473120158424, + "grad_norm": 0.3463304936885834, + "learning_rate": 0.0005718566433566433, + "loss": 3.7542, + "step": 8150 + }, + { + "epoch": 2.387908439629565, + "grad_norm": 0.3375414311885834, + "learning_rate": 0.0005716818181818181, + "loss": 3.7424, + "step": 8200 + }, + { + "epoch": 2.4024695672432874, + "grad_norm": 0.3216915428638458, + "learning_rate": 0.000571506993006993, + "loss": 3.7559, + "step": 8250 + }, + { + "epoch": 2.4170306948570097, + "grad_norm": 0.37400275468826294, + "learning_rate": 0.0005713321678321678, + "loss": 3.7556, + "step": 8300 + }, + { + "epoch": 2.431591822470732, + "grad_norm": 0.3273051977157593, + "learning_rate": 0.0005711573426573426, + "loss": 3.7541, + "step": 8350 + }, + { + "epoch": 2.4461529500844543, + "grad_norm": 0.31118476390838623, + "learning_rate": 0.0005709825174825175, + "loss": 3.7479, + "step": 8400 + }, + { + "epoch": 2.460714077698177, + "grad_norm": 0.33436667919158936, + "learning_rate": 0.0005708076923076923, + "loss": 3.7398, + "step": 8450 + }, + { + "epoch": 2.4752752053118994, + "grad_norm": 0.32443201541900635, + "learning_rate": 0.0005706328671328671, + "loss": 3.7483, + "step": 8500 + }, + { + "epoch": 2.4898363329256217, + "grad_norm": 0.3430940806865692, + "learning_rate": 0.0005704580419580419, + "loss": 3.75, + "step": 8550 + }, + { + "epoch": 2.5043974605393444, + "grad_norm": 0.31686174869537354, + "learning_rate": 0.0005702832167832168, + "loss": 3.7418, + "step": 8600 + }, + { + "epoch": 2.5189585881530667, + "grad_norm": 0.3173408508300781, + "learning_rate": 0.0005701083916083916, + "loss": 3.7437, + "step": 8650 + }, + { + "epoch": 2.533519715766789, + "grad_norm": 0.3175743818283081, + "learning_rate": 0.0005699335664335664, + "loss": 3.7417, + "step": 8700 + }, + { + "epoch": 2.5480808433805113, + "grad_norm": 0.3153781592845917, + "learning_rate": 0.0005697587412587412, + "loss": 3.7459, + "step": 8750 + }, + { + "epoch": 2.5626419709942336, + "grad_norm": 0.3198295831680298, + "learning_rate": 0.000569583916083916, + "loss": 3.7524, + "step": 8800 + }, + { + "epoch": 2.5772030986079564, + "grad_norm": 0.31497374176979065, + "learning_rate": 0.0005694090909090908, + "loss": 3.7366, + "step": 8850 + }, + { + "epoch": 2.5917642262216787, + "grad_norm": 0.3190245032310486, + "learning_rate": 0.0005692342657342657, + "loss": 3.7408, + "step": 8900 + }, + { + "epoch": 2.606325353835401, + "grad_norm": 0.3084900975227356, + "learning_rate": 0.0005690594405594405, + "loss": 3.7355, + "step": 8950 + }, + { + "epoch": 2.6208864814491233, + "grad_norm": 0.3053756356239319, + "learning_rate": 0.0005688846153846153, + "loss": 3.7487, + "step": 9000 + }, + { + "epoch": 2.6208864814491233, + "eval_accuracy": 0.34699638118781967, + "eval_loss": 3.7568321228027344, + "eval_runtime": 179.8658, + "eval_samples_per_second": 92.547, + "eval_steps_per_second": 5.788, + "step": 9000 + }, + { + "epoch": 2.6354476090628456, + "grad_norm": 0.3176893889904022, + "learning_rate": 0.0005687097902097901, + "loss": 3.7455, + "step": 9050 + }, + { + "epoch": 2.6500087366765683, + "grad_norm": 0.3208650052547455, + "learning_rate": 0.000568534965034965, + "loss": 3.7457, + "step": 9100 + }, + { + "epoch": 2.6645698642902906, + "grad_norm": 0.3182576596736908, + "learning_rate": 0.0005683601398601398, + "loss": 3.7312, + "step": 9150 + }, + { + "epoch": 2.679130991904013, + "grad_norm": 0.31629255414009094, + "learning_rate": 0.0005681853146853146, + "loss": 3.7155, + "step": 9200 + }, + { + "epoch": 2.6936921195177357, + "grad_norm": 0.33148428797721863, + "learning_rate": 0.0005680104895104895, + "loss": 3.7379, + "step": 9250 + }, + { + "epoch": 2.708253247131458, + "grad_norm": 0.3020288646221161, + "learning_rate": 0.0005678356643356643, + "loss": 3.7264, + "step": 9300 + }, + { + "epoch": 2.7228143747451803, + "grad_norm": 0.34346917271614075, + "learning_rate": 0.0005676608391608391, + "loss": 3.7374, + "step": 9350 + }, + { + "epoch": 2.7373755023589026, + "grad_norm": 0.31063133478164673, + "learning_rate": 0.0005674860139860139, + "loss": 3.7298, + "step": 9400 + }, + { + "epoch": 2.751936629972625, + "grad_norm": 0.31841859221458435, + "learning_rate": 0.0005673111888111888, + "loss": 3.7237, + "step": 9450 + }, + { + "epoch": 2.7664977575863476, + "grad_norm": 0.3212113082408905, + "learning_rate": 0.0005671363636363635, + "loss": 3.7389, + "step": 9500 + }, + { + "epoch": 2.78105888520007, + "grad_norm": 0.319784551858902, + "learning_rate": 0.0005669615384615384, + "loss": 3.7401, + "step": 9550 + }, + { + "epoch": 2.7956200128137922, + "grad_norm": 0.31253302097320557, + "learning_rate": 0.0005667867132867132, + "loss": 3.7299, + "step": 9600 + }, + { + "epoch": 2.8101811404275145, + "grad_norm": 0.3241884708404541, + "learning_rate": 0.000566611888111888, + "loss": 3.7281, + "step": 9650 + }, + { + "epoch": 2.824742268041237, + "grad_norm": 0.3327905833721161, + "learning_rate": 0.0005664370629370628, + "loss": 3.7403, + "step": 9700 + }, + { + "epoch": 2.8393033956549596, + "grad_norm": 0.33363252878189087, + "learning_rate": 0.0005662622377622377, + "loss": 3.7429, + "step": 9750 + }, + { + "epoch": 2.853864523268682, + "grad_norm": 0.3250058591365814, + "learning_rate": 0.0005660874125874125, + "loss": 3.7313, + "step": 9800 + }, + { + "epoch": 2.868425650882404, + "grad_norm": 0.3366358280181885, + "learning_rate": 0.0005659125874125873, + "loss": 3.732, + "step": 9850 + }, + { + "epoch": 2.882986778496127, + "grad_norm": 0.3395000100135803, + "learning_rate": 0.0005657377622377622, + "loss": 3.7283, + "step": 9900 + }, + { + "epoch": 2.8975479061098492, + "grad_norm": 0.30396348237991333, + "learning_rate": 0.000565562937062937, + "loss": 3.7282, + "step": 9950 + }, + { + "epoch": 2.9121090337235715, + "grad_norm": 0.310280442237854, + "learning_rate": 0.0005653881118881118, + "loss": 3.7222, + "step": 10000 + }, + { + "epoch": 2.9121090337235715, + "eval_accuracy": 0.3495220962447447, + "eval_loss": 3.729001045227051, + "eval_runtime": 179.9042, + "eval_samples_per_second": 92.527, + "eval_steps_per_second": 5.786, + "step": 10000 + }, + { + "epoch": 2.926670161337294, + "grad_norm": 0.3250355124473572, + "learning_rate": 0.0005652132867132866, + "loss": 3.7166, + "step": 10050 + }, + { + "epoch": 2.941231288951016, + "grad_norm": 0.30567246675491333, + "learning_rate": 0.0005650384615384615, + "loss": 3.7325, + "step": 10100 + }, + { + "epoch": 2.955792416564739, + "grad_norm": 0.34791237115859985, + "learning_rate": 0.0005648636363636363, + "loss": 3.7056, + "step": 10150 + }, + { + "epoch": 2.970353544178461, + "grad_norm": 0.31332409381866455, + "learning_rate": 0.0005646888111888111, + "loss": 3.7251, + "step": 10200 + }, + { + "epoch": 2.9849146717921835, + "grad_norm": 0.2971247136592865, + "learning_rate": 0.000564513986013986, + "loss": 3.7126, + "step": 10250 + }, + { + "epoch": 2.9994757994059063, + "grad_norm": 0.32203900814056396, + "learning_rate": 0.0005643391608391607, + "loss": 3.7198, + "step": 10300 + }, + { + "epoch": 3.0139786825091734, + "grad_norm": 0.3143203854560852, + "learning_rate": 0.0005641643356643355, + "loss": 3.6176, + "step": 10350 + }, + { + "epoch": 3.0285398101228957, + "grad_norm": 0.33899393677711487, + "learning_rate": 0.0005639895104895105, + "loss": 3.6184, + "step": 10400 + }, + { + "epoch": 3.0431009377366185, + "grad_norm": 0.33629149198532104, + "learning_rate": 0.0005638146853146853, + "loss": 3.6218, + "step": 10450 + }, + { + "epoch": 3.057662065350341, + "grad_norm": 0.33977800607681274, + "learning_rate": 0.0005636398601398601, + "loss": 3.6169, + "step": 10500 + }, + { + "epoch": 3.072223192964063, + "grad_norm": 0.3242505192756653, + "learning_rate": 0.000563465034965035, + "loss": 3.6248, + "step": 10550 + }, + { + "epoch": 3.0867843205777854, + "grad_norm": 0.33569052815437317, + "learning_rate": 0.0005632902097902098, + "loss": 3.6438, + "step": 10600 + }, + { + "epoch": 3.101345448191508, + "grad_norm": 0.3249237835407257, + "learning_rate": 0.0005631153846153846, + "loss": 3.6286, + "step": 10650 + }, + { + "epoch": 3.1159065758052304, + "grad_norm": 0.3126699924468994, + "learning_rate": 0.0005629405594405594, + "loss": 3.6282, + "step": 10700 + }, + { + "epoch": 3.1304677034189528, + "grad_norm": 0.3072546720504761, + "learning_rate": 0.0005627657342657343, + "loss": 3.6303, + "step": 10750 + }, + { + "epoch": 3.145028831032675, + "grad_norm": 0.30215486884117126, + "learning_rate": 0.0005625909090909091, + "loss": 3.6246, + "step": 10800 + }, + { + "epoch": 3.1595899586463974, + "grad_norm": 0.30103379487991333, + "learning_rate": 0.0005624160839160839, + "loss": 3.632, + "step": 10850 + }, + { + "epoch": 3.17415108626012, + "grad_norm": 0.40593844652175903, + "learning_rate": 0.0005622412587412587, + "loss": 3.6411, + "step": 10900 + }, + { + "epoch": 3.1887122138738424, + "grad_norm": 0.30845344066619873, + "learning_rate": 0.0005620664335664336, + "loss": 3.6405, + "step": 10950 + }, + { + "epoch": 3.2032733414875647, + "grad_norm": 0.31571993231773376, + "learning_rate": 0.0005618916083916083, + "loss": 3.6434, + "step": 11000 + }, + { + "epoch": 3.2032733414875647, + "eval_accuracy": 0.3514291968616427, + "eval_loss": 3.715721607208252, + "eval_runtime": 179.7371, + "eval_samples_per_second": 92.613, + "eval_steps_per_second": 5.792, + "step": 11000 + }, + { + "epoch": 3.217834469101287, + "grad_norm": 0.3318782448768616, + "learning_rate": 0.0005617167832167832, + "loss": 3.6317, + "step": 11050 + }, + { + "epoch": 3.2323955967150098, + "grad_norm": 0.33287033438682556, + "learning_rate": 0.000561541958041958, + "loss": 3.6503, + "step": 11100 + }, + { + "epoch": 3.246956724328732, + "grad_norm": 0.3447157144546509, + "learning_rate": 0.0005613671328671328, + "loss": 3.6464, + "step": 11150 + }, + { + "epoch": 3.2615178519424544, + "grad_norm": 0.31866371631622314, + "learning_rate": 0.0005611923076923077, + "loss": 3.6459, + "step": 11200 + }, + { + "epoch": 3.2760789795561767, + "grad_norm": 0.3190111517906189, + "learning_rate": 0.0005610174825174825, + "loss": 3.6375, + "step": 11250 + }, + { + "epoch": 3.2906401071698994, + "grad_norm": 0.3384534418582916, + "learning_rate": 0.0005608426573426573, + "loss": 3.6297, + "step": 11300 + }, + { + "epoch": 3.3052012347836217, + "grad_norm": 0.3122884929180145, + "learning_rate": 0.0005606678321678321, + "loss": 3.6488, + "step": 11350 + }, + { + "epoch": 3.319762362397344, + "grad_norm": 0.3280264139175415, + "learning_rate": 0.000560493006993007, + "loss": 3.6409, + "step": 11400 + }, + { + "epoch": 3.3343234900110663, + "grad_norm": 0.3291660249233246, + "learning_rate": 0.0005603181818181818, + "loss": 3.6371, + "step": 11450 + }, + { + "epoch": 3.3488846176247886, + "grad_norm": 0.3122524619102478, + "learning_rate": 0.0005601433566433566, + "loss": 3.6437, + "step": 11500 + }, + { + "epoch": 3.3634457452385114, + "grad_norm": 0.3195066452026367, + "learning_rate": 0.0005599685314685314, + "loss": 3.654, + "step": 11550 + }, + { + "epoch": 3.3780068728522337, + "grad_norm": 0.32396697998046875, + "learning_rate": 0.0005597937062937063, + "loss": 3.6451, + "step": 11600 + }, + { + "epoch": 3.392568000465956, + "grad_norm": 0.31407713890075684, + "learning_rate": 0.0005596188811188811, + "loss": 3.6336, + "step": 11650 + }, + { + "epoch": 3.4071291280796787, + "grad_norm": 0.31519898772239685, + "learning_rate": 0.0005594440559440559, + "loss": 3.6432, + "step": 11700 + }, + { + "epoch": 3.421690255693401, + "grad_norm": 0.33295854926109314, + "learning_rate": 0.0005592692307692307, + "loss": 3.6271, + "step": 11750 + }, + { + "epoch": 3.4362513833071233, + "grad_norm": 0.3175846338272095, + "learning_rate": 0.0005590944055944055, + "loss": 3.6419, + "step": 11800 + }, + { + "epoch": 3.4508125109208456, + "grad_norm": 0.3179056942462921, + "learning_rate": 0.0005589195804195803, + "loss": 3.649, + "step": 11850 + }, + { + "epoch": 3.465373638534568, + "grad_norm": 0.31343457102775574, + "learning_rate": 0.0005587447552447552, + "loss": 3.6439, + "step": 11900 + }, + { + "epoch": 3.4799347661482907, + "grad_norm": 0.3348383903503418, + "learning_rate": 0.00055856993006993, + "loss": 3.6389, + "step": 11950 + }, + { + "epoch": 3.494495893762013, + "grad_norm": 0.33012107014656067, + "learning_rate": 0.0005583951048951048, + "loss": 3.6493, + "step": 12000 + }, + { + "epoch": 3.494495893762013, + "eval_accuracy": 0.3528794491862669, + "eval_loss": 3.7000977993011475, + "eval_runtime": 179.7672, + "eval_samples_per_second": 92.598, + "eval_steps_per_second": 5.791, + "step": 12000 + }, + { + "epoch": 3.5090570213757353, + "grad_norm": 0.30933046340942383, + "learning_rate": 0.0005582202797202797, + "loss": 3.6413, + "step": 12050 + }, + { + "epoch": 3.523618148989458, + "grad_norm": 0.3057238757610321, + "learning_rate": 0.0005580454545454545, + "loss": 3.6378, + "step": 12100 + }, + { + "epoch": 3.53817927660318, + "grad_norm": 0.3380361497402191, + "learning_rate": 0.0005578706293706293, + "loss": 3.6462, + "step": 12150 + }, + { + "epoch": 3.5527404042169026, + "grad_norm": 0.32907187938690186, + "learning_rate": 0.0005576958041958041, + "loss": 3.6464, + "step": 12200 + }, + { + "epoch": 3.567301531830625, + "grad_norm": 0.3162597417831421, + "learning_rate": 0.000557520979020979, + "loss": 3.656, + "step": 12250 + }, + { + "epoch": 3.5818626594443472, + "grad_norm": 0.3106593191623688, + "learning_rate": 0.0005573461538461538, + "loss": 3.6515, + "step": 12300 + }, + { + "epoch": 3.59642378705807, + "grad_norm": 0.29408252239227295, + "learning_rate": 0.0005571713286713286, + "loss": 3.6489, + "step": 12350 + }, + { + "epoch": 3.6109849146717923, + "grad_norm": 0.3639216721057892, + "learning_rate": 0.0005569965034965034, + "loss": 3.6424, + "step": 12400 + }, + { + "epoch": 3.6255460422855146, + "grad_norm": 0.31863993406295776, + "learning_rate": 0.0005568216783216783, + "loss": 3.6446, + "step": 12450 + }, + { + "epoch": 3.640107169899237, + "grad_norm": 0.3066108822822571, + "learning_rate": 0.000556646853146853, + "loss": 3.6432, + "step": 12500 + }, + { + "epoch": 3.654668297512959, + "grad_norm": 0.30826711654663086, + "learning_rate": 0.0005564720279720279, + "loss": 3.6457, + "step": 12550 + }, + { + "epoch": 3.669229425126682, + "grad_norm": 0.3210170567035675, + "learning_rate": 0.0005562972027972027, + "loss": 3.6411, + "step": 12600 + }, + { + "epoch": 3.6837905527404042, + "grad_norm": 0.31402987241744995, + "learning_rate": 0.0005561223776223775, + "loss": 3.6542, + "step": 12650 + }, + { + "epoch": 3.6983516803541265, + "grad_norm": 0.33224406838417053, + "learning_rate": 0.0005559475524475524, + "loss": 3.6385, + "step": 12700 + }, + { + "epoch": 3.7129128079678493, + "grad_norm": 0.3081912398338318, + "learning_rate": 0.0005557727272727272, + "loss": 3.6361, + "step": 12750 + }, + { + "epoch": 3.7274739355815716, + "grad_norm": 0.31198635697364807, + "learning_rate": 0.000555597902097902, + "loss": 3.6456, + "step": 12800 + }, + { + "epoch": 3.742035063195294, + "grad_norm": 0.31249940395355225, + "learning_rate": 0.0005554230769230768, + "loss": 3.6301, + "step": 12850 + }, + { + "epoch": 3.756596190809016, + "grad_norm": 0.29419270157814026, + "learning_rate": 0.0005552482517482517, + "loss": 3.6319, + "step": 12900 + }, + { + "epoch": 3.7711573184227385, + "grad_norm": 0.3123679459095001, + "learning_rate": 0.0005550734265734265, + "loss": 3.6439, + "step": 12950 + }, + { + "epoch": 3.7857184460364612, + "grad_norm": 0.3085649013519287, + "learning_rate": 0.0005548986013986013, + "loss": 3.6429, + "step": 13000 + }, + { + "epoch": 3.7857184460364612, + "eval_accuracy": 0.3545970388800704, + "eval_loss": 3.6809747219085693, + "eval_runtime": 179.6902, + "eval_samples_per_second": 92.637, + "eval_steps_per_second": 5.793, + "step": 13000 + }, + { + "epoch": 3.8002795736501835, + "grad_norm": 0.3226883113384247, + "learning_rate": 0.0005547237762237761, + "loss": 3.644, + "step": 13050 + }, + { + "epoch": 3.814840701263906, + "grad_norm": 0.32543593645095825, + "learning_rate": 0.000554548951048951, + "loss": 3.6455, + "step": 13100 + }, + { + "epoch": 3.829401828877628, + "grad_norm": 0.313363254070282, + "learning_rate": 0.0005543741258741258, + "loss": 3.647, + "step": 13150 + }, + { + "epoch": 3.8439629564913504, + "grad_norm": 0.3085945248603821, + "learning_rate": 0.0005541993006993006, + "loss": 3.6409, + "step": 13200 + }, + { + "epoch": 3.858524084105073, + "grad_norm": 0.32422712445259094, + "learning_rate": 0.0005540244755244756, + "loss": 3.6415, + "step": 13250 + }, + { + "epoch": 3.8730852117187955, + "grad_norm": 0.31334224343299866, + "learning_rate": 0.0005538496503496502, + "loss": 3.6376, + "step": 13300 + }, + { + "epoch": 3.887646339332518, + "grad_norm": 0.3215864598751068, + "learning_rate": 0.0005536748251748252, + "loss": 3.6382, + "step": 13350 + }, + { + "epoch": 3.9022074669462405, + "grad_norm": 0.32258346676826477, + "learning_rate": 0.0005535, + "loss": 3.6334, + "step": 13400 + }, + { + "epoch": 3.916768594559963, + "grad_norm": 0.32085853815078735, + "learning_rate": 0.0005533251748251748, + "loss": 3.6264, + "step": 13450 + }, + { + "epoch": 3.931329722173685, + "grad_norm": 0.30639684200286865, + "learning_rate": 0.0005531503496503496, + "loss": 3.6552, + "step": 13500 + }, + { + "epoch": 3.9458908497874075, + "grad_norm": 0.31769323348999023, + "learning_rate": 0.0005529755244755245, + "loss": 3.6226, + "step": 13550 + }, + { + "epoch": 3.9604519774011298, + "grad_norm": 0.31194061040878296, + "learning_rate": 0.0005528006993006993, + "loss": 3.6418, + "step": 13600 + }, + { + "epoch": 3.9750131050148525, + "grad_norm": 0.326402485370636, + "learning_rate": 0.0005526258741258741, + "loss": 3.6404, + "step": 13650 + }, + { + "epoch": 3.989574232628575, + "grad_norm": 0.3246409595012665, + "learning_rate": 0.0005524510489510489, + "loss": 3.6294, + "step": 13700 + }, + { + "epoch": 4.004077115731842, + "grad_norm": 0.32423749566078186, + "learning_rate": 0.0005522762237762238, + "loss": 3.6146, + "step": 13750 + }, + { + "epoch": 4.018638243345564, + "grad_norm": 0.311954140663147, + "learning_rate": 0.0005521013986013986, + "loss": 3.5222, + "step": 13800 + }, + { + "epoch": 4.033199370959287, + "grad_norm": 0.31119635701179504, + "learning_rate": 0.0005519265734265734, + "loss": 3.5373, + "step": 13850 + }, + { + "epoch": 4.04776049857301, + "grad_norm": 0.31895068287849426, + "learning_rate": 0.0005517517482517482, + "loss": 3.5214, + "step": 13900 + }, + { + "epoch": 4.062321626186732, + "grad_norm": 0.34818094968795776, + "learning_rate": 0.0005515769230769231, + "loss": 3.5465, + "step": 13950 + }, + { + "epoch": 4.076882753800454, + "grad_norm": 0.33164742588996887, + "learning_rate": 0.0005514020979020979, + "loss": 3.5427, + "step": 14000 + }, + { + "epoch": 4.076882753800454, + "eval_accuracy": 0.3561751993215227, + "eval_loss": 3.6737630367279053, + "eval_runtime": 179.6974, + "eval_samples_per_second": 92.633, + "eval_steps_per_second": 5.793, + "step": 14000 + }, + { + "epoch": 4.091443881414177, + "grad_norm": 0.3311789333820343, + "learning_rate": 0.0005512272727272727, + "loss": 3.5457, + "step": 14050 + }, + { + "epoch": 4.106005009027899, + "grad_norm": 0.3225516378879547, + "learning_rate": 0.0005510524475524475, + "loss": 3.5393, + "step": 14100 + }, + { + "epoch": 4.120566136641622, + "grad_norm": 0.3110713064670563, + "learning_rate": 0.0005508776223776223, + "loss": 3.5559, + "step": 14150 + }, + { + "epoch": 4.135127264255344, + "grad_norm": 0.32352516055107117, + "learning_rate": 0.0005507027972027972, + "loss": 3.557, + "step": 14200 + }, + { + "epoch": 4.149688391869066, + "grad_norm": 0.32771018147468567, + "learning_rate": 0.000550527972027972, + "loss": 3.5614, + "step": 14250 + }, + { + "epoch": 4.164249519482789, + "grad_norm": 0.3170819580554962, + "learning_rate": 0.0005503531468531468, + "loss": 3.5519, + "step": 14300 + }, + { + "epoch": 4.178810647096511, + "grad_norm": 0.3334265947341919, + "learning_rate": 0.0005501783216783216, + "loss": 3.5502, + "step": 14350 + }, + { + "epoch": 4.193371774710234, + "grad_norm": 0.30677902698516846, + "learning_rate": 0.0005500034965034965, + "loss": 3.574, + "step": 14400 + }, + { + "epoch": 4.207932902323956, + "grad_norm": 0.33088985085487366, + "learning_rate": 0.0005498286713286713, + "loss": 3.5655, + "step": 14450 + }, + { + "epoch": 4.222494029937678, + "grad_norm": 0.31959256529808044, + "learning_rate": 0.0005496538461538461, + "loss": 3.5559, + "step": 14500 + }, + { + "epoch": 4.237055157551401, + "grad_norm": 0.31475120782852173, + "learning_rate": 0.0005494790209790209, + "loss": 3.559, + "step": 14550 + }, + { + "epoch": 4.251616285165123, + "grad_norm": 0.3372187912464142, + "learning_rate": 0.0005493041958041958, + "loss": 3.568, + "step": 14600 + }, + { + "epoch": 4.266177412778846, + "grad_norm": 0.3159469962120056, + "learning_rate": 0.0005491293706293706, + "loss": 3.5742, + "step": 14650 + }, + { + "epoch": 4.280738540392568, + "grad_norm": 0.34496167302131653, + "learning_rate": 0.0005489545454545454, + "loss": 3.569, + "step": 14700 + }, + { + "epoch": 4.29529966800629, + "grad_norm": 0.3201475441455841, + "learning_rate": 0.0005487797202797203, + "loss": 3.573, + "step": 14750 + }, + { + "epoch": 4.309860795620013, + "grad_norm": 0.3239315450191498, + "learning_rate": 0.000548604895104895, + "loss": 3.577, + "step": 14800 + }, + { + "epoch": 4.324421923233735, + "grad_norm": 0.30931442975997925, + "learning_rate": 0.0005484300699300699, + "loss": 3.5692, + "step": 14850 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 0.3285701870918274, + "learning_rate": 0.0005482552447552447, + "loss": 3.566, + "step": 14900 + }, + { + "epoch": 4.35354417846118, + "grad_norm": 0.325842022895813, + "learning_rate": 0.0005480804195804195, + "loss": 3.5647, + "step": 14950 + }, + { + "epoch": 4.368105306074902, + "grad_norm": 0.3167710304260254, + "learning_rate": 0.0005479055944055943, + "loss": 3.5735, + "step": 15000 + }, + { + "epoch": 4.368105306074902, + "eval_accuracy": 0.3571037087946, + "eval_loss": 3.659233570098877, + "eval_runtime": 179.796, + "eval_samples_per_second": 92.583, + "eval_steps_per_second": 5.79, + "step": 15000 + }, + { + "epoch": 4.382666433688625, + "grad_norm": 0.3091343939304352, + "learning_rate": 0.0005477307692307692, + "loss": 3.5822, + "step": 15050 + }, + { + "epoch": 4.397227561302348, + "grad_norm": 0.33039334416389465, + "learning_rate": 0.000547555944055944, + "loss": 3.586, + "step": 15100 + }, + { + "epoch": 4.41178868891607, + "grad_norm": 0.30892929434776306, + "learning_rate": 0.0005473811188811188, + "loss": 3.5716, + "step": 15150 + }, + { + "epoch": 4.426349816529792, + "grad_norm": 0.3354114592075348, + "learning_rate": 0.0005472062937062936, + "loss": 3.5646, + "step": 15200 + }, + { + "epoch": 4.440910944143514, + "grad_norm": 0.3432832360267639, + "learning_rate": 0.0005470314685314685, + "loss": 3.5779, + "step": 15250 + }, + { + "epoch": 4.455472071757237, + "grad_norm": 0.3167623281478882, + "learning_rate": 0.0005468566433566433, + "loss": 3.5657, + "step": 15300 + }, + { + "epoch": 4.47003319937096, + "grad_norm": 0.3280886113643646, + "learning_rate": 0.0005466818181818181, + "loss": 3.5732, + "step": 15350 + }, + { + "epoch": 4.4845943269846815, + "grad_norm": 0.3291832208633423, + "learning_rate": 0.000546506993006993, + "loss": 3.5683, + "step": 15400 + }, + { + "epoch": 4.499155454598404, + "grad_norm": 0.31101885437965393, + "learning_rate": 0.0005463321678321678, + "loss": 3.5722, + "step": 15450 + }, + { + "epoch": 4.513716582212126, + "grad_norm": 0.3118363320827484, + "learning_rate": 0.0005461573426573426, + "loss": 3.5855, + "step": 15500 + }, + { + "epoch": 4.528277709825849, + "grad_norm": 0.31627270579338074, + "learning_rate": 0.0005459825174825174, + "loss": 3.5871, + "step": 15550 + }, + { + "epoch": 4.542838837439572, + "grad_norm": 0.32284530997276306, + "learning_rate": 0.0005458076923076922, + "loss": 3.5754, + "step": 15600 + }, + { + "epoch": 4.5573999650532935, + "grad_norm": 0.32503610849380493, + "learning_rate": 0.000545632867132867, + "loss": 3.569, + "step": 15650 + }, + { + "epoch": 4.571961092667016, + "grad_norm": 0.3345843553543091, + "learning_rate": 0.0005454580419580419, + "loss": 3.566, + "step": 15700 + }, + { + "epoch": 4.586522220280738, + "grad_norm": 0.31699925661087036, + "learning_rate": 0.0005452832167832167, + "loss": 3.5757, + "step": 15750 + }, + { + "epoch": 4.601083347894461, + "grad_norm": 0.3411146402359009, + "learning_rate": 0.0005451083916083915, + "loss": 3.5894, + "step": 15800 + }, + { + "epoch": 4.615644475508184, + "grad_norm": 0.31675615906715393, + "learning_rate": 0.0005449335664335663, + "loss": 3.5752, + "step": 15850 + }, + { + "epoch": 4.630205603121905, + "grad_norm": 0.3413219153881073, + "learning_rate": 0.0005447587412587412, + "loss": 3.5711, + "step": 15900 + }, + { + "epoch": 4.644766730735628, + "grad_norm": 0.3177620470523834, + "learning_rate": 0.000544583916083916, + "loss": 3.5798, + "step": 15950 + }, + { + "epoch": 4.659327858349351, + "grad_norm": 0.31724312901496887, + "learning_rate": 0.0005444090909090908, + "loss": 3.5796, + "step": 16000 + }, + { + "epoch": 4.659327858349351, + "eval_accuracy": 0.35869268499593115, + "eval_loss": 3.648486375808716, + "eval_runtime": 179.9066, + "eval_samples_per_second": 92.526, + "eval_steps_per_second": 5.786, + "step": 16000 + }, + { + "epoch": 4.673888985963073, + "grad_norm": 0.32944586873054504, + "learning_rate": 0.0005442342657342657, + "loss": 3.5742, + "step": 16050 + }, + { + "epoch": 4.6884501135767955, + "grad_norm": 0.320095956325531, + "learning_rate": 0.0005440594405594405, + "loss": 3.5843, + "step": 16100 + }, + { + "epoch": 4.703011241190518, + "grad_norm": 0.3284047245979309, + "learning_rate": 0.0005438846153846153, + "loss": 3.566, + "step": 16150 + }, + { + "epoch": 4.71757236880424, + "grad_norm": 0.338379830121994, + "learning_rate": 0.0005437097902097901, + "loss": 3.5667, + "step": 16200 + }, + { + "epoch": 4.732133496417963, + "grad_norm": 0.3109598159790039, + "learning_rate": 0.0005435349650349651, + "loss": 3.5742, + "step": 16250 + }, + { + "epoch": 4.746694624031685, + "grad_norm": 0.30519962310791016, + "learning_rate": 0.0005433601398601397, + "loss": 3.5789, + "step": 16300 + }, + { + "epoch": 4.7612557516454075, + "grad_norm": 0.3150230944156647, + "learning_rate": 0.0005431853146853147, + "loss": 3.5769, + "step": 16350 + }, + { + "epoch": 4.77581687925913, + "grad_norm": 0.29910922050476074, + "learning_rate": 0.0005430104895104895, + "loss": 3.5761, + "step": 16400 + }, + { + "epoch": 4.790378006872852, + "grad_norm": 0.3157634437084198, + "learning_rate": 0.0005428356643356643, + "loss": 3.5709, + "step": 16450 + }, + { + "epoch": 4.804939134486575, + "grad_norm": 0.3214448094367981, + "learning_rate": 0.0005426608391608391, + "loss": 3.5804, + "step": 16500 + }, + { + "epoch": 4.819500262100297, + "grad_norm": 0.31892773509025574, + "learning_rate": 0.000542486013986014, + "loss": 3.5899, + "step": 16550 + }, + { + "epoch": 4.834061389714019, + "grad_norm": 0.3179968595504761, + "learning_rate": 0.0005423111888111888, + "loss": 3.5709, + "step": 16600 + }, + { + "epoch": 4.848622517327742, + "grad_norm": 0.33231818675994873, + "learning_rate": 0.0005421363636363636, + "loss": 3.5737, + "step": 16650 + }, + { + "epoch": 4.863183644941464, + "grad_norm": 0.30390241742134094, + "learning_rate": 0.0005419615384615385, + "loss": 3.572, + "step": 16700 + }, + { + "epoch": 4.877744772555187, + "grad_norm": 0.3263714909553528, + "learning_rate": 0.0005417867132867133, + "loss": 3.5714, + "step": 16750 + }, + { + "epoch": 4.892305900168909, + "grad_norm": 0.31608420610427856, + "learning_rate": 0.0005416118881118881, + "loss": 3.573, + "step": 16800 + }, + { + "epoch": 4.906867027782631, + "grad_norm": 0.3054676353931427, + "learning_rate": 0.0005414370629370629, + "loss": 3.5793, + "step": 16850 + }, + { + "epoch": 4.921428155396354, + "grad_norm": 0.3099980354309082, + "learning_rate": 0.0005412622377622378, + "loss": 3.5697, + "step": 16900 + }, + { + "epoch": 4.935989283010076, + "grad_norm": 0.29981857538223267, + "learning_rate": 0.0005410874125874126, + "loss": 3.5735, + "step": 16950 + }, + { + "epoch": 4.950550410623799, + "grad_norm": 0.3208276033401489, + "learning_rate": 0.0005409125874125874, + "loss": 3.5819, + "step": 17000 + }, + { + "epoch": 4.950550410623799, + "eval_accuracy": 0.3599148658622406, + "eval_loss": 3.634756326675415, + "eval_runtime": 179.7751, + "eval_samples_per_second": 92.593, + "eval_steps_per_second": 5.791, + "step": 17000 + }, + { + "epoch": 4.9651115382375215, + "grad_norm": 0.310529500246048, + "learning_rate": 0.0005407377622377622, + "loss": 3.5832, + "step": 17050 + }, + { + "epoch": 4.979672665851243, + "grad_norm": 0.32999780774116516, + "learning_rate": 0.000540562937062937, + "loss": 3.5711, + "step": 17100 + }, + { + "epoch": 4.994233793464966, + "grad_norm": 0.3354627192020416, + "learning_rate": 0.0005403881118881118, + "loss": 3.5734, + "step": 17150 + }, + { + "epoch": 5.008736676568233, + "grad_norm": 0.35508137941360474, + "learning_rate": 0.0005402132867132867, + "loss": 3.5155, + "step": 17200 + }, + { + "epoch": 5.023297804181956, + "grad_norm": 0.31227484345436096, + "learning_rate": 0.0005400384615384615, + "loss": 3.4713, + "step": 17250 + }, + { + "epoch": 5.037858931795678, + "grad_norm": 0.31459367275238037, + "learning_rate": 0.0005398636363636363, + "loss": 3.48, + "step": 17300 + }, + { + "epoch": 5.052420059409401, + "grad_norm": 0.31045621633529663, + "learning_rate": 0.0005396888111888111, + "loss": 3.4772, + "step": 17350 + }, + { + "epoch": 5.066981187023123, + "grad_norm": 0.3227365016937256, + "learning_rate": 0.000539513986013986, + "loss": 3.4702, + "step": 17400 + }, + { + "epoch": 5.081542314636845, + "grad_norm": 0.30600887537002563, + "learning_rate": 0.0005393391608391608, + "loss": 3.4776, + "step": 17450 + }, + { + "epoch": 5.096103442250568, + "grad_norm": 0.3312874138355255, + "learning_rate": 0.0005391643356643356, + "loss": 3.4876, + "step": 17500 + }, + { + "epoch": 5.110664569864291, + "grad_norm": 0.3330562114715576, + "learning_rate": 0.0005389895104895105, + "loss": 3.4802, + "step": 17550 + }, + { + "epoch": 5.125225697478013, + "grad_norm": 0.32655513286590576, + "learning_rate": 0.0005388146853146853, + "loss": 3.4899, + "step": 17600 + }, + { + "epoch": 5.139786825091735, + "grad_norm": 0.34551799297332764, + "learning_rate": 0.0005386398601398601, + "loss": 3.493, + "step": 17650 + }, + { + "epoch": 5.154347952705457, + "grad_norm": 0.3142414093017578, + "learning_rate": 0.0005384650349650349, + "loss": 3.5019, + "step": 17700 + }, + { + "epoch": 5.16890908031918, + "grad_norm": 0.3235276937484741, + "learning_rate": 0.0005382902097902098, + "loss": 3.4889, + "step": 17750 + }, + { + "epoch": 5.183470207932903, + "grad_norm": 0.3249594569206238, + "learning_rate": 0.0005381153846153845, + "loss": 3.4947, + "step": 17800 + }, + { + "epoch": 5.1980313355466246, + "grad_norm": 0.32166171073913574, + "learning_rate": 0.0005379405594405594, + "loss": 3.5064, + "step": 17850 + }, + { + "epoch": 5.212592463160347, + "grad_norm": 0.3284703195095062, + "learning_rate": 0.0005377657342657342, + "loss": 3.5105, + "step": 17900 + }, + { + "epoch": 5.227153590774069, + "grad_norm": 0.32744383811950684, + "learning_rate": 0.000537590909090909, + "loss": 3.5143, + "step": 17950 + }, + { + "epoch": 5.241714718387792, + "grad_norm": 0.312739759683609, + "learning_rate": 0.0005374160839160838, + "loss": 3.5007, + "step": 18000 + }, + { + "epoch": 5.241714718387792, + "eval_accuracy": 0.36018232079402723, + "eval_loss": 3.6365652084350586, + "eval_runtime": 179.8304, + "eval_samples_per_second": 92.565, + "eval_steps_per_second": 5.789, + "step": 18000 + }, + { + "epoch": 5.256275846001515, + "grad_norm": 0.31837671995162964, + "learning_rate": 0.0005372412587412587, + "loss": 3.5128, + "step": 18050 + }, + { + "epoch": 5.2708369736152365, + "grad_norm": 0.33519458770751953, + "learning_rate": 0.0005370664335664335, + "loss": 3.5119, + "step": 18100 + }, + { + "epoch": 5.285398101228959, + "grad_norm": 0.34740373492240906, + "learning_rate": 0.0005368916083916083, + "loss": 3.5228, + "step": 18150 + }, + { + "epoch": 5.299959228842681, + "grad_norm": 0.34328994154930115, + "learning_rate": 0.0005367167832167832, + "loss": 3.5142, + "step": 18200 + }, + { + "epoch": 5.314520356456404, + "grad_norm": 0.3207642436027527, + "learning_rate": 0.000536541958041958, + "loss": 3.5114, + "step": 18250 + }, + { + "epoch": 5.329081484070127, + "grad_norm": 0.335101455450058, + "learning_rate": 0.0005363671328671328, + "loss": 3.5175, + "step": 18300 + }, + { + "epoch": 5.3436426116838485, + "grad_norm": 0.34362977743148804, + "learning_rate": 0.0005361923076923076, + "loss": 3.519, + "step": 18350 + }, + { + "epoch": 5.358203739297571, + "grad_norm": 0.3147866725921631, + "learning_rate": 0.0005360174825174825, + "loss": 3.5155, + "step": 18400 + }, + { + "epoch": 5.372764866911294, + "grad_norm": 0.33346375823020935, + "learning_rate": 0.0005358426573426573, + "loss": 3.5163, + "step": 18450 + }, + { + "epoch": 5.387325994525016, + "grad_norm": 0.3331373631954193, + "learning_rate": 0.0005356678321678321, + "loss": 3.5133, + "step": 18500 + }, + { + "epoch": 5.401887122138739, + "grad_norm": 0.3066289722919464, + "learning_rate": 0.0005354930069930069, + "loss": 3.5202, + "step": 18550 + }, + { + "epoch": 5.41644824975246, + "grad_norm": 0.32293954491615295, + "learning_rate": 0.0005353181818181817, + "loss": 3.5251, + "step": 18600 + }, + { + "epoch": 5.431009377366183, + "grad_norm": 0.33153200149536133, + "learning_rate": 0.0005351433566433565, + "loss": 3.5089, + "step": 18650 + }, + { + "epoch": 5.445570504979906, + "grad_norm": 0.32844340801239014, + "learning_rate": 0.0005349685314685314, + "loss": 3.5175, + "step": 18700 + }, + { + "epoch": 5.460131632593628, + "grad_norm": 0.33013710379600525, + "learning_rate": 0.0005347937062937062, + "loss": 3.5155, + "step": 18750 + }, + { + "epoch": 5.4746927602073505, + "grad_norm": 0.318752259016037, + "learning_rate": 0.000534618881118881, + "loss": 3.5226, + "step": 18800 + }, + { + "epoch": 5.489253887821073, + "grad_norm": 0.3632429242134094, + "learning_rate": 0.0005344440559440559, + "loss": 3.5256, + "step": 18850 + }, + { + "epoch": 5.503815015434795, + "grad_norm": 0.31200987100601196, + "learning_rate": 0.0005342692307692307, + "loss": 3.5277, + "step": 18900 + }, + { + "epoch": 5.518376143048518, + "grad_norm": 0.35066500306129456, + "learning_rate": 0.0005340944055944055, + "loss": 3.5224, + "step": 18950 + }, + { + "epoch": 5.53293727066224, + "grad_norm": 0.3067936301231384, + "learning_rate": 0.0005339195804195803, + "loss": 3.5156, + "step": 19000 + }, + { + "epoch": 5.53293727066224, + "eval_accuracy": 0.3610321808827682, + "eval_loss": 3.6285228729248047, + "eval_runtime": 180.0932, + "eval_samples_per_second": 92.43, + "eval_steps_per_second": 5.78, + "step": 19000 + }, + { + "epoch": 5.5474983982759625, + "grad_norm": 0.3141394853591919, + "learning_rate": 0.0005337447552447552, + "loss": 3.5173, + "step": 19050 + }, + { + "epoch": 5.562059525889685, + "grad_norm": 0.334416925907135, + "learning_rate": 0.00053356993006993, + "loss": 3.5189, + "step": 19100 + }, + { + "epoch": 5.576620653503407, + "grad_norm": 0.3050374686717987, + "learning_rate": 0.0005333951048951048, + "loss": 3.5142, + "step": 19150 + }, + { + "epoch": 5.59118178111713, + "grad_norm": 0.33711856603622437, + "learning_rate": 0.0005332202797202796, + "loss": 3.5282, + "step": 19200 + }, + { + "epoch": 5.605742908730852, + "grad_norm": 0.34378382563591003, + "learning_rate": 0.0005330454545454546, + "loss": 3.5195, + "step": 19250 + }, + { + "epoch": 5.620304036344574, + "grad_norm": 0.3297707736492157, + "learning_rate": 0.0005328706293706292, + "loss": 3.532, + "step": 19300 + }, + { + "epoch": 5.634865163958297, + "grad_norm": 0.33016687631607056, + "learning_rate": 0.0005326958041958042, + "loss": 3.5425, + "step": 19350 + }, + { + "epoch": 5.649426291572019, + "grad_norm": 0.34170061349868774, + "learning_rate": 0.000532520979020979, + "loss": 3.5282, + "step": 19400 + }, + { + "epoch": 5.663987419185742, + "grad_norm": 0.3264179825782776, + "learning_rate": 0.0005323461538461538, + "loss": 3.5302, + "step": 19450 + }, + { + "epoch": 5.6785485467994645, + "grad_norm": 0.3002929091453552, + "learning_rate": 0.0005321713286713287, + "loss": 3.5267, + "step": 19500 + }, + { + "epoch": 5.693109674413186, + "grad_norm": 0.35670411586761475, + "learning_rate": 0.0005319965034965035, + "loss": 3.5173, + "step": 19550 + }, + { + "epoch": 5.707670802026909, + "grad_norm": 0.3164016902446747, + "learning_rate": 0.0005318216783216783, + "loss": 3.5437, + "step": 19600 + }, + { + "epoch": 5.722231929640631, + "grad_norm": 0.3452078700065613, + "learning_rate": 0.0005316468531468531, + "loss": 3.5239, + "step": 19650 + }, + { + "epoch": 5.736793057254354, + "grad_norm": 0.3179798424243927, + "learning_rate": 0.000531472027972028, + "loss": 3.5254, + "step": 19700 + }, + { + "epoch": 5.7513541848680765, + "grad_norm": 0.32574138045310974, + "learning_rate": 0.0005312972027972028, + "loss": 3.5335, + "step": 19750 + }, + { + "epoch": 5.765915312481798, + "grad_norm": 0.32392826676368713, + "learning_rate": 0.0005311223776223776, + "loss": 3.5268, + "step": 19800 + }, + { + "epoch": 5.780476440095521, + "grad_norm": 0.34594979882240295, + "learning_rate": 0.0005309475524475524, + "loss": 3.534, + "step": 19850 + }, + { + "epoch": 5.795037567709244, + "grad_norm": 0.31376367807388306, + "learning_rate": 0.0005307727272727273, + "loss": 3.5306, + "step": 19900 + }, + { + "epoch": 5.809598695322966, + "grad_norm": 0.32450011372566223, + "learning_rate": 0.0005305979020979021, + "loss": 3.5337, + "step": 19950 + }, + { + "epoch": 5.824159822936688, + "grad_norm": 0.30886128544807434, + "learning_rate": 0.0005304230769230769, + "loss": 3.5239, + "step": 20000 + }, + { + "epoch": 5.824159822936688, + "eval_accuracy": 0.3622778742705534, + "eval_loss": 3.6140716075897217, + "eval_runtime": 180.1478, + "eval_samples_per_second": 92.402, + "eval_steps_per_second": 5.779, + "step": 20000 + }, + { + "epoch": 5.83872095055041, + "grad_norm": 0.32404589653015137, + "learning_rate": 0.0005302482517482517, + "loss": 3.5409, + "step": 20050 + }, + { + "epoch": 5.853282078164133, + "grad_norm": 0.30877238512039185, + "learning_rate": 0.0005300734265734265, + "loss": 3.5373, + "step": 20100 + }, + { + "epoch": 5.867843205777856, + "grad_norm": 0.31356489658355713, + "learning_rate": 0.0005298986013986013, + "loss": 3.5219, + "step": 20150 + }, + { + "epoch": 5.882404333391578, + "grad_norm": 0.30876606702804565, + "learning_rate": 0.0005297237762237762, + "loss": 3.529, + "step": 20200 + }, + { + "epoch": 5.8969654610053, + "grad_norm": 0.3364260494709015, + "learning_rate": 0.000529548951048951, + "loss": 3.5252, + "step": 20250 + }, + { + "epoch": 5.911526588619022, + "grad_norm": 0.3011105954647064, + "learning_rate": 0.0005293741258741258, + "loss": 3.5244, + "step": 20300 + }, + { + "epoch": 5.926087716232745, + "grad_norm": 0.31753775477409363, + "learning_rate": 0.0005291993006993007, + "loss": 3.5309, + "step": 20350 + }, + { + "epoch": 5.940648843846468, + "grad_norm": 0.3421807289123535, + "learning_rate": 0.0005290244755244755, + "loss": 3.537, + "step": 20400 + }, + { + "epoch": 5.95520997146019, + "grad_norm": 0.3219417631626129, + "learning_rate": 0.0005288496503496503, + "loss": 3.5311, + "step": 20450 + }, + { + "epoch": 5.969771099073912, + "grad_norm": 0.3096925616264343, + "learning_rate": 0.0005286748251748251, + "loss": 3.5334, + "step": 20500 + }, + { + "epoch": 5.984332226687634, + "grad_norm": 0.3308550715446472, + "learning_rate": 0.0005285, + "loss": 3.5199, + "step": 20550 + }, + { + "epoch": 5.998893354301357, + "grad_norm": 0.31948336958885193, + "learning_rate": 0.0005283251748251748, + "loss": 3.5393, + "step": 20600 + }, + { + "epoch": 6.013396237404625, + "grad_norm": 0.31365492939949036, + "learning_rate": 0.0005281503496503496, + "loss": 3.432, + "step": 20650 + }, + { + "epoch": 6.027957365018347, + "grad_norm": 0.32687506079673767, + "learning_rate": 0.0005279755244755244, + "loss": 3.4276, + "step": 20700 + }, + { + "epoch": 6.04251849263207, + "grad_norm": 0.32380980253219604, + "learning_rate": 0.0005278006993006993, + "loss": 3.4312, + "step": 20750 + }, + { + "epoch": 6.0570796202457915, + "grad_norm": 0.3151368498802185, + "learning_rate": 0.000527625874125874, + "loss": 3.4158, + "step": 20800 + }, + { + "epoch": 6.071640747859514, + "grad_norm": 0.315514475107193, + "learning_rate": 0.0005274510489510489, + "loss": 3.4395, + "step": 20850 + }, + { + "epoch": 6.086201875473237, + "grad_norm": 0.32791003584861755, + "learning_rate": 0.0005272762237762238, + "loss": 3.4373, + "step": 20900 + }, + { + "epoch": 6.100763003086959, + "grad_norm": 0.3153580129146576, + "learning_rate": 0.0005271013986013985, + "loss": 3.4479, + "step": 20950 + }, + { + "epoch": 6.115324130700682, + "grad_norm": 0.34948551654815674, + "learning_rate": 0.0005269265734265734, + "loss": 3.4463, + "step": 21000 + }, + { + "epoch": 6.115324130700682, + "eval_accuracy": 0.3622545968742924, + "eval_loss": 3.6173741817474365, + "eval_runtime": 179.8785, + "eval_samples_per_second": 92.54, + "eval_steps_per_second": 5.787, + "step": 21000 + }, + { + "epoch": 6.1298852583144035, + "grad_norm": 0.3471393883228302, + "learning_rate": 0.0005267517482517482, + "loss": 3.4418, + "step": 21050 + }, + { + "epoch": 6.144446385928126, + "grad_norm": 0.32299190759658813, + "learning_rate": 0.000526576923076923, + "loss": 3.462, + "step": 21100 + }, + { + "epoch": 6.159007513541849, + "grad_norm": 0.3276447355747223, + "learning_rate": 0.0005264020979020978, + "loss": 3.4441, + "step": 21150 + }, + { + "epoch": 6.173568641155571, + "grad_norm": 0.3275761604309082, + "learning_rate": 0.0005262272727272727, + "loss": 3.4414, + "step": 21200 + }, + { + "epoch": 6.1881297687692935, + "grad_norm": 0.32831233739852905, + "learning_rate": 0.0005260524475524475, + "loss": 3.4573, + "step": 21250 + }, + { + "epoch": 6.202690896383016, + "grad_norm": 0.32581037282943726, + "learning_rate": 0.0005258776223776223, + "loss": 3.4413, + "step": 21300 + }, + { + "epoch": 6.217252023996738, + "grad_norm": 0.3218664228916168, + "learning_rate": 0.0005257027972027971, + "loss": 3.4494, + "step": 21350 + }, + { + "epoch": 6.231813151610461, + "grad_norm": 0.34039339423179626, + "learning_rate": 0.000525527972027972, + "loss": 3.458, + "step": 21400 + }, + { + "epoch": 6.246374279224183, + "grad_norm": 0.3327193260192871, + "learning_rate": 0.0005253531468531468, + "loss": 3.4557, + "step": 21450 + }, + { + "epoch": 6.2609354068379055, + "grad_norm": 0.3233095705509186, + "learning_rate": 0.0005251783216783216, + "loss": 3.4511, + "step": 21500 + }, + { + "epoch": 6.275496534451628, + "grad_norm": 0.3496866822242737, + "learning_rate": 0.0005250034965034965, + "loss": 3.4622, + "step": 21550 + }, + { + "epoch": 6.29005766206535, + "grad_norm": 0.3645714521408081, + "learning_rate": 0.0005248286713286712, + "loss": 3.4517, + "step": 21600 + }, + { + "epoch": 6.304618789679073, + "grad_norm": 0.3256557881832123, + "learning_rate": 0.0005246538461538461, + "loss": 3.4739, + "step": 21650 + }, + { + "epoch": 6.319179917292795, + "grad_norm": 0.3235686719417572, + "learning_rate": 0.0005244790209790209, + "loss": 3.4725, + "step": 21700 + }, + { + "epoch": 6.3337410449065175, + "grad_norm": 0.3351970911026001, + "learning_rate": 0.0005243041958041957, + "loss": 3.4857, + "step": 21750 + }, + { + "epoch": 6.34830217252024, + "grad_norm": 0.3423496186733246, + "learning_rate": 0.0005241293706293705, + "loss": 3.4746, + "step": 21800 + }, + { + "epoch": 6.362863300133962, + "grad_norm": 0.3310966491699219, + "learning_rate": 0.0005239545454545454, + "loss": 3.4763, + "step": 21850 + }, + { + "epoch": 6.377424427747685, + "grad_norm": 0.31002819538116455, + "learning_rate": 0.0005237797202797202, + "loss": 3.4714, + "step": 21900 + }, + { + "epoch": 6.391985555361408, + "grad_norm": 0.3289186358451843, + "learning_rate": 0.000523604895104895, + "loss": 3.4637, + "step": 21950 + }, + { + "epoch": 6.406546682975129, + "grad_norm": 0.3141127824783325, + "learning_rate": 0.0005234300699300698, + "loss": 3.4779, + "step": 22000 + }, + { + "epoch": 6.406546682975129, + "eval_accuracy": 0.36285122710673956, + "eval_loss": 3.610785722732544, + "eval_runtime": 179.7092, + "eval_samples_per_second": 92.627, + "eval_steps_per_second": 5.793, + "step": 22000 + }, + { + "epoch": 6.421107810588852, + "grad_norm": 0.3150128722190857, + "learning_rate": 0.0005232552447552447, + "loss": 3.4842, + "step": 22050 + }, + { + "epoch": 6.435668938202574, + "grad_norm": 0.3259349465370178, + "learning_rate": 0.0005230804195804195, + "loss": 3.4848, + "step": 22100 + }, + { + "epoch": 6.450230065816297, + "grad_norm": 0.32301968336105347, + "learning_rate": 0.0005229055944055943, + "loss": 3.4818, + "step": 22150 + }, + { + "epoch": 6.4647911934300195, + "grad_norm": 0.3123028874397278, + "learning_rate": 0.0005227307692307691, + "loss": 3.4914, + "step": 22200 + }, + { + "epoch": 6.479352321043741, + "grad_norm": 0.3286699652671814, + "learning_rate": 0.0005225559440559441, + "loss": 3.4875, + "step": 22250 + }, + { + "epoch": 6.493913448657464, + "grad_norm": 0.3313329517841339, + "learning_rate": 0.0005223811188811189, + "loss": 3.4791, + "step": 22300 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 0.31018057465553284, + "learning_rate": 0.0005222062937062937, + "loss": 3.4807, + "step": 22350 + }, + { + "epoch": 6.523035703884909, + "grad_norm": 0.32716143131256104, + "learning_rate": 0.0005220314685314686, + "loss": 3.4846, + "step": 22400 + }, + { + "epoch": 6.5375968314986315, + "grad_norm": 0.3213047981262207, + "learning_rate": 0.0005218566433566433, + "loss": 3.4959, + "step": 22450 + }, + { + "epoch": 6.552157959112353, + "grad_norm": 0.3478303849697113, + "learning_rate": 0.0005216818181818182, + "loss": 3.4794, + "step": 22500 + }, + { + "epoch": 6.566719086726076, + "grad_norm": 0.334625780582428, + "learning_rate": 0.000521506993006993, + "loss": 3.4993, + "step": 22550 + }, + { + "epoch": 6.581280214339799, + "grad_norm": 0.3324287235736847, + "learning_rate": 0.0005213321678321678, + "loss": 3.4947, + "step": 22600 + }, + { + "epoch": 6.595841341953521, + "grad_norm": 0.3208302855491638, + "learning_rate": 0.0005211573426573426, + "loss": 3.4816, + "step": 22650 + }, + { + "epoch": 6.610402469567243, + "grad_norm": 0.3206283152103424, + "learning_rate": 0.0005209825174825175, + "loss": 3.4811, + "step": 22700 + }, + { + "epoch": 6.624963597180965, + "grad_norm": 0.3405255377292633, + "learning_rate": 0.0005208076923076923, + "loss": 3.4839, + "step": 22750 + }, + { + "epoch": 6.639524724794688, + "grad_norm": 0.33559542894363403, + "learning_rate": 0.0005206328671328671, + "loss": 3.4962, + "step": 22800 + }, + { + "epoch": 6.654085852408411, + "grad_norm": 0.3277864456176758, + "learning_rate": 0.0005204580419580419, + "loss": 3.4831, + "step": 22850 + }, + { + "epoch": 6.668646980022133, + "grad_norm": 0.3352718949317932, + "learning_rate": 0.0005202832167832168, + "loss": 3.4782, + "step": 22900 + }, + { + "epoch": 6.683208107635855, + "grad_norm": 0.31568098068237305, + "learning_rate": 0.0005201083916083916, + "loss": 3.4802, + "step": 22950 + }, + { + "epoch": 6.697769235249577, + "grad_norm": 0.3398934602737427, + "learning_rate": 0.0005199335664335664, + "loss": 3.4888, + "step": 23000 + }, + { + "epoch": 6.697769235249577, + "eval_accuracy": 0.3639860589557666, + "eval_loss": 3.598484992980957, + "eval_runtime": 179.7416, + "eval_samples_per_second": 92.611, + "eval_steps_per_second": 5.792, + "step": 23000 + }, + { + "epoch": 6.7123303628633, + "grad_norm": 0.30721819400787354, + "learning_rate": 0.0005197587412587413, + "loss": 3.4866, + "step": 23050 + }, + { + "epoch": 6.726891490477023, + "grad_norm": 0.3224666714668274, + "learning_rate": 0.0005195839160839161, + "loss": 3.4968, + "step": 23100 + }, + { + "epoch": 6.741452618090745, + "grad_norm": 0.32522931694984436, + "learning_rate": 0.0005194090909090909, + "loss": 3.4878, + "step": 23150 + }, + { + "epoch": 6.756013745704467, + "grad_norm": 0.31341007351875305, + "learning_rate": 0.0005192342657342657, + "loss": 3.4833, + "step": 23200 + }, + { + "epoch": 6.77057487331819, + "grad_norm": 0.3186572790145874, + "learning_rate": 0.0005190594405594405, + "loss": 3.4975, + "step": 23250 + }, + { + "epoch": 6.785136000931912, + "grad_norm": 0.33995872735977173, + "learning_rate": 0.0005188846153846153, + "loss": 3.4878, + "step": 23300 + }, + { + "epoch": 6.799697128545635, + "grad_norm": 0.3231462836265564, + "learning_rate": 0.0005187097902097902, + "loss": 3.4897, + "step": 23350 + }, + { + "epoch": 6.814258256159357, + "grad_norm": 0.31064069271087646, + "learning_rate": 0.000518534965034965, + "loss": 3.4984, + "step": 23400 + }, + { + "epoch": 6.828819383773079, + "grad_norm": 0.31749048829078674, + "learning_rate": 0.0005183601398601398, + "loss": 3.4867, + "step": 23450 + }, + { + "epoch": 6.843380511386802, + "grad_norm": 0.31053680181503296, + "learning_rate": 0.0005181853146853146, + "loss": 3.4937, + "step": 23500 + }, + { + "epoch": 6.857941639000524, + "grad_norm": 0.3226015269756317, + "learning_rate": 0.0005180104895104895, + "loss": 3.4918, + "step": 23550 + }, + { + "epoch": 6.872502766614247, + "grad_norm": 0.3255876302719116, + "learning_rate": 0.0005178356643356643, + "loss": 3.4998, + "step": 23600 + }, + { + "epoch": 6.887063894227969, + "grad_norm": 0.32611915469169617, + "learning_rate": 0.0005176608391608391, + "loss": 3.4874, + "step": 23650 + }, + { + "epoch": 6.901625021841691, + "grad_norm": 0.3349880874156952, + "learning_rate": 0.000517486013986014, + "loss": 3.4898, + "step": 23700 + }, + { + "epoch": 6.916186149455414, + "grad_norm": 0.32357269525527954, + "learning_rate": 0.0005173111888111888, + "loss": 3.4919, + "step": 23750 + }, + { + "epoch": 6.930747277069136, + "grad_norm": 0.30893370509147644, + "learning_rate": 0.0005171363636363636, + "loss": 3.488, + "step": 23800 + }, + { + "epoch": 6.945308404682859, + "grad_norm": 0.34728315472602844, + "learning_rate": 0.0005169615384615384, + "loss": 3.4851, + "step": 23850 + }, + { + "epoch": 6.959869532296581, + "grad_norm": 0.34141796827316284, + "learning_rate": 0.0005167867132867133, + "loss": 3.4924, + "step": 23900 + }, + { + "epoch": 6.974430659910303, + "grad_norm": 0.33731377124786377, + "learning_rate": 0.000516611888111888, + "loss": 3.4936, + "step": 23950 + }, + { + "epoch": 6.988991787524026, + "grad_norm": 0.330599308013916, + "learning_rate": 0.0005164370629370629, + "loss": 3.4999, + "step": 24000 + }, + { + "epoch": 6.988991787524026, + "eval_accuracy": 0.36480441226573007, + "eval_loss": 3.590554714202881, + "eval_runtime": 179.6543, + "eval_samples_per_second": 92.656, + "eval_steps_per_second": 5.794, + "step": 24000 + }, + { + "epoch": 7.003494670627293, + "grad_norm": 0.3449358642101288, + "learning_rate": 0.0005162622377622377, + "loss": 3.471, + "step": 24050 + }, + { + "epoch": 7.018055798241016, + "grad_norm": 0.35293149948120117, + "learning_rate": 0.0005160874125874125, + "loss": 3.3888, + "step": 24100 + }, + { + "epoch": 7.032616925854738, + "grad_norm": 0.3265637755393982, + "learning_rate": 0.0005159125874125873, + "loss": 3.3925, + "step": 24150 + }, + { + "epoch": 7.0471780534684605, + "grad_norm": 0.32121822237968445, + "learning_rate": 0.0005157377622377622, + "loss": 3.3897, + "step": 24200 + }, + { + "epoch": 7.061739181082183, + "grad_norm": 0.3485367000102997, + "learning_rate": 0.000515562937062937, + "loss": 3.3965, + "step": 24250 + }, + { + "epoch": 7.076300308695905, + "grad_norm": 0.32369834184646606, + "learning_rate": 0.0005153881118881118, + "loss": 3.4083, + "step": 24300 + }, + { + "epoch": 7.090861436309628, + "grad_norm": 0.3367840349674225, + "learning_rate": 0.0005152132867132867, + "loss": 3.4072, + "step": 24350 + }, + { + "epoch": 7.105422563923351, + "grad_norm": 0.3350302278995514, + "learning_rate": 0.0005150384615384615, + "loss": 3.403, + "step": 24400 + }, + { + "epoch": 7.1199836915370724, + "grad_norm": 0.3556578755378723, + "learning_rate": 0.0005148636363636363, + "loss": 3.3993, + "step": 24450 + }, + { + "epoch": 7.134544819150795, + "grad_norm": 0.33493995666503906, + "learning_rate": 0.0005146888111888111, + "loss": 3.4033, + "step": 24500 + }, + { + "epoch": 7.149105946764517, + "grad_norm": 0.3266991674900055, + "learning_rate": 0.000514513986013986, + "loss": 3.4133, + "step": 24550 + }, + { + "epoch": 7.16366707437824, + "grad_norm": 0.33190712332725525, + "learning_rate": 0.0005143391608391608, + "loss": 3.4191, + "step": 24600 + }, + { + "epoch": 7.1782282019919625, + "grad_norm": 0.33754125237464905, + "learning_rate": 0.0005141643356643356, + "loss": 3.4116, + "step": 24650 + }, + { + "epoch": 7.192789329605684, + "grad_norm": 0.3015083074569702, + "learning_rate": 0.0005139895104895104, + "loss": 3.4225, + "step": 24700 + }, + { + "epoch": 7.207350457219407, + "grad_norm": 0.3270661532878876, + "learning_rate": 0.0005138146853146852, + "loss": 3.4205, + "step": 24750 + }, + { + "epoch": 7.22191158483313, + "grad_norm": 0.3491705656051636, + "learning_rate": 0.00051363986013986, + "loss": 3.4145, + "step": 24800 + }, + { + "epoch": 7.236472712446852, + "grad_norm": 0.3363984525203705, + "learning_rate": 0.0005134650349650349, + "loss": 3.4213, + "step": 24850 + }, + { + "epoch": 7.2510338400605745, + "grad_norm": 0.33105769753456116, + "learning_rate": 0.0005132902097902097, + "loss": 3.4229, + "step": 24900 + }, + { + "epoch": 7.265594967674296, + "grad_norm": 0.3505908250808716, + "learning_rate": 0.0005131153846153845, + "loss": 3.4278, + "step": 24950 + }, + { + "epoch": 7.280156095288019, + "grad_norm": 0.3380582332611084, + "learning_rate": 0.0005129405594405594, + "loss": 3.428, + "step": 25000 + }, + { + "epoch": 7.280156095288019, + "eval_accuracy": 0.36480793914395143, + "eval_loss": 3.5974769592285156, + "eval_runtime": 179.8567, + "eval_samples_per_second": 92.551, + "eval_steps_per_second": 5.788, + "step": 25000 + }, + { + "epoch": 7.294717222901742, + "grad_norm": 0.31806254386901855, + "learning_rate": 0.0005127657342657342, + "loss": 3.4173, + "step": 25050 + }, + { + "epoch": 7.309278350515464, + "grad_norm": 0.3278155028820038, + "learning_rate": 0.000512590909090909, + "loss": 3.4293, + "step": 25100 + }, + { + "epoch": 7.3238394781291865, + "grad_norm": 0.31251752376556396, + "learning_rate": 0.0005124160839160838, + "loss": 3.4386, + "step": 25150 + }, + { + "epoch": 7.338400605742908, + "grad_norm": 0.3372874855995178, + "learning_rate": 0.0005122412587412588, + "loss": 3.4216, + "step": 25200 + }, + { + "epoch": 7.352961733356631, + "grad_norm": 0.32962003350257874, + "learning_rate": 0.0005120664335664336, + "loss": 3.4316, + "step": 25250 + }, + { + "epoch": 7.367522860970354, + "grad_norm": 0.3354533612728119, + "learning_rate": 0.0005118916083916084, + "loss": 3.4331, + "step": 25300 + }, + { + "epoch": 7.382083988584076, + "grad_norm": 0.32760855555534363, + "learning_rate": 0.0005117167832167832, + "loss": 3.4298, + "step": 25350 + }, + { + "epoch": 7.396645116197798, + "grad_norm": 0.323398232460022, + "learning_rate": 0.0005115419580419581, + "loss": 3.4329, + "step": 25400 + }, + { + "epoch": 7.411206243811521, + "grad_norm": 0.3129633665084839, + "learning_rate": 0.0005113671328671328, + "loss": 3.4451, + "step": 25450 + }, + { + "epoch": 7.425767371425243, + "grad_norm": 0.308672159910202, + "learning_rate": 0.0005111923076923077, + "loss": 3.4402, + "step": 25500 + }, + { + "epoch": 7.440328499038966, + "grad_norm": 0.3408229649066925, + "learning_rate": 0.0005110174825174825, + "loss": 3.4397, + "step": 25550 + }, + { + "epoch": 7.454889626652688, + "grad_norm": 0.320758581161499, + "learning_rate": 0.0005108426573426573, + "loss": 3.4447, + "step": 25600 + }, + { + "epoch": 7.46945075426641, + "grad_norm": 0.33821046352386475, + "learning_rate": 0.0005106678321678321, + "loss": 3.4399, + "step": 25650 + }, + { + "epoch": 7.484011881880133, + "grad_norm": 0.32798120379447937, + "learning_rate": 0.000510493006993007, + "loss": 3.4441, + "step": 25700 + }, + { + "epoch": 7.498573009493855, + "grad_norm": 0.36191534996032715, + "learning_rate": 0.0005103181818181818, + "loss": 3.4465, + "step": 25750 + }, + { + "epoch": 7.513134137107578, + "grad_norm": 0.333870530128479, + "learning_rate": 0.0005101433566433566, + "loss": 3.4483, + "step": 25800 + }, + { + "epoch": 7.5276952647213005, + "grad_norm": 0.3584294319152832, + "learning_rate": 0.0005099685314685315, + "loss": 3.4472, + "step": 25850 + }, + { + "epoch": 7.542256392335022, + "grad_norm": 0.3232259750366211, + "learning_rate": 0.0005097937062937063, + "loss": 3.4551, + "step": 25900 + }, + { + "epoch": 7.556817519948745, + "grad_norm": 0.34521010518074036, + "learning_rate": 0.0005096188811188811, + "loss": 3.4492, + "step": 25950 + }, + { + "epoch": 7.571378647562467, + "grad_norm": 0.3537822365760803, + "learning_rate": 0.0005094440559440559, + "loss": 3.4552, + "step": 26000 + }, + { + "epoch": 7.571378647562467, + "eval_accuracy": 0.36535131351525596, + "eval_loss": 3.590632200241089, + "eval_runtime": 179.7252, + "eval_samples_per_second": 92.619, + "eval_steps_per_second": 5.792, + "step": 26000 + }, + { + "epoch": 7.58593977517619, + "grad_norm": 0.3477293848991394, + "learning_rate": 0.0005092692307692308, + "loss": 3.4485, + "step": 26050 + }, + { + "epoch": 7.600500902789912, + "grad_norm": 0.3164335787296295, + "learning_rate": 0.0005090944055944056, + "loss": 3.4537, + "step": 26100 + }, + { + "epoch": 7.615062030403634, + "grad_norm": 0.31365999579429626, + "learning_rate": 0.0005089195804195804, + "loss": 3.4563, + "step": 26150 + }, + { + "epoch": 7.629623158017357, + "grad_norm": 0.33597031235694885, + "learning_rate": 0.0005087447552447552, + "loss": 3.4469, + "step": 26200 + }, + { + "epoch": 7.644184285631079, + "grad_norm": 0.33030572533607483, + "learning_rate": 0.00050856993006993, + "loss": 3.4471, + "step": 26250 + }, + { + "epoch": 7.658745413244802, + "grad_norm": 0.34268873929977417, + "learning_rate": 0.0005083951048951048, + "loss": 3.4566, + "step": 26300 + }, + { + "epoch": 7.673306540858524, + "grad_norm": 0.34644824266433716, + "learning_rate": 0.0005082202797202797, + "loss": 3.4572, + "step": 26350 + }, + { + "epoch": 7.687867668472246, + "grad_norm": 0.3286401331424713, + "learning_rate": 0.0005080454545454545, + "loss": 3.4614, + "step": 26400 + }, + { + "epoch": 7.702428796085969, + "grad_norm": 0.3406911790370941, + "learning_rate": 0.0005078706293706293, + "loss": 3.4613, + "step": 26450 + }, + { + "epoch": 7.716989923699691, + "grad_norm": 0.32939502596855164, + "learning_rate": 0.0005076958041958042, + "loss": 3.4677, + "step": 26500 + }, + { + "epoch": 7.731551051313414, + "grad_norm": 0.33044230937957764, + "learning_rate": 0.000507520979020979, + "loss": 3.4601, + "step": 26550 + }, + { + "epoch": 7.746112178927136, + "grad_norm": 0.315995454788208, + "learning_rate": 0.0005073461538461538, + "loss": 3.4459, + "step": 26600 + }, + { + "epoch": 7.760673306540858, + "grad_norm": 0.35745933651924133, + "learning_rate": 0.0005071713286713286, + "loss": 3.4574, + "step": 26650 + }, + { + "epoch": 7.775234434154581, + "grad_norm": 0.3426244258880615, + "learning_rate": 0.0005069965034965035, + "loss": 3.4537, + "step": 26700 + }, + { + "epoch": 7.789795561768304, + "grad_norm": 0.3141034245491028, + "learning_rate": 0.0005068216783216783, + "loss": 3.4541, + "step": 26750 + }, + { + "epoch": 7.8043566893820255, + "grad_norm": 0.34187954664230347, + "learning_rate": 0.0005066468531468531, + "loss": 3.4703, + "step": 26800 + }, + { + "epoch": 7.818917816995748, + "grad_norm": 0.32608917355537415, + "learning_rate": 0.0005064720279720279, + "loss": 3.4433, + "step": 26850 + }, + { + "epoch": 7.833478944609471, + "grad_norm": 0.30253276228904724, + "learning_rate": 0.0005062972027972028, + "loss": 3.4582, + "step": 26900 + }, + { + "epoch": 7.848040072223193, + "grad_norm": 0.3292168378829956, + "learning_rate": 0.0005061223776223775, + "loss": 3.4609, + "step": 26950 + }, + { + "epoch": 7.862601199836916, + "grad_norm": 0.3352425992488861, + "learning_rate": 0.0005059475524475524, + "loss": 3.4646, + "step": 27000 + }, + { + "epoch": 7.862601199836916, + "eval_accuracy": 0.3661527378097569, + "eval_loss": 3.5805842876434326, + "eval_runtime": 179.8829, + "eval_samples_per_second": 92.538, + "eval_steps_per_second": 5.787, + "step": 27000 + }, + { + "epoch": 7.8771623274506375, + "grad_norm": 0.33013811707496643, + "learning_rate": 0.0005057727272727272, + "loss": 3.4593, + "step": 27050 + }, + { + "epoch": 7.89172345506436, + "grad_norm": 0.3326588273048401, + "learning_rate": 0.000505597902097902, + "loss": 3.4569, + "step": 27100 + }, + { + "epoch": 7.906284582678083, + "grad_norm": 0.3302481472492218, + "learning_rate": 0.0005054230769230769, + "loss": 3.4571, + "step": 27150 + }, + { + "epoch": 7.920845710291805, + "grad_norm": 0.3329846262931824, + "learning_rate": 0.0005052482517482517, + "loss": 3.471, + "step": 27200 + }, + { + "epoch": 7.935406837905528, + "grad_norm": 0.3458568751811981, + "learning_rate": 0.0005050734265734265, + "loss": 3.4765, + "step": 27250 + }, + { + "epoch": 7.9499679655192494, + "grad_norm": 0.3226156532764435, + "learning_rate": 0.0005048986013986013, + "loss": 3.4703, + "step": 27300 + }, + { + "epoch": 7.964529093132972, + "grad_norm": 0.34230631589889526, + "learning_rate": 0.0005047237762237762, + "loss": 3.4688, + "step": 27350 + }, + { + "epoch": 7.979090220746695, + "grad_norm": 0.31827977299690247, + "learning_rate": 0.000504548951048951, + "loss": 3.4583, + "step": 27400 + }, + { + "epoch": 7.993651348360417, + "grad_norm": 0.32115548849105835, + "learning_rate": 0.0005043741258741258, + "loss": 3.4671, + "step": 27450 + }, + { + "epoch": 8.008154231463685, + "grad_norm": 0.3317052721977234, + "learning_rate": 0.0005041993006993006, + "loss": 3.4071, + "step": 27500 + }, + { + "epoch": 8.022715359077408, + "grad_norm": 0.3432307541370392, + "learning_rate": 0.0005040244755244755, + "loss": 3.3524, + "step": 27550 + }, + { + "epoch": 8.037276486691129, + "grad_norm": 0.3754727244377136, + "learning_rate": 0.0005038496503496503, + "loss": 3.3459, + "step": 27600 + }, + { + "epoch": 8.051837614304851, + "grad_norm": 0.35126733779907227, + "learning_rate": 0.0005036748251748251, + "loss": 3.3583, + "step": 27650 + }, + { + "epoch": 8.066398741918574, + "grad_norm": 0.3542656898498535, + "learning_rate": 0.0005034999999999999, + "loss": 3.3705, + "step": 27700 + }, + { + "epoch": 8.080959869532297, + "grad_norm": 0.34104204177856445, + "learning_rate": 0.0005033251748251747, + "loss": 3.3645, + "step": 27750 + }, + { + "epoch": 8.09552099714602, + "grad_norm": 0.34891462326049805, + "learning_rate": 0.0005031503496503496, + "loss": 3.3766, + "step": 27800 + }, + { + "epoch": 8.11008212475974, + "grad_norm": 0.3348483145236969, + "learning_rate": 0.0005029755244755244, + "loss": 3.3758, + "step": 27850 + }, + { + "epoch": 8.124643252373463, + "grad_norm": 0.35943523049354553, + "learning_rate": 0.0005028006993006992, + "loss": 3.3722, + "step": 27900 + }, + { + "epoch": 8.139204379987186, + "grad_norm": 0.360538125038147, + "learning_rate": 0.000502625874125874, + "loss": 3.3764, + "step": 27950 + }, + { + "epoch": 8.153765507600909, + "grad_norm": 0.34357813000679016, + "learning_rate": 0.000502451048951049, + "loss": 3.3921, + "step": 28000 + }, + { + "epoch": 8.153765507600909, + "eval_accuracy": 0.3661058303294128, + "eval_loss": 3.58715558052063, + "eval_runtime": 180.3137, + "eval_samples_per_second": 92.317, + "eval_steps_per_second": 5.773, + "step": 28000 + }, + { + "epoch": 8.168326635214632, + "grad_norm": 0.3614567220211029, + "learning_rate": 0.0005022762237762237, + "loss": 3.3796, + "step": 28050 + }, + { + "epoch": 8.182887762828354, + "grad_norm": 0.33387571573257446, + "learning_rate": 0.0005021013986013985, + "loss": 3.383, + "step": 28100 + }, + { + "epoch": 8.197448890442075, + "grad_norm": 0.3599357008934021, + "learning_rate": 0.0005019265734265733, + "loss": 3.3902, + "step": 28150 + }, + { + "epoch": 8.212010018055798, + "grad_norm": 0.3254016041755676, + "learning_rate": 0.0005017517482517483, + "loss": 3.3856, + "step": 28200 + }, + { + "epoch": 8.22657114566952, + "grad_norm": 0.3269076347351074, + "learning_rate": 0.0005015769230769231, + "loss": 3.3963, + "step": 28250 + }, + { + "epoch": 8.241132273283243, + "grad_norm": 0.3196601867675781, + "learning_rate": 0.0005014020979020979, + "loss": 3.3973, + "step": 28300 + }, + { + "epoch": 8.255693400896966, + "grad_norm": 0.3544836640357971, + "learning_rate": 0.0005012272727272727, + "loss": 3.3985, + "step": 28350 + }, + { + "epoch": 8.270254528510687, + "grad_norm": 0.33133646845817566, + "learning_rate": 0.0005010524475524476, + "loss": 3.3971, + "step": 28400 + }, + { + "epoch": 8.28481565612441, + "grad_norm": 0.365125834941864, + "learning_rate": 0.0005008776223776223, + "loss": 3.3979, + "step": 28450 + }, + { + "epoch": 8.299376783738133, + "grad_norm": 0.3482271730899811, + "learning_rate": 0.0005007027972027972, + "loss": 3.4087, + "step": 28500 + }, + { + "epoch": 8.313937911351855, + "grad_norm": 0.3457016050815582, + "learning_rate": 0.000500527972027972, + "loss": 3.4072, + "step": 28550 + }, + { + "epoch": 8.328499038965578, + "grad_norm": 0.3350307047367096, + "learning_rate": 0.0005003531468531468, + "loss": 3.3984, + "step": 28600 + }, + { + "epoch": 8.3430601665793, + "grad_norm": 0.33122938871383667, + "learning_rate": 0.0005001783216783217, + "loss": 3.4036, + "step": 28650 + }, + { + "epoch": 8.357621294193022, + "grad_norm": 0.3646140992641449, + "learning_rate": 0.0005000034965034965, + "loss": 3.4019, + "step": 28700 + }, + { + "epoch": 8.372182421806745, + "grad_norm": 0.339650422334671, + "learning_rate": 0.0004998286713286713, + "loss": 3.3992, + "step": 28750 + }, + { + "epoch": 8.386743549420467, + "grad_norm": 0.31742623448371887, + "learning_rate": 0.0004996538461538461, + "loss": 3.404, + "step": 28800 + }, + { + "epoch": 8.40130467703419, + "grad_norm": 0.3145395815372467, + "learning_rate": 0.000499479020979021, + "loss": 3.3995, + "step": 28850 + }, + { + "epoch": 8.415865804647911, + "grad_norm": 0.34881776571273804, + "learning_rate": 0.0004993041958041958, + "loss": 3.4046, + "step": 28900 + }, + { + "epoch": 8.430426932261634, + "grad_norm": 0.3403722941875458, + "learning_rate": 0.0004991293706293706, + "loss": 3.4167, + "step": 28950 + }, + { + "epoch": 8.444988059875357, + "grad_norm": 0.3250523507595062, + "learning_rate": 0.0004989545454545454, + "loss": 3.4086, + "step": 29000 + }, + { + "epoch": 8.444988059875357, + "eval_accuracy": 0.3662244510002579, + "eval_loss": 3.5857577323913574, + "eval_runtime": 179.7516, + "eval_samples_per_second": 92.606, + "eval_steps_per_second": 5.791, + "step": 29000 + }, + { + "epoch": 8.45954918748908, + "grad_norm": 0.3275490403175354, + "learning_rate": 0.0004987797202797203, + "loss": 3.4166, + "step": 29050 + }, + { + "epoch": 8.474110315102802, + "grad_norm": 0.31486833095550537, + "learning_rate": 0.0004986048951048951, + "loss": 3.4041, + "step": 29100 + }, + { + "epoch": 8.488671442716523, + "grad_norm": 0.3729318082332611, + "learning_rate": 0.0004984300699300699, + "loss": 3.4167, + "step": 29150 + }, + { + "epoch": 8.503232570330246, + "grad_norm": 0.3305770456790924, + "learning_rate": 0.0004982552447552448, + "loss": 3.4228, + "step": 29200 + }, + { + "epoch": 8.517793697943969, + "grad_norm": 0.3442740738391876, + "learning_rate": 0.0004980804195804195, + "loss": 3.406, + "step": 29250 + }, + { + "epoch": 8.532354825557691, + "grad_norm": 0.32196056842803955, + "learning_rate": 0.0004979055944055944, + "loss": 3.4296, + "step": 29300 + }, + { + "epoch": 8.546915953171414, + "grad_norm": 0.3387078642845154, + "learning_rate": 0.0004977307692307692, + "loss": 3.4227, + "step": 29350 + }, + { + "epoch": 8.561477080785137, + "grad_norm": 0.32302534580230713, + "learning_rate": 0.000497555944055944, + "loss": 3.414, + "step": 29400 + }, + { + "epoch": 8.576038208398858, + "grad_norm": 0.3491160571575165, + "learning_rate": 0.0004973811188811188, + "loss": 3.4214, + "step": 29450 + }, + { + "epoch": 8.59059933601258, + "grad_norm": 0.32889190316200256, + "learning_rate": 0.0004972062937062937, + "loss": 3.4281, + "step": 29500 + }, + { + "epoch": 8.605160463626303, + "grad_norm": 0.32402417063713074, + "learning_rate": 0.0004970314685314685, + "loss": 3.4171, + "step": 29550 + }, + { + "epoch": 8.619721591240026, + "grad_norm": 0.3430418074131012, + "learning_rate": 0.0004968566433566433, + "loss": 3.4293, + "step": 29600 + }, + { + "epoch": 8.634282718853749, + "grad_norm": 0.34214910864830017, + "learning_rate": 0.0004966818181818181, + "loss": 3.4223, + "step": 29650 + }, + { + "epoch": 8.64884384646747, + "grad_norm": 0.3425740897655487, + "learning_rate": 0.000496506993006993, + "loss": 3.4194, + "step": 29700 + }, + { + "epoch": 8.663404974081192, + "grad_norm": 0.34497156739234924, + "learning_rate": 0.0004963321678321678, + "loss": 3.4311, + "step": 29750 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 0.35663503408432007, + "learning_rate": 0.0004961573426573426, + "loss": 3.431, + "step": 29800 + }, + { + "epoch": 8.692527229308638, + "grad_norm": 0.34114986658096313, + "learning_rate": 0.0004959825174825175, + "loss": 3.4287, + "step": 29850 + }, + { + "epoch": 8.70708835692236, + "grad_norm": 0.3398053050041199, + "learning_rate": 0.0004958076923076923, + "loss": 3.4288, + "step": 29900 + }, + { + "epoch": 8.721649484536082, + "grad_norm": 0.34339818358421326, + "learning_rate": 0.0004956328671328671, + "loss": 3.4414, + "step": 29950 + }, + { + "epoch": 8.736210612149804, + "grad_norm": 0.3127419352531433, + "learning_rate": 0.0004954580419580419, + "loss": 3.4342, + "step": 30000 + }, + { + "epoch": 8.736210612149804, + "eval_accuracy": 0.36726194101037535, + "eval_loss": 3.5715725421905518, + "eval_runtime": 179.7611, + "eval_samples_per_second": 92.601, + "eval_steps_per_second": 5.791, + "step": 30000 + }, + { + "epoch": 8.750771739763527, + "grad_norm": 0.3249102532863617, + "learning_rate": 0.0004952832167832167, + "loss": 3.428, + "step": 30050 + }, + { + "epoch": 8.76533286737725, + "grad_norm": 0.34389105439186096, + "learning_rate": 0.0004951083916083915, + "loss": 3.4286, + "step": 30100 + }, + { + "epoch": 8.779893994990973, + "grad_norm": 0.34607070684432983, + "learning_rate": 0.0004949335664335664, + "loss": 3.4325, + "step": 30150 + }, + { + "epoch": 8.794455122604695, + "grad_norm": 0.33967599272727966, + "learning_rate": 0.0004947587412587412, + "loss": 3.4188, + "step": 30200 + }, + { + "epoch": 8.809016250218416, + "grad_norm": 0.34365391731262207, + "learning_rate": 0.000494583916083916, + "loss": 3.4258, + "step": 30250 + }, + { + "epoch": 8.82357737783214, + "grad_norm": 0.31158357858657837, + "learning_rate": 0.0004944090909090908, + "loss": 3.4302, + "step": 30300 + }, + { + "epoch": 8.838138505445862, + "grad_norm": 0.3425881564617157, + "learning_rate": 0.0004942342657342657, + "loss": 3.4471, + "step": 30350 + }, + { + "epoch": 8.852699633059585, + "grad_norm": 0.33694136142730713, + "learning_rate": 0.0004940594405594405, + "loss": 3.4363, + "step": 30400 + }, + { + "epoch": 8.867260760673307, + "grad_norm": 0.33916687965393066, + "learning_rate": 0.0004938846153846153, + "loss": 3.4398, + "step": 30450 + }, + { + "epoch": 8.881821888287028, + "grad_norm": 0.3424004912376404, + "learning_rate": 0.0004937097902097901, + "loss": 3.4373, + "step": 30500 + }, + { + "epoch": 8.896383015900751, + "grad_norm": 0.3579810857772827, + "learning_rate": 0.000493534965034965, + "loss": 3.4423, + "step": 30550 + }, + { + "epoch": 8.910944143514474, + "grad_norm": 0.35978007316589355, + "learning_rate": 0.0004933601398601398, + "loss": 3.4223, + "step": 30600 + }, + { + "epoch": 8.925505271128197, + "grad_norm": 0.34889093041419983, + "learning_rate": 0.0004931853146853146, + "loss": 3.4384, + "step": 30650 + }, + { + "epoch": 8.94006639874192, + "grad_norm": 0.3178730010986328, + "learning_rate": 0.0004930104895104895, + "loss": 3.4316, + "step": 30700 + }, + { + "epoch": 8.95462752635564, + "grad_norm": 0.3225439190864563, + "learning_rate": 0.0004928356643356642, + "loss": 3.4376, + "step": 30750 + }, + { + "epoch": 8.969188653969363, + "grad_norm": 0.32753077149391174, + "learning_rate": 0.0004926608391608391, + "loss": 3.4457, + "step": 30800 + }, + { + "epoch": 8.983749781583086, + "grad_norm": 0.3687169551849365, + "learning_rate": 0.0004924860139860139, + "loss": 3.4323, + "step": 30850 + }, + { + "epoch": 8.998310909196809, + "grad_norm": 0.3431978225708008, + "learning_rate": 0.0004923111888111887, + "loss": 3.4443, + "step": 30900 + }, + { + "epoch": 9.012813792300076, + "grad_norm": 0.3386369049549103, + "learning_rate": 0.0004921363636363635, + "loss": 3.3368, + "step": 30950 + }, + { + "epoch": 9.027374919913798, + "grad_norm": 0.35466766357421875, + "learning_rate": 0.0004919615384615384, + "loss": 3.3258, + "step": 31000 + }, + { + "epoch": 9.027374919913798, + "eval_accuracy": 0.36759758225444167, + "eval_loss": 3.57794451713562, + "eval_runtime": 184.3404, + "eval_samples_per_second": 90.3, + "eval_steps_per_second": 5.647, + "step": 31000 + }, + { + "epoch": 9.041936047527521, + "grad_norm": 0.3356529176235199, + "learning_rate": 0.0004917867132867132, + "loss": 3.3184, + "step": 31050 + }, + { + "epoch": 9.056497175141242, + "grad_norm": 0.3212270140647888, + "learning_rate": 0.000491611888111888, + "loss": 3.3383, + "step": 31100 + }, + { + "epoch": 9.071058302754965, + "grad_norm": 0.3324335813522339, + "learning_rate": 0.0004914370629370628, + "loss": 3.3392, + "step": 31150 + }, + { + "epoch": 9.085619430368688, + "grad_norm": 0.32331228256225586, + "learning_rate": 0.0004912622377622378, + "loss": 3.3521, + "step": 31200 + }, + { + "epoch": 9.10018055798241, + "grad_norm": 0.31954678893089294, + "learning_rate": 0.0004910874125874126, + "loss": 3.3435, + "step": 31250 + }, + { + "epoch": 9.114741685596133, + "grad_norm": 0.32974445819854736, + "learning_rate": 0.0004909125874125874, + "loss": 3.3501, + "step": 31300 + }, + { + "epoch": 9.129302813209854, + "grad_norm": 0.35506731271743774, + "learning_rate": 0.0004907377622377623, + "loss": 3.3467, + "step": 31350 + }, + { + "epoch": 9.143863940823577, + "grad_norm": 0.32969748973846436, + "learning_rate": 0.0004905629370629371, + "loss": 3.357, + "step": 31400 + }, + { + "epoch": 9.1584250684373, + "grad_norm": 0.3305834233760834, + "learning_rate": 0.0004903881118881119, + "loss": 3.3573, + "step": 31450 + }, + { + "epoch": 9.172986196051022, + "grad_norm": 0.33574923872947693, + "learning_rate": 0.0004902132867132867, + "loss": 3.3572, + "step": 31500 + }, + { + "epoch": 9.187547323664745, + "grad_norm": 0.32476624846458435, + "learning_rate": 0.0004900384615384615, + "loss": 3.3454, + "step": 31550 + }, + { + "epoch": 9.202108451278466, + "grad_norm": 0.36604878306388855, + "learning_rate": 0.0004898636363636363, + "loss": 3.3598, + "step": 31600 + }, + { + "epoch": 9.216669578892189, + "grad_norm": 0.3407774567604065, + "learning_rate": 0.0004896888111888112, + "loss": 3.3655, + "step": 31650 + }, + { + "epoch": 9.231230706505912, + "grad_norm": 0.3136043846607208, + "learning_rate": 0.000489513986013986, + "loss": 3.3658, + "step": 31700 + }, + { + "epoch": 9.245791834119634, + "grad_norm": 0.34752407670021057, + "learning_rate": 0.0004893391608391608, + "loss": 3.374, + "step": 31750 + }, + { + "epoch": 9.260352961733357, + "grad_norm": 0.33697524666786194, + "learning_rate": 0.0004891643356643356, + "loss": 3.3715, + "step": 31800 + }, + { + "epoch": 9.27491408934708, + "grad_norm": 0.3399849832057953, + "learning_rate": 0.0004889895104895105, + "loss": 3.3781, + "step": 31850 + }, + { + "epoch": 9.2894752169608, + "grad_norm": 0.32320427894592285, + "learning_rate": 0.0004888146853146853, + "loss": 3.3606, + "step": 31900 + }, + { + "epoch": 9.304036344574524, + "grad_norm": 0.3273387849330902, + "learning_rate": 0.0004886398601398601, + "loss": 3.3726, + "step": 31950 + }, + { + "epoch": 9.318597472188246, + "grad_norm": 0.33997225761413574, + "learning_rate": 0.000488465034965035, + "loss": 3.3831, + "step": 32000 + }, + { + "epoch": 9.318597472188246, + "eval_accuracy": 0.3672809861527707, + "eval_loss": 3.577822685241699, + "eval_runtime": 186.0219, + "eval_samples_per_second": 89.484, + "eval_steps_per_second": 5.596, + "step": 32000 + }, + { + "epoch": 9.333158599801969, + "grad_norm": 0.3397623896598816, + "learning_rate": 0.0004882902097902098, + "loss": 3.3802, + "step": 32050 + }, + { + "epoch": 9.347719727415692, + "grad_norm": 0.3965780735015869, + "learning_rate": 0.0004881153846153846, + "loss": 3.3961, + "step": 32100 + }, + { + "epoch": 9.362280855029413, + "grad_norm": 0.32509127259254456, + "learning_rate": 0.0004879405594405594, + "loss": 3.392, + "step": 32150 + }, + { + "epoch": 9.376841982643136, + "grad_norm": 0.3580123484134674, + "learning_rate": 0.00048776573426573424, + "loss": 3.3685, + "step": 32200 + }, + { + "epoch": 9.391403110256858, + "grad_norm": 0.33572641015052795, + "learning_rate": 0.00048759090909090904, + "loss": 3.3738, + "step": 32250 + }, + { + "epoch": 9.405964237870581, + "grad_norm": 0.34592849016189575, + "learning_rate": 0.0004874160839160839, + "loss": 3.3792, + "step": 32300 + }, + { + "epoch": 9.420525365484304, + "grad_norm": 0.39023056626319885, + "learning_rate": 0.0004872412587412587, + "loss": 3.3712, + "step": 32350 + }, + { + "epoch": 9.435086493098025, + "grad_norm": 0.3557857871055603, + "learning_rate": 0.00048706643356643354, + "loss": 3.3959, + "step": 32400 + }, + { + "epoch": 9.449647620711747, + "grad_norm": 0.3627590537071228, + "learning_rate": 0.00048689160839160834, + "loss": 3.3798, + "step": 32450 + }, + { + "epoch": 9.46420874832547, + "grad_norm": 0.34032562375068665, + "learning_rate": 0.0004867167832167832, + "loss": 3.3949, + "step": 32500 + }, + { + "epoch": 9.478769875939193, + "grad_norm": 0.32405319809913635, + "learning_rate": 0.00048654195804195794, + "loss": 3.382, + "step": 32550 + }, + { + "epoch": 9.493331003552916, + "grad_norm": 0.34905362129211426, + "learning_rate": 0.00048636713286713285, + "loss": 3.3953, + "step": 32600 + }, + { + "epoch": 9.507892131166638, + "grad_norm": 0.3418472409248352, + "learning_rate": 0.0004861923076923077, + "loss": 3.3972, + "step": 32650 + }, + { + "epoch": 9.52245325878036, + "grad_norm": 0.3480176031589508, + "learning_rate": 0.00048601748251748245, + "loss": 3.3983, + "step": 32700 + }, + { + "epoch": 9.537014386394082, + "grad_norm": 0.3377174139022827, + "learning_rate": 0.0004858426573426573, + "loss": 3.3821, + "step": 32750 + }, + { + "epoch": 9.551575514007805, + "grad_norm": 0.3357522487640381, + "learning_rate": 0.0004856678321678321, + "loss": 3.3975, + "step": 32800 + }, + { + "epoch": 9.566136641621528, + "grad_norm": 0.32841238379478455, + "learning_rate": 0.00048549300699300696, + "loss": 3.3976, + "step": 32850 + }, + { + "epoch": 9.58069776923525, + "grad_norm": 0.33749887347221375, + "learning_rate": 0.00048531818181818176, + "loss": 3.4136, + "step": 32900 + }, + { + "epoch": 9.595258896848971, + "grad_norm": 0.3626416325569153, + "learning_rate": 0.0004851433566433566, + "loss": 3.3985, + "step": 32950 + }, + { + "epoch": 9.609820024462694, + "grad_norm": 0.36860400438308716, + "learning_rate": 0.0004849685314685314, + "loss": 3.3965, + "step": 33000 + }, + { + "epoch": 9.609820024462694, + "eval_accuracy": 0.36782729958925975, + "eval_loss": 3.5715696811676025, + "eval_runtime": 183.923, + "eval_samples_per_second": 90.505, + "eval_steps_per_second": 5.66, + "step": 33000 + }, + { + "epoch": 9.624381152076417, + "grad_norm": 0.3426574468612671, + "learning_rate": 0.00048479370629370627, + "loss": 3.4014, + "step": 33050 + }, + { + "epoch": 9.63894227969014, + "grad_norm": 0.3384750783443451, + "learning_rate": 0.00048461888111888106, + "loss": 3.4112, + "step": 33100 + }, + { + "epoch": 9.653503407303862, + "grad_norm": 0.35202690958976746, + "learning_rate": 0.0004844440559440559, + "loss": 3.4129, + "step": 33150 + }, + { + "epoch": 9.668064534917583, + "grad_norm": 0.355497270822525, + "learning_rate": 0.0004842692307692307, + "loss": 3.3892, + "step": 33200 + }, + { + "epoch": 9.682625662531306, + "grad_norm": 0.32850146293640137, + "learning_rate": 0.00048409440559440557, + "loss": 3.4, + "step": 33250 + }, + { + "epoch": 9.697186790145029, + "grad_norm": 0.3368713855743408, + "learning_rate": 0.0004839195804195803, + "loss": 3.4127, + "step": 33300 + }, + { + "epoch": 9.711747917758752, + "grad_norm": 0.3568696677684784, + "learning_rate": 0.0004837447552447552, + "loss": 3.3944, + "step": 33350 + }, + { + "epoch": 9.726309045372474, + "grad_norm": 0.32732048630714417, + "learning_rate": 0.0004835699300699301, + "loss": 3.4031, + "step": 33400 + }, + { + "epoch": 9.740870172986195, + "grad_norm": 0.3446010649204254, + "learning_rate": 0.0004833951048951048, + "loss": 3.4169, + "step": 33450 + }, + { + "epoch": 9.755431300599918, + "grad_norm": 0.32168522477149963, + "learning_rate": 0.0004832202797202797, + "loss": 3.4106, + "step": 33500 + }, + { + "epoch": 9.76999242821364, + "grad_norm": 0.35548439621925354, + "learning_rate": 0.0004830454545454545, + "loss": 3.3974, + "step": 33550 + }, + { + "epoch": 9.784553555827364, + "grad_norm": 0.3315522074699402, + "learning_rate": 0.00048287062937062933, + "loss": 3.4138, + "step": 33600 + }, + { + "epoch": 9.799114683441086, + "grad_norm": 0.33013713359832764, + "learning_rate": 0.00048269580419580413, + "loss": 3.4092, + "step": 33650 + }, + { + "epoch": 9.813675811054807, + "grad_norm": 0.34848856925964355, + "learning_rate": 0.000482520979020979, + "loss": 3.4104, + "step": 33700 + }, + { + "epoch": 9.82823693866853, + "grad_norm": 0.32687628269195557, + "learning_rate": 0.0004823461538461538, + "loss": 3.4018, + "step": 33750 + }, + { + "epoch": 9.842798066282253, + "grad_norm": 0.36140188574790955, + "learning_rate": 0.00048217132867132864, + "loss": 3.4006, + "step": 33800 + }, + { + "epoch": 9.857359193895975, + "grad_norm": 0.32018741965293884, + "learning_rate": 0.00048199650349650344, + "loss": 3.4013, + "step": 33850 + }, + { + "epoch": 9.871920321509698, + "grad_norm": 0.34114909172058105, + "learning_rate": 0.0004818216783216783, + "loss": 3.3959, + "step": 33900 + }, + { + "epoch": 9.88648144912342, + "grad_norm": 0.32743898034095764, + "learning_rate": 0.0004816468531468531, + "loss": 3.4191, + "step": 33950 + }, + { + "epoch": 9.901042576737142, + "grad_norm": 0.37873852252960205, + "learning_rate": 0.00048147202797202795, + "loss": 3.4187, + "step": 34000 + }, + { + "epoch": 9.901042576737142, + "eval_accuracy": 0.36834492774954836, + "eval_loss": 3.561105728149414, + "eval_runtime": 179.7864, + "eval_samples_per_second": 92.588, + "eval_steps_per_second": 5.79, + "step": 34000 + }, + { + "epoch": 9.915603704350865, + "grad_norm": 0.33194535970687866, + "learning_rate": 0.0004812972027972028, + "loss": 3.4104, + "step": 34050 + }, + { + "epoch": 9.930164831964587, + "grad_norm": 0.3524761497974396, + "learning_rate": 0.0004811223776223776, + "loss": 3.4132, + "step": 34100 + }, + { + "epoch": 9.94472595957831, + "grad_norm": 0.3482424020767212, + "learning_rate": 0.00048094755244755245, + "loss": 3.4059, + "step": 34150 + }, + { + "epoch": 9.959287087192033, + "grad_norm": 0.350705087184906, + "learning_rate": 0.0004807727272727272, + "loss": 3.4047, + "step": 34200 + }, + { + "epoch": 9.973848214805754, + "grad_norm": 0.33391574025154114, + "learning_rate": 0.00048059790209790205, + "loss": 3.4213, + "step": 34250 + }, + { + "epoch": 9.988409342419477, + "grad_norm": 0.3514692187309265, + "learning_rate": 0.00048042307692307685, + "loss": 3.4138, + "step": 34300 + }, + { + "epoch": 10.002912225522744, + "grad_norm": 0.3422977030277252, + "learning_rate": 0.0004802482517482517, + "loss": 3.3925, + "step": 34350 + }, + { + "epoch": 10.017473353136467, + "grad_norm": 0.33694586157798767, + "learning_rate": 0.0004800734265734265, + "loss": 3.296, + "step": 34400 + }, + { + "epoch": 10.03203448075019, + "grad_norm": 0.3611753284931183, + "learning_rate": 0.00047989860139860136, + "loss": 3.2954, + "step": 34450 + }, + { + "epoch": 10.046595608363912, + "grad_norm": 0.3551093637943268, + "learning_rate": 0.00047972377622377616, + "loss": 3.2935, + "step": 34500 + }, + { + "epoch": 10.061156735977635, + "grad_norm": 0.37375402450561523, + "learning_rate": 0.000479548951048951, + "loss": 3.3175, + "step": 34550 + }, + { + "epoch": 10.075717863591356, + "grad_norm": 0.365528404712677, + "learning_rate": 0.0004793741258741258, + "loss": 3.3185, + "step": 34600 + }, + { + "epoch": 10.090278991205079, + "grad_norm": 0.35895606875419617, + "learning_rate": 0.00047919930069930067, + "loss": 3.3174, + "step": 34650 + }, + { + "epoch": 10.104840118818801, + "grad_norm": 0.32946503162384033, + "learning_rate": 0.0004790244755244755, + "loss": 3.32, + "step": 34700 + }, + { + "epoch": 10.119401246432524, + "grad_norm": 0.33243829011917114, + "learning_rate": 0.0004788496503496503, + "loss": 3.3142, + "step": 34750 + }, + { + "epoch": 10.133962374046247, + "grad_norm": 0.3511507511138916, + "learning_rate": 0.0004786748251748252, + "loss": 3.3314, + "step": 34800 + }, + { + "epoch": 10.148523501659968, + "grad_norm": 0.3584575653076172, + "learning_rate": 0.0004785, + "loss": 3.3345, + "step": 34850 + }, + { + "epoch": 10.16308462927369, + "grad_norm": 0.3298545479774475, + "learning_rate": 0.00047832517482517483, + "loss": 3.339, + "step": 34900 + }, + { + "epoch": 10.177645756887413, + "grad_norm": 0.3483952581882477, + "learning_rate": 0.0004781503496503496, + "loss": 3.324, + "step": 34950 + }, + { + "epoch": 10.192206884501136, + "grad_norm": 0.3524647057056427, + "learning_rate": 0.00047797552447552443, + "loss": 3.344, + "step": 35000 + }, + { + "epoch": 10.192206884501136, + "eval_accuracy": 0.3683687929588463, + "eval_loss": 3.5726046562194824, + "eval_runtime": 180.0575, + "eval_samples_per_second": 92.448, + "eval_steps_per_second": 5.781, + "step": 35000 + }, + { + "epoch": 10.206768012114859, + "grad_norm": 0.3376518189907074, + "learning_rate": 0.00047780069930069923, + "loss": 3.3347, + "step": 35050 + }, + { + "epoch": 10.221329139728581, + "grad_norm": 0.3457695543766022, + "learning_rate": 0.0004776258741258741, + "loss": 3.3483, + "step": 35100 + }, + { + "epoch": 10.235890267342302, + "grad_norm": 0.37430188059806824, + "learning_rate": 0.0004774510489510489, + "loss": 3.3552, + "step": 35150 + }, + { + "epoch": 10.250451394956025, + "grad_norm": 0.3510351777076721, + "learning_rate": 0.00047727622377622374, + "loss": 3.3549, + "step": 35200 + }, + { + "epoch": 10.265012522569748, + "grad_norm": 0.37889590859413147, + "learning_rate": 0.00047710139860139854, + "loss": 3.3416, + "step": 35250 + }, + { + "epoch": 10.27957365018347, + "grad_norm": 0.3422775864601135, + "learning_rate": 0.0004769265734265734, + "loss": 3.3383, + "step": 35300 + }, + { + "epoch": 10.294134777797193, + "grad_norm": 0.38626229763031006, + "learning_rate": 0.0004767517482517482, + "loss": 3.3461, + "step": 35350 + }, + { + "epoch": 10.308695905410914, + "grad_norm": 0.3493908643722534, + "learning_rate": 0.00047657692307692304, + "loss": 3.3535, + "step": 35400 + }, + { + "epoch": 10.323257033024637, + "grad_norm": 0.35432669520378113, + "learning_rate": 0.0004764020979020979, + "loss": 3.3555, + "step": 35450 + }, + { + "epoch": 10.33781816063836, + "grad_norm": 0.3410918116569519, + "learning_rate": 0.0004762272727272727, + "loss": 3.3469, + "step": 35500 + }, + { + "epoch": 10.352379288252083, + "grad_norm": 0.36023515462875366, + "learning_rate": 0.00047605244755244755, + "loss": 3.3584, + "step": 35550 + }, + { + "epoch": 10.366940415865805, + "grad_norm": 0.3287743330001831, + "learning_rate": 0.00047587762237762235, + "loss": 3.3674, + "step": 35600 + }, + { + "epoch": 10.381501543479526, + "grad_norm": 0.3435341715812683, + "learning_rate": 0.0004757027972027972, + "loss": 3.3646, + "step": 35650 + }, + { + "epoch": 10.396062671093249, + "grad_norm": 0.3478064239025116, + "learning_rate": 0.00047552797202797195, + "loss": 3.3675, + "step": 35700 + }, + { + "epoch": 10.410623798706972, + "grad_norm": 0.3641142249107361, + "learning_rate": 0.0004753531468531468, + "loss": 3.3591, + "step": 35750 + }, + { + "epoch": 10.425184926320695, + "grad_norm": 0.3393605649471283, + "learning_rate": 0.0004751783216783216, + "loss": 3.3615, + "step": 35800 + }, + { + "epoch": 10.439746053934417, + "grad_norm": 0.38742467761039734, + "learning_rate": 0.00047500349650349646, + "loss": 3.3701, + "step": 35850 + }, + { + "epoch": 10.454307181548138, + "grad_norm": 0.37009376287460327, + "learning_rate": 0.00047482867132867126, + "loss": 3.3576, + "step": 35900 + }, + { + "epoch": 10.468868309161861, + "grad_norm": 0.36964377760887146, + "learning_rate": 0.0004746538461538461, + "loss": 3.3546, + "step": 35950 + }, + { + "epoch": 10.483429436775584, + "grad_norm": 0.3347964882850647, + "learning_rate": 0.0004744790209790209, + "loss": 3.3789, + "step": 36000 + }, + { + "epoch": 10.483429436775584, + "eval_accuracy": 0.3682949636414124, + "eval_loss": 3.5694682598114014, + "eval_runtime": 179.8767, + "eval_samples_per_second": 92.541, + "eval_steps_per_second": 5.787, + "step": 36000 + }, + { + "epoch": 10.497990564389307, + "grad_norm": 0.3568975329399109, + "learning_rate": 0.00047430419580419576, + "loss": 3.3692, + "step": 36050 + }, + { + "epoch": 10.51255169200303, + "grad_norm": 0.3243386447429657, + "learning_rate": 0.0004741293706293706, + "loss": 3.3693, + "step": 36100 + }, + { + "epoch": 10.52711281961675, + "grad_norm": 0.3336549997329712, + "learning_rate": 0.0004739545454545454, + "loss": 3.3785, + "step": 36150 + }, + { + "epoch": 10.541673947230473, + "grad_norm": 0.3561848998069763, + "learning_rate": 0.00047377972027972027, + "loss": 3.3691, + "step": 36200 + }, + { + "epoch": 10.556235074844196, + "grad_norm": 0.356851726770401, + "learning_rate": 0.00047360489510489507, + "loss": 3.3643, + "step": 36250 + }, + { + "epoch": 10.570796202457919, + "grad_norm": 0.33825376629829407, + "learning_rate": 0.0004734300699300699, + "loss": 3.3913, + "step": 36300 + }, + { + "epoch": 10.585357330071641, + "grad_norm": 0.3185909390449524, + "learning_rate": 0.0004732552447552447, + "loss": 3.3784, + "step": 36350 + }, + { + "epoch": 10.599918457685362, + "grad_norm": 0.3499145805835724, + "learning_rate": 0.0004730804195804196, + "loss": 3.379, + "step": 36400 + }, + { + "epoch": 10.614479585299085, + "grad_norm": NaN, + "learning_rate": 0.0004729055944055943, + "loss": 3.3796, + "step": 36450 + }, + { + "epoch": 10.629040712912808, + "grad_norm": 0.34612196683883667, + "learning_rate": 0.0004727307692307692, + "loss": 3.3845, + "step": 36500 + }, + { + "epoch": 10.64360184052653, + "grad_norm": 0.3680227994918823, + "learning_rate": 0.000472555944055944, + "loss": 3.3761, + "step": 36550 + }, + { + "epoch": 10.658162968140253, + "grad_norm": 0.36743855476379395, + "learning_rate": 0.00047238111888111883, + "loss": 3.3684, + "step": 36600 + }, + { + "epoch": 10.672724095753976, + "grad_norm": 0.33822494745254517, + "learning_rate": 0.00047220629370629363, + "loss": 3.3854, + "step": 36650 + }, + { + "epoch": 10.687285223367697, + "grad_norm": 0.3840745687484741, + "learning_rate": 0.0004720314685314685, + "loss": 3.3676, + "step": 36700 + }, + { + "epoch": 10.70184635098142, + "grad_norm": 0.3411411941051483, + "learning_rate": 0.0004718566433566433, + "loss": 3.3732, + "step": 36750 + }, + { + "epoch": 10.716407478595142, + "grad_norm": 0.3389338552951813, + "learning_rate": 0.00047168181818181814, + "loss": 3.3848, + "step": 36800 + }, + { + "epoch": 10.730968606208865, + "grad_norm": 0.33619245886802673, + "learning_rate": 0.000471506993006993, + "loss": 3.3761, + "step": 36850 + }, + { + "epoch": 10.745529733822588, + "grad_norm": 0.35158175230026245, + "learning_rate": 0.0004713321678321678, + "loss": 3.3823, + "step": 36900 + }, + { + "epoch": 10.760090861436309, + "grad_norm": 0.3377283811569214, + "learning_rate": 0.00047115734265734265, + "loss": 3.4041, + "step": 36950 + }, + { + "epoch": 10.774651989050032, + "grad_norm": 0.3622257113456726, + "learning_rate": 0.00047098251748251745, + "loss": 3.4005, + "step": 37000 + }, + { + "epoch": 10.774651989050032, + "eval_accuracy": 0.36925674333237796, + "eval_loss": 3.5602827072143555, + "eval_runtime": 179.7742, + "eval_samples_per_second": 92.594, + "eval_steps_per_second": 5.791, + "step": 37000 + }, + { + "epoch": 10.789213116663754, + "grad_norm": 0.3160146474838257, + "learning_rate": 0.0004708076923076923, + "loss": 3.3912, + "step": 37050 + }, + { + "epoch": 10.803774244277477, + "grad_norm": 0.3281710147857666, + "learning_rate": 0.0004706328671328671, + "loss": 3.3967, + "step": 37100 + }, + { + "epoch": 10.8183353718912, + "grad_norm": 0.3841243386268616, + "learning_rate": 0.00047045804195804195, + "loss": 3.3979, + "step": 37150 + }, + { + "epoch": 10.83289649950492, + "grad_norm": 0.33870750665664673, + "learning_rate": 0.0004702832167832167, + "loss": 3.3879, + "step": 37200 + }, + { + "epoch": 10.847457627118644, + "grad_norm": 0.34300288558006287, + "learning_rate": 0.00047010839160839155, + "loss": 3.3959, + "step": 37250 + }, + { + "epoch": 10.862018754732366, + "grad_norm": 0.35645779967308044, + "learning_rate": 0.00046993356643356635, + "loss": 3.3974, + "step": 37300 + }, + { + "epoch": 10.876579882346089, + "grad_norm": 0.35175466537475586, + "learning_rate": 0.0004697587412587412, + "loss": 3.3888, + "step": 37350 + }, + { + "epoch": 10.891141009959812, + "grad_norm": 0.3222729563713074, + "learning_rate": 0.000469583916083916, + "loss": 3.3981, + "step": 37400 + }, + { + "epoch": 10.905702137573535, + "grad_norm": 0.32414838671684265, + "learning_rate": 0.00046940909090909086, + "loss": 3.391, + "step": 37450 + }, + { + "epoch": 10.920263265187256, + "grad_norm": 0.34362274408340454, + "learning_rate": 0.0004692342657342657, + "loss": 3.3832, + "step": 37500 + }, + { + "epoch": 10.934824392800978, + "grad_norm": 0.3682989776134491, + "learning_rate": 0.0004690594405594405, + "loss": 3.3894, + "step": 37550 + }, + { + "epoch": 10.949385520414701, + "grad_norm": 0.3218347728252411, + "learning_rate": 0.00046888461538461537, + "loss": 3.3901, + "step": 37600 + }, + { + "epoch": 10.963946648028424, + "grad_norm": 0.37173983454704285, + "learning_rate": 0.00046870979020979017, + "loss": 3.3884, + "step": 37650 + }, + { + "epoch": 10.978507775642147, + "grad_norm": 0.37107351422309875, + "learning_rate": 0.000468534965034965, + "loss": 3.3922, + "step": 37700 + }, + { + "epoch": 10.993068903255867, + "grad_norm": 0.35204780101776123, + "learning_rate": 0.0004683601398601398, + "loss": 3.3753, + "step": 37750 + }, + { + "epoch": 11.007571786359136, + "grad_norm": 0.33582913875579834, + "learning_rate": 0.0004681853146853147, + "loss": 3.3251, + "step": 37800 + }, + { + "epoch": 11.022132913972857, + "grad_norm": 0.33512166142463684, + "learning_rate": 0.0004680104895104895, + "loss": 3.2653, + "step": 37850 + }, + { + "epoch": 11.03669404158658, + "grad_norm": 0.3530137240886688, + "learning_rate": 0.00046783566433566433, + "loss": 3.2888, + "step": 37900 + }, + { + "epoch": 11.051255169200303, + "grad_norm": 0.3322924077510834, + "learning_rate": 0.0004676608391608391, + "loss": 3.304, + "step": 37950 + }, + { + "epoch": 11.065816296814026, + "grad_norm": 0.34434235095977783, + "learning_rate": 0.00046748601398601393, + "loss": 3.2886, + "step": 38000 + }, + { + "epoch": 11.065816296814026, + "eval_accuracy": 0.36925674333237796, + "eval_loss": 3.5678048133850098, + "eval_runtime": 179.7265, + "eval_samples_per_second": 92.618, + "eval_steps_per_second": 5.792, + "step": 38000 + }, + { + "epoch": 11.080377424427748, + "grad_norm": 0.3417208790779114, + "learning_rate": 0.00046731118881118873, + "loss": 3.2852, + "step": 38050 + }, + { + "epoch": 11.09493855204147, + "grad_norm": 0.36699753999710083, + "learning_rate": 0.0004671363636363636, + "loss": 3.2885, + "step": 38100 + }, + { + "epoch": 11.109499679655192, + "grad_norm": 0.336487740278244, + "learning_rate": 0.00046696153846153844, + "loss": 3.309, + "step": 38150 + }, + { + "epoch": 11.124060807268915, + "grad_norm": 0.35883960127830505, + "learning_rate": 0.00046678671328671324, + "loss": 3.3019, + "step": 38200 + }, + { + "epoch": 11.138621934882638, + "grad_norm": 0.3807854950428009, + "learning_rate": 0.0004666118881118881, + "loss": 3.3034, + "step": 38250 + }, + { + "epoch": 11.15318306249636, + "grad_norm": 0.3594328463077545, + "learning_rate": 0.0004664370629370629, + "loss": 3.2984, + "step": 38300 + }, + { + "epoch": 11.167744190110081, + "grad_norm": 0.344275563955307, + "learning_rate": 0.00046626223776223774, + "loss": 3.3229, + "step": 38350 + }, + { + "epoch": 11.182305317723804, + "grad_norm": 0.3437211811542511, + "learning_rate": 0.00046608741258741254, + "loss": 3.3241, + "step": 38400 + }, + { + "epoch": 11.196866445337527, + "grad_norm": 0.3443619906902313, + "learning_rate": 0.0004659125874125874, + "loss": 3.319, + "step": 38450 + }, + { + "epoch": 11.21142757295125, + "grad_norm": 0.3490242063999176, + "learning_rate": 0.0004657377622377622, + "loss": 3.3212, + "step": 38500 + }, + { + "epoch": 11.225988700564972, + "grad_norm": 0.33340850472450256, + "learning_rate": 0.00046556293706293705, + "loss": 3.3279, + "step": 38550 + }, + { + "epoch": 11.240549828178693, + "grad_norm": 0.33329835534095764, + "learning_rate": 0.00046538811188811185, + "loss": 3.3262, + "step": 38600 + }, + { + "epoch": 11.255110955792416, + "grad_norm": 0.3586530387401581, + "learning_rate": 0.0004652132867132867, + "loss": 3.3209, + "step": 38650 + }, + { + "epoch": 11.269672083406139, + "grad_norm": 0.3424071669578552, + "learning_rate": 0.00046503846153846145, + "loss": 3.3249, + "step": 38700 + }, + { + "epoch": 11.284233211019862, + "grad_norm": 0.36820656061172485, + "learning_rate": 0.0004648636363636363, + "loss": 3.3411, + "step": 38750 + }, + { + "epoch": 11.298794338633584, + "grad_norm": 0.33335718512535095, + "learning_rate": 0.0004646888111888111, + "loss": 3.3367, + "step": 38800 + }, + { + "epoch": 11.313355466247307, + "grad_norm": 0.3624469041824341, + "learning_rate": 0.00046451398601398596, + "loss": 3.3382, + "step": 38850 + }, + { + "epoch": 11.327916593861028, + "grad_norm": 0.3541378974914551, + "learning_rate": 0.0004643391608391608, + "loss": 3.335, + "step": 38900 + }, + { + "epoch": 11.34247772147475, + "grad_norm": 0.3461047112941742, + "learning_rate": 0.0004641643356643356, + "loss": 3.3381, + "step": 38950 + }, + { + "epoch": 11.357038849088473, + "grad_norm": 0.33990010619163513, + "learning_rate": 0.00046398951048951046, + "loss": 3.339, + "step": 39000 + }, + { + "epoch": 11.357038849088473, + "eval_accuracy": 0.36928096122949794, + "eval_loss": 3.5659894943237305, + "eval_runtime": 179.756, + "eval_samples_per_second": 92.603, + "eval_steps_per_second": 5.791, + "step": 39000 + }, + { + "epoch": 11.371599976702196, + "grad_norm": 0.32082730531692505, + "learning_rate": 0.00046381468531468526, + "loss": 3.3427, + "step": 39050 + }, + { + "epoch": 11.386161104315919, + "grad_norm": 0.34358900785446167, + "learning_rate": 0.0004636398601398601, + "loss": 3.3326, + "step": 39100 + }, + { + "epoch": 11.40072223192964, + "grad_norm": 0.3438430428504944, + "learning_rate": 0.0004634650349650349, + "loss": 3.3397, + "step": 39150 + }, + { + "epoch": 11.415283359543363, + "grad_norm": 0.33695173263549805, + "learning_rate": 0.00046329020979020977, + "loss": 3.3419, + "step": 39200 + }, + { + "epoch": 11.429844487157085, + "grad_norm": 0.35443180799484253, + "learning_rate": 0.00046311538461538457, + "loss": 3.3384, + "step": 39250 + }, + { + "epoch": 11.444405614770808, + "grad_norm": 0.3469848930835724, + "learning_rate": 0.0004629405594405594, + "loss": 3.3428, + "step": 39300 + }, + { + "epoch": 11.458966742384531, + "grad_norm": 0.3489019274711609, + "learning_rate": 0.0004627657342657342, + "loss": 3.3367, + "step": 39350 + }, + { + "epoch": 11.473527869998252, + "grad_norm": 0.34575167298316956, + "learning_rate": 0.0004625909090909091, + "loss": 3.3421, + "step": 39400 + }, + { + "epoch": 11.488088997611975, + "grad_norm": 0.36594027280807495, + "learning_rate": 0.0004624160839160838, + "loss": 3.3443, + "step": 39450 + }, + { + "epoch": 11.502650125225697, + "grad_norm": 0.37035486102104187, + "learning_rate": 0.0004622412587412587, + "loss": 3.3556, + "step": 39500 + }, + { + "epoch": 11.51721125283942, + "grad_norm": 0.378604918718338, + "learning_rate": 0.00046206643356643353, + "loss": 3.3449, + "step": 39550 + }, + { + "epoch": 11.531772380453143, + "grad_norm": 0.37700214982032776, + "learning_rate": 0.00046189160839160833, + "loss": 3.3588, + "step": 39600 + }, + { + "epoch": 11.546333508066864, + "grad_norm": 0.3387846350669861, + "learning_rate": 0.0004617167832167832, + "loss": 3.3551, + "step": 39650 + }, + { + "epoch": 11.560894635680587, + "grad_norm": 0.35091421008110046, + "learning_rate": 0.000461541958041958, + "loss": 3.3484, + "step": 39700 + }, + { + "epoch": 11.57545576329431, + "grad_norm": 0.36825278401374817, + "learning_rate": 0.00046136713286713284, + "loss": 3.3578, + "step": 39750 + }, + { + "epoch": 11.590016890908032, + "grad_norm": 0.3793783485889435, + "learning_rate": 0.00046119230769230764, + "loss": 3.3567, + "step": 39800 + }, + { + "epoch": 11.604578018521755, + "grad_norm": 0.35660460591316223, + "learning_rate": 0.0004610174825174825, + "loss": 3.3665, + "step": 39850 + }, + { + "epoch": 11.619139146135478, + "grad_norm": 0.3568241000175476, + "learning_rate": 0.0004608426573426573, + "loss": 3.3687, + "step": 39900 + }, + { + "epoch": 11.633700273749199, + "grad_norm": 0.3692575991153717, + "learning_rate": 0.00046066783216783215, + "loss": 3.3489, + "step": 39950 + }, + { + "epoch": 11.648261401362921, + "grad_norm": 0.33610156178474426, + "learning_rate": 0.00046049300699300695, + "loss": 3.3622, + "step": 40000 + }, + { + "epoch": 11.648261401362921, + "eval_accuracy": 0.36969924898655154, + "eval_loss": 3.5569655895233154, + "eval_runtime": 179.7612, + "eval_samples_per_second": 92.601, + "eval_steps_per_second": 5.791, + "step": 40000 + } + ], + "logging_steps": 50, + "max_steps": 171700, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.3607701815296e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}