diff --git "a/last_to_push_frequency_3591/checkpoint-30000/trainer_state.json" "b/last_to_push_frequency_3591/checkpoint-30000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last_to_push_frequency_3591/checkpoint-30000/trainer_state.json" @@ -0,0 +1,4513 @@ +{ + "best_global_step": 30000, + "best_metric": 3.57382869720459, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_push_frequency_3591/checkpoint-30000", + "epoch": 8.736210612149804, + "eval_steps": 1000, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.014561127613722406, + "grad_norm": 0.8466977477073669, + "learning_rate": 0.000294, + "loss": 8.4098, + "step": 50 + }, + { + "epoch": 0.029122255227444813, + "grad_norm": 0.6842089295387268, + "learning_rate": 0.0005939999999999999, + "loss": 6.7014, + "step": 100 + }, + { + "epoch": 0.04368338284116722, + "grad_norm": 0.7142037749290466, + "learning_rate": 0.0005998286713286713, + "loss": 6.34, + "step": 150 + }, + { + "epoch": 0.058244510454889625, + "grad_norm": 0.5938196778297424, + "learning_rate": 0.0005996538461538461, + "loss": 6.139, + "step": 200 + }, + { + "epoch": 0.07280563806861204, + "grad_norm": 0.3856406807899475, + "learning_rate": 0.0005994790209790209, + "loss": 5.9921, + "step": 250 + }, + { + "epoch": 0.08736676568233444, + "grad_norm": 0.39986154437065125, + "learning_rate": 0.0005993041958041958, + "loss": 5.8431, + "step": 300 + }, + { + "epoch": 0.10192789329605685, + "grad_norm": 0.5341420769691467, + "learning_rate": 0.0005991293706293705, + "loss": 5.7043, + "step": 350 + }, + { + "epoch": 0.11648902090977925, + "grad_norm": 0.4427669644355774, + "learning_rate": 0.0005989545454545454, + "loss": 5.6019, + "step": 400 + }, + { + "epoch": 0.13105014852350166, + "grad_norm": 0.45295220613479614, + "learning_rate": 0.0005987797202797202, + "loss": 5.4996, + "step": 450 + }, + { + "epoch": 0.14561127613722408, + "grad_norm": 0.5014562606811523, + "learning_rate": 0.000598604895104895, + "loss": 5.3991, + "step": 500 + }, + { + "epoch": 0.16017240375094646, + "grad_norm": 0.4532122015953064, + "learning_rate": 0.0005984300699300698, + "loss": 5.3103, + "step": 550 + }, + { + "epoch": 0.17473353136466888, + "grad_norm": 0.45960476994514465, + "learning_rate": 0.0005982552447552447, + "loss": 5.2526, + "step": 600 + }, + { + "epoch": 0.1892946589783913, + "grad_norm": 0.4369155764579773, + "learning_rate": 0.0005980804195804195, + "loss": 5.1734, + "step": 650 + }, + { + "epoch": 0.2038557865921137, + "grad_norm": 0.48178285360336304, + "learning_rate": 0.0005979055944055943, + "loss": 5.1075, + "step": 700 + }, + { + "epoch": 0.2184169142058361, + "grad_norm": 0.44058653712272644, + "learning_rate": 0.0005977307692307691, + "loss": 5.0671, + "step": 750 + }, + { + "epoch": 0.2329780418195585, + "grad_norm": 0.45520836114883423, + "learning_rate": 0.000597555944055944, + "loss": 5.0176, + "step": 800 + }, + { + "epoch": 0.24753916943328091, + "grad_norm": 0.4496179521083832, + "learning_rate": 0.0005973811188811188, + "loss": 4.972, + "step": 850 + }, + { + "epoch": 0.2621002970470033, + "grad_norm": 0.5189023613929749, + "learning_rate": 0.0005972062937062936, + "loss": 4.9247, + "step": 900 + }, + { + "epoch": 0.27666142466072574, + "grad_norm": 0.542473554611206, + "learning_rate": 0.0005970314685314685, + "loss": 4.8671, + "step": 950 + }, + { + "epoch": 0.29122255227444815, + "grad_norm": 0.45608243346214294, + "learning_rate": 0.0005968566433566433, + "loss": 4.8193, + "step": 1000 + }, + { + "epoch": 0.29122255227444815, + "eval_accuracy": 0.2555685292424054, + "eval_loss": 4.740924835205078, + "eval_runtime": 182.37, + "eval_samples_per_second": 91.276, + "eval_steps_per_second": 5.708, + "step": 1000 + }, + { + "epoch": 0.30578367988817057, + "grad_norm": 0.4338395893573761, + "learning_rate": 0.0005966818181818181, + "loss": 4.7728, + "step": 1050 + }, + { + "epoch": 0.3203448075018929, + "grad_norm": 0.5276835560798645, + "learning_rate": 0.0005965069930069929, + "loss": 4.7444, + "step": 1100 + }, + { + "epoch": 0.33490593511561534, + "grad_norm": 0.44655585289001465, + "learning_rate": 0.0005963321678321677, + "loss": 4.6967, + "step": 1150 + }, + { + "epoch": 0.34946706272933775, + "grad_norm": 0.38414475321769714, + "learning_rate": 0.0005961573426573425, + "loss": 4.6614, + "step": 1200 + }, + { + "epoch": 0.36402819034306017, + "grad_norm": 0.4639468193054199, + "learning_rate": 0.0005959825174825174, + "loss": 4.6234, + "step": 1250 + }, + { + "epoch": 0.3785893179567826, + "grad_norm": 0.4747146964073181, + "learning_rate": 0.0005958076923076922, + "loss": 4.6, + "step": 1300 + }, + { + "epoch": 0.393150445570505, + "grad_norm": 0.5148531198501587, + "learning_rate": 0.000595632867132867, + "loss": 4.5744, + "step": 1350 + }, + { + "epoch": 0.4077115731842274, + "grad_norm": 0.4540764391422272, + "learning_rate": 0.0005954580419580418, + "loss": 4.5458, + "step": 1400 + }, + { + "epoch": 0.4222727007979498, + "grad_norm": 0.4469960033893585, + "learning_rate": 0.0005952832167832168, + "loss": 4.5213, + "step": 1450 + }, + { + "epoch": 0.4368338284116722, + "grad_norm": 0.4068995416164398, + "learning_rate": 0.0005951083916083916, + "loss": 4.4989, + "step": 1500 + }, + { + "epoch": 0.4513949560253946, + "grad_norm": 0.36514100432395935, + "learning_rate": 0.0005949335664335664, + "loss": 4.4959, + "step": 1550 + }, + { + "epoch": 0.465956083639117, + "grad_norm": 0.41950780153274536, + "learning_rate": 0.0005947587412587413, + "loss": 4.4468, + "step": 1600 + }, + { + "epoch": 0.4805172112528394, + "grad_norm": 0.42646339535713196, + "learning_rate": 0.0005945839160839161, + "loss": 4.4366, + "step": 1650 + }, + { + "epoch": 0.49507833886656183, + "grad_norm": 0.3759300112724304, + "learning_rate": 0.0005944090909090909, + "loss": 4.425, + "step": 1700 + }, + { + "epoch": 0.5096394664802842, + "grad_norm": 0.40312129259109497, + "learning_rate": 0.0005942342657342657, + "loss": 4.3986, + "step": 1750 + }, + { + "epoch": 0.5242005940940067, + "grad_norm": 0.3916018009185791, + "learning_rate": 0.0005940594405594406, + "loss": 4.3877, + "step": 1800 + }, + { + "epoch": 0.5387617217077291, + "grad_norm": 0.36997199058532715, + "learning_rate": 0.0005938846153846153, + "loss": 4.3737, + "step": 1850 + }, + { + "epoch": 0.5533228493214515, + "grad_norm": 0.4766369163990021, + "learning_rate": 0.0005937097902097902, + "loss": 4.3561, + "step": 1900 + }, + { + "epoch": 0.5678839769351739, + "grad_norm": 0.4964778423309326, + "learning_rate": 0.000593534965034965, + "loss": 4.3618, + "step": 1950 + }, + { + "epoch": 0.5824451045488963, + "grad_norm": 0.43863603472709656, + "learning_rate": 0.0005933601398601398, + "loss": 4.3232, + "step": 2000 + }, + { + "epoch": 0.5824451045488963, + "eval_accuracy": 0.29954776016193546, + "eval_loss": 4.281495094299316, + "eval_runtime": 182.5417, + "eval_samples_per_second": 91.19, + "eval_steps_per_second": 5.703, + "step": 2000 + }, + { + "epoch": 0.5970062321626187, + "grad_norm": 0.3895237147808075, + "learning_rate": 0.0005931853146853146, + "loss": 4.3153, + "step": 2050 + }, + { + "epoch": 0.6115673597763411, + "grad_norm": 0.35456663370132446, + "learning_rate": 0.0005930104895104895, + "loss": 4.3017, + "step": 2100 + }, + { + "epoch": 0.6261284873900634, + "grad_norm": 0.36657285690307617, + "learning_rate": 0.0005928356643356643, + "loss": 4.3021, + "step": 2150 + }, + { + "epoch": 0.6406896150037859, + "grad_norm": 0.3920918107032776, + "learning_rate": 0.0005926608391608391, + "loss": 4.2813, + "step": 2200 + }, + { + "epoch": 0.6552507426175083, + "grad_norm": 0.3744465708732605, + "learning_rate": 0.000592486013986014, + "loss": 4.2666, + "step": 2250 + }, + { + "epoch": 0.6698118702312307, + "grad_norm": 0.38081395626068115, + "learning_rate": 0.0005923111888111888, + "loss": 4.2704, + "step": 2300 + }, + { + "epoch": 0.6843729978449531, + "grad_norm": 0.36270445585250854, + "learning_rate": 0.0005921363636363636, + "loss": 4.24, + "step": 2350 + }, + { + "epoch": 0.6989341254586755, + "grad_norm": 0.40953528881073, + "learning_rate": 0.0005919615384615384, + "loss": 4.2401, + "step": 2400 + }, + { + "epoch": 0.7134952530723979, + "grad_norm": 0.36897608637809753, + "learning_rate": 0.0005917867132867133, + "loss": 4.2372, + "step": 2450 + }, + { + "epoch": 0.7280563806861203, + "grad_norm": 0.35832679271698, + "learning_rate": 0.0005916118881118881, + "loss": 4.2331, + "step": 2500 + }, + { + "epoch": 0.7426175082998427, + "grad_norm": 0.38073498010635376, + "learning_rate": 0.0005914370629370629, + "loss": 4.2068, + "step": 2550 + }, + { + "epoch": 0.7571786359135652, + "grad_norm": 0.4003191888332367, + "learning_rate": 0.0005912622377622377, + "loss": 4.2046, + "step": 2600 + }, + { + "epoch": 0.7717397635272876, + "grad_norm": 0.3635654151439667, + "learning_rate": 0.0005910874125874125, + "loss": 4.1995, + "step": 2650 + }, + { + "epoch": 0.78630089114101, + "grad_norm": 0.35873937606811523, + "learning_rate": 0.0005909125874125873, + "loss": 4.2033, + "step": 2700 + }, + { + "epoch": 0.8008620187547324, + "grad_norm": 0.3539746105670929, + "learning_rate": 0.0005907377622377622, + "loss": 4.1872, + "step": 2750 + }, + { + "epoch": 0.8154231463684548, + "grad_norm": 0.37548962235450745, + "learning_rate": 0.000590562937062937, + "loss": 4.168, + "step": 2800 + }, + { + "epoch": 0.8299842739821772, + "grad_norm": 0.3538251221179962, + "learning_rate": 0.0005903881118881118, + "loss": 4.179, + "step": 2850 + }, + { + "epoch": 0.8445454015958996, + "grad_norm": 0.3582804501056671, + "learning_rate": 0.0005902132867132867, + "loss": 4.1529, + "step": 2900 + }, + { + "epoch": 0.8591065292096219, + "grad_norm": 0.3666636049747467, + "learning_rate": 0.0005900384615384615, + "loss": 4.1502, + "step": 2950 + }, + { + "epoch": 0.8736676568233444, + "grad_norm": 0.37483951449394226, + "learning_rate": 0.0005898636363636363, + "loss": 4.1292, + "step": 3000 + }, + { + "epoch": 0.8736676568233444, + "eval_accuracy": 0.31617228847136397, + "eval_loss": 4.089555263519287, + "eval_runtime": 182.8843, + "eval_samples_per_second": 91.019, + "eval_steps_per_second": 5.692, + "step": 3000 + }, + { + "epoch": 0.8882287844370668, + "grad_norm": 0.3864665925502777, + "learning_rate": 0.0005896888111888111, + "loss": 4.1193, + "step": 3050 + }, + { + "epoch": 0.9027899120507892, + "grad_norm": 0.347319632768631, + "learning_rate": 0.000589513986013986, + "loss": 4.1179, + "step": 3100 + }, + { + "epoch": 0.9173510396645116, + "grad_norm": 0.3676200807094574, + "learning_rate": 0.0005893391608391608, + "loss": 4.1154, + "step": 3150 + }, + { + "epoch": 0.931912167278234, + "grad_norm": 0.3425697088241577, + "learning_rate": 0.0005891643356643356, + "loss": 4.1291, + "step": 3200 + }, + { + "epoch": 0.9464732948919564, + "grad_norm": 0.3959852457046509, + "learning_rate": 0.0005889895104895104, + "loss": 4.1053, + "step": 3250 + }, + { + "epoch": 0.9610344225056788, + "grad_norm": 0.3836718499660492, + "learning_rate": 0.0005888146853146853, + "loss": 4.1068, + "step": 3300 + }, + { + "epoch": 0.9755955501194012, + "grad_norm": 0.4188532531261444, + "learning_rate": 0.00058863986013986, + "loss": 4.0989, + "step": 3350 + }, + { + "epoch": 0.9901566777331237, + "grad_norm": 0.33719179034233093, + "learning_rate": 0.0005884650349650349, + "loss": 4.0963, + "step": 3400 + }, + { + "epoch": 1.004659560836391, + "grad_norm": 0.36592167615890503, + "learning_rate": 0.0005882902097902097, + "loss": 4.0577, + "step": 3450 + }, + { + "epoch": 1.0192206884501136, + "grad_norm": 0.3381110727787018, + "learning_rate": 0.0005881153846153845, + "loss": 4.0016, + "step": 3500 + }, + { + "epoch": 1.033781816063836, + "grad_norm": 0.35888952016830444, + "learning_rate": 0.0005879405594405594, + "loss": 4.0032, + "step": 3550 + }, + { + "epoch": 1.0483429436775584, + "grad_norm": 0.35463494062423706, + "learning_rate": 0.0005877657342657342, + "loss": 4.0018, + "step": 3600 + }, + { + "epoch": 1.0629040712912807, + "grad_norm": 0.3339248299598694, + "learning_rate": 0.000587590909090909, + "loss": 4.0116, + "step": 3650 + }, + { + "epoch": 1.0774651989050033, + "grad_norm": 0.37050846219062805, + "learning_rate": 0.0005874160839160838, + "loss": 4.0031, + "step": 3700 + }, + { + "epoch": 1.0920263265187256, + "grad_norm": 0.37132593989372253, + "learning_rate": 0.0005872412587412587, + "loss": 3.996, + "step": 3750 + }, + { + "epoch": 1.106587454132448, + "grad_norm": 0.3583703339099884, + "learning_rate": 0.0005870664335664335, + "loss": 3.9926, + "step": 3800 + }, + { + "epoch": 1.1211485817461704, + "grad_norm": 0.32278576493263245, + "learning_rate": 0.0005868916083916083, + "loss": 3.9928, + "step": 3850 + }, + { + "epoch": 1.135709709359893, + "grad_norm": 0.46608036756515503, + "learning_rate": 0.0005867167832167831, + "loss": 3.9823, + "step": 3900 + }, + { + "epoch": 1.1502708369736152, + "grad_norm": 0.3258165717124939, + "learning_rate": 0.000586541958041958, + "loss": 3.99, + "step": 3950 + }, + { + "epoch": 1.1648319645873377, + "grad_norm": 0.36865976452827454, + "learning_rate": 0.0005863671328671328, + "loss": 3.9822, + "step": 4000 + }, + { + "epoch": 1.1648319645873377, + "eval_accuracy": 0.3258375806744003, + "eval_loss": 3.9846527576446533, + "eval_runtime": 182.7722, + "eval_samples_per_second": 91.075, + "eval_steps_per_second": 5.696, + "step": 4000 + }, + { + "epoch": 1.17939309220106, + "grad_norm": 0.3421526551246643, + "learning_rate": 0.0005861923076923076, + "loss": 3.995, + "step": 4050 + }, + { + "epoch": 1.1939542198147826, + "grad_norm": 0.34738972783088684, + "learning_rate": 0.0005860174825174824, + "loss": 3.9821, + "step": 4100 + }, + { + "epoch": 1.2085153474285049, + "grad_norm": 0.33670172095298767, + "learning_rate": 0.0005858426573426573, + "loss": 3.9627, + "step": 4150 + }, + { + "epoch": 1.2230764750422272, + "grad_norm": 0.3570351004600525, + "learning_rate": 0.000585667832167832, + "loss": 3.9683, + "step": 4200 + }, + { + "epoch": 1.2376376026559497, + "grad_norm": 0.34402894973754883, + "learning_rate": 0.000585493006993007, + "loss": 3.9625, + "step": 4250 + }, + { + "epoch": 1.2521987302696722, + "grad_norm": 0.3514033555984497, + "learning_rate": 0.0005853181818181817, + "loss": 3.9698, + "step": 4300 + }, + { + "epoch": 1.2667598578833945, + "grad_norm": 0.34714460372924805, + "learning_rate": 0.0005851433566433565, + "loss": 3.9567, + "step": 4350 + }, + { + "epoch": 1.2813209854971168, + "grad_norm": 0.34446805715560913, + "learning_rate": 0.0005849685314685315, + "loss": 3.9564, + "step": 4400 + }, + { + "epoch": 1.2958821131108393, + "grad_norm": 0.3477458655834198, + "learning_rate": 0.0005847937062937063, + "loss": 3.9627, + "step": 4450 + }, + { + "epoch": 1.3104432407245616, + "grad_norm": 0.32496753334999084, + "learning_rate": 0.0005846188811188811, + "loss": 3.9557, + "step": 4500 + }, + { + "epoch": 1.3250043683382842, + "grad_norm": 0.37617602944374084, + "learning_rate": 0.0005844440559440559, + "loss": 3.9518, + "step": 4550 + }, + { + "epoch": 1.3395654959520065, + "grad_norm": 0.35174936056137085, + "learning_rate": 0.0005842692307692308, + "loss": 3.9558, + "step": 4600 + }, + { + "epoch": 1.354126623565729, + "grad_norm": 0.36389508843421936, + "learning_rate": 0.0005840944055944056, + "loss": 3.9488, + "step": 4650 + }, + { + "epoch": 1.3686877511794513, + "grad_norm": 0.3404442071914673, + "learning_rate": 0.0005839195804195804, + "loss": 3.9436, + "step": 4700 + }, + { + "epoch": 1.3832488787931738, + "grad_norm": 0.3432350158691406, + "learning_rate": 0.0005837447552447552, + "loss": 3.934, + "step": 4750 + }, + { + "epoch": 1.3978100064068961, + "grad_norm": 0.3505391776561737, + "learning_rate": 0.0005835699300699301, + "loss": 3.9345, + "step": 4800 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 0.3216480016708374, + "learning_rate": 0.0005833951048951048, + "loss": 3.9266, + "step": 4850 + }, + { + "epoch": 1.426932261634341, + "grad_norm": 0.3275972902774811, + "learning_rate": 0.0005832202797202797, + "loss": 3.9393, + "step": 4900 + }, + { + "epoch": 1.4414933892480635, + "grad_norm": 0.35102906823158264, + "learning_rate": 0.0005830454545454546, + "loss": 3.9359, + "step": 4950 + }, + { + "epoch": 1.4560545168617858, + "grad_norm": 0.3343009650707245, + "learning_rate": 0.0005828706293706293, + "loss": 3.9236, + "step": 5000 + }, + { + "epoch": 1.4560545168617858, + "eval_accuracy": 0.33192285635753893, + "eval_loss": 3.9100613594055176, + "eval_runtime": 184.3964, + "eval_samples_per_second": 90.273, + "eval_steps_per_second": 5.645, + "step": 5000 + }, + { + "epoch": 1.470615644475508, + "grad_norm": 0.32075586915016174, + "learning_rate": 0.0005826958041958042, + "loss": 3.923, + "step": 5050 + }, + { + "epoch": 1.4851767720892306, + "grad_norm": 0.3327905237674713, + "learning_rate": 0.000582520979020979, + "loss": 3.9322, + "step": 5100 + }, + { + "epoch": 1.4997378997029531, + "grad_norm": 0.34814631938934326, + "learning_rate": 0.0005823461538461538, + "loss": 3.9174, + "step": 5150 + }, + { + "epoch": 1.5142990273166754, + "grad_norm": 0.3348972797393799, + "learning_rate": 0.0005821713286713286, + "loss": 3.9043, + "step": 5200 + }, + { + "epoch": 1.5288601549303977, + "grad_norm": 0.3347136676311493, + "learning_rate": 0.0005819965034965035, + "loss": 3.9046, + "step": 5250 + }, + { + "epoch": 1.5434212825441203, + "grad_norm": 0.336887001991272, + "learning_rate": 0.0005818216783216783, + "loss": 3.915, + "step": 5300 + }, + { + "epoch": 1.5579824101578428, + "grad_norm": 0.3282018005847931, + "learning_rate": 0.0005816468531468531, + "loss": 3.9147, + "step": 5350 + }, + { + "epoch": 1.572543537771565, + "grad_norm": 0.34454506635665894, + "learning_rate": 0.0005814720279720279, + "loss": 3.9073, + "step": 5400 + }, + { + "epoch": 1.5871046653852874, + "grad_norm": 0.3495652377605438, + "learning_rate": 0.0005812972027972028, + "loss": 3.8947, + "step": 5450 + }, + { + "epoch": 1.6016657929990097, + "grad_norm": 0.33489686250686646, + "learning_rate": 0.0005811223776223776, + "loss": 3.9016, + "step": 5500 + }, + { + "epoch": 1.6162269206127322, + "grad_norm": 0.3139170706272125, + "learning_rate": 0.0005809475524475524, + "loss": 3.8813, + "step": 5550 + }, + { + "epoch": 1.6307880482264547, + "grad_norm": 0.3473421335220337, + "learning_rate": 0.0005807727272727272, + "loss": 3.8912, + "step": 5600 + }, + { + "epoch": 1.645349175840177, + "grad_norm": 0.3421488106250763, + "learning_rate": 0.0005805979020979021, + "loss": 3.8781, + "step": 5650 + }, + { + "epoch": 1.6599103034538993, + "grad_norm": 0.3326078951358795, + "learning_rate": 0.0005804230769230769, + "loss": 3.8902, + "step": 5700 + }, + { + "epoch": 1.6744714310676219, + "grad_norm": 0.34903430938720703, + "learning_rate": 0.0005802482517482517, + "loss": 3.8875, + "step": 5750 + }, + { + "epoch": 1.6890325586813444, + "grad_norm": 0.337380051612854, + "learning_rate": 0.0005800734265734265, + "loss": 3.8923, + "step": 5800 + }, + { + "epoch": 1.7035936862950667, + "grad_norm": 0.36713698506355286, + "learning_rate": 0.0005798986013986013, + "loss": 3.8702, + "step": 5850 + }, + { + "epoch": 1.718154813908789, + "grad_norm": 0.34020107984542847, + "learning_rate": 0.0005797237762237762, + "loss": 3.8835, + "step": 5900 + }, + { + "epoch": 1.7327159415225115, + "grad_norm": 0.325308233499527, + "learning_rate": 0.000579548951048951, + "loss": 3.8765, + "step": 5950 + }, + { + "epoch": 1.747277069136234, + "grad_norm": 0.3312442898750305, + "learning_rate": 0.0005793741258741258, + "loss": 3.8783, + "step": 6000 + }, + { + "epoch": 1.747277069136234, + "eval_accuracy": 0.3375568091909506, + "eval_loss": 3.8538968563079834, + "eval_runtime": 182.6837, + "eval_samples_per_second": 91.119, + "eval_steps_per_second": 5.698, + "step": 6000 + }, + { + "epoch": 1.7618381967499563, + "grad_norm": 0.3400241732597351, + "learning_rate": 0.0005791993006993006, + "loss": 3.8657, + "step": 6050 + }, + { + "epoch": 1.7763993243636786, + "grad_norm": 0.3185133934020996, + "learning_rate": 0.0005790244755244755, + "loss": 3.8761, + "step": 6100 + }, + { + "epoch": 1.7909604519774012, + "grad_norm": 0.3425745964050293, + "learning_rate": 0.0005788496503496503, + "loss": 3.8653, + "step": 6150 + }, + { + "epoch": 1.8055215795911237, + "grad_norm": 0.31458574533462524, + "learning_rate": 0.0005786748251748251, + "loss": 3.8693, + "step": 6200 + }, + { + "epoch": 1.820082707204846, + "grad_norm": 0.33563944697380066, + "learning_rate": 0.0005784999999999999, + "loss": 3.8654, + "step": 6250 + }, + { + "epoch": 1.8346438348185683, + "grad_norm": 0.3224489092826843, + "learning_rate": 0.0005783251748251748, + "loss": 3.8461, + "step": 6300 + }, + { + "epoch": 1.8492049624322906, + "grad_norm": 0.3258609175682068, + "learning_rate": 0.0005781503496503496, + "loss": 3.857, + "step": 6350 + }, + { + "epoch": 1.8637660900460131, + "grad_norm": 0.31683316826820374, + "learning_rate": 0.0005779755244755244, + "loss": 3.8613, + "step": 6400 + }, + { + "epoch": 1.8783272176597356, + "grad_norm": 0.3589027523994446, + "learning_rate": 0.0005778006993006993, + "loss": 3.8568, + "step": 6450 + }, + { + "epoch": 1.892888345273458, + "grad_norm": 0.3233237862586975, + "learning_rate": 0.000577625874125874, + "loss": 3.8619, + "step": 6500 + }, + { + "epoch": 1.9074494728871803, + "grad_norm": 0.3178718388080597, + "learning_rate": 0.0005774510489510489, + "loss": 3.8528, + "step": 6550 + }, + { + "epoch": 1.9220106005009028, + "grad_norm": 0.3323863446712494, + "learning_rate": 0.0005772762237762237, + "loss": 3.8432, + "step": 6600 + }, + { + "epoch": 1.9365717281146253, + "grad_norm": 0.3357718586921692, + "learning_rate": 0.0005771013986013985, + "loss": 3.8408, + "step": 6650 + }, + { + "epoch": 1.9511328557283476, + "grad_norm": 0.3276062607765198, + "learning_rate": 0.0005769265734265733, + "loss": 3.8554, + "step": 6700 + }, + { + "epoch": 1.96569398334207, + "grad_norm": 0.35431572794914246, + "learning_rate": 0.0005767517482517482, + "loss": 3.8394, + "step": 6750 + }, + { + "epoch": 1.9802551109557924, + "grad_norm": 0.3411652147769928, + "learning_rate": 0.000576576923076923, + "loss": 3.8523, + "step": 6800 + }, + { + "epoch": 1.994816238569515, + "grad_norm": 0.32246044278144836, + "learning_rate": 0.0005764020979020978, + "loss": 3.8446, + "step": 6850 + }, + { + "epoch": 2.009319121672782, + "grad_norm": 0.3211682438850403, + "learning_rate": 0.0005762272727272726, + "loss": 3.7718, + "step": 6900 + }, + { + "epoch": 2.023880249286505, + "grad_norm": 0.327195405960083, + "learning_rate": 0.0005760524475524475, + "loss": 3.7531, + "step": 6950 + }, + { + "epoch": 2.038441376900227, + "grad_norm": 0.327593058347702, + "learning_rate": 0.0005758776223776223, + "loss": 3.7409, + "step": 7000 + }, + { + "epoch": 2.038441376900227, + "eval_accuracy": 0.3418543103036807, + "eval_loss": 3.807173013687134, + "eval_runtime": 182.7201, + "eval_samples_per_second": 91.101, + "eval_steps_per_second": 5.697, + "step": 7000 + }, + { + "epoch": 2.0530025045139495, + "grad_norm": 0.31732338666915894, + "learning_rate": 0.0005757027972027971, + "loss": 3.7579, + "step": 7050 + }, + { + "epoch": 2.067563632127672, + "grad_norm": 0.31844356656074524, + "learning_rate": 0.000575527972027972, + "loss": 3.7387, + "step": 7100 + }, + { + "epoch": 2.0821247597413945, + "grad_norm": 0.3230411410331726, + "learning_rate": 0.0005753531468531468, + "loss": 3.751, + "step": 7150 + }, + { + "epoch": 2.096685887355117, + "grad_norm": 0.32469573616981506, + "learning_rate": 0.0005751783216783216, + "loss": 3.7505, + "step": 7200 + }, + { + "epoch": 2.111247014968839, + "grad_norm": 0.32758161425590515, + "learning_rate": 0.0005750034965034964, + "loss": 3.749, + "step": 7250 + }, + { + "epoch": 2.1258081425825615, + "grad_norm": 0.32330748438835144, + "learning_rate": 0.0005748286713286712, + "loss": 3.7479, + "step": 7300 + }, + { + "epoch": 2.140369270196284, + "grad_norm": 0.3409372568130493, + "learning_rate": 0.000574653846153846, + "loss": 3.7407, + "step": 7350 + }, + { + "epoch": 2.1549303978100065, + "grad_norm": 0.3466344177722931, + "learning_rate": 0.000574479020979021, + "loss": 3.7507, + "step": 7400 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 0.3374614119529724, + "learning_rate": 0.0005743041958041958, + "loss": 3.752, + "step": 7450 + }, + { + "epoch": 2.184052653037451, + "grad_norm": 0.31741416454315186, + "learning_rate": 0.0005741293706293706, + "loss": 3.7516, + "step": 7500 + }, + { + "epoch": 2.198613780651174, + "grad_norm": 0.3337715268135071, + "learning_rate": 0.0005739545454545454, + "loss": 3.7466, + "step": 7550 + }, + { + "epoch": 2.213174908264896, + "grad_norm": 0.3211740255355835, + "learning_rate": 0.0005737797202797203, + "loss": 3.7529, + "step": 7600 + }, + { + "epoch": 2.2277360358786185, + "grad_norm": 0.3633030652999878, + "learning_rate": 0.0005736048951048951, + "loss": 3.7411, + "step": 7650 + }, + { + "epoch": 2.2422971634923408, + "grad_norm": 0.3291577100753784, + "learning_rate": 0.0005734300699300699, + "loss": 3.7522, + "step": 7700 + }, + { + "epoch": 2.256858291106063, + "grad_norm": 0.32250499725341797, + "learning_rate": 0.0005732552447552448, + "loss": 3.7576, + "step": 7750 + }, + { + "epoch": 2.271419418719786, + "grad_norm": 0.30450233817100525, + "learning_rate": 0.0005730804195804196, + "loss": 3.7453, + "step": 7800 + }, + { + "epoch": 2.285980546333508, + "grad_norm": 0.30964726209640503, + "learning_rate": 0.0005729055944055944, + "loss": 3.755, + "step": 7850 + }, + { + "epoch": 2.3005416739472304, + "grad_norm": 0.31292903423309326, + "learning_rate": 0.0005727307692307692, + "loss": 3.7539, + "step": 7900 + }, + { + "epoch": 2.3151028015609527, + "grad_norm": 0.3473982512950897, + "learning_rate": 0.0005725559440559441, + "loss": 3.7509, + "step": 7950 + }, + { + "epoch": 2.3296639291746755, + "grad_norm": 0.35157695412635803, + "learning_rate": 0.0005723811188811188, + "loss": 3.7556, + "step": 8000 + }, + { + "epoch": 2.3296639291746755, + "eval_accuracy": 0.3448868377610154, + "eval_loss": 3.778634786605835, + "eval_runtime": 184.554, + "eval_samples_per_second": 90.196, + "eval_steps_per_second": 5.641, + "step": 8000 + }, + { + "epoch": 2.3442250567883978, + "grad_norm": 0.3254748582839966, + "learning_rate": 0.0005722062937062937, + "loss": 3.7366, + "step": 8050 + }, + { + "epoch": 2.35878618440212, + "grad_norm": 0.33637189865112305, + "learning_rate": 0.0005720314685314685, + "loss": 3.749, + "step": 8100 + }, + { + "epoch": 2.3733473120158424, + "grad_norm": 0.3259914517402649, + "learning_rate": 0.0005718566433566433, + "loss": 3.7439, + "step": 8150 + }, + { + "epoch": 2.387908439629565, + "grad_norm": 0.31481459736824036, + "learning_rate": 0.0005716818181818181, + "loss": 3.7376, + "step": 8200 + }, + { + "epoch": 2.4024695672432874, + "grad_norm": 0.3136726915836334, + "learning_rate": 0.000571506993006993, + "loss": 3.7284, + "step": 8250 + }, + { + "epoch": 2.4170306948570097, + "grad_norm": 0.32896652817726135, + "learning_rate": 0.0005713321678321678, + "loss": 3.7478, + "step": 8300 + }, + { + "epoch": 2.431591822470732, + "grad_norm": 0.3367721140384674, + "learning_rate": 0.0005711573426573426, + "loss": 3.7418, + "step": 8350 + }, + { + "epoch": 2.4461529500844543, + "grad_norm": 0.32438716292381287, + "learning_rate": 0.0005709825174825175, + "loss": 3.752, + "step": 8400 + }, + { + "epoch": 2.460714077698177, + "grad_norm": 0.3197824954986572, + "learning_rate": 0.0005708076923076923, + "loss": 3.7425, + "step": 8450 + }, + { + "epoch": 2.4752752053118994, + "grad_norm": 0.3408988416194916, + "learning_rate": 0.0005706328671328671, + "loss": 3.7424, + "step": 8500 + }, + { + "epoch": 2.4898363329256217, + "grad_norm": 0.3419415354728699, + "learning_rate": 0.0005704580419580419, + "loss": 3.7546, + "step": 8550 + }, + { + "epoch": 2.5043974605393444, + "grad_norm": 0.3166530132293701, + "learning_rate": 0.0005702832167832168, + "loss": 3.7339, + "step": 8600 + }, + { + "epoch": 2.5189585881530667, + "grad_norm": 0.3352281451225281, + "learning_rate": 0.0005701083916083916, + "loss": 3.7278, + "step": 8650 + }, + { + "epoch": 2.533519715766789, + "grad_norm": 0.31808385252952576, + "learning_rate": 0.0005699335664335664, + "loss": 3.7421, + "step": 8700 + }, + { + "epoch": 2.5480808433805113, + "grad_norm": 0.3182724714279175, + "learning_rate": 0.0005697587412587412, + "loss": 3.7366, + "step": 8750 + }, + { + "epoch": 2.5626419709942336, + "grad_norm": 0.3110598623752594, + "learning_rate": 0.000569583916083916, + "loss": 3.7345, + "step": 8800 + }, + { + "epoch": 2.5772030986079564, + "grad_norm": 0.3083634674549103, + "learning_rate": 0.0005694090909090908, + "loss": 3.7335, + "step": 8850 + }, + { + "epoch": 2.5917642262216787, + "grad_norm": 0.31325507164001465, + "learning_rate": 0.0005692342657342657, + "loss": 3.735, + "step": 8900 + }, + { + "epoch": 2.606325353835401, + "grad_norm": 0.325335830450058, + "learning_rate": 0.0005690594405594405, + "loss": 3.7352, + "step": 8950 + }, + { + "epoch": 2.6208864814491233, + "grad_norm": 0.3095828592777252, + "learning_rate": 0.0005688846153846153, + "loss": 3.7318, + "step": 9000 + }, + { + "epoch": 2.6208864814491233, + "eval_accuracy": 0.34765026441006025, + "eval_loss": 3.7488150596618652, + "eval_runtime": 181.151, + "eval_samples_per_second": 91.89, + "eval_steps_per_second": 5.747, + "step": 9000 + }, + { + "epoch": 2.6354476090628456, + "grad_norm": 0.32213926315307617, + "learning_rate": 0.0005687097902097901, + "loss": 3.7308, + "step": 9050 + }, + { + "epoch": 2.6500087366765683, + "grad_norm": 0.3269072473049164, + "learning_rate": 0.000568534965034965, + "loss": 3.7258, + "step": 9100 + }, + { + "epoch": 2.6645698642902906, + "grad_norm": 0.3074745833873749, + "learning_rate": 0.0005683601398601398, + "loss": 3.7307, + "step": 9150 + }, + { + "epoch": 2.679130991904013, + "grad_norm": 0.3062356412410736, + "learning_rate": 0.0005681853146853146, + "loss": 3.7292, + "step": 9200 + }, + { + "epoch": 2.6936921195177357, + "grad_norm": 0.3062867820262909, + "learning_rate": 0.0005680104895104895, + "loss": 3.7162, + "step": 9250 + }, + { + "epoch": 2.708253247131458, + "grad_norm": 0.3553263545036316, + "learning_rate": 0.0005678356643356643, + "loss": 3.7239, + "step": 9300 + }, + { + "epoch": 2.7228143747451803, + "grad_norm": 0.31543877720832825, + "learning_rate": 0.0005676608391608391, + "loss": 3.7271, + "step": 9350 + }, + { + "epoch": 2.7373755023589026, + "grad_norm": 0.3128660023212433, + "learning_rate": 0.0005674860139860139, + "loss": 3.7304, + "step": 9400 + }, + { + "epoch": 2.751936629972625, + "grad_norm": 0.31978827714920044, + "learning_rate": 0.0005673111888111888, + "loss": 3.7252, + "step": 9450 + }, + { + "epoch": 2.7664977575863476, + "grad_norm": 0.3351915776729584, + "learning_rate": 0.0005671363636363635, + "loss": 3.7215, + "step": 9500 + }, + { + "epoch": 2.78105888520007, + "grad_norm": 0.32556962966918945, + "learning_rate": 0.0005669615384615384, + "loss": 3.7109, + "step": 9550 + }, + { + "epoch": 2.7956200128137922, + "grad_norm": 0.30667567253112793, + "learning_rate": 0.0005667867132867132, + "loss": 3.7309, + "step": 9600 + }, + { + "epoch": 2.8101811404275145, + "grad_norm": 0.2958901524543762, + "learning_rate": 0.000566611888111888, + "loss": 3.7281, + "step": 9650 + }, + { + "epoch": 2.824742268041237, + "grad_norm": 0.3355960249900818, + "learning_rate": 0.0005664370629370628, + "loss": 3.7114, + "step": 9700 + }, + { + "epoch": 2.8393033956549596, + "grad_norm": 0.3073790967464447, + "learning_rate": 0.0005662622377622377, + "loss": 3.7258, + "step": 9750 + }, + { + "epoch": 2.853864523268682, + "grad_norm": 0.30925753712654114, + "learning_rate": 0.0005660874125874125, + "loss": 3.721, + "step": 9800 + }, + { + "epoch": 2.868425650882404, + "grad_norm": 0.3250259459018707, + "learning_rate": 0.0005659125874125873, + "loss": 3.7338, + "step": 9850 + }, + { + "epoch": 2.882986778496127, + "grad_norm": 0.3341335356235504, + "learning_rate": 0.0005657377622377622, + "loss": 3.7089, + "step": 9900 + }, + { + "epoch": 2.8975479061098492, + "grad_norm": 0.325226366519928, + "learning_rate": 0.000565562937062937, + "loss": 3.7195, + "step": 9950 + }, + { + "epoch": 2.9121090337235715, + "grad_norm": 0.32413652539253235, + "learning_rate": 0.0005653881118881118, + "loss": 3.7152, + "step": 10000 + }, + { + "epoch": 2.9121090337235715, + "eval_accuracy": 0.3496235527749125, + "eval_loss": 3.7240853309631348, + "eval_runtime": 180.9488, + "eval_samples_per_second": 91.993, + "eval_steps_per_second": 5.753, + "step": 10000 + }, + { + "epoch": 2.926670161337294, + "grad_norm": 0.3107230067253113, + "learning_rate": 0.0005652132867132866, + "loss": 3.7198, + "step": 10050 + }, + { + "epoch": 2.941231288951016, + "grad_norm": 0.3085416257381439, + "learning_rate": 0.0005650384615384615, + "loss": 3.7137, + "step": 10100 + }, + { + "epoch": 2.955792416564739, + "grad_norm": 0.3245227634906769, + "learning_rate": 0.0005648636363636363, + "loss": 3.7077, + "step": 10150 + }, + { + "epoch": 2.970353544178461, + "grad_norm": 0.31815105676651, + "learning_rate": 0.0005646888111888111, + "loss": 3.7133, + "step": 10200 + }, + { + "epoch": 2.9849146717921835, + "grad_norm": 0.3235960900783539, + "learning_rate": 0.000564513986013986, + "loss": 3.7025, + "step": 10250 + }, + { + "epoch": 2.9994757994059063, + "grad_norm": 0.34538596868515015, + "learning_rate": 0.0005643391608391607, + "loss": 3.7129, + "step": 10300 + }, + { + "epoch": 3.0139786825091734, + "grad_norm": 0.31785765290260315, + "learning_rate": 0.0005641643356643355, + "loss": 3.6054, + "step": 10350 + }, + { + "epoch": 3.0285398101228957, + "grad_norm": 0.3313532769680023, + "learning_rate": 0.0005639895104895105, + "loss": 3.6084, + "step": 10400 + }, + { + "epoch": 3.0431009377366185, + "grad_norm": 0.3206842839717865, + "learning_rate": 0.0005638146853146853, + "loss": 3.6094, + "step": 10450 + }, + { + "epoch": 3.057662065350341, + "grad_norm": 0.3367646336555481, + "learning_rate": 0.0005636398601398601, + "loss": 3.6184, + "step": 10500 + }, + { + "epoch": 3.072223192964063, + "grad_norm": 0.3168485164642334, + "learning_rate": 0.000563465034965035, + "loss": 3.6267, + "step": 10550 + }, + { + "epoch": 3.0867843205777854, + "grad_norm": 0.3227006196975708, + "learning_rate": 0.0005632902097902098, + "loss": 3.6165, + "step": 10600 + }, + { + "epoch": 3.101345448191508, + "grad_norm": 0.3266565799713135, + "learning_rate": 0.0005631153846153846, + "loss": 3.6185, + "step": 10650 + }, + { + "epoch": 3.1159065758052304, + "grad_norm": 0.3158946633338928, + "learning_rate": 0.0005629405594405594, + "loss": 3.6104, + "step": 10700 + }, + { + "epoch": 3.1304677034189528, + "grad_norm": 0.32722142338752747, + "learning_rate": 0.0005627657342657343, + "loss": 3.619, + "step": 10750 + }, + { + "epoch": 3.145028831032675, + "grad_norm": 0.30216503143310547, + "learning_rate": 0.0005625909090909091, + "loss": 3.629, + "step": 10800 + }, + { + "epoch": 3.1595899586463974, + "grad_norm": 0.3186177909374237, + "learning_rate": 0.0005624160839160839, + "loss": 3.6195, + "step": 10850 + }, + { + "epoch": 3.17415108626012, + "grad_norm": 0.3162807822227478, + "learning_rate": 0.0005622412587412587, + "loss": 3.6169, + "step": 10900 + }, + { + "epoch": 3.1887122138738424, + "grad_norm": 0.33053115010261536, + "learning_rate": 0.0005620664335664336, + "loss": 3.6338, + "step": 10950 + }, + { + "epoch": 3.2032733414875647, + "grad_norm": 0.30891671776771545, + "learning_rate": 0.0005618916083916083, + "loss": 3.6271, + "step": 11000 + }, + { + "epoch": 3.2032733414875647, + "eval_accuracy": 0.3519776264250645, + "eval_loss": 3.709869384765625, + "eval_runtime": 180.919, + "eval_samples_per_second": 92.008, + "eval_steps_per_second": 5.754, + "step": 11000 + }, + { + "epoch": 3.217834469101287, + "grad_norm": 0.33202117681503296, + "learning_rate": 0.0005617167832167832, + "loss": 3.6235, + "step": 11050 + }, + { + "epoch": 3.2323955967150098, + "grad_norm": 0.3139243423938751, + "learning_rate": 0.000561541958041958, + "loss": 3.6294, + "step": 11100 + }, + { + "epoch": 3.246956724328732, + "grad_norm": 0.3314415216445923, + "learning_rate": 0.0005613671328671328, + "loss": 3.6461, + "step": 11150 + }, + { + "epoch": 3.2615178519424544, + "grad_norm": 0.3135373592376709, + "learning_rate": 0.0005611923076923077, + "loss": 3.6442, + "step": 11200 + }, + { + "epoch": 3.2760789795561767, + "grad_norm": 0.32920390367507935, + "learning_rate": 0.0005610174825174825, + "loss": 3.6266, + "step": 11250 + }, + { + "epoch": 3.2906401071698994, + "grad_norm": 0.3141324520111084, + "learning_rate": 0.0005608426573426573, + "loss": 3.641, + "step": 11300 + }, + { + "epoch": 3.3052012347836217, + "grad_norm": 0.3209163546562195, + "learning_rate": 0.0005606678321678321, + "loss": 3.6214, + "step": 11350 + }, + { + "epoch": 3.319762362397344, + "grad_norm": 0.33475250005722046, + "learning_rate": 0.000560493006993007, + "loss": 3.6157, + "step": 11400 + }, + { + "epoch": 3.3343234900110663, + "grad_norm": 0.33062008023262024, + "learning_rate": 0.0005603181818181818, + "loss": 3.6323, + "step": 11450 + }, + { + "epoch": 3.3488846176247886, + "grad_norm": 0.32883119583129883, + "learning_rate": 0.0005601433566433566, + "loss": 3.6385, + "step": 11500 + }, + { + "epoch": 3.3634457452385114, + "grad_norm": 0.31883203983306885, + "learning_rate": 0.0005599685314685314, + "loss": 3.6394, + "step": 11550 + }, + { + "epoch": 3.3780068728522337, + "grad_norm": 0.30644893646240234, + "learning_rate": 0.0005597937062937063, + "loss": 3.6358, + "step": 11600 + }, + { + "epoch": 3.392568000465956, + "grad_norm": 0.30966854095458984, + "learning_rate": 0.0005596188811188811, + "loss": 3.6315, + "step": 11650 + }, + { + "epoch": 3.4071291280796787, + "grad_norm": 0.33119064569473267, + "learning_rate": 0.0005594440559440559, + "loss": 3.6342, + "step": 11700 + }, + { + "epoch": 3.421690255693401, + "grad_norm": 0.340206116437912, + "learning_rate": 0.0005592692307692307, + "loss": 3.6359, + "step": 11750 + }, + { + "epoch": 3.4362513833071233, + "grad_norm": 0.3317243754863739, + "learning_rate": 0.0005590944055944055, + "loss": 3.6418, + "step": 11800 + }, + { + "epoch": 3.4508125109208456, + "grad_norm": 0.35198959708213806, + "learning_rate": 0.0005589195804195803, + "loss": 3.6433, + "step": 11850 + }, + { + "epoch": 3.465373638534568, + "grad_norm": 0.3379652202129364, + "learning_rate": 0.0005587447552447552, + "loss": 3.63, + "step": 11900 + }, + { + "epoch": 3.4799347661482907, + "grad_norm": 0.3148704171180725, + "learning_rate": 0.00055856993006993, + "loss": 3.6433, + "step": 11950 + }, + { + "epoch": 3.494495893762013, + "grad_norm": 0.32295045256614685, + "learning_rate": 0.0005583951048951048, + "loss": 3.637, + "step": 12000 + }, + { + "epoch": 3.494495893762013, + "eval_accuracy": 0.3538141894775353, + "eval_loss": 3.6922500133514404, + "eval_runtime": 181.0231, + "eval_samples_per_second": 91.955, + "eval_steps_per_second": 5.751, + "step": 12000 + }, + { + "epoch": 3.5090570213757353, + "grad_norm": 0.33041051030158997, + "learning_rate": 0.0005582202797202797, + "loss": 3.6393, + "step": 12050 + }, + { + "epoch": 3.523618148989458, + "grad_norm": 0.30944448709487915, + "learning_rate": 0.0005580454545454545, + "loss": 3.636, + "step": 12100 + }, + { + "epoch": 3.53817927660318, + "grad_norm": 0.3318426311016083, + "learning_rate": 0.0005578706293706293, + "loss": 3.6265, + "step": 12150 + }, + { + "epoch": 3.5527404042169026, + "grad_norm": 0.31078043580055237, + "learning_rate": 0.0005576958041958041, + "loss": 3.6335, + "step": 12200 + }, + { + "epoch": 3.567301531830625, + "grad_norm": 0.31694719195365906, + "learning_rate": 0.000557520979020979, + "loss": 3.6387, + "step": 12250 + }, + { + "epoch": 3.5818626594443472, + "grad_norm": 0.32518377900123596, + "learning_rate": 0.0005573461538461538, + "loss": 3.6292, + "step": 12300 + }, + { + "epoch": 3.59642378705807, + "grad_norm": 0.33264607191085815, + "learning_rate": 0.0005571713286713286, + "loss": 3.6424, + "step": 12350 + }, + { + "epoch": 3.6109849146717923, + "grad_norm": 0.31201255321502686, + "learning_rate": 0.0005569965034965034, + "loss": 3.6499, + "step": 12400 + }, + { + "epoch": 3.6255460422855146, + "grad_norm": 0.32355648279190063, + "learning_rate": 0.0005568216783216783, + "loss": 3.6397, + "step": 12450 + }, + { + "epoch": 3.640107169899237, + "grad_norm": 0.3250090479850769, + "learning_rate": 0.000556646853146853, + "loss": 3.6408, + "step": 12500 + }, + { + "epoch": 3.654668297512959, + "grad_norm": 0.3053436577320099, + "learning_rate": 0.0005564720279720279, + "loss": 3.6246, + "step": 12550 + }, + { + "epoch": 3.669229425126682, + "grad_norm": 0.3021223545074463, + "learning_rate": 0.0005562972027972027, + "loss": 3.6422, + "step": 12600 + }, + { + "epoch": 3.6837905527404042, + "grad_norm": 0.30464479327201843, + "learning_rate": 0.0005561223776223775, + "loss": 3.6258, + "step": 12650 + }, + { + "epoch": 3.6983516803541265, + "grad_norm": 0.32816681265830994, + "learning_rate": 0.0005559475524475524, + "loss": 3.6374, + "step": 12700 + }, + { + "epoch": 3.7129128079678493, + "grad_norm": 0.3056129217147827, + "learning_rate": 0.0005557727272727272, + "loss": 3.6422, + "step": 12750 + }, + { + "epoch": 3.7274739355815716, + "grad_norm": 0.3144519031047821, + "learning_rate": 0.000555597902097902, + "loss": 3.6329, + "step": 12800 + }, + { + "epoch": 3.742035063195294, + "grad_norm": 0.3042009472846985, + "learning_rate": 0.0005554230769230768, + "loss": 3.6296, + "step": 12850 + }, + { + "epoch": 3.756596190809016, + "grad_norm": 0.3230903446674347, + "learning_rate": 0.0005552482517482517, + "loss": 3.6342, + "step": 12900 + }, + { + "epoch": 3.7711573184227385, + "grad_norm": 0.30217060446739197, + "learning_rate": 0.0005550734265734265, + "loss": 3.638, + "step": 12950 + }, + { + "epoch": 3.7857184460364612, + "grad_norm": 0.3292492628097534, + "learning_rate": 0.0005548986013986013, + "loss": 3.6368, + "step": 13000 + }, + { + "epoch": 3.7857184460364612, + "eval_accuracy": 0.35537424527745126, + "eval_loss": 3.6732499599456787, + "eval_runtime": 181.1768, + "eval_samples_per_second": 91.877, + "eval_steps_per_second": 5.746, + "step": 13000 + }, + { + "epoch": 3.8002795736501835, + "grad_norm": 0.3120541274547577, + "learning_rate": 0.0005547237762237761, + "loss": 3.6445, + "step": 13050 + }, + { + "epoch": 3.814840701263906, + "grad_norm": 0.3175022006034851, + "learning_rate": 0.000554548951048951, + "loss": 3.6454, + "step": 13100 + }, + { + "epoch": 3.829401828877628, + "grad_norm": 0.3213273286819458, + "learning_rate": 0.0005543741258741258, + "loss": 3.6236, + "step": 13150 + }, + { + "epoch": 3.8439629564913504, + "grad_norm": 0.3022514283657074, + "learning_rate": 0.0005541993006993006, + "loss": 3.6383, + "step": 13200 + }, + { + "epoch": 3.858524084105073, + "grad_norm": 0.3136868476867676, + "learning_rate": 0.0005540244755244756, + "loss": 3.6342, + "step": 13250 + }, + { + "epoch": 3.8730852117187955, + "grad_norm": 0.3210989236831665, + "learning_rate": 0.0005538496503496502, + "loss": 3.636, + "step": 13300 + }, + { + "epoch": 3.887646339332518, + "grad_norm": 0.31793737411499023, + "learning_rate": 0.0005536748251748252, + "loss": 3.6384, + "step": 13350 + }, + { + "epoch": 3.9022074669462405, + "grad_norm": 0.3172999620437622, + "learning_rate": 0.0005535, + "loss": 3.6322, + "step": 13400 + }, + { + "epoch": 3.916768594559963, + "grad_norm": 0.3040863573551178, + "learning_rate": 0.0005533251748251748, + "loss": 3.6361, + "step": 13450 + }, + { + "epoch": 3.931329722173685, + "grad_norm": 0.2977621555328369, + "learning_rate": 0.0005531503496503496, + "loss": 3.6293, + "step": 13500 + }, + { + "epoch": 3.9458908497874075, + "grad_norm": 0.32015374302864075, + "learning_rate": 0.0005529755244755245, + "loss": 3.6437, + "step": 13550 + }, + { + "epoch": 3.9604519774011298, + "grad_norm": 0.30774080753326416, + "learning_rate": 0.0005528006993006993, + "loss": 3.6305, + "step": 13600 + }, + { + "epoch": 3.9750131050148525, + "grad_norm": 0.32130375504493713, + "learning_rate": 0.0005526258741258741, + "loss": 3.6176, + "step": 13650 + }, + { + "epoch": 3.989574232628575, + "grad_norm": 0.3188425600528717, + "learning_rate": 0.0005524510489510489, + "loss": 3.6412, + "step": 13700 + }, + { + "epoch": 4.004077115731842, + "grad_norm": 0.31326737999916077, + "learning_rate": 0.0005522762237762238, + "loss": 3.599, + "step": 13750 + }, + { + "epoch": 4.018638243345564, + "grad_norm": 0.30300453305244446, + "learning_rate": 0.0005521013986013986, + "loss": 3.5165, + "step": 13800 + }, + { + "epoch": 4.033199370959287, + "grad_norm": 0.3266518712043762, + "learning_rate": 0.0005519265734265734, + "loss": 3.5381, + "step": 13850 + }, + { + "epoch": 4.04776049857301, + "grad_norm": 0.3203023076057434, + "learning_rate": 0.0005517517482517482, + "loss": 3.5187, + "step": 13900 + }, + { + "epoch": 4.062321626186732, + "grad_norm": 0.3111482262611389, + "learning_rate": 0.0005515769230769231, + "loss": 3.5362, + "step": 13950 + }, + { + "epoch": 4.076882753800454, + "grad_norm": 0.34344446659088135, + "learning_rate": 0.0005514020979020979, + "loss": 3.5417, + "step": 14000 + }, + { + "epoch": 4.076882753800454, + "eval_accuracy": 0.35658572794648924, + "eval_loss": 3.6685900688171387, + "eval_runtime": 180.9843, + "eval_samples_per_second": 91.975, + "eval_steps_per_second": 5.752, + "step": 14000 + }, + { + "epoch": 4.091443881414177, + "grad_norm": 0.3067654073238373, + "learning_rate": 0.0005512272727272727, + "loss": 3.539, + "step": 14050 + }, + { + "epoch": 4.106005009027899, + "grad_norm": 0.30828049778938293, + "learning_rate": 0.0005510524475524475, + "loss": 3.5469, + "step": 14100 + }, + { + "epoch": 4.120566136641622, + "grad_norm": 0.31522074341773987, + "learning_rate": 0.0005508776223776223, + "loss": 3.5362, + "step": 14150 + }, + { + "epoch": 4.135127264255344, + "grad_norm": 0.3391481637954712, + "learning_rate": 0.0005507027972027972, + "loss": 3.5471, + "step": 14200 + }, + { + "epoch": 4.149688391869066, + "grad_norm": 0.3163962662220001, + "learning_rate": 0.000550527972027972, + "loss": 3.5409, + "step": 14250 + }, + { + "epoch": 4.164249519482789, + "grad_norm": 0.3650486171245575, + "learning_rate": 0.0005503531468531468, + "loss": 3.5534, + "step": 14300 + }, + { + "epoch": 4.178810647096511, + "grad_norm": 0.3179774284362793, + "learning_rate": 0.0005501783216783216, + "loss": 3.5485, + "step": 14350 + }, + { + "epoch": 4.193371774710234, + "grad_norm": 0.3414445221424103, + "learning_rate": 0.0005500034965034965, + "loss": 3.5559, + "step": 14400 + }, + { + "epoch": 4.207932902323956, + "grad_norm": 0.3165196180343628, + "learning_rate": 0.0005498286713286713, + "loss": 3.5449, + "step": 14450 + }, + { + "epoch": 4.222494029937678, + "grad_norm": 0.2997719347476959, + "learning_rate": 0.0005496538461538461, + "loss": 3.5552, + "step": 14500 + }, + { + "epoch": 4.237055157551401, + "grad_norm": 0.3242793679237366, + "learning_rate": 0.0005494790209790209, + "loss": 3.5649, + "step": 14550 + }, + { + "epoch": 4.251616285165123, + "grad_norm": 0.3411242663860321, + "learning_rate": 0.0005493041958041958, + "loss": 3.5586, + "step": 14600 + }, + { + "epoch": 4.266177412778846, + "grad_norm": 0.3131914734840393, + "learning_rate": 0.0005491293706293706, + "loss": 3.5638, + "step": 14650 + }, + { + "epoch": 4.280738540392568, + "grad_norm": 0.3113696873188019, + "learning_rate": 0.0005489545454545454, + "loss": 3.5583, + "step": 14700 + }, + { + "epoch": 4.29529966800629, + "grad_norm": 0.3232826590538025, + "learning_rate": 0.0005487797202797203, + "loss": 3.5635, + "step": 14750 + }, + { + "epoch": 4.309860795620013, + "grad_norm": 0.3019048273563385, + "learning_rate": 0.000548604895104895, + "loss": 3.5583, + "step": 14800 + }, + { + "epoch": 4.324421923233735, + "grad_norm": 0.30543556809425354, + "learning_rate": 0.0005484300699300699, + "loss": 3.5636, + "step": 14850 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 0.327394038438797, + "learning_rate": 0.0005482552447552447, + "loss": 3.5602, + "step": 14900 + }, + { + "epoch": 4.35354417846118, + "grad_norm": 0.30762121081352234, + "learning_rate": 0.0005480804195804195, + "loss": 3.555, + "step": 14950 + }, + { + "epoch": 4.368105306074902, + "grad_norm": 0.3166114389896393, + "learning_rate": 0.0005479055944055943, + "loss": 3.5573, + "step": 15000 + }, + { + "epoch": 4.368105306074902, + "eval_accuracy": 0.3576027620629228, + "eval_loss": 3.6566529273986816, + "eval_runtime": 181.0498, + "eval_samples_per_second": 91.942, + "eval_steps_per_second": 5.75, + "step": 15000 + }, + { + "epoch": 4.382666433688625, + "grad_norm": 0.3189477026462555, + "learning_rate": 0.0005477307692307692, + "loss": 3.5526, + "step": 15050 + }, + { + "epoch": 4.397227561302348, + "grad_norm": 0.31075966358184814, + "learning_rate": 0.000547555944055944, + "loss": 3.5699, + "step": 15100 + }, + { + "epoch": 4.41178868891607, + "grad_norm": 0.33162787556648254, + "learning_rate": 0.0005473811188811188, + "loss": 3.573, + "step": 15150 + }, + { + "epoch": 4.426349816529792, + "grad_norm": 0.31193026900291443, + "learning_rate": 0.0005472062937062936, + "loss": 3.5564, + "step": 15200 + }, + { + "epoch": 4.440910944143514, + "grad_norm": 0.30927059054374695, + "learning_rate": 0.0005470314685314685, + "loss": 3.5524, + "step": 15250 + }, + { + "epoch": 4.455472071757237, + "grad_norm": 0.31786641478538513, + "learning_rate": 0.0005468566433566433, + "loss": 3.5712, + "step": 15300 + }, + { + "epoch": 4.47003319937096, + "grad_norm": 0.31433233618736267, + "learning_rate": 0.0005466818181818181, + "loss": 3.5586, + "step": 15350 + }, + { + "epoch": 4.4845943269846815, + "grad_norm": 0.3186981976032257, + "learning_rate": 0.000546506993006993, + "loss": 3.571, + "step": 15400 + }, + { + "epoch": 4.499155454598404, + "grad_norm": 0.31594040989875793, + "learning_rate": 0.0005463321678321678, + "loss": 3.5668, + "step": 15450 + }, + { + "epoch": 4.513716582212126, + "grad_norm": 0.3135336637496948, + "learning_rate": 0.0005461573426573426, + "loss": 3.5652, + "step": 15500 + }, + { + "epoch": 4.528277709825849, + "grad_norm": 0.37292537093162537, + "learning_rate": 0.0005459825174825174, + "loss": 3.5808, + "step": 15550 + }, + { + "epoch": 4.542838837439572, + "grad_norm": 0.3025301694869995, + "learning_rate": 0.0005458076923076922, + "loss": 3.5782, + "step": 15600 + }, + { + "epoch": 4.5573999650532935, + "grad_norm": 0.3217926025390625, + "learning_rate": 0.000545632867132867, + "loss": 3.567, + "step": 15650 + }, + { + "epoch": 4.571961092667016, + "grad_norm": 0.33000338077545166, + "learning_rate": 0.0005454580419580419, + "loss": 3.5761, + "step": 15700 + }, + { + "epoch": 4.586522220280738, + "grad_norm": 0.3380807340145111, + "learning_rate": 0.0005452832167832167, + "loss": 3.5729, + "step": 15750 + }, + { + "epoch": 4.601083347894461, + "grad_norm": 0.2971727252006531, + "learning_rate": 0.0005451083916083915, + "loss": 3.5666, + "step": 15800 + }, + { + "epoch": 4.615644475508184, + "grad_norm": 0.32622653245925903, + "learning_rate": 0.0005449335664335663, + "loss": 3.5744, + "step": 15850 + }, + { + "epoch": 4.630205603121905, + "grad_norm": 0.33031386137008667, + "learning_rate": 0.0005447587412587412, + "loss": 3.5836, + "step": 15900 + }, + { + "epoch": 4.644766730735628, + "grad_norm": 0.31544092297554016, + "learning_rate": 0.000544583916083916, + "loss": 3.58, + "step": 15950 + }, + { + "epoch": 4.659327858349351, + "grad_norm": 0.32821419835090637, + "learning_rate": 0.0005444090909090908, + "loss": 3.5738, + "step": 16000 + }, + { + "epoch": 4.659327858349351, + "eval_accuracy": 0.35873136309375875, + "eval_loss": 3.642806053161621, + "eval_runtime": 181.1419, + "eval_samples_per_second": 91.895, + "eval_steps_per_second": 5.747, + "step": 16000 + }, + { + "epoch": 4.673888985963073, + "grad_norm": 0.313347727060318, + "learning_rate": 0.0005442342657342657, + "loss": 3.5561, + "step": 16050 + }, + { + "epoch": 4.6884501135767955, + "grad_norm": 0.29170653223991394, + "learning_rate": 0.0005440594405594405, + "loss": 3.5704, + "step": 16100 + }, + { + "epoch": 4.703011241190518, + "grad_norm": 0.3108586370944977, + "learning_rate": 0.0005438846153846153, + "loss": 3.5616, + "step": 16150 + }, + { + "epoch": 4.71757236880424, + "grad_norm": 0.31426018476486206, + "learning_rate": 0.0005437097902097901, + "loss": 3.569, + "step": 16200 + }, + { + "epoch": 4.732133496417963, + "grad_norm": 0.3121700584888458, + "learning_rate": 0.0005435349650349651, + "loss": 3.5648, + "step": 16250 + }, + { + "epoch": 4.746694624031685, + "grad_norm": 0.3235621154308319, + "learning_rate": 0.0005433601398601397, + "loss": 3.5713, + "step": 16300 + }, + { + "epoch": 4.7612557516454075, + "grad_norm": 0.3033587336540222, + "learning_rate": 0.0005431853146853147, + "loss": 3.5571, + "step": 16350 + }, + { + "epoch": 4.77581687925913, + "grad_norm": 0.3489319086074829, + "learning_rate": 0.0005430104895104895, + "loss": 3.5774, + "step": 16400 + }, + { + "epoch": 4.790378006872852, + "grad_norm": 0.3031330108642578, + "learning_rate": 0.0005428356643356643, + "loss": 3.571, + "step": 16450 + }, + { + "epoch": 4.804939134486575, + "grad_norm": 0.31163015961647034, + "learning_rate": 0.0005426608391608391, + "loss": 3.5646, + "step": 16500 + }, + { + "epoch": 4.819500262100297, + "grad_norm": 0.32922112941741943, + "learning_rate": 0.000542486013986014, + "loss": 3.5692, + "step": 16550 + }, + { + "epoch": 4.834061389714019, + "grad_norm": 0.32564711570739746, + "learning_rate": 0.0005423111888111888, + "loss": 3.5759, + "step": 16600 + }, + { + "epoch": 4.848622517327742, + "grad_norm": 0.3296327590942383, + "learning_rate": 0.0005421363636363636, + "loss": 3.5785, + "step": 16650 + }, + { + "epoch": 4.863183644941464, + "grad_norm": 0.3310580253601074, + "learning_rate": 0.0005419615384615385, + "loss": 3.5685, + "step": 16700 + }, + { + "epoch": 4.877744772555187, + "grad_norm": 0.31274470686912537, + "learning_rate": 0.0005417867132867133, + "loss": 3.5587, + "step": 16750 + }, + { + "epoch": 4.892305900168909, + "grad_norm": 0.323668897151947, + "learning_rate": 0.0005416118881118881, + "loss": 3.5657, + "step": 16800 + }, + { + "epoch": 4.906867027782631, + "grad_norm": 0.3014122247695923, + "learning_rate": 0.0005414370629370629, + "loss": 3.5707, + "step": 16850 + }, + { + "epoch": 4.921428155396354, + "grad_norm": 0.32767361402511597, + "learning_rate": 0.0005412622377622378, + "loss": 3.5724, + "step": 16900 + }, + { + "epoch": 4.935989283010076, + "grad_norm": 0.3055601119995117, + "learning_rate": 0.0005410874125874126, + "loss": 3.56, + "step": 16950 + }, + { + "epoch": 4.950550410623799, + "grad_norm": 0.30220872163772583, + "learning_rate": 0.0005409125874125874, + "loss": 3.585, + "step": 17000 + }, + { + "epoch": 4.950550410623799, + "eval_accuracy": 0.360180674917524, + "eval_loss": 3.6294431686401367, + "eval_runtime": 181.1546, + "eval_samples_per_second": 91.888, + "eval_steps_per_second": 5.746, + "step": 17000 + }, + { + "epoch": 4.9651115382375215, + "grad_norm": 0.34501272439956665, + "learning_rate": 0.0005407377622377622, + "loss": 3.5698, + "step": 17050 + }, + { + "epoch": 4.979672665851243, + "grad_norm": 0.31808799505233765, + "learning_rate": 0.000540562937062937, + "loss": 3.5684, + "step": 17100 + }, + { + "epoch": 4.994233793464966, + "grad_norm": 0.30529871582984924, + "learning_rate": 0.0005403881118881118, + "loss": 3.5667, + "step": 17150 + }, + { + "epoch": 5.008736676568233, + "grad_norm": 0.32560649514198303, + "learning_rate": 0.0005402132867132867, + "loss": 3.5124, + "step": 17200 + }, + { + "epoch": 5.023297804181956, + "grad_norm": 0.33668822050094604, + "learning_rate": 0.0005400384615384615, + "loss": 3.448, + "step": 17250 + }, + { + "epoch": 5.037858931795678, + "grad_norm": 0.3430330753326416, + "learning_rate": 0.0005398636363636363, + "loss": 3.462, + "step": 17300 + }, + { + "epoch": 5.052420059409401, + "grad_norm": 0.32133740186691284, + "learning_rate": 0.0005396888111888111, + "loss": 3.4736, + "step": 17350 + }, + { + "epoch": 5.066981187023123, + "grad_norm": 0.30943241715431213, + "learning_rate": 0.000539513986013986, + "loss": 3.4657, + "step": 17400 + }, + { + "epoch": 5.081542314636845, + "grad_norm": 0.3143688142299652, + "learning_rate": 0.0005393391608391608, + "loss": 3.4726, + "step": 17450 + }, + { + "epoch": 5.096103442250568, + "grad_norm": 0.3386751711368561, + "learning_rate": 0.0005391643356643356, + "loss": 3.4804, + "step": 17500 + }, + { + "epoch": 5.110664569864291, + "grad_norm": 0.31570807099342346, + "learning_rate": 0.0005389895104895105, + "loss": 3.4793, + "step": 17550 + }, + { + "epoch": 5.125225697478013, + "grad_norm": 0.32351601123809814, + "learning_rate": 0.0005388146853146853, + "loss": 3.4856, + "step": 17600 + }, + { + "epoch": 5.139786825091735, + "grad_norm": 0.30925703048706055, + "learning_rate": 0.0005386398601398601, + "loss": 3.486, + "step": 17650 + }, + { + "epoch": 5.154347952705457, + "grad_norm": 0.31706124544143677, + "learning_rate": 0.0005384650349650349, + "loss": 3.4876, + "step": 17700 + }, + { + "epoch": 5.16890908031918, + "grad_norm": 0.3108503818511963, + "learning_rate": 0.0005382902097902098, + "loss": 3.4841, + "step": 17750 + }, + { + "epoch": 5.183470207932903, + "grad_norm": 0.3236485719680786, + "learning_rate": 0.0005381153846153845, + "loss": 3.4851, + "step": 17800 + }, + { + "epoch": 5.1980313355466246, + "grad_norm": 0.30849018692970276, + "learning_rate": 0.0005379405594405594, + "loss": 3.4838, + "step": 17850 + }, + { + "epoch": 5.212592463160347, + "grad_norm": 0.3113718032836914, + "learning_rate": 0.0005377657342657342, + "loss": 3.476, + "step": 17900 + }, + { + "epoch": 5.227153590774069, + "grad_norm": 0.35100990533828735, + "learning_rate": 0.000537590909090909, + "loss": 3.4976, + "step": 17950 + }, + { + "epoch": 5.241714718387792, + "grad_norm": 0.3076687455177307, + "learning_rate": 0.0005374160839160838, + "loss": 3.5044, + "step": 18000 + }, + { + "epoch": 5.241714718387792, + "eval_accuracy": 0.3602995307135839, + "eval_loss": 3.6342709064483643, + "eval_runtime": 181.2085, + "eval_samples_per_second": 91.861, + "eval_steps_per_second": 5.745, + "step": 18000 + }, + { + "epoch": 5.256275846001515, + "grad_norm": 0.3252745270729065, + "learning_rate": 0.0005372412587412587, + "loss": 3.5038, + "step": 18050 + }, + { + "epoch": 5.2708369736152365, + "grad_norm": 0.31030163168907166, + "learning_rate": 0.0005370664335664335, + "loss": 3.49, + "step": 18100 + }, + { + "epoch": 5.285398101228959, + "grad_norm": 0.36103177070617676, + "learning_rate": 0.0005368916083916083, + "loss": 3.492, + "step": 18150 + }, + { + "epoch": 5.299959228842681, + "grad_norm": 0.32504352927207947, + "learning_rate": 0.0005367167832167832, + "loss": 3.4948, + "step": 18200 + }, + { + "epoch": 5.314520356456404, + "grad_norm": 0.3331771790981293, + "learning_rate": 0.000536541958041958, + "loss": 3.5126, + "step": 18250 + }, + { + "epoch": 5.329081484070127, + "grad_norm": 0.31116366386413574, + "learning_rate": 0.0005363671328671328, + "loss": 3.5076, + "step": 18300 + }, + { + "epoch": 5.3436426116838485, + "grad_norm": 0.3158913850784302, + "learning_rate": 0.0005361923076923076, + "loss": 3.5178, + "step": 18350 + }, + { + "epoch": 5.358203739297571, + "grad_norm": 0.32401207089424133, + "learning_rate": 0.0005360174825174825, + "loss": 3.5149, + "step": 18400 + }, + { + "epoch": 5.372764866911294, + "grad_norm": 0.340380996465683, + "learning_rate": 0.0005358426573426573, + "loss": 3.5078, + "step": 18450 + }, + { + "epoch": 5.387325994525016, + "grad_norm": 0.31110548973083496, + "learning_rate": 0.0005356678321678321, + "loss": 3.5107, + "step": 18500 + }, + { + "epoch": 5.401887122138739, + "grad_norm": 0.32105040550231934, + "learning_rate": 0.0005354930069930069, + "loss": 3.4964, + "step": 18550 + }, + { + "epoch": 5.41644824975246, + "grad_norm": 0.3346503674983978, + "learning_rate": 0.0005353181818181817, + "loss": 3.5029, + "step": 18600 + }, + { + "epoch": 5.431009377366183, + "grad_norm": 0.3195594549179077, + "learning_rate": 0.0005351433566433565, + "loss": 3.5149, + "step": 18650 + }, + { + "epoch": 5.445570504979906, + "grad_norm": 0.3379792273044586, + "learning_rate": 0.0005349685314685314, + "loss": 3.5073, + "step": 18700 + }, + { + "epoch": 5.460131632593628, + "grad_norm": 0.3183291256427765, + "learning_rate": 0.0005347937062937062, + "loss": 3.5091, + "step": 18750 + }, + { + "epoch": 5.4746927602073505, + "grad_norm": 0.3273472785949707, + "learning_rate": 0.000534618881118881, + "loss": 3.5196, + "step": 18800 + }, + { + "epoch": 5.489253887821073, + "grad_norm": 0.3325451910495758, + "learning_rate": 0.0005344440559440559, + "loss": 3.5159, + "step": 18850 + }, + { + "epoch": 5.503815015434795, + "grad_norm": 0.33070364594459534, + "learning_rate": 0.0005342692307692307, + "loss": 3.518, + "step": 18900 + }, + { + "epoch": 5.518376143048518, + "grad_norm": 0.33015328645706177, + "learning_rate": 0.0005340944055944055, + "loss": 3.5238, + "step": 18950 + }, + { + "epoch": 5.53293727066224, + "grad_norm": 0.307849645614624, + "learning_rate": 0.0005339195804195803, + "loss": 3.5108, + "step": 19000 + }, + { + "epoch": 5.53293727066224, + "eval_accuracy": 0.36167172146690857, + "eval_loss": 3.62229061126709, + "eval_runtime": 181.6251, + "eval_samples_per_second": 91.65, + "eval_steps_per_second": 5.732, + "step": 19000 + }, + { + "epoch": 5.5474983982759625, + "grad_norm": 0.2991660237312317, + "learning_rate": 0.0005337447552447552, + "loss": 3.5153, + "step": 19050 + }, + { + "epoch": 5.562059525889685, + "grad_norm": 0.33114874362945557, + "learning_rate": 0.00053356993006993, + "loss": 3.5165, + "step": 19100 + }, + { + "epoch": 5.576620653503407, + "grad_norm": 0.3147285282611847, + "learning_rate": 0.0005333951048951048, + "loss": 3.5263, + "step": 19150 + }, + { + "epoch": 5.59118178111713, + "grad_norm": 0.32433706521987915, + "learning_rate": 0.0005332202797202796, + "loss": 3.5213, + "step": 19200 + }, + { + "epoch": 5.605742908730852, + "grad_norm": 0.386737197637558, + "learning_rate": 0.0005330454545454546, + "loss": 3.5211, + "step": 19250 + }, + { + "epoch": 5.620304036344574, + "grad_norm": 0.32082608342170715, + "learning_rate": 0.0005328706293706292, + "loss": 3.53, + "step": 19300 + }, + { + "epoch": 5.634865163958297, + "grad_norm": 0.2935941219329834, + "learning_rate": 0.0005326958041958042, + "loss": 3.5197, + "step": 19350 + }, + { + "epoch": 5.649426291572019, + "grad_norm": 0.31667831540107727, + "learning_rate": 0.000532520979020979, + "loss": 3.5095, + "step": 19400 + }, + { + "epoch": 5.663987419185742, + "grad_norm": 0.306999534368515, + "learning_rate": 0.0005323461538461538, + "loss": 3.5187, + "step": 19450 + }, + { + "epoch": 5.6785485467994645, + "grad_norm": 0.30406883358955383, + "learning_rate": 0.0005321713286713287, + "loss": 3.5199, + "step": 19500 + }, + { + "epoch": 5.693109674413186, + "grad_norm": 0.3210119903087616, + "learning_rate": 0.0005319965034965035, + "loss": 3.527, + "step": 19550 + }, + { + "epoch": 5.707670802026909, + "grad_norm": 0.33628612756729126, + "learning_rate": 0.0005318216783216783, + "loss": 3.5168, + "step": 19600 + }, + { + "epoch": 5.722231929640631, + "grad_norm": 0.3189808428287506, + "learning_rate": 0.0005316468531468531, + "loss": 3.5246, + "step": 19650 + }, + { + "epoch": 5.736793057254354, + "grad_norm": 0.3225274682044983, + "learning_rate": 0.000531472027972028, + "loss": 3.5168, + "step": 19700 + }, + { + "epoch": 5.7513541848680765, + "grad_norm": 0.32404860854148865, + "learning_rate": 0.0005312972027972028, + "loss": 3.5305, + "step": 19750 + }, + { + "epoch": 5.765915312481798, + "grad_norm": 0.33228975534439087, + "learning_rate": 0.0005311223776223776, + "loss": 3.5191, + "step": 19800 + }, + { + "epoch": 5.780476440095521, + "grad_norm": 0.3346024453639984, + "learning_rate": 0.0005309475524475524, + "loss": 3.5233, + "step": 19850 + }, + { + "epoch": 5.795037567709244, + "grad_norm": 0.30757418274879456, + "learning_rate": 0.0005307727272727273, + "loss": 3.5203, + "step": 19900 + }, + { + "epoch": 5.809598695322966, + "grad_norm": 0.33240988850593567, + "learning_rate": 0.0005305979020979021, + "loss": 3.5232, + "step": 19950 + }, + { + "epoch": 5.824159822936688, + "grad_norm": 0.3051111698150635, + "learning_rate": 0.0005304230769230769, + "loss": 3.5355, + "step": 20000 + }, + { + "epoch": 5.824159822936688, + "eval_accuracy": 0.3623371258246723, + "eval_loss": 3.6103367805480957, + "eval_runtime": 181.4963, + "eval_samples_per_second": 91.715, + "eval_steps_per_second": 5.736, + "step": 20000 + }, + { + "epoch": 5.83872095055041, + "grad_norm": 0.3242921233177185, + "learning_rate": 0.0005302482517482517, + "loss": 3.5295, + "step": 20050 + }, + { + "epoch": 5.853282078164133, + "grad_norm": 0.3176933526992798, + "learning_rate": 0.0005300734265734265, + "loss": 3.5191, + "step": 20100 + }, + { + "epoch": 5.867843205777856, + "grad_norm": 0.34109026193618774, + "learning_rate": 0.0005298986013986013, + "loss": 3.5357, + "step": 20150 + }, + { + "epoch": 5.882404333391578, + "grad_norm": 0.2950345575809479, + "learning_rate": 0.0005297237762237762, + "loss": 3.5309, + "step": 20200 + }, + { + "epoch": 5.8969654610053, + "grad_norm": 0.30492424964904785, + "learning_rate": 0.000529548951048951, + "loss": 3.5426, + "step": 20250 + }, + { + "epoch": 5.911526588619022, + "grad_norm": 0.3427123725414276, + "learning_rate": 0.0005293741258741258, + "loss": 3.5293, + "step": 20300 + }, + { + "epoch": 5.926087716232745, + "grad_norm": 0.30794087052345276, + "learning_rate": 0.0005291993006993007, + "loss": 3.5297, + "step": 20350 + }, + { + "epoch": 5.940648843846468, + "grad_norm": 0.30005186796188354, + "learning_rate": 0.0005290244755244755, + "loss": 3.5312, + "step": 20400 + }, + { + "epoch": 5.95520997146019, + "grad_norm": 0.3218655586242676, + "learning_rate": 0.0005288496503496503, + "loss": 3.5224, + "step": 20450 + }, + { + "epoch": 5.969771099073912, + "grad_norm": 0.3385027348995209, + "learning_rate": 0.0005286748251748251, + "loss": 3.5319, + "step": 20500 + }, + { + "epoch": 5.984332226687634, + "grad_norm": 0.33277666568756104, + "learning_rate": 0.0005285, + "loss": 3.5229, + "step": 20550 + }, + { + "epoch": 5.998893354301357, + "grad_norm": 0.2984437346458435, + "learning_rate": 0.0005283251748251748, + "loss": 3.5246, + "step": 20600 + }, + { + "epoch": 6.013396237404625, + "grad_norm": 0.3273489773273468, + "learning_rate": 0.0005281503496503496, + "loss": 3.4096, + "step": 20650 + }, + { + "epoch": 6.027957365018347, + "grad_norm": 0.32948926091194153, + "learning_rate": 0.0005279755244755244, + "loss": 3.4094, + "step": 20700 + }, + { + "epoch": 6.04251849263207, + "grad_norm": 0.3348131775856018, + "learning_rate": 0.0005278006993006993, + "loss": 3.4196, + "step": 20750 + }, + { + "epoch": 6.0570796202457915, + "grad_norm": 0.3126414716243744, + "learning_rate": 0.000527625874125874, + "loss": 3.4201, + "step": 20800 + }, + { + "epoch": 6.071640747859514, + "grad_norm": 0.33334097266197205, + "learning_rate": 0.0005274510489510489, + "loss": 3.4269, + "step": 20850 + }, + { + "epoch": 6.086201875473237, + "grad_norm": 0.32628950476646423, + "learning_rate": 0.0005272762237762238, + "loss": 3.432, + "step": 20900 + }, + { + "epoch": 6.100763003086959, + "grad_norm": 0.3498513996601105, + "learning_rate": 0.0005271013986013985, + "loss": 3.425, + "step": 20950 + }, + { + "epoch": 6.115324130700682, + "grad_norm": 0.29833361506462097, + "learning_rate": 0.0005269265734265734, + "loss": 3.4252, + "step": 21000 + }, + { + "epoch": 6.115324130700682, + "eval_accuracy": 0.36289260914453686, + "eval_loss": 3.613590717315674, + "eval_runtime": 181.366, + "eval_samples_per_second": 91.781, + "eval_steps_per_second": 5.74, + "step": 21000 + }, + { + "epoch": 6.1298852583144035, + "grad_norm": 0.3302992880344391, + "learning_rate": 0.0005267517482517482, + "loss": 3.4361, + "step": 21050 + }, + { + "epoch": 6.144446385928126, + "grad_norm": 0.3029468059539795, + "learning_rate": 0.000526576923076923, + "loss": 3.4468, + "step": 21100 + }, + { + "epoch": 6.159007513541849, + "grad_norm": 0.3242226243019104, + "learning_rate": 0.0005264020979020978, + "loss": 3.4352, + "step": 21150 + }, + { + "epoch": 6.173568641155571, + "grad_norm": 0.31961268186569214, + "learning_rate": 0.0005262272727272727, + "loss": 3.4358, + "step": 21200 + }, + { + "epoch": 6.1881297687692935, + "grad_norm": 0.33194205164909363, + "learning_rate": 0.0005260524475524475, + "loss": 3.4442, + "step": 21250 + }, + { + "epoch": 6.202690896383016, + "grad_norm": 0.32972991466522217, + "learning_rate": 0.0005258776223776223, + "loss": 3.4521, + "step": 21300 + }, + { + "epoch": 6.217252023996738, + "grad_norm": 0.31378456950187683, + "learning_rate": 0.0005257027972027971, + "loss": 3.4469, + "step": 21350 + }, + { + "epoch": 6.231813151610461, + "grad_norm": 0.3384927809238434, + "learning_rate": 0.000525527972027972, + "loss": 3.4621, + "step": 21400 + }, + { + "epoch": 6.246374279224183, + "grad_norm": 0.3361877202987671, + "learning_rate": 0.0005253531468531468, + "loss": 3.4574, + "step": 21450 + }, + { + "epoch": 6.2609354068379055, + "grad_norm": 0.3300999402999878, + "learning_rate": 0.0005251783216783216, + "loss": 3.4494, + "step": 21500 + }, + { + "epoch": 6.275496534451628, + "grad_norm": 0.31691470742225647, + "learning_rate": 0.0005250034965034965, + "loss": 3.4443, + "step": 21550 + }, + { + "epoch": 6.29005766206535, + "grad_norm": 0.30609896779060364, + "learning_rate": 0.0005248286713286712, + "loss": 3.465, + "step": 21600 + }, + { + "epoch": 6.304618789679073, + "grad_norm": 0.3142344653606415, + "learning_rate": 0.0005246538461538461, + "loss": 3.4489, + "step": 21650 + }, + { + "epoch": 6.319179917292795, + "grad_norm": 0.3219060003757477, + "learning_rate": 0.0005244790209790209, + "loss": 3.4663, + "step": 21700 + }, + { + "epoch": 6.3337410449065175, + "grad_norm": 0.3099410831928253, + "learning_rate": 0.0005243041958041957, + "loss": 3.472, + "step": 21750 + }, + { + "epoch": 6.34830217252024, + "grad_norm": 0.31502556800842285, + "learning_rate": 0.0005241293706293705, + "loss": 3.4804, + "step": 21800 + }, + { + "epoch": 6.362863300133962, + "grad_norm": 0.30814129114151, + "learning_rate": 0.0005239545454545454, + "loss": 3.4601, + "step": 21850 + }, + { + "epoch": 6.377424427747685, + "grad_norm": 0.3281274735927582, + "learning_rate": 0.0005237797202797202, + "loss": 3.4743, + "step": 21900 + }, + { + "epoch": 6.391985555361408, + "grad_norm": 0.33183300495147705, + "learning_rate": 0.000523604895104895, + "loss": 3.4548, + "step": 21950 + }, + { + "epoch": 6.406546682975129, + "grad_norm": 0.33608192205429077, + "learning_rate": 0.0005234300699300698, + "loss": 3.4717, + "step": 22000 + }, + { + "epoch": 6.406546682975129, + "eval_accuracy": 0.3633190087214996, + "eval_loss": 3.6065382957458496, + "eval_runtime": 181.0468, + "eval_samples_per_second": 91.943, + "eval_steps_per_second": 5.75, + "step": 22000 + }, + { + "epoch": 6.421107810588852, + "grad_norm": 0.3260321319103241, + "learning_rate": 0.0005232552447552447, + "loss": 3.4682, + "step": 22050 + }, + { + "epoch": 6.435668938202574, + "grad_norm": 0.33026236295700073, + "learning_rate": 0.0005230804195804195, + "loss": 3.4802, + "step": 22100 + }, + { + "epoch": 6.450230065816297, + "grad_norm": 0.30417993664741516, + "learning_rate": 0.0005229055944055943, + "loss": 3.476, + "step": 22150 + }, + { + "epoch": 6.4647911934300195, + "grad_norm": 0.3218960464000702, + "learning_rate": 0.0005227307692307691, + "loss": 3.4754, + "step": 22200 + }, + { + "epoch": 6.479352321043741, + "grad_norm": 0.31791952252388, + "learning_rate": 0.0005225559440559441, + "loss": 3.4712, + "step": 22250 + }, + { + "epoch": 6.493913448657464, + "grad_norm": 0.31013575196266174, + "learning_rate": 0.0005223811188811189, + "loss": 3.4725, + "step": 22300 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 0.3217204213142395, + "learning_rate": 0.0005222062937062937, + "loss": 3.4725, + "step": 22350 + }, + { + "epoch": 6.523035703884909, + "grad_norm": 0.31372180581092834, + "learning_rate": 0.0005220314685314686, + "loss": 3.4723, + "step": 22400 + }, + { + "epoch": 6.5375968314986315, + "grad_norm": 0.3291754126548767, + "learning_rate": 0.0005218566433566433, + "loss": 3.4768, + "step": 22450 + }, + { + "epoch": 6.552157959112353, + "grad_norm": 0.3256649672985077, + "learning_rate": 0.0005216818181818182, + "loss": 3.4629, + "step": 22500 + }, + { + "epoch": 6.566719086726076, + "grad_norm": 0.323404461145401, + "learning_rate": 0.000521506993006993, + "loss": 3.4747, + "step": 22550 + }, + { + "epoch": 6.581280214339799, + "grad_norm": 0.33667072653770447, + "learning_rate": 0.0005213321678321678, + "loss": 3.4652, + "step": 22600 + }, + { + "epoch": 6.595841341953521, + "grad_norm": 0.3154466450214386, + "learning_rate": 0.0005211573426573426, + "loss": 3.4836, + "step": 22650 + }, + { + "epoch": 6.610402469567243, + "grad_norm": 0.34323811531066895, + "learning_rate": 0.0005209825174825175, + "loss": 3.4798, + "step": 22700 + }, + { + "epoch": 6.624963597180965, + "grad_norm": 0.3353569209575653, + "learning_rate": 0.0005208076923076923, + "loss": 3.4756, + "step": 22750 + }, + { + "epoch": 6.639524724794688, + "grad_norm": 0.3173069655895233, + "learning_rate": 0.0005206328671328671, + "loss": 3.4838, + "step": 22800 + }, + { + "epoch": 6.654085852408411, + "grad_norm": 0.3082946836948395, + "learning_rate": 0.0005204580419580419, + "loss": 3.4859, + "step": 22850 + }, + { + "epoch": 6.668646980022133, + "grad_norm": 0.32030490040779114, + "learning_rate": 0.0005202832167832168, + "loss": 3.4902, + "step": 22900 + }, + { + "epoch": 6.683208107635855, + "grad_norm": 0.3350687623023987, + "learning_rate": 0.0005201083916083916, + "loss": 3.4842, + "step": 22950 + }, + { + "epoch": 6.697769235249577, + "grad_norm": 0.324341356754303, + "learning_rate": 0.0005199335664335664, + "loss": 3.4676, + "step": 23000 + }, + { + "epoch": 6.697769235249577, + "eval_accuracy": 0.3642195382940208, + "eval_loss": 3.5976805686950684, + "eval_runtime": 181.2142, + "eval_samples_per_second": 91.858, + "eval_steps_per_second": 5.745, + "step": 23000 + }, + { + "epoch": 6.7123303628633, + "grad_norm": 0.31654438376426697, + "learning_rate": 0.0005197587412587413, + "loss": 3.487, + "step": 23050 + }, + { + "epoch": 6.726891490477023, + "grad_norm": 0.324179083108902, + "learning_rate": 0.0005195839160839161, + "loss": 3.4772, + "step": 23100 + }, + { + "epoch": 6.741452618090745, + "grad_norm": 0.3194001019001007, + "learning_rate": 0.0005194090909090909, + "loss": 3.4873, + "step": 23150 + }, + { + "epoch": 6.756013745704467, + "grad_norm": 0.3092760741710663, + "learning_rate": 0.0005192342657342657, + "loss": 3.4903, + "step": 23200 + }, + { + "epoch": 6.77057487331819, + "grad_norm": 0.3114040195941925, + "learning_rate": 0.0005190594405594405, + "loss": 3.4924, + "step": 23250 + }, + { + "epoch": 6.785136000931912, + "grad_norm": 0.3235475420951843, + "learning_rate": 0.0005188846153846153, + "loss": 3.4826, + "step": 23300 + }, + { + "epoch": 6.799697128545635, + "grad_norm": 0.3458871841430664, + "learning_rate": 0.0005187097902097902, + "loss": 3.4872, + "step": 23350 + }, + { + "epoch": 6.814258256159357, + "grad_norm": 0.34217914938926697, + "learning_rate": 0.000518534965034965, + "loss": 3.4947, + "step": 23400 + }, + { + "epoch": 6.828819383773079, + "grad_norm": 0.3139799237251282, + "learning_rate": 0.0005183601398601398, + "loss": 3.4925, + "step": 23450 + }, + { + "epoch": 6.843380511386802, + "grad_norm": 0.33358052372932434, + "learning_rate": 0.0005181853146853146, + "loss": 3.4904, + "step": 23500 + }, + { + "epoch": 6.857941639000524, + "grad_norm": 0.32350531220436096, + "learning_rate": 0.0005180104895104895, + "loss": 3.4951, + "step": 23550 + }, + { + "epoch": 6.872502766614247, + "grad_norm": 0.32974010705947876, + "learning_rate": 0.0005178356643356643, + "loss": 3.493, + "step": 23600 + }, + { + "epoch": 6.887063894227969, + "grad_norm": 0.33600473403930664, + "learning_rate": 0.0005176608391608391, + "loss": 3.4893, + "step": 23650 + }, + { + "epoch": 6.901625021841691, + "grad_norm": 0.32952529191970825, + "learning_rate": 0.000517486013986014, + "loss": 3.4971, + "step": 23700 + }, + { + "epoch": 6.916186149455414, + "grad_norm": 0.325702041387558, + "learning_rate": 0.0005173111888111888, + "loss": 3.4979, + "step": 23750 + }, + { + "epoch": 6.930747277069136, + "grad_norm": 0.32848599553108215, + "learning_rate": 0.0005171363636363636, + "loss": 3.4868, + "step": 23800 + }, + { + "epoch": 6.945308404682859, + "grad_norm": 0.3094465732574463, + "learning_rate": 0.0005169615384615384, + "loss": 3.4919, + "step": 23850 + }, + { + "epoch": 6.959869532296581, + "grad_norm": 0.3336765468120575, + "learning_rate": 0.0005167867132867133, + "loss": 3.4885, + "step": 23900 + }, + { + "epoch": 6.974430659910303, + "grad_norm": 0.32281845808029175, + "learning_rate": 0.000516611888111888, + "loss": 3.4788, + "step": 23950 + }, + { + "epoch": 6.988991787524026, + "grad_norm": 0.37186485528945923, + "learning_rate": 0.0005164370629370629, + "loss": 3.4867, + "step": 24000 + }, + { + "epoch": 6.988991787524026, + "eval_accuracy": 0.3649564207170708, + "eval_loss": 3.5883593559265137, + "eval_runtime": 181.1707, + "eval_samples_per_second": 91.88, + "eval_steps_per_second": 5.746, + "step": 24000 + }, + { + "epoch": 7.003494670627293, + "grad_norm": 0.31783783435821533, + "learning_rate": 0.0005162622377622377, + "loss": 3.459, + "step": 24050 + }, + { + "epoch": 7.018055798241016, + "grad_norm": 0.34084847569465637, + "learning_rate": 0.0005160874125874125, + "loss": 3.379, + "step": 24100 + }, + { + "epoch": 7.032616925854738, + "grad_norm": 0.3169894218444824, + "learning_rate": 0.0005159125874125873, + "loss": 3.3842, + "step": 24150 + }, + { + "epoch": 7.0471780534684605, + "grad_norm": 0.32522594928741455, + "learning_rate": 0.0005157377622377622, + "loss": 3.3877, + "step": 24200 + }, + { + "epoch": 7.061739181082183, + "grad_norm": 0.3461341857910156, + "learning_rate": 0.000515562937062937, + "loss": 3.3905, + "step": 24250 + }, + { + "epoch": 7.076300308695905, + "grad_norm": 0.328916996717453, + "learning_rate": 0.0005153881118881118, + "loss": 3.3949, + "step": 24300 + }, + { + "epoch": 7.090861436309628, + "grad_norm": 0.34981271624565125, + "learning_rate": 0.0005152132867132867, + "loss": 3.399, + "step": 24350 + }, + { + "epoch": 7.105422563923351, + "grad_norm": 0.3182472884654999, + "learning_rate": 0.0005150384615384615, + "loss": 3.4032, + "step": 24400 + }, + { + "epoch": 7.1199836915370724, + "grad_norm": 0.3407321870326996, + "learning_rate": 0.0005148636363636363, + "loss": 3.4098, + "step": 24450 + }, + { + "epoch": 7.134544819150795, + "grad_norm": 0.31166911125183105, + "learning_rate": 0.0005146888111888111, + "loss": 3.3845, + "step": 24500 + }, + { + "epoch": 7.149105946764517, + "grad_norm": 0.3245963454246521, + "learning_rate": 0.000514513986013986, + "loss": 3.4064, + "step": 24550 + }, + { + "epoch": 7.16366707437824, + "grad_norm": 0.35094335675239563, + "learning_rate": 0.0005143391608391608, + "loss": 3.4088, + "step": 24600 + }, + { + "epoch": 7.1782282019919625, + "grad_norm": 0.31868040561676025, + "learning_rate": 0.0005141643356643356, + "loss": 3.4047, + "step": 24650 + }, + { + "epoch": 7.192789329605684, + "grad_norm": 0.32330843806266785, + "learning_rate": 0.0005139895104895104, + "loss": 3.4007, + "step": 24700 + }, + { + "epoch": 7.207350457219407, + "grad_norm": 0.34358319640159607, + "learning_rate": 0.0005138146853146852, + "loss": 3.4181, + "step": 24750 + }, + { + "epoch": 7.22191158483313, + "grad_norm": 0.3291915953159332, + "learning_rate": 0.00051363986013986, + "loss": 3.4165, + "step": 24800 + }, + { + "epoch": 7.236472712446852, + "grad_norm": 0.31977158784866333, + "learning_rate": 0.0005134650349650349, + "loss": 3.4246, + "step": 24850 + }, + { + "epoch": 7.2510338400605745, + "grad_norm": 0.33271709084510803, + "learning_rate": 0.0005132902097902097, + "loss": 3.4101, + "step": 24900 + }, + { + "epoch": 7.265594967674296, + "grad_norm": 0.3385375738143921, + "learning_rate": 0.0005131153846153845, + "loss": 3.4185, + "step": 24950 + }, + { + "epoch": 7.280156095288019, + "grad_norm": 0.359745055437088, + "learning_rate": 0.0005129405594405594, + "loss": 3.4248, + "step": 25000 + }, + { + "epoch": 7.280156095288019, + "eval_accuracy": 0.36494066732768204, + "eval_loss": 3.5982604026794434, + "eval_runtime": 181.1034, + "eval_samples_per_second": 91.914, + "eval_steps_per_second": 5.748, + "step": 25000 + }, + { + "epoch": 7.294717222901742, + "grad_norm": 0.32137957215309143, + "learning_rate": 0.0005127657342657342, + "loss": 3.4147, + "step": 25050 + }, + { + "epoch": 7.309278350515464, + "grad_norm": 0.3147618770599365, + "learning_rate": 0.000512590909090909, + "loss": 3.4202, + "step": 25100 + }, + { + "epoch": 7.3238394781291865, + "grad_norm": 0.3232477605342865, + "learning_rate": 0.0005124160839160838, + "loss": 3.4304, + "step": 25150 + }, + { + "epoch": 7.338400605742908, + "grad_norm": 0.3342421352863312, + "learning_rate": 0.0005122412587412588, + "loss": 3.4261, + "step": 25200 + }, + { + "epoch": 7.352961733356631, + "grad_norm": 0.3140034079551697, + "learning_rate": 0.0005120664335664336, + "loss": 3.435, + "step": 25250 + }, + { + "epoch": 7.367522860970354, + "grad_norm": 0.33676761388778687, + "learning_rate": 0.0005118916083916084, + "loss": 3.4313, + "step": 25300 + }, + { + "epoch": 7.382083988584076, + "grad_norm": 0.33039015531539917, + "learning_rate": 0.0005117167832167832, + "loss": 3.435, + "step": 25350 + }, + { + "epoch": 7.396645116197798, + "grad_norm": 0.34258249402046204, + "learning_rate": 0.0005115419580419581, + "loss": 3.4223, + "step": 25400 + }, + { + "epoch": 7.411206243811521, + "grad_norm": 0.3384631276130676, + "learning_rate": 0.0005113671328671328, + "loss": 3.4243, + "step": 25450 + }, + { + "epoch": 7.425767371425243, + "grad_norm": 0.32956787943840027, + "learning_rate": 0.0005111923076923077, + "loss": 3.4416, + "step": 25500 + }, + { + "epoch": 7.440328499038966, + "grad_norm": 0.3352907598018646, + "learning_rate": 0.0005110174825174825, + "loss": 3.4284, + "step": 25550 + }, + { + "epoch": 7.454889626652688, + "grad_norm": 0.3281717002391815, + "learning_rate": 0.0005108426573426573, + "loss": 3.4443, + "step": 25600 + }, + { + "epoch": 7.46945075426641, + "grad_norm": 0.34179285168647766, + "learning_rate": 0.0005106678321678321, + "loss": 3.435, + "step": 25650 + }, + { + "epoch": 7.484011881880133, + "grad_norm": 0.35076916217803955, + "learning_rate": 0.000510493006993007, + "loss": 3.4447, + "step": 25700 + }, + { + "epoch": 7.498573009493855, + "grad_norm": 0.3248736262321472, + "learning_rate": 0.0005103181818181818, + "loss": 3.4386, + "step": 25750 + }, + { + "epoch": 7.513134137107578, + "grad_norm": 0.3269752860069275, + "learning_rate": 0.0005101433566433566, + "loss": 3.4369, + "step": 25800 + }, + { + "epoch": 7.5276952647213005, + "grad_norm": 0.343089759349823, + "learning_rate": 0.0005099685314685315, + "loss": 3.447, + "step": 25850 + }, + { + "epoch": 7.542256392335022, + "grad_norm": 0.3274691104888916, + "learning_rate": 0.0005097937062937063, + "loss": 3.4378, + "step": 25900 + }, + { + "epoch": 7.556817519948745, + "grad_norm": 0.31769490242004395, + "learning_rate": 0.0005096188811188811, + "loss": 3.4429, + "step": 25950 + }, + { + "epoch": 7.571378647562467, + "grad_norm": 0.30322396755218506, + "learning_rate": 0.0005094440559440559, + "loss": 3.4455, + "step": 26000 + }, + { + "epoch": 7.571378647562467, + "eval_accuracy": 0.3655577534538131, + "eval_loss": 3.586430072784424, + "eval_runtime": 181.088, + "eval_samples_per_second": 91.922, + "eval_steps_per_second": 5.749, + "step": 26000 + }, + { + "epoch": 7.58593977517619, + "grad_norm": 0.3594802916049957, + "learning_rate": 0.0005092692307692308, + "loss": 3.4449, + "step": 26050 + }, + { + "epoch": 7.600500902789912, + "grad_norm": 0.34348103404045105, + "learning_rate": 0.0005090944055944056, + "loss": 3.4474, + "step": 26100 + }, + { + "epoch": 7.615062030403634, + "grad_norm": 0.32391390204429626, + "learning_rate": 0.0005089195804195804, + "loss": 3.445, + "step": 26150 + }, + { + "epoch": 7.629623158017357, + "grad_norm": 0.3242975175380707, + "learning_rate": 0.0005087447552447552, + "loss": 3.4404, + "step": 26200 + }, + { + "epoch": 7.644184285631079, + "grad_norm": 0.35450035333633423, + "learning_rate": 0.00050856993006993, + "loss": 3.4475, + "step": 26250 + }, + { + "epoch": 7.658745413244802, + "grad_norm": 0.32055407762527466, + "learning_rate": 0.0005083951048951048, + "loss": 3.4406, + "step": 26300 + }, + { + "epoch": 7.673306540858524, + "grad_norm": 0.31129223108291626, + "learning_rate": 0.0005082202797202797, + "loss": 3.4532, + "step": 26350 + }, + { + "epoch": 7.687867668472246, + "grad_norm": 0.33475667238235474, + "learning_rate": 0.0005080454545454545, + "loss": 3.4582, + "step": 26400 + }, + { + "epoch": 7.702428796085969, + "grad_norm": 0.3636000454425812, + "learning_rate": 0.0005078706293706293, + "loss": 3.442, + "step": 26450 + }, + { + "epoch": 7.716989923699691, + "grad_norm": 0.3021737039089203, + "learning_rate": 0.0005076958041958042, + "loss": 3.4586, + "step": 26500 + }, + { + "epoch": 7.731551051313414, + "grad_norm": 0.32579872012138367, + "learning_rate": 0.000507520979020979, + "loss": 3.4468, + "step": 26550 + }, + { + "epoch": 7.746112178927136, + "grad_norm": 0.3306429982185364, + "learning_rate": 0.0005073461538461538, + "loss": 3.4508, + "step": 26600 + }, + { + "epoch": 7.760673306540858, + "grad_norm": 0.33379238843917847, + "learning_rate": 0.0005071713286713286, + "loss": 3.4554, + "step": 26650 + }, + { + "epoch": 7.775234434154581, + "grad_norm": 0.3489669859409332, + "learning_rate": 0.0005069965034965035, + "loss": 3.4479, + "step": 26700 + }, + { + "epoch": 7.789795561768304, + "grad_norm": 0.3256928324699402, + "learning_rate": 0.0005068216783216783, + "loss": 3.4562, + "step": 26750 + }, + { + "epoch": 7.8043566893820255, + "grad_norm": 0.30848199129104614, + "learning_rate": 0.0005066468531468531, + "loss": 3.4483, + "step": 26800 + }, + { + "epoch": 7.818917816995748, + "grad_norm": 0.3393913507461548, + "learning_rate": 0.0005064720279720279, + "loss": 3.4709, + "step": 26850 + }, + { + "epoch": 7.833478944609471, + "grad_norm": 0.31153106689453125, + "learning_rate": 0.0005062972027972028, + "loss": 3.4658, + "step": 26900 + }, + { + "epoch": 7.848040072223193, + "grad_norm": 0.328171044588089, + "learning_rate": 0.0005061223776223775, + "loss": 3.4517, + "step": 26950 + }, + { + "epoch": 7.862601199836916, + "grad_norm": 0.32433098554611206, + "learning_rate": 0.0005059475524475524, + "loss": 3.4606, + "step": 27000 + }, + { + "epoch": 7.862601199836916, + "eval_accuracy": 0.366670836220475, + "eval_loss": 3.5772430896759033, + "eval_runtime": 181.1731, + "eval_samples_per_second": 91.879, + "eval_steps_per_second": 5.746, + "step": 27000 + }, + { + "epoch": 7.8771623274506375, + "grad_norm": 0.3007674515247345, + "learning_rate": 0.0005057727272727272, + "loss": 3.4629, + "step": 27050 + }, + { + "epoch": 7.89172345506436, + "grad_norm": 0.3290211856365204, + "learning_rate": 0.000505597902097902, + "loss": 3.4581, + "step": 27100 + }, + { + "epoch": 7.906284582678083, + "grad_norm": 0.3350273072719574, + "learning_rate": 0.0005054230769230769, + "loss": 3.4522, + "step": 27150 + }, + { + "epoch": 7.920845710291805, + "grad_norm": 0.3044837713241577, + "learning_rate": 0.0005052482517482517, + "loss": 3.4548, + "step": 27200 + }, + { + "epoch": 7.935406837905528, + "grad_norm": 0.3208082914352417, + "learning_rate": 0.0005050734265734265, + "loss": 3.4407, + "step": 27250 + }, + { + "epoch": 7.9499679655192494, + "grad_norm": 0.33409732580184937, + "learning_rate": 0.0005048986013986013, + "loss": 3.4498, + "step": 27300 + }, + { + "epoch": 7.964529093132972, + "grad_norm": 0.33270469307899475, + "learning_rate": 0.0005047237762237762, + "loss": 3.4592, + "step": 27350 + }, + { + "epoch": 7.979090220746695, + "grad_norm": 0.336601585149765, + "learning_rate": 0.000504548951048951, + "loss": 3.4603, + "step": 27400 + }, + { + "epoch": 7.993651348360417, + "grad_norm": 0.32460638880729675, + "learning_rate": 0.0005043741258741258, + "loss": 3.4516, + "step": 27450 + }, + { + "epoch": 8.008154231463685, + "grad_norm": 0.33659544587135315, + "learning_rate": 0.0005041993006993006, + "loss": 3.404, + "step": 27500 + }, + { + "epoch": 8.022715359077408, + "grad_norm": 0.3334440588951111, + "learning_rate": 0.0005040244755244755, + "loss": 3.3413, + "step": 27550 + }, + { + "epoch": 8.037276486691129, + "grad_norm": 0.32501035928726196, + "learning_rate": 0.0005038496503496503, + "loss": 3.3455, + "step": 27600 + }, + { + "epoch": 8.051837614304851, + "grad_norm": 0.33923956751823425, + "learning_rate": 0.0005036748251748251, + "loss": 3.3632, + "step": 27650 + }, + { + "epoch": 8.066398741918574, + "grad_norm": 0.3274213671684265, + "learning_rate": 0.0005034999999999999, + "loss": 3.3568, + "step": 27700 + }, + { + "epoch": 8.080959869532297, + "grad_norm": 0.3114986717700958, + "learning_rate": 0.0005033251748251747, + "loss": 3.3671, + "step": 27750 + }, + { + "epoch": 8.09552099714602, + "grad_norm": 0.33176517486572266, + "learning_rate": 0.0005031503496503496, + "loss": 3.3705, + "step": 27800 + }, + { + "epoch": 8.11008212475974, + "grad_norm": 0.3320193290710449, + "learning_rate": 0.0005029755244755244, + "loss": 3.365, + "step": 27850 + }, + { + "epoch": 8.124643252373463, + "grad_norm": 0.3689771890640259, + "learning_rate": 0.0005028006993006992, + "loss": 3.3782, + "step": 27900 + }, + { + "epoch": 8.139204379987186, + "grad_norm": 0.32468920946121216, + "learning_rate": 0.000502625874125874, + "loss": 3.372, + "step": 27950 + }, + { + "epoch": 8.153765507600909, + "grad_norm": 0.3300785422325134, + "learning_rate": 0.000502451048951049, + "loss": 3.3815, + "step": 28000 + }, + { + "epoch": 8.153765507600909, + "eval_accuracy": 0.3664210156797952, + "eval_loss": 3.589651584625244, + "eval_runtime": 181.0562, + "eval_samples_per_second": 91.938, + "eval_steps_per_second": 5.75, + "step": 28000 + }, + { + "epoch": 8.168326635214632, + "grad_norm": 0.3502894639968872, + "learning_rate": 0.0005022762237762237, + "loss": 3.3722, + "step": 28050 + }, + { + "epoch": 8.182887762828354, + "grad_norm": 0.32582199573516846, + "learning_rate": 0.0005021013986013985, + "loss": 3.3825, + "step": 28100 + }, + { + "epoch": 8.197448890442075, + "grad_norm": 0.3195355534553528, + "learning_rate": 0.0005019265734265733, + "loss": 3.3769, + "step": 28150 + }, + { + "epoch": 8.212010018055798, + "grad_norm": 0.3466898500919342, + "learning_rate": 0.0005017517482517483, + "loss": 3.3755, + "step": 28200 + }, + { + "epoch": 8.22657114566952, + "grad_norm": 0.31910231709480286, + "learning_rate": 0.0005015769230769231, + "loss": 3.3877, + "step": 28250 + }, + { + "epoch": 8.241132273283243, + "grad_norm": 0.3181063234806061, + "learning_rate": 0.0005014020979020979, + "loss": 3.3976, + "step": 28300 + }, + { + "epoch": 8.255693400896966, + "grad_norm": 0.3086499869823456, + "learning_rate": 0.0005012272727272727, + "loss": 3.3798, + "step": 28350 + }, + { + "epoch": 8.270254528510687, + "grad_norm": 0.32086050510406494, + "learning_rate": 0.0005010524475524476, + "loss": 3.3847, + "step": 28400 + }, + { + "epoch": 8.28481565612441, + "grad_norm": 0.33809781074523926, + "learning_rate": 0.0005008776223776223, + "loss": 3.4049, + "step": 28450 + }, + { + "epoch": 8.299376783738133, + "grad_norm": 0.365345299243927, + "learning_rate": 0.0005007027972027972, + "loss": 3.3976, + "step": 28500 + }, + { + "epoch": 8.313937911351855, + "grad_norm": 0.31952720880508423, + "learning_rate": 0.000500527972027972, + "loss": 3.4071, + "step": 28550 + }, + { + "epoch": 8.328499038965578, + "grad_norm": 0.3320535123348236, + "learning_rate": 0.0005003531468531468, + "loss": 3.4125, + "step": 28600 + }, + { + "epoch": 8.3430601665793, + "grad_norm": 0.3217061758041382, + "learning_rate": 0.0005001783216783217, + "loss": 3.3984, + "step": 28650 + }, + { + "epoch": 8.357621294193022, + "grad_norm": 0.3326950967311859, + "learning_rate": 0.0005000034965034965, + "loss": 3.3986, + "step": 28700 + }, + { + "epoch": 8.372182421806745, + "grad_norm": 0.33330121636390686, + "learning_rate": 0.0004998286713286713, + "loss": 3.3976, + "step": 28750 + }, + { + "epoch": 8.386743549420467, + "grad_norm": 0.3166747987270355, + "learning_rate": 0.0004996538461538461, + "loss": 3.4002, + "step": 28800 + }, + { + "epoch": 8.40130467703419, + "grad_norm": 0.34696197509765625, + "learning_rate": 0.000499479020979021, + "loss": 3.4108, + "step": 28850 + }, + { + "epoch": 8.415865804647911, + "grad_norm": 0.3111904263496399, + "learning_rate": 0.0004993041958041958, + "loss": 3.3988, + "step": 28900 + }, + { + "epoch": 8.430426932261634, + "grad_norm": 0.3558425307273865, + "learning_rate": 0.0004991293706293706, + "loss": 3.4024, + "step": 28950 + }, + { + "epoch": 8.444988059875357, + "grad_norm": 0.37141433358192444, + "learning_rate": 0.0004989545454545454, + "loss": 3.4063, + "step": 29000 + }, + { + "epoch": 8.444988059875357, + "eval_accuracy": 0.3664435877004119, + "eval_loss": 3.581801414489746, + "eval_runtime": 181.2088, + "eval_samples_per_second": 91.861, + "eval_steps_per_second": 5.745, + "step": 29000 + }, + { + "epoch": 8.45954918748908, + "grad_norm": 0.31924572587013245, + "learning_rate": 0.0004987797202797203, + "loss": 3.4258, + "step": 29050 + }, + { + "epoch": 8.474110315102802, + "grad_norm": 0.3425278961658478, + "learning_rate": 0.0004986048951048951, + "loss": 3.4101, + "step": 29100 + }, + { + "epoch": 8.488671442716523, + "grad_norm": 0.3478877544403076, + "learning_rate": 0.0004984300699300699, + "loss": 3.4145, + "step": 29150 + }, + { + "epoch": 8.503232570330246, + "grad_norm": 0.35354796051979065, + "learning_rate": 0.0004982552447552448, + "loss": 3.4072, + "step": 29200 + }, + { + "epoch": 8.517793697943969, + "grad_norm": 0.32307496666908264, + "learning_rate": 0.0004980804195804195, + "loss": 3.4305, + "step": 29250 + }, + { + "epoch": 8.532354825557691, + "grad_norm": 0.3157086670398712, + "learning_rate": 0.0004979055944055944, + "loss": 3.4064, + "step": 29300 + }, + { + "epoch": 8.546915953171414, + "grad_norm": 0.3342621326446533, + "learning_rate": 0.0004977307692307692, + "loss": 3.3926, + "step": 29350 + }, + { + "epoch": 8.561477080785137, + "grad_norm": 0.31849896907806396, + "learning_rate": 0.000497555944055944, + "loss": 3.4163, + "step": 29400 + }, + { + "epoch": 8.576038208398858, + "grad_norm": 0.31333833932876587, + "learning_rate": 0.0004973811188811188, + "loss": 3.4039, + "step": 29450 + }, + { + "epoch": 8.59059933601258, + "grad_norm": 0.3372572660446167, + "learning_rate": 0.0004972062937062937, + "loss": 3.4262, + "step": 29500 + }, + { + "epoch": 8.605160463626303, + "grad_norm": 0.3162132799625397, + "learning_rate": 0.0004970314685314685, + "loss": 3.4172, + "step": 29550 + }, + { + "epoch": 8.619721591240026, + "grad_norm": 0.3454541862010956, + "learning_rate": 0.0004968566433566433, + "loss": 3.4144, + "step": 29600 + }, + { + "epoch": 8.634282718853749, + "grad_norm": 0.3138660192489624, + "learning_rate": 0.0004966818181818181, + "loss": 3.4081, + "step": 29650 + }, + { + "epoch": 8.64884384646747, + "grad_norm": 0.3220912516117096, + "learning_rate": 0.000496506993006993, + "loss": 3.417, + "step": 29700 + }, + { + "epoch": 8.663404974081192, + "grad_norm": 0.32789018750190735, + "learning_rate": 0.0004963321678321678, + "loss": 3.4131, + "step": 29750 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 0.3420283794403076, + "learning_rate": 0.0004961573426573426, + "loss": 3.4202, + "step": 29800 + }, + { + "epoch": 8.692527229308638, + "grad_norm": 0.35813525319099426, + "learning_rate": 0.0004959825174825175, + "loss": 3.4203, + "step": 29850 + }, + { + "epoch": 8.70708835692236, + "grad_norm": 0.33372020721435547, + "learning_rate": 0.0004958076923076923, + "loss": 3.4255, + "step": 29900 + }, + { + "epoch": 8.721649484536082, + "grad_norm": 0.3299264907836914, + "learning_rate": 0.0004956328671328671, + "loss": 3.4189, + "step": 29950 + }, + { + "epoch": 8.736210612149804, + "grad_norm": 0.34100714325904846, + "learning_rate": 0.0004954580419580419, + "loss": 3.4294, + "step": 30000 + }, + { + "epoch": 8.736210612149804, + "eval_accuracy": 0.367117809253729, + "eval_loss": 3.57382869720459, + "eval_runtime": 181.1298, + "eval_samples_per_second": 91.901, + "eval_steps_per_second": 5.747, + "step": 30000 + } + ], + "logging_steps": 50, + "max_steps": 171700, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.2705907007488e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}