diff --git "a/exp5.ethereal-galaxy-3/checkpoint-125500/trainer_state.json" "b/exp5.ethereal-galaxy-3/checkpoint-125500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/exp5.ethereal-galaxy-3/checkpoint-125500/trainer_state.json" @@ -0,0 +1,89892 @@ +{ + "best_global_step": 125500, + "best_metric": 1.5240540504455566, + "best_model_checkpoint": "outputs/exp5/checkpoint-125500", + "epoch": 0.37988285684016165, + "eval_steps": 500, + "global_step": 125500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.0269550345829612e-05, + "grad_norm": 32.71391677856445, + "learning_rate": 1.125e-06, + "loss": 7.795901489257813, + "step": 10 + }, + { + "epoch": 6.0539100691659225e-05, + "grad_norm": 21.27573585510254, + "learning_rate": 2.375e-06, + "loss": 5.765261459350586, + "step": 20 + }, + { + "epoch": 9.080865103748884e-05, + "grad_norm": 7.867293834686279, + "learning_rate": 3.6250000000000004e-06, + "loss": 4.801158905029297, + "step": 30 + }, + { + "epoch": 0.00012107820138331845, + "grad_norm": 4.84231424331665, + "learning_rate": 4.875e-06, + "loss": 4.257933044433594, + "step": 40 + }, + { + "epoch": 0.00015134775172914806, + "grad_norm": 3.894972801208496, + "learning_rate": 6.125000000000001e-06, + "loss": 3.925360107421875, + "step": 50 + }, + { + "epoch": 0.00018161730207497768, + "grad_norm": 4.6741042137146, + "learning_rate": 7.375e-06, + "loss": 3.6218875885009765, + "step": 60 + }, + { + "epoch": 0.00021188685242080728, + "grad_norm": 4.66612434387207, + "learning_rate": 8.625000000000001e-06, + "loss": 3.546315383911133, + "step": 70 + }, + { + "epoch": 0.0002421564027666369, + "grad_norm": 5.095178604125977, + "learning_rate": 9.875e-06, + "loss": 3.437714385986328, + "step": 80 + }, + { + "epoch": 0.0002724259531124665, + "grad_norm": 3.830501079559326, + "learning_rate": 1.1125e-05, + "loss": 3.4018318176269533, + "step": 90 + }, + { + "epoch": 0.0003026955034582961, + "grad_norm": 2.6779887676239014, + "learning_rate": 1.2375000000000001e-05, + "loss": 3.2854904174804687, + "step": 100 + }, + { + "epoch": 0.0003329650538041257, + "grad_norm": 2.1662638187408447, + "learning_rate": 1.3625e-05, + "loss": 3.2177150726318358, + "step": 110 + }, + { + "epoch": 0.00036323460414995536, + "grad_norm": 3.692639112472534, + "learning_rate": 1.4874999999999999e-05, + "loss": 3.1245296478271483, + "step": 120 + }, + { + "epoch": 0.00039350415449578496, + "grad_norm": 3.2205567359924316, + "learning_rate": 1.6125000000000002e-05, + "loss": 3.0563894271850587, + "step": 130 + }, + { + "epoch": 0.00042377370484161455, + "grad_norm": 3.0387134552001953, + "learning_rate": 1.7375000000000002e-05, + "loss": 2.859049606323242, + "step": 140 + }, + { + "epoch": 0.0004540432551874442, + "grad_norm": 1.6792466640472412, + "learning_rate": 1.8625e-05, + "loss": 2.731751251220703, + "step": 150 + }, + { + "epoch": 0.0004843128055332738, + "grad_norm": 2.712183713912964, + "learning_rate": 1.9875000000000002e-05, + "loss": 2.692796325683594, + "step": 160 + }, + { + "epoch": 0.0005145823558791034, + "grad_norm": 4.2321577072143555, + "learning_rate": 2.1125000000000002e-05, + "loss": 2.6435142517089845, + "step": 170 + }, + { + "epoch": 0.000544851906224933, + "grad_norm": 2.334047794342041, + "learning_rate": 2.2375e-05, + "loss": 2.607231330871582, + "step": 180 + }, + { + "epoch": 0.0005751214565707627, + "grad_norm": 1.8972078561782837, + "learning_rate": 2.3625e-05, + "loss": 2.5825185775756836, + "step": 190 + }, + { + "epoch": 0.0006053910069165922, + "grad_norm": 2.09916090965271, + "learning_rate": 2.4875e-05, + "loss": 2.582083511352539, + "step": 200 + }, + { + "epoch": 0.0006356605572624219, + "grad_norm": 3.025733470916748, + "learning_rate": 2.6124999999999998e-05, + "loss": 2.4973955154418945, + "step": 210 + }, + { + "epoch": 0.0006659301076082514, + "grad_norm": 2.1977810859680176, + "learning_rate": 2.7375e-05, + "loss": 2.5492927551269533, + "step": 220 + }, + { + "epoch": 0.0006961996579540811, + "grad_norm": 3.313371181488037, + "learning_rate": 2.8625e-05, + "loss": 2.461900520324707, + "step": 230 + }, + { + "epoch": 0.0007264692082999107, + "grad_norm": 2.105548143386841, + "learning_rate": 2.9874999999999998e-05, + "loss": 2.538843536376953, + "step": 240 + }, + { + "epoch": 0.0007567387586457403, + "grad_norm": 1.6633719205856323, + "learning_rate": 3.1125e-05, + "loss": 2.473529243469238, + "step": 250 + }, + { + "epoch": 0.0007870083089915699, + "grad_norm": 1.7097644805908203, + "learning_rate": 3.2375e-05, + "loss": 2.502284812927246, + "step": 260 + }, + { + "epoch": 0.0008172778593373996, + "grad_norm": 2.726830005645752, + "learning_rate": 3.3625000000000004e-05, + "loss": 2.4924468994140625, + "step": 270 + }, + { + "epoch": 0.0008475474096832291, + "grad_norm": 1.3766112327575684, + "learning_rate": 3.4875e-05, + "loss": 2.508126449584961, + "step": 280 + }, + { + "epoch": 0.0008778169600290588, + "grad_norm": 1.4641327857971191, + "learning_rate": 3.6125e-05, + "loss": 2.4579084396362303, + "step": 290 + }, + { + "epoch": 0.0009080865103748884, + "grad_norm": 1.6018173694610596, + "learning_rate": 3.7375e-05, + "loss": 2.4401905059814455, + "step": 300 + }, + { + "epoch": 0.000938356060720718, + "grad_norm": 1.992253065109253, + "learning_rate": 3.8625000000000004e-05, + "loss": 2.446523666381836, + "step": 310 + }, + { + "epoch": 0.0009686256110665476, + "grad_norm": 1.1965806484222412, + "learning_rate": 3.9875e-05, + "loss": 2.429392623901367, + "step": 320 + }, + { + "epoch": 0.0009988951614123772, + "grad_norm": 1.1730443239212036, + "learning_rate": 4.1125000000000004e-05, + "loss": 2.410472869873047, + "step": 330 + }, + { + "epoch": 0.0010291647117582068, + "grad_norm": 2.0595803260803223, + "learning_rate": 4.237500000000001e-05, + "loss": 2.431297492980957, + "step": 340 + }, + { + "epoch": 0.0010594342621040365, + "grad_norm": 1.324243426322937, + "learning_rate": 4.3624999999999997e-05, + "loss": 2.459763526916504, + "step": 350 + }, + { + "epoch": 0.001089703812449866, + "grad_norm": 1.6105445623397827, + "learning_rate": 4.4875e-05, + "loss": 2.4891971588134765, + "step": 360 + }, + { + "epoch": 0.0011199733627956956, + "grad_norm": 1.0466638803482056, + "learning_rate": 4.6125e-05, + "loss": 2.4199922561645506, + "step": 370 + }, + { + "epoch": 0.0011502429131415254, + "grad_norm": 1.4988551139831543, + "learning_rate": 4.7375e-05, + "loss": 2.393979072570801, + "step": 380 + }, + { + "epoch": 0.001180512463487355, + "grad_norm": 1.259209394454956, + "learning_rate": 4.8625e-05, + "loss": 2.437427520751953, + "step": 390 + }, + { + "epoch": 0.0012107820138331845, + "grad_norm": 1.1468561887741089, + "learning_rate": 4.9875000000000006e-05, + "loss": 2.4045894622802733, + "step": 400 + }, + { + "epoch": 0.001241051564179014, + "grad_norm": 1.296635389328003, + "learning_rate": 5.1124999999999996e-05, + "loss": 2.394451141357422, + "step": 410 + }, + { + "epoch": 0.0012713211145248438, + "grad_norm": 0.773993194103241, + "learning_rate": 5.2375e-05, + "loss": 2.404071807861328, + "step": 420 + }, + { + "epoch": 0.0013015906648706733, + "grad_norm": 1.0140191316604614, + "learning_rate": 5.3625e-05, + "loss": 2.4008155822753907, + "step": 430 + }, + { + "epoch": 0.0013318602152165029, + "grad_norm": 1.1962729692459106, + "learning_rate": 5.4875e-05, + "loss": 2.395618438720703, + "step": 440 + }, + { + "epoch": 0.0013621297655623326, + "grad_norm": 0.9425991177558899, + "learning_rate": 5.6125e-05, + "loss": 2.3512508392333986, + "step": 450 + }, + { + "epoch": 0.0013923993159081622, + "grad_norm": 0.6022359728813171, + "learning_rate": 5.7375000000000005e-05, + "loss": 2.3993629455566405, + "step": 460 + }, + { + "epoch": 0.0014226688662539917, + "grad_norm": 0.7575365900993347, + "learning_rate": 5.8624999999999995e-05, + "loss": 2.404616928100586, + "step": 470 + }, + { + "epoch": 0.0014529384165998215, + "grad_norm": 0.5825437307357788, + "learning_rate": 5.9875e-05, + "loss": 2.3783794403076173, + "step": 480 + }, + { + "epoch": 0.001483207966945651, + "grad_norm": 0.6951585412025452, + "learning_rate": 6.1125e-05, + "loss": 2.291559600830078, + "step": 490 + }, + { + "epoch": 0.0015134775172914805, + "grad_norm": 0.9498372673988342, + "learning_rate": 6.2375e-05, + "loss": 2.3654369354248046, + "step": 500 + }, + { + "epoch": 0.0015134775172914805, + "eval_loss": 2.355363368988037, + "eval_runtime": 28.1772, + "eval_samples_per_second": 17.745, + "eval_steps_per_second": 1.136, + "step": 500 + }, + { + "epoch": 0.0015437470676373103, + "grad_norm": 0.7193517684936523, + "learning_rate": 6.3625e-05, + "loss": 2.360519599914551, + "step": 510 + }, + { + "epoch": 0.0015740166179831398, + "grad_norm": 0.7454327940940857, + "learning_rate": 6.4875e-05, + "loss": 2.341416931152344, + "step": 520 + }, + { + "epoch": 0.0016042861683289694, + "grad_norm": 0.6519851088523865, + "learning_rate": 6.612500000000001e-05, + "loss": 2.346295928955078, + "step": 530 + }, + { + "epoch": 0.0016345557186747991, + "grad_norm": 1.2784044742584229, + "learning_rate": 6.737500000000001e-05, + "loss": 2.3802040100097654, + "step": 540 + }, + { + "epoch": 0.0016648252690206287, + "grad_norm": 0.7724595665931702, + "learning_rate": 6.8625e-05, + "loss": 2.396955871582031, + "step": 550 + }, + { + "epoch": 0.0016950948193664582, + "grad_norm": 1.415008544921875, + "learning_rate": 6.9875e-05, + "loss": 2.3536972045898437, + "step": 560 + }, + { + "epoch": 0.001725364369712288, + "grad_norm": 0.708676278591156, + "learning_rate": 7.1125e-05, + "loss": 2.33492374420166, + "step": 570 + }, + { + "epoch": 0.0017556339200581175, + "grad_norm": 0.9310423135757446, + "learning_rate": 7.2375e-05, + "loss": 2.305674362182617, + "step": 580 + }, + { + "epoch": 0.001785903470403947, + "grad_norm": 0.53343665599823, + "learning_rate": 7.3625e-05, + "loss": 2.28658390045166, + "step": 590 + }, + { + "epoch": 0.0018161730207497768, + "grad_norm": 0.5393661260604858, + "learning_rate": 7.4875e-05, + "loss": 2.314031219482422, + "step": 600 + }, + { + "epoch": 0.0018464425710956064, + "grad_norm": 0.5602970123291016, + "learning_rate": 7.6125e-05, + "loss": 2.245905113220215, + "step": 610 + }, + { + "epoch": 0.001876712121441436, + "grad_norm": 0.6536660194396973, + "learning_rate": 7.7375e-05, + "loss": 2.285323715209961, + "step": 620 + }, + { + "epoch": 0.0019069816717872657, + "grad_norm": 0.5619924664497375, + "learning_rate": 7.8625e-05, + "loss": 2.351390838623047, + "step": 630 + }, + { + "epoch": 0.0019372512221330952, + "grad_norm": 0.5314536094665527, + "learning_rate": 7.9875e-05, + "loss": 2.3575042724609374, + "step": 640 + }, + { + "epoch": 0.001967520772478925, + "grad_norm": 0.6183397769927979, + "learning_rate": 8.112500000000001e-05, + "loss": 2.2976593017578124, + "step": 650 + }, + { + "epoch": 0.0019977903228247545, + "grad_norm": 0.5341349244117737, + "learning_rate": 8.237500000000001e-05, + "loss": 2.381595802307129, + "step": 660 + }, + { + "epoch": 0.002028059873170584, + "grad_norm": 0.677995502948761, + "learning_rate": 8.362500000000001e-05, + "loss": 2.3346620559692384, + "step": 670 + }, + { + "epoch": 0.0020583294235164136, + "grad_norm": 0.9394902586936951, + "learning_rate": 8.4875e-05, + "loss": 2.31637020111084, + "step": 680 + }, + { + "epoch": 0.002088598973862243, + "grad_norm": 0.7744619250297546, + "learning_rate": 8.612499999999999e-05, + "loss": 2.308848571777344, + "step": 690 + }, + { + "epoch": 0.002118868524208073, + "grad_norm": 0.5292350649833679, + "learning_rate": 8.7375e-05, + "loss": 2.302325439453125, + "step": 700 + }, + { + "epoch": 0.0021491380745539026, + "grad_norm": 0.5283247828483582, + "learning_rate": 8.8625e-05, + "loss": 2.323068618774414, + "step": 710 + }, + { + "epoch": 0.002179407624899732, + "grad_norm": 0.4465847611427307, + "learning_rate": 8.9875e-05, + "loss": 2.2445022583007814, + "step": 720 + }, + { + "epoch": 0.0022096771752455617, + "grad_norm": 0.5856287479400635, + "learning_rate": 9.1125e-05, + "loss": 2.284208297729492, + "step": 730 + }, + { + "epoch": 0.0022399467255913913, + "grad_norm": 0.4706282317638397, + "learning_rate": 9.2375e-05, + "loss": 2.32335205078125, + "step": 740 + }, + { + "epoch": 0.002270216275937221, + "grad_norm": 0.6542089581489563, + "learning_rate": 9.3625e-05, + "loss": 2.262136459350586, + "step": 750 + }, + { + "epoch": 0.0023004858262830508, + "grad_norm": 0.9211499094963074, + "learning_rate": 9.4875e-05, + "loss": 2.323548126220703, + "step": 760 + }, + { + "epoch": 0.0023307553766288803, + "grad_norm": 1.7154865264892578, + "learning_rate": 9.6125e-05, + "loss": 2.4311717987060546, + "step": 770 + }, + { + "epoch": 0.00236102492697471, + "grad_norm": 1.372588872909546, + "learning_rate": 9.737500000000001e-05, + "loss": 2.374242973327637, + "step": 780 + }, + { + "epoch": 0.0023912944773205394, + "grad_norm": 0.7709490060806274, + "learning_rate": 9.862500000000001e-05, + "loss": 2.3787513732910157, + "step": 790 + }, + { + "epoch": 0.002421564027666369, + "grad_norm": 0.6706852912902832, + "learning_rate": 9.9875e-05, + "loss": 2.342292594909668, + "step": 800 + }, + { + "epoch": 0.0024518335780121985, + "grad_norm": 0.503828227519989, + "learning_rate": 0.000101125, + "loss": 2.2601911544799806, + "step": 810 + }, + { + "epoch": 0.002482103128358028, + "grad_norm": 0.48011815547943115, + "learning_rate": 0.000102375, + "loss": 2.3075019836425783, + "step": 820 + }, + { + "epoch": 0.002512372678703858, + "grad_norm": 0.6168056726455688, + "learning_rate": 0.000103625, + "loss": 2.2731496810913088, + "step": 830 + }, + { + "epoch": 0.0025426422290496875, + "grad_norm": 0.4228500425815582, + "learning_rate": 0.000104875, + "loss": 2.2583412170410155, + "step": 840 + }, + { + "epoch": 0.002572911779395517, + "grad_norm": 0.6193754076957703, + "learning_rate": 0.000106125, + "loss": 2.342001724243164, + "step": 850 + }, + { + "epoch": 0.0026031813297413466, + "grad_norm": 0.4351731538772583, + "learning_rate": 0.000107375, + "loss": 2.300341033935547, + "step": 860 + }, + { + "epoch": 0.002633450880087176, + "grad_norm": 0.43540018796920776, + "learning_rate": 0.000108625, + "loss": 2.2967491149902344, + "step": 870 + }, + { + "epoch": 0.0026637204304330057, + "grad_norm": 0.6158967614173889, + "learning_rate": 0.000109875, + "loss": 2.289077377319336, + "step": 880 + }, + { + "epoch": 0.0026939899807788357, + "grad_norm": 0.44917112588882446, + "learning_rate": 0.000111125, + "loss": 2.286356544494629, + "step": 890 + }, + { + "epoch": 0.0027242595311246652, + "grad_norm": 0.35842180252075195, + "learning_rate": 0.00011237500000000001, + "loss": 2.3379711151123046, + "step": 900 + }, + { + "epoch": 0.0027545290814704948, + "grad_norm": 0.40836119651794434, + "learning_rate": 0.00011362500000000001, + "loss": 2.2408206939697264, + "step": 910 + }, + { + "epoch": 0.0027847986318163243, + "grad_norm": 0.7359377145767212, + "learning_rate": 0.00011487500000000001, + "loss": 2.313113212585449, + "step": 920 + }, + { + "epoch": 0.002815068182162154, + "grad_norm": 0.50320965051651, + "learning_rate": 0.000116125, + "loss": 2.2863956451416017, + "step": 930 + }, + { + "epoch": 0.0028453377325079834, + "grad_norm": 0.3925902247428894, + "learning_rate": 0.000117375, + "loss": 2.3059410095214843, + "step": 940 + }, + { + "epoch": 0.0028756072828538134, + "grad_norm": 0.47227758169174194, + "learning_rate": 0.000118625, + "loss": 2.2974617004394533, + "step": 950 + }, + { + "epoch": 0.002905876833199643, + "grad_norm": 0.5198391675949097, + "learning_rate": 0.000119875, + "loss": 2.299764633178711, + "step": 960 + }, + { + "epoch": 0.0029361463835454724, + "grad_norm": 0.5888251066207886, + "learning_rate": 0.000121125, + "loss": 2.31335506439209, + "step": 970 + }, + { + "epoch": 0.002966415933891302, + "grad_norm": 0.39387762546539307, + "learning_rate": 0.000122375, + "loss": 2.304210662841797, + "step": 980 + }, + { + "epoch": 0.0029966854842371315, + "grad_norm": 0.490307480096817, + "learning_rate": 0.000123625, + "loss": 2.303770065307617, + "step": 990 + }, + { + "epoch": 0.003026955034582961, + "grad_norm": 0.4279431700706482, + "learning_rate": 0.000124875, + "loss": 2.2766277313232424, + "step": 1000 + }, + { + "epoch": 0.003026955034582961, + "eval_loss": 2.27677321434021, + "eval_runtime": 28.1178, + "eval_samples_per_second": 17.782, + "eval_steps_per_second": 1.138, + "step": 1000 + }, + { + "epoch": 0.003057224584928791, + "grad_norm": 0.35737746953964233, + "learning_rate": 0.00012499658433652634, + "loss": 2.317249298095703, + "step": 1010 + }, + { + "epoch": 0.0030874941352746206, + "grad_norm": 0.3830890953540802, + "learning_rate": 0.00012499278915488896, + "loss": 2.275107002258301, + "step": 1020 + }, + { + "epoch": 0.00311776368562045, + "grad_norm": 0.4502268135547638, + "learning_rate": 0.00012498899397325156, + "loss": 2.3318347930908203, + "step": 1030 + }, + { + "epoch": 0.0031480332359662797, + "grad_norm": 0.40929052233695984, + "learning_rate": 0.00012498519879161418, + "loss": 2.2548442840576173, + "step": 1040 + }, + { + "epoch": 0.003178302786312109, + "grad_norm": 0.38467562198638916, + "learning_rate": 0.00012498140360997677, + "loss": 2.2719120025634765, + "step": 1050 + }, + { + "epoch": 0.0032085723366579388, + "grad_norm": 0.5000295042991638, + "learning_rate": 0.0001249776084283394, + "loss": 2.2626646041870115, + "step": 1060 + }, + { + "epoch": 0.0032388418870037687, + "grad_norm": 0.43877390027046204, + "learning_rate": 0.00012497381324670198, + "loss": 2.2627986907958983, + "step": 1070 + }, + { + "epoch": 0.0032691114373495983, + "grad_norm": 0.39350026845932007, + "learning_rate": 0.0001249700180650646, + "loss": 2.314296913146973, + "step": 1080 + }, + { + "epoch": 0.003299380987695428, + "grad_norm": 0.4367413818836212, + "learning_rate": 0.0001249662228834272, + "loss": 2.2867048263549803, + "step": 1090 + }, + { + "epoch": 0.0033296505380412574, + "grad_norm": 0.5027052760124207, + "learning_rate": 0.0001249624277017898, + "loss": 2.264872741699219, + "step": 1100 + }, + { + "epoch": 0.003359920088387087, + "grad_norm": 0.409188836812973, + "learning_rate": 0.00012495863252015243, + "loss": 2.310885429382324, + "step": 1110 + }, + { + "epoch": 0.0033901896387329164, + "grad_norm": 0.32223549485206604, + "learning_rate": 0.00012495483733851502, + "loss": 2.284876251220703, + "step": 1120 + }, + { + "epoch": 0.0034204591890787464, + "grad_norm": 0.45639532804489136, + "learning_rate": 0.00012495104215687764, + "loss": 2.3058679580688475, + "step": 1130 + }, + { + "epoch": 0.003450728739424576, + "grad_norm": 0.3911088705062866, + "learning_rate": 0.00012494724697524023, + "loss": 2.2711437225341795, + "step": 1140 + }, + { + "epoch": 0.0034809982897704055, + "grad_norm": 0.398386150598526, + "learning_rate": 0.00012494345179360285, + "loss": 2.247482681274414, + "step": 1150 + }, + { + "epoch": 0.003511267840116235, + "grad_norm": 0.5121650099754333, + "learning_rate": 0.00012493965661196545, + "loss": 2.276239204406738, + "step": 1160 + }, + { + "epoch": 0.0035415373904620646, + "grad_norm": 0.3912726640701294, + "learning_rate": 0.00012493586143032807, + "loss": 2.265783500671387, + "step": 1170 + }, + { + "epoch": 0.003571806940807894, + "grad_norm": 0.5067486763000488, + "learning_rate": 0.00012493206624869066, + "loss": 2.269870948791504, + "step": 1180 + }, + { + "epoch": 0.003602076491153724, + "grad_norm": 0.3654330372810364, + "learning_rate": 0.00012492827106705328, + "loss": 2.304250717163086, + "step": 1190 + }, + { + "epoch": 0.0036323460414995536, + "grad_norm": 0.6996779441833496, + "learning_rate": 0.00012492447588541587, + "loss": 2.2965980529785157, + "step": 1200 + }, + { + "epoch": 0.003662615591845383, + "grad_norm": 1.005761742591858, + "learning_rate": 0.0001249206807037785, + "loss": 2.302958106994629, + "step": 1210 + }, + { + "epoch": 0.0036928851421912127, + "grad_norm": 0.4714489281177521, + "learning_rate": 0.00012491688552214108, + "loss": 2.2661712646484373, + "step": 1220 + }, + { + "epoch": 0.0037231546925370423, + "grad_norm": 0.5924913287162781, + "learning_rate": 0.0001249130903405037, + "loss": 2.238137435913086, + "step": 1230 + }, + { + "epoch": 0.003753424242882872, + "grad_norm": 0.37848570942878723, + "learning_rate": 0.0001249092951588663, + "loss": 2.304804801940918, + "step": 1240 + }, + { + "epoch": 0.0037836937932287018, + "grad_norm": 0.3319951593875885, + "learning_rate": 0.00012490549997722891, + "loss": 2.254572296142578, + "step": 1250 + }, + { + "epoch": 0.0038139633435745313, + "grad_norm": 0.34522920846939087, + "learning_rate": 0.0001249017047955915, + "loss": 2.2848175048828123, + "step": 1260 + }, + { + "epoch": 0.003844232893920361, + "grad_norm": 0.3933546543121338, + "learning_rate": 0.00012489790961395413, + "loss": 2.3127267837524412, + "step": 1270 + }, + { + "epoch": 0.0038745024442661904, + "grad_norm": 0.3861636221408844, + "learning_rate": 0.00012489411443231672, + "loss": 2.266646957397461, + "step": 1280 + }, + { + "epoch": 0.00390477199461202, + "grad_norm": 0.3596566319465637, + "learning_rate": 0.00012489031925067934, + "loss": 2.244002914428711, + "step": 1290 + }, + { + "epoch": 0.00393504154495785, + "grad_norm": 0.36064261198043823, + "learning_rate": 0.00012488652406904193, + "loss": 2.2584978103637696, + "step": 1300 + }, + { + "epoch": 0.003965311095303679, + "grad_norm": 0.37361598014831543, + "learning_rate": 0.00012488272888740455, + "loss": 2.2741426467895507, + "step": 1310 + }, + { + "epoch": 0.003995580645649509, + "grad_norm": 0.31100353598594666, + "learning_rate": 0.00012487893370576717, + "loss": 2.2559486389160157, + "step": 1320 + }, + { + "epoch": 0.004025850195995338, + "grad_norm": 0.3119608163833618, + "learning_rate": 0.00012487513852412976, + "loss": 2.2707168579101564, + "step": 1330 + }, + { + "epoch": 0.004056119746341168, + "grad_norm": 0.31787872314453125, + "learning_rate": 0.00012487134334249238, + "loss": 2.2790401458740233, + "step": 1340 + }, + { + "epoch": 0.004086389296686998, + "grad_norm": 0.2728416621685028, + "learning_rate": 0.00012486754816085497, + "loss": 2.191335105895996, + "step": 1350 + }, + { + "epoch": 0.004116658847032827, + "grad_norm": 0.2649732232093811, + "learning_rate": 0.0001248637529792176, + "loss": 2.276183319091797, + "step": 1360 + }, + { + "epoch": 0.004146928397378657, + "grad_norm": 0.3630141317844391, + "learning_rate": 0.00012485995779758019, + "loss": 2.2969081878662108, + "step": 1370 + }, + { + "epoch": 0.004177197947724486, + "grad_norm": 0.3274456858634949, + "learning_rate": 0.0001248561626159428, + "loss": 2.282969665527344, + "step": 1380 + }, + { + "epoch": 0.004207467498070316, + "grad_norm": 0.48780515789985657, + "learning_rate": 0.0001248523674343054, + "loss": 2.198868751525879, + "step": 1390 + }, + { + "epoch": 0.004237737048416146, + "grad_norm": 0.3891117572784424, + "learning_rate": 0.00012484857225266802, + "loss": 2.311367988586426, + "step": 1400 + }, + { + "epoch": 0.004268006598761975, + "grad_norm": 0.34386229515075684, + "learning_rate": 0.0001248447770710306, + "loss": 2.233053207397461, + "step": 1410 + }, + { + "epoch": 0.004298276149107805, + "grad_norm": 0.31046003103256226, + "learning_rate": 0.00012484098188939323, + "loss": 2.22042179107666, + "step": 1420 + }, + { + "epoch": 0.004328545699453634, + "grad_norm": 0.310148149728775, + "learning_rate": 0.00012483718670775582, + "loss": 2.2561418533325197, + "step": 1430 + }, + { + "epoch": 0.004358815249799464, + "grad_norm": 0.42584627866744995, + "learning_rate": 0.00012483339152611844, + "loss": 2.235622596740723, + "step": 1440 + }, + { + "epoch": 0.0043890848001452935, + "grad_norm": 0.3091937303543091, + "learning_rate": 0.00012482959634448103, + "loss": 2.225381851196289, + "step": 1450 + }, + { + "epoch": 0.0044193543504911234, + "grad_norm": 0.3652314841747284, + "learning_rate": 0.00012482580116284365, + "loss": 2.246646499633789, + "step": 1460 + }, + { + "epoch": 0.004449623900836953, + "grad_norm": 0.3559296131134033, + "learning_rate": 0.00012482200598120624, + "loss": 2.2351226806640625, + "step": 1470 + }, + { + "epoch": 0.0044798934511827825, + "grad_norm": 0.3532382547855377, + "learning_rate": 0.00012481821079956886, + "loss": 2.2540813446044923, + "step": 1480 + }, + { + "epoch": 0.0045101630015286125, + "grad_norm": 0.48027941584587097, + "learning_rate": 0.00012481441561793148, + "loss": 2.2686412811279295, + "step": 1490 + }, + { + "epoch": 0.004540432551874442, + "grad_norm": 0.2854057252407074, + "learning_rate": 0.00012481062043629408, + "loss": 2.2307220458984376, + "step": 1500 + }, + { + "epoch": 0.004540432551874442, + "eval_loss": 2.2453041076660156, + "eval_runtime": 28.4598, + "eval_samples_per_second": 17.569, + "eval_steps_per_second": 1.124, + "step": 1500 + }, + { + "epoch": 0.004570702102220272, + "grad_norm": 0.3890022039413452, + "learning_rate": 0.0001248068252546567, + "loss": 2.28124885559082, + "step": 1510 + }, + { + "epoch": 0.0046009716525661016, + "grad_norm": 0.4052814245223999, + "learning_rate": 0.0001248030300730193, + "loss": 2.2739429473876953, + "step": 1520 + }, + { + "epoch": 0.004631241202911931, + "grad_norm": 0.32541707158088684, + "learning_rate": 0.0001247992348913819, + "loss": 2.2678466796875, + "step": 1530 + }, + { + "epoch": 0.004661510753257761, + "grad_norm": 0.2985486686229706, + "learning_rate": 0.0001247954397097445, + "loss": 2.2564830780029297, + "step": 1540 + }, + { + "epoch": 0.00469178030360359, + "grad_norm": 0.3229452967643738, + "learning_rate": 0.00012479164452810712, + "loss": 2.202931213378906, + "step": 1550 + }, + { + "epoch": 0.00472204985394942, + "grad_norm": 0.46477168798446655, + "learning_rate": 0.0001247878493464697, + "loss": 2.232157897949219, + "step": 1560 + }, + { + "epoch": 0.004752319404295249, + "grad_norm": 0.40824243426322937, + "learning_rate": 0.00012478405416483233, + "loss": 2.2822921752929686, + "step": 1570 + }, + { + "epoch": 0.004782588954641079, + "grad_norm": 0.3972085118293762, + "learning_rate": 0.00012478025898319492, + "loss": 2.2024198532104493, + "step": 1580 + }, + { + "epoch": 0.004812858504986909, + "grad_norm": 0.41837695240974426, + "learning_rate": 0.00012477646380155754, + "loss": 2.232089042663574, + "step": 1590 + }, + { + "epoch": 0.004843128055332738, + "grad_norm": 0.44099071621894836, + "learning_rate": 0.00012477266861992014, + "loss": 2.1927522659301757, + "step": 1600 + }, + { + "epoch": 0.004873397605678568, + "grad_norm": 0.46607622504234314, + "learning_rate": 0.00012476887343828276, + "loss": 2.251165199279785, + "step": 1610 + }, + { + "epoch": 0.004903667156024397, + "grad_norm": 0.39608192443847656, + "learning_rate": 0.00012476507825664537, + "loss": 2.256379318237305, + "step": 1620 + }, + { + "epoch": 0.004933936706370227, + "grad_norm": 0.3015270233154297, + "learning_rate": 0.00012476128307500797, + "loss": 2.2743602752685548, + "step": 1630 + }, + { + "epoch": 0.004964206256716056, + "grad_norm": 0.3477328419685364, + "learning_rate": 0.0001247574878933706, + "loss": 2.212748336791992, + "step": 1640 + }, + { + "epoch": 0.004994475807061886, + "grad_norm": 0.3065662086009979, + "learning_rate": 0.00012475369271173318, + "loss": 2.182935905456543, + "step": 1650 + }, + { + "epoch": 0.005024745357407716, + "grad_norm": 0.2873033285140991, + "learning_rate": 0.0001247498975300958, + "loss": 2.2437759399414063, + "step": 1660 + }, + { + "epoch": 0.005055014907753545, + "grad_norm": 0.355082243680954, + "learning_rate": 0.0001247461023484584, + "loss": 2.22430534362793, + "step": 1670 + }, + { + "epoch": 0.005085284458099375, + "grad_norm": 0.35845014452934265, + "learning_rate": 0.000124742307166821, + "loss": 2.2114038467407227, + "step": 1680 + }, + { + "epoch": 0.005115554008445204, + "grad_norm": 0.35345712304115295, + "learning_rate": 0.0001247385119851836, + "loss": 2.246626281738281, + "step": 1690 + }, + { + "epoch": 0.005145823558791034, + "grad_norm": 0.29233214259147644, + "learning_rate": 0.00012473471680354622, + "loss": 2.1894485473632814, + "step": 1700 + }, + { + "epoch": 0.005176093109136864, + "grad_norm": 0.3551572561264038, + "learning_rate": 0.00012473092162190881, + "loss": 2.1826766967773437, + "step": 1710 + }, + { + "epoch": 0.005206362659482693, + "grad_norm": 0.37532177567481995, + "learning_rate": 0.00012472712644027143, + "loss": 2.150156021118164, + "step": 1720 + }, + { + "epoch": 0.005236632209828523, + "grad_norm": 0.3272901475429535, + "learning_rate": 0.00012472333125863405, + "loss": 2.165321159362793, + "step": 1730 + }, + { + "epoch": 0.005266901760174352, + "grad_norm": 0.29010140895843506, + "learning_rate": 0.00012471953607699665, + "loss": 2.1862926483154297, + "step": 1740 + }, + { + "epoch": 0.005297171310520182, + "grad_norm": 0.3292561173439026, + "learning_rate": 0.00012471574089535927, + "loss": 2.2181652069091795, + "step": 1750 + }, + { + "epoch": 0.005327440860866011, + "grad_norm": 0.30203378200531006, + "learning_rate": 0.00012471194571372186, + "loss": 2.212194061279297, + "step": 1760 + }, + { + "epoch": 0.005357710411211841, + "grad_norm": 0.3314816951751709, + "learning_rate": 0.00012470815053208448, + "loss": 2.217387008666992, + "step": 1770 + }, + { + "epoch": 0.005387979961557671, + "grad_norm": 0.5073200464248657, + "learning_rate": 0.00012470435535044707, + "loss": 2.2013362884521483, + "step": 1780 + }, + { + "epoch": 0.0054182495119035005, + "grad_norm": 0.283540278673172, + "learning_rate": 0.0001247005601688097, + "loss": 2.1967632293701174, + "step": 1790 + }, + { + "epoch": 0.0054485190622493304, + "grad_norm": 0.29696398973464966, + "learning_rate": 0.00012469676498717228, + "loss": 2.2612445831298826, + "step": 1800 + }, + { + "epoch": 0.0054787886125951596, + "grad_norm": 0.3946264386177063, + "learning_rate": 0.0001246929698055349, + "loss": 2.2246156692504884, + "step": 1810 + }, + { + "epoch": 0.0055090581629409895, + "grad_norm": 0.31621474027633667, + "learning_rate": 0.0001246891746238975, + "loss": 2.237888526916504, + "step": 1820 + }, + { + "epoch": 0.0055393277132868195, + "grad_norm": 3.1130847930908203, + "learning_rate": 0.0001246853794422601, + "loss": 2.269235610961914, + "step": 1830 + }, + { + "epoch": 0.005569597263632649, + "grad_norm": 0.386030912399292, + "learning_rate": 0.0001246815842606227, + "loss": 2.206276702880859, + "step": 1840 + }, + { + "epoch": 0.005599866813978479, + "grad_norm": 0.30042919516563416, + "learning_rate": 0.00012467778907898533, + "loss": 2.2733642578125, + "step": 1850 + }, + { + "epoch": 0.005630136364324308, + "grad_norm": 0.6596691012382507, + "learning_rate": 0.00012467399389734794, + "loss": 2.2362237930297852, + "step": 1860 + }, + { + "epoch": 0.005660405914670138, + "grad_norm": 0.4766746759414673, + "learning_rate": 0.00012467019871571054, + "loss": 2.22604923248291, + "step": 1870 + }, + { + "epoch": 0.005690675465015967, + "grad_norm": 0.41266652941703796, + "learning_rate": 0.00012466640353407316, + "loss": 2.247857666015625, + "step": 1880 + }, + { + "epoch": 0.005720945015361797, + "grad_norm": 0.336088627576828, + "learning_rate": 0.00012466260835243575, + "loss": 2.2273160934448244, + "step": 1890 + }, + { + "epoch": 0.005751214565707627, + "grad_norm": 0.31323322653770447, + "learning_rate": 0.00012465881317079837, + "loss": 2.194530487060547, + "step": 1900 + }, + { + "epoch": 0.005781484116053456, + "grad_norm": 0.3926990330219269, + "learning_rate": 0.00012465501798916096, + "loss": 2.149825096130371, + "step": 1910 + }, + { + "epoch": 0.005811753666399286, + "grad_norm": 0.4148895740509033, + "learning_rate": 0.00012465122280752358, + "loss": 2.176031494140625, + "step": 1920 + }, + { + "epoch": 0.005842023216745115, + "grad_norm": 0.30084070563316345, + "learning_rate": 0.00012464742762588617, + "loss": 2.1756879806518556, + "step": 1930 + }, + { + "epoch": 0.005872292767090945, + "grad_norm": 0.3186253309249878, + "learning_rate": 0.0001246436324442488, + "loss": 2.215835762023926, + "step": 1940 + }, + { + "epoch": 0.005902562317436775, + "grad_norm": 0.2795025408267975, + "learning_rate": 0.00012463983726261138, + "loss": 2.1661773681640626, + "step": 1950 + }, + { + "epoch": 0.005932831867782604, + "grad_norm": 0.35839542746543884, + "learning_rate": 0.000124636042080974, + "loss": 2.1719900131225587, + "step": 1960 + }, + { + "epoch": 0.005963101418128434, + "grad_norm": 0.2729169726371765, + "learning_rate": 0.0001246322468993366, + "loss": 2.192332077026367, + "step": 1970 + }, + { + "epoch": 0.005993370968474263, + "grad_norm": 0.3199402987957001, + "learning_rate": 0.00012462845171769922, + "loss": 2.2086748123168944, + "step": 1980 + }, + { + "epoch": 0.006023640518820093, + "grad_norm": 0.272156298160553, + "learning_rate": 0.00012462465653606184, + "loss": 2.22485408782959, + "step": 1990 + }, + { + "epoch": 0.006053910069165922, + "grad_norm": 0.32220110297203064, + "learning_rate": 0.00012462086135442443, + "loss": 2.202153778076172, + "step": 2000 + }, + { + "epoch": 0.006053910069165922, + "eval_loss": 2.1705596446990967, + "eval_runtime": 28.2947, + "eval_samples_per_second": 17.671, + "eval_steps_per_second": 1.131, + "step": 2000 + }, + { + "epoch": 0.006084179619511752, + "grad_norm": 0.2545999586582184, + "learning_rate": 0.00012461706617278705, + "loss": 2.1977577209472656, + "step": 2010 + }, + { + "epoch": 0.006114449169857582, + "grad_norm": 0.39694979786872864, + "learning_rate": 0.00012461327099114964, + "loss": 2.2239688873291015, + "step": 2020 + }, + { + "epoch": 0.006144718720203411, + "grad_norm": 0.3952403664588928, + "learning_rate": 0.00012460947580951226, + "loss": 2.1940792083740233, + "step": 2030 + }, + { + "epoch": 0.006174988270549241, + "grad_norm": 0.384624183177948, + "learning_rate": 0.00012460568062787485, + "loss": 2.200948143005371, + "step": 2040 + }, + { + "epoch": 0.00620525782089507, + "grad_norm": 0.315493106842041, + "learning_rate": 0.00012460188544623747, + "loss": 2.214159393310547, + "step": 2050 + }, + { + "epoch": 0.0062355273712409, + "grad_norm": 0.2682420313358307, + "learning_rate": 0.00012459809026460006, + "loss": 2.182878875732422, + "step": 2060 + }, + { + "epoch": 0.00626579692158673, + "grad_norm": 0.2551768720149994, + "learning_rate": 0.00012459429508296268, + "loss": 2.151245880126953, + "step": 2070 + }, + { + "epoch": 0.006296066471932559, + "grad_norm": 0.2586243748664856, + "learning_rate": 0.00012459049990132528, + "loss": 2.1749557495117187, + "step": 2080 + }, + { + "epoch": 0.006326336022278389, + "grad_norm": 0.31636759638786316, + "learning_rate": 0.0001245867047196879, + "loss": 2.2014278411865233, + "step": 2090 + }, + { + "epoch": 0.006356605572624218, + "grad_norm": 0.2285032570362091, + "learning_rate": 0.0001245829095380505, + "loss": 2.1820953369140623, + "step": 2100 + }, + { + "epoch": 0.006386875122970048, + "grad_norm": 0.30622413754463196, + "learning_rate": 0.0001245791143564131, + "loss": 2.1998008728027343, + "step": 2110 + }, + { + "epoch": 0.0064171446733158775, + "grad_norm": 0.32284772396087646, + "learning_rate": 0.00012457531917477573, + "loss": 2.1909259796142577, + "step": 2120 + }, + { + "epoch": 0.0064474142236617075, + "grad_norm": 0.3381090462207794, + "learning_rate": 0.00012457152399313832, + "loss": 2.221919631958008, + "step": 2130 + }, + { + "epoch": 0.0064776837740075375, + "grad_norm": 0.27904850244522095, + "learning_rate": 0.00012456772881150094, + "loss": 2.2041790008544924, + "step": 2140 + }, + { + "epoch": 0.0065079533243533666, + "grad_norm": 0.29387781023979187, + "learning_rate": 0.00012456393362986353, + "loss": 2.165576934814453, + "step": 2150 + }, + { + "epoch": 0.0065382228746991965, + "grad_norm": 0.3295604884624481, + "learning_rate": 0.00012456013844822615, + "loss": 2.2117387771606447, + "step": 2160 + }, + { + "epoch": 0.006568492425045026, + "grad_norm": 0.2970358431339264, + "learning_rate": 0.00012455634326658874, + "loss": 2.121778869628906, + "step": 2170 + }, + { + "epoch": 0.006598761975390856, + "grad_norm": 0.2665001153945923, + "learning_rate": 0.00012455254808495136, + "loss": 2.1682857513427733, + "step": 2180 + }, + { + "epoch": 0.006629031525736686, + "grad_norm": 0.30530068278312683, + "learning_rate": 0.00012454875290331395, + "loss": 2.2214118957519533, + "step": 2190 + }, + { + "epoch": 0.006659301076082515, + "grad_norm": 0.3912297785282135, + "learning_rate": 0.00012454495772167657, + "loss": 2.1493722915649416, + "step": 2200 + }, + { + "epoch": 0.006689570626428345, + "grad_norm": 0.36684200167655945, + "learning_rate": 0.00012454116254003917, + "loss": 2.18780403137207, + "step": 2210 + }, + { + "epoch": 0.006719840176774174, + "grad_norm": 0.38868221640586853, + "learning_rate": 0.00012453736735840179, + "loss": 2.1759737014770506, + "step": 2220 + }, + { + "epoch": 0.006750109727120004, + "grad_norm": 0.38483813405036926, + "learning_rate": 0.00012453357217676438, + "loss": 2.1476922988891602, + "step": 2230 + }, + { + "epoch": 0.006780379277465833, + "grad_norm": 0.2742156982421875, + "learning_rate": 0.000124529776995127, + "loss": 2.2145851135253904, + "step": 2240 + }, + { + "epoch": 0.006810648827811663, + "grad_norm": 0.27535387873649597, + "learning_rate": 0.0001245259818134896, + "loss": 2.156772422790527, + "step": 2250 + }, + { + "epoch": 0.006840918378157493, + "grad_norm": 0.3140769600868225, + "learning_rate": 0.0001245221866318522, + "loss": 2.192996597290039, + "step": 2260 + }, + { + "epoch": 0.006871187928503322, + "grad_norm": 0.31611117720603943, + "learning_rate": 0.0001245183914502148, + "loss": 2.1789873123168944, + "step": 2270 + }, + { + "epoch": 0.006901457478849152, + "grad_norm": 0.2882102429866791, + "learning_rate": 0.00012451459626857742, + "loss": 2.2114583969116213, + "step": 2280 + }, + { + "epoch": 0.006931727029194981, + "grad_norm": 0.2675201892852783, + "learning_rate": 0.00012451080108694001, + "loss": 2.201374053955078, + "step": 2290 + }, + { + "epoch": 0.006961996579540811, + "grad_norm": 0.3323476314544678, + "learning_rate": 0.00012450700590530263, + "loss": 2.1678314208984375, + "step": 2300 + }, + { + "epoch": 0.006992266129886641, + "grad_norm": 0.3076449930667877, + "learning_rate": 0.00012450321072366523, + "loss": 2.158628463745117, + "step": 2310 + }, + { + "epoch": 0.00702253568023247, + "grad_norm": 0.328043133020401, + "learning_rate": 0.00012449941554202785, + "loss": 2.1785234451293944, + "step": 2320 + }, + { + "epoch": 0.0070528052305783, + "grad_norm": 0.27618956565856934, + "learning_rate": 0.00012449562036039046, + "loss": 2.1564903259277344, + "step": 2330 + }, + { + "epoch": 0.007083074780924129, + "grad_norm": 0.2746771574020386, + "learning_rate": 0.00012449182517875306, + "loss": 2.1895416259765623, + "step": 2340 + }, + { + "epoch": 0.007113344331269959, + "grad_norm": 0.286941260099411, + "learning_rate": 0.00012448802999711568, + "loss": 2.1625356674194336, + "step": 2350 + }, + { + "epoch": 0.007143613881615788, + "grad_norm": 0.2655028700828552, + "learning_rate": 0.00012448423481547827, + "loss": 2.1482288360595705, + "step": 2360 + }, + { + "epoch": 0.007173883431961618, + "grad_norm": 0.2395583987236023, + "learning_rate": 0.0001244804396338409, + "loss": 2.2132368087768555, + "step": 2370 + }, + { + "epoch": 0.007204152982307448, + "grad_norm": 0.24503737688064575, + "learning_rate": 0.00012447664445220348, + "loss": 2.1395776748657225, + "step": 2380 + }, + { + "epoch": 0.007234422532653277, + "grad_norm": 0.28875255584716797, + "learning_rate": 0.0001244728492705661, + "loss": 2.120320510864258, + "step": 2390 + }, + { + "epoch": 0.007264692082999107, + "grad_norm": 0.29013779759407043, + "learning_rate": 0.0001244690540889287, + "loss": 2.1522600173950197, + "step": 2400 + }, + { + "epoch": 0.007294961633344936, + "grad_norm": 0.2975386381149292, + "learning_rate": 0.0001244652589072913, + "loss": 2.162734603881836, + "step": 2410 + }, + { + "epoch": 0.007325231183690766, + "grad_norm": 0.425430566072464, + "learning_rate": 0.0001244614637256539, + "loss": 2.193921661376953, + "step": 2420 + }, + { + "epoch": 0.0073555007340365955, + "grad_norm": 0.43254658579826355, + "learning_rate": 0.00012445766854401652, + "loss": 2.1607135772705077, + "step": 2430 + }, + { + "epoch": 0.007385770284382425, + "grad_norm": 0.3104337453842163, + "learning_rate": 0.00012445387336237912, + "loss": 2.17493896484375, + "step": 2440 + }, + { + "epoch": 0.007416039834728255, + "grad_norm": 0.25534459948539734, + "learning_rate": 0.00012445007818074174, + "loss": 2.1634496688842773, + "step": 2450 + }, + { + "epoch": 0.0074463093850740845, + "grad_norm": 0.3201392590999603, + "learning_rate": 0.00012444628299910433, + "loss": 2.181294822692871, + "step": 2460 + }, + { + "epoch": 0.0074765789354199145, + "grad_norm": 0.25341376662254333, + "learning_rate": 0.00012444248781746695, + "loss": 2.161159133911133, + "step": 2470 + }, + { + "epoch": 0.007506848485765744, + "grad_norm": 0.37650999426841736, + "learning_rate": 0.00012443869263582954, + "loss": 2.141987991333008, + "step": 2480 + }, + { + "epoch": 0.007537118036111574, + "grad_norm": 0.24119248986244202, + "learning_rate": 0.00012443489745419216, + "loss": 2.1335723876953123, + "step": 2490 + }, + { + "epoch": 0.0075673875864574035, + "grad_norm": 0.2901994287967682, + "learning_rate": 0.00012443110227255475, + "loss": 2.1195810317993162, + "step": 2500 + }, + { + "epoch": 0.0075673875864574035, + "eval_loss": 2.162506580352783, + "eval_runtime": 27.8539, + "eval_samples_per_second": 17.951, + "eval_steps_per_second": 1.149, + "step": 2500 + }, + { + "epoch": 0.007597657136803233, + "grad_norm": 0.22699272632598877, + "learning_rate": 0.00012442730709091737, + "loss": 2.2030670166015627, + "step": 2510 + }, + { + "epoch": 0.007627926687149063, + "grad_norm": 0.39209678769111633, + "learning_rate": 0.00012442351190927996, + "loss": 2.146108627319336, + "step": 2520 + }, + { + "epoch": 0.007658196237494892, + "grad_norm": 0.33116209506988525, + "learning_rate": 0.00012441971672764258, + "loss": 2.1558454513549803, + "step": 2530 + }, + { + "epoch": 0.007688465787840722, + "grad_norm": 0.29668259620666504, + "learning_rate": 0.0001244159215460052, + "loss": 2.174657440185547, + "step": 2540 + }, + { + "epoch": 0.007718735338186551, + "grad_norm": 0.2632780373096466, + "learning_rate": 0.0001244121263643678, + "loss": 2.1351720809936525, + "step": 2550 + }, + { + "epoch": 0.007749004888532381, + "grad_norm": 0.41674140095710754, + "learning_rate": 0.00012440833118273042, + "loss": 2.142501449584961, + "step": 2560 + }, + { + "epoch": 0.007779274438878211, + "grad_norm": 0.48520490527153015, + "learning_rate": 0.000124404536001093, + "loss": 2.1265514373779295, + "step": 2570 + }, + { + "epoch": 0.00780954398922404, + "grad_norm": 0.34049656987190247, + "learning_rate": 0.00012440074081945563, + "loss": 2.0608917236328126, + "step": 2580 + }, + { + "epoch": 0.007839813539569869, + "grad_norm": 0.5531021356582642, + "learning_rate": 0.00012439694563781822, + "loss": 2.1428510665893556, + "step": 2590 + }, + { + "epoch": 0.0078700830899157, + "grad_norm": 0.5064747333526611, + "learning_rate": 0.00012439315045618084, + "loss": 2.169996643066406, + "step": 2600 + }, + { + "epoch": 0.007900352640261529, + "grad_norm": 0.33577844500541687, + "learning_rate": 0.00012438935527454343, + "loss": 2.166944885253906, + "step": 2610 + }, + { + "epoch": 0.007930622190607358, + "grad_norm": 0.31606781482696533, + "learning_rate": 0.00012438556009290605, + "loss": 2.174839210510254, + "step": 2620 + }, + { + "epoch": 0.007960891740953189, + "grad_norm": 0.3311719000339508, + "learning_rate": 0.00012438176491126864, + "loss": 2.109915542602539, + "step": 2630 + }, + { + "epoch": 0.007991161291299018, + "grad_norm": 0.566001296043396, + "learning_rate": 0.00012437796972963126, + "loss": 2.08585205078125, + "step": 2640 + }, + { + "epoch": 0.008021430841644847, + "grad_norm": 0.36418411135673523, + "learning_rate": 0.00012437417454799386, + "loss": 2.166944122314453, + "step": 2650 + }, + { + "epoch": 0.008051700391990676, + "grad_norm": 0.2885774075984955, + "learning_rate": 0.00012437037936635647, + "loss": 2.122261810302734, + "step": 2660 + }, + { + "epoch": 0.008081969942336507, + "grad_norm": 0.29218026995658875, + "learning_rate": 0.00012436658418471907, + "loss": 2.086006927490234, + "step": 2670 + }, + { + "epoch": 0.008112239492682336, + "grad_norm": 0.3597090542316437, + "learning_rate": 0.0001243627890030817, + "loss": 2.150644302368164, + "step": 2680 + }, + { + "epoch": 0.008142509043028165, + "grad_norm": 0.32162460684776306, + "learning_rate": 0.00012435899382144428, + "loss": 2.108465576171875, + "step": 2690 + }, + { + "epoch": 0.008172778593373996, + "grad_norm": 0.32052719593048096, + "learning_rate": 0.0001243551986398069, + "loss": 2.1104164123535156, + "step": 2700 + }, + { + "epoch": 0.008203048143719825, + "grad_norm": 0.26884129643440247, + "learning_rate": 0.0001243514034581695, + "loss": 2.1615360260009764, + "step": 2710 + }, + { + "epoch": 0.008233317694065654, + "grad_norm": 0.2708166837692261, + "learning_rate": 0.0001243476082765321, + "loss": 2.1270076751708986, + "step": 2720 + }, + { + "epoch": 0.008263587244411483, + "grad_norm": 0.4100921154022217, + "learning_rate": 0.00012434381309489473, + "loss": 2.097702980041504, + "step": 2730 + }, + { + "epoch": 0.008293856794757314, + "grad_norm": 0.3187369704246521, + "learning_rate": 0.00012434001791325732, + "loss": 2.0882164001464845, + "step": 2740 + }, + { + "epoch": 0.008324126345103143, + "grad_norm": 0.3128569424152374, + "learning_rate": 0.00012433622273161994, + "loss": 2.121413803100586, + "step": 2750 + }, + { + "epoch": 0.008354395895448972, + "grad_norm": 0.3276948630809784, + "learning_rate": 0.00012433242754998253, + "loss": 2.1304771423339846, + "step": 2760 + }, + { + "epoch": 0.008384665445794803, + "grad_norm": 0.30711522698402405, + "learning_rate": 0.00012432863236834515, + "loss": 2.0737613677978515, + "step": 2770 + }, + { + "epoch": 0.008414934996140632, + "grad_norm": 0.2812264561653137, + "learning_rate": 0.00012432483718670775, + "loss": 2.1191532135009767, + "step": 2780 + }, + { + "epoch": 0.008445204546486462, + "grad_norm": 0.2731015980243683, + "learning_rate": 0.00012432104200507037, + "loss": 2.050865364074707, + "step": 2790 + }, + { + "epoch": 0.008475474096832292, + "grad_norm": 0.3254620134830475, + "learning_rate": 0.00012431724682343296, + "loss": 2.0841367721557615, + "step": 2800 + }, + { + "epoch": 0.008505743647178121, + "grad_norm": 0.30100157856941223, + "learning_rate": 0.00012431345164179558, + "loss": 2.117908477783203, + "step": 2810 + }, + { + "epoch": 0.00853601319752395, + "grad_norm": 0.38519927859306335, + "learning_rate": 0.00012430965646015817, + "loss": 2.0901065826416017, + "step": 2820 + }, + { + "epoch": 0.00856628274786978, + "grad_norm": 0.3293122351169586, + "learning_rate": 0.0001243058612785208, + "loss": 2.068133544921875, + "step": 2830 + }, + { + "epoch": 0.00859655229821561, + "grad_norm": 0.30197224020957947, + "learning_rate": 0.0001243020660968834, + "loss": 2.118581008911133, + "step": 2840 + }, + { + "epoch": 0.00862682184856144, + "grad_norm": 0.2740367352962494, + "learning_rate": 0.000124298270915246, + "loss": 2.0793430328369142, + "step": 2850 + }, + { + "epoch": 0.008657091398907269, + "grad_norm": 0.35810422897338867, + "learning_rate": 0.00012429447573360862, + "loss": 2.113850784301758, + "step": 2860 + }, + { + "epoch": 0.0086873609492531, + "grad_norm": 0.4350786805152893, + "learning_rate": 0.0001242906805519712, + "loss": 2.0676183700561523, + "step": 2870 + }, + { + "epoch": 0.008717630499598929, + "grad_norm": 0.32381147146224976, + "learning_rate": 0.00012428688537033383, + "loss": 2.0597057342529297, + "step": 2880 + }, + { + "epoch": 0.008747900049944758, + "grad_norm": 0.3216345012187958, + "learning_rate": 0.00012428309018869643, + "loss": 2.1094112396240234, + "step": 2890 + }, + { + "epoch": 0.008778169600290587, + "grad_norm": 0.2604828476905823, + "learning_rate": 0.00012427929500705904, + "loss": 2.0910869598388673, + "step": 2900 + }, + { + "epoch": 0.008808439150636418, + "grad_norm": 0.29565027356147766, + "learning_rate": 0.00012427549982542164, + "loss": 2.0726194381713867, + "step": 2910 + }, + { + "epoch": 0.008838708700982247, + "grad_norm": 0.33008304238319397, + "learning_rate": 0.00012427170464378426, + "loss": 2.0756526947021485, + "step": 2920 + }, + { + "epoch": 0.008868978251328076, + "grad_norm": 0.3674861490726471, + "learning_rate": 0.00012426790946214685, + "loss": 2.005488967895508, + "step": 2930 + }, + { + "epoch": 0.008899247801673907, + "grad_norm": 0.2846050262451172, + "learning_rate": 0.00012426411428050947, + "loss": 2.0239822387695314, + "step": 2940 + }, + { + "epoch": 0.008929517352019736, + "grad_norm": 0.42008233070373535, + "learning_rate": 0.00012426031909887206, + "loss": 2.0978939056396486, + "step": 2950 + }, + { + "epoch": 0.008959786902365565, + "grad_norm": 0.3039669096469879, + "learning_rate": 0.00012425652391723468, + "loss": 2.0702850341796877, + "step": 2960 + }, + { + "epoch": 0.008990056452711394, + "grad_norm": 0.424319326877594, + "learning_rate": 0.0001242527287355973, + "loss": 2.0339693069458007, + "step": 2970 + }, + { + "epoch": 0.009020326003057225, + "grad_norm": 0.29115745425224304, + "learning_rate": 0.0001242489335539599, + "loss": 2.038237953186035, + "step": 2980 + }, + { + "epoch": 0.009050595553403054, + "grad_norm": 0.2899591028690338, + "learning_rate": 0.0001242451383723225, + "loss": 2.087693786621094, + "step": 2990 + }, + { + "epoch": 0.009080865103748883, + "grad_norm": 0.2841193974018097, + "learning_rate": 0.0001242413431906851, + "loss": 2.0297943115234376, + "step": 3000 + }, + { + "epoch": 0.009080865103748883, + "eval_loss": 2.0453286170959473, + "eval_runtime": 27.7527, + "eval_samples_per_second": 18.016, + "eval_steps_per_second": 1.153, + "step": 3000 + }, + { + "epoch": 0.009111134654094714, + "grad_norm": 0.31646278500556946, + "learning_rate": 0.00012423754800904772, + "loss": 2.0815319061279296, + "step": 3010 + }, + { + "epoch": 0.009141404204440543, + "grad_norm": 0.46433982253074646, + "learning_rate": 0.00012423375282741032, + "loss": 2.0433563232421874, + "step": 3020 + }, + { + "epoch": 0.009171673754786372, + "grad_norm": 0.4293697774410248, + "learning_rate": 0.00012422995764577294, + "loss": 2.0468101501464844, + "step": 3030 + }, + { + "epoch": 0.009201943305132203, + "grad_norm": 0.3405691087245941, + "learning_rate": 0.00012422616246413553, + "loss": 2.054305839538574, + "step": 3040 + }, + { + "epoch": 0.009232212855478032, + "grad_norm": 0.3713436722755432, + "learning_rate": 0.00012422236728249815, + "loss": 2.069599914550781, + "step": 3050 + }, + { + "epoch": 0.009262482405823861, + "grad_norm": 0.3154786229133606, + "learning_rate": 0.00012421857210086074, + "loss": 2.039751434326172, + "step": 3060 + }, + { + "epoch": 0.00929275195616969, + "grad_norm": 0.34450510144233704, + "learning_rate": 0.00012421477691922336, + "loss": 2.0292503356933596, + "step": 3070 + }, + { + "epoch": 0.009323021506515521, + "grad_norm": 0.3476787805557251, + "learning_rate": 0.00012421098173758595, + "loss": 2.019110107421875, + "step": 3080 + }, + { + "epoch": 0.00935329105686135, + "grad_norm": 0.3116609454154968, + "learning_rate": 0.00012420718655594857, + "loss": 2.010850715637207, + "step": 3090 + }, + { + "epoch": 0.00938356060720718, + "grad_norm": 0.3096717894077301, + "learning_rate": 0.0001242033913743112, + "loss": 1.9551448822021484, + "step": 3100 + }, + { + "epoch": 0.00941383015755301, + "grad_norm": 0.37774479389190674, + "learning_rate": 0.00012419959619267378, + "loss": 1.9666107177734375, + "step": 3110 + }, + { + "epoch": 0.00944409970789884, + "grad_norm": 0.4023415446281433, + "learning_rate": 0.0001241958010110364, + "loss": 2.044429397583008, + "step": 3120 + }, + { + "epoch": 0.009474369258244669, + "grad_norm": 0.3221231698989868, + "learning_rate": 0.000124192005829399, + "loss": 2.015119934082031, + "step": 3130 + }, + { + "epoch": 0.009504638808590498, + "grad_norm": 0.34648871421813965, + "learning_rate": 0.00012418821064776161, + "loss": 2.0408962249755858, + "step": 3140 + }, + { + "epoch": 0.009534908358936329, + "grad_norm": 0.4792883098125458, + "learning_rate": 0.0001241844154661242, + "loss": 1.976161575317383, + "step": 3150 + }, + { + "epoch": 0.009565177909282158, + "grad_norm": 0.4400143623352051, + "learning_rate": 0.00012418062028448683, + "loss": 1.9706695556640625, + "step": 3160 + }, + { + "epoch": 0.009595447459627987, + "grad_norm": 0.4058054983615875, + "learning_rate": 0.00012417682510284942, + "loss": 2.0031814575195312, + "step": 3170 + }, + { + "epoch": 0.009625717009973818, + "grad_norm": 0.33001935482025146, + "learning_rate": 0.00012417302992121204, + "loss": 1.9491825103759766, + "step": 3180 + }, + { + "epoch": 0.009655986560319647, + "grad_norm": 0.35491055250167847, + "learning_rate": 0.00012416923473957463, + "loss": 1.9508338928222657, + "step": 3190 + }, + { + "epoch": 0.009686256110665476, + "grad_norm": 0.3406505584716797, + "learning_rate": 0.00012416543955793725, + "loss": 1.9997772216796874, + "step": 3200 + }, + { + "epoch": 0.009716525661011305, + "grad_norm": 0.40323957800865173, + "learning_rate": 0.00012416164437629987, + "loss": 1.9523050308227539, + "step": 3210 + }, + { + "epoch": 0.009746795211357136, + "grad_norm": 0.3781239688396454, + "learning_rate": 0.00012415784919466246, + "loss": 2.007707977294922, + "step": 3220 + }, + { + "epoch": 0.009777064761702965, + "grad_norm": 0.3140753507614136, + "learning_rate": 0.00012415405401302508, + "loss": 1.9726596832275392, + "step": 3230 + }, + { + "epoch": 0.009807334312048794, + "grad_norm": 0.34161821007728577, + "learning_rate": 0.00012415025883138767, + "loss": 1.9303871154785157, + "step": 3240 + }, + { + "epoch": 0.009837603862394625, + "grad_norm": 0.3149389326572418, + "learning_rate": 0.0001241464636497503, + "loss": 1.9501468658447265, + "step": 3250 + }, + { + "epoch": 0.009867873412740454, + "grad_norm": 0.34309545159339905, + "learning_rate": 0.00012414266846811289, + "loss": 2.0069507598876952, + "step": 3260 + }, + { + "epoch": 0.009898142963086283, + "grad_norm": 0.29673323035240173, + "learning_rate": 0.0001241388732864755, + "loss": 1.9506305694580077, + "step": 3270 + }, + { + "epoch": 0.009928412513432112, + "grad_norm": 0.36085087060928345, + "learning_rate": 0.0001241350781048381, + "loss": 1.9894586563110352, + "step": 3280 + }, + { + "epoch": 0.009958682063777943, + "grad_norm": 0.39669889211654663, + "learning_rate": 0.00012413128292320072, + "loss": 1.9311748504638673, + "step": 3290 + }, + { + "epoch": 0.009988951614123772, + "grad_norm": 0.3396036922931671, + "learning_rate": 0.0001241274877415633, + "loss": 1.9009269714355468, + "step": 3300 + }, + { + "epoch": 0.010019221164469601, + "grad_norm": 0.35745537281036377, + "learning_rate": 0.00012412369255992593, + "loss": 1.9124662399291992, + "step": 3310 + }, + { + "epoch": 0.010049490714815432, + "grad_norm": 0.37736013531684875, + "learning_rate": 0.00012411989737828852, + "loss": 1.9677207946777344, + "step": 3320 + }, + { + "epoch": 0.010079760265161261, + "grad_norm": 0.388942152261734, + "learning_rate": 0.00012411610219665114, + "loss": 1.9206418991088867, + "step": 3330 + }, + { + "epoch": 0.01011002981550709, + "grad_norm": 0.32444149255752563, + "learning_rate": 0.00012411230701501376, + "loss": 1.9066593170166015, + "step": 3340 + }, + { + "epoch": 0.010140299365852921, + "grad_norm": 0.33314213156700134, + "learning_rate": 0.00012410851183337635, + "loss": 1.97900333404541, + "step": 3350 + }, + { + "epoch": 0.01017056891619875, + "grad_norm": 0.35803937911987305, + "learning_rate": 0.00012410471665173897, + "loss": 1.9236131668090821, + "step": 3360 + }, + { + "epoch": 0.01020083846654458, + "grad_norm": 0.467756986618042, + "learning_rate": 0.00012410092147010156, + "loss": 2.0075328826904295, + "step": 3370 + }, + { + "epoch": 0.010231108016890408, + "grad_norm": 0.32090142369270325, + "learning_rate": 0.00012409712628846418, + "loss": 1.908135986328125, + "step": 3380 + }, + { + "epoch": 0.01026137756723624, + "grad_norm": 0.32007551193237305, + "learning_rate": 0.00012409333110682678, + "loss": 1.93969783782959, + "step": 3390 + }, + { + "epoch": 0.010291647117582068, + "grad_norm": 0.29098251461982727, + "learning_rate": 0.0001240895359251894, + "loss": 1.9398090362548828, + "step": 3400 + }, + { + "epoch": 0.010321916667927897, + "grad_norm": 0.3022040128707886, + "learning_rate": 0.000124085740743552, + "loss": 1.956418228149414, + "step": 3410 + }, + { + "epoch": 0.010352186218273728, + "grad_norm": 0.3375268280506134, + "learning_rate": 0.0001240819455619146, + "loss": 1.9158157348632812, + "step": 3420 + }, + { + "epoch": 0.010382455768619557, + "grad_norm": 0.37046584486961365, + "learning_rate": 0.0001240781503802772, + "loss": 1.9361875534057618, + "step": 3430 + }, + { + "epoch": 0.010412725318965387, + "grad_norm": 0.37684166431427, + "learning_rate": 0.00012407435519863982, + "loss": 1.9289997100830079, + "step": 3440 + }, + { + "epoch": 0.010442994869311216, + "grad_norm": 0.26773715019226074, + "learning_rate": 0.0001240705600170024, + "loss": 1.9693740844726562, + "step": 3450 + }, + { + "epoch": 0.010473264419657046, + "grad_norm": 0.4540451765060425, + "learning_rate": 0.00012406676483536503, + "loss": 1.9559246063232423, + "step": 3460 + }, + { + "epoch": 0.010503533970002876, + "grad_norm": 0.4570782482624054, + "learning_rate": 0.00012406296965372762, + "loss": 1.9231464385986328, + "step": 3470 + }, + { + "epoch": 0.010533803520348705, + "grad_norm": 0.2539197504520416, + "learning_rate": 0.00012405917447209024, + "loss": 1.9138807296752929, + "step": 3480 + }, + { + "epoch": 0.010564073070694536, + "grad_norm": 0.34259724617004395, + "learning_rate": 0.00012405537929045284, + "loss": 1.9342781066894532, + "step": 3490 + }, + { + "epoch": 0.010594342621040365, + "grad_norm": 0.4166645407676697, + "learning_rate": 0.00012405158410881546, + "loss": 1.9119661331176758, + "step": 3500 + }, + { + "epoch": 0.010594342621040365, + "eval_loss": 1.943803310394287, + "eval_runtime": 28.1416, + "eval_samples_per_second": 17.767, + "eval_steps_per_second": 1.137, + "step": 3500 + }, + { + "epoch": 0.010624612171386194, + "grad_norm": 0.27105358242988586, + "learning_rate": 0.00012404778892717805, + "loss": 1.9630365371704102, + "step": 3510 + }, + { + "epoch": 0.010654881721732023, + "grad_norm": 0.37853434681892395, + "learning_rate": 0.00012404399374554067, + "loss": 1.9532964706420899, + "step": 3520 + }, + { + "epoch": 0.010685151272077854, + "grad_norm": 0.2856719195842743, + "learning_rate": 0.00012404019856390326, + "loss": 1.9380802154541015, + "step": 3530 + }, + { + "epoch": 0.010715420822423683, + "grad_norm": 0.32729601860046387, + "learning_rate": 0.00012403640338226588, + "loss": 1.9262615203857423, + "step": 3540 + }, + { + "epoch": 0.010745690372769512, + "grad_norm": 0.34289830923080444, + "learning_rate": 0.0001240326082006285, + "loss": 1.9397499084472656, + "step": 3550 + }, + { + "epoch": 0.010775959923115343, + "grad_norm": 0.30251580476760864, + "learning_rate": 0.0001240288130189911, + "loss": 1.9517763137817383, + "step": 3560 + }, + { + "epoch": 0.010806229473461172, + "grad_norm": 0.3997442126274109, + "learning_rate": 0.0001240250178373537, + "loss": 1.969324493408203, + "step": 3570 + }, + { + "epoch": 0.010836499023807001, + "grad_norm": 0.3930775225162506, + "learning_rate": 0.0001240212226557163, + "loss": 1.9548988342285156, + "step": 3580 + }, + { + "epoch": 0.010866768574152832, + "grad_norm": 0.30366799235343933, + "learning_rate": 0.00012401742747407892, + "loss": 1.9223320007324218, + "step": 3590 + }, + { + "epoch": 0.010897038124498661, + "grad_norm": 0.44366124272346497, + "learning_rate": 0.00012401363229244152, + "loss": 1.868903350830078, + "step": 3600 + }, + { + "epoch": 0.01092730767484449, + "grad_norm": 0.395310640335083, + "learning_rate": 0.00012400983711080413, + "loss": 1.9041851043701172, + "step": 3610 + }, + { + "epoch": 0.010957577225190319, + "grad_norm": 0.26351720094680786, + "learning_rate": 0.00012400604192916673, + "loss": 1.9273880004882813, + "step": 3620 + }, + { + "epoch": 0.01098784677553615, + "grad_norm": 0.2550382912158966, + "learning_rate": 0.00012400224674752935, + "loss": 1.912146759033203, + "step": 3630 + }, + { + "epoch": 0.011018116325881979, + "grad_norm": 0.29087018966674805, + "learning_rate": 0.00012399845156589194, + "loss": 1.9174476623535157, + "step": 3640 + }, + { + "epoch": 0.011048385876227808, + "grad_norm": 0.32615023851394653, + "learning_rate": 0.00012399465638425456, + "loss": 1.867264747619629, + "step": 3650 + }, + { + "epoch": 0.011078655426573639, + "grad_norm": 0.2264884114265442, + "learning_rate": 0.00012399086120261715, + "loss": 1.9531402587890625, + "step": 3660 + }, + { + "epoch": 0.011108924976919468, + "grad_norm": 0.32450738549232483, + "learning_rate": 0.00012398706602097977, + "loss": 1.9222349166870116, + "step": 3670 + }, + { + "epoch": 0.011139194527265297, + "grad_norm": 0.4259231388568878, + "learning_rate": 0.00012398327083934236, + "loss": 1.9556777954101563, + "step": 3680 + }, + { + "epoch": 0.011169464077611126, + "grad_norm": 0.334347665309906, + "learning_rate": 0.00012397947565770498, + "loss": 1.9238739013671875, + "step": 3690 + }, + { + "epoch": 0.011199733627956957, + "grad_norm": 0.2684175968170166, + "learning_rate": 0.00012397568047606757, + "loss": 1.936290168762207, + "step": 3700 + }, + { + "epoch": 0.011230003178302786, + "grad_norm": 0.285921186208725, + "learning_rate": 0.0001239718852944302, + "loss": 1.9208641052246094, + "step": 3710 + }, + { + "epoch": 0.011260272728648615, + "grad_norm": 0.25411608815193176, + "learning_rate": 0.0001239680901127928, + "loss": 1.9586799621582032, + "step": 3720 + }, + { + "epoch": 0.011290542278994446, + "grad_norm": 0.34944239258766174, + "learning_rate": 0.0001239642949311554, + "loss": 1.9658367156982421, + "step": 3730 + }, + { + "epoch": 0.011320811829340275, + "grad_norm": 0.2644929587841034, + "learning_rate": 0.00012396049974951803, + "loss": 1.9098628997802733, + "step": 3740 + }, + { + "epoch": 0.011351081379686104, + "grad_norm": 0.37215495109558105, + "learning_rate": 0.00012395670456788062, + "loss": 1.8803329467773438, + "step": 3750 + }, + { + "epoch": 0.011381350930031934, + "grad_norm": 0.2925165891647339, + "learning_rate": 0.00012395290938624324, + "loss": 1.8632402420043945, + "step": 3760 + }, + { + "epoch": 0.011411620480377764, + "grad_norm": 0.25775495171546936, + "learning_rate": 0.00012394911420460583, + "loss": 1.9387794494628907, + "step": 3770 + }, + { + "epoch": 0.011441890030723594, + "grad_norm": 0.41319382190704346, + "learning_rate": 0.00012394531902296845, + "loss": 1.9010311126708985, + "step": 3780 + }, + { + "epoch": 0.011472159581069423, + "grad_norm": 0.287296861410141, + "learning_rate": 0.00012394152384133104, + "loss": 1.901277732849121, + "step": 3790 + }, + { + "epoch": 0.011502429131415253, + "grad_norm": 0.4102442264556885, + "learning_rate": 0.00012393772865969366, + "loss": 1.850543212890625, + "step": 3800 + }, + { + "epoch": 0.011532698681761083, + "grad_norm": 0.3235026001930237, + "learning_rate": 0.00012393393347805625, + "loss": 1.922449493408203, + "step": 3810 + }, + { + "epoch": 0.011562968232106912, + "grad_norm": 0.3135455548763275, + "learning_rate": 0.00012393013829641887, + "loss": 1.9297306060791015, + "step": 3820 + }, + { + "epoch": 0.011593237782452743, + "grad_norm": 0.23509103059768677, + "learning_rate": 0.00012392634311478147, + "loss": 1.9212421417236327, + "step": 3830 + }, + { + "epoch": 0.011623507332798572, + "grad_norm": 0.34724947810173035, + "learning_rate": 0.00012392254793314409, + "loss": 1.8756023406982423, + "step": 3840 + }, + { + "epoch": 0.0116537768831444, + "grad_norm": 0.2769686281681061, + "learning_rate": 0.00012391875275150668, + "loss": 1.935162353515625, + "step": 3850 + }, + { + "epoch": 0.01168404643349023, + "grad_norm": 0.2767464518547058, + "learning_rate": 0.0001239149575698693, + "loss": 1.9255970001220704, + "step": 3860 + }, + { + "epoch": 0.01171431598383606, + "grad_norm": 0.3807431161403656, + "learning_rate": 0.0001239111623882319, + "loss": 1.9000593185424806, + "step": 3870 + }, + { + "epoch": 0.01174458553418189, + "grad_norm": 0.3271266520023346, + "learning_rate": 0.0001239073672065945, + "loss": 1.9453601837158203, + "step": 3880 + }, + { + "epoch": 0.011774855084527719, + "grad_norm": 0.2980647087097168, + "learning_rate": 0.0001239035720249571, + "loss": 1.95582275390625, + "step": 3890 + }, + { + "epoch": 0.01180512463487355, + "grad_norm": 0.3841542899608612, + "learning_rate": 0.00012389977684331972, + "loss": 1.9542476654052734, + "step": 3900 + }, + { + "epoch": 0.011835394185219379, + "grad_norm": 0.3464236557483673, + "learning_rate": 0.0001238959816616823, + "loss": 1.9413948059082031, + "step": 3910 + }, + { + "epoch": 0.011865663735565208, + "grad_norm": 0.28021949529647827, + "learning_rate": 0.00012389218648004493, + "loss": 1.8875160217285156, + "step": 3920 + }, + { + "epoch": 0.011895933285911037, + "grad_norm": 0.30121728777885437, + "learning_rate": 0.00012388839129840753, + "loss": 1.9262630462646484, + "step": 3930 + }, + { + "epoch": 0.011926202836256868, + "grad_norm": 0.28160926699638367, + "learning_rate": 0.00012388459611677014, + "loss": 1.9395143508911132, + "step": 3940 + }, + { + "epoch": 0.011956472386602697, + "grad_norm": 0.2565731108188629, + "learning_rate": 0.00012388080093513276, + "loss": 1.8803733825683593, + "step": 3950 + }, + { + "epoch": 0.011986741936948526, + "grad_norm": 0.35275566577911377, + "learning_rate": 0.00012387700575349536, + "loss": 1.8870786666870116, + "step": 3960 + }, + { + "epoch": 0.012017011487294357, + "grad_norm": 0.3012009263038635, + "learning_rate": 0.00012387321057185798, + "loss": 1.9627601623535156, + "step": 3970 + }, + { + "epoch": 0.012047281037640186, + "grad_norm": 0.2770478129386902, + "learning_rate": 0.00012386941539022057, + "loss": 1.8862644195556642, + "step": 3980 + }, + { + "epoch": 0.012077550587986015, + "grad_norm": 0.3405110239982605, + "learning_rate": 0.0001238656202085832, + "loss": 1.942399024963379, + "step": 3990 + }, + { + "epoch": 0.012107820138331844, + "grad_norm": 0.3098236918449402, + "learning_rate": 0.00012386182502694578, + "loss": 1.907712745666504, + "step": 4000 + }, + { + "epoch": 0.012107820138331844, + "eval_loss": 1.8891717195510864, + "eval_runtime": 27.9516, + "eval_samples_per_second": 17.888, + "eval_steps_per_second": 1.145, + "step": 4000 + }, + { + "epoch": 0.012138089688677675, + "grad_norm": 0.29338812828063965, + "learning_rate": 0.0001238580298453084, + "loss": 1.9390369415283204, + "step": 4010 + }, + { + "epoch": 0.012168359239023504, + "grad_norm": 0.23808608949184418, + "learning_rate": 0.000123854234663671, + "loss": 1.9072799682617188, + "step": 4020 + }, + { + "epoch": 0.012198628789369333, + "grad_norm": 0.25660380721092224, + "learning_rate": 0.0001238504394820336, + "loss": 1.895766258239746, + "step": 4030 + }, + { + "epoch": 0.012228898339715164, + "grad_norm": 0.2982146441936493, + "learning_rate": 0.0001238466443003962, + "loss": 1.9301937103271485, + "step": 4040 + }, + { + "epoch": 0.012259167890060993, + "grad_norm": 0.2872021794319153, + "learning_rate": 0.00012384284911875882, + "loss": 1.9118858337402345, + "step": 4050 + }, + { + "epoch": 0.012289437440406822, + "grad_norm": 0.3294188976287842, + "learning_rate": 0.00012383905393712142, + "loss": 1.9042741775512695, + "step": 4060 + }, + { + "epoch": 0.012319706990752652, + "grad_norm": 0.23938508331775665, + "learning_rate": 0.00012383525875548404, + "loss": 1.9443710327148438, + "step": 4070 + }, + { + "epoch": 0.012349976541098482, + "grad_norm": 0.22572362422943115, + "learning_rate": 0.00012383146357384666, + "loss": 1.9217494964599608, + "step": 4080 + }, + { + "epoch": 0.012380246091444311, + "grad_norm": 0.2597970962524414, + "learning_rate": 0.00012382766839220925, + "loss": 1.9113960266113281, + "step": 4090 + }, + { + "epoch": 0.01241051564179014, + "grad_norm": 0.23652693629264832, + "learning_rate": 0.00012382387321057187, + "loss": 1.9050600051879882, + "step": 4100 + }, + { + "epoch": 0.012440785192135971, + "grad_norm": 0.2569594085216522, + "learning_rate": 0.00012382007802893446, + "loss": 1.9108182907104492, + "step": 4110 + }, + { + "epoch": 0.0124710547424818, + "grad_norm": 0.2615516781806946, + "learning_rate": 0.00012381628284729708, + "loss": 1.8997745513916016, + "step": 4120 + }, + { + "epoch": 0.01250132429282763, + "grad_norm": 0.2723720967769623, + "learning_rate": 0.00012381248766565967, + "loss": 1.8923328399658204, + "step": 4130 + }, + { + "epoch": 0.01253159384317346, + "grad_norm": 0.2679499387741089, + "learning_rate": 0.0001238086924840223, + "loss": 1.898726463317871, + "step": 4140 + }, + { + "epoch": 0.01256186339351929, + "grad_norm": 0.26473379135131836, + "learning_rate": 0.00012380489730238488, + "loss": 1.9594440460205078, + "step": 4150 + }, + { + "epoch": 0.012592132943865119, + "grad_norm": 0.3180883228778839, + "learning_rate": 0.0001238011021207475, + "loss": 1.9350635528564453, + "step": 4160 + }, + { + "epoch": 0.012622402494210948, + "grad_norm": 0.2669268548488617, + "learning_rate": 0.0001237973069391101, + "loss": 1.9109725952148438, + "step": 4170 + }, + { + "epoch": 0.012652672044556779, + "grad_norm": 0.27721458673477173, + "learning_rate": 0.00012379351175747271, + "loss": 1.8715505599975586, + "step": 4180 + }, + { + "epoch": 0.012682941594902608, + "grad_norm": 0.3055400252342224, + "learning_rate": 0.0001237897165758353, + "loss": 1.9022449493408202, + "step": 4190 + }, + { + "epoch": 0.012713211145248437, + "grad_norm": 0.2962500751018524, + "learning_rate": 0.00012378592139419793, + "loss": 1.8981288909912108, + "step": 4200 + }, + { + "epoch": 0.012743480695594268, + "grad_norm": 0.29074186086654663, + "learning_rate": 0.00012378212621256055, + "loss": 1.9314289093017578, + "step": 4210 + }, + { + "epoch": 0.012773750245940097, + "grad_norm": 0.30589428544044495, + "learning_rate": 0.00012377833103092314, + "loss": 1.8799354553222656, + "step": 4220 + }, + { + "epoch": 0.012804019796285926, + "grad_norm": 0.2224518358707428, + "learning_rate": 0.00012377453584928576, + "loss": 1.9296253204345704, + "step": 4230 + }, + { + "epoch": 0.012834289346631755, + "grad_norm": 0.23311235010623932, + "learning_rate": 0.00012377074066764835, + "loss": 1.932366180419922, + "step": 4240 + }, + { + "epoch": 0.012864558896977586, + "grad_norm": 0.3010726571083069, + "learning_rate": 0.00012376694548601097, + "loss": 1.8897518157958983, + "step": 4250 + }, + { + "epoch": 0.012894828447323415, + "grad_norm": 0.24853327870368958, + "learning_rate": 0.00012376315030437356, + "loss": 1.930649185180664, + "step": 4260 + }, + { + "epoch": 0.012925097997669244, + "grad_norm": 0.2780199944972992, + "learning_rate": 0.00012375935512273618, + "loss": 1.8991191864013672, + "step": 4270 + }, + { + "epoch": 0.012955367548015075, + "grad_norm": 0.26509806513786316, + "learning_rate": 0.00012375555994109877, + "loss": 1.8879085540771485, + "step": 4280 + }, + { + "epoch": 0.012985637098360904, + "grad_norm": 0.23368676006793976, + "learning_rate": 0.0001237517647594614, + "loss": 1.8887823104858399, + "step": 4290 + }, + { + "epoch": 0.013015906648706733, + "grad_norm": 0.39394044876098633, + "learning_rate": 0.00012374796957782399, + "loss": 1.8785224914550782, + "step": 4300 + }, + { + "epoch": 0.013046176199052562, + "grad_norm": 0.2508772909641266, + "learning_rate": 0.0001237441743961866, + "loss": 1.9400569915771484, + "step": 4310 + }, + { + "epoch": 0.013076445749398393, + "grad_norm": 0.24062590301036835, + "learning_rate": 0.00012374037921454923, + "loss": 1.925467300415039, + "step": 4320 + }, + { + "epoch": 0.013106715299744222, + "grad_norm": 0.2659651041030884, + "learning_rate": 0.00012373658403291182, + "loss": 1.9272682189941406, + "step": 4330 + }, + { + "epoch": 0.013136984850090051, + "grad_norm": 0.2767602503299713, + "learning_rate": 0.00012373278885127444, + "loss": 1.8883703231811524, + "step": 4340 + }, + { + "epoch": 0.013167254400435882, + "grad_norm": 0.2629595994949341, + "learning_rate": 0.00012372899366963703, + "loss": 1.8906242370605468, + "step": 4350 + }, + { + "epoch": 0.013197523950781711, + "grad_norm": 0.2870247960090637, + "learning_rate": 0.00012372519848799965, + "loss": 1.853766632080078, + "step": 4360 + }, + { + "epoch": 0.01322779350112754, + "grad_norm": 0.44669944047927856, + "learning_rate": 0.00012372140330636224, + "loss": 1.949212646484375, + "step": 4370 + }, + { + "epoch": 0.013258063051473371, + "grad_norm": 0.3391066789627075, + "learning_rate": 0.00012371760812472486, + "loss": 1.904806900024414, + "step": 4380 + }, + { + "epoch": 0.0132883326018192, + "grad_norm": 0.27578985691070557, + "learning_rate": 0.00012371381294308745, + "loss": 1.9218704223632812, + "step": 4390 + }, + { + "epoch": 0.01331860215216503, + "grad_norm": 0.3184855878353119, + "learning_rate": 0.00012371001776145007, + "loss": 1.8986486434936523, + "step": 4400 + }, + { + "epoch": 0.013348871702510859, + "grad_norm": 0.3348824083805084, + "learning_rate": 0.00012370622257981267, + "loss": 1.8541698455810547, + "step": 4410 + }, + { + "epoch": 0.01337914125285669, + "grad_norm": 0.24305424094200134, + "learning_rate": 0.00012370242739817528, + "loss": 1.9017505645751953, + "step": 4420 + }, + { + "epoch": 0.013409410803202518, + "grad_norm": 0.380051851272583, + "learning_rate": 0.00012369863221653788, + "loss": 1.857004165649414, + "step": 4430 + }, + { + "epoch": 0.013439680353548348, + "grad_norm": 0.2866714596748352, + "learning_rate": 0.0001236948370349005, + "loss": 1.8619119644165039, + "step": 4440 + }, + { + "epoch": 0.013469949903894178, + "grad_norm": 0.2866414189338684, + "learning_rate": 0.00012369104185326312, + "loss": 1.9219026565551758, + "step": 4450 + }, + { + "epoch": 0.013500219454240008, + "grad_norm": 0.3180442154407501, + "learning_rate": 0.0001236872466716257, + "loss": 1.873021125793457, + "step": 4460 + }, + { + "epoch": 0.013530489004585837, + "grad_norm": 0.34044498205184937, + "learning_rate": 0.00012368345148998833, + "loss": 1.8868118286132813, + "step": 4470 + }, + { + "epoch": 0.013560758554931666, + "grad_norm": 0.3138453960418701, + "learning_rate": 0.00012367965630835092, + "loss": 1.898858642578125, + "step": 4480 + }, + { + "epoch": 0.013591028105277497, + "grad_norm": 0.26116618514060974, + "learning_rate": 0.00012367586112671354, + "loss": 1.8915843963623047, + "step": 4490 + }, + { + "epoch": 0.013621297655623326, + "grad_norm": 0.23798561096191406, + "learning_rate": 0.00012367206594507613, + "loss": 1.856490707397461, + "step": 4500 + }, + { + "epoch": 0.013621297655623326, + "eval_loss": 1.9008175134658813, + "eval_runtime": 28.1142, + "eval_samples_per_second": 17.785, + "eval_steps_per_second": 1.138, + "step": 4500 + }, + { + "epoch": 0.013651567205969155, + "grad_norm": 0.3169378340244293, + "learning_rate": 0.00012366827076343875, + "loss": 1.9211906433105468, + "step": 4510 + }, + { + "epoch": 0.013681836756314986, + "grad_norm": 0.2591759264469147, + "learning_rate": 0.00012366447558180134, + "loss": 1.9526098251342774, + "step": 4520 + }, + { + "epoch": 0.013712106306660815, + "grad_norm": 0.30226537585258484, + "learning_rate": 0.00012366068040016396, + "loss": 1.8673307418823242, + "step": 4530 + }, + { + "epoch": 0.013742375857006644, + "grad_norm": 0.2768282890319824, + "learning_rate": 0.00012365688521852656, + "loss": 1.8639335632324219, + "step": 4540 + }, + { + "epoch": 0.013772645407352473, + "grad_norm": 0.31099388003349304, + "learning_rate": 0.00012365309003688918, + "loss": 1.9412487030029297, + "step": 4550 + }, + { + "epoch": 0.013802914957698304, + "grad_norm": 0.24632598459720612, + "learning_rate": 0.0001236492948552518, + "loss": 1.8909345626831056, + "step": 4560 + }, + { + "epoch": 0.013833184508044133, + "grad_norm": 0.3007356822490692, + "learning_rate": 0.0001236454996736144, + "loss": 1.9086128234863282, + "step": 4570 + }, + { + "epoch": 0.013863454058389962, + "grad_norm": 0.24967360496520996, + "learning_rate": 0.000123641704491977, + "loss": 1.8909217834472656, + "step": 4580 + }, + { + "epoch": 0.013893723608735793, + "grad_norm": 0.23926059901714325, + "learning_rate": 0.0001236379093103396, + "loss": 1.8683368682861328, + "step": 4590 + }, + { + "epoch": 0.013923993159081622, + "grad_norm": 0.210995152592659, + "learning_rate": 0.00012363411412870222, + "loss": 1.9029260635375977, + "step": 4600 + }, + { + "epoch": 0.013954262709427451, + "grad_norm": 0.22477193176746368, + "learning_rate": 0.0001236303189470648, + "loss": 1.8827213287353515, + "step": 4610 + }, + { + "epoch": 0.013984532259773282, + "grad_norm": 0.23820945620536804, + "learning_rate": 0.00012362652376542743, + "loss": 1.886850929260254, + "step": 4620 + }, + { + "epoch": 0.014014801810119111, + "grad_norm": 0.240162655711174, + "learning_rate": 0.00012362272858379002, + "loss": 1.8538917541503905, + "step": 4630 + }, + { + "epoch": 0.01404507136046494, + "grad_norm": 0.23596374690532684, + "learning_rate": 0.00012361893340215264, + "loss": 1.9055660247802735, + "step": 4640 + }, + { + "epoch": 0.01407534091081077, + "grad_norm": 0.24479711055755615, + "learning_rate": 0.00012361513822051524, + "loss": 1.8628681182861329, + "step": 4650 + }, + { + "epoch": 0.0141056104611566, + "grad_norm": 0.2623870372772217, + "learning_rate": 0.00012361134303887785, + "loss": 1.858268928527832, + "step": 4660 + }, + { + "epoch": 0.01413588001150243, + "grad_norm": 0.2809491753578186, + "learning_rate": 0.00012360754785724045, + "loss": 1.8968439102172852, + "step": 4670 + }, + { + "epoch": 0.014166149561848258, + "grad_norm": 0.21378090977668762, + "learning_rate": 0.00012360375267560307, + "loss": 1.9339839935302734, + "step": 4680 + }, + { + "epoch": 0.01419641911219409, + "grad_norm": 0.21137161552906036, + "learning_rate": 0.00012359995749396566, + "loss": 1.8340679168701173, + "step": 4690 + }, + { + "epoch": 0.014226688662539918, + "grad_norm": 0.2410220503807068, + "learning_rate": 0.00012359616231232828, + "loss": 1.8039390563964843, + "step": 4700 + }, + { + "epoch": 0.014256958212885747, + "grad_norm": 0.255355566740036, + "learning_rate": 0.00012359236713069087, + "loss": 1.8908870697021485, + "step": 4710 + }, + { + "epoch": 0.014287227763231576, + "grad_norm": 0.3616943061351776, + "learning_rate": 0.0001235885719490535, + "loss": 1.8479272842407226, + "step": 4720 + }, + { + "epoch": 0.014317497313577407, + "grad_norm": 0.29068800806999207, + "learning_rate": 0.00012358477676741608, + "loss": 1.884122657775879, + "step": 4730 + }, + { + "epoch": 0.014347766863923236, + "grad_norm": 0.2853012979030609, + "learning_rate": 0.0001235809815857787, + "loss": 1.8600671768188477, + "step": 4740 + }, + { + "epoch": 0.014378036414269066, + "grad_norm": 0.23633678257465363, + "learning_rate": 0.0001235771864041413, + "loss": 1.9135967254638673, + "step": 4750 + }, + { + "epoch": 0.014408305964614896, + "grad_norm": 0.28245455026626587, + "learning_rate": 0.00012357339122250391, + "loss": 1.913901138305664, + "step": 4760 + }, + { + "epoch": 0.014438575514960725, + "grad_norm": 0.28844889998435974, + "learning_rate": 0.00012356959604086653, + "loss": 1.8805212020874023, + "step": 4770 + }, + { + "epoch": 0.014468845065306555, + "grad_norm": 0.23659618198871613, + "learning_rate": 0.00012356580085922913, + "loss": 1.9021408081054687, + "step": 4780 + }, + { + "epoch": 0.014499114615652384, + "grad_norm": 0.29485368728637695, + "learning_rate": 0.00012356200567759175, + "loss": 1.9321481704711914, + "step": 4790 + }, + { + "epoch": 0.014529384165998215, + "grad_norm": 0.2761848568916321, + "learning_rate": 0.00012355821049595434, + "loss": 1.9141853332519532, + "step": 4800 + }, + { + "epoch": 0.014559653716344044, + "grad_norm": 0.2636793851852417, + "learning_rate": 0.00012355441531431696, + "loss": 1.900324249267578, + "step": 4810 + }, + { + "epoch": 0.014589923266689873, + "grad_norm": 0.27517250180244446, + "learning_rate": 0.00012355062013267955, + "loss": 1.901956558227539, + "step": 4820 + }, + { + "epoch": 0.014620192817035704, + "grad_norm": 0.32680046558380127, + "learning_rate": 0.00012354682495104217, + "loss": 1.866572380065918, + "step": 4830 + }, + { + "epoch": 0.014650462367381533, + "grad_norm": 0.21372181177139282, + "learning_rate": 0.00012354302976940476, + "loss": 1.8651968002319337, + "step": 4840 + }, + { + "epoch": 0.014680731917727362, + "grad_norm": 0.23316927254199982, + "learning_rate": 0.00012353923458776738, + "loss": 1.8530044555664062, + "step": 4850 + }, + { + "epoch": 0.014711001468073191, + "grad_norm": 0.251675009727478, + "learning_rate": 0.00012353543940612997, + "loss": 1.874077606201172, + "step": 4860 + }, + { + "epoch": 0.014741271018419022, + "grad_norm": 0.2437405288219452, + "learning_rate": 0.0001235316442244926, + "loss": 1.8950462341308594, + "step": 4870 + }, + { + "epoch": 0.01477154056876485, + "grad_norm": 0.25441688299179077, + "learning_rate": 0.00012352784904285519, + "loss": 1.8784103393554688, + "step": 4880 + }, + { + "epoch": 0.01480181011911068, + "grad_norm": 0.34064894914627075, + "learning_rate": 0.0001235240538612178, + "loss": 1.832791519165039, + "step": 4890 + }, + { + "epoch": 0.01483207966945651, + "grad_norm": 0.24014398455619812, + "learning_rate": 0.0001235202586795804, + "loss": 1.9048770904541015, + "step": 4900 + }, + { + "epoch": 0.01486234921980234, + "grad_norm": 0.30487072467803955, + "learning_rate": 0.00012351646349794302, + "loss": 1.901239776611328, + "step": 4910 + }, + { + "epoch": 0.014892618770148169, + "grad_norm": 0.2696550488471985, + "learning_rate": 0.0001235126683163056, + "loss": 1.9040008544921876, + "step": 4920 + }, + { + "epoch": 0.014922888320494, + "grad_norm": 0.23349542915821075, + "learning_rate": 0.00012350887313466823, + "loss": 1.8446977615356446, + "step": 4930 + }, + { + "epoch": 0.014953157870839829, + "grad_norm": 0.2770819365978241, + "learning_rate": 0.00012350507795303082, + "loss": 1.895076370239258, + "step": 4940 + }, + { + "epoch": 0.014983427421185658, + "grad_norm": 0.2632441818714142, + "learning_rate": 0.00012350128277139344, + "loss": 1.9233711242675782, + "step": 4950 + }, + { + "epoch": 0.015013696971531487, + "grad_norm": 0.27830448746681213, + "learning_rate": 0.00012349748758975606, + "loss": 1.8789072036743164, + "step": 4960 + }, + { + "epoch": 0.015043966521877318, + "grad_norm": 0.2810215950012207, + "learning_rate": 0.00012349369240811865, + "loss": 1.843227005004883, + "step": 4970 + }, + { + "epoch": 0.015074236072223147, + "grad_norm": 0.2648703157901764, + "learning_rate": 0.00012348989722648127, + "loss": 1.912017250061035, + "step": 4980 + }, + { + "epoch": 0.015104505622568976, + "grad_norm": 0.24735310673713684, + "learning_rate": 0.00012348610204484386, + "loss": 1.837997817993164, + "step": 4990 + }, + { + "epoch": 0.015134775172914807, + "grad_norm": 0.3049739897251129, + "learning_rate": 0.00012348230686320648, + "loss": 1.8803739547729492, + "step": 5000 + }, + { + "epoch": 0.015134775172914807, + "eval_loss": 1.8610879182815552, + "eval_runtime": 28.2627, + "eval_samples_per_second": 17.691, + "eval_steps_per_second": 1.132, + "step": 5000 + }, + { + "epoch": 0.015165044723260636, + "grad_norm": 0.24480222165584564, + "learning_rate": 0.00012347851168156908, + "loss": 1.8849212646484375, + "step": 5010 + }, + { + "epoch": 0.015195314273606465, + "grad_norm": 0.30254724621772766, + "learning_rate": 0.0001234747164999317, + "loss": 1.8945137023925782, + "step": 5020 + }, + { + "epoch": 0.015225583823952294, + "grad_norm": 0.33459797501564026, + "learning_rate": 0.0001234709213182943, + "loss": 1.904586410522461, + "step": 5030 + }, + { + "epoch": 0.015255853374298125, + "grad_norm": 0.24789991974830627, + "learning_rate": 0.0001234671261366569, + "loss": 1.8533878326416016, + "step": 5040 + }, + { + "epoch": 0.015286122924643954, + "grad_norm": 0.30486035346984863, + "learning_rate": 0.0001234633309550195, + "loss": 1.889594268798828, + "step": 5050 + }, + { + "epoch": 0.015316392474989783, + "grad_norm": 0.25495409965515137, + "learning_rate": 0.00012345953577338212, + "loss": 1.8812091827392579, + "step": 5060 + }, + { + "epoch": 0.015346662025335614, + "grad_norm": 0.272303968667984, + "learning_rate": 0.0001234557405917447, + "loss": 1.8434999465942383, + "step": 5070 + }, + { + "epoch": 0.015376931575681443, + "grad_norm": 0.25553882122039795, + "learning_rate": 0.00012345194541010733, + "loss": 1.8755352020263671, + "step": 5080 + }, + { + "epoch": 0.015407201126027273, + "grad_norm": 0.27479925751686096, + "learning_rate": 0.00012344815022846992, + "loss": 1.8640640258789063, + "step": 5090 + }, + { + "epoch": 0.015437470676373102, + "grad_norm": 0.19477246701717377, + "learning_rate": 0.00012344435504683254, + "loss": 1.886240577697754, + "step": 5100 + }, + { + "epoch": 0.015467740226718932, + "grad_norm": 0.2354641854763031, + "learning_rate": 0.00012344055986519514, + "loss": 1.8570943832397462, + "step": 5110 + }, + { + "epoch": 0.015498009777064762, + "grad_norm": 0.26076990365982056, + "learning_rate": 0.00012343676468355776, + "loss": 1.8188495635986328, + "step": 5120 + }, + { + "epoch": 0.01552827932741059, + "grad_norm": 0.21378110349178314, + "learning_rate": 0.00012343296950192035, + "loss": 1.902928924560547, + "step": 5130 + }, + { + "epoch": 0.015558548877756422, + "grad_norm": 0.2518676221370697, + "learning_rate": 0.00012342917432028297, + "loss": 1.9025999069213868, + "step": 5140 + }, + { + "epoch": 0.01558881842810225, + "grad_norm": 0.2853963077068329, + "learning_rate": 0.00012342537913864556, + "loss": 1.8967933654785156, + "step": 5150 + }, + { + "epoch": 0.01561908797844808, + "grad_norm": 0.2582893669605255, + "learning_rate": 0.00012342158395700818, + "loss": 1.8174520492553712, + "step": 5160 + }, + { + "epoch": 0.01564935752879391, + "grad_norm": 0.21561996638774872, + "learning_rate": 0.0001234177887753708, + "loss": 1.8910381317138671, + "step": 5170 + }, + { + "epoch": 0.015679627079139738, + "grad_norm": 0.23453497886657715, + "learning_rate": 0.0001234139935937334, + "loss": 1.8363632202148437, + "step": 5180 + }, + { + "epoch": 0.01570989662948557, + "grad_norm": 0.2406529039144516, + "learning_rate": 0.000123410198412096, + "loss": 1.9145915985107422, + "step": 5190 + }, + { + "epoch": 0.0157401661798314, + "grad_norm": 0.3145064413547516, + "learning_rate": 0.0001234064032304586, + "loss": 1.8626583099365235, + "step": 5200 + }, + { + "epoch": 0.015770435730177227, + "grad_norm": 0.2631511092185974, + "learning_rate": 0.00012340260804882122, + "loss": 1.868096923828125, + "step": 5210 + }, + { + "epoch": 0.015800705280523058, + "grad_norm": 0.4047050476074219, + "learning_rate": 0.00012339881286718381, + "loss": 1.8571674346923828, + "step": 5220 + }, + { + "epoch": 0.01583097483086889, + "grad_norm": 0.3370504379272461, + "learning_rate": 0.00012339501768554643, + "loss": 1.8599119186401367, + "step": 5230 + }, + { + "epoch": 0.015861244381214716, + "grad_norm": 0.24051040410995483, + "learning_rate": 0.00012339122250390903, + "loss": 1.8876026153564454, + "step": 5240 + }, + { + "epoch": 0.015891513931560547, + "grad_norm": 0.2463655024766922, + "learning_rate": 0.00012338742732227165, + "loss": 1.8781461715698242, + "step": 5250 + }, + { + "epoch": 0.015921783481906378, + "grad_norm": 0.29726219177246094, + "learning_rate": 0.00012338363214063424, + "loss": 1.8835794448852539, + "step": 5260 + }, + { + "epoch": 0.015952053032252205, + "grad_norm": 0.23191097378730774, + "learning_rate": 0.00012337983695899686, + "loss": 1.9027042388916016, + "step": 5270 + }, + { + "epoch": 0.015982322582598036, + "grad_norm": 0.27363455295562744, + "learning_rate": 0.00012337604177735945, + "loss": 1.8503656387329102, + "step": 5280 + }, + { + "epoch": 0.016012592132943867, + "grad_norm": 0.27940624952316284, + "learning_rate": 0.00012337224659572207, + "loss": 1.8308637619018555, + "step": 5290 + }, + { + "epoch": 0.016042861683289694, + "grad_norm": 0.2356436848640442, + "learning_rate": 0.00012336845141408466, + "loss": 1.848540687561035, + "step": 5300 + }, + { + "epoch": 0.016073131233635525, + "grad_norm": 0.2679150700569153, + "learning_rate": 0.00012336465623244728, + "loss": 1.875124740600586, + "step": 5310 + }, + { + "epoch": 0.016103400783981352, + "grad_norm": 0.2035965472459793, + "learning_rate": 0.0001233608610508099, + "loss": 1.894278335571289, + "step": 5320 + }, + { + "epoch": 0.016133670334327183, + "grad_norm": 0.2574589252471924, + "learning_rate": 0.0001233570658691725, + "loss": 1.9304889678955077, + "step": 5330 + }, + { + "epoch": 0.016163939884673014, + "grad_norm": 0.2864076495170593, + "learning_rate": 0.0001233532706875351, + "loss": 1.869478416442871, + "step": 5340 + }, + { + "epoch": 0.01619420943501884, + "grad_norm": 0.24327047169208527, + "learning_rate": 0.0001233494755058977, + "loss": 1.867136573791504, + "step": 5350 + }, + { + "epoch": 0.016224478985364672, + "grad_norm": 0.25472140312194824, + "learning_rate": 0.00012334568032426033, + "loss": 1.8773862838745117, + "step": 5360 + }, + { + "epoch": 0.016254748535710503, + "grad_norm": 0.2282087504863739, + "learning_rate": 0.00012334188514262292, + "loss": 1.8592761993408202, + "step": 5370 + }, + { + "epoch": 0.01628501808605633, + "grad_norm": 0.2372155636548996, + "learning_rate": 0.00012333808996098554, + "loss": 1.8794776916503906, + "step": 5380 + }, + { + "epoch": 0.01631528763640216, + "grad_norm": 0.24496018886566162, + "learning_rate": 0.00012333429477934813, + "loss": 1.7953052520751953, + "step": 5390 + }, + { + "epoch": 0.016345557186747992, + "grad_norm": 0.297922819852829, + "learning_rate": 0.00012333049959771075, + "loss": 1.8810043334960938, + "step": 5400 + }, + { + "epoch": 0.01637582673709382, + "grad_norm": 0.2765595018863678, + "learning_rate": 0.00012332670441607334, + "loss": 1.887176513671875, + "step": 5410 + }, + { + "epoch": 0.01640609628743965, + "grad_norm": 0.20638135075569153, + "learning_rate": 0.00012332290923443596, + "loss": 1.8457176208496093, + "step": 5420 + }, + { + "epoch": 0.01643636583778548, + "grad_norm": 0.21391719579696655, + "learning_rate": 0.00012331911405279858, + "loss": 1.865023422241211, + "step": 5430 + }, + { + "epoch": 0.01646663538813131, + "grad_norm": 0.28115060925483704, + "learning_rate": 0.00012331531887116117, + "loss": 1.8583858489990235, + "step": 5440 + }, + { + "epoch": 0.01649690493847714, + "grad_norm": 0.22863751649856567, + "learning_rate": 0.0001233115236895238, + "loss": 1.8499774932861328, + "step": 5450 + }, + { + "epoch": 0.016527174488822967, + "grad_norm": 0.22742745280265808, + "learning_rate": 0.00012330772850788638, + "loss": 1.8379901885986327, + "step": 5460 + }, + { + "epoch": 0.016557444039168798, + "grad_norm": 0.30645281076431274, + "learning_rate": 0.000123303933326249, + "loss": 1.8616260528564452, + "step": 5470 + }, + { + "epoch": 0.01658771358951463, + "grad_norm": 0.2676772475242615, + "learning_rate": 0.0001233001381446116, + "loss": 1.863214874267578, + "step": 5480 + }, + { + "epoch": 0.016617983139860456, + "grad_norm": 0.25432392954826355, + "learning_rate": 0.00012329634296297422, + "loss": 1.8741313934326171, + "step": 5490 + }, + { + "epoch": 0.016648252690206287, + "grad_norm": 0.2506123185157776, + "learning_rate": 0.0001232925477813368, + "loss": 1.850498580932617, + "step": 5500 + }, + { + "epoch": 0.016648252690206287, + "eval_loss": 1.8569122552871704, + "eval_runtime": 28.3072, + "eval_samples_per_second": 17.663, + "eval_steps_per_second": 1.13, + "step": 5500 + }, + { + "epoch": 0.016678522240552118, + "grad_norm": 0.22849151492118835, + "learning_rate": 0.00012328875259969943, + "loss": 1.842625045776367, + "step": 5510 + }, + { + "epoch": 0.016708791790897945, + "grad_norm": 0.2467656433582306, + "learning_rate": 0.00012328495741806202, + "loss": 1.90612735748291, + "step": 5520 + }, + { + "epoch": 0.016739061341243776, + "grad_norm": 0.2146947830915451, + "learning_rate": 0.00012328116223642464, + "loss": 1.8384645462036133, + "step": 5530 + }, + { + "epoch": 0.016769330891589607, + "grad_norm": 0.25169840455055237, + "learning_rate": 0.00012327736705478723, + "loss": 1.845937728881836, + "step": 5540 + }, + { + "epoch": 0.016799600441935434, + "grad_norm": 0.23651912808418274, + "learning_rate": 0.00012327357187314985, + "loss": 1.8714561462402344, + "step": 5550 + }, + { + "epoch": 0.016829869992281265, + "grad_norm": 0.2649262547492981, + "learning_rate": 0.00012326977669151247, + "loss": 1.8647109985351562, + "step": 5560 + }, + { + "epoch": 0.016860139542627096, + "grad_norm": 0.22586017847061157, + "learning_rate": 0.00012326598150987506, + "loss": 1.907602310180664, + "step": 5570 + }, + { + "epoch": 0.016890409092972923, + "grad_norm": 0.2180119901895523, + "learning_rate": 0.00012326218632823768, + "loss": 1.8720455169677734, + "step": 5580 + }, + { + "epoch": 0.016920678643318754, + "grad_norm": 0.211221843957901, + "learning_rate": 0.00012325839114660028, + "loss": 1.8769329071044922, + "step": 5590 + }, + { + "epoch": 0.016950948193664585, + "grad_norm": 0.239295095205307, + "learning_rate": 0.0001232545959649629, + "loss": 1.8180503845214844, + "step": 5600 + }, + { + "epoch": 0.016981217744010412, + "grad_norm": 0.304980993270874, + "learning_rate": 0.0001232508007833255, + "loss": 1.8330080032348632, + "step": 5610 + }, + { + "epoch": 0.017011487294356243, + "grad_norm": 0.3079758286476135, + "learning_rate": 0.0001232470056016881, + "loss": 1.855228805541992, + "step": 5620 + }, + { + "epoch": 0.01704175684470207, + "grad_norm": 0.25345754623413086, + "learning_rate": 0.0001232432104200507, + "loss": 1.8405036926269531, + "step": 5630 + }, + { + "epoch": 0.0170720263950479, + "grad_norm": 0.27817678451538086, + "learning_rate": 0.00012323941523841332, + "loss": 1.8118030548095703, + "step": 5640 + }, + { + "epoch": 0.017102295945393732, + "grad_norm": 0.2770339548587799, + "learning_rate": 0.0001232356200567759, + "loss": 1.816292953491211, + "step": 5650 + }, + { + "epoch": 0.01713256549573956, + "grad_norm": 0.20876964926719666, + "learning_rate": 0.00012323182487513853, + "loss": 1.8668548583984375, + "step": 5660 + }, + { + "epoch": 0.01716283504608539, + "grad_norm": 0.23269157111644745, + "learning_rate": 0.00012322802969350115, + "loss": 1.8139965057373046, + "step": 5670 + }, + { + "epoch": 0.01719310459643122, + "grad_norm": 0.25825366377830505, + "learning_rate": 0.00012322423451186374, + "loss": 1.8379072189331054, + "step": 5680 + }, + { + "epoch": 0.01722337414677705, + "grad_norm": 0.2793043851852417, + "learning_rate": 0.00012322043933022636, + "loss": 1.9015451431274415, + "step": 5690 + }, + { + "epoch": 0.01725364369712288, + "grad_norm": 0.20909543335437775, + "learning_rate": 0.00012321664414858895, + "loss": 1.8493988037109375, + "step": 5700 + }, + { + "epoch": 0.01728391324746871, + "grad_norm": 0.23394574224948883, + "learning_rate": 0.00012321284896695157, + "loss": 1.8877681732177733, + "step": 5710 + }, + { + "epoch": 0.017314182797814538, + "grad_norm": 0.2578103542327881, + "learning_rate": 0.00012320905378531417, + "loss": 1.869552230834961, + "step": 5720 + }, + { + "epoch": 0.01734445234816037, + "grad_norm": 0.23299092054367065, + "learning_rate": 0.00012320525860367679, + "loss": 1.8627557754516602, + "step": 5730 + }, + { + "epoch": 0.0173747218985062, + "grad_norm": 0.22221864759922028, + "learning_rate": 0.00012320146342203938, + "loss": 1.8214000701904296, + "step": 5740 + }, + { + "epoch": 0.017404991448852027, + "grad_norm": 0.32061871886253357, + "learning_rate": 0.000123197668240402, + "loss": 1.8399248123168945, + "step": 5750 + }, + { + "epoch": 0.017435260999197857, + "grad_norm": 0.23021167516708374, + "learning_rate": 0.0001231938730587646, + "loss": 1.8707616806030274, + "step": 5760 + }, + { + "epoch": 0.017465530549543685, + "grad_norm": 0.207881897687912, + "learning_rate": 0.0001231900778771272, + "loss": 1.859281349182129, + "step": 5770 + }, + { + "epoch": 0.017495800099889516, + "grad_norm": 0.29350048303604126, + "learning_rate": 0.0001231862826954898, + "loss": 1.8165010452270507, + "step": 5780 + }, + { + "epoch": 0.017526069650235346, + "grad_norm": 0.2479042112827301, + "learning_rate": 0.00012318248751385242, + "loss": 1.8554283142089845, + "step": 5790 + }, + { + "epoch": 0.017556339200581174, + "grad_norm": 0.24666979908943176, + "learning_rate": 0.00012317869233221504, + "loss": 1.834951400756836, + "step": 5800 + }, + { + "epoch": 0.017586608750927005, + "grad_norm": 0.2244744449853897, + "learning_rate": 0.00012317489715057763, + "loss": 1.8664566040039063, + "step": 5810 + }, + { + "epoch": 0.017616878301272836, + "grad_norm": 0.23000025749206543, + "learning_rate": 0.00012317110196894025, + "loss": 1.9252750396728515, + "step": 5820 + }, + { + "epoch": 0.017647147851618663, + "grad_norm": 0.28660231828689575, + "learning_rate": 0.00012316730678730285, + "loss": 1.8050323486328126, + "step": 5830 + }, + { + "epoch": 0.017677417401964494, + "grad_norm": 0.23037204146385193, + "learning_rate": 0.00012316351160566547, + "loss": 1.8776996612548829, + "step": 5840 + }, + { + "epoch": 0.017707686952310325, + "grad_norm": 0.31796780228614807, + "learning_rate": 0.00012315971642402806, + "loss": 1.8262557983398438, + "step": 5850 + }, + { + "epoch": 0.017737956502656152, + "grad_norm": 0.30971595644950867, + "learning_rate": 0.00012315592124239068, + "loss": 1.840397834777832, + "step": 5860 + }, + { + "epoch": 0.017768226053001983, + "grad_norm": 0.23678290843963623, + "learning_rate": 0.00012315212606075327, + "loss": 1.8204227447509767, + "step": 5870 + }, + { + "epoch": 0.017798495603347814, + "grad_norm": 0.2065805196762085, + "learning_rate": 0.0001231483308791159, + "loss": 1.830202293395996, + "step": 5880 + }, + { + "epoch": 0.01782876515369364, + "grad_norm": 0.26590782403945923, + "learning_rate": 0.00012314453569747848, + "loss": 1.8260190963745118, + "step": 5890 + }, + { + "epoch": 0.017859034704039472, + "grad_norm": 0.18731461465358734, + "learning_rate": 0.0001231407405158411, + "loss": 1.8655424118041992, + "step": 5900 + }, + { + "epoch": 0.017889304254385303, + "grad_norm": 0.21137437224388123, + "learning_rate": 0.0001231369453342037, + "loss": 1.841649627685547, + "step": 5910 + }, + { + "epoch": 0.01791957380473113, + "grad_norm": 0.2290867418050766, + "learning_rate": 0.0001231331501525663, + "loss": 1.898290252685547, + "step": 5920 + }, + { + "epoch": 0.01794984335507696, + "grad_norm": 0.22258107364177704, + "learning_rate": 0.0001231293549709289, + "loss": 1.8232282638549804, + "step": 5930 + }, + { + "epoch": 0.01798011290542279, + "grad_norm": 0.19661648571491241, + "learning_rate": 0.00012312555978929152, + "loss": 1.8326240539550782, + "step": 5940 + }, + { + "epoch": 0.01801038245576862, + "grad_norm": 0.22036531567573547, + "learning_rate": 0.00012312176460765412, + "loss": 1.8623146057128905, + "step": 5950 + }, + { + "epoch": 0.01804065200611445, + "grad_norm": 0.20838315784931183, + "learning_rate": 0.00012311796942601674, + "loss": 1.8690370559692382, + "step": 5960 + }, + { + "epoch": 0.018070921556460277, + "grad_norm": 0.2170795053243637, + "learning_rate": 0.00012311417424437933, + "loss": 1.8661581039428712, + "step": 5970 + }, + { + "epoch": 0.018101191106806108, + "grad_norm": 0.19695264101028442, + "learning_rate": 0.00012311037906274195, + "loss": 1.8907184600830078, + "step": 5980 + }, + { + "epoch": 0.01813146065715194, + "grad_norm": 0.20342952013015747, + "learning_rate": 0.00012310658388110457, + "loss": 1.8140668869018555, + "step": 5990 + }, + { + "epoch": 0.018161730207497766, + "grad_norm": 0.24226219952106476, + "learning_rate": 0.00012310278869946716, + "loss": 1.8727760314941406, + "step": 6000 + }, + { + "epoch": 0.018161730207497766, + "eval_loss": 1.8633904457092285, + "eval_runtime": 28.3205, + "eval_samples_per_second": 17.655, + "eval_steps_per_second": 1.13, + "step": 6000 + }, + { + "epoch": 0.018191999757843597, + "grad_norm": 0.25276055932044983, + "learning_rate": 0.00012309899351782978, + "loss": 1.8126571655273438, + "step": 6010 + }, + { + "epoch": 0.018222269308189428, + "grad_norm": 0.21117310225963593, + "learning_rate": 0.00012309519833619237, + "loss": 1.835637664794922, + "step": 6020 + }, + { + "epoch": 0.018252538858535255, + "grad_norm": 0.24854497611522675, + "learning_rate": 0.000123091403154555, + "loss": 1.8853317260742188, + "step": 6030 + }, + { + "epoch": 0.018282808408881086, + "grad_norm": 0.3813914358615875, + "learning_rate": 0.00012308760797291758, + "loss": 1.821036148071289, + "step": 6040 + }, + { + "epoch": 0.018313077959226917, + "grad_norm": 0.26731252670288086, + "learning_rate": 0.0001230838127912802, + "loss": 1.8074243545532227, + "step": 6050 + }, + { + "epoch": 0.018343347509572745, + "grad_norm": 0.28862103819847107, + "learning_rate": 0.0001230800176096428, + "loss": 1.8315677642822266, + "step": 6060 + }, + { + "epoch": 0.018373617059918575, + "grad_norm": 0.23154671490192413, + "learning_rate": 0.00012307622242800542, + "loss": 1.8506645202636718, + "step": 6070 + }, + { + "epoch": 0.018403886610264406, + "grad_norm": 0.43288159370422363, + "learning_rate": 0.000123072427246368, + "loss": 1.8638360977172852, + "step": 6080 + }, + { + "epoch": 0.018434156160610234, + "grad_norm": 0.2719382643699646, + "learning_rate": 0.00012306863206473063, + "loss": 1.8539705276489258, + "step": 6090 + }, + { + "epoch": 0.018464425710956064, + "grad_norm": 0.22034567594528198, + "learning_rate": 0.00012306483688309322, + "loss": 1.8523931503295898, + "step": 6100 + }, + { + "epoch": 0.018494695261301892, + "grad_norm": 0.2449348270893097, + "learning_rate": 0.00012306104170145584, + "loss": 1.8393508911132812, + "step": 6110 + }, + { + "epoch": 0.018524964811647723, + "grad_norm": 0.29956138134002686, + "learning_rate": 0.00012305724651981843, + "loss": 1.884957504272461, + "step": 6120 + }, + { + "epoch": 0.018555234361993553, + "grad_norm": 0.3209877908229828, + "learning_rate": 0.00012305345133818105, + "loss": 1.9046735763549805, + "step": 6130 + }, + { + "epoch": 0.01858550391233938, + "grad_norm": 0.2689042389392853, + "learning_rate": 0.00012304965615654364, + "loss": 1.8400609970092774, + "step": 6140 + }, + { + "epoch": 0.01861577346268521, + "grad_norm": 0.271123468875885, + "learning_rate": 0.00012304586097490626, + "loss": 1.8367677688598634, + "step": 6150 + }, + { + "epoch": 0.018646043013031043, + "grad_norm": 0.44027626514434814, + "learning_rate": 0.00012304206579326886, + "loss": 1.8688486099243165, + "step": 6160 + }, + { + "epoch": 0.01867631256337687, + "grad_norm": 0.3534659743309021, + "learning_rate": 0.00012303827061163147, + "loss": 1.8421443939208983, + "step": 6170 + }, + { + "epoch": 0.0187065821137227, + "grad_norm": 0.2321607619524002, + "learning_rate": 0.0001230344754299941, + "loss": 1.7885398864746094, + "step": 6180 + }, + { + "epoch": 0.01873685166406853, + "grad_norm": 0.21565553545951843, + "learning_rate": 0.0001230306802483567, + "loss": 1.8375547409057618, + "step": 6190 + }, + { + "epoch": 0.01876712121441436, + "grad_norm": 0.2381637543439865, + "learning_rate": 0.0001230268850667193, + "loss": 1.7880277633666992, + "step": 6200 + }, + { + "epoch": 0.01879739076476019, + "grad_norm": 0.2030518501996994, + "learning_rate": 0.0001230230898850819, + "loss": 1.8668819427490235, + "step": 6210 + }, + { + "epoch": 0.01882766031510602, + "grad_norm": 0.2862457036972046, + "learning_rate": 0.00012301929470344452, + "loss": 1.8594593048095702, + "step": 6220 + }, + { + "epoch": 0.018857929865451848, + "grad_norm": 0.29942235350608826, + "learning_rate": 0.0001230154995218071, + "loss": 1.7902561187744142, + "step": 6230 + }, + { + "epoch": 0.01888819941579768, + "grad_norm": 0.28895333409309387, + "learning_rate": 0.00012301170434016973, + "loss": 1.8038532257080078, + "step": 6240 + }, + { + "epoch": 0.018918468966143506, + "grad_norm": 0.2944753170013428, + "learning_rate": 0.00012300790915853232, + "loss": 1.8358541488647462, + "step": 6250 + }, + { + "epoch": 0.018948738516489337, + "grad_norm": 0.27628955245018005, + "learning_rate": 0.00012300411397689494, + "loss": 1.8457122802734376, + "step": 6260 + }, + { + "epoch": 0.018979008066835168, + "grad_norm": 0.31969892978668213, + "learning_rate": 0.00012300031879525753, + "loss": 1.895846176147461, + "step": 6270 + }, + { + "epoch": 0.019009277617180995, + "grad_norm": 0.25154152512550354, + "learning_rate": 0.00012299652361362015, + "loss": 1.859012985229492, + "step": 6280 + }, + { + "epoch": 0.019039547167526826, + "grad_norm": 0.21850022673606873, + "learning_rate": 0.00012299272843198275, + "loss": 1.9081188201904298, + "step": 6290 + }, + { + "epoch": 0.019069816717872657, + "grad_norm": 0.24895280599594116, + "learning_rate": 0.00012298893325034537, + "loss": 1.8485198974609376, + "step": 6300 + }, + { + "epoch": 0.019100086268218484, + "grad_norm": 0.27901172637939453, + "learning_rate": 0.00012298513806870796, + "loss": 1.8485441207885742, + "step": 6310 + }, + { + "epoch": 0.019130355818564315, + "grad_norm": 0.29009586572647095, + "learning_rate": 0.00012298134288707058, + "loss": 1.8434883117675782, + "step": 6320 + }, + { + "epoch": 0.019160625368910146, + "grad_norm": 0.25638070702552795, + "learning_rate": 0.00012297754770543317, + "loss": 1.8703422546386719, + "step": 6330 + }, + { + "epoch": 0.019190894919255973, + "grad_norm": 0.27668607234954834, + "learning_rate": 0.0001229737525237958, + "loss": 1.8345870971679688, + "step": 6340 + }, + { + "epoch": 0.019221164469601804, + "grad_norm": 0.27874910831451416, + "learning_rate": 0.00012296995734215838, + "loss": 1.8233196258544921, + "step": 6350 + }, + { + "epoch": 0.019251434019947635, + "grad_norm": 0.26419076323509216, + "learning_rate": 0.000122966162160521, + "loss": 1.8977561950683595, + "step": 6360 + }, + { + "epoch": 0.019281703570293462, + "grad_norm": 0.4617539346218109, + "learning_rate": 0.0001229623669788836, + "loss": 1.821173858642578, + "step": 6370 + }, + { + "epoch": 0.019311973120639293, + "grad_norm": 0.24737511575222015, + "learning_rate": 0.0001229585717972462, + "loss": 1.856190299987793, + "step": 6380 + }, + { + "epoch": 0.019342242670985124, + "grad_norm": 0.26688069105148315, + "learning_rate": 0.00012295477661560883, + "loss": 1.814908790588379, + "step": 6390 + }, + { + "epoch": 0.01937251222133095, + "grad_norm": 0.28824689984321594, + "learning_rate": 0.00012295098143397143, + "loss": 1.8601123809814453, + "step": 6400 + }, + { + "epoch": 0.019402781771676782, + "grad_norm": 0.28268033266067505, + "learning_rate": 0.00012294718625233404, + "loss": 1.8646556854248046, + "step": 6410 + }, + { + "epoch": 0.01943305132202261, + "grad_norm": 0.23650816082954407, + "learning_rate": 0.00012294339107069664, + "loss": 1.8755260467529298, + "step": 6420 + }, + { + "epoch": 0.01946332087236844, + "grad_norm": 0.310432106256485, + "learning_rate": 0.00012293959588905926, + "loss": 1.8687093734741211, + "step": 6430 + }, + { + "epoch": 0.01949359042271427, + "grad_norm": 0.2796304523944855, + "learning_rate": 0.00012293580070742185, + "loss": 1.8640806198120117, + "step": 6440 + }, + { + "epoch": 0.0195238599730601, + "grad_norm": 0.24441856145858765, + "learning_rate": 0.00012293200552578447, + "loss": 1.8595924377441406, + "step": 6450 + }, + { + "epoch": 0.01955412952340593, + "grad_norm": 0.28475576639175415, + "learning_rate": 0.00012292821034414706, + "loss": 1.8307975769042968, + "step": 6460 + }, + { + "epoch": 0.01958439907375176, + "grad_norm": 0.24744892120361328, + "learning_rate": 0.00012292441516250968, + "loss": 1.8365306854248047, + "step": 6470 + }, + { + "epoch": 0.019614668624097588, + "grad_norm": 0.2056393325328827, + "learning_rate": 0.00012292061998087227, + "loss": 1.842251968383789, + "step": 6480 + }, + { + "epoch": 0.01964493817444342, + "grad_norm": 0.22604092955589294, + "learning_rate": 0.0001229168247992349, + "loss": 1.794837188720703, + "step": 6490 + }, + { + "epoch": 0.01967520772478925, + "grad_norm": 0.2148168534040451, + "learning_rate": 0.00012291302961759748, + "loss": 1.8496208190917969, + "step": 6500 + }, + { + "epoch": 0.01967520772478925, + "eval_loss": 1.8404966592788696, + "eval_runtime": 28.3416, + "eval_samples_per_second": 17.642, + "eval_steps_per_second": 1.129, + "step": 6500 + }, + { + "epoch": 0.019705477275135077, + "grad_norm": 0.23835286498069763, + "learning_rate": 0.0001229092344359601, + "loss": 1.8446155548095704, + "step": 6510 + }, + { + "epoch": 0.019735746825480908, + "grad_norm": 0.25797292590141296, + "learning_rate": 0.0001229054392543227, + "loss": 1.8697132110595702, + "step": 6520 + }, + { + "epoch": 0.01976601637582674, + "grad_norm": 0.26177841424942017, + "learning_rate": 0.00012290164407268532, + "loss": 1.8707530975341797, + "step": 6530 + }, + { + "epoch": 0.019796285926172566, + "grad_norm": 0.25804197788238525, + "learning_rate": 0.00012289784889104794, + "loss": 1.818470573425293, + "step": 6540 + }, + { + "epoch": 0.019826555476518397, + "grad_norm": 0.20585913956165314, + "learning_rate": 0.00012289405370941053, + "loss": 1.8400741577148438, + "step": 6550 + }, + { + "epoch": 0.019856825026864224, + "grad_norm": 0.23863548040390015, + "learning_rate": 0.00012289025852777315, + "loss": 1.8273506164550781, + "step": 6560 + }, + { + "epoch": 0.019887094577210055, + "grad_norm": 0.25192201137542725, + "learning_rate": 0.00012288646334613574, + "loss": 1.8322021484375, + "step": 6570 + }, + { + "epoch": 0.019917364127555886, + "grad_norm": 0.26560500264167786, + "learning_rate": 0.00012288266816449836, + "loss": 1.846490478515625, + "step": 6580 + }, + { + "epoch": 0.019947633677901713, + "grad_norm": 0.24061128497123718, + "learning_rate": 0.00012287887298286095, + "loss": 1.831591796875, + "step": 6590 + }, + { + "epoch": 0.019977903228247544, + "grad_norm": 0.32109883427619934, + "learning_rate": 0.00012287507780122357, + "loss": 1.8442148208618163, + "step": 6600 + }, + { + "epoch": 0.020008172778593375, + "grad_norm": 0.2683485448360443, + "learning_rate": 0.00012287128261958616, + "loss": 1.8334827423095703, + "step": 6610 + }, + { + "epoch": 0.020038442328939202, + "grad_norm": 0.2258332520723343, + "learning_rate": 0.00012286748743794878, + "loss": 1.844548225402832, + "step": 6620 + }, + { + "epoch": 0.020068711879285033, + "grad_norm": 0.24002057313919067, + "learning_rate": 0.00012286369225631138, + "loss": 1.8512149810791017, + "step": 6630 + }, + { + "epoch": 0.020098981429630864, + "grad_norm": 0.25613611936569214, + "learning_rate": 0.000122859897074674, + "loss": 1.821291732788086, + "step": 6640 + }, + { + "epoch": 0.02012925097997669, + "grad_norm": 0.30198177695274353, + "learning_rate": 0.0001228561018930366, + "loss": 1.8834835052490235, + "step": 6650 + }, + { + "epoch": 0.020159520530322522, + "grad_norm": 0.25182902812957764, + "learning_rate": 0.0001228523067113992, + "loss": 1.8647573471069336, + "step": 6660 + }, + { + "epoch": 0.020189790080668353, + "grad_norm": 0.3054657280445099, + "learning_rate": 0.00012284851152976183, + "loss": 1.817154312133789, + "step": 6670 + }, + { + "epoch": 0.02022005963101418, + "grad_norm": 0.23358261585235596, + "learning_rate": 0.00012284471634812442, + "loss": 1.8315048217773438, + "step": 6680 + }, + { + "epoch": 0.02025032918136001, + "grad_norm": 0.27083149552345276, + "learning_rate": 0.00012284092116648704, + "loss": 1.823702621459961, + "step": 6690 + }, + { + "epoch": 0.020280598731705842, + "grad_norm": 0.2612442672252655, + "learning_rate": 0.00012283712598484963, + "loss": 1.8200311660766602, + "step": 6700 + }, + { + "epoch": 0.02031086828205167, + "grad_norm": 0.2225186675786972, + "learning_rate": 0.00012283333080321225, + "loss": 1.844890022277832, + "step": 6710 + }, + { + "epoch": 0.0203411378323975, + "grad_norm": 0.24667301774024963, + "learning_rate": 0.00012282953562157484, + "loss": 1.824467086791992, + "step": 6720 + }, + { + "epoch": 0.020371407382743328, + "grad_norm": 0.25109490752220154, + "learning_rate": 0.00012282574043993746, + "loss": 1.841268539428711, + "step": 6730 + }, + { + "epoch": 0.02040167693308916, + "grad_norm": 0.1909966915845871, + "learning_rate": 0.00012282194525830005, + "loss": 1.9139432907104492, + "step": 6740 + }, + { + "epoch": 0.02043194648343499, + "grad_norm": 0.24838218092918396, + "learning_rate": 0.00012281815007666267, + "loss": 1.814112663269043, + "step": 6750 + }, + { + "epoch": 0.020462216033780817, + "grad_norm": 0.22963310778141022, + "learning_rate": 0.00012281435489502527, + "loss": 1.8453838348388671, + "step": 6760 + }, + { + "epoch": 0.020492485584126648, + "grad_norm": 0.22465305030345917, + "learning_rate": 0.00012281055971338789, + "loss": 1.8640630722045899, + "step": 6770 + }, + { + "epoch": 0.02052275513447248, + "grad_norm": 0.25553643703460693, + "learning_rate": 0.0001228067645317505, + "loss": 1.8147336959838867, + "step": 6780 + }, + { + "epoch": 0.020553024684818306, + "grad_norm": 0.2442479282617569, + "learning_rate": 0.0001228029693501131, + "loss": 1.862268829345703, + "step": 6790 + }, + { + "epoch": 0.020583294235164137, + "grad_norm": 0.27190813422203064, + "learning_rate": 0.00012279917416847572, + "loss": 1.849327850341797, + "step": 6800 + }, + { + "epoch": 0.020613563785509968, + "grad_norm": 0.30125993490219116, + "learning_rate": 0.0001227953789868383, + "loss": 1.8144601821899413, + "step": 6810 + }, + { + "epoch": 0.020643833335855795, + "grad_norm": 0.20705290138721466, + "learning_rate": 0.00012279158380520093, + "loss": 1.8273654937744142, + "step": 6820 + }, + { + "epoch": 0.020674102886201626, + "grad_norm": 0.2528172731399536, + "learning_rate": 0.00012278778862356352, + "loss": 1.8092430114746094, + "step": 6830 + }, + { + "epoch": 0.020704372436547457, + "grad_norm": 0.29034164547920227, + "learning_rate": 0.00012278399344192614, + "loss": 1.8674217224121095, + "step": 6840 + }, + { + "epoch": 0.020734641986893284, + "grad_norm": 0.22404071688652039, + "learning_rate": 0.00012278019826028873, + "loss": 1.8363109588623048, + "step": 6850 + }, + { + "epoch": 0.020764911537239115, + "grad_norm": 0.23020470142364502, + "learning_rate": 0.00012277640307865135, + "loss": 1.845700454711914, + "step": 6860 + }, + { + "epoch": 0.020795181087584946, + "grad_norm": 0.2505245506763458, + "learning_rate": 0.00012277260789701395, + "loss": 1.8242191314697265, + "step": 6870 + }, + { + "epoch": 0.020825450637930773, + "grad_norm": 0.2691003084182739, + "learning_rate": 0.00012276881271537657, + "loss": 1.8640363693237305, + "step": 6880 + }, + { + "epoch": 0.020855720188276604, + "grad_norm": 0.3238118290901184, + "learning_rate": 0.00012276501753373916, + "loss": 1.8493711471557617, + "step": 6890 + }, + { + "epoch": 0.02088598973862243, + "grad_norm": 0.20476210117340088, + "learning_rate": 0.00012276122235210178, + "loss": 1.79691162109375, + "step": 6900 + }, + { + "epoch": 0.020916259288968262, + "grad_norm": 0.2234606146812439, + "learning_rate": 0.0001227574271704644, + "loss": 1.8573738098144532, + "step": 6910 + }, + { + "epoch": 0.020946528839314093, + "grad_norm": 0.30502432584762573, + "learning_rate": 0.000122753631988827, + "loss": 1.8436855316162108, + "step": 6920 + }, + { + "epoch": 0.02097679838965992, + "grad_norm": 0.45579543709754944, + "learning_rate": 0.0001227498368071896, + "loss": 1.8738540649414062, + "step": 6930 + }, + { + "epoch": 0.02100706794000575, + "grad_norm": 0.2401237189769745, + "learning_rate": 0.0001227460416255522, + "loss": 1.83262939453125, + "step": 6940 + }, + { + "epoch": 0.021037337490351582, + "grad_norm": 0.24324019253253937, + "learning_rate": 0.00012274224644391482, + "loss": 1.8350664138793946, + "step": 6950 + }, + { + "epoch": 0.02106760704069741, + "grad_norm": 0.21258684992790222, + "learning_rate": 0.0001227384512622774, + "loss": 1.8342929840087892, + "step": 6960 + }, + { + "epoch": 0.02109787659104324, + "grad_norm": 0.2532143294811249, + "learning_rate": 0.00012273465608064003, + "loss": 1.8415254592895507, + "step": 6970 + }, + { + "epoch": 0.02112814614138907, + "grad_norm": 0.18937240540981293, + "learning_rate": 0.00012273086089900262, + "loss": 1.8493858337402345, + "step": 6980 + }, + { + "epoch": 0.0211584156917349, + "grad_norm": 0.24343012273311615, + "learning_rate": 0.00012272706571736524, + "loss": 1.8543655395507812, + "step": 6990 + }, + { + "epoch": 0.02118868524208073, + "grad_norm": 0.24899309873580933, + "learning_rate": 0.00012272327053572784, + "loss": 1.8185108184814454, + "step": 7000 + }, + { + "epoch": 0.02118868524208073, + "eval_loss": 1.8241745233535767, + "eval_runtime": 28.126, + "eval_samples_per_second": 17.777, + "eval_steps_per_second": 1.138, + "step": 7000 + }, + { + "epoch": 0.02121895479242656, + "grad_norm": 0.2038814276456833, + "learning_rate": 0.00012271947535409046, + "loss": 1.8342687606811523, + "step": 7010 + }, + { + "epoch": 0.021249224342772387, + "grad_norm": 0.21157386898994446, + "learning_rate": 0.00012271568017245305, + "loss": 1.8357662200927733, + "step": 7020 + }, + { + "epoch": 0.02127949389311822, + "grad_norm": 0.2425507754087448, + "learning_rate": 0.00012271188499081567, + "loss": 1.8508167266845703, + "step": 7030 + }, + { + "epoch": 0.021309763443464046, + "grad_norm": 0.25302112102508545, + "learning_rate": 0.0001227080898091783, + "loss": 1.8325750350952148, + "step": 7040 + }, + { + "epoch": 0.021340032993809877, + "grad_norm": 0.22466222941875458, + "learning_rate": 0.00012270429462754088, + "loss": 1.8303985595703125, + "step": 7050 + }, + { + "epoch": 0.021370302544155707, + "grad_norm": 0.24949897825717926, + "learning_rate": 0.0001227004994459035, + "loss": 1.8023599624633788, + "step": 7060 + }, + { + "epoch": 0.021400572094501535, + "grad_norm": 0.31988421082496643, + "learning_rate": 0.0001226967042642661, + "loss": 1.845266342163086, + "step": 7070 + }, + { + "epoch": 0.021430841644847366, + "grad_norm": 0.23856782913208008, + "learning_rate": 0.0001226929090826287, + "loss": 1.8088703155517578, + "step": 7080 + }, + { + "epoch": 0.021461111195193196, + "grad_norm": 0.25207221508026123, + "learning_rate": 0.0001226891139009913, + "loss": 1.7854413986206055, + "step": 7090 + }, + { + "epoch": 0.021491380745539024, + "grad_norm": 0.2812952697277069, + "learning_rate": 0.00012268531871935392, + "loss": 1.7975898742675782, + "step": 7100 + }, + { + "epoch": 0.021521650295884855, + "grad_norm": 0.24855318665504456, + "learning_rate": 0.00012268152353771652, + "loss": 1.8155899047851562, + "step": 7110 + }, + { + "epoch": 0.021551919846230685, + "grad_norm": 0.1948927491903305, + "learning_rate": 0.00012267772835607914, + "loss": 1.8275360107421874, + "step": 7120 + }, + { + "epoch": 0.021582189396576513, + "grad_norm": 0.1961018443107605, + "learning_rate": 0.00012267393317444173, + "loss": 1.8547653198242187, + "step": 7130 + }, + { + "epoch": 0.021612458946922344, + "grad_norm": 0.21102578938007355, + "learning_rate": 0.00012267013799280435, + "loss": 1.8024551391601562, + "step": 7140 + }, + { + "epoch": 0.021642728497268175, + "grad_norm": 0.21022699773311615, + "learning_rate": 0.00012266634281116694, + "loss": 1.829681396484375, + "step": 7150 + }, + { + "epoch": 0.021672998047614002, + "grad_norm": 0.2516497075557709, + "learning_rate": 0.00012266254762952956, + "loss": 1.85716552734375, + "step": 7160 + }, + { + "epoch": 0.021703267597959833, + "grad_norm": 0.2686770558357239, + "learning_rate": 0.00012265875244789215, + "loss": 1.8298610687255858, + "step": 7170 + }, + { + "epoch": 0.021733537148305664, + "grad_norm": 0.2340860813856125, + "learning_rate": 0.00012265495726625477, + "loss": 1.8123004913330079, + "step": 7180 + }, + { + "epoch": 0.02176380669865149, + "grad_norm": 0.19383245706558228, + "learning_rate": 0.0001226511620846174, + "loss": 1.832462692260742, + "step": 7190 + }, + { + "epoch": 0.021794076248997322, + "grad_norm": 0.21472038328647614, + "learning_rate": 0.00012264736690297998, + "loss": 1.7741214752197265, + "step": 7200 + }, + { + "epoch": 0.02182434579934315, + "grad_norm": 0.2028181403875351, + "learning_rate": 0.0001226435717213426, + "loss": 1.8448486328125, + "step": 7210 + }, + { + "epoch": 0.02185461534968898, + "grad_norm": 0.21208511292934418, + "learning_rate": 0.0001226397765397052, + "loss": 1.8872795104980469, + "step": 7220 + }, + { + "epoch": 0.02188488490003481, + "grad_norm": 0.22747129201889038, + "learning_rate": 0.00012263598135806781, + "loss": 1.8526178359985352, + "step": 7230 + }, + { + "epoch": 0.021915154450380638, + "grad_norm": 0.24180510640144348, + "learning_rate": 0.0001226321861764304, + "loss": 1.8334465026855469, + "step": 7240 + }, + { + "epoch": 0.02194542400072647, + "grad_norm": 0.24309557676315308, + "learning_rate": 0.00012262839099479303, + "loss": 1.820363998413086, + "step": 7250 + }, + { + "epoch": 0.0219756935510723, + "grad_norm": 0.1994132250547409, + "learning_rate": 0.00012262459581315562, + "loss": 1.8822036743164063, + "step": 7260 + }, + { + "epoch": 0.022005963101418127, + "grad_norm": 0.23749731481075287, + "learning_rate": 0.00012262080063151824, + "loss": 1.827754020690918, + "step": 7270 + }, + { + "epoch": 0.022036232651763958, + "grad_norm": 0.23714783787727356, + "learning_rate": 0.00012261700544988083, + "loss": 1.824763298034668, + "step": 7280 + }, + { + "epoch": 0.02206650220210979, + "grad_norm": 0.18288274109363556, + "learning_rate": 0.00012261321026824345, + "loss": 1.8070976257324218, + "step": 7290 + }, + { + "epoch": 0.022096771752455616, + "grad_norm": 0.20266231894493103, + "learning_rate": 0.00012260941508660604, + "loss": 1.8709224700927733, + "step": 7300 + }, + { + "epoch": 0.022127041302801447, + "grad_norm": 0.2162034958600998, + "learning_rate": 0.00012260561990496866, + "loss": 1.8060428619384765, + "step": 7310 + }, + { + "epoch": 0.022157310853147278, + "grad_norm": 0.19639430940151215, + "learning_rate": 0.00012260182472333125, + "loss": 1.850008773803711, + "step": 7320 + }, + { + "epoch": 0.022187580403493105, + "grad_norm": 0.20404842495918274, + "learning_rate": 0.00012259802954169387, + "loss": 1.7921585083007812, + "step": 7330 + }, + { + "epoch": 0.022217849953838936, + "grad_norm": 0.18643228709697723, + "learning_rate": 0.00012259423436005647, + "loss": 1.847456169128418, + "step": 7340 + }, + { + "epoch": 0.022248119504184764, + "grad_norm": 0.24789747595787048, + "learning_rate": 0.00012259043917841909, + "loss": 1.8138195037841798, + "step": 7350 + }, + { + "epoch": 0.022278389054530594, + "grad_norm": 0.2733500599861145, + "learning_rate": 0.00012258664399678168, + "loss": 1.7583673477172852, + "step": 7360 + }, + { + "epoch": 0.022308658604876425, + "grad_norm": 0.21805180609226227, + "learning_rate": 0.0001225828488151443, + "loss": 1.8305435180664062, + "step": 7370 + }, + { + "epoch": 0.022338928155222253, + "grad_norm": 0.19130976498126984, + "learning_rate": 0.0001225790536335069, + "loss": 1.8422369003295898, + "step": 7380 + }, + { + "epoch": 0.022369197705568084, + "grad_norm": 0.1909477263689041, + "learning_rate": 0.0001225752584518695, + "loss": 1.8270648956298827, + "step": 7390 + }, + { + "epoch": 0.022399467255913914, + "grad_norm": 0.18463557958602905, + "learning_rate": 0.00012257146327023213, + "loss": 1.8101734161376952, + "step": 7400 + }, + { + "epoch": 0.02242973680625974, + "grad_norm": 0.23548930883407593, + "learning_rate": 0.00012256766808859472, + "loss": 1.8527286529541016, + "step": 7410 + }, + { + "epoch": 0.022460006356605573, + "grad_norm": 0.23957738280296326, + "learning_rate": 0.00012256387290695734, + "loss": 1.8371932983398438, + "step": 7420 + }, + { + "epoch": 0.022490275906951403, + "grad_norm": 0.2070585936307907, + "learning_rate": 0.00012256007772531993, + "loss": 1.7964906692504883, + "step": 7430 + }, + { + "epoch": 0.02252054545729723, + "grad_norm": 0.20841321349143982, + "learning_rate": 0.00012255628254368255, + "loss": 1.8370723724365234, + "step": 7440 + }, + { + "epoch": 0.02255081500764306, + "grad_norm": 0.18264615535736084, + "learning_rate": 0.00012255248736204515, + "loss": 1.7853500366210937, + "step": 7450 + }, + { + "epoch": 0.022581084557988892, + "grad_norm": 0.236237034201622, + "learning_rate": 0.00012254869218040776, + "loss": 1.8441932678222657, + "step": 7460 + }, + { + "epoch": 0.02261135410833472, + "grad_norm": 0.21234890818595886, + "learning_rate": 0.00012254489699877036, + "loss": 1.8104103088378907, + "step": 7470 + }, + { + "epoch": 0.02264162365868055, + "grad_norm": 0.22114109992980957, + "learning_rate": 0.00012254110181713298, + "loss": 1.8334014892578125, + "step": 7480 + }, + { + "epoch": 0.02267189320902638, + "grad_norm": 0.19833935797214508, + "learning_rate": 0.00012253730663549557, + "loss": 1.8365512847900392, + "step": 7490 + }, + { + "epoch": 0.02270216275937221, + "grad_norm": 0.20741049945354462, + "learning_rate": 0.0001225335114538582, + "loss": 1.8216032028198241, + "step": 7500 + }, + { + "epoch": 0.02270216275937221, + "eval_loss": 1.795928716659546, + "eval_runtime": 27.8104, + "eval_samples_per_second": 17.979, + "eval_steps_per_second": 1.151, + "step": 7500 + }, + { + "epoch": 0.02273243230971804, + "grad_norm": 0.19699527323246002, + "learning_rate": 0.00012252971627222078, + "loss": 1.83348388671875, + "step": 7510 + }, + { + "epoch": 0.022762701860063867, + "grad_norm": 0.24201422929763794, + "learning_rate": 0.0001225259210905834, + "loss": 1.8118097305297851, + "step": 7520 + }, + { + "epoch": 0.022792971410409698, + "grad_norm": 0.21539416909217834, + "learning_rate": 0.000122522125908946, + "loss": 1.8063438415527344, + "step": 7530 + }, + { + "epoch": 0.02282324096075553, + "grad_norm": 0.19895049929618835, + "learning_rate": 0.0001225183307273086, + "loss": 1.904414176940918, + "step": 7540 + }, + { + "epoch": 0.022853510511101356, + "grad_norm": 0.20179982483386993, + "learning_rate": 0.0001225145355456712, + "loss": 1.8083698272705078, + "step": 7550 + }, + { + "epoch": 0.022883780061447187, + "grad_norm": 0.2164190411567688, + "learning_rate": 0.00012251074036403382, + "loss": 1.7967960357666015, + "step": 7560 + }, + { + "epoch": 0.022914049611793018, + "grad_norm": 0.23189975321292877, + "learning_rate": 0.00012250694518239642, + "loss": 1.7533887863159179, + "step": 7570 + }, + { + "epoch": 0.022944319162138845, + "grad_norm": 0.23545615375041962, + "learning_rate": 0.00012250315000075904, + "loss": 1.8389610290527343, + "step": 7580 + }, + { + "epoch": 0.022974588712484676, + "grad_norm": 0.2261921912431717, + "learning_rate": 0.00012249935481912163, + "loss": 1.7833084106445312, + "step": 7590 + }, + { + "epoch": 0.023004858262830507, + "grad_norm": 0.200758159160614, + "learning_rate": 0.00012249555963748425, + "loss": 1.8214902877807617, + "step": 7600 + }, + { + "epoch": 0.023035127813176334, + "grad_norm": 0.18078924715518951, + "learning_rate": 0.00012249176445584687, + "loss": 1.8385560989379883, + "step": 7610 + }, + { + "epoch": 0.023065397363522165, + "grad_norm": 0.23380646109580994, + "learning_rate": 0.00012248796927420946, + "loss": 1.8474555969238282, + "step": 7620 + }, + { + "epoch": 0.023095666913867996, + "grad_norm": 0.2023615539073944, + "learning_rate": 0.00012248417409257208, + "loss": 1.7881818771362306, + "step": 7630 + }, + { + "epoch": 0.023125936464213823, + "grad_norm": 0.19560493528842926, + "learning_rate": 0.00012248037891093467, + "loss": 1.8165767669677735, + "step": 7640 + }, + { + "epoch": 0.023156206014559654, + "grad_norm": 0.1980629414319992, + "learning_rate": 0.0001224765837292973, + "loss": 1.8116657257080078, + "step": 7650 + }, + { + "epoch": 0.023186475564905485, + "grad_norm": 0.20965147018432617, + "learning_rate": 0.00012247278854765988, + "loss": 1.858496856689453, + "step": 7660 + }, + { + "epoch": 0.023216745115251312, + "grad_norm": 0.20566339790821075, + "learning_rate": 0.0001224689933660225, + "loss": 1.885434341430664, + "step": 7670 + }, + { + "epoch": 0.023247014665597143, + "grad_norm": 0.25994664430618286, + "learning_rate": 0.0001224651981843851, + "loss": 1.80203857421875, + "step": 7680 + }, + { + "epoch": 0.02327728421594297, + "grad_norm": 0.23856748640537262, + "learning_rate": 0.00012246140300274771, + "loss": 1.844719696044922, + "step": 7690 + }, + { + "epoch": 0.0233075537662888, + "grad_norm": 0.2181333601474762, + "learning_rate": 0.0001224576078211103, + "loss": 1.774932289123535, + "step": 7700 + }, + { + "epoch": 0.023337823316634632, + "grad_norm": 0.19703926146030426, + "learning_rate": 0.00012245381263947293, + "loss": 1.8325675964355468, + "step": 7710 + }, + { + "epoch": 0.02336809286698046, + "grad_norm": 0.20819276571273804, + "learning_rate": 0.00012245001745783552, + "loss": 1.8454681396484376, + "step": 7720 + }, + { + "epoch": 0.02339836241732629, + "grad_norm": 0.24658706784248352, + "learning_rate": 0.00012244622227619814, + "loss": 1.8260488510131836, + "step": 7730 + }, + { + "epoch": 0.02342863196767212, + "grad_norm": 0.25946682691574097, + "learning_rate": 0.00012244242709456073, + "loss": 1.8086275100708007, + "step": 7740 + }, + { + "epoch": 0.02345890151801795, + "grad_norm": 0.1999141424894333, + "learning_rate": 0.00012243863191292335, + "loss": 1.7867116928100586, + "step": 7750 + }, + { + "epoch": 0.02348917106836378, + "grad_norm": 0.19836241006851196, + "learning_rate": 0.00012243483673128594, + "loss": 1.8339958190917969, + "step": 7760 + }, + { + "epoch": 0.02351944061870961, + "grad_norm": 0.21631796658039093, + "learning_rate": 0.00012243104154964856, + "loss": 1.7928874969482422, + "step": 7770 + }, + { + "epoch": 0.023549710169055438, + "grad_norm": 0.27000126242637634, + "learning_rate": 0.00012242724636801118, + "loss": 1.863869857788086, + "step": 7780 + }, + { + "epoch": 0.02357997971940127, + "grad_norm": 0.20761846005916595, + "learning_rate": 0.00012242345118637377, + "loss": 1.913619041442871, + "step": 7790 + }, + { + "epoch": 0.0236102492697471, + "grad_norm": 0.21957893669605255, + "learning_rate": 0.0001224196560047364, + "loss": 1.7805500030517578, + "step": 7800 + }, + { + "epoch": 0.023640518820092927, + "grad_norm": 0.21224263310432434, + "learning_rate": 0.00012241586082309899, + "loss": 1.8387226104736327, + "step": 7810 + }, + { + "epoch": 0.023670788370438758, + "grad_norm": 0.2046273797750473, + "learning_rate": 0.0001224120656414616, + "loss": 1.7955577850341797, + "step": 7820 + }, + { + "epoch": 0.023701057920784585, + "grad_norm": 0.21075598895549774, + "learning_rate": 0.0001224082704598242, + "loss": 1.8216327667236327, + "step": 7830 + }, + { + "epoch": 0.023731327471130416, + "grad_norm": 0.25197941064834595, + "learning_rate": 0.00012240447527818682, + "loss": 1.818185043334961, + "step": 7840 + }, + { + "epoch": 0.023761597021476247, + "grad_norm": 0.28233882784843445, + "learning_rate": 0.0001224006800965494, + "loss": 1.7960668563842774, + "step": 7850 + }, + { + "epoch": 0.023791866571822074, + "grad_norm": 0.294552206993103, + "learning_rate": 0.00012239688491491203, + "loss": 1.8257959365844727, + "step": 7860 + }, + { + "epoch": 0.023822136122167905, + "grad_norm": 0.1865861415863037, + "learning_rate": 0.00012239308973327462, + "loss": 1.8513433456420898, + "step": 7870 + }, + { + "epoch": 0.023852405672513736, + "grad_norm": 0.2951979339122772, + "learning_rate": 0.00012238929455163724, + "loss": 1.8094532012939453, + "step": 7880 + }, + { + "epoch": 0.023882675222859563, + "grad_norm": 0.21469606459140778, + "learning_rate": 0.00012238549936999986, + "loss": 1.8292741775512695, + "step": 7890 + }, + { + "epoch": 0.023912944773205394, + "grad_norm": 0.230330690741539, + "learning_rate": 0.00012238170418836245, + "loss": 1.8206932067871093, + "step": 7900 + }, + { + "epoch": 0.023943214323551225, + "grad_norm": 0.20562703907489777, + "learning_rate": 0.00012237790900672507, + "loss": 1.822073745727539, + "step": 7910 + }, + { + "epoch": 0.023973483873897052, + "grad_norm": 0.23487725853919983, + "learning_rate": 0.00012237411382508767, + "loss": 1.836660385131836, + "step": 7920 + }, + { + "epoch": 0.024003753424242883, + "grad_norm": 0.19059844315052032, + "learning_rate": 0.00012237031864345028, + "loss": 1.8283145904541016, + "step": 7930 + }, + { + "epoch": 0.024034022974588714, + "grad_norm": 0.17904500663280487, + "learning_rate": 0.00012236652346181288, + "loss": 1.8221220016479491, + "step": 7940 + }, + { + "epoch": 0.02406429252493454, + "grad_norm": 0.21904247999191284, + "learning_rate": 0.0001223627282801755, + "loss": 1.7984203338623046, + "step": 7950 + }, + { + "epoch": 0.024094562075280372, + "grad_norm": 0.22441910207271576, + "learning_rate": 0.0001223589330985381, + "loss": 1.8681182861328125, + "step": 7960 + }, + { + "epoch": 0.024124831625626203, + "grad_norm": 0.20481789112091064, + "learning_rate": 0.0001223551379169007, + "loss": 1.8633394241333008, + "step": 7970 + }, + { + "epoch": 0.02415510117597203, + "grad_norm": 0.24019591510295868, + "learning_rate": 0.0001223513427352633, + "loss": 1.8135051727294922, + "step": 7980 + }, + { + "epoch": 0.02418537072631786, + "grad_norm": 0.1972866654396057, + "learning_rate": 0.00012234754755362592, + "loss": 1.7809494018554688, + "step": 7990 + }, + { + "epoch": 0.02421564027666369, + "grad_norm": 0.22989672422409058, + "learning_rate": 0.0001223437523719885, + "loss": 1.818352508544922, + "step": 8000 + }, + { + "epoch": 0.02421564027666369, + "eval_loss": 1.8210963010787964, + "eval_runtime": 28.1999, + "eval_samples_per_second": 17.731, + "eval_steps_per_second": 1.135, + "step": 8000 + }, + { + "epoch": 0.02424590982700952, + "grad_norm": 0.18287402391433716, + "learning_rate": 0.00012233995719035113, + "loss": 1.775103759765625, + "step": 8010 + }, + { + "epoch": 0.02427617937735535, + "grad_norm": 0.18002770841121674, + "learning_rate": 0.00012233616200871375, + "loss": 1.816849136352539, + "step": 8020 + }, + { + "epoch": 0.024306448927701178, + "grad_norm": 0.2167147994041443, + "learning_rate": 0.00012233236682707634, + "loss": 1.8373222351074219, + "step": 8030 + }, + { + "epoch": 0.02433671847804701, + "grad_norm": 0.20943914353847504, + "learning_rate": 0.00012232857164543896, + "loss": 1.8086132049560546, + "step": 8040 + }, + { + "epoch": 0.02436698802839284, + "grad_norm": 0.2062809020280838, + "learning_rate": 0.00012232477646380156, + "loss": 1.8259361267089844, + "step": 8050 + }, + { + "epoch": 0.024397257578738667, + "grad_norm": 0.19747717678546906, + "learning_rate": 0.00012232098128216418, + "loss": 1.8339122772216796, + "step": 8060 + }, + { + "epoch": 0.024427527129084498, + "grad_norm": 0.221780925989151, + "learning_rate": 0.00012231718610052677, + "loss": 1.7855224609375, + "step": 8070 + }, + { + "epoch": 0.02445779667943033, + "grad_norm": 0.20809388160705566, + "learning_rate": 0.0001223133909188894, + "loss": 1.81933536529541, + "step": 8080 + }, + { + "epoch": 0.024488066229776156, + "grad_norm": 0.2079768031835556, + "learning_rate": 0.00012230959573725198, + "loss": 1.818729782104492, + "step": 8090 + }, + { + "epoch": 0.024518335780121987, + "grad_norm": 0.24692712724208832, + "learning_rate": 0.0001223058005556146, + "loss": 1.8316942214965821, + "step": 8100 + }, + { + "epoch": 0.024548605330467817, + "grad_norm": 0.2004041075706482, + "learning_rate": 0.0001223020053739772, + "loss": 1.8429975509643555, + "step": 8110 + }, + { + "epoch": 0.024578874880813645, + "grad_norm": 0.21004225313663483, + "learning_rate": 0.0001222982101923398, + "loss": 1.849380874633789, + "step": 8120 + }, + { + "epoch": 0.024609144431159476, + "grad_norm": 0.2856302857398987, + "learning_rate": 0.0001222944150107024, + "loss": 1.8313854217529297, + "step": 8130 + }, + { + "epoch": 0.024639413981505303, + "grad_norm": 0.226398304104805, + "learning_rate": 0.00012229061982906502, + "loss": 1.8012054443359375, + "step": 8140 + }, + { + "epoch": 0.024669683531851134, + "grad_norm": 0.2504754066467285, + "learning_rate": 0.00012228682464742764, + "loss": 1.8021514892578125, + "step": 8150 + }, + { + "epoch": 0.024699953082196965, + "grad_norm": 0.23625099658966064, + "learning_rate": 0.00012228302946579024, + "loss": 1.8150196075439453, + "step": 8160 + }, + { + "epoch": 0.024730222632542792, + "grad_norm": 0.2102413922548294, + "learning_rate": 0.00012227923428415285, + "loss": 1.7970970153808594, + "step": 8170 + }, + { + "epoch": 0.024760492182888623, + "grad_norm": 0.20337338745594025, + "learning_rate": 0.00012227543910251545, + "loss": 1.7898998260498047, + "step": 8180 + }, + { + "epoch": 0.024790761733234454, + "grad_norm": 0.2089945524930954, + "learning_rate": 0.00012227164392087807, + "loss": 1.8498077392578125, + "step": 8190 + }, + { + "epoch": 0.02482103128358028, + "grad_norm": 0.24515454471111298, + "learning_rate": 0.00012226784873924066, + "loss": 1.8420064926147461, + "step": 8200 + }, + { + "epoch": 0.024851300833926112, + "grad_norm": 0.21030293405056, + "learning_rate": 0.00012226405355760328, + "loss": 1.8072559356689453, + "step": 8210 + }, + { + "epoch": 0.024881570384271943, + "grad_norm": 0.21372458338737488, + "learning_rate": 0.00012226025837596587, + "loss": 1.8173004150390626, + "step": 8220 + }, + { + "epoch": 0.02491183993461777, + "grad_norm": 0.19835703074932098, + "learning_rate": 0.0001222564631943285, + "loss": 1.7471446990966797, + "step": 8230 + }, + { + "epoch": 0.0249421094849636, + "grad_norm": 0.23791387677192688, + "learning_rate": 0.00012225266801269108, + "loss": 1.8918785095214843, + "step": 8240 + }, + { + "epoch": 0.024972379035309432, + "grad_norm": 0.19404584169387817, + "learning_rate": 0.0001222488728310537, + "loss": 1.842071533203125, + "step": 8250 + }, + { + "epoch": 0.02500264858565526, + "grad_norm": 0.23504756391048431, + "learning_rate": 0.00012224507764941632, + "loss": 1.8190093994140626, + "step": 8260 + }, + { + "epoch": 0.02503291813600109, + "grad_norm": 0.22359061241149902, + "learning_rate": 0.00012224128246777891, + "loss": 1.7919435501098633, + "step": 8270 + }, + { + "epoch": 0.02506318768634692, + "grad_norm": 0.19229893386363983, + "learning_rate": 0.00012223748728614153, + "loss": 1.8059741973876953, + "step": 8280 + }, + { + "epoch": 0.02509345723669275, + "grad_norm": 0.24494431912899017, + "learning_rate": 0.00012223369210450413, + "loss": 1.7925300598144531, + "step": 8290 + }, + { + "epoch": 0.02512372678703858, + "grad_norm": 0.22513502836227417, + "learning_rate": 0.00012222989692286675, + "loss": 1.8383285522460937, + "step": 8300 + }, + { + "epoch": 0.025153996337384407, + "grad_norm": 0.22519338130950928, + "learning_rate": 0.00012222610174122934, + "loss": 1.7946369171142578, + "step": 8310 + }, + { + "epoch": 0.025184265887730237, + "grad_norm": 0.22292353212833405, + "learning_rate": 0.00012222230655959196, + "loss": 1.8064815521240234, + "step": 8320 + }, + { + "epoch": 0.025214535438076068, + "grad_norm": 0.21169617772102356, + "learning_rate": 0.00012221851137795455, + "loss": 1.804694366455078, + "step": 8330 + }, + { + "epoch": 0.025244804988421896, + "grad_norm": 0.20836685597896576, + "learning_rate": 0.00012221471619631717, + "loss": 1.8607990264892578, + "step": 8340 + }, + { + "epoch": 0.025275074538767726, + "grad_norm": 0.22112831473350525, + "learning_rate": 0.00012221092101467976, + "loss": 1.8250297546386718, + "step": 8350 + }, + { + "epoch": 0.025305344089113557, + "grad_norm": 0.19712014496326447, + "learning_rate": 0.00012220712583304238, + "loss": 1.8247810363769532, + "step": 8360 + }, + { + "epoch": 0.025335613639459385, + "grad_norm": 0.19592556357383728, + "learning_rate": 0.00012220333065140497, + "loss": 1.8312469482421876, + "step": 8370 + }, + { + "epoch": 0.025365883189805215, + "grad_norm": 0.2150866985321045, + "learning_rate": 0.0001221995354697676, + "loss": 1.8149084091186523, + "step": 8380 + }, + { + "epoch": 0.025396152740151046, + "grad_norm": 0.21093247830867767, + "learning_rate": 0.00012219574028813019, + "loss": 1.8210159301757813, + "step": 8390 + }, + { + "epoch": 0.025426422290496874, + "grad_norm": 0.2250814437866211, + "learning_rate": 0.0001221919451064928, + "loss": 1.8130990982055664, + "step": 8400 + }, + { + "epoch": 0.025456691840842705, + "grad_norm": 0.24928945302963257, + "learning_rate": 0.00012218814992485542, + "loss": 1.7792308807373047, + "step": 8410 + }, + { + "epoch": 0.025486961391188535, + "grad_norm": 0.20292319357395172, + "learning_rate": 0.00012218435474321802, + "loss": 1.8133369445800782, + "step": 8420 + }, + { + "epoch": 0.025517230941534363, + "grad_norm": 0.4126882553100586, + "learning_rate": 0.00012218055956158064, + "loss": 1.8120363235473633, + "step": 8430 + }, + { + "epoch": 0.025547500491880194, + "grad_norm": 0.29910966753959656, + "learning_rate": 0.00012217676437994323, + "loss": 1.8034189224243165, + "step": 8440 + }, + { + "epoch": 0.025577770042226024, + "grad_norm": 0.22049354016780853, + "learning_rate": 0.00012217296919830585, + "loss": 1.775800895690918, + "step": 8450 + }, + { + "epoch": 0.025608039592571852, + "grad_norm": 0.19831055402755737, + "learning_rate": 0.00012216917401666844, + "loss": 1.8329397201538087, + "step": 8460 + }, + { + "epoch": 0.025638309142917683, + "grad_norm": 0.21179386973381042, + "learning_rate": 0.00012216537883503106, + "loss": 1.800735092163086, + "step": 8470 + }, + { + "epoch": 0.02566857869326351, + "grad_norm": 0.18978257477283478, + "learning_rate": 0.00012216158365339365, + "loss": 1.8093666076660155, + "step": 8480 + }, + { + "epoch": 0.02569884824360934, + "grad_norm": 0.2527192533016205, + "learning_rate": 0.00012215778847175627, + "loss": 1.8356266021728516, + "step": 8490 + }, + { + "epoch": 0.02572911779395517, + "grad_norm": 0.22178207337856293, + "learning_rate": 0.00012215399329011886, + "loss": 1.8264503479003906, + "step": 8500 + }, + { + "epoch": 0.02572911779395517, + "eval_loss": 1.829110860824585, + "eval_runtime": 28.4963, + "eval_samples_per_second": 17.546, + "eval_steps_per_second": 1.123, + "step": 8500 + }, + { + "epoch": 0.025759387344301, + "grad_norm": 0.18713848292827606, + "learning_rate": 0.00012215019810848148, + "loss": 1.8165231704711915, + "step": 8510 + }, + { + "epoch": 0.02578965689464683, + "grad_norm": 0.19015954434871674, + "learning_rate": 0.00012214640292684408, + "loss": 1.798746109008789, + "step": 8520 + }, + { + "epoch": 0.02581992644499266, + "grad_norm": 0.21424317359924316, + "learning_rate": 0.0001221426077452067, + "loss": 1.799159622192383, + "step": 8530 + }, + { + "epoch": 0.025850195995338488, + "grad_norm": 0.19944430887699127, + "learning_rate": 0.0001221388125635693, + "loss": 1.7842597961425781, + "step": 8540 + }, + { + "epoch": 0.02588046554568432, + "grad_norm": 0.2095043659210205, + "learning_rate": 0.0001221350173819319, + "loss": 1.7819953918457032, + "step": 8550 + }, + { + "epoch": 0.02591073509603015, + "grad_norm": 0.2516118288040161, + "learning_rate": 0.0001221312222002945, + "loss": 1.8238533020019532, + "step": 8560 + }, + { + "epoch": 0.025941004646375977, + "grad_norm": 0.24343332648277283, + "learning_rate": 0.00012212742701865712, + "loss": 1.8176218032836915, + "step": 8570 + }, + { + "epoch": 0.025971274196721808, + "grad_norm": 0.1739025115966797, + "learning_rate": 0.0001221236318370197, + "loss": 1.7740123748779297, + "step": 8580 + }, + { + "epoch": 0.02600154374706764, + "grad_norm": 0.19885504245758057, + "learning_rate": 0.00012211983665538233, + "loss": 1.8050455093383788, + "step": 8590 + }, + { + "epoch": 0.026031813297413466, + "grad_norm": 0.21868681907653809, + "learning_rate": 0.00012211604147374492, + "loss": 1.7529836654663087, + "step": 8600 + }, + { + "epoch": 0.026062082847759297, + "grad_norm": 0.19322271645069122, + "learning_rate": 0.00012211224629210754, + "loss": 1.8282230377197266, + "step": 8610 + }, + { + "epoch": 0.026092352398105124, + "grad_norm": 0.19754427671432495, + "learning_rate": 0.00012210845111047016, + "loss": 1.8051074981689452, + "step": 8620 + }, + { + "epoch": 0.026122621948450955, + "grad_norm": 0.21113812923431396, + "learning_rate": 0.00012210465592883276, + "loss": 1.8049243927001952, + "step": 8630 + }, + { + "epoch": 0.026152891498796786, + "grad_norm": 0.21462635695934296, + "learning_rate": 0.00012210086074719538, + "loss": 1.8085468292236329, + "step": 8640 + }, + { + "epoch": 0.026183161049142614, + "grad_norm": 0.25219517946243286, + "learning_rate": 0.00012209706556555797, + "loss": 1.8234172821044923, + "step": 8650 + }, + { + "epoch": 0.026213430599488444, + "grad_norm": 0.25821197032928467, + "learning_rate": 0.0001220932703839206, + "loss": 1.7828405380249024, + "step": 8660 + }, + { + "epoch": 0.026243700149834275, + "grad_norm": 0.22503645718097687, + "learning_rate": 0.00012208947520228318, + "loss": 1.8019550323486329, + "step": 8670 + }, + { + "epoch": 0.026273969700180103, + "grad_norm": 0.21593834459781647, + "learning_rate": 0.0001220856800206458, + "loss": 1.8248817443847656, + "step": 8680 + }, + { + "epoch": 0.026304239250525933, + "grad_norm": 0.22455234825611115, + "learning_rate": 0.0001220818848390084, + "loss": 1.8117586135864259, + "step": 8690 + }, + { + "epoch": 0.026334508800871764, + "grad_norm": 0.21863995492458344, + "learning_rate": 0.000122078089657371, + "loss": 1.7730846405029297, + "step": 8700 + }, + { + "epoch": 0.02636477835121759, + "grad_norm": 0.23124751448631287, + "learning_rate": 0.0001220742944757336, + "loss": 1.7887662887573241, + "step": 8710 + }, + { + "epoch": 0.026395047901563422, + "grad_norm": 0.2053178995847702, + "learning_rate": 0.00012207049929409622, + "loss": 1.815992546081543, + "step": 8720 + }, + { + "epoch": 0.026425317451909253, + "grad_norm": 0.1971340775489807, + "learning_rate": 0.00012206670411245882, + "loss": 1.8318918228149415, + "step": 8730 + }, + { + "epoch": 0.02645558700225508, + "grad_norm": 0.2284456491470337, + "learning_rate": 0.00012206290893082143, + "loss": 1.793014144897461, + "step": 8740 + }, + { + "epoch": 0.02648585655260091, + "grad_norm": 0.24421477317810059, + "learning_rate": 0.00012205911374918403, + "loss": 1.7972835540771483, + "step": 8750 + }, + { + "epoch": 0.026516126102946742, + "grad_norm": 0.2675950825214386, + "learning_rate": 0.00012205531856754665, + "loss": 1.829206657409668, + "step": 8760 + }, + { + "epoch": 0.02654639565329257, + "grad_norm": 0.3526819050312042, + "learning_rate": 0.00012205152338590925, + "loss": 1.776956558227539, + "step": 8770 + }, + { + "epoch": 0.0265766652036384, + "grad_norm": 0.25420787930488586, + "learning_rate": 0.00012204772820427186, + "loss": 1.8010639190673827, + "step": 8780 + }, + { + "epoch": 0.026606934753984228, + "grad_norm": 0.20218612253665924, + "learning_rate": 0.00012204393302263446, + "loss": 1.780923843383789, + "step": 8790 + }, + { + "epoch": 0.02663720430433006, + "grad_norm": 0.2195245325565338, + "learning_rate": 0.00012204013784099707, + "loss": 1.7891159057617188, + "step": 8800 + }, + { + "epoch": 0.02666747385467589, + "grad_norm": 0.1940789818763733, + "learning_rate": 0.00012203634265935968, + "loss": 1.8013034820556642, + "step": 8810 + }, + { + "epoch": 0.026697743405021717, + "grad_norm": 0.20817795395851135, + "learning_rate": 0.00012203254747772228, + "loss": 1.7889816284179687, + "step": 8820 + }, + { + "epoch": 0.026728012955367548, + "grad_norm": 0.19202837347984314, + "learning_rate": 0.0001220287522960849, + "loss": 1.792188835144043, + "step": 8830 + }, + { + "epoch": 0.02675828250571338, + "grad_norm": 0.2234664261341095, + "learning_rate": 0.0001220249571144475, + "loss": 1.8234642028808594, + "step": 8840 + }, + { + "epoch": 0.026788552056059206, + "grad_norm": 0.30848655104637146, + "learning_rate": 0.00012202116193281011, + "loss": 1.7992759704589845, + "step": 8850 + }, + { + "epoch": 0.026818821606405037, + "grad_norm": 0.25158455967903137, + "learning_rate": 0.0001220173667511727, + "loss": 1.7922191619873047, + "step": 8860 + }, + { + "epoch": 0.026849091156750868, + "grad_norm": 0.2032368928194046, + "learning_rate": 0.00012201357156953533, + "loss": 1.7697975158691406, + "step": 8870 + }, + { + "epoch": 0.026879360707096695, + "grad_norm": 0.18842406570911407, + "learning_rate": 0.00012200977638789793, + "loss": 1.8155811309814454, + "step": 8880 + }, + { + "epoch": 0.026909630257442526, + "grad_norm": 0.24386262893676758, + "learning_rate": 0.00012200598120626054, + "loss": 1.8534505844116211, + "step": 8890 + }, + { + "epoch": 0.026939899807788357, + "grad_norm": 0.25903838872909546, + "learning_rate": 0.00012200218602462314, + "loss": 1.7785507202148438, + "step": 8900 + }, + { + "epoch": 0.026970169358134184, + "grad_norm": 0.2719416916370392, + "learning_rate": 0.00012199839084298575, + "loss": 1.8134485244750977, + "step": 8910 + }, + { + "epoch": 0.027000438908480015, + "grad_norm": 0.22760528326034546, + "learning_rate": 0.00012199459566134836, + "loss": 1.8164848327636718, + "step": 8920 + }, + { + "epoch": 0.027030708458825842, + "grad_norm": 0.22754289209842682, + "learning_rate": 0.00012199080047971096, + "loss": 1.826169776916504, + "step": 8930 + }, + { + "epoch": 0.027060978009171673, + "grad_norm": 0.21410313248634338, + "learning_rate": 0.00012198700529807357, + "loss": 1.7940914154052734, + "step": 8940 + }, + { + "epoch": 0.027091247559517504, + "grad_norm": 0.20419107377529144, + "learning_rate": 0.00012198321011643617, + "loss": 1.811562728881836, + "step": 8950 + }, + { + "epoch": 0.02712151710986333, + "grad_norm": 0.18913941085338593, + "learning_rate": 0.00012197941493479878, + "loss": 1.8264135360717773, + "step": 8960 + }, + { + "epoch": 0.027151786660209162, + "grad_norm": 0.21212640404701233, + "learning_rate": 0.00012197561975316139, + "loss": 1.7720531463623046, + "step": 8970 + }, + { + "epoch": 0.027182056210554993, + "grad_norm": 0.21341852843761444, + "learning_rate": 0.00012197182457152399, + "loss": 1.8284603118896485, + "step": 8980 + }, + { + "epoch": 0.02721232576090082, + "grad_norm": 0.20826059579849243, + "learning_rate": 0.0001219680293898866, + "loss": 1.8082267761230468, + "step": 8990 + }, + { + "epoch": 0.02724259531124665, + "grad_norm": 0.2397451102733612, + "learning_rate": 0.0001219642342082492, + "loss": 1.8141908645629883, + "step": 9000 + }, + { + "epoch": 0.02724259531124665, + "eval_loss": 1.8413883447647095, + "eval_runtime": 27.9701, + "eval_samples_per_second": 17.876, + "eval_steps_per_second": 1.144, + "step": 9000 + }, + { + "epoch": 0.027272864861592482, + "grad_norm": 0.3028580844402313, + "learning_rate": 0.00012196043902661182, + "loss": 1.804438591003418, + "step": 9010 + }, + { + "epoch": 0.02730313441193831, + "grad_norm": 0.21089862287044525, + "learning_rate": 0.00012195664384497443, + "loss": 1.7953571319580077, + "step": 9020 + }, + { + "epoch": 0.02733340396228414, + "grad_norm": 0.19936352968215942, + "learning_rate": 0.00012195284866333703, + "loss": 1.8288698196411133, + "step": 9030 + }, + { + "epoch": 0.02736367351262997, + "grad_norm": 0.2508029043674469, + "learning_rate": 0.00012194905348169964, + "loss": 1.8077468872070312, + "step": 9040 + }, + { + "epoch": 0.0273939430629758, + "grad_norm": 0.21750284731388092, + "learning_rate": 0.00012194525830006225, + "loss": 1.8007923126220704, + "step": 9050 + }, + { + "epoch": 0.02742421261332163, + "grad_norm": 0.24437600374221802, + "learning_rate": 0.00012194146311842485, + "loss": 1.839505386352539, + "step": 9060 + }, + { + "epoch": 0.02745448216366746, + "grad_norm": 0.18949852883815765, + "learning_rate": 0.00012193766793678746, + "loss": 1.7501792907714844, + "step": 9070 + }, + { + "epoch": 0.027484751714013288, + "grad_norm": 0.20429013669490814, + "learning_rate": 0.00012193387275515006, + "loss": 1.8021713256835938, + "step": 9080 + }, + { + "epoch": 0.02751502126435912, + "grad_norm": 0.26944348216056824, + "learning_rate": 0.00012193007757351267, + "loss": 1.814869499206543, + "step": 9090 + }, + { + "epoch": 0.027545290814704946, + "grad_norm": 0.1983078420162201, + "learning_rate": 0.00012192628239187528, + "loss": 1.796541976928711, + "step": 9100 + }, + { + "epoch": 0.027575560365050777, + "grad_norm": 0.22966325283050537, + "learning_rate": 0.00012192248721023788, + "loss": 1.8117382049560546, + "step": 9110 + }, + { + "epoch": 0.027605829915396608, + "grad_norm": 0.1895318478345871, + "learning_rate": 0.0001219186920286005, + "loss": 1.7979364395141602, + "step": 9120 + }, + { + "epoch": 0.027636099465742435, + "grad_norm": 0.20740437507629395, + "learning_rate": 0.0001219148968469631, + "loss": 1.847325325012207, + "step": 9130 + }, + { + "epoch": 0.027666369016088266, + "grad_norm": 0.19957415759563446, + "learning_rate": 0.00012191110166532571, + "loss": 1.779805564880371, + "step": 9140 + }, + { + "epoch": 0.027696638566434097, + "grad_norm": 0.2867756485939026, + "learning_rate": 0.0001219073064836883, + "loss": 1.7684234619140624, + "step": 9150 + }, + { + "epoch": 0.027726908116779924, + "grad_norm": 0.21617378294467926, + "learning_rate": 0.00012190351130205093, + "loss": 1.7943819046020508, + "step": 9160 + }, + { + "epoch": 0.027757177667125755, + "grad_norm": 0.19442452490329742, + "learning_rate": 0.00012189971612041352, + "loss": 1.827237319946289, + "step": 9170 + }, + { + "epoch": 0.027787447217471586, + "grad_norm": 0.22950126230716705, + "learning_rate": 0.00012189592093877614, + "loss": 1.78143367767334, + "step": 9180 + }, + { + "epoch": 0.027817716767817413, + "grad_norm": 0.25405991077423096, + "learning_rate": 0.00012189212575713873, + "loss": 1.855116081237793, + "step": 9190 + }, + { + "epoch": 0.027847986318163244, + "grad_norm": 0.18448588252067566, + "learning_rate": 0.00012188833057550135, + "loss": 1.8293960571289063, + "step": 9200 + }, + { + "epoch": 0.027878255868509075, + "grad_norm": 0.1953759789466858, + "learning_rate": 0.00012188453539386394, + "loss": 1.8203128814697265, + "step": 9210 + }, + { + "epoch": 0.027908525418854902, + "grad_norm": 0.21629519760608673, + "learning_rate": 0.00012188074021222656, + "loss": 1.822698211669922, + "step": 9220 + }, + { + "epoch": 0.027938794969200733, + "grad_norm": 0.21702100336551666, + "learning_rate": 0.00012187694503058917, + "loss": 1.785955810546875, + "step": 9230 + }, + { + "epoch": 0.027969064519546564, + "grad_norm": 0.206680566072464, + "learning_rate": 0.00012187314984895177, + "loss": 1.7403903961181642, + "step": 9240 + }, + { + "epoch": 0.02799933406989239, + "grad_norm": 0.23517583310604095, + "learning_rate": 0.00012186935466731439, + "loss": 1.815192413330078, + "step": 9250 + }, + { + "epoch": 0.028029603620238222, + "grad_norm": 0.34312039613723755, + "learning_rate": 0.00012186555948567698, + "loss": 1.8212242126464844, + "step": 9260 + }, + { + "epoch": 0.02805987317058405, + "grad_norm": 0.19950783252716064, + "learning_rate": 0.0001218617643040396, + "loss": 1.7926986694335938, + "step": 9270 + }, + { + "epoch": 0.02809014272092988, + "grad_norm": 0.21120411157608032, + "learning_rate": 0.0001218579691224022, + "loss": 1.8149890899658203, + "step": 9280 + }, + { + "epoch": 0.02812041227127571, + "grad_norm": 0.2637511193752289, + "learning_rate": 0.00012185417394076482, + "loss": 1.795364761352539, + "step": 9290 + }, + { + "epoch": 0.02815068182162154, + "grad_norm": 0.20024080574512482, + "learning_rate": 0.00012185037875912741, + "loss": 1.7980976104736328, + "step": 9300 + }, + { + "epoch": 0.02818095137196737, + "grad_norm": 0.22448661923408508, + "learning_rate": 0.00012184658357749003, + "loss": 1.783590316772461, + "step": 9310 + }, + { + "epoch": 0.0282112209223132, + "grad_norm": 0.21047239005565643, + "learning_rate": 0.00012184278839585262, + "loss": 1.8090339660644532, + "step": 9320 + }, + { + "epoch": 0.028241490472659028, + "grad_norm": 0.23363564908504486, + "learning_rate": 0.00012183899321421524, + "loss": 1.7555196762084961, + "step": 9330 + }, + { + "epoch": 0.02827176002300486, + "grad_norm": 0.25043681263923645, + "learning_rate": 0.00012183519803257783, + "loss": 1.7971830368041992, + "step": 9340 + }, + { + "epoch": 0.02830202957335069, + "grad_norm": 0.21947306394577026, + "learning_rate": 0.00012183140285094045, + "loss": 1.7598270416259765, + "step": 9350 + }, + { + "epoch": 0.028332299123696517, + "grad_norm": 0.8362742066383362, + "learning_rate": 0.00012182760766930304, + "loss": 1.7882678985595704, + "step": 9360 + }, + { + "epoch": 0.028362568674042347, + "grad_norm": 0.30263492465019226, + "learning_rate": 0.00012182381248766566, + "loss": 1.7969474792480469, + "step": 9370 + }, + { + "epoch": 0.02839283822438818, + "grad_norm": 0.22231480479240417, + "learning_rate": 0.00012182001730602827, + "loss": 1.830681610107422, + "step": 9380 + }, + { + "epoch": 0.028423107774734006, + "grad_norm": 0.17971323430538177, + "learning_rate": 0.00012181622212439088, + "loss": 1.7833454132080078, + "step": 9390 + }, + { + "epoch": 0.028453377325079836, + "grad_norm": 0.1949961632490158, + "learning_rate": 0.00012181242694275348, + "loss": 1.7944637298583985, + "step": 9400 + }, + { + "epoch": 0.028483646875425664, + "grad_norm": 0.19990438222885132, + "learning_rate": 0.00012180863176111609, + "loss": 1.7892753601074218, + "step": 9410 + }, + { + "epoch": 0.028513916425771495, + "grad_norm": 0.22556141018867493, + "learning_rate": 0.0001218048365794787, + "loss": 1.8044206619262695, + "step": 9420 + }, + { + "epoch": 0.028544185976117326, + "grad_norm": 0.24999955296516418, + "learning_rate": 0.0001218010413978413, + "loss": 1.7681888580322265, + "step": 9430 + }, + { + "epoch": 0.028574455526463153, + "grad_norm": 0.24935884773731232, + "learning_rate": 0.00012179724621620392, + "loss": 1.8503162384033203, + "step": 9440 + }, + { + "epoch": 0.028604725076808984, + "grad_norm": 0.2032618671655655, + "learning_rate": 0.00012179345103456651, + "loss": 1.7927757263183595, + "step": 9450 + }, + { + "epoch": 0.028634994627154815, + "grad_norm": 0.1951366364955902, + "learning_rate": 0.00012178965585292913, + "loss": 1.7972427368164063, + "step": 9460 + }, + { + "epoch": 0.028665264177500642, + "grad_norm": 0.21954835951328278, + "learning_rate": 0.00012178586067129172, + "loss": 1.7947469711303712, + "step": 9470 + }, + { + "epoch": 0.028695533727846473, + "grad_norm": 0.18072572350502014, + "learning_rate": 0.00012178206548965434, + "loss": 1.8116661071777345, + "step": 9480 + }, + { + "epoch": 0.028725803278192304, + "grad_norm": 0.1763623207807541, + "learning_rate": 0.00012177827030801694, + "loss": 1.799334716796875, + "step": 9490 + }, + { + "epoch": 0.02875607282853813, + "grad_norm": 0.17989970743656158, + "learning_rate": 0.00012177447512637955, + "loss": 1.8232246398925782, + "step": 9500 + }, + { + "epoch": 0.02875607282853813, + "eval_loss": 1.7818307876586914, + "eval_runtime": 28.2684, + "eval_samples_per_second": 17.688, + "eval_steps_per_second": 1.132, + "step": 9500 + }, + { + "epoch": 0.028786342378883962, + "grad_norm": 0.1798158437013626, + "learning_rate": 0.00012177067994474216, + "loss": 1.7810359954833985, + "step": 9510 + }, + { + "epoch": 0.028816611929229793, + "grad_norm": 0.2018480896949768, + "learning_rate": 0.00012176688476310477, + "loss": 1.8173233032226563, + "step": 9520 + }, + { + "epoch": 0.02884688147957562, + "grad_norm": 0.22124886512756348, + "learning_rate": 0.00012176308958146737, + "loss": 1.7879131317138672, + "step": 9530 + }, + { + "epoch": 0.02887715102992145, + "grad_norm": 0.18961511552333832, + "learning_rate": 0.00012175929439982998, + "loss": 1.8160186767578126, + "step": 9540 + }, + { + "epoch": 0.028907420580267282, + "grad_norm": 0.17783163487911224, + "learning_rate": 0.00012175549921819258, + "loss": 1.8118251800537108, + "step": 9550 + }, + { + "epoch": 0.02893769013061311, + "grad_norm": 0.1942843496799469, + "learning_rate": 0.00012175170403655519, + "loss": 1.7919082641601562, + "step": 9560 + }, + { + "epoch": 0.02896795968095894, + "grad_norm": 0.18045905232429504, + "learning_rate": 0.0001217479088549178, + "loss": 1.8170217514038085, + "step": 9570 + }, + { + "epoch": 0.028998229231304767, + "grad_norm": 0.17625610530376434, + "learning_rate": 0.0001217441136732804, + "loss": 1.846391487121582, + "step": 9580 + }, + { + "epoch": 0.029028498781650598, + "grad_norm": 0.18645288050174713, + "learning_rate": 0.00012174031849164301, + "loss": 1.7845891952514648, + "step": 9590 + }, + { + "epoch": 0.02905876833199643, + "grad_norm": 0.22082644701004028, + "learning_rate": 0.00012173652331000561, + "loss": 1.8308349609375, + "step": 9600 + }, + { + "epoch": 0.029089037882342256, + "grad_norm": 0.20433729887008667, + "learning_rate": 0.00012173272812836822, + "loss": 1.7911787033081055, + "step": 9610 + }, + { + "epoch": 0.029119307432688087, + "grad_norm": 0.19534234702587128, + "learning_rate": 0.00012172893294673084, + "loss": 1.7818195343017578, + "step": 9620 + }, + { + "epoch": 0.029149576983033918, + "grad_norm": 0.17920421063899994, + "learning_rate": 0.00012172513776509345, + "loss": 1.8212112426757812, + "step": 9630 + }, + { + "epoch": 0.029179846533379745, + "grad_norm": 0.22612716257572174, + "learning_rate": 0.00012172134258345605, + "loss": 1.8284578323364258, + "step": 9640 + }, + { + "epoch": 0.029210116083725576, + "grad_norm": 0.2591439187526703, + "learning_rate": 0.00012171754740181866, + "loss": 1.800368309020996, + "step": 9650 + }, + { + "epoch": 0.029240385634071407, + "grad_norm": 0.17664207518100739, + "learning_rate": 0.00012171375222018126, + "loss": 1.7582433700561524, + "step": 9660 + }, + { + "epoch": 0.029270655184417235, + "grad_norm": 0.20304732024669647, + "learning_rate": 0.00012170995703854387, + "loss": 1.7982755661010743, + "step": 9670 + }, + { + "epoch": 0.029300924734763065, + "grad_norm": 0.2014065980911255, + "learning_rate": 0.00012170616185690648, + "loss": 1.7839908599853516, + "step": 9680 + }, + { + "epoch": 0.029331194285108896, + "grad_norm": 0.2161848247051239, + "learning_rate": 0.00012170236667526908, + "loss": 1.8005992889404296, + "step": 9690 + }, + { + "epoch": 0.029361463835454724, + "grad_norm": 0.21261228621006012, + "learning_rate": 0.00012169857149363169, + "loss": 1.8074455261230469, + "step": 9700 + }, + { + "epoch": 0.029391733385800554, + "grad_norm": 0.2739905118942261, + "learning_rate": 0.00012169477631199429, + "loss": 1.7941024780273438, + "step": 9710 + }, + { + "epoch": 0.029422002936146382, + "grad_norm": 0.2108381688594818, + "learning_rate": 0.0001216909811303569, + "loss": 1.7916130065917968, + "step": 9720 + }, + { + "epoch": 0.029452272486492213, + "grad_norm": 0.19966985285282135, + "learning_rate": 0.0001216871859487195, + "loss": 1.8069730758666993, + "step": 9730 + }, + { + "epoch": 0.029482542036838044, + "grad_norm": 0.40068313479423523, + "learning_rate": 0.00012168339076708211, + "loss": 1.8125221252441406, + "step": 9740 + }, + { + "epoch": 0.02951281158718387, + "grad_norm": 0.2569311857223511, + "learning_rate": 0.00012167959558544473, + "loss": 1.8411754608154296, + "step": 9750 + }, + { + "epoch": 0.0295430811375297, + "grad_norm": 0.20103801786899567, + "learning_rate": 0.00012167580040380732, + "loss": 1.8199678421020509, + "step": 9760 + }, + { + "epoch": 0.029573350687875533, + "grad_norm": 0.24154643714427948, + "learning_rate": 0.00012167200522216994, + "loss": 1.830724334716797, + "step": 9770 + }, + { + "epoch": 0.02960362023822136, + "grad_norm": 0.20512732863426208, + "learning_rate": 0.00012166821004053253, + "loss": 1.8174280166625976, + "step": 9780 + }, + { + "epoch": 0.02963388978856719, + "grad_norm": 0.18548934161663055, + "learning_rate": 0.00012166441485889515, + "loss": 1.8051891326904297, + "step": 9790 + }, + { + "epoch": 0.02966415933891302, + "grad_norm": 0.19430455565452576, + "learning_rate": 0.00012166061967725775, + "loss": 1.8279590606689453, + "step": 9800 + }, + { + "epoch": 0.02969442888925885, + "grad_norm": 0.19736336171627045, + "learning_rate": 0.00012165682449562037, + "loss": 1.7737205505371094, + "step": 9810 + }, + { + "epoch": 0.02972469843960468, + "grad_norm": 0.23153893649578094, + "learning_rate": 0.00012165302931398296, + "loss": 1.7658246994018554, + "step": 9820 + }, + { + "epoch": 0.02975496798995051, + "grad_norm": 0.21181322634220123, + "learning_rate": 0.00012164923413234558, + "loss": 1.7766716003417968, + "step": 9830 + }, + { + "epoch": 0.029785237540296338, + "grad_norm": 0.17565672099590302, + "learning_rate": 0.00012164543895070818, + "loss": 1.7991422653198241, + "step": 9840 + }, + { + "epoch": 0.02981550709064217, + "grad_norm": 0.2183520346879959, + "learning_rate": 0.00012164164376907079, + "loss": 1.7935199737548828, + "step": 9850 + }, + { + "epoch": 0.029845776640988, + "grad_norm": 0.21496553719043732, + "learning_rate": 0.00012163784858743341, + "loss": 1.7891284942626953, + "step": 9860 + }, + { + "epoch": 0.029876046191333827, + "grad_norm": 0.20475144684314728, + "learning_rate": 0.000121634053405796, + "loss": 1.803128433227539, + "step": 9870 + }, + { + "epoch": 0.029906315741679658, + "grad_norm": 0.21312251687049866, + "learning_rate": 0.00012163025822415862, + "loss": 1.7311262130737304, + "step": 9880 + }, + { + "epoch": 0.029936585292025485, + "grad_norm": 0.18883304297924042, + "learning_rate": 0.00012162646304252121, + "loss": 1.7967708587646485, + "step": 9890 + }, + { + "epoch": 0.029966854842371316, + "grad_norm": 0.17291571199893951, + "learning_rate": 0.00012162266786088383, + "loss": 1.8162826538085937, + "step": 9900 + }, + { + "epoch": 0.029997124392717147, + "grad_norm": 0.16252970695495605, + "learning_rate": 0.00012161887267924643, + "loss": 1.7529903411865235, + "step": 9910 + }, + { + "epoch": 0.030027393943062974, + "grad_norm": 0.20874102413654327, + "learning_rate": 0.00012161507749760905, + "loss": 1.7644939422607422, + "step": 9920 + }, + { + "epoch": 0.030057663493408805, + "grad_norm": 0.18958742916584015, + "learning_rate": 0.00012161128231597164, + "loss": 1.8230598449707032, + "step": 9930 + }, + { + "epoch": 0.030087933043754636, + "grad_norm": 0.2216808944940567, + "learning_rate": 0.00012160748713433426, + "loss": 1.7784168243408203, + "step": 9940 + }, + { + "epoch": 0.030118202594100463, + "grad_norm": 0.21268759667873383, + "learning_rate": 0.00012160369195269685, + "loss": 1.8006797790527345, + "step": 9950 + }, + { + "epoch": 0.030148472144446294, + "grad_norm": 0.2061108946800232, + "learning_rate": 0.00012159989677105947, + "loss": 1.7910720825195312, + "step": 9960 + }, + { + "epoch": 0.030178741694792125, + "grad_norm": 0.27064254879951477, + "learning_rate": 0.00012159610158942206, + "loss": 1.7637554168701173, + "step": 9970 + }, + { + "epoch": 0.030209011245137952, + "grad_norm": 0.18661317229270935, + "learning_rate": 0.00012159230640778468, + "loss": 1.7618806838989258, + "step": 9980 + }, + { + "epoch": 0.030239280795483783, + "grad_norm": 0.19400788843631744, + "learning_rate": 0.00012158851122614729, + "loss": 1.7899293899536133, + "step": 9990 + }, + { + "epoch": 0.030269550345829614, + "grad_norm": 0.17090055346488953, + "learning_rate": 0.00012158471604450989, + "loss": 1.7961809158325195, + "step": 10000 + }, + { + "epoch": 0.030269550345829614, + "eval_loss": 1.7923696041107178, + "eval_runtime": 28.4143, + "eval_samples_per_second": 17.597, + "eval_steps_per_second": 1.126, + "step": 10000 + }, + { + "epoch": 0.03029981989617544, + "grad_norm": 0.19372910261154175, + "learning_rate": 0.0001215809208628725, + "loss": 1.7740577697753905, + "step": 10010 + }, + { + "epoch": 0.030330089446521272, + "grad_norm": 0.28460198640823364, + "learning_rate": 0.0001215771256812351, + "loss": 1.764866828918457, + "step": 10020 + }, + { + "epoch": 0.030360358996867103, + "grad_norm": 0.19347435235977173, + "learning_rate": 0.00012157333049959772, + "loss": 1.8116182327270507, + "step": 10030 + }, + { + "epoch": 0.03039062854721293, + "grad_norm": 0.21315138041973114, + "learning_rate": 0.00012156953531796032, + "loss": 1.7976654052734375, + "step": 10040 + }, + { + "epoch": 0.03042089809755876, + "grad_norm": 0.2205672711133957, + "learning_rate": 0.00012156574013632294, + "loss": 1.7892723083496094, + "step": 10050 + }, + { + "epoch": 0.03045116764790459, + "grad_norm": 0.18015240132808685, + "learning_rate": 0.00012156194495468553, + "loss": 1.797494888305664, + "step": 10060 + }, + { + "epoch": 0.03048143719825042, + "grad_norm": 0.20958445966243744, + "learning_rate": 0.00012155814977304815, + "loss": 1.7603050231933595, + "step": 10070 + }, + { + "epoch": 0.03051170674859625, + "grad_norm": 0.17182354629039764, + "learning_rate": 0.00012155435459141074, + "loss": 1.800454330444336, + "step": 10080 + }, + { + "epoch": 0.030541976298942078, + "grad_norm": 0.18623031675815582, + "learning_rate": 0.00012155055940977336, + "loss": 1.7731513977050781, + "step": 10090 + }, + { + "epoch": 0.03057224584928791, + "grad_norm": 0.19711777567863464, + "learning_rate": 0.00012154676422813595, + "loss": 1.7540142059326171, + "step": 10100 + }, + { + "epoch": 0.03060251539963374, + "grad_norm": 0.18102140724658966, + "learning_rate": 0.00012154296904649857, + "loss": 1.7655746459960937, + "step": 10110 + }, + { + "epoch": 0.030632784949979567, + "grad_norm": 0.21997329592704773, + "learning_rate": 0.00012153917386486118, + "loss": 1.755071258544922, + "step": 10120 + }, + { + "epoch": 0.030663054500325398, + "grad_norm": 0.19613875448703766, + "learning_rate": 0.00012153537868322378, + "loss": 1.7656410217285157, + "step": 10130 + }, + { + "epoch": 0.03069332405067123, + "grad_norm": 0.21561700105667114, + "learning_rate": 0.00012153158350158639, + "loss": 1.7626501083374024, + "step": 10140 + }, + { + "epoch": 0.030723593601017056, + "grad_norm": 0.186470627784729, + "learning_rate": 0.000121527788319949, + "loss": 1.762226676940918, + "step": 10150 + }, + { + "epoch": 0.030753863151362887, + "grad_norm": 0.1780197024345398, + "learning_rate": 0.0001215239931383116, + "loss": 1.7623924255371093, + "step": 10160 + }, + { + "epoch": 0.030784132701708718, + "grad_norm": 0.15433700382709503, + "learning_rate": 0.00012152019795667421, + "loss": 1.8178604125976563, + "step": 10170 + }, + { + "epoch": 0.030814402252054545, + "grad_norm": 0.19380946457386017, + "learning_rate": 0.00012151640277503681, + "loss": 1.8033523559570312, + "step": 10180 + }, + { + "epoch": 0.030844671802400376, + "grad_norm": 0.21751634776592255, + "learning_rate": 0.00012151260759339942, + "loss": 1.819150733947754, + "step": 10190 + }, + { + "epoch": 0.030874941352746203, + "grad_norm": 0.2003001719713211, + "learning_rate": 0.00012150881241176203, + "loss": 1.77821044921875, + "step": 10200 + }, + { + "epoch": 0.030905210903092034, + "grad_norm": 0.18392899632453918, + "learning_rate": 0.00012150501723012463, + "loss": 1.8308151245117188, + "step": 10210 + }, + { + "epoch": 0.030935480453437865, + "grad_norm": 0.20869818329811096, + "learning_rate": 0.00012150122204848724, + "loss": 1.7719348907470702, + "step": 10220 + }, + { + "epoch": 0.030965750003783692, + "grad_norm": 0.18560989201068878, + "learning_rate": 0.00012149742686684986, + "loss": 1.7600238800048829, + "step": 10230 + }, + { + "epoch": 0.030996019554129523, + "grad_norm": 0.18233564496040344, + "learning_rate": 0.00012149363168521246, + "loss": 1.7550361633300782, + "step": 10240 + }, + { + "epoch": 0.031026289104475354, + "grad_norm": 0.17999140918254852, + "learning_rate": 0.00012148983650357507, + "loss": 1.8329483032226563, + "step": 10250 + }, + { + "epoch": 0.03105655865482118, + "grad_norm": 0.2536616027355194, + "learning_rate": 0.00012148604132193767, + "loss": 1.7603130340576172, + "step": 10260 + }, + { + "epoch": 0.031086828205167012, + "grad_norm": 0.1813410222530365, + "learning_rate": 0.00012148224614030028, + "loss": 1.8122591018676757, + "step": 10270 + }, + { + "epoch": 0.031117097755512843, + "grad_norm": 0.17145730555057526, + "learning_rate": 0.00012147845095866289, + "loss": 1.7946882247924805, + "step": 10280 + }, + { + "epoch": 0.03114736730585867, + "grad_norm": 0.17755194008350372, + "learning_rate": 0.00012147465577702549, + "loss": 1.791783332824707, + "step": 10290 + }, + { + "epoch": 0.0311776368562045, + "grad_norm": 0.1796780526638031, + "learning_rate": 0.0001214708605953881, + "loss": 1.7815656661987305, + "step": 10300 + }, + { + "epoch": 0.031207906406550332, + "grad_norm": 0.1865658164024353, + "learning_rate": 0.0001214670654137507, + "loss": 1.7631534576416015, + "step": 10310 + }, + { + "epoch": 0.03123817595689616, + "grad_norm": 0.185190811753273, + "learning_rate": 0.00012146327023211331, + "loss": 1.7866924285888672, + "step": 10320 + }, + { + "epoch": 0.03126844550724199, + "grad_norm": 0.1986771821975708, + "learning_rate": 0.00012145947505047592, + "loss": 1.7971696853637695, + "step": 10330 + }, + { + "epoch": 0.03129871505758782, + "grad_norm": 0.19565702974796295, + "learning_rate": 0.00012145567986883852, + "loss": 1.789938735961914, + "step": 10340 + }, + { + "epoch": 0.03132898460793365, + "grad_norm": 0.2074950933456421, + "learning_rate": 0.00012145188468720113, + "loss": 1.7412471771240234, + "step": 10350 + }, + { + "epoch": 0.031359254158279476, + "grad_norm": 0.23772718012332916, + "learning_rate": 0.00012144808950556375, + "loss": 1.7870153427124023, + "step": 10360 + }, + { + "epoch": 0.03138952370862531, + "grad_norm": 0.233688086271286, + "learning_rate": 0.00012144429432392634, + "loss": 1.8259246826171875, + "step": 10370 + }, + { + "epoch": 0.03141979325897114, + "grad_norm": 0.18624268472194672, + "learning_rate": 0.00012144049914228896, + "loss": 1.7919357299804688, + "step": 10380 + }, + { + "epoch": 0.03145006280931697, + "grad_norm": 0.23947840929031372, + "learning_rate": 0.00012143670396065155, + "loss": 1.7763025283813476, + "step": 10390 + }, + { + "epoch": 0.0314803323596628, + "grad_norm": 0.1961040049791336, + "learning_rate": 0.00012143290877901417, + "loss": 1.7840147018432617, + "step": 10400 + }, + { + "epoch": 0.03151060191000863, + "grad_norm": 0.19663673639297485, + "learning_rate": 0.00012142911359737676, + "loss": 1.7724218368530273, + "step": 10410 + }, + { + "epoch": 0.031540871460354454, + "grad_norm": 0.21076859533786774, + "learning_rate": 0.00012142531841573938, + "loss": 1.7428447723388671, + "step": 10420 + }, + { + "epoch": 0.031571141010700285, + "grad_norm": 0.18725641071796417, + "learning_rate": 0.00012142152323410198, + "loss": 1.7999176025390624, + "step": 10430 + }, + { + "epoch": 0.031601410561046116, + "grad_norm": 0.15940538048744202, + "learning_rate": 0.0001214177280524646, + "loss": 1.8337699890136718, + "step": 10440 + }, + { + "epoch": 0.03163168011139195, + "grad_norm": 0.32573381066322327, + "learning_rate": 0.0001214139328708272, + "loss": 1.798654556274414, + "step": 10450 + }, + { + "epoch": 0.03166194966173778, + "grad_norm": 0.23905476927757263, + "learning_rate": 0.00012141013768918981, + "loss": 1.7970787048339845, + "step": 10460 + }, + { + "epoch": 0.0316922192120836, + "grad_norm": 0.27000924944877625, + "learning_rate": 0.00012140634250755241, + "loss": 1.8151439666748046, + "step": 10470 + }, + { + "epoch": 0.03172248876242943, + "grad_norm": 0.2287033200263977, + "learning_rate": 0.00012140254732591502, + "loss": 1.7589183807373048, + "step": 10480 + }, + { + "epoch": 0.03175275831277526, + "grad_norm": 0.21582618355751038, + "learning_rate": 0.00012139875214427764, + "loss": 1.7804309844970703, + "step": 10490 + }, + { + "epoch": 0.031783027863121094, + "grad_norm": 0.19913914799690247, + "learning_rate": 0.00012139495696264023, + "loss": 1.7635154724121094, + "step": 10500 + }, + { + "epoch": 0.031783027863121094, + "eval_loss": 1.7955862283706665, + "eval_runtime": 28.2464, + "eval_samples_per_second": 17.701, + "eval_steps_per_second": 1.133, + "step": 10500 + }, + { + "epoch": 0.031813297413466925, + "grad_norm": 0.2291792631149292, + "learning_rate": 0.00012139116178100285, + "loss": 1.7798480987548828, + "step": 10510 + }, + { + "epoch": 0.031843566963812756, + "grad_norm": 0.24254757165908813, + "learning_rate": 0.00012138736659936544, + "loss": 1.7794044494628907, + "step": 10520 + }, + { + "epoch": 0.03187383651415858, + "grad_norm": 0.20471307635307312, + "learning_rate": 0.00012138357141772806, + "loss": 1.7212406158447267, + "step": 10530 + }, + { + "epoch": 0.03190410606450441, + "grad_norm": 0.18075819313526154, + "learning_rate": 0.00012137977623609065, + "loss": 1.7691638946533204, + "step": 10540 + }, + { + "epoch": 0.03193437561485024, + "grad_norm": 0.1924358755350113, + "learning_rate": 0.00012137598105445327, + "loss": 1.7576839447021484, + "step": 10550 + }, + { + "epoch": 0.03196464516519607, + "grad_norm": 0.1928793340921402, + "learning_rate": 0.00012137218587281587, + "loss": 1.81495361328125, + "step": 10560 + }, + { + "epoch": 0.0319949147155419, + "grad_norm": 0.17672504484653473, + "learning_rate": 0.00012136839069117849, + "loss": 1.7559324264526368, + "step": 10570 + }, + { + "epoch": 0.032025184265887734, + "grad_norm": 0.19906049966812134, + "learning_rate": 0.00012136459550954108, + "loss": 1.7701953887939452, + "step": 10580 + }, + { + "epoch": 0.03205545381623356, + "grad_norm": 0.20114165544509888, + "learning_rate": 0.0001213608003279037, + "loss": 1.7636653900146484, + "step": 10590 + }, + { + "epoch": 0.03208572336657939, + "grad_norm": 0.24754294753074646, + "learning_rate": 0.00012135700514626629, + "loss": 1.7590484619140625, + "step": 10600 + }, + { + "epoch": 0.03211599291692522, + "grad_norm": 0.32413730025291443, + "learning_rate": 0.00012135320996462891, + "loss": 1.7443984985351562, + "step": 10610 + }, + { + "epoch": 0.03214626246727105, + "grad_norm": 0.1860598623752594, + "learning_rate": 0.00012134941478299152, + "loss": 1.756931686401367, + "step": 10620 + }, + { + "epoch": 0.03217653201761688, + "grad_norm": 0.18759527802467346, + "learning_rate": 0.00012134561960135412, + "loss": 1.8031768798828125, + "step": 10630 + }, + { + "epoch": 0.032206801567962705, + "grad_norm": 0.19905121624469757, + "learning_rate": 0.00012134182441971674, + "loss": 1.8129060745239258, + "step": 10640 + }, + { + "epoch": 0.032237071118308536, + "grad_norm": 0.22101745009422302, + "learning_rate": 0.00012133802923807933, + "loss": 1.778619384765625, + "step": 10650 + }, + { + "epoch": 0.032267340668654367, + "grad_norm": 0.22782480716705322, + "learning_rate": 0.00012133423405644195, + "loss": 1.7878313064575195, + "step": 10660 + }, + { + "epoch": 0.0322976102190002, + "grad_norm": 0.1908925473690033, + "learning_rate": 0.00012133043887480455, + "loss": 1.7841499328613282, + "step": 10670 + }, + { + "epoch": 0.03232787976934603, + "grad_norm": 0.23434749245643616, + "learning_rate": 0.00012132664369316717, + "loss": 1.7781749725341798, + "step": 10680 + }, + { + "epoch": 0.03235814931969186, + "grad_norm": 0.22630493342876434, + "learning_rate": 0.00012132284851152976, + "loss": 1.7546283721923828, + "step": 10690 + }, + { + "epoch": 0.03238841887003768, + "grad_norm": 0.1859847903251648, + "learning_rate": 0.00012131905332989238, + "loss": 1.7906108856201173, + "step": 10700 + }, + { + "epoch": 0.032418688420383514, + "grad_norm": 0.1937168687582016, + "learning_rate": 0.00012131525814825497, + "loss": 1.7990987777709961, + "step": 10710 + }, + { + "epoch": 0.032448957970729345, + "grad_norm": 0.17573721706867218, + "learning_rate": 0.00012131146296661759, + "loss": 1.8107410430908204, + "step": 10720 + }, + { + "epoch": 0.032479227521075175, + "grad_norm": 0.1771533042192459, + "learning_rate": 0.0001213076677849802, + "loss": 1.7466524124145508, + "step": 10730 + }, + { + "epoch": 0.032509497071421006, + "grad_norm": 0.21324443817138672, + "learning_rate": 0.0001213038726033428, + "loss": 1.7802581787109375, + "step": 10740 + }, + { + "epoch": 0.03253976662176683, + "grad_norm": 0.19063329696655273, + "learning_rate": 0.00012130007742170541, + "loss": 1.7849044799804688, + "step": 10750 + }, + { + "epoch": 0.03257003617211266, + "grad_norm": 0.21500816941261292, + "learning_rate": 0.00012129628224006801, + "loss": 1.777529525756836, + "step": 10760 + }, + { + "epoch": 0.03260030572245849, + "grad_norm": 0.21476364135742188, + "learning_rate": 0.00012129248705843062, + "loss": 1.7910472869873046, + "step": 10770 + }, + { + "epoch": 0.03263057527280432, + "grad_norm": 0.18540430068969727, + "learning_rate": 0.00012128869187679322, + "loss": 1.7547155380249024, + "step": 10780 + }, + { + "epoch": 0.032660844823150154, + "grad_norm": 0.24761426448822021, + "learning_rate": 0.00012128489669515583, + "loss": 1.815026092529297, + "step": 10790 + }, + { + "epoch": 0.032691114373495984, + "grad_norm": 0.17571406066417694, + "learning_rate": 0.00012128110151351844, + "loss": 1.7807960510253906, + "step": 10800 + }, + { + "epoch": 0.03272138392384181, + "grad_norm": 0.2279021143913269, + "learning_rate": 0.00012127730633188104, + "loss": 1.7713846206665038, + "step": 10810 + }, + { + "epoch": 0.03275165347418764, + "grad_norm": 0.213734433054924, + "learning_rate": 0.00012127351115024365, + "loss": 1.789585494995117, + "step": 10820 + }, + { + "epoch": 0.03278192302453347, + "grad_norm": 0.2489873766899109, + "learning_rate": 0.00012126971596860625, + "loss": 1.748987579345703, + "step": 10830 + }, + { + "epoch": 0.0328121925748793, + "grad_norm": 0.2328478991985321, + "learning_rate": 0.00012126592078696886, + "loss": 1.7551170349121095, + "step": 10840 + }, + { + "epoch": 0.03284246212522513, + "grad_norm": 0.2368607372045517, + "learning_rate": 0.00012126212560533148, + "loss": 1.7622756958007812, + "step": 10850 + }, + { + "epoch": 0.03287273167557096, + "grad_norm": 0.2142396718263626, + "learning_rate": 0.00012125833042369409, + "loss": 1.7894058227539062, + "step": 10860 + }, + { + "epoch": 0.032903001225916786, + "grad_norm": 0.17057257890701294, + "learning_rate": 0.00012125453524205669, + "loss": 1.7350788116455078, + "step": 10870 + }, + { + "epoch": 0.03293327077626262, + "grad_norm": 0.21762457489967346, + "learning_rate": 0.0001212507400604193, + "loss": 1.8298835754394531, + "step": 10880 + }, + { + "epoch": 0.03296354032660845, + "grad_norm": 0.2021481841802597, + "learning_rate": 0.0001212469448787819, + "loss": 1.7808603286743163, + "step": 10890 + }, + { + "epoch": 0.03299380987695428, + "grad_norm": 0.2129676640033722, + "learning_rate": 0.00012124314969714451, + "loss": 1.7055288314819337, + "step": 10900 + }, + { + "epoch": 0.03302407942730011, + "grad_norm": 0.23985104262828827, + "learning_rate": 0.00012123935451550712, + "loss": 1.762937355041504, + "step": 10910 + }, + { + "epoch": 0.033054348977645934, + "grad_norm": 0.2065276801586151, + "learning_rate": 0.00012123555933386972, + "loss": 1.7861808776855468, + "step": 10920 + }, + { + "epoch": 0.033084618527991765, + "grad_norm": 0.18459618091583252, + "learning_rate": 0.00012123176415223233, + "loss": 1.7973377227783203, + "step": 10930 + }, + { + "epoch": 0.033114888078337595, + "grad_norm": 0.20053018629550934, + "learning_rate": 0.00012122796897059493, + "loss": 1.7720592498779297, + "step": 10940 + }, + { + "epoch": 0.033145157628683426, + "grad_norm": 0.17675305902957916, + "learning_rate": 0.00012122417378895754, + "loss": 1.759811782836914, + "step": 10950 + }, + { + "epoch": 0.03317542717902926, + "grad_norm": 0.19320198893547058, + "learning_rate": 0.00012122037860732015, + "loss": 1.8026729583740235, + "step": 10960 + }, + { + "epoch": 0.03320569672937509, + "grad_norm": 0.17001570761203766, + "learning_rate": 0.00012121658342568276, + "loss": 1.8208707809448241, + "step": 10970 + }, + { + "epoch": 0.03323596627972091, + "grad_norm": 0.21592159569263458, + "learning_rate": 0.00012121278824404536, + "loss": 1.7608907699584961, + "step": 10980 + }, + { + "epoch": 0.03326623583006674, + "grad_norm": 0.18452878296375275, + "learning_rate": 0.00012120899306240798, + "loss": 1.7745512008666993, + "step": 10990 + }, + { + "epoch": 0.033296505380412574, + "grad_norm": 0.17320497334003448, + "learning_rate": 0.00012120519788077057, + "loss": 1.787297821044922, + "step": 11000 + }, + { + "epoch": 0.033296505380412574, + "eval_loss": 1.7648112773895264, + "eval_runtime": 27.9446, + "eval_samples_per_second": 17.893, + "eval_steps_per_second": 1.145, + "step": 11000 + }, + { + "epoch": 0.033326774930758404, + "grad_norm": 0.16125769913196564, + "learning_rate": 0.00012120140269913319, + "loss": 1.7601472854614257, + "step": 11010 + }, + { + "epoch": 0.033357044481104235, + "grad_norm": 0.16469813883304596, + "learning_rate": 0.00012119760751749578, + "loss": 1.755365753173828, + "step": 11020 + }, + { + "epoch": 0.033387314031450066, + "grad_norm": 0.21988464891910553, + "learning_rate": 0.0001211938123358584, + "loss": 1.7910747528076172, + "step": 11030 + }, + { + "epoch": 0.03341758358179589, + "grad_norm": 0.2203807383775711, + "learning_rate": 0.00012119001715422099, + "loss": 1.7727497100830079, + "step": 11040 + }, + { + "epoch": 0.03344785313214172, + "grad_norm": 0.2011508345603943, + "learning_rate": 0.00012118622197258361, + "loss": 1.7807247161865234, + "step": 11050 + }, + { + "epoch": 0.03347812268248755, + "grad_norm": 0.2034253627061844, + "learning_rate": 0.00012118242679094622, + "loss": 1.8078556060791016, + "step": 11060 + }, + { + "epoch": 0.03350839223283338, + "grad_norm": 0.2045706808567047, + "learning_rate": 0.00012117863160930882, + "loss": 1.7845617294311524, + "step": 11070 + }, + { + "epoch": 0.03353866178317921, + "grad_norm": 0.21744471788406372, + "learning_rate": 0.00012117483642767143, + "loss": 1.7384605407714844, + "step": 11080 + }, + { + "epoch": 0.03356893133352504, + "grad_norm": 0.18505075573921204, + "learning_rate": 0.00012117104124603404, + "loss": 1.8155136108398438, + "step": 11090 + }, + { + "epoch": 0.03359920088387087, + "grad_norm": 0.20393607020378113, + "learning_rate": 0.00012116724606439666, + "loss": 1.7248432159423828, + "step": 11100 + }, + { + "epoch": 0.0336294704342167, + "grad_norm": 0.22092492878437042, + "learning_rate": 0.00012116345088275925, + "loss": 1.79954833984375, + "step": 11110 + }, + { + "epoch": 0.03365973998456253, + "grad_norm": 0.22539979219436646, + "learning_rate": 0.00012115965570112187, + "loss": 1.8068201065063476, + "step": 11120 + }, + { + "epoch": 0.03369000953490836, + "grad_norm": 0.18019607663154602, + "learning_rate": 0.00012115586051948446, + "loss": 1.7389690399169921, + "step": 11130 + }, + { + "epoch": 0.03372027908525419, + "grad_norm": 0.1774706393480301, + "learning_rate": 0.00012115206533784708, + "loss": 1.7334741592407226, + "step": 11140 + }, + { + "epoch": 0.033750548635600015, + "grad_norm": 0.23219986259937286, + "learning_rate": 0.00012114827015620967, + "loss": 1.7931087493896485, + "step": 11150 + }, + { + "epoch": 0.033780818185945846, + "grad_norm": 0.1801132708787918, + "learning_rate": 0.00012114447497457229, + "loss": 1.78414306640625, + "step": 11160 + }, + { + "epoch": 0.03381108773629168, + "grad_norm": 0.2012542486190796, + "learning_rate": 0.00012114067979293488, + "loss": 1.762604331970215, + "step": 11170 + }, + { + "epoch": 0.03384135728663751, + "grad_norm": 0.1955466866493225, + "learning_rate": 0.0001211368846112975, + "loss": 1.7536960601806642, + "step": 11180 + }, + { + "epoch": 0.03387162683698334, + "grad_norm": 0.19861078262329102, + "learning_rate": 0.0001211330894296601, + "loss": 1.7852380752563477, + "step": 11190 + }, + { + "epoch": 0.03390189638732917, + "grad_norm": 0.17824766039848328, + "learning_rate": 0.00012112929424802272, + "loss": 1.7982643127441407, + "step": 11200 + }, + { + "epoch": 0.03393216593767499, + "grad_norm": 0.1977834403514862, + "learning_rate": 0.00012112549906638531, + "loss": 1.7876373291015626, + "step": 11210 + }, + { + "epoch": 0.033962435488020824, + "grad_norm": 0.17728015780448914, + "learning_rate": 0.00012112170388474793, + "loss": 1.787740135192871, + "step": 11220 + }, + { + "epoch": 0.033992705038366655, + "grad_norm": 0.20381221175193787, + "learning_rate": 0.00012111790870311053, + "loss": 1.776831817626953, + "step": 11230 + }, + { + "epoch": 0.034022974588712486, + "grad_norm": 0.23864994943141937, + "learning_rate": 0.00012111411352147314, + "loss": 1.793006706237793, + "step": 11240 + }, + { + "epoch": 0.03405324413905832, + "grad_norm": 0.1676866114139557, + "learning_rate": 0.00012111031833983576, + "loss": 1.802475357055664, + "step": 11250 + }, + { + "epoch": 0.03408351368940414, + "grad_norm": 0.1939769834280014, + "learning_rate": 0.00012110652315819835, + "loss": 1.8103557586669923, + "step": 11260 + }, + { + "epoch": 0.03411378323974997, + "grad_norm": 0.18783363699913025, + "learning_rate": 0.00012110272797656097, + "loss": 1.773366928100586, + "step": 11270 + }, + { + "epoch": 0.0341440527900958, + "grad_norm": 0.21097077429294586, + "learning_rate": 0.00012109893279492356, + "loss": 1.7514610290527344, + "step": 11280 + }, + { + "epoch": 0.03417432234044163, + "grad_norm": 0.20176143944263458, + "learning_rate": 0.00012109513761328618, + "loss": 1.8083694458007813, + "step": 11290 + }, + { + "epoch": 0.034204591890787464, + "grad_norm": 0.23031462728977203, + "learning_rate": 0.00012109134243164877, + "loss": 1.7736328125, + "step": 11300 + }, + { + "epoch": 0.034234861441133295, + "grad_norm": 0.22429445385932922, + "learning_rate": 0.0001210875472500114, + "loss": 1.746749496459961, + "step": 11310 + }, + { + "epoch": 0.03426513099147912, + "grad_norm": 0.19236129522323608, + "learning_rate": 0.00012108375206837399, + "loss": 1.776224136352539, + "step": 11320 + }, + { + "epoch": 0.03429540054182495, + "grad_norm": 0.21054214239120483, + "learning_rate": 0.0001210799568867366, + "loss": 1.8079999923706054, + "step": 11330 + }, + { + "epoch": 0.03432567009217078, + "grad_norm": 0.18402300775051117, + "learning_rate": 0.00012107616170509921, + "loss": 1.7841318130493165, + "step": 11340 + }, + { + "epoch": 0.03435593964251661, + "grad_norm": 0.22843140363693237, + "learning_rate": 0.00012107236652346182, + "loss": 1.7860357284545898, + "step": 11350 + }, + { + "epoch": 0.03438620919286244, + "grad_norm": 0.2159404158592224, + "learning_rate": 0.00012106857134182442, + "loss": 1.8035659790039062, + "step": 11360 + }, + { + "epoch": 0.03441647874320827, + "grad_norm": 0.1866220086812973, + "learning_rate": 0.00012106477616018703, + "loss": 1.7555837631225586, + "step": 11370 + }, + { + "epoch": 0.0344467482935541, + "grad_norm": 0.23105451464653015, + "learning_rate": 0.00012106098097854964, + "loss": 1.7595249176025392, + "step": 11380 + }, + { + "epoch": 0.03447701784389993, + "grad_norm": 0.23547624051570892, + "learning_rate": 0.00012105718579691224, + "loss": 1.7508884429931642, + "step": 11390 + }, + { + "epoch": 0.03450728739424576, + "grad_norm": 0.1937316656112671, + "learning_rate": 0.00012105339061527485, + "loss": 1.8139806747436524, + "step": 11400 + }, + { + "epoch": 0.03453755694459159, + "grad_norm": 0.20588672161102295, + "learning_rate": 0.00012104959543363745, + "loss": 1.758477783203125, + "step": 11410 + }, + { + "epoch": 0.03456782649493742, + "grad_norm": 0.19759172201156616, + "learning_rate": 0.00012104580025200006, + "loss": 1.791468048095703, + "step": 11420 + }, + { + "epoch": 0.034598096045283244, + "grad_norm": 0.2043461799621582, + "learning_rate": 0.00012104200507036267, + "loss": 1.7662223815917968, + "step": 11430 + }, + { + "epoch": 0.034628365595629075, + "grad_norm": 0.17471791803836823, + "learning_rate": 0.00012103820988872527, + "loss": 1.778099822998047, + "step": 11440 + }, + { + "epoch": 0.034658635145974906, + "grad_norm": 0.17375832796096802, + "learning_rate": 0.00012103441470708788, + "loss": 1.7886154174804687, + "step": 11450 + }, + { + "epoch": 0.03468890469632074, + "grad_norm": 0.19073821604251862, + "learning_rate": 0.0001210306195254505, + "loss": 1.7216436386108398, + "step": 11460 + }, + { + "epoch": 0.03471917424666657, + "grad_norm": 0.19461172819137573, + "learning_rate": 0.0001210268243438131, + "loss": 1.7976158142089844, + "step": 11470 + }, + { + "epoch": 0.0347494437970124, + "grad_norm": 0.1727682501077652, + "learning_rate": 0.00012102302916217571, + "loss": 1.8005537033081054, + "step": 11480 + }, + { + "epoch": 0.03477971334735822, + "grad_norm": 0.19359137117862701, + "learning_rate": 0.00012101923398053831, + "loss": 1.7607131958007813, + "step": 11490 + }, + { + "epoch": 0.03480998289770405, + "grad_norm": 0.1983732134103775, + "learning_rate": 0.00012101543879890092, + "loss": 1.8051668167114259, + "step": 11500 + }, + { + "epoch": 0.03480998289770405, + "eval_loss": 1.7666350603103638, + "eval_runtime": 28.3824, + "eval_samples_per_second": 17.617, + "eval_steps_per_second": 1.127, + "step": 11500 + }, + { + "epoch": 0.034840252448049884, + "grad_norm": 0.18428583443164825, + "learning_rate": 0.00012101164361726353, + "loss": 1.771998977661133, + "step": 11510 + }, + { + "epoch": 0.034870521998395715, + "grad_norm": 0.17060455679893494, + "learning_rate": 0.00012100784843562613, + "loss": 1.7952213287353516, + "step": 11520 + }, + { + "epoch": 0.034900791548741546, + "grad_norm": 0.19393622875213623, + "learning_rate": 0.00012100405325398874, + "loss": 1.759169578552246, + "step": 11530 + }, + { + "epoch": 0.03493106109908737, + "grad_norm": 0.16823145747184753, + "learning_rate": 0.00012100025807235134, + "loss": 1.736836814880371, + "step": 11540 + }, + { + "epoch": 0.0349613306494332, + "grad_norm": 0.21479842066764832, + "learning_rate": 0.00012099646289071395, + "loss": 1.7995800018310546, + "step": 11550 + }, + { + "epoch": 0.03499160019977903, + "grad_norm": 0.18079976737499237, + "learning_rate": 0.00012099266770907656, + "loss": 1.7813140869140625, + "step": 11560 + }, + { + "epoch": 0.03502186975012486, + "grad_norm": 0.1839640885591507, + "learning_rate": 0.00012098887252743916, + "loss": 1.7856040954589845, + "step": 11570 + }, + { + "epoch": 0.03505213930047069, + "grad_norm": 0.16187715530395508, + "learning_rate": 0.00012098507734580177, + "loss": 1.7698509216308593, + "step": 11580 + }, + { + "epoch": 0.035082408850816524, + "grad_norm": 0.17862848937511444, + "learning_rate": 0.00012098128216416437, + "loss": 1.749121856689453, + "step": 11590 + }, + { + "epoch": 0.03511267840116235, + "grad_norm": 0.1983695924282074, + "learning_rate": 0.000120977486982527, + "loss": 1.8030876159667968, + "step": 11600 + }, + { + "epoch": 0.03514294795150818, + "grad_norm": 0.2095569223165512, + "learning_rate": 0.00012097369180088959, + "loss": 1.79429931640625, + "step": 11610 + }, + { + "epoch": 0.03517321750185401, + "grad_norm": 0.18930462002754211, + "learning_rate": 0.0001209698966192522, + "loss": 1.803076171875, + "step": 11620 + }, + { + "epoch": 0.03520348705219984, + "grad_norm": 0.2721637785434723, + "learning_rate": 0.0001209661014376148, + "loss": 1.7944412231445312, + "step": 11630 + }, + { + "epoch": 0.03523375660254567, + "grad_norm": 0.22475779056549072, + "learning_rate": 0.00012096230625597742, + "loss": 1.7743541717529296, + "step": 11640 + }, + { + "epoch": 0.0352640261528915, + "grad_norm": 0.1894504427909851, + "learning_rate": 0.00012095851107434001, + "loss": 1.794776725769043, + "step": 11650 + }, + { + "epoch": 0.035294295703237326, + "grad_norm": 0.16863900423049927, + "learning_rate": 0.00012095471589270263, + "loss": 1.8078229904174805, + "step": 11660 + }, + { + "epoch": 0.03532456525358316, + "grad_norm": 0.22741714119911194, + "learning_rate": 0.00012095092071106524, + "loss": 1.8027488708496093, + "step": 11670 + }, + { + "epoch": 0.03535483480392899, + "grad_norm": 0.23461872339248657, + "learning_rate": 0.00012094712552942784, + "loss": 1.7253631591796874, + "step": 11680 + }, + { + "epoch": 0.03538510435427482, + "grad_norm": 0.20201389491558075, + "learning_rate": 0.00012094333034779045, + "loss": 1.785316276550293, + "step": 11690 + }, + { + "epoch": 0.03541537390462065, + "grad_norm": 0.2045748233795166, + "learning_rate": 0.00012093953516615305, + "loss": 1.782442855834961, + "step": 11700 + }, + { + "epoch": 0.03544564345496647, + "grad_norm": 0.20134030282497406, + "learning_rate": 0.00012093573998451567, + "loss": 1.749948501586914, + "step": 11710 + }, + { + "epoch": 0.035475913005312304, + "grad_norm": 0.19871453940868378, + "learning_rate": 0.00012093194480287827, + "loss": 1.790837860107422, + "step": 11720 + }, + { + "epoch": 0.035506182555658135, + "grad_norm": 0.16079270839691162, + "learning_rate": 0.00012092814962124088, + "loss": 1.7690195083618163, + "step": 11730 + }, + { + "epoch": 0.035536452106003966, + "grad_norm": 0.1828652173280716, + "learning_rate": 0.00012092435443960348, + "loss": 1.7303022384643554, + "step": 11740 + }, + { + "epoch": 0.035566721656349796, + "grad_norm": 0.20738865435123444, + "learning_rate": 0.0001209205592579661, + "loss": 1.8177852630615234, + "step": 11750 + }, + { + "epoch": 0.03559699120669563, + "grad_norm": 0.21322332322597504, + "learning_rate": 0.00012091676407632869, + "loss": 1.7730701446533204, + "step": 11760 + }, + { + "epoch": 0.03562726075704145, + "grad_norm": 0.21263986825942993, + "learning_rate": 0.00012091296889469131, + "loss": 1.7356698989868165, + "step": 11770 + }, + { + "epoch": 0.03565753030738728, + "grad_norm": 0.19316843152046204, + "learning_rate": 0.0001209091737130539, + "loss": 1.789940071105957, + "step": 11780 + }, + { + "epoch": 0.03568779985773311, + "grad_norm": 0.1989917904138565, + "learning_rate": 0.00012090537853141652, + "loss": 1.8066375732421875, + "step": 11790 + }, + { + "epoch": 0.035718069408078944, + "grad_norm": 0.19790147244930267, + "learning_rate": 0.00012090158334977911, + "loss": 1.768894577026367, + "step": 11800 + }, + { + "epoch": 0.035748338958424775, + "grad_norm": 0.20424163341522217, + "learning_rate": 0.00012089778816814173, + "loss": 1.7601917266845704, + "step": 11810 + }, + { + "epoch": 0.035778608508770605, + "grad_norm": 0.18053874373435974, + "learning_rate": 0.00012089399298650432, + "loss": 1.7711267471313477, + "step": 11820 + }, + { + "epoch": 0.03580887805911643, + "grad_norm": 0.1930505931377411, + "learning_rate": 0.00012089019780486694, + "loss": 1.7827461242675782, + "step": 11830 + }, + { + "epoch": 0.03583914760946226, + "grad_norm": 0.20353750884532928, + "learning_rate": 0.00012088640262322955, + "loss": 1.7647167205810548, + "step": 11840 + }, + { + "epoch": 0.03586941715980809, + "grad_norm": 0.18005606532096863, + "learning_rate": 0.00012088260744159216, + "loss": 1.782649040222168, + "step": 11850 + }, + { + "epoch": 0.03589968671015392, + "grad_norm": 0.1902252733707428, + "learning_rate": 0.00012087881225995478, + "loss": 1.7822921752929688, + "step": 11860 + }, + { + "epoch": 0.03592995626049975, + "grad_norm": 0.20721790194511414, + "learning_rate": 0.00012087501707831737, + "loss": 1.7737892150878907, + "step": 11870 + }, + { + "epoch": 0.03596022581084558, + "grad_norm": 0.18854397535324097, + "learning_rate": 0.00012087122189667999, + "loss": 1.77437744140625, + "step": 11880 + }, + { + "epoch": 0.03599049536119141, + "grad_norm": 0.20764729380607605, + "learning_rate": 0.00012086742671504258, + "loss": 1.7854639053344727, + "step": 11890 + }, + { + "epoch": 0.03602076491153724, + "grad_norm": 0.18797703087329865, + "learning_rate": 0.0001208636315334052, + "loss": 1.710232925415039, + "step": 11900 + }, + { + "epoch": 0.03605103446188307, + "grad_norm": 0.1786220520734787, + "learning_rate": 0.00012085983635176779, + "loss": 1.774953269958496, + "step": 11910 + }, + { + "epoch": 0.0360813040122289, + "grad_norm": 0.22429826855659485, + "learning_rate": 0.00012085604117013041, + "loss": 1.7705310821533202, + "step": 11920 + }, + { + "epoch": 0.03611157356257473, + "grad_norm": 0.1810997575521469, + "learning_rate": 0.000120852245988493, + "loss": 1.7978910446166991, + "step": 11930 + }, + { + "epoch": 0.036141843112920555, + "grad_norm": 0.19045056402683258, + "learning_rate": 0.00012084845080685562, + "loss": 1.7290393829345703, + "step": 11940 + }, + { + "epoch": 0.036172112663266386, + "grad_norm": 0.21413172781467438, + "learning_rate": 0.00012084465562521822, + "loss": 1.8206741333007812, + "step": 11950 + }, + { + "epoch": 0.036202382213612216, + "grad_norm": 0.2060937136411667, + "learning_rate": 0.00012084086044358084, + "loss": 1.7691753387451172, + "step": 11960 + }, + { + "epoch": 0.03623265176395805, + "grad_norm": 0.17898666858673096, + "learning_rate": 0.00012083706526194344, + "loss": 1.7393657684326171, + "step": 11970 + }, + { + "epoch": 0.03626292131430388, + "grad_norm": 0.19633668661117554, + "learning_rate": 0.00012083327008030605, + "loss": 1.7426324844360352, + "step": 11980 + }, + { + "epoch": 0.03629319086464971, + "grad_norm": 0.2159820944070816, + "learning_rate": 0.00012082947489866865, + "loss": 1.773455810546875, + "step": 11990 + }, + { + "epoch": 0.03632346041499553, + "grad_norm": 0.22815565764904022, + "learning_rate": 0.00012082567971703126, + "loss": 1.799327278137207, + "step": 12000 + }, + { + "epoch": 0.03632346041499553, + "eval_loss": 1.7702727317810059, + "eval_runtime": 27.7916, + "eval_samples_per_second": 17.991, + "eval_steps_per_second": 1.151, + "step": 12000 + }, + { + "epoch": 0.036353729965341364, + "grad_norm": 0.20140095055103302, + "learning_rate": 0.00012082188453539386, + "loss": 1.7289833068847655, + "step": 12010 + }, + { + "epoch": 0.036383999515687195, + "grad_norm": 0.19758982956409454, + "learning_rate": 0.00012081808935375647, + "loss": 1.78018798828125, + "step": 12020 + }, + { + "epoch": 0.036414269066033025, + "grad_norm": 0.21285013854503632, + "learning_rate": 0.00012081429417211908, + "loss": 1.7608371734619142, + "step": 12030 + }, + { + "epoch": 0.036444538616378856, + "grad_norm": 0.2199997752904892, + "learning_rate": 0.00012081049899048168, + "loss": 1.770453643798828, + "step": 12040 + }, + { + "epoch": 0.03647480816672468, + "grad_norm": 0.18991059064865112, + "learning_rate": 0.00012080670380884429, + "loss": 1.7850563049316406, + "step": 12050 + }, + { + "epoch": 0.03650507771707051, + "grad_norm": 0.20659691095352173, + "learning_rate": 0.0001208029086272069, + "loss": 1.7409164428710937, + "step": 12060 + }, + { + "epoch": 0.03653534726741634, + "grad_norm": 0.17823761701583862, + "learning_rate": 0.00012079911344556951, + "loss": 1.77275390625, + "step": 12070 + }, + { + "epoch": 0.03656561681776217, + "grad_norm": 0.17997579276561737, + "learning_rate": 0.00012079531826393212, + "loss": 1.7704124450683594, + "step": 12080 + }, + { + "epoch": 0.036595886368108003, + "grad_norm": 0.1921563744544983, + "learning_rate": 0.00012079152308229473, + "loss": 1.747597885131836, + "step": 12090 + }, + { + "epoch": 0.036626155918453834, + "grad_norm": 0.25735148787498474, + "learning_rate": 0.00012078772790065733, + "loss": 1.7356277465820313, + "step": 12100 + }, + { + "epoch": 0.03665642546879966, + "grad_norm": 0.1874469816684723, + "learning_rate": 0.00012078393271901994, + "loss": 1.7484329223632813, + "step": 12110 + }, + { + "epoch": 0.03668669501914549, + "grad_norm": 0.21431641280651093, + "learning_rate": 0.00012078013753738254, + "loss": 1.7316944122314453, + "step": 12120 + }, + { + "epoch": 0.03671696456949132, + "grad_norm": 0.18537013232707977, + "learning_rate": 0.00012077634235574515, + "loss": 1.7297491073608398, + "step": 12130 + }, + { + "epoch": 0.03674723411983715, + "grad_norm": 0.1980476975440979, + "learning_rate": 0.00012077254717410776, + "loss": 1.7631118774414063, + "step": 12140 + }, + { + "epoch": 0.03677750367018298, + "grad_norm": 0.1956130415201187, + "learning_rate": 0.00012076875199247036, + "loss": 1.7716178894042969, + "step": 12150 + }, + { + "epoch": 0.03680777322052881, + "grad_norm": 0.1644938439130783, + "learning_rate": 0.00012076495681083297, + "loss": 1.7351564407348632, + "step": 12160 + }, + { + "epoch": 0.036838042770874636, + "grad_norm": 0.16381879150867462, + "learning_rate": 0.00012076116162919557, + "loss": 1.7292415618896484, + "step": 12170 + }, + { + "epoch": 0.03686831232122047, + "grad_norm": 0.1941019743680954, + "learning_rate": 0.00012075736644755818, + "loss": 1.7321346282958985, + "step": 12180 + }, + { + "epoch": 0.0368985818715663, + "grad_norm": 0.20615653693675995, + "learning_rate": 0.00012075357126592079, + "loss": 1.7324954986572265, + "step": 12190 + }, + { + "epoch": 0.03692885142191213, + "grad_norm": 0.18544171750545502, + "learning_rate": 0.00012074977608428339, + "loss": 1.74775390625, + "step": 12200 + }, + { + "epoch": 0.03695912097225796, + "grad_norm": 0.173048734664917, + "learning_rate": 0.00012074598090264601, + "loss": 1.8013456344604493, + "step": 12210 + }, + { + "epoch": 0.036989390522603784, + "grad_norm": 0.194186732172966, + "learning_rate": 0.0001207421857210086, + "loss": 1.7760417938232422, + "step": 12220 + }, + { + "epoch": 0.037019660072949614, + "grad_norm": 0.19316084682941437, + "learning_rate": 0.00012073839053937122, + "loss": 1.774334716796875, + "step": 12230 + }, + { + "epoch": 0.037049929623295445, + "grad_norm": 0.20587004721164703, + "learning_rate": 0.00012073459535773382, + "loss": 1.7758663177490235, + "step": 12240 + }, + { + "epoch": 0.037080199173641276, + "grad_norm": 0.1606922745704651, + "learning_rate": 0.00012073080017609643, + "loss": 1.7242782592773438, + "step": 12250 + }, + { + "epoch": 0.03711046872398711, + "grad_norm": 0.20562432706356049, + "learning_rate": 0.00012072700499445903, + "loss": 1.7568241119384767, + "step": 12260 + }, + { + "epoch": 0.03714073827433294, + "grad_norm": 0.21674111485481262, + "learning_rate": 0.00012072320981282165, + "loss": 1.769333267211914, + "step": 12270 + }, + { + "epoch": 0.03717100782467876, + "grad_norm": 0.21587158739566803, + "learning_rate": 0.00012071941463118425, + "loss": 1.7514759063720704, + "step": 12280 + }, + { + "epoch": 0.03720127737502459, + "grad_norm": 0.18636560440063477, + "learning_rate": 0.00012071561944954686, + "loss": 1.7375408172607423, + "step": 12290 + }, + { + "epoch": 0.03723154692537042, + "grad_norm": 0.20543068647384644, + "learning_rate": 0.00012071182426790946, + "loss": 1.7626983642578125, + "step": 12300 + }, + { + "epoch": 0.037261816475716254, + "grad_norm": 0.22152721881866455, + "learning_rate": 0.00012070802908627207, + "loss": 1.7325916290283203, + "step": 12310 + }, + { + "epoch": 0.037292086026062085, + "grad_norm": 0.16660083830356598, + "learning_rate": 0.00012070423390463468, + "loss": 1.765165138244629, + "step": 12320 + }, + { + "epoch": 0.03732235557640791, + "grad_norm": 0.16196271777153015, + "learning_rate": 0.00012070043872299728, + "loss": 1.77545223236084, + "step": 12330 + }, + { + "epoch": 0.03735262512675374, + "grad_norm": 0.19076858460903168, + "learning_rate": 0.0001206966435413599, + "loss": 1.758135986328125, + "step": 12340 + }, + { + "epoch": 0.03738289467709957, + "grad_norm": 0.18446460366249084, + "learning_rate": 0.0001206928483597225, + "loss": 1.7646717071533202, + "step": 12350 + }, + { + "epoch": 0.0374131642274454, + "grad_norm": 0.16736112534999847, + "learning_rate": 0.00012068905317808511, + "loss": 1.7786527633666993, + "step": 12360 + }, + { + "epoch": 0.03744343377779123, + "grad_norm": 0.19579027593135834, + "learning_rate": 0.0001206852579964477, + "loss": 1.7457920074462892, + "step": 12370 + }, + { + "epoch": 0.03747370332813706, + "grad_norm": 0.18608462810516357, + "learning_rate": 0.00012068146281481033, + "loss": 1.7484577178955079, + "step": 12380 + }, + { + "epoch": 0.03750397287848289, + "grad_norm": 0.16067704558372498, + "learning_rate": 0.00012067766763317292, + "loss": 1.776709747314453, + "step": 12390 + }, + { + "epoch": 0.03753424242882872, + "grad_norm": 0.23909471929073334, + "learning_rate": 0.00012067387245153554, + "loss": 1.7291221618652344, + "step": 12400 + }, + { + "epoch": 0.03756451197917455, + "grad_norm": 0.16588544845581055, + "learning_rate": 0.00012067007726989813, + "loss": 1.7557880401611328, + "step": 12410 + }, + { + "epoch": 0.03759478152952038, + "grad_norm": 0.18720529973506927, + "learning_rate": 0.00012066628208826075, + "loss": 1.7586822509765625, + "step": 12420 + }, + { + "epoch": 0.03762505107986621, + "grad_norm": 0.18118834495544434, + "learning_rate": 0.00012066248690662334, + "loss": 1.7369844436645507, + "step": 12430 + }, + { + "epoch": 0.03765532063021204, + "grad_norm": 0.20457711815834045, + "learning_rate": 0.00012065869172498596, + "loss": 1.8063739776611327, + "step": 12440 + }, + { + "epoch": 0.037685590180557865, + "grad_norm": 0.2066357433795929, + "learning_rate": 0.00012065489654334857, + "loss": 1.7289573669433593, + "step": 12450 + }, + { + "epoch": 0.037715859730903696, + "grad_norm": 0.22933360934257507, + "learning_rate": 0.00012065110136171117, + "loss": 1.759572982788086, + "step": 12460 + }, + { + "epoch": 0.03774612928124953, + "grad_norm": 0.1984691619873047, + "learning_rate": 0.00012064730618007379, + "loss": 1.78497314453125, + "step": 12470 + }, + { + "epoch": 0.03777639883159536, + "grad_norm": 0.2085035741329193, + "learning_rate": 0.00012064351099843639, + "loss": 1.779637908935547, + "step": 12480 + }, + { + "epoch": 0.03780666838194119, + "grad_norm": 0.18563035130500793, + "learning_rate": 0.000120639715816799, + "loss": 1.7689384460449218, + "step": 12490 + }, + { + "epoch": 0.03783693793228701, + "grad_norm": 0.19097955524921417, + "learning_rate": 0.0001206359206351616, + "loss": 1.7593318939208984, + "step": 12500 + }, + { + "epoch": 0.03783693793228701, + "eval_loss": 1.740777850151062, + "eval_runtime": 27.9675, + "eval_samples_per_second": 17.878, + "eval_steps_per_second": 1.144, + "step": 12500 + }, + { + "epoch": 0.03786720748263284, + "grad_norm": 0.1607484221458435, + "learning_rate": 0.00012063212545352422, + "loss": 1.7811344146728516, + "step": 12510 + }, + { + "epoch": 0.037897477032978674, + "grad_norm": 0.18552175164222717, + "learning_rate": 0.00012062833027188681, + "loss": 1.743914794921875, + "step": 12520 + }, + { + "epoch": 0.037927746583324505, + "grad_norm": 0.18630504608154297, + "learning_rate": 0.00012062453509024943, + "loss": 1.7364585876464844, + "step": 12530 + }, + { + "epoch": 0.037958016133670336, + "grad_norm": 0.2198808342218399, + "learning_rate": 0.00012062073990861202, + "loss": 1.811487579345703, + "step": 12540 + }, + { + "epoch": 0.03798828568401617, + "grad_norm": 0.20198000967502594, + "learning_rate": 0.00012061694472697464, + "loss": 1.7262451171875, + "step": 12550 + }, + { + "epoch": 0.03801855523436199, + "grad_norm": 0.15853972733020782, + "learning_rate": 0.00012061314954533723, + "loss": 1.7267070770263673, + "step": 12560 + }, + { + "epoch": 0.03804882478470782, + "grad_norm": 0.1693117320537567, + "learning_rate": 0.00012060935436369985, + "loss": 1.7444786071777343, + "step": 12570 + }, + { + "epoch": 0.03807909433505365, + "grad_norm": 0.19820481538772583, + "learning_rate": 0.00012060555918206246, + "loss": 1.7872474670410157, + "step": 12580 + }, + { + "epoch": 0.03810936388539948, + "grad_norm": 0.18155410885810852, + "learning_rate": 0.00012060176400042506, + "loss": 1.754117202758789, + "step": 12590 + }, + { + "epoch": 0.038139633435745314, + "grad_norm": 0.20653407275676727, + "learning_rate": 0.00012059796881878767, + "loss": 1.7496095657348634, + "step": 12600 + }, + { + "epoch": 0.038169902986091145, + "grad_norm": 0.2083865851163864, + "learning_rate": 0.00012059417363715028, + "loss": 1.7600046157836915, + "step": 12610 + }, + { + "epoch": 0.03820017253643697, + "grad_norm": 0.19396792352199554, + "learning_rate": 0.00012059037845551288, + "loss": 1.7358238220214843, + "step": 12620 + }, + { + "epoch": 0.0382304420867828, + "grad_norm": 0.15879277884960175, + "learning_rate": 0.00012058658327387549, + "loss": 1.743050765991211, + "step": 12630 + }, + { + "epoch": 0.03826071163712863, + "grad_norm": 0.19823069870471954, + "learning_rate": 0.0001205827880922381, + "loss": 1.7456962585449218, + "step": 12640 + }, + { + "epoch": 0.03829098118747446, + "grad_norm": 0.20308160781860352, + "learning_rate": 0.0001205789929106007, + "loss": 1.7985404968261718, + "step": 12650 + }, + { + "epoch": 0.03832125073782029, + "grad_norm": 0.18613150715827942, + "learning_rate": 0.0001205751977289633, + "loss": 1.8155315399169922, + "step": 12660 + }, + { + "epoch": 0.038351520288166116, + "grad_norm": 0.21331165730953217, + "learning_rate": 0.00012057140254732591, + "loss": 1.7993278503417969, + "step": 12670 + }, + { + "epoch": 0.03838178983851195, + "grad_norm": 0.200071781873703, + "learning_rate": 0.00012056760736568853, + "loss": 1.7509702682495116, + "step": 12680 + }, + { + "epoch": 0.03841205938885778, + "grad_norm": 0.20472463965415955, + "learning_rate": 0.00012056381218405112, + "loss": 1.7557113647460938, + "step": 12690 + }, + { + "epoch": 0.03844232893920361, + "grad_norm": 0.20739984512329102, + "learning_rate": 0.00012056001700241374, + "loss": 1.7842903137207031, + "step": 12700 + }, + { + "epoch": 0.03847259848954944, + "grad_norm": 0.22452524304389954, + "learning_rate": 0.00012055622182077635, + "loss": 1.779120635986328, + "step": 12710 + }, + { + "epoch": 0.03850286803989527, + "grad_norm": 0.1800244152545929, + "learning_rate": 0.00012055242663913896, + "loss": 1.76122989654541, + "step": 12720 + }, + { + "epoch": 0.038533137590241094, + "grad_norm": 0.19604811072349548, + "learning_rate": 0.00012054863145750156, + "loss": 1.7330657958984375, + "step": 12730 + }, + { + "epoch": 0.038563407140586925, + "grad_norm": 0.17780688405036926, + "learning_rate": 0.00012054483627586417, + "loss": 1.7902698516845703, + "step": 12740 + }, + { + "epoch": 0.038593676690932756, + "grad_norm": 0.15945371985435486, + "learning_rate": 0.00012054104109422677, + "loss": 1.6944252014160157, + "step": 12750 + }, + { + "epoch": 0.03862394624127859, + "grad_norm": 0.19123387336730957, + "learning_rate": 0.00012053724591258938, + "loss": 1.7610214233398438, + "step": 12760 + }, + { + "epoch": 0.03865421579162442, + "grad_norm": 0.17772366106510162, + "learning_rate": 0.00012053345073095198, + "loss": 1.7920310974121094, + "step": 12770 + }, + { + "epoch": 0.03868448534197025, + "grad_norm": 0.20376408100128174, + "learning_rate": 0.00012052965554931459, + "loss": 1.7848802566528321, + "step": 12780 + }, + { + "epoch": 0.03871475489231607, + "grad_norm": 0.190400168299675, + "learning_rate": 0.0001205258603676772, + "loss": 1.7429248809814453, + "step": 12790 + }, + { + "epoch": 0.0387450244426619, + "grad_norm": 0.17716510593891144, + "learning_rate": 0.0001205220651860398, + "loss": 1.750137710571289, + "step": 12800 + }, + { + "epoch": 0.038775293993007734, + "grad_norm": 0.15764304995536804, + "learning_rate": 0.00012051827000440241, + "loss": 1.796088409423828, + "step": 12810 + }, + { + "epoch": 0.038805563543353565, + "grad_norm": 0.1826615333557129, + "learning_rate": 0.00012051447482276503, + "loss": 1.7445344924926758, + "step": 12820 + }, + { + "epoch": 0.038835833093699396, + "grad_norm": 0.1858287900686264, + "learning_rate": 0.00012051067964112762, + "loss": 1.774649429321289, + "step": 12830 + }, + { + "epoch": 0.03886610264404522, + "grad_norm": 0.1760910004377365, + "learning_rate": 0.00012050688445949024, + "loss": 1.7310970306396485, + "step": 12840 + }, + { + "epoch": 0.03889637219439105, + "grad_norm": 0.18919254839420319, + "learning_rate": 0.00012050308927785283, + "loss": 1.7853763580322266, + "step": 12850 + }, + { + "epoch": 0.03892664174473688, + "grad_norm": 0.204681858420372, + "learning_rate": 0.00012049929409621545, + "loss": 1.7544771194458009, + "step": 12860 + }, + { + "epoch": 0.03895691129508271, + "grad_norm": 0.19657455384731293, + "learning_rate": 0.00012049549891457806, + "loss": 1.729749870300293, + "step": 12870 + }, + { + "epoch": 0.03898718084542854, + "grad_norm": 0.19575084745883942, + "learning_rate": 0.00012049170373294066, + "loss": 1.711581039428711, + "step": 12880 + }, + { + "epoch": 0.039017450395774374, + "grad_norm": 0.19140438735485077, + "learning_rate": 0.00012048790855130327, + "loss": 1.7339616775512696, + "step": 12890 + }, + { + "epoch": 0.0390477199461202, + "grad_norm": 0.18713219463825226, + "learning_rate": 0.00012048411336966588, + "loss": 1.7651737213134766, + "step": 12900 + }, + { + "epoch": 0.03907798949646603, + "grad_norm": 0.1670863926410675, + "learning_rate": 0.00012048031818802848, + "loss": 1.7276529312133788, + "step": 12910 + }, + { + "epoch": 0.03910825904681186, + "grad_norm": 0.21686093509197235, + "learning_rate": 0.00012047652300639109, + "loss": 1.7776008605957032, + "step": 12920 + }, + { + "epoch": 0.03913852859715769, + "grad_norm": 0.19218672811985016, + "learning_rate": 0.0001204727278247537, + "loss": 1.7332252502441405, + "step": 12930 + }, + { + "epoch": 0.03916879814750352, + "grad_norm": 0.16840191185474396, + "learning_rate": 0.0001204689326431163, + "loss": 1.746719741821289, + "step": 12940 + }, + { + "epoch": 0.03919906769784935, + "grad_norm": 0.16809816658496857, + "learning_rate": 0.00012046513746147892, + "loss": 1.7393482208251954, + "step": 12950 + }, + { + "epoch": 0.039229337248195176, + "grad_norm": 0.22800995409488678, + "learning_rate": 0.00012046134227984151, + "loss": 1.7501832962036132, + "step": 12960 + }, + { + "epoch": 0.03925960679854101, + "grad_norm": 0.2029687762260437, + "learning_rate": 0.00012045754709820413, + "loss": 1.7535402297973632, + "step": 12970 + }, + { + "epoch": 0.03928987634888684, + "grad_norm": 0.19658470153808594, + "learning_rate": 0.00012045375191656672, + "loss": 1.7490306854248048, + "step": 12980 + }, + { + "epoch": 0.03932014589923267, + "grad_norm": 0.19039981067180634, + "learning_rate": 0.00012044995673492934, + "loss": 1.7825069427490234, + "step": 12990 + }, + { + "epoch": 0.0393504154495785, + "grad_norm": 0.17959314584732056, + "learning_rate": 0.00012044616155329194, + "loss": 1.7782352447509766, + "step": 13000 + }, + { + "epoch": 0.0393504154495785, + "eval_loss": 1.743008017539978, + "eval_runtime": 27.8274, + "eval_samples_per_second": 17.968, + "eval_steps_per_second": 1.15, + "step": 13000 + }, + { + "epoch": 0.03938068499992432, + "grad_norm": 0.17454519867897034, + "learning_rate": 0.00012044236637165455, + "loss": 1.7342693328857421, + "step": 13010 + }, + { + "epoch": 0.039410954550270154, + "grad_norm": 0.18090924620628357, + "learning_rate": 0.00012043857119001715, + "loss": 1.7344669342041015, + "step": 13020 + }, + { + "epoch": 0.039441224100615985, + "grad_norm": 0.1723354309797287, + "learning_rate": 0.00012043477600837977, + "loss": 1.7815452575683595, + "step": 13030 + }, + { + "epoch": 0.039471493650961816, + "grad_norm": 0.24302424490451813, + "learning_rate": 0.00012043098082674236, + "loss": 1.7881488800048828, + "step": 13040 + }, + { + "epoch": 0.039501763201307646, + "grad_norm": 0.19516544044017792, + "learning_rate": 0.00012042718564510498, + "loss": 1.7581626892089843, + "step": 13050 + }, + { + "epoch": 0.03953203275165348, + "grad_norm": 0.18137000501155853, + "learning_rate": 0.00012042339046346757, + "loss": 1.7801715850830078, + "step": 13060 + }, + { + "epoch": 0.0395623023019993, + "grad_norm": 0.20636413991451263, + "learning_rate": 0.00012041959528183019, + "loss": 1.808746337890625, + "step": 13070 + }, + { + "epoch": 0.03959257185234513, + "grad_norm": 0.183779776096344, + "learning_rate": 0.00012041580010019281, + "loss": 1.7888254165649413, + "step": 13080 + }, + { + "epoch": 0.03962284140269096, + "grad_norm": 0.18584540486335754, + "learning_rate": 0.0001204120049185554, + "loss": 1.748407745361328, + "step": 13090 + }, + { + "epoch": 0.039653110953036794, + "grad_norm": 0.18651019036769867, + "learning_rate": 0.00012040820973691802, + "loss": 1.7863639831542968, + "step": 13100 + }, + { + "epoch": 0.039683380503382625, + "grad_norm": 0.18368609249591827, + "learning_rate": 0.00012040441455528061, + "loss": 1.7445903778076173, + "step": 13110 + }, + { + "epoch": 0.03971365005372845, + "grad_norm": 0.1818002611398697, + "learning_rate": 0.00012040061937364323, + "loss": 1.7679527282714844, + "step": 13120 + }, + { + "epoch": 0.03974391960407428, + "grad_norm": 0.20435172319412231, + "learning_rate": 0.00012039682419200583, + "loss": 1.769185447692871, + "step": 13130 + }, + { + "epoch": 0.03977418915442011, + "grad_norm": 0.18262256681919098, + "learning_rate": 0.00012039302901036845, + "loss": 1.7694990158081054, + "step": 13140 + }, + { + "epoch": 0.03980445870476594, + "grad_norm": 0.17911460995674133, + "learning_rate": 0.00012038923382873104, + "loss": 1.7504880905151368, + "step": 13150 + }, + { + "epoch": 0.03983472825511177, + "grad_norm": 0.2265026718378067, + "learning_rate": 0.00012038543864709366, + "loss": 1.7185216903686524, + "step": 13160 + }, + { + "epoch": 0.0398649978054576, + "grad_norm": 0.1769646406173706, + "learning_rate": 0.00012038164346545625, + "loss": 1.7737201690673827, + "step": 13170 + }, + { + "epoch": 0.03989526735580343, + "grad_norm": 0.19220760464668274, + "learning_rate": 0.00012037784828381887, + "loss": 1.7046998977661132, + "step": 13180 + }, + { + "epoch": 0.03992553690614926, + "grad_norm": 0.18975882232189178, + "learning_rate": 0.00012037405310218148, + "loss": 1.713824462890625, + "step": 13190 + }, + { + "epoch": 0.03995580645649509, + "grad_norm": 0.1779467910528183, + "learning_rate": 0.00012037025792054408, + "loss": 1.7801677703857421, + "step": 13200 + }, + { + "epoch": 0.03998607600684092, + "grad_norm": 0.17740623652935028, + "learning_rate": 0.00012036646273890669, + "loss": 1.7194740295410156, + "step": 13210 + }, + { + "epoch": 0.04001634555718675, + "grad_norm": 0.16334693133831024, + "learning_rate": 0.0001203626675572693, + "loss": 1.7729217529296875, + "step": 13220 + }, + { + "epoch": 0.04004661510753258, + "grad_norm": 0.22115746140480042, + "learning_rate": 0.0001203588723756319, + "loss": 1.7560644149780273, + "step": 13230 + }, + { + "epoch": 0.040076884657878405, + "grad_norm": 0.17935076355934143, + "learning_rate": 0.0001203550771939945, + "loss": 1.79996337890625, + "step": 13240 + }, + { + "epoch": 0.040107154208224235, + "grad_norm": 0.19851888716220856, + "learning_rate": 0.00012035128201235711, + "loss": 1.7416290283203124, + "step": 13250 + }, + { + "epoch": 0.040137423758570066, + "grad_norm": 0.17944234609603882, + "learning_rate": 0.00012034748683071972, + "loss": 1.7414129257202149, + "step": 13260 + }, + { + "epoch": 0.0401676933089159, + "grad_norm": 0.1679111272096634, + "learning_rate": 0.00012034369164908232, + "loss": 1.8087766647338868, + "step": 13270 + }, + { + "epoch": 0.04019796285926173, + "grad_norm": 0.2154501974582672, + "learning_rate": 0.00012033989646744493, + "loss": 1.7561063766479492, + "step": 13280 + }, + { + "epoch": 0.04022823240960755, + "grad_norm": 0.19333690404891968, + "learning_rate": 0.00012033610128580755, + "loss": 1.7794849395751953, + "step": 13290 + }, + { + "epoch": 0.04025850195995338, + "grad_norm": 0.187736377120018, + "learning_rate": 0.00012033230610417014, + "loss": 1.7513614654541017, + "step": 13300 + }, + { + "epoch": 0.040288771510299214, + "grad_norm": 0.1809137463569641, + "learning_rate": 0.00012032851092253276, + "loss": 1.741775894165039, + "step": 13310 + }, + { + "epoch": 0.040319041060645044, + "grad_norm": 0.17475377023220062, + "learning_rate": 0.00012032471574089537, + "loss": 1.7735427856445312, + "step": 13320 + }, + { + "epoch": 0.040349310610990875, + "grad_norm": 0.1616712361574173, + "learning_rate": 0.00012032092055925797, + "loss": 1.7541824340820313, + "step": 13330 + }, + { + "epoch": 0.040379580161336706, + "grad_norm": 0.1782093644142151, + "learning_rate": 0.00012031712537762058, + "loss": 1.715081787109375, + "step": 13340 + }, + { + "epoch": 0.04040984971168253, + "grad_norm": 0.179222971200943, + "learning_rate": 0.00012031333019598318, + "loss": 1.8010370254516601, + "step": 13350 + }, + { + "epoch": 0.04044011926202836, + "grad_norm": 0.19791781902313232, + "learning_rate": 0.00012030953501434579, + "loss": 1.7584857940673828, + "step": 13360 + }, + { + "epoch": 0.04047038881237419, + "grad_norm": 0.17983871698379517, + "learning_rate": 0.0001203057398327084, + "loss": 1.7390338897705078, + "step": 13370 + }, + { + "epoch": 0.04050065836272002, + "grad_norm": 0.19727101922035217, + "learning_rate": 0.000120301944651071, + "loss": 1.720174217224121, + "step": 13380 + }, + { + "epoch": 0.04053092791306585, + "grad_norm": 0.18716569244861603, + "learning_rate": 0.00012029814946943361, + "loss": 1.7777257919311524, + "step": 13390 + }, + { + "epoch": 0.040561197463411684, + "grad_norm": 0.19718551635742188, + "learning_rate": 0.00012029435428779621, + "loss": 1.7673015594482422, + "step": 13400 + }, + { + "epoch": 0.04059146701375751, + "grad_norm": 0.17554475367069244, + "learning_rate": 0.00012029055910615882, + "loss": 1.7452379226684571, + "step": 13410 + }, + { + "epoch": 0.04062173656410334, + "grad_norm": 0.17279857397079468, + "learning_rate": 0.00012028676392452143, + "loss": 1.7680917739868165, + "step": 13420 + }, + { + "epoch": 0.04065200611444917, + "grad_norm": 0.18191656470298767, + "learning_rate": 0.00012028296874288403, + "loss": 1.7195472717285156, + "step": 13430 + }, + { + "epoch": 0.040682275664795, + "grad_norm": 0.18640153110027313, + "learning_rate": 0.00012027917356124664, + "loss": 1.7838153839111328, + "step": 13440 + }, + { + "epoch": 0.04071254521514083, + "grad_norm": 0.20571544766426086, + "learning_rate": 0.00012027537837960926, + "loss": 1.7529645919799806, + "step": 13450 + }, + { + "epoch": 0.040742814765486655, + "grad_norm": 0.20231769979000092, + "learning_rate": 0.00012027158319797185, + "loss": 1.7503007888793944, + "step": 13460 + }, + { + "epoch": 0.040773084315832486, + "grad_norm": 0.1954278200864792, + "learning_rate": 0.00012026778801633447, + "loss": 1.765749740600586, + "step": 13470 + }, + { + "epoch": 0.04080335386617832, + "grad_norm": 0.17566236853599548, + "learning_rate": 0.00012026399283469708, + "loss": 1.7248004913330077, + "step": 13480 + }, + { + "epoch": 0.04083362341652415, + "grad_norm": 0.208215594291687, + "learning_rate": 0.00012026019765305968, + "loss": 1.7481956481933594, + "step": 13490 + }, + { + "epoch": 0.04086389296686998, + "grad_norm": 0.17323258519172668, + "learning_rate": 0.00012025640247142229, + "loss": 1.789501953125, + "step": 13500 + }, + { + "epoch": 0.04086389296686998, + "eval_loss": 1.735045075416565, + "eval_runtime": 28.0674, + "eval_samples_per_second": 17.814, + "eval_steps_per_second": 1.14, + "step": 13500 + }, + { + "epoch": 0.04089416251721581, + "grad_norm": 0.17957119643688202, + "learning_rate": 0.00012025260728978489, + "loss": 1.7293859481811524, + "step": 13510 + }, + { + "epoch": 0.040924432067561634, + "grad_norm": 0.18474826216697693, + "learning_rate": 0.0001202488121081475, + "loss": 1.725011444091797, + "step": 13520 + }, + { + "epoch": 0.040954701617907464, + "grad_norm": 0.1818131059408188, + "learning_rate": 0.0001202450169265101, + "loss": 1.758985710144043, + "step": 13530 + }, + { + "epoch": 0.040984971168253295, + "grad_norm": 0.16283327341079712, + "learning_rate": 0.00012024122174487271, + "loss": 1.7332633972167968, + "step": 13540 + }, + { + "epoch": 0.041015240718599126, + "grad_norm": 0.17361266911029816, + "learning_rate": 0.00012023742656323532, + "loss": 1.7817157745361327, + "step": 13550 + }, + { + "epoch": 0.04104551026894496, + "grad_norm": 0.18222862482070923, + "learning_rate": 0.00012023363138159794, + "loss": 1.731698989868164, + "step": 13560 + }, + { + "epoch": 0.04107577981929079, + "grad_norm": 0.23929773271083832, + "learning_rate": 0.00012022983619996053, + "loss": 1.7700477600097657, + "step": 13570 + }, + { + "epoch": 0.04110604936963661, + "grad_norm": 0.19580557942390442, + "learning_rate": 0.00012022604101832315, + "loss": 1.7134944915771484, + "step": 13580 + }, + { + "epoch": 0.04113631891998244, + "grad_norm": 0.2028198391199112, + "learning_rate": 0.00012022224583668574, + "loss": 1.7406444549560547, + "step": 13590 + }, + { + "epoch": 0.04116658847032827, + "grad_norm": 0.1853455901145935, + "learning_rate": 0.00012021845065504836, + "loss": 1.7489383697509766, + "step": 13600 + }, + { + "epoch": 0.041196858020674104, + "grad_norm": 0.16219240427017212, + "learning_rate": 0.00012021465547341095, + "loss": 1.7694246292114257, + "step": 13610 + }, + { + "epoch": 0.041227127571019935, + "grad_norm": 0.19770458340644836, + "learning_rate": 0.00012021086029177357, + "loss": 1.7137886047363282, + "step": 13620 + }, + { + "epoch": 0.04125739712136576, + "grad_norm": 0.17352211475372314, + "learning_rate": 0.00012020706511013616, + "loss": 1.7193473815917968, + "step": 13630 + }, + { + "epoch": 0.04128766667171159, + "grad_norm": 0.17367295920848846, + "learning_rate": 0.00012020326992849878, + "loss": 1.7519439697265624, + "step": 13640 + }, + { + "epoch": 0.04131793622205742, + "grad_norm": 0.18518418073654175, + "learning_rate": 0.00012019947474686138, + "loss": 1.7494606018066405, + "step": 13650 + }, + { + "epoch": 0.04134820577240325, + "grad_norm": 0.19072413444519043, + "learning_rate": 0.000120195679565224, + "loss": 1.757786750793457, + "step": 13660 + }, + { + "epoch": 0.04137847532274908, + "grad_norm": 0.20946915447711945, + "learning_rate": 0.00012019188438358659, + "loss": 1.734091567993164, + "step": 13670 + }, + { + "epoch": 0.04140874487309491, + "grad_norm": 0.17597158253192902, + "learning_rate": 0.00012018808920194921, + "loss": 1.7961626052856445, + "step": 13680 + }, + { + "epoch": 0.04143901442344074, + "grad_norm": 0.17287413775920868, + "learning_rate": 0.00012018429402031183, + "loss": 1.7463478088378905, + "step": 13690 + }, + { + "epoch": 0.04146928397378657, + "grad_norm": 0.1871025562286377, + "learning_rate": 0.00012018049883867442, + "loss": 1.737774658203125, + "step": 13700 + }, + { + "epoch": 0.0414995535241324, + "grad_norm": 0.1619962900876999, + "learning_rate": 0.00012017670365703704, + "loss": 1.7238525390625, + "step": 13710 + }, + { + "epoch": 0.04152982307447823, + "grad_norm": 0.17981526255607605, + "learning_rate": 0.00012017290847539963, + "loss": 1.7079404830932616, + "step": 13720 + }, + { + "epoch": 0.04156009262482406, + "grad_norm": 0.18630073964595795, + "learning_rate": 0.00012016911329376225, + "loss": 1.758395004272461, + "step": 13730 + }, + { + "epoch": 0.04159036217516989, + "grad_norm": 0.18463560938835144, + "learning_rate": 0.00012016531811212484, + "loss": 1.7727378845214843, + "step": 13740 + }, + { + "epoch": 0.041620631725515715, + "grad_norm": 0.16692188382148743, + "learning_rate": 0.00012016152293048746, + "loss": 1.72459716796875, + "step": 13750 + }, + { + "epoch": 0.041650901275861546, + "grad_norm": 0.14725461602210999, + "learning_rate": 0.00012015772774885006, + "loss": 1.7367519378662108, + "step": 13760 + }, + { + "epoch": 0.04168117082620738, + "grad_norm": 0.1960172802209854, + "learning_rate": 0.00012015393256721267, + "loss": 1.7469257354736327, + "step": 13770 + }, + { + "epoch": 0.04171144037655321, + "grad_norm": 0.17779439687728882, + "learning_rate": 0.00012015013738557527, + "loss": 1.7661436080932618, + "step": 13780 + }, + { + "epoch": 0.04174170992689904, + "grad_norm": 0.16867290437221527, + "learning_rate": 0.00012014634220393789, + "loss": 1.7858551025390625, + "step": 13790 + }, + { + "epoch": 0.04177197947724486, + "grad_norm": 0.1975652575492859, + "learning_rate": 0.00012014254702230048, + "loss": 1.7527427673339844, + "step": 13800 + }, + { + "epoch": 0.04180224902759069, + "grad_norm": 0.23240695893764496, + "learning_rate": 0.0001201387518406631, + "loss": 1.7205188751220704, + "step": 13810 + }, + { + "epoch": 0.041832518577936524, + "grad_norm": 0.16113147139549255, + "learning_rate": 0.0001201349566590257, + "loss": 1.733612060546875, + "step": 13820 + }, + { + "epoch": 0.041862788128282355, + "grad_norm": 0.18088199198246002, + "learning_rate": 0.00012013116147738831, + "loss": 1.8175506591796875, + "step": 13830 + }, + { + "epoch": 0.041893057678628186, + "grad_norm": 0.18426766991615295, + "learning_rate": 0.00012012736629575092, + "loss": 1.752951431274414, + "step": 13840 + }, + { + "epoch": 0.04192332722897402, + "grad_norm": 0.18799886107444763, + "learning_rate": 0.00012012357111411352, + "loss": 1.7437236785888672, + "step": 13850 + }, + { + "epoch": 0.04195359677931984, + "grad_norm": 0.17403484880924225, + "learning_rate": 0.00012011977593247613, + "loss": 1.770573043823242, + "step": 13860 + }, + { + "epoch": 0.04198386632966567, + "grad_norm": 0.16681817173957825, + "learning_rate": 0.00012011598075083873, + "loss": 1.7426767349243164, + "step": 13870 + }, + { + "epoch": 0.0420141358800115, + "grad_norm": 0.16127339005470276, + "learning_rate": 0.00012011218556920134, + "loss": 1.7328937530517579, + "step": 13880 + }, + { + "epoch": 0.04204440543035733, + "grad_norm": 0.16737167537212372, + "learning_rate": 0.00012010839038756395, + "loss": 1.7452091217041015, + "step": 13890 + }, + { + "epoch": 0.042074674980703164, + "grad_norm": 0.18835389614105225, + "learning_rate": 0.00012010459520592657, + "loss": 1.779402732849121, + "step": 13900 + }, + { + "epoch": 0.04210494453104899, + "grad_norm": 0.17328964173793793, + "learning_rate": 0.00012010080002428916, + "loss": 1.7611547470092774, + "step": 13910 + }, + { + "epoch": 0.04213521408139482, + "grad_norm": 0.1787450760602951, + "learning_rate": 0.00012009700484265178, + "loss": 1.7475461959838867, + "step": 13920 + }, + { + "epoch": 0.04216548363174065, + "grad_norm": 0.16936291754245758, + "learning_rate": 0.00012009320966101438, + "loss": 1.761199951171875, + "step": 13930 + }, + { + "epoch": 0.04219575318208648, + "grad_norm": 0.17279411852359772, + "learning_rate": 0.00012008941447937699, + "loss": 1.7955635070800782, + "step": 13940 + }, + { + "epoch": 0.04222602273243231, + "grad_norm": 0.1612602174282074, + "learning_rate": 0.0001200856192977396, + "loss": 1.7721523284912108, + "step": 13950 + }, + { + "epoch": 0.04225629228277814, + "grad_norm": 0.2060185670852661, + "learning_rate": 0.0001200818241161022, + "loss": 1.6714910507202148, + "step": 13960 + }, + { + "epoch": 0.042286561833123966, + "grad_norm": 0.1780710071325302, + "learning_rate": 0.00012007802893446481, + "loss": 1.7492908477783202, + "step": 13970 + }, + { + "epoch": 0.0423168313834698, + "grad_norm": 0.17856891453266144, + "learning_rate": 0.00012007423375282741, + "loss": 1.7314262390136719, + "step": 13980 + }, + { + "epoch": 0.04234710093381563, + "grad_norm": 0.2072468250989914, + "learning_rate": 0.00012007043857119002, + "loss": 1.7795562744140625, + "step": 13990 + }, + { + "epoch": 0.04237737048416146, + "grad_norm": 0.16539330780506134, + "learning_rate": 0.00012006664338955263, + "loss": 1.7938018798828126, + "step": 14000 + }, + { + "epoch": 0.04237737048416146, + "eval_loss": 1.7292258739471436, + "eval_runtime": 28.0776, + "eval_samples_per_second": 17.808, + "eval_steps_per_second": 1.14, + "step": 14000 + }, + { + "epoch": 0.04240764003450729, + "grad_norm": 0.19215872883796692, + "learning_rate": 0.00012006284820791523, + "loss": 1.7640308380126952, + "step": 14010 + }, + { + "epoch": 0.04243790958485312, + "grad_norm": 0.1693001687526703, + "learning_rate": 0.00012005905302627784, + "loss": 1.7870361328125, + "step": 14020 + }, + { + "epoch": 0.042468179135198944, + "grad_norm": 0.19060374796390533, + "learning_rate": 0.00012005525784464044, + "loss": 1.7327438354492188, + "step": 14030 + }, + { + "epoch": 0.042498448685544775, + "grad_norm": 0.17156744003295898, + "learning_rate": 0.00012005146266300305, + "loss": 1.7166481018066406, + "step": 14040 + }, + { + "epoch": 0.042528718235890606, + "grad_norm": 0.17401690781116486, + "learning_rate": 0.00012004766748136565, + "loss": 1.7702869415283202, + "step": 14050 + }, + { + "epoch": 0.04255898778623644, + "grad_norm": 0.18804049491882324, + "learning_rate": 0.00012004387229972827, + "loss": 1.7454605102539062, + "step": 14060 + }, + { + "epoch": 0.04258925733658227, + "grad_norm": 0.18308714032173157, + "learning_rate": 0.00012004007711809087, + "loss": 1.7572120666503905, + "step": 14070 + }, + { + "epoch": 0.04261952688692809, + "grad_norm": 0.18224672973155975, + "learning_rate": 0.00012003628193645349, + "loss": 1.7596040725708009, + "step": 14080 + }, + { + "epoch": 0.04264979643727392, + "grad_norm": 0.20016279816627502, + "learning_rate": 0.00012003248675481609, + "loss": 1.7494400024414063, + "step": 14090 + }, + { + "epoch": 0.04268006598761975, + "grad_norm": 0.19592168927192688, + "learning_rate": 0.0001200286915731787, + "loss": 1.7407802581787108, + "step": 14100 + }, + { + "epoch": 0.042710335537965584, + "grad_norm": 0.18456250429153442, + "learning_rate": 0.0001200248963915413, + "loss": 1.7148189544677734, + "step": 14110 + }, + { + "epoch": 0.042740605088311415, + "grad_norm": 0.1865435242652893, + "learning_rate": 0.00012002110120990391, + "loss": 1.7185688018798828, + "step": 14120 + }, + { + "epoch": 0.042770874638657246, + "grad_norm": 0.2091720849275589, + "learning_rate": 0.00012001730602826652, + "loss": 1.7312046051025392, + "step": 14130 + }, + { + "epoch": 0.04280114418900307, + "grad_norm": 0.16481128334999084, + "learning_rate": 0.00012001351084662912, + "loss": 1.733538818359375, + "step": 14140 + }, + { + "epoch": 0.0428314137393489, + "grad_norm": 0.18892452120780945, + "learning_rate": 0.00012000971566499173, + "loss": 1.7565052032470703, + "step": 14150 + }, + { + "epoch": 0.04286168328969473, + "grad_norm": 0.18391379714012146, + "learning_rate": 0.00012000592048335433, + "loss": 1.7563861846923827, + "step": 14160 + }, + { + "epoch": 0.04289195284004056, + "grad_norm": 0.21417047083377838, + "learning_rate": 0.00012000212530171695, + "loss": 1.7419715881347657, + "step": 14170 + }, + { + "epoch": 0.04292222239038639, + "grad_norm": 0.19400015473365784, + "learning_rate": 0.00011999833012007955, + "loss": 1.745657730102539, + "step": 14180 + }, + { + "epoch": 0.042952491940732224, + "grad_norm": 0.1897994577884674, + "learning_rate": 0.00011999453493844217, + "loss": 1.7186363220214844, + "step": 14190 + }, + { + "epoch": 0.04298276149107805, + "grad_norm": 0.16780182719230652, + "learning_rate": 0.00011999073975680476, + "loss": 1.7754467010498047, + "step": 14200 + }, + { + "epoch": 0.04301303104142388, + "grad_norm": 0.1898294985294342, + "learning_rate": 0.00011998694457516738, + "loss": 1.7491655349731445, + "step": 14210 + }, + { + "epoch": 0.04304330059176971, + "grad_norm": 0.2069225311279297, + "learning_rate": 0.00011998314939352997, + "loss": 1.7194015502929687, + "step": 14220 + }, + { + "epoch": 0.04307357014211554, + "grad_norm": 0.1948952078819275, + "learning_rate": 0.00011997935421189259, + "loss": 1.763393020629883, + "step": 14230 + }, + { + "epoch": 0.04310383969246137, + "grad_norm": 0.20133914053440094, + "learning_rate": 0.00011997555903025518, + "loss": 1.7028976440429688, + "step": 14240 + }, + { + "epoch": 0.043134109242807195, + "grad_norm": 0.17606927454471588, + "learning_rate": 0.0001199717638486178, + "loss": 1.7481338500976562, + "step": 14250 + }, + { + "epoch": 0.043164378793153026, + "grad_norm": 0.1832461655139923, + "learning_rate": 0.0001199679686669804, + "loss": 1.766820526123047, + "step": 14260 + }, + { + "epoch": 0.043194648343498857, + "grad_norm": 0.1743033230304718, + "learning_rate": 0.00011996417348534301, + "loss": 1.7463661193847657, + "step": 14270 + }, + { + "epoch": 0.04322491789384469, + "grad_norm": 0.18499088287353516, + "learning_rate": 0.0001199603783037056, + "loss": 1.714169692993164, + "step": 14280 + }, + { + "epoch": 0.04325518744419052, + "grad_norm": 0.17816999554634094, + "learning_rate": 0.00011995658312206822, + "loss": 1.7672019958496095, + "step": 14290 + }, + { + "epoch": 0.04328545699453635, + "grad_norm": 0.18682952225208282, + "learning_rate": 0.00011995278794043084, + "loss": 1.7435739517211915, + "step": 14300 + }, + { + "epoch": 0.04331572654488217, + "grad_norm": 0.1701037734746933, + "learning_rate": 0.00011994899275879344, + "loss": 1.7591400146484375, + "step": 14310 + }, + { + "epoch": 0.043345996095228004, + "grad_norm": 0.1806858777999878, + "learning_rate": 0.00011994519757715606, + "loss": 1.7104784011840821, + "step": 14320 + }, + { + "epoch": 0.043376265645573835, + "grad_norm": 0.17015185952186584, + "learning_rate": 0.00011994140239551865, + "loss": 1.7589088439941407, + "step": 14330 + }, + { + "epoch": 0.043406535195919665, + "grad_norm": 0.1718611866235733, + "learning_rate": 0.00011993760721388127, + "loss": 1.7199819564819336, + "step": 14340 + }, + { + "epoch": 0.043436804746265496, + "grad_norm": 0.18406511843204498, + "learning_rate": 0.00011993381203224386, + "loss": 1.7153614044189454, + "step": 14350 + }, + { + "epoch": 0.04346707429661133, + "grad_norm": 0.20466765761375427, + "learning_rate": 0.00011993001685060648, + "loss": 1.7348785400390625, + "step": 14360 + }, + { + "epoch": 0.04349734384695715, + "grad_norm": 0.16898930072784424, + "learning_rate": 0.00011992622166896907, + "loss": 1.7583480834960938, + "step": 14370 + }, + { + "epoch": 0.04352761339730298, + "grad_norm": 0.20080913603305817, + "learning_rate": 0.00011992242648733169, + "loss": 1.724459457397461, + "step": 14380 + }, + { + "epoch": 0.04355788294764881, + "grad_norm": 0.21476244926452637, + "learning_rate": 0.00011991863130569428, + "loss": 1.7795913696289063, + "step": 14390 + }, + { + "epoch": 0.043588152497994644, + "grad_norm": 0.17869892716407776, + "learning_rate": 0.0001199148361240569, + "loss": 1.7196514129638671, + "step": 14400 + }, + { + "epoch": 0.043618422048340474, + "grad_norm": 0.18494221568107605, + "learning_rate": 0.0001199110409424195, + "loss": 1.7445798873901368, + "step": 14410 + }, + { + "epoch": 0.0436486915986863, + "grad_norm": 0.15888933837413788, + "learning_rate": 0.00011990724576078212, + "loss": 1.7451488494873046, + "step": 14420 + }, + { + "epoch": 0.04367896114903213, + "grad_norm": 0.1917649358510971, + "learning_rate": 0.00011990345057914472, + "loss": 1.7686351776123046, + "step": 14430 + }, + { + "epoch": 0.04370923069937796, + "grad_norm": 0.17262959480285645, + "learning_rate": 0.00011989965539750733, + "loss": 1.7674821853637694, + "step": 14440 + }, + { + "epoch": 0.04373950024972379, + "grad_norm": 0.19667452573776245, + "learning_rate": 0.00011989586021586993, + "loss": 1.7815055847167969, + "step": 14450 + }, + { + "epoch": 0.04376976980006962, + "grad_norm": 0.17918208241462708, + "learning_rate": 0.00011989206503423254, + "loss": 1.703373336791992, + "step": 14460 + }, + { + "epoch": 0.04380003935041545, + "grad_norm": 0.19743920862674713, + "learning_rate": 0.00011988826985259515, + "loss": 1.7512672424316407, + "step": 14470 + }, + { + "epoch": 0.043830308900761276, + "grad_norm": 0.2518566846847534, + "learning_rate": 0.00011988447467095775, + "loss": 1.7639007568359375, + "step": 14480 + }, + { + "epoch": 0.04386057845110711, + "grad_norm": 0.16371209919452667, + "learning_rate": 0.00011988067948932036, + "loss": 1.7775278091430664, + "step": 14490 + }, + { + "epoch": 0.04389084800145294, + "grad_norm": 0.18142881989479065, + "learning_rate": 0.00011987688430768296, + "loss": 1.739966583251953, + "step": 14500 + }, + { + "epoch": 0.04389084800145294, + "eval_loss": 1.7382194995880127, + "eval_runtime": 27.9117, + "eval_samples_per_second": 17.914, + "eval_steps_per_second": 1.146, + "step": 14500 + }, + { + "epoch": 0.04392111755179877, + "grad_norm": 0.17960120737552643, + "learning_rate": 0.00011987308912604558, + "loss": 1.7511640548706056, + "step": 14510 + }, + { + "epoch": 0.0439513871021446, + "grad_norm": 0.18513600528240204, + "learning_rate": 0.00011986929394440818, + "loss": 1.765946388244629, + "step": 14520 + }, + { + "epoch": 0.04398165665249043, + "grad_norm": 0.1887723058462143, + "learning_rate": 0.0001198654987627708, + "loss": 1.7450008392333984, + "step": 14530 + }, + { + "epoch": 0.044011926202836255, + "grad_norm": 0.21786852180957794, + "learning_rate": 0.00011986170358113339, + "loss": 1.7374395370483398, + "step": 14540 + }, + { + "epoch": 0.044042195753182085, + "grad_norm": 0.18950217962265015, + "learning_rate": 0.000119857908399496, + "loss": 1.7683862686157226, + "step": 14550 + }, + { + "epoch": 0.044072465303527916, + "grad_norm": 0.17175182700157166, + "learning_rate": 0.00011985411321785861, + "loss": 1.7274057388305664, + "step": 14560 + }, + { + "epoch": 0.04410273485387375, + "grad_norm": 0.15343539416790009, + "learning_rate": 0.00011985031803622122, + "loss": 1.7041234970092773, + "step": 14570 + }, + { + "epoch": 0.04413300440421958, + "grad_norm": 0.18026931583881378, + "learning_rate": 0.00011984652285458382, + "loss": 1.7521316528320312, + "step": 14580 + }, + { + "epoch": 0.0441632739545654, + "grad_norm": 0.1646578460931778, + "learning_rate": 0.00011984272767294643, + "loss": 1.7265161514282226, + "step": 14590 + }, + { + "epoch": 0.04419354350491123, + "grad_norm": 0.18639174103736877, + "learning_rate": 0.00011983893249130904, + "loss": 1.6800477981567383, + "step": 14600 + }, + { + "epoch": 0.044223813055257064, + "grad_norm": 0.2035665065050125, + "learning_rate": 0.00011983513730967164, + "loss": 1.7011871337890625, + "step": 14610 + }, + { + "epoch": 0.044254082605602894, + "grad_norm": 0.19389726221561432, + "learning_rate": 0.00011983134212803425, + "loss": 1.7325912475585938, + "step": 14620 + }, + { + "epoch": 0.044284352155948725, + "grad_norm": 0.15475638210773468, + "learning_rate": 0.00011982754694639685, + "loss": 1.7419193267822266, + "step": 14630 + }, + { + "epoch": 0.044314621706294556, + "grad_norm": 0.19030606746673584, + "learning_rate": 0.00011982375176475946, + "loss": 1.753494644165039, + "step": 14640 + }, + { + "epoch": 0.04434489125664038, + "grad_norm": 0.1568521410226822, + "learning_rate": 0.00011981995658312207, + "loss": 1.7561931610107422, + "step": 14650 + }, + { + "epoch": 0.04437516080698621, + "grad_norm": 0.19153009355068207, + "learning_rate": 0.00011981616140148467, + "loss": 1.7164268493652344, + "step": 14660 + }, + { + "epoch": 0.04440543035733204, + "grad_norm": 0.1543383151292801, + "learning_rate": 0.00011981236621984729, + "loss": 1.7567937850952149, + "step": 14670 + }, + { + "epoch": 0.04443569990767787, + "grad_norm": 0.14424064755439758, + "learning_rate": 0.00011980857103820988, + "loss": 1.7455873489379883, + "step": 14680 + }, + { + "epoch": 0.0444659694580237, + "grad_norm": 0.21119561791419983, + "learning_rate": 0.0001198047758565725, + "loss": 1.7436794281005858, + "step": 14690 + }, + { + "epoch": 0.04449623900836953, + "grad_norm": 0.15824276208877563, + "learning_rate": 0.00011980098067493511, + "loss": 1.725181770324707, + "step": 14700 + }, + { + "epoch": 0.04452650855871536, + "grad_norm": 0.16568604111671448, + "learning_rate": 0.00011979718549329772, + "loss": 1.7349466323852538, + "step": 14710 + }, + { + "epoch": 0.04455677810906119, + "grad_norm": 0.19671747088432312, + "learning_rate": 0.00011979339031166032, + "loss": 1.755495262145996, + "step": 14720 + }, + { + "epoch": 0.04458704765940702, + "grad_norm": 0.17975610494613647, + "learning_rate": 0.00011978959513002293, + "loss": 1.7253509521484376, + "step": 14730 + }, + { + "epoch": 0.04461731720975285, + "grad_norm": 0.21265223622322083, + "learning_rate": 0.00011978579994838553, + "loss": 1.7288043975830079, + "step": 14740 + }, + { + "epoch": 0.04464758676009868, + "grad_norm": 0.17312440276145935, + "learning_rate": 0.00011978200476674814, + "loss": 1.7547290802001954, + "step": 14750 + }, + { + "epoch": 0.044677856310444505, + "grad_norm": 0.20605690777301788, + "learning_rate": 0.00011977820958511075, + "loss": 1.7467412948608398, + "step": 14760 + }, + { + "epoch": 0.044708125860790336, + "grad_norm": 0.15189123153686523, + "learning_rate": 0.00011977441440347335, + "loss": 1.7274515151977539, + "step": 14770 + }, + { + "epoch": 0.04473839541113617, + "grad_norm": 0.18519970774650574, + "learning_rate": 0.00011977061922183596, + "loss": 1.7497386932373047, + "step": 14780 + }, + { + "epoch": 0.044768664961482, + "grad_norm": 0.17603541910648346, + "learning_rate": 0.00011976682404019856, + "loss": 1.7349891662597656, + "step": 14790 + }, + { + "epoch": 0.04479893451182783, + "grad_norm": 0.16185466945171356, + "learning_rate": 0.00011976302885856118, + "loss": 1.697409439086914, + "step": 14800 + }, + { + "epoch": 0.04482920406217366, + "grad_norm": 0.19471588730812073, + "learning_rate": 0.00011975923367692377, + "loss": 1.7182979583740234, + "step": 14810 + }, + { + "epoch": 0.04485947361251948, + "grad_norm": 0.15572790801525116, + "learning_rate": 0.0001197554384952864, + "loss": 1.7081634521484375, + "step": 14820 + }, + { + "epoch": 0.044889743162865314, + "grad_norm": 0.17412185668945312, + "learning_rate": 0.00011975164331364899, + "loss": 1.7417400360107422, + "step": 14830 + }, + { + "epoch": 0.044920012713211145, + "grad_norm": 0.19875513017177582, + "learning_rate": 0.0001197478481320116, + "loss": 1.7548625946044922, + "step": 14840 + }, + { + "epoch": 0.044950282263556976, + "grad_norm": 0.16934682428836823, + "learning_rate": 0.0001197440529503742, + "loss": 1.7366930007934571, + "step": 14850 + }, + { + "epoch": 0.04498055181390281, + "grad_norm": 0.14857904613018036, + "learning_rate": 0.00011974025776873682, + "loss": 1.7375789642333985, + "step": 14860 + }, + { + "epoch": 0.04501082136424863, + "grad_norm": 0.18197095394134521, + "learning_rate": 0.00011973646258709941, + "loss": 1.741905975341797, + "step": 14870 + }, + { + "epoch": 0.04504109091459446, + "grad_norm": 0.19837833940982819, + "learning_rate": 0.00011973266740546203, + "loss": 1.735296630859375, + "step": 14880 + }, + { + "epoch": 0.04507136046494029, + "grad_norm": 0.18432314693927765, + "learning_rate": 0.00011972887222382462, + "loss": 1.7460723876953126, + "step": 14890 + }, + { + "epoch": 0.04510163001528612, + "grad_norm": 0.17462560534477234, + "learning_rate": 0.00011972507704218724, + "loss": 1.7105358123779297, + "step": 14900 + }, + { + "epoch": 0.045131899565631954, + "grad_norm": 0.1874571144580841, + "learning_rate": 0.00011972128186054986, + "loss": 1.7021743774414062, + "step": 14910 + }, + { + "epoch": 0.045162169115977785, + "grad_norm": 0.20673780143260956, + "learning_rate": 0.00011971748667891245, + "loss": 1.7012874603271484, + "step": 14920 + }, + { + "epoch": 0.04519243866632361, + "grad_norm": 0.1877063363790512, + "learning_rate": 0.00011971369149727507, + "loss": 1.707111930847168, + "step": 14930 + }, + { + "epoch": 0.04522270821666944, + "grad_norm": 0.17107343673706055, + "learning_rate": 0.00011970989631563767, + "loss": 1.7342489242553711, + "step": 14940 + }, + { + "epoch": 0.04525297776701527, + "grad_norm": 0.16976042091846466, + "learning_rate": 0.00011970610113400029, + "loss": 1.7543094635009766, + "step": 14950 + }, + { + "epoch": 0.0452832473173611, + "grad_norm": 0.18398115038871765, + "learning_rate": 0.00011970230595236288, + "loss": 1.7484050750732423, + "step": 14960 + }, + { + "epoch": 0.04531351686770693, + "grad_norm": 0.17024371027946472, + "learning_rate": 0.0001196985107707255, + "loss": 1.7258453369140625, + "step": 14970 + }, + { + "epoch": 0.04534378641805276, + "grad_norm": 0.17193388938903809, + "learning_rate": 0.00011969471558908809, + "loss": 1.7651382446289063, + "step": 14980 + }, + { + "epoch": 0.04537405596839859, + "grad_norm": 0.18811486661434174, + "learning_rate": 0.00011969092040745071, + "loss": 1.7081710815429687, + "step": 14990 + }, + { + "epoch": 0.04540432551874442, + "grad_norm": 0.19252943992614746, + "learning_rate": 0.0001196871252258133, + "loss": 1.7302135467529296, + "step": 15000 + }, + { + "epoch": 0.04540432551874442, + "eval_loss": 1.7345775365829468, + "eval_runtime": 28.3299, + "eval_samples_per_second": 17.649, + "eval_steps_per_second": 1.13, + "step": 15000 + }, + { + "epoch": 0.04543459506909025, + "grad_norm": 0.18548524379730225, + "learning_rate": 0.00011968333004417592, + "loss": 1.7565540313720702, + "step": 15010 + }, + { + "epoch": 0.04546486461943608, + "grad_norm": 0.19831228256225586, + "learning_rate": 0.00011967953486253851, + "loss": 1.7662382125854492, + "step": 15020 + }, + { + "epoch": 0.04549513416978191, + "grad_norm": 0.1789189875125885, + "learning_rate": 0.00011967573968090113, + "loss": 1.7150768280029296, + "step": 15030 + }, + { + "epoch": 0.045525403720127734, + "grad_norm": 0.19308000802993774, + "learning_rate": 0.00011967194449926374, + "loss": 1.7378524780273437, + "step": 15040 + }, + { + "epoch": 0.045555673270473565, + "grad_norm": 0.1659131646156311, + "learning_rate": 0.00011966814931762634, + "loss": 1.723185157775879, + "step": 15050 + }, + { + "epoch": 0.045585942820819396, + "grad_norm": 0.1883753538131714, + "learning_rate": 0.00011966435413598895, + "loss": 1.7174747467041016, + "step": 15060 + }, + { + "epoch": 0.04561621237116523, + "grad_norm": 0.1673242747783661, + "learning_rate": 0.00011966055895435156, + "loss": 1.70947265625, + "step": 15070 + }, + { + "epoch": 0.04564648192151106, + "grad_norm": 0.16948828101158142, + "learning_rate": 0.00011965676377271416, + "loss": 1.7875238418579102, + "step": 15080 + }, + { + "epoch": 0.04567675147185689, + "grad_norm": 0.2127043455839157, + "learning_rate": 0.00011965296859107677, + "loss": 1.7657604217529297, + "step": 15090 + }, + { + "epoch": 0.04570702102220271, + "grad_norm": 0.15054194629192352, + "learning_rate": 0.00011964917340943937, + "loss": 1.7654430389404296, + "step": 15100 + }, + { + "epoch": 0.04573729057254854, + "grad_norm": 0.17622110247612, + "learning_rate": 0.00011964537822780198, + "loss": 1.7157949447631835, + "step": 15110 + }, + { + "epoch": 0.045767560122894374, + "grad_norm": 0.16868750751018524, + "learning_rate": 0.0001196415830461646, + "loss": 1.7459602355957031, + "step": 15120 + }, + { + "epoch": 0.045797829673240205, + "grad_norm": 0.16735656559467316, + "learning_rate": 0.00011963778786452719, + "loss": 1.7019775390625, + "step": 15130 + }, + { + "epoch": 0.045828099223586036, + "grad_norm": 0.14676278829574585, + "learning_rate": 0.00011963399268288981, + "loss": 1.7121162414550781, + "step": 15140 + }, + { + "epoch": 0.04585836877393187, + "grad_norm": 0.17670106887817383, + "learning_rate": 0.0001196301975012524, + "loss": 1.7394733428955078, + "step": 15150 + }, + { + "epoch": 0.04588863832427769, + "grad_norm": 0.15814396739006042, + "learning_rate": 0.00011962640231961502, + "loss": 1.7294940948486328, + "step": 15160 + }, + { + "epoch": 0.04591890787462352, + "grad_norm": 0.18856281042099, + "learning_rate": 0.00011962260713797763, + "loss": 1.7510719299316406, + "step": 15170 + }, + { + "epoch": 0.04594917742496935, + "grad_norm": 0.16161905229091644, + "learning_rate": 0.00011961881195634024, + "loss": 1.7212028503417969, + "step": 15180 + }, + { + "epoch": 0.04597944697531518, + "grad_norm": 0.15753786265850067, + "learning_rate": 0.00011961501677470284, + "loss": 1.7004146575927734, + "step": 15190 + }, + { + "epoch": 0.046009716525661014, + "grad_norm": 0.16932277381420135, + "learning_rate": 0.00011961122159306545, + "loss": 1.7315086364746093, + "step": 15200 + }, + { + "epoch": 0.04603998607600684, + "grad_norm": 0.18362613022327423, + "learning_rate": 0.00011960742641142805, + "loss": 1.739221954345703, + "step": 15210 + }, + { + "epoch": 0.04607025562635267, + "grad_norm": 0.17440269887447357, + "learning_rate": 0.00011960363122979066, + "loss": 1.7284454345703124, + "step": 15220 + }, + { + "epoch": 0.0461005251766985, + "grad_norm": 0.1697126179933548, + "learning_rate": 0.00011959983604815327, + "loss": 1.768639373779297, + "step": 15230 + }, + { + "epoch": 0.04613079472704433, + "grad_norm": 0.16622565686702728, + "learning_rate": 0.00011959604086651587, + "loss": 1.7401168823242188, + "step": 15240 + }, + { + "epoch": 0.04616106427739016, + "grad_norm": 0.16923129558563232, + "learning_rate": 0.00011959224568487848, + "loss": 1.7418886184692384, + "step": 15250 + }, + { + "epoch": 0.04619133382773599, + "grad_norm": 0.18817614018917084, + "learning_rate": 0.00011958845050324108, + "loss": 1.7238922119140625, + "step": 15260 + }, + { + "epoch": 0.046221603378081816, + "grad_norm": 0.16008564829826355, + "learning_rate": 0.00011958465532160369, + "loss": 1.7335784912109375, + "step": 15270 + }, + { + "epoch": 0.04625187292842765, + "grad_norm": 0.17186567187309265, + "learning_rate": 0.00011958086013996631, + "loss": 1.721875, + "step": 15280 + }, + { + "epoch": 0.04628214247877348, + "grad_norm": 0.1632358282804489, + "learning_rate": 0.0001195770649583289, + "loss": 1.7727594375610352, + "step": 15290 + }, + { + "epoch": 0.04631241202911931, + "grad_norm": 0.1610432267189026, + "learning_rate": 0.00011957326977669152, + "loss": 1.7836874008178711, + "step": 15300 + }, + { + "epoch": 0.04634268157946514, + "grad_norm": 0.19367088377475739, + "learning_rate": 0.00011956947459505413, + "loss": 1.729536247253418, + "step": 15310 + }, + { + "epoch": 0.04637295112981097, + "grad_norm": 0.19382606446743011, + "learning_rate": 0.00011956567941341673, + "loss": 1.722417449951172, + "step": 15320 + }, + { + "epoch": 0.046403220680156794, + "grad_norm": 0.17965832352638245, + "learning_rate": 0.00011956188423177934, + "loss": 1.7053428649902345, + "step": 15330 + }, + { + "epoch": 0.046433490230502625, + "grad_norm": 0.1853189468383789, + "learning_rate": 0.00011955808905014194, + "loss": 1.735154151916504, + "step": 15340 + }, + { + "epoch": 0.046463759780848456, + "grad_norm": 0.18838711082935333, + "learning_rate": 0.00011955429386850455, + "loss": 1.6880496978759765, + "step": 15350 + }, + { + "epoch": 0.046494029331194286, + "grad_norm": 0.1892237812280655, + "learning_rate": 0.00011955049868686716, + "loss": 1.6983619689941407, + "step": 15360 + }, + { + "epoch": 0.04652429888154012, + "grad_norm": 0.17441150546073914, + "learning_rate": 0.00011954670350522976, + "loss": 1.7504875183105468, + "step": 15370 + }, + { + "epoch": 0.04655456843188594, + "grad_norm": 0.18130230903625488, + "learning_rate": 0.00011954290832359237, + "loss": 1.7428390502929687, + "step": 15380 + }, + { + "epoch": 0.04658483798223177, + "grad_norm": 0.1810934841632843, + "learning_rate": 0.00011953911314195497, + "loss": 1.7162420272827148, + "step": 15390 + }, + { + "epoch": 0.0466151075325776, + "grad_norm": 0.16861186921596527, + "learning_rate": 0.00011953531796031758, + "loss": 1.7042919158935548, + "step": 15400 + }, + { + "epoch": 0.046645377082923434, + "grad_norm": 0.2005193829536438, + "learning_rate": 0.0001195315227786802, + "loss": 1.740058708190918, + "step": 15410 + }, + { + "epoch": 0.046675646633269265, + "grad_norm": 0.20071950554847717, + "learning_rate": 0.00011952772759704279, + "loss": 1.6804908752441405, + "step": 15420 + }, + { + "epoch": 0.046705916183615095, + "grad_norm": 0.20389804244041443, + "learning_rate": 0.00011952393241540541, + "loss": 1.734159278869629, + "step": 15430 + }, + { + "epoch": 0.04673618573396092, + "grad_norm": 0.15580898523330688, + "learning_rate": 0.000119520137233768, + "loss": 1.7160850524902345, + "step": 15440 + }, + { + "epoch": 0.04676645528430675, + "grad_norm": 0.18770413100719452, + "learning_rate": 0.00011951634205213062, + "loss": 1.7263519287109375, + "step": 15450 + }, + { + "epoch": 0.04679672483465258, + "grad_norm": 0.18875491619110107, + "learning_rate": 0.00011951254687049322, + "loss": 1.7284862518310546, + "step": 15460 + }, + { + "epoch": 0.04682699438499841, + "grad_norm": 0.16365402936935425, + "learning_rate": 0.00011950875168885584, + "loss": 1.7496402740478516, + "step": 15470 + }, + { + "epoch": 0.04685726393534424, + "grad_norm": 0.17380517721176147, + "learning_rate": 0.00011950495650721843, + "loss": 1.7212854385375977, + "step": 15480 + }, + { + "epoch": 0.04688753348569007, + "grad_norm": 0.16368021070957184, + "learning_rate": 0.00011950116132558105, + "loss": 1.7705076217651368, + "step": 15490 + }, + { + "epoch": 0.0469178030360359, + "grad_norm": 0.1631741225719452, + "learning_rate": 0.00011949736614394364, + "loss": 1.7607147216796875, + "step": 15500 + }, + { + "epoch": 0.0469178030360359, + "eval_loss": 1.7325981855392456, + "eval_runtime": 28.1448, + "eval_samples_per_second": 17.765, + "eval_steps_per_second": 1.137, + "step": 15500 + }, + { + "epoch": 0.04694807258638173, + "grad_norm": 0.17129811644554138, + "learning_rate": 0.00011949357096230626, + "loss": 1.7494564056396484, + "step": 15510 + }, + { + "epoch": 0.04697834213672756, + "grad_norm": 0.18354876339435577, + "learning_rate": 0.00011948977578066887, + "loss": 1.719790267944336, + "step": 15520 + }, + { + "epoch": 0.04700861168707339, + "grad_norm": 0.18157967925071716, + "learning_rate": 0.00011948598059903147, + "loss": 1.7199769973754884, + "step": 15530 + }, + { + "epoch": 0.04703888123741922, + "grad_norm": 0.17455241084098816, + "learning_rate": 0.00011948218541739409, + "loss": 1.708387565612793, + "step": 15540 + }, + { + "epoch": 0.047069150787765045, + "grad_norm": 0.17068909108638763, + "learning_rate": 0.00011947839023575668, + "loss": 1.7329776763916016, + "step": 15550 + }, + { + "epoch": 0.047099420338110876, + "grad_norm": 0.18007458746433258, + "learning_rate": 0.0001194745950541193, + "loss": 1.7380970001220704, + "step": 15560 + }, + { + "epoch": 0.047129689888456706, + "grad_norm": 0.23683296144008636, + "learning_rate": 0.0001194707998724819, + "loss": 1.741158676147461, + "step": 15570 + }, + { + "epoch": 0.04715995943880254, + "grad_norm": 0.1955578476190567, + "learning_rate": 0.00011946700469084451, + "loss": 1.7500732421875, + "step": 15580 + }, + { + "epoch": 0.04719022898914837, + "grad_norm": 0.21719832718372345, + "learning_rate": 0.00011946320950920711, + "loss": 1.735059928894043, + "step": 15590 + }, + { + "epoch": 0.0472204985394942, + "grad_norm": 0.19840635359287262, + "learning_rate": 0.00011945941432756973, + "loss": 1.7428953170776367, + "step": 15600 + }, + { + "epoch": 0.04725076808984002, + "grad_norm": 0.22100575268268585, + "learning_rate": 0.00011945561914593232, + "loss": 1.699905776977539, + "step": 15610 + }, + { + "epoch": 0.047281037640185854, + "grad_norm": 0.3211846649646759, + "learning_rate": 0.00011945182396429494, + "loss": 1.7499166488647462, + "step": 15620 + }, + { + "epoch": 0.047311307190531685, + "grad_norm": 0.19388148188591003, + "learning_rate": 0.00011944802878265753, + "loss": 1.706789779663086, + "step": 15630 + }, + { + "epoch": 0.047341576740877515, + "grad_norm": 0.17883457243442535, + "learning_rate": 0.00011944423360102015, + "loss": 1.6941719055175781, + "step": 15640 + }, + { + "epoch": 0.047371846291223346, + "grad_norm": 0.17270627617835999, + "learning_rate": 0.00011944043841938274, + "loss": 1.705789566040039, + "step": 15650 + }, + { + "epoch": 0.04740211584156917, + "grad_norm": 0.16856633126735687, + "learning_rate": 0.00011943664323774536, + "loss": 1.7464153289794921, + "step": 15660 + }, + { + "epoch": 0.047432385391915, + "grad_norm": 0.16398704051971436, + "learning_rate": 0.00011943284805610797, + "loss": 1.746862030029297, + "step": 15670 + }, + { + "epoch": 0.04746265494226083, + "grad_norm": 0.18721163272857666, + "learning_rate": 0.00011942905287447057, + "loss": 1.7569931030273438, + "step": 15680 + }, + { + "epoch": 0.04749292449260666, + "grad_norm": 0.17386965453624725, + "learning_rate": 0.00011942525769283318, + "loss": 1.778826904296875, + "step": 15690 + }, + { + "epoch": 0.047523194042952493, + "grad_norm": 0.17368023097515106, + "learning_rate": 0.00011942146251119579, + "loss": 1.748598289489746, + "step": 15700 + }, + { + "epoch": 0.047553463593298324, + "grad_norm": 0.17626537382602692, + "learning_rate": 0.0001194176673295584, + "loss": 1.7629823684692383, + "step": 15710 + }, + { + "epoch": 0.04758373314364415, + "grad_norm": 0.1879081130027771, + "learning_rate": 0.000119413872147921, + "loss": 1.7326570510864259, + "step": 15720 + }, + { + "epoch": 0.04761400269398998, + "grad_norm": 0.19845059514045715, + "learning_rate": 0.00011941007696628362, + "loss": 1.7438514709472657, + "step": 15730 + }, + { + "epoch": 0.04764427224433581, + "grad_norm": 0.16456186771392822, + "learning_rate": 0.00011940628178464621, + "loss": 1.7094642639160156, + "step": 15740 + }, + { + "epoch": 0.04767454179468164, + "grad_norm": 0.1595337986946106, + "learning_rate": 0.00011940248660300883, + "loss": 1.707891845703125, + "step": 15750 + }, + { + "epoch": 0.04770481134502747, + "grad_norm": 0.1813562661409378, + "learning_rate": 0.00011939869142137142, + "loss": 1.7679725646972657, + "step": 15760 + }, + { + "epoch": 0.0477350808953733, + "grad_norm": 0.16095314919948578, + "learning_rate": 0.00011939489623973404, + "loss": 1.7094100952148437, + "step": 15770 + }, + { + "epoch": 0.047765350445719126, + "grad_norm": 0.18492421507835388, + "learning_rate": 0.00011939110105809665, + "loss": 1.7345800399780273, + "step": 15780 + }, + { + "epoch": 0.04779561999606496, + "grad_norm": 0.18340153992176056, + "learning_rate": 0.00011938730587645925, + "loss": 1.7166151046752929, + "step": 15790 + }, + { + "epoch": 0.04782588954641079, + "grad_norm": 0.18819847702980042, + "learning_rate": 0.00011938351069482186, + "loss": 1.7510547637939453, + "step": 15800 + }, + { + "epoch": 0.04785615909675662, + "grad_norm": 0.19238974153995514, + "learning_rate": 0.00011937971551318446, + "loss": 1.7247043609619142, + "step": 15810 + }, + { + "epoch": 0.04788642864710245, + "grad_norm": 0.1580587923526764, + "learning_rate": 0.00011937592033154707, + "loss": 1.6937122344970703, + "step": 15820 + }, + { + "epoch": 0.047916698197448274, + "grad_norm": 0.15863528847694397, + "learning_rate": 0.00011937212514990968, + "loss": 1.7514890670776366, + "step": 15830 + }, + { + "epoch": 0.047946967747794104, + "grad_norm": 0.17033129930496216, + "learning_rate": 0.00011936832996827228, + "loss": 1.7183685302734375, + "step": 15840 + }, + { + "epoch": 0.047977237298139935, + "grad_norm": 0.21745972335338593, + "learning_rate": 0.00011936453478663489, + "loss": 1.7078773498535156, + "step": 15850 + }, + { + "epoch": 0.048007506848485766, + "grad_norm": 0.1715000569820404, + "learning_rate": 0.0001193607396049975, + "loss": 1.761247444152832, + "step": 15860 + }, + { + "epoch": 0.0480377763988316, + "grad_norm": 0.17939244210720062, + "learning_rate": 0.0001193569444233601, + "loss": 1.6997785568237305, + "step": 15870 + }, + { + "epoch": 0.04806804594917743, + "grad_norm": 0.18657276034355164, + "learning_rate": 0.0001193531492417227, + "loss": 1.7243152618408204, + "step": 15880 + }, + { + "epoch": 0.04809831549952325, + "grad_norm": 0.18418394029140472, + "learning_rate": 0.00011934935406008531, + "loss": 1.7594295501708985, + "step": 15890 + }, + { + "epoch": 0.04812858504986908, + "grad_norm": 0.1711241453886032, + "learning_rate": 0.00011934555887844792, + "loss": 1.716511344909668, + "step": 15900 + }, + { + "epoch": 0.04815885460021491, + "grad_norm": 0.1686420738697052, + "learning_rate": 0.00011934176369681054, + "loss": 1.727484893798828, + "step": 15910 + }, + { + "epoch": 0.048189124150560744, + "grad_norm": 0.17064258456230164, + "learning_rate": 0.00011933796851517314, + "loss": 1.713526725769043, + "step": 15920 + }, + { + "epoch": 0.048219393700906575, + "grad_norm": 0.16063976287841797, + "learning_rate": 0.00011933417333353575, + "loss": 1.7214115142822266, + "step": 15930 + }, + { + "epoch": 0.048249663251252406, + "grad_norm": 0.16921308636665344, + "learning_rate": 0.00011933037815189836, + "loss": 1.7121997833251954, + "step": 15940 + }, + { + "epoch": 0.04827993280159823, + "grad_norm": 0.14775428175926208, + "learning_rate": 0.00011932658297026096, + "loss": 1.7370136260986329, + "step": 15950 + }, + { + "epoch": 0.04831020235194406, + "grad_norm": 0.177538201212883, + "learning_rate": 0.00011932278778862357, + "loss": 1.6949077606201173, + "step": 15960 + }, + { + "epoch": 0.04834047190228989, + "grad_norm": 0.17004625499248505, + "learning_rate": 0.00011931899260698617, + "loss": 1.7189868927001952, + "step": 15970 + }, + { + "epoch": 0.04837074145263572, + "grad_norm": 0.19114652276039124, + "learning_rate": 0.00011931519742534878, + "loss": 1.79437198638916, + "step": 15980 + }, + { + "epoch": 0.04840101100298155, + "grad_norm": 0.18796400725841522, + "learning_rate": 0.00011931140224371139, + "loss": 1.7473373413085938, + "step": 15990 + }, + { + "epoch": 0.04843128055332738, + "grad_norm": 0.1714278757572174, + "learning_rate": 0.00011930760706207399, + "loss": 1.716026496887207, + "step": 16000 + }, + { + "epoch": 0.04843128055332738, + "eval_loss": 1.7301207780838013, + "eval_runtime": 28.4653, + "eval_samples_per_second": 17.565, + "eval_steps_per_second": 1.124, + "step": 16000 + }, + { + "epoch": 0.04846155010367321, + "grad_norm": 0.1642516851425171, + "learning_rate": 0.0001193038118804366, + "loss": 1.7399181365966796, + "step": 16010 + }, + { + "epoch": 0.04849181965401904, + "grad_norm": 0.16926062107086182, + "learning_rate": 0.00011930001669879922, + "loss": 1.7193208694458009, + "step": 16020 + }, + { + "epoch": 0.04852208920436487, + "grad_norm": 0.17379391193389893, + "learning_rate": 0.00011929622151716181, + "loss": 1.7203481674194336, + "step": 16030 + }, + { + "epoch": 0.0485523587547107, + "grad_norm": 0.17757444083690643, + "learning_rate": 0.00011929242633552443, + "loss": 1.7236614227294922, + "step": 16040 + }, + { + "epoch": 0.04858262830505653, + "grad_norm": 0.17789089679718018, + "learning_rate": 0.00011928863115388702, + "loss": 1.755011749267578, + "step": 16050 + }, + { + "epoch": 0.048612897855402355, + "grad_norm": 0.172845721244812, + "learning_rate": 0.00011928483597224964, + "loss": 1.7246267318725585, + "step": 16060 + }, + { + "epoch": 0.048643167405748186, + "grad_norm": 0.18075011670589447, + "learning_rate": 0.00011928104079061223, + "loss": 1.7528104782104492, + "step": 16070 + }, + { + "epoch": 0.04867343695609402, + "grad_norm": 0.16264793276786804, + "learning_rate": 0.00011927724560897485, + "loss": 1.735614013671875, + "step": 16080 + }, + { + "epoch": 0.04870370650643985, + "grad_norm": 0.17408204078674316, + "learning_rate": 0.00011927345042733744, + "loss": 1.7520442962646485, + "step": 16090 + }, + { + "epoch": 0.04873397605678568, + "grad_norm": 0.17187224328517914, + "learning_rate": 0.00011926965524570006, + "loss": 1.6915679931640626, + "step": 16100 + }, + { + "epoch": 0.04876424560713151, + "grad_norm": 0.16065573692321777, + "learning_rate": 0.00011926586006406266, + "loss": 1.6836845397949218, + "step": 16110 + }, + { + "epoch": 0.04879451515747733, + "grad_norm": 0.187100350856781, + "learning_rate": 0.00011926206488242528, + "loss": 1.7009056091308594, + "step": 16120 + }, + { + "epoch": 0.048824784707823164, + "grad_norm": 0.15974736213684082, + "learning_rate": 0.00011925826970078788, + "loss": 1.6756366729736327, + "step": 16130 + }, + { + "epoch": 0.048855054258168995, + "grad_norm": 0.1867290437221527, + "learning_rate": 0.00011925447451915049, + "loss": 1.747239875793457, + "step": 16140 + }, + { + "epoch": 0.048885323808514826, + "grad_norm": 0.1677830070257187, + "learning_rate": 0.00011925067933751311, + "loss": 1.742671585083008, + "step": 16150 + }, + { + "epoch": 0.04891559335886066, + "grad_norm": 0.17163926362991333, + "learning_rate": 0.0001192468841558757, + "loss": 1.7407196044921875, + "step": 16160 + }, + { + "epoch": 0.04894586290920648, + "grad_norm": 0.16844968497753143, + "learning_rate": 0.00011924308897423832, + "loss": 1.7245170593261718, + "step": 16170 + }, + { + "epoch": 0.04897613245955231, + "grad_norm": 0.17876200377941132, + "learning_rate": 0.00011923929379260091, + "loss": 1.7388313293457032, + "step": 16180 + }, + { + "epoch": 0.04900640200989814, + "grad_norm": 0.2417580485343933, + "learning_rate": 0.00011923549861096353, + "loss": 1.7740646362304688, + "step": 16190 + }, + { + "epoch": 0.04903667156024397, + "grad_norm": 0.1916506439447403, + "learning_rate": 0.00011923170342932612, + "loss": 1.759440040588379, + "step": 16200 + }, + { + "epoch": 0.049066941110589804, + "grad_norm": 0.23565927147865295, + "learning_rate": 0.00011922790824768874, + "loss": 1.775625228881836, + "step": 16210 + }, + { + "epoch": 0.049097210660935635, + "grad_norm": 0.20933224260807037, + "learning_rate": 0.00011922411306605134, + "loss": 1.7227685928344727, + "step": 16220 + }, + { + "epoch": 0.04912748021128146, + "grad_norm": 0.18862345814704895, + "learning_rate": 0.00011922031788441396, + "loss": 1.720424270629883, + "step": 16230 + }, + { + "epoch": 0.04915774976162729, + "grad_norm": 0.192548930644989, + "learning_rate": 0.00011921652270277655, + "loss": 1.7140825271606446, + "step": 16240 + }, + { + "epoch": 0.04918801931197312, + "grad_norm": 0.16607536375522614, + "learning_rate": 0.00011921272752113917, + "loss": 1.738730239868164, + "step": 16250 + }, + { + "epoch": 0.04921828886231895, + "grad_norm": 0.15898066759109497, + "learning_rate": 0.00011920893233950176, + "loss": 1.7489986419677734, + "step": 16260 + }, + { + "epoch": 0.04924855841266478, + "grad_norm": 0.1590709686279297, + "learning_rate": 0.00011920513715786438, + "loss": 1.7332189559936524, + "step": 16270 + }, + { + "epoch": 0.049278827963010606, + "grad_norm": 0.18513372540473938, + "learning_rate": 0.00011920134197622699, + "loss": 1.7153732299804687, + "step": 16280 + }, + { + "epoch": 0.04930909751335644, + "grad_norm": 0.19138173758983612, + "learning_rate": 0.00011919754679458959, + "loss": 1.674129104614258, + "step": 16290 + }, + { + "epoch": 0.04933936706370227, + "grad_norm": 0.18398012220859528, + "learning_rate": 0.0001191937516129522, + "loss": 1.762230682373047, + "step": 16300 + }, + { + "epoch": 0.0493696366140481, + "grad_norm": 0.16735167801380157, + "learning_rate": 0.0001191899564313148, + "loss": 1.7491022109985352, + "step": 16310 + }, + { + "epoch": 0.04939990616439393, + "grad_norm": 0.18889065086841583, + "learning_rate": 0.00011918616124967742, + "loss": 1.696385383605957, + "step": 16320 + }, + { + "epoch": 0.04943017571473976, + "grad_norm": 0.2125576138496399, + "learning_rate": 0.00011918236606804001, + "loss": 1.7066619873046875, + "step": 16330 + }, + { + "epoch": 0.049460445265085584, + "grad_norm": 0.19219323992729187, + "learning_rate": 0.00011917857088640263, + "loss": 1.749522590637207, + "step": 16340 + }, + { + "epoch": 0.049490714815431415, + "grad_norm": 0.1730070561170578, + "learning_rate": 0.00011917477570476523, + "loss": 1.7332019805908203, + "step": 16350 + }, + { + "epoch": 0.049520984365777246, + "grad_norm": 0.22810667753219604, + "learning_rate": 0.00011917098052312785, + "loss": 1.7616985321044922, + "step": 16360 + }, + { + "epoch": 0.04955125391612308, + "grad_norm": 0.16979080438613892, + "learning_rate": 0.00011916718534149044, + "loss": 1.7152320861816406, + "step": 16370 + }, + { + "epoch": 0.04958152346646891, + "grad_norm": 0.1819756031036377, + "learning_rate": 0.00011916339015985306, + "loss": 1.7219911575317384, + "step": 16380 + }, + { + "epoch": 0.04961179301681474, + "grad_norm": 0.16787917912006378, + "learning_rate": 0.00011915959497821566, + "loss": 1.6896846771240235, + "step": 16390 + }, + { + "epoch": 0.04964206256716056, + "grad_norm": 0.1757906824350357, + "learning_rate": 0.00011915579979657827, + "loss": 1.7566661834716797, + "step": 16400 + }, + { + "epoch": 0.04967233211750639, + "grad_norm": 0.18071249127388, + "learning_rate": 0.00011915200461494088, + "loss": 1.6780879974365235, + "step": 16410 + }, + { + "epoch": 0.049702601667852224, + "grad_norm": 0.1541723757982254, + "learning_rate": 0.00011914820943330348, + "loss": 1.700411605834961, + "step": 16420 + }, + { + "epoch": 0.049732871218198055, + "grad_norm": 0.16862037777900696, + "learning_rate": 0.00011914441425166609, + "loss": 1.6805679321289062, + "step": 16430 + }, + { + "epoch": 0.049763140768543886, + "grad_norm": 0.1529170721769333, + "learning_rate": 0.0001191406190700287, + "loss": 1.7712574005126953, + "step": 16440 + }, + { + "epoch": 0.04979341031888971, + "grad_norm": 0.14528730511665344, + "learning_rate": 0.0001191368238883913, + "loss": 1.7398836135864257, + "step": 16450 + }, + { + "epoch": 0.04982367986923554, + "grad_norm": 0.1981564313173294, + "learning_rate": 0.0001191330287067539, + "loss": 1.7461172103881837, + "step": 16460 + }, + { + "epoch": 0.04985394941958137, + "grad_norm": 0.16945981979370117, + "learning_rate": 0.00011912923352511651, + "loss": 1.7180513381958007, + "step": 16470 + }, + { + "epoch": 0.0498842189699272, + "grad_norm": 0.15805353224277496, + "learning_rate": 0.00011912543834347912, + "loss": 1.7525352478027343, + "step": 16480 + }, + { + "epoch": 0.04991448852027303, + "grad_norm": 0.17952023446559906, + "learning_rate": 0.00011912164316184172, + "loss": 1.7508859634399414, + "step": 16490 + }, + { + "epoch": 0.049944758070618864, + "grad_norm": 0.17755497992038727, + "learning_rate": 0.00011911784798020433, + "loss": 1.7174615859985352, + "step": 16500 + }, + { + "epoch": 0.049944758070618864, + "eval_loss": 1.735124945640564, + "eval_runtime": 28.106, + "eval_samples_per_second": 17.79, + "eval_steps_per_second": 1.139, + "step": 16500 + }, + { + "epoch": 0.04997502762096469, + "grad_norm": 0.1985631287097931, + "learning_rate": 0.00011911405279856694, + "loss": 1.7567533493041991, + "step": 16510 + }, + { + "epoch": 0.05000529717131052, + "grad_norm": 0.1643555760383606, + "learning_rate": 0.00011911025761692956, + "loss": 1.736489486694336, + "step": 16520 + }, + { + "epoch": 0.05003556672165635, + "grad_norm": 0.16754597425460815, + "learning_rate": 0.00011910646243529216, + "loss": 1.7177066802978516, + "step": 16530 + }, + { + "epoch": 0.05006583627200218, + "grad_norm": 0.1808469444513321, + "learning_rate": 0.00011910266725365477, + "loss": 1.7248617172241212, + "step": 16540 + }, + { + "epoch": 0.05009610582234801, + "grad_norm": 0.16134992241859436, + "learning_rate": 0.00011909887207201737, + "loss": 1.7432590484619142, + "step": 16550 + }, + { + "epoch": 0.05012637537269384, + "grad_norm": 0.15555547177791595, + "learning_rate": 0.00011909507689037998, + "loss": 1.6990116119384766, + "step": 16560 + }, + { + "epoch": 0.050156644923039666, + "grad_norm": 0.2878791093826294, + "learning_rate": 0.00011909128170874258, + "loss": 1.7486038208007812, + "step": 16570 + }, + { + "epoch": 0.0501869144733855, + "grad_norm": 0.1975313425064087, + "learning_rate": 0.00011908748652710519, + "loss": 1.726253128051758, + "step": 16580 + }, + { + "epoch": 0.05021718402373133, + "grad_norm": 0.1996987909078598, + "learning_rate": 0.0001190836913454678, + "loss": 1.7399641036987306, + "step": 16590 + }, + { + "epoch": 0.05024745357407716, + "grad_norm": 0.20904164016246796, + "learning_rate": 0.0001190798961638304, + "loss": 1.7037397384643556, + "step": 16600 + }, + { + "epoch": 0.05027772312442299, + "grad_norm": 0.19287380576133728, + "learning_rate": 0.00011907610098219301, + "loss": 1.71832275390625, + "step": 16610 + }, + { + "epoch": 0.05030799267476881, + "grad_norm": 0.18895766139030457, + "learning_rate": 0.00011907230580055561, + "loss": 1.7449869155883788, + "step": 16620 + }, + { + "epoch": 0.050338262225114644, + "grad_norm": 0.18680831789970398, + "learning_rate": 0.00011906851061891822, + "loss": 1.7518253326416016, + "step": 16630 + }, + { + "epoch": 0.050368531775460475, + "grad_norm": 0.16286009550094604, + "learning_rate": 0.00011906471543728083, + "loss": 1.7208602905273438, + "step": 16640 + }, + { + "epoch": 0.050398801325806306, + "grad_norm": 0.18914036452770233, + "learning_rate": 0.00011906092025564345, + "loss": 1.7786588668823242, + "step": 16650 + }, + { + "epoch": 0.050429070876152136, + "grad_norm": 0.16523197293281555, + "learning_rate": 0.00011905712507400604, + "loss": 1.7371498107910157, + "step": 16660 + }, + { + "epoch": 0.05045934042649797, + "grad_norm": 0.16697075963020325, + "learning_rate": 0.00011905332989236866, + "loss": 1.707395362854004, + "step": 16670 + }, + { + "epoch": 0.05048960997684379, + "grad_norm": 0.15311063826084137, + "learning_rate": 0.00011904953471073125, + "loss": 1.7175060272216798, + "step": 16680 + }, + { + "epoch": 0.05051987952718962, + "grad_norm": 0.19707892835140228, + "learning_rate": 0.00011904573952909387, + "loss": 1.6927181243896485, + "step": 16690 + }, + { + "epoch": 0.05055014907753545, + "grad_norm": 0.17256151139736176, + "learning_rate": 0.00011904194434745646, + "loss": 1.7031524658203125, + "step": 16700 + }, + { + "epoch": 0.050580418627881284, + "grad_norm": 0.18130135536193848, + "learning_rate": 0.00011903814916581908, + "loss": 1.7239360809326172, + "step": 16710 + }, + { + "epoch": 0.050610688178227115, + "grad_norm": 0.17620371282100677, + "learning_rate": 0.00011903435398418167, + "loss": 1.7267545700073241, + "step": 16720 + }, + { + "epoch": 0.050640957728572945, + "grad_norm": 0.1731528341770172, + "learning_rate": 0.0001190305588025443, + "loss": 1.6734846115112305, + "step": 16730 + }, + { + "epoch": 0.05067122727891877, + "grad_norm": 0.16923145949840546, + "learning_rate": 0.0001190267636209069, + "loss": 1.732510757446289, + "step": 16740 + }, + { + "epoch": 0.0507014968292646, + "grad_norm": 0.1720634251832962, + "learning_rate": 0.0001190229684392695, + "loss": 1.6829607009887695, + "step": 16750 + }, + { + "epoch": 0.05073176637961043, + "grad_norm": 0.16830705106258392, + "learning_rate": 0.00011901917325763212, + "loss": 1.6978836059570312, + "step": 16760 + }, + { + "epoch": 0.05076203592995626, + "grad_norm": 0.1683117002248764, + "learning_rate": 0.00011901537807599472, + "loss": 1.7196144104003905, + "step": 16770 + }, + { + "epoch": 0.05079230548030209, + "grad_norm": 0.17177540063858032, + "learning_rate": 0.00011901158289435734, + "loss": 1.7132791519165038, + "step": 16780 + }, + { + "epoch": 0.05082257503064792, + "grad_norm": 0.15539640188217163, + "learning_rate": 0.00011900778771271993, + "loss": 1.742816162109375, + "step": 16790 + }, + { + "epoch": 0.05085284458099375, + "grad_norm": 0.19190916419029236, + "learning_rate": 0.00011900399253108255, + "loss": 1.7512638092041015, + "step": 16800 + }, + { + "epoch": 0.05088311413133958, + "grad_norm": 0.16022787988185883, + "learning_rate": 0.00011900019734944514, + "loss": 1.7293987274169922, + "step": 16810 + }, + { + "epoch": 0.05091338368168541, + "grad_norm": 0.1584477722644806, + "learning_rate": 0.00011899640216780776, + "loss": 1.7168041229248048, + "step": 16820 + }, + { + "epoch": 0.05094365323203124, + "grad_norm": 0.1548263430595398, + "learning_rate": 0.00011899260698617035, + "loss": 1.6995552062988282, + "step": 16830 + }, + { + "epoch": 0.05097392278237707, + "grad_norm": 0.1773291826248169, + "learning_rate": 0.00011898881180453297, + "loss": 1.7680046081542968, + "step": 16840 + }, + { + "epoch": 0.051004192332722895, + "grad_norm": 0.19875361025333405, + "learning_rate": 0.00011898501662289556, + "loss": 1.718918800354004, + "step": 16850 + }, + { + "epoch": 0.051034461883068725, + "grad_norm": 0.19024911522865295, + "learning_rate": 0.00011898122144125818, + "loss": 1.708819580078125, + "step": 16860 + }, + { + "epoch": 0.051064731433414556, + "grad_norm": 0.1763983815908432, + "learning_rate": 0.00011897742625962078, + "loss": 1.7472383499145507, + "step": 16870 + }, + { + "epoch": 0.05109500098376039, + "grad_norm": 0.1823931783437729, + "learning_rate": 0.0001189736310779834, + "loss": 1.716090774536133, + "step": 16880 + }, + { + "epoch": 0.05112527053410622, + "grad_norm": 0.16771458089351654, + "learning_rate": 0.000118969835896346, + "loss": 1.7147668838500976, + "step": 16890 + }, + { + "epoch": 0.05115554008445205, + "grad_norm": 0.19502434134483337, + "learning_rate": 0.00011896604071470861, + "loss": 1.729910659790039, + "step": 16900 + }, + { + "epoch": 0.05118580963479787, + "grad_norm": 0.18283896148204803, + "learning_rate": 0.00011896224553307121, + "loss": 1.661776351928711, + "step": 16910 + }, + { + "epoch": 0.051216079185143704, + "grad_norm": 0.18872015178203583, + "learning_rate": 0.00011895845035143382, + "loss": 1.7380760192871094, + "step": 16920 + }, + { + "epoch": 0.051246348735489534, + "grad_norm": 0.16858915984630585, + "learning_rate": 0.00011895465516979644, + "loss": 1.7446956634521484, + "step": 16930 + }, + { + "epoch": 0.051276618285835365, + "grad_norm": 0.18270079791545868, + "learning_rate": 0.00011895085998815903, + "loss": 1.6449872970581054, + "step": 16940 + }, + { + "epoch": 0.051306887836181196, + "grad_norm": 0.20611582696437836, + "learning_rate": 0.00011894706480652165, + "loss": 1.7228124618530274, + "step": 16950 + }, + { + "epoch": 0.05133715738652702, + "grad_norm": 0.16759754717350006, + "learning_rate": 0.00011894326962488424, + "loss": 1.71424560546875, + "step": 16960 + }, + { + "epoch": 0.05136742693687285, + "grad_norm": 0.19491317868232727, + "learning_rate": 0.00011893947444324686, + "loss": 1.7305181503295899, + "step": 16970 + }, + { + "epoch": 0.05139769648721868, + "grad_norm": 0.17303712666034698, + "learning_rate": 0.00011893567926160946, + "loss": 1.7423318862915038, + "step": 16980 + }, + { + "epoch": 0.05142796603756451, + "grad_norm": 0.17139914631843567, + "learning_rate": 0.00011893188407997208, + "loss": 1.709345245361328, + "step": 16990 + }, + { + "epoch": 0.05145823558791034, + "grad_norm": 0.19065245985984802, + "learning_rate": 0.00011892808889833467, + "loss": 1.7232089996337892, + "step": 17000 + }, + { + "epoch": 0.05145823558791034, + "eval_loss": 1.7220007181167603, + "eval_runtime": 28.1485, + "eval_samples_per_second": 17.763, + "eval_steps_per_second": 1.137, + "step": 17000 + }, + { + "epoch": 0.051488505138256174, + "grad_norm": 0.16629239916801453, + "learning_rate": 0.00011892429371669729, + "loss": 1.7028308868408204, + "step": 17010 + }, + { + "epoch": 0.051518774688602, + "grad_norm": 0.17994838953018188, + "learning_rate": 0.0001189204985350599, + "loss": 1.674953079223633, + "step": 17020 + }, + { + "epoch": 0.05154904423894783, + "grad_norm": 0.1509326696395874, + "learning_rate": 0.0001189167033534225, + "loss": 1.7452194213867187, + "step": 17030 + }, + { + "epoch": 0.05157931378929366, + "grad_norm": 0.16745159029960632, + "learning_rate": 0.0001189129081717851, + "loss": 1.733530616760254, + "step": 17040 + }, + { + "epoch": 0.05160958333963949, + "grad_norm": 0.17031818628311157, + "learning_rate": 0.00011890911299014771, + "loss": 1.680084800720215, + "step": 17050 + }, + { + "epoch": 0.05163985288998532, + "grad_norm": 0.1718049943447113, + "learning_rate": 0.00011890531780851032, + "loss": 1.6534431457519532, + "step": 17060 + }, + { + "epoch": 0.051670122440331145, + "grad_norm": 0.17866767942905426, + "learning_rate": 0.00011890152262687292, + "loss": 1.7327203750610352, + "step": 17070 + }, + { + "epoch": 0.051700391990676976, + "grad_norm": 0.17799270153045654, + "learning_rate": 0.00011889772744523553, + "loss": 1.7272045135498046, + "step": 17080 + }, + { + "epoch": 0.05173066154102281, + "grad_norm": 0.1703091561794281, + "learning_rate": 0.00011889393226359813, + "loss": 1.7156143188476562, + "step": 17090 + }, + { + "epoch": 0.05176093109136864, + "grad_norm": 0.15746146440505981, + "learning_rate": 0.00011889013708196074, + "loss": 1.7362258911132813, + "step": 17100 + }, + { + "epoch": 0.05179120064171447, + "grad_norm": 0.1635754406452179, + "learning_rate": 0.00011888634190032335, + "loss": 1.7203432083129884, + "step": 17110 + }, + { + "epoch": 0.0518214701920603, + "grad_norm": 0.19353064894676208, + "learning_rate": 0.00011888254671868595, + "loss": 1.721224594116211, + "step": 17120 + }, + { + "epoch": 0.051851739742406124, + "grad_norm": 0.1698865294456482, + "learning_rate": 0.00011887875153704857, + "loss": 1.7270938873291015, + "step": 17130 + }, + { + "epoch": 0.051882009292751954, + "grad_norm": 0.19177772104740143, + "learning_rate": 0.00011887495635541118, + "loss": 1.6956518173217774, + "step": 17140 + }, + { + "epoch": 0.051912278843097785, + "grad_norm": 0.21245677769184113, + "learning_rate": 0.00011887116117377378, + "loss": 1.7455772399902343, + "step": 17150 + }, + { + "epoch": 0.051942548393443616, + "grad_norm": 0.17897386848926544, + "learning_rate": 0.00011886736599213639, + "loss": 1.7176559448242188, + "step": 17160 + }, + { + "epoch": 0.05197281794378945, + "grad_norm": 0.14797350764274597, + "learning_rate": 0.000118863570810499, + "loss": 1.759640884399414, + "step": 17170 + }, + { + "epoch": 0.05200308749413528, + "grad_norm": 0.16526322066783905, + "learning_rate": 0.0001188597756288616, + "loss": 1.7001071929931642, + "step": 17180 + }, + { + "epoch": 0.0520333570444811, + "grad_norm": 0.18197838962078094, + "learning_rate": 0.00011885598044722421, + "loss": 1.699071502685547, + "step": 17190 + }, + { + "epoch": 0.05206362659482693, + "grad_norm": 0.168325275182724, + "learning_rate": 0.00011885218526558681, + "loss": 1.7369117736816406, + "step": 17200 + }, + { + "epoch": 0.05209389614517276, + "grad_norm": 0.18356464803218842, + "learning_rate": 0.00011884839008394942, + "loss": 1.693606185913086, + "step": 17210 + }, + { + "epoch": 0.052124165695518594, + "grad_norm": 0.17109763622283936, + "learning_rate": 0.00011884459490231203, + "loss": 1.716812515258789, + "step": 17220 + }, + { + "epoch": 0.052154435245864425, + "grad_norm": 0.15965187549591064, + "learning_rate": 0.00011884079972067463, + "loss": 1.7170570373535157, + "step": 17230 + }, + { + "epoch": 0.05218470479621025, + "grad_norm": 0.22428251802921295, + "learning_rate": 0.00011883700453903724, + "loss": 1.7223735809326173, + "step": 17240 + }, + { + "epoch": 0.05221497434655608, + "grad_norm": 0.15601620078086853, + "learning_rate": 0.00011883320935739984, + "loss": 1.7143892288208007, + "step": 17250 + }, + { + "epoch": 0.05224524389690191, + "grad_norm": 0.16572533547878265, + "learning_rate": 0.00011882941417576246, + "loss": 1.7191383361816406, + "step": 17260 + }, + { + "epoch": 0.05227551344724774, + "grad_norm": 0.17451101541519165, + "learning_rate": 0.00011882561899412506, + "loss": 1.6930442810058595, + "step": 17270 + }, + { + "epoch": 0.05230578299759357, + "grad_norm": 0.16764889657497406, + "learning_rate": 0.00011882182381248768, + "loss": 1.755707359313965, + "step": 17280 + }, + { + "epoch": 0.0523360525479394, + "grad_norm": 0.19303417205810547, + "learning_rate": 0.00011881802863085027, + "loss": 1.7090816497802734, + "step": 17290 + }, + { + "epoch": 0.05236632209828523, + "grad_norm": 0.17880015075206757, + "learning_rate": 0.00011881423344921289, + "loss": 1.731581687927246, + "step": 17300 + }, + { + "epoch": 0.05239659164863106, + "grad_norm": 0.1920088827610016, + "learning_rate": 0.00011881043826757548, + "loss": 1.6926471710205078, + "step": 17310 + }, + { + "epoch": 0.05242686119897689, + "grad_norm": 0.20902664959430695, + "learning_rate": 0.0001188066430859381, + "loss": 1.7041753768920898, + "step": 17320 + }, + { + "epoch": 0.05245713074932272, + "grad_norm": 0.18358737230300903, + "learning_rate": 0.00011880284790430069, + "loss": 1.691744613647461, + "step": 17330 + }, + { + "epoch": 0.05248740029966855, + "grad_norm": 0.17275644838809967, + "learning_rate": 0.00011879905272266331, + "loss": 1.7281448364257812, + "step": 17340 + }, + { + "epoch": 0.05251766985001438, + "grad_norm": 0.1771198809146881, + "learning_rate": 0.00011879525754102592, + "loss": 1.6870073318481444, + "step": 17350 + }, + { + "epoch": 0.052547939400360205, + "grad_norm": 0.17005228996276855, + "learning_rate": 0.00011879146235938852, + "loss": 1.6948966979980469, + "step": 17360 + }, + { + "epoch": 0.052578208950706036, + "grad_norm": 0.16708669066429138, + "learning_rate": 0.00011878766717775113, + "loss": 1.7012920379638672, + "step": 17370 + }, + { + "epoch": 0.05260847850105187, + "grad_norm": 0.1704285740852356, + "learning_rate": 0.00011878387199611373, + "loss": 1.6812847137451172, + "step": 17380 + }, + { + "epoch": 0.0526387480513977, + "grad_norm": 0.19565753638744354, + "learning_rate": 0.00011878007681447635, + "loss": 1.7350944519042968, + "step": 17390 + }, + { + "epoch": 0.05266901760174353, + "grad_norm": 0.18498224020004272, + "learning_rate": 0.00011877628163283895, + "loss": 1.735702896118164, + "step": 17400 + }, + { + "epoch": 0.05269928715208935, + "grad_norm": 0.18893390893936157, + "learning_rate": 0.00011877248645120157, + "loss": 1.7224651336669923, + "step": 17410 + }, + { + "epoch": 0.05272955670243518, + "grad_norm": 0.17734310030937195, + "learning_rate": 0.00011876869126956416, + "loss": 1.7382015228271483, + "step": 17420 + }, + { + "epoch": 0.052759826252781014, + "grad_norm": 0.1681891530752182, + "learning_rate": 0.00011876489608792678, + "loss": 1.70477352142334, + "step": 17430 + }, + { + "epoch": 0.052790095803126845, + "grad_norm": 0.18524594604969025, + "learning_rate": 0.00011876110090628937, + "loss": 1.7636474609375, + "step": 17440 + }, + { + "epoch": 0.052820365353472676, + "grad_norm": 0.15740719437599182, + "learning_rate": 0.00011875730572465199, + "loss": 1.731240463256836, + "step": 17450 + }, + { + "epoch": 0.05285063490381851, + "grad_norm": 0.1599298119544983, + "learning_rate": 0.00011875351054301458, + "loss": 1.7368595123291015, + "step": 17460 + }, + { + "epoch": 0.05288090445416433, + "grad_norm": 0.19253318011760712, + "learning_rate": 0.0001187497153613772, + "loss": 1.683443832397461, + "step": 17470 + }, + { + "epoch": 0.05291117400451016, + "grad_norm": 0.15487051010131836, + "learning_rate": 0.0001187459201797398, + "loss": 1.7494192123413086, + "step": 17480 + }, + { + "epoch": 0.05294144355485599, + "grad_norm": 0.1675560623407364, + "learning_rate": 0.00011874212499810241, + "loss": 1.7184303283691407, + "step": 17490 + }, + { + "epoch": 0.05297171310520182, + "grad_norm": 0.16470250487327576, + "learning_rate": 0.00011873832981646502, + "loss": 1.7011581420898438, + "step": 17500 + }, + { + "epoch": 0.05297171310520182, + "eval_loss": 1.717761754989624, + "eval_runtime": 28.0484, + "eval_samples_per_second": 17.826, + "eval_steps_per_second": 1.141, + "step": 17500 + }, + { + "epoch": 0.053001982655547654, + "grad_norm": 0.1640944927930832, + "learning_rate": 0.00011873453463482763, + "loss": 1.7423002243041992, + "step": 17510 + }, + { + "epoch": 0.053032252205893485, + "grad_norm": 0.1673746556043625, + "learning_rate": 0.00011873073945319023, + "loss": 1.7073350906372071, + "step": 17520 + }, + { + "epoch": 0.05306252175623931, + "grad_norm": 0.17537522315979004, + "learning_rate": 0.00011872694427155284, + "loss": 1.7298364639282227, + "step": 17530 + }, + { + "epoch": 0.05309279130658514, + "grad_norm": 0.19327153265476227, + "learning_rate": 0.00011872314908991546, + "loss": 1.7501720428466796, + "step": 17540 + }, + { + "epoch": 0.05312306085693097, + "grad_norm": 0.19335602223873138, + "learning_rate": 0.00011871935390827805, + "loss": 1.7206817626953126, + "step": 17550 + }, + { + "epoch": 0.0531533304072768, + "grad_norm": 0.1880834996700287, + "learning_rate": 0.00011871555872664067, + "loss": 1.698410415649414, + "step": 17560 + }, + { + "epoch": 0.05318359995762263, + "grad_norm": 0.156456857919693, + "learning_rate": 0.00011871176354500326, + "loss": 1.7112577438354493, + "step": 17570 + }, + { + "epoch": 0.053213869507968456, + "grad_norm": 0.16999810934066772, + "learning_rate": 0.00011870796836336588, + "loss": 1.7241582870483398, + "step": 17580 + }, + { + "epoch": 0.05324413905831429, + "grad_norm": 0.22556251287460327, + "learning_rate": 0.00011870417318172847, + "loss": 1.7545835494995117, + "step": 17590 + }, + { + "epoch": 0.05327440860866012, + "grad_norm": 0.1914748102426529, + "learning_rate": 0.00011870037800009109, + "loss": 1.707880401611328, + "step": 17600 + }, + { + "epoch": 0.05330467815900595, + "grad_norm": 0.16295398771762848, + "learning_rate": 0.00011869658281845368, + "loss": 1.7550159454345704, + "step": 17610 + }, + { + "epoch": 0.05333494770935178, + "grad_norm": 0.16535663604736328, + "learning_rate": 0.0001186927876368163, + "loss": 1.6923171997070312, + "step": 17620 + }, + { + "epoch": 0.05336521725969761, + "grad_norm": 0.15947479009628296, + "learning_rate": 0.00011868899245517891, + "loss": 1.6887557983398438, + "step": 17630 + }, + { + "epoch": 0.053395486810043434, + "grad_norm": 0.15788306295871735, + "learning_rate": 0.00011868519727354152, + "loss": 1.7337852478027345, + "step": 17640 + }, + { + "epoch": 0.053425756360389265, + "grad_norm": 0.1589495837688446, + "learning_rate": 0.00011868140209190412, + "loss": 1.6989797592163085, + "step": 17650 + }, + { + "epoch": 0.053456025910735096, + "grad_norm": 0.1662444770336151, + "learning_rate": 0.00011867760691026673, + "loss": 1.7443578720092774, + "step": 17660 + }, + { + "epoch": 0.05348629546108093, + "grad_norm": 0.15273869037628174, + "learning_rate": 0.00011867381172862933, + "loss": 1.7065238952636719, + "step": 17670 + }, + { + "epoch": 0.05351656501142676, + "grad_norm": 0.16469699144363403, + "learning_rate": 0.00011867001654699194, + "loss": 1.7115278244018555, + "step": 17680 + }, + { + "epoch": 0.05354683456177259, + "grad_norm": 0.17613257467746735, + "learning_rate": 0.00011866622136535455, + "loss": 1.7089502334594726, + "step": 17690 + }, + { + "epoch": 0.05357710411211841, + "grad_norm": 0.17214155197143555, + "learning_rate": 0.00011866242618371715, + "loss": 1.7425983428955079, + "step": 17700 + }, + { + "epoch": 0.05360737366246424, + "grad_norm": 0.15218783915042877, + "learning_rate": 0.00011865863100207976, + "loss": 1.7402441024780273, + "step": 17710 + }, + { + "epoch": 0.053637643212810074, + "grad_norm": 0.16213102638721466, + "learning_rate": 0.00011865483582044236, + "loss": 1.7148540496826172, + "step": 17720 + }, + { + "epoch": 0.053667912763155905, + "grad_norm": 0.16771768033504486, + "learning_rate": 0.00011865104063880497, + "loss": 1.6916000366210937, + "step": 17730 + }, + { + "epoch": 0.053698182313501736, + "grad_norm": 0.17137452960014343, + "learning_rate": 0.00011864724545716758, + "loss": 1.7338947296142577, + "step": 17740 + }, + { + "epoch": 0.05372845186384756, + "grad_norm": 0.15130595862865448, + "learning_rate": 0.0001186434502755302, + "loss": 1.7155750274658204, + "step": 17750 + }, + { + "epoch": 0.05375872141419339, + "grad_norm": 0.17242597043514252, + "learning_rate": 0.0001186396550938928, + "loss": 1.7268396377563477, + "step": 17760 + }, + { + "epoch": 0.05378899096453922, + "grad_norm": 0.1555400788784027, + "learning_rate": 0.00011863585991225541, + "loss": 1.7395702362060548, + "step": 17770 + }, + { + "epoch": 0.05381926051488505, + "grad_norm": 0.16710256040096283, + "learning_rate": 0.00011863206473061801, + "loss": 1.6958122253417969, + "step": 17780 + }, + { + "epoch": 0.05384953006523088, + "grad_norm": 0.17261405289173126, + "learning_rate": 0.00011862826954898062, + "loss": 1.7099098205566405, + "step": 17790 + }, + { + "epoch": 0.053879799615576714, + "grad_norm": 0.1724759042263031, + "learning_rate": 0.00011862447436734323, + "loss": 1.7215965270996094, + "step": 17800 + }, + { + "epoch": 0.05391006916592254, + "grad_norm": 0.1615283340215683, + "learning_rate": 0.00011862067918570583, + "loss": 1.7161880493164063, + "step": 17810 + }, + { + "epoch": 0.05394033871626837, + "grad_norm": 0.16257508099079132, + "learning_rate": 0.00011861688400406844, + "loss": 1.7208637237548827, + "step": 17820 + }, + { + "epoch": 0.0539706082666142, + "grad_norm": 0.15819591283798218, + "learning_rate": 0.00011861308882243104, + "loss": 1.7369115829467774, + "step": 17830 + }, + { + "epoch": 0.05400087781696003, + "grad_norm": 0.1773713380098343, + "learning_rate": 0.00011860929364079365, + "loss": 1.674615478515625, + "step": 17840 + }, + { + "epoch": 0.05403114736730586, + "grad_norm": 0.1716998666524887, + "learning_rate": 0.00011860549845915625, + "loss": 1.693109130859375, + "step": 17850 + }, + { + "epoch": 0.054061416917651685, + "grad_norm": 0.20151901245117188, + "learning_rate": 0.00011860170327751886, + "loss": 1.7034976959228516, + "step": 17860 + }, + { + "epoch": 0.054091686467997516, + "grad_norm": 0.15880697965621948, + "learning_rate": 0.00011859790809588148, + "loss": 1.688875389099121, + "step": 17870 + }, + { + "epoch": 0.054121956018343347, + "grad_norm": 0.16429829597473145, + "learning_rate": 0.00011859411291424407, + "loss": 1.7231287002563476, + "step": 17880 + }, + { + "epoch": 0.05415222556868918, + "grad_norm": 0.16082704067230225, + "learning_rate": 0.00011859031773260669, + "loss": 1.7575729370117188, + "step": 17890 + }, + { + "epoch": 0.05418249511903501, + "grad_norm": 0.17685098946094513, + "learning_rate": 0.00011858652255096928, + "loss": 1.740715217590332, + "step": 17900 + }, + { + "epoch": 0.05421276466938084, + "grad_norm": 0.17123505473136902, + "learning_rate": 0.0001185827273693319, + "loss": 1.7212139129638673, + "step": 17910 + }, + { + "epoch": 0.05424303421972666, + "grad_norm": 0.1752103865146637, + "learning_rate": 0.0001185789321876945, + "loss": 1.7214519500732421, + "step": 17920 + }, + { + "epoch": 0.054273303770072494, + "grad_norm": 0.18887951970100403, + "learning_rate": 0.00011857513700605712, + "loss": 1.733333206176758, + "step": 17930 + }, + { + "epoch": 0.054303573320418325, + "grad_norm": 0.15943852066993713, + "learning_rate": 0.00011857134182441971, + "loss": 1.7124958038330078, + "step": 17940 + }, + { + "epoch": 0.054333842870764155, + "grad_norm": 0.18076932430267334, + "learning_rate": 0.00011856754664278233, + "loss": 1.7052227020263673, + "step": 17950 + }, + { + "epoch": 0.054364112421109986, + "grad_norm": 0.16585804522037506, + "learning_rate": 0.00011856375146114493, + "loss": 1.7157739639282226, + "step": 17960 + }, + { + "epoch": 0.05439438197145582, + "grad_norm": 0.18913061916828156, + "learning_rate": 0.00011855995627950754, + "loss": 1.7536960601806642, + "step": 17970 + }, + { + "epoch": 0.05442465152180164, + "grad_norm": 0.17332178354263306, + "learning_rate": 0.00011855616109787015, + "loss": 1.7370344161987306, + "step": 17980 + }, + { + "epoch": 0.05445492107214747, + "grad_norm": 0.15123501420021057, + "learning_rate": 0.00011855236591623275, + "loss": 1.7009689331054687, + "step": 17990 + }, + { + "epoch": 0.0544851906224933, + "grad_norm": 0.16448725759983063, + "learning_rate": 0.00011854857073459537, + "loss": 1.718121337890625, + "step": 18000 + }, + { + "epoch": 0.0544851906224933, + "eval_loss": 1.7128061056137085, + "eval_runtime": 28.4497, + "eval_samples_per_second": 17.575, + "eval_steps_per_second": 1.125, + "step": 18000 + }, + { + "epoch": 0.054515460172839134, + "grad_norm": 0.1767583042383194, + "learning_rate": 0.00011854477555295796, + "loss": 1.6976871490478516, + "step": 18010 + }, + { + "epoch": 0.054545729723184964, + "grad_norm": 0.15006199479103088, + "learning_rate": 0.00011854098037132058, + "loss": 1.7201946258544922, + "step": 18020 + }, + { + "epoch": 0.05457599927353079, + "grad_norm": 0.18559657037258148, + "learning_rate": 0.00011853718518968318, + "loss": 1.6845083236694336, + "step": 18030 + }, + { + "epoch": 0.05460626882387662, + "grad_norm": 0.17183731496334076, + "learning_rate": 0.0001185333900080458, + "loss": 1.7181842803955079, + "step": 18040 + }, + { + "epoch": 0.05463653837422245, + "grad_norm": 0.15845949947834015, + "learning_rate": 0.00011852959482640839, + "loss": 1.699575424194336, + "step": 18050 + }, + { + "epoch": 0.05466680792456828, + "grad_norm": 0.1619371622800827, + "learning_rate": 0.00011852579964477101, + "loss": 1.6751937866210938, + "step": 18060 + }, + { + "epoch": 0.05469707747491411, + "grad_norm": 0.15146338939666748, + "learning_rate": 0.0001185220044631336, + "loss": 1.6971717834472657, + "step": 18070 + }, + { + "epoch": 0.05472734702525994, + "grad_norm": 0.16872692108154297, + "learning_rate": 0.00011851820928149622, + "loss": 1.687615203857422, + "step": 18080 + }, + { + "epoch": 0.054757616575605766, + "grad_norm": 0.18587526679039001, + "learning_rate": 0.00011851441409985881, + "loss": 1.6814918518066406, + "step": 18090 + }, + { + "epoch": 0.0547878861259516, + "grad_norm": 0.14953339099884033, + "learning_rate": 0.00011851061891822143, + "loss": 1.7196737289428712, + "step": 18100 + }, + { + "epoch": 0.05481815567629743, + "grad_norm": 0.1764412522315979, + "learning_rate": 0.00011850682373658402, + "loss": 1.7077993392944335, + "step": 18110 + }, + { + "epoch": 0.05484842522664326, + "grad_norm": 0.1690071076154709, + "learning_rate": 0.00011850302855494664, + "loss": 1.6983369827270507, + "step": 18120 + }, + { + "epoch": 0.05487869477698909, + "grad_norm": 0.1688513308763504, + "learning_rate": 0.00011849923337330925, + "loss": 1.7116867065429688, + "step": 18130 + }, + { + "epoch": 0.05490896432733492, + "grad_norm": 0.162718266248703, + "learning_rate": 0.00011849543819167185, + "loss": 1.7536136627197265, + "step": 18140 + }, + { + "epoch": 0.054939233877680745, + "grad_norm": 0.186998188495636, + "learning_rate": 0.00011849164301003447, + "loss": 1.667508888244629, + "step": 18150 + }, + { + "epoch": 0.054969503428026575, + "grad_norm": 0.1694294661283493, + "learning_rate": 0.00011848784782839707, + "loss": 1.7019142150878905, + "step": 18160 + }, + { + "epoch": 0.054999772978372406, + "grad_norm": 0.14566554129123688, + "learning_rate": 0.00011848405264675969, + "loss": 1.7068979263305664, + "step": 18170 + }, + { + "epoch": 0.05503004252871824, + "grad_norm": 0.16962096095085144, + "learning_rate": 0.00011848025746512228, + "loss": 1.7020513534545898, + "step": 18180 + }, + { + "epoch": 0.05506031207906407, + "grad_norm": 0.15376797318458557, + "learning_rate": 0.0001184764622834849, + "loss": 1.7022647857666016, + "step": 18190 + }, + { + "epoch": 0.05509058162940989, + "grad_norm": 0.1842319220304489, + "learning_rate": 0.00011847266710184749, + "loss": 1.745981216430664, + "step": 18200 + }, + { + "epoch": 0.05512085117975572, + "grad_norm": 0.16504554450511932, + "learning_rate": 0.00011846887192021011, + "loss": 1.7350894927978515, + "step": 18210 + }, + { + "epoch": 0.055151120730101554, + "grad_norm": 0.18133795261383057, + "learning_rate": 0.0001184650767385727, + "loss": 1.7071714401245117, + "step": 18220 + }, + { + "epoch": 0.055181390280447384, + "grad_norm": 0.17077046632766724, + "learning_rate": 0.00011846128155693532, + "loss": 1.7111291885375977, + "step": 18230 + }, + { + "epoch": 0.055211659830793215, + "grad_norm": 0.19037291407585144, + "learning_rate": 0.00011845748637529793, + "loss": 1.712059211730957, + "step": 18240 + }, + { + "epoch": 0.055241929381139046, + "grad_norm": 0.16409803926944733, + "learning_rate": 0.00011845369119366053, + "loss": 1.7013900756835938, + "step": 18250 + }, + { + "epoch": 0.05527219893148487, + "grad_norm": 0.17151348292827606, + "learning_rate": 0.00011844989601202314, + "loss": 1.6963037490844726, + "step": 18260 + }, + { + "epoch": 0.0553024684818307, + "grad_norm": 0.16806162893772125, + "learning_rate": 0.00011844610083038575, + "loss": 1.69761962890625, + "step": 18270 + }, + { + "epoch": 0.05533273803217653, + "grad_norm": 0.15450166165828705, + "learning_rate": 0.00011844230564874835, + "loss": 1.7094097137451172, + "step": 18280 + }, + { + "epoch": 0.05536300758252236, + "grad_norm": 0.18109667301177979, + "learning_rate": 0.00011843851046711096, + "loss": 1.6898239135742188, + "step": 18290 + }, + { + "epoch": 0.05539327713286819, + "grad_norm": 0.17154182493686676, + "learning_rate": 0.00011843471528547356, + "loss": 1.7208612442016602, + "step": 18300 + }, + { + "epoch": 0.055423546683214024, + "grad_norm": 0.16184425354003906, + "learning_rate": 0.00011843092010383617, + "loss": 1.7051719665527343, + "step": 18310 + }, + { + "epoch": 0.05545381623355985, + "grad_norm": 0.1482076346874237, + "learning_rate": 0.00011842712492219878, + "loss": 1.715884017944336, + "step": 18320 + }, + { + "epoch": 0.05548408578390568, + "grad_norm": 0.1502581536769867, + "learning_rate": 0.00011842332974056138, + "loss": 1.7203832626342774, + "step": 18330 + }, + { + "epoch": 0.05551435533425151, + "grad_norm": 0.16332347691059113, + "learning_rate": 0.00011841953455892399, + "loss": 1.7409793853759765, + "step": 18340 + }, + { + "epoch": 0.05554462488459734, + "grad_norm": 0.15879704058170319, + "learning_rate": 0.00011841573937728659, + "loss": 1.740823745727539, + "step": 18350 + }, + { + "epoch": 0.05557489443494317, + "grad_norm": 0.21798019111156464, + "learning_rate": 0.00011841194419564921, + "loss": 1.665836715698242, + "step": 18360 + }, + { + "epoch": 0.055605163985288995, + "grad_norm": 0.1688062995672226, + "learning_rate": 0.00011840814901401182, + "loss": 1.714057159423828, + "step": 18370 + }, + { + "epoch": 0.055635433535634826, + "grad_norm": 0.1638898253440857, + "learning_rate": 0.00011840435383237442, + "loss": 1.730148696899414, + "step": 18380 + }, + { + "epoch": 0.05566570308598066, + "grad_norm": 0.1628865897655487, + "learning_rate": 0.00011840055865073703, + "loss": 1.7168298721313477, + "step": 18390 + }, + { + "epoch": 0.05569597263632649, + "grad_norm": 0.16965481638908386, + "learning_rate": 0.00011839676346909964, + "loss": 1.6754878997802733, + "step": 18400 + }, + { + "epoch": 0.05572624218667232, + "grad_norm": 0.16546785831451416, + "learning_rate": 0.00011839296828746224, + "loss": 1.713285255432129, + "step": 18410 + }, + { + "epoch": 0.05575651173701815, + "grad_norm": 0.1622893065214157, + "learning_rate": 0.00011838917310582485, + "loss": 1.7077503204345703, + "step": 18420 + }, + { + "epoch": 0.05578678128736397, + "grad_norm": 0.18166236579418182, + "learning_rate": 0.00011838537792418745, + "loss": 1.6741004943847657, + "step": 18430 + }, + { + "epoch": 0.055817050837709804, + "grad_norm": 0.1781119704246521, + "learning_rate": 0.00011838158274255006, + "loss": 1.7291824340820312, + "step": 18440 + }, + { + "epoch": 0.055847320388055635, + "grad_norm": 0.1822894960641861, + "learning_rate": 0.00011837778756091267, + "loss": 1.7240005493164063, + "step": 18450 + }, + { + "epoch": 0.055877589938401466, + "grad_norm": 0.17963695526123047, + "learning_rate": 0.00011837399237927527, + "loss": 1.7505805969238282, + "step": 18460 + }, + { + "epoch": 0.0559078594887473, + "grad_norm": 0.1838042140007019, + "learning_rate": 0.00011837019719763788, + "loss": 1.7280990600585937, + "step": 18470 + }, + { + "epoch": 0.05593812903909313, + "grad_norm": 0.20207734405994415, + "learning_rate": 0.00011836640201600048, + "loss": 1.7187419891357423, + "step": 18480 + }, + { + "epoch": 0.05596839858943895, + "grad_norm": 0.16083891689777374, + "learning_rate": 0.00011836260683436309, + "loss": 1.7298393249511719, + "step": 18490 + }, + { + "epoch": 0.05599866813978478, + "grad_norm": 0.1549706608057022, + "learning_rate": 0.00011835881165272571, + "loss": 1.7110776901245117, + "step": 18500 + }, + { + "epoch": 0.05599866813978478, + "eval_loss": 1.7174581289291382, + "eval_runtime": 27.9594, + "eval_samples_per_second": 17.883, + "eval_steps_per_second": 1.145, + "step": 18500 + }, + { + "epoch": 0.05602893769013061, + "grad_norm": 0.16172592341899872, + "learning_rate": 0.0001183550164710883, + "loss": 1.687795639038086, + "step": 18510 + }, + { + "epoch": 0.056059207240476444, + "grad_norm": 0.17214623093605042, + "learning_rate": 0.00011835122128945092, + "loss": 1.6654350280761718, + "step": 18520 + }, + { + "epoch": 0.056089476790822275, + "grad_norm": 0.15300752222537994, + "learning_rate": 0.00011834742610781351, + "loss": 1.7267776489257813, + "step": 18530 + }, + { + "epoch": 0.0561197463411681, + "grad_norm": 0.15747903287410736, + "learning_rate": 0.00011834363092617613, + "loss": 1.6949302673339843, + "step": 18540 + }, + { + "epoch": 0.05615001589151393, + "grad_norm": 0.18524180352687836, + "learning_rate": 0.00011833983574453874, + "loss": 1.7107122421264649, + "step": 18550 + }, + { + "epoch": 0.05618028544185976, + "grad_norm": 0.18755024671554565, + "learning_rate": 0.00011833604056290135, + "loss": 1.701247787475586, + "step": 18560 + }, + { + "epoch": 0.05621055499220559, + "grad_norm": 0.1514388769865036, + "learning_rate": 0.00011833224538126395, + "loss": 1.6859764099121093, + "step": 18570 + }, + { + "epoch": 0.05624082454255142, + "grad_norm": 0.15597108006477356, + "learning_rate": 0.00011832845019962656, + "loss": 1.6943265914916992, + "step": 18580 + }, + { + "epoch": 0.05627109409289725, + "grad_norm": 0.15828640758991241, + "learning_rate": 0.00011832465501798916, + "loss": 1.7166688919067383, + "step": 18590 + }, + { + "epoch": 0.05630136364324308, + "grad_norm": 0.17077887058258057, + "learning_rate": 0.00011832085983635177, + "loss": 1.725320053100586, + "step": 18600 + }, + { + "epoch": 0.05633163319358891, + "grad_norm": 0.18156754970550537, + "learning_rate": 0.00011831706465471439, + "loss": 1.725119400024414, + "step": 18610 + }, + { + "epoch": 0.05636190274393474, + "grad_norm": 0.17224858701229095, + "learning_rate": 0.00011831326947307698, + "loss": 1.6957769393920898, + "step": 18620 + }, + { + "epoch": 0.05639217229428057, + "grad_norm": 0.1786700189113617, + "learning_rate": 0.0001183094742914396, + "loss": 1.7138744354248048, + "step": 18630 + }, + { + "epoch": 0.0564224418446264, + "grad_norm": 0.20330683887004852, + "learning_rate": 0.00011830567910980219, + "loss": 1.7375722885131837, + "step": 18640 + }, + { + "epoch": 0.056452711394972224, + "grad_norm": 0.15905128419399261, + "learning_rate": 0.00011830188392816481, + "loss": 1.6934677124023438, + "step": 18650 + }, + { + "epoch": 0.056482980945318055, + "grad_norm": 0.16222067177295685, + "learning_rate": 0.0001182980887465274, + "loss": 1.7179157257080078, + "step": 18660 + }, + { + "epoch": 0.056513250495663886, + "grad_norm": 0.15760789811611176, + "learning_rate": 0.00011829429356489002, + "loss": 1.6882328033447265, + "step": 18670 + }, + { + "epoch": 0.05654352004600972, + "grad_norm": 0.18164104223251343, + "learning_rate": 0.00011829049838325262, + "loss": 1.696619415283203, + "step": 18680 + }, + { + "epoch": 0.05657378959635555, + "grad_norm": 0.14616315066814423, + "learning_rate": 0.00011828670320161524, + "loss": 1.707501220703125, + "step": 18690 + }, + { + "epoch": 0.05660405914670138, + "grad_norm": 0.16975265741348267, + "learning_rate": 0.00011828290801997783, + "loss": 1.7103252410888672, + "step": 18700 + }, + { + "epoch": 0.0566343286970472, + "grad_norm": 0.17131541669368744, + "learning_rate": 0.00011827911283834045, + "loss": 1.6804164886474608, + "step": 18710 + }, + { + "epoch": 0.05666459824739303, + "grad_norm": 0.15407070517539978, + "learning_rate": 0.00011827531765670304, + "loss": 1.7153501510620117, + "step": 18720 + }, + { + "epoch": 0.056694867797738864, + "grad_norm": 0.16536861658096313, + "learning_rate": 0.00011827152247506566, + "loss": 1.7188711166381836, + "step": 18730 + }, + { + "epoch": 0.056725137348084695, + "grad_norm": 0.16040299832820892, + "learning_rate": 0.00011826772729342827, + "loss": 1.7295866012573242, + "step": 18740 + }, + { + "epoch": 0.056755406898430526, + "grad_norm": 0.16016323864459991, + "learning_rate": 0.00011826393211179087, + "loss": 1.6996809005737306, + "step": 18750 + }, + { + "epoch": 0.05678567644877636, + "grad_norm": 0.15337280929088593, + "learning_rate": 0.00011826013693015349, + "loss": 1.7222610473632813, + "step": 18760 + }, + { + "epoch": 0.05681594599912218, + "grad_norm": 0.18798485398292542, + "learning_rate": 0.00011825634174851608, + "loss": 1.684912872314453, + "step": 18770 + }, + { + "epoch": 0.05684621554946801, + "grad_norm": 0.1447891891002655, + "learning_rate": 0.0001182525465668787, + "loss": 1.692509651184082, + "step": 18780 + }, + { + "epoch": 0.05687648509981384, + "grad_norm": 0.16713854670524597, + "learning_rate": 0.0001182487513852413, + "loss": 1.7473514556884766, + "step": 18790 + }, + { + "epoch": 0.05690675465015967, + "grad_norm": 0.17084376513957977, + "learning_rate": 0.00011824495620360391, + "loss": 1.695753288269043, + "step": 18800 + }, + { + "epoch": 0.056937024200505504, + "grad_norm": 0.17008459568023682, + "learning_rate": 0.00011824116102196651, + "loss": 1.6756931304931642, + "step": 18810 + }, + { + "epoch": 0.05696729375085133, + "grad_norm": 0.1666610985994339, + "learning_rate": 0.00011823736584032913, + "loss": 1.7447242736816406, + "step": 18820 + }, + { + "epoch": 0.05699756330119716, + "grad_norm": 0.15309078991413116, + "learning_rate": 0.00011823357065869172, + "loss": 1.7096368789672851, + "step": 18830 + }, + { + "epoch": 0.05702783285154299, + "grad_norm": 0.16023953258991241, + "learning_rate": 0.00011822977547705434, + "loss": 1.7040014266967773, + "step": 18840 + }, + { + "epoch": 0.05705810240188882, + "grad_norm": 0.19357483088970184, + "learning_rate": 0.00011822598029541693, + "loss": 1.7082864761352539, + "step": 18850 + }, + { + "epoch": 0.05708837195223465, + "grad_norm": 0.16685982048511505, + "learning_rate": 0.00011822218511377955, + "loss": 1.6974655151367188, + "step": 18860 + }, + { + "epoch": 0.05711864150258048, + "grad_norm": 0.1907780021429062, + "learning_rate": 0.00011821838993214216, + "loss": 1.6889471054077148, + "step": 18870 + }, + { + "epoch": 0.057148911052926306, + "grad_norm": 0.184255912899971, + "learning_rate": 0.00011821459475050476, + "loss": 1.7183082580566407, + "step": 18880 + }, + { + "epoch": 0.05717918060327214, + "grad_norm": 0.16369013488292694, + "learning_rate": 0.00011821079956886737, + "loss": 1.703334426879883, + "step": 18890 + }, + { + "epoch": 0.05720945015361797, + "grad_norm": 0.17437225580215454, + "learning_rate": 0.00011820700438722997, + "loss": 1.6866136550903321, + "step": 18900 + }, + { + "epoch": 0.0572397197039638, + "grad_norm": 0.15464138984680176, + "learning_rate": 0.00011820320920559258, + "loss": 1.7392515182495116, + "step": 18910 + }, + { + "epoch": 0.05726998925430963, + "grad_norm": 0.1598682850599289, + "learning_rate": 0.00011819941402395519, + "loss": 1.6979990005493164, + "step": 18920 + }, + { + "epoch": 0.05730025880465546, + "grad_norm": 0.17420658469200134, + "learning_rate": 0.00011819561884231779, + "loss": 1.7205854415893556, + "step": 18930 + }, + { + "epoch": 0.057330528355001284, + "grad_norm": 0.17845246195793152, + "learning_rate": 0.0001181918236606804, + "loss": 1.6896560668945313, + "step": 18940 + }, + { + "epoch": 0.057360797905347115, + "grad_norm": 0.17925885319709778, + "learning_rate": 0.000118188028479043, + "loss": 1.719039535522461, + "step": 18950 + }, + { + "epoch": 0.057391067455692946, + "grad_norm": 0.19784197211265564, + "learning_rate": 0.00011818423329740561, + "loss": 1.7207050323486328, + "step": 18960 + }, + { + "epoch": 0.057421337006038776, + "grad_norm": 0.17284134030342102, + "learning_rate": 0.00011818043811576823, + "loss": 1.6953100204467773, + "step": 18970 + }, + { + "epoch": 0.05745160655638461, + "grad_norm": 0.15487761795520782, + "learning_rate": 0.00011817664293413084, + "loss": 1.7529485702514649, + "step": 18980 + }, + { + "epoch": 0.05748187610673043, + "grad_norm": 0.15331965684890747, + "learning_rate": 0.00011817284775249344, + "loss": 1.6908329010009766, + "step": 18990 + }, + { + "epoch": 0.05751214565707626, + "grad_norm": 0.1827729493379593, + "learning_rate": 0.00011816905257085605, + "loss": 1.738922119140625, + "step": 19000 + }, + { + "epoch": 0.05751214565707626, + "eval_loss": 1.7011563777923584, + "eval_runtime": 28.0632, + "eval_samples_per_second": 17.817, + "eval_steps_per_second": 1.14, + "step": 19000 + }, + { + "epoch": 0.05754241520742209, + "grad_norm": 0.16580691933631897, + "learning_rate": 0.00011816525738921865, + "loss": 1.711932373046875, + "step": 19010 + }, + { + "epoch": 0.057572684757767924, + "grad_norm": 0.18310287594795227, + "learning_rate": 0.00011816146220758126, + "loss": 1.6679128646850585, + "step": 19020 + }, + { + "epoch": 0.057602954308113755, + "grad_norm": 0.1945880651473999, + "learning_rate": 0.00011815766702594387, + "loss": 1.7148859024047851, + "step": 19030 + }, + { + "epoch": 0.057633223858459585, + "grad_norm": 0.15005473792552948, + "learning_rate": 0.00011815387184430647, + "loss": 1.7378013610839844, + "step": 19040 + }, + { + "epoch": 0.05766349340880541, + "grad_norm": 0.19517970085144043, + "learning_rate": 0.00011815007666266908, + "loss": 1.7050796508789063, + "step": 19050 + }, + { + "epoch": 0.05769376295915124, + "grad_norm": 0.21217601001262665, + "learning_rate": 0.00011814628148103168, + "loss": 1.7267135620117187, + "step": 19060 + }, + { + "epoch": 0.05772403250949707, + "grad_norm": 0.15176968276500702, + "learning_rate": 0.00011814248629939429, + "loss": 1.7049224853515625, + "step": 19070 + }, + { + "epoch": 0.0577543020598429, + "grad_norm": 0.17161789536476135, + "learning_rate": 0.0001181386911177569, + "loss": 1.7338153839111328, + "step": 19080 + }, + { + "epoch": 0.05778457161018873, + "grad_norm": 0.1868915855884552, + "learning_rate": 0.0001181348959361195, + "loss": 1.6846237182617188, + "step": 19090 + }, + { + "epoch": 0.057814841160534564, + "grad_norm": 0.14851634204387665, + "learning_rate": 0.00011813110075448211, + "loss": 1.7107078552246093, + "step": 19100 + }, + { + "epoch": 0.05784511071088039, + "grad_norm": 0.13811218738555908, + "learning_rate": 0.00011812730557284473, + "loss": 1.6613077163696288, + "step": 19110 + }, + { + "epoch": 0.05787538026122622, + "grad_norm": 0.15824665129184723, + "learning_rate": 0.00011812351039120732, + "loss": 1.7361663818359374, + "step": 19120 + }, + { + "epoch": 0.05790564981157205, + "grad_norm": 0.16479167342185974, + "learning_rate": 0.00011811971520956994, + "loss": 1.7460865020751952, + "step": 19130 + }, + { + "epoch": 0.05793591936191788, + "grad_norm": 0.17621752619743347, + "learning_rate": 0.00011811592002793253, + "loss": 1.7626667022705078, + "step": 19140 + }, + { + "epoch": 0.05796618891226371, + "grad_norm": 0.15351805090904236, + "learning_rate": 0.00011811212484629515, + "loss": 1.7078119277954102, + "step": 19150 + }, + { + "epoch": 0.057996458462609535, + "grad_norm": 0.16767799854278564, + "learning_rate": 0.00011810832966465776, + "loss": 1.6964784622192384, + "step": 19160 + }, + { + "epoch": 0.058026728012955366, + "grad_norm": 0.14988820254802704, + "learning_rate": 0.00011810453448302036, + "loss": 1.7118810653686523, + "step": 19170 + }, + { + "epoch": 0.058056997563301196, + "grad_norm": 0.16155560314655304, + "learning_rate": 0.00011810073930138297, + "loss": 1.6908039093017577, + "step": 19180 + }, + { + "epoch": 0.05808726711364703, + "grad_norm": 0.16973929107189178, + "learning_rate": 0.00011809694411974557, + "loss": 1.7043041229248046, + "step": 19190 + }, + { + "epoch": 0.05811753666399286, + "grad_norm": 0.1708419919013977, + "learning_rate": 0.00011809314893810818, + "loss": 1.7127981185913086, + "step": 19200 + }, + { + "epoch": 0.05814780621433869, + "grad_norm": 0.18569554388523102, + "learning_rate": 0.00011808935375647079, + "loss": 1.6886026382446289, + "step": 19210 + }, + { + "epoch": 0.05817807576468451, + "grad_norm": 0.1878521889448166, + "learning_rate": 0.0001180855585748334, + "loss": 1.6685653686523438, + "step": 19220 + }, + { + "epoch": 0.058208345315030344, + "grad_norm": 0.16532208025455475, + "learning_rate": 0.000118081763393196, + "loss": 1.7467033386230468, + "step": 19230 + }, + { + "epoch": 0.058238614865376175, + "grad_norm": 0.16794852912425995, + "learning_rate": 0.00011807796821155862, + "loss": 1.6971450805664063, + "step": 19240 + }, + { + "epoch": 0.058268884415722005, + "grad_norm": 0.16279733180999756, + "learning_rate": 0.00011807417302992121, + "loss": 1.6791440963745117, + "step": 19250 + }, + { + "epoch": 0.058299153966067836, + "grad_norm": 0.1461840271949768, + "learning_rate": 0.00011807037784828383, + "loss": 1.7165946960449219, + "step": 19260 + }, + { + "epoch": 0.05832942351641367, + "grad_norm": 0.1529965102672577, + "learning_rate": 0.00011806658266664642, + "loss": 1.7287153244018554, + "step": 19270 + }, + { + "epoch": 0.05835969306675949, + "grad_norm": 0.1878318339586258, + "learning_rate": 0.00011806278748500904, + "loss": 1.6901206970214844, + "step": 19280 + }, + { + "epoch": 0.05838996261710532, + "grad_norm": 0.16892151534557343, + "learning_rate": 0.00011805899230337163, + "loss": 1.6883621215820312, + "step": 19290 + }, + { + "epoch": 0.05842023216745115, + "grad_norm": 0.1793341338634491, + "learning_rate": 0.00011805519712173425, + "loss": 1.6508085250854492, + "step": 19300 + }, + { + "epoch": 0.058450501717796984, + "grad_norm": 0.17240135371685028, + "learning_rate": 0.00011805140194009685, + "loss": 1.6888410568237304, + "step": 19310 + }, + { + "epoch": 0.058480771268142814, + "grad_norm": 0.18667224049568176, + "learning_rate": 0.00011804760675845947, + "loss": 1.7011138916015625, + "step": 19320 + }, + { + "epoch": 0.05851104081848864, + "grad_norm": 0.16824543476104736, + "learning_rate": 0.00011804381157682206, + "loss": 1.6606941223144531, + "step": 19330 + }, + { + "epoch": 0.05854131036883447, + "grad_norm": 0.15426242351531982, + "learning_rate": 0.00011804001639518468, + "loss": 1.7100292205810548, + "step": 19340 + }, + { + "epoch": 0.0585715799191803, + "grad_norm": 0.14623688161373138, + "learning_rate": 0.00011803622121354728, + "loss": 1.6969196319580078, + "step": 19350 + }, + { + "epoch": 0.05860184946952613, + "grad_norm": 0.16358646750450134, + "learning_rate": 0.00011803242603190989, + "loss": 1.6946746826171875, + "step": 19360 + }, + { + "epoch": 0.05863211901987196, + "grad_norm": 0.19344796240329742, + "learning_rate": 0.00011802863085027251, + "loss": 1.7280559539794922, + "step": 19370 + }, + { + "epoch": 0.05866238857021779, + "grad_norm": 0.1804323047399521, + "learning_rate": 0.0001180248356686351, + "loss": 1.7119186401367188, + "step": 19380 + }, + { + "epoch": 0.058692658120563616, + "grad_norm": 0.16825613379478455, + "learning_rate": 0.00011802104048699772, + "loss": 1.735745620727539, + "step": 19390 + }, + { + "epoch": 0.05872292767090945, + "grad_norm": 0.16535362601280212, + "learning_rate": 0.00011801724530536031, + "loss": 1.734512710571289, + "step": 19400 + }, + { + "epoch": 0.05875319722125528, + "grad_norm": 0.17493480443954468, + "learning_rate": 0.00011801345012372293, + "loss": 1.7267021179199218, + "step": 19410 + }, + { + "epoch": 0.05878346677160111, + "grad_norm": 0.15924552083015442, + "learning_rate": 0.00011800965494208552, + "loss": 1.6848323822021485, + "step": 19420 + }, + { + "epoch": 0.05881373632194694, + "grad_norm": 0.1669488549232483, + "learning_rate": 0.00011800585976044814, + "loss": 1.7137161254882813, + "step": 19430 + }, + { + "epoch": 0.058844005872292764, + "grad_norm": 0.19459345936775208, + "learning_rate": 0.00011800206457881074, + "loss": 1.7017101287841796, + "step": 19440 + }, + { + "epoch": 0.058874275422638594, + "grad_norm": 0.20157231390476227, + "learning_rate": 0.00011799826939717336, + "loss": 1.6924652099609374, + "step": 19450 + }, + { + "epoch": 0.058904544972984425, + "grad_norm": 0.16218312084674835, + "learning_rate": 0.00011799447421553595, + "loss": 1.724309539794922, + "step": 19460 + }, + { + "epoch": 0.058934814523330256, + "grad_norm": 0.1571989506483078, + "learning_rate": 0.00011799067903389857, + "loss": 1.7031661987304687, + "step": 19470 + }, + { + "epoch": 0.05896508407367609, + "grad_norm": 0.1781865358352661, + "learning_rate": 0.00011798688385226117, + "loss": 1.690042495727539, + "step": 19480 + }, + { + "epoch": 0.05899535362402192, + "grad_norm": 0.15674421191215515, + "learning_rate": 0.00011798308867062378, + "loss": 1.7189720153808594, + "step": 19490 + }, + { + "epoch": 0.05902562317436774, + "grad_norm": 0.18578432500362396, + "learning_rate": 0.00011797929348898639, + "loss": 1.6916324615478515, + "step": 19500 + }, + { + "epoch": 0.05902562317436774, + "eval_loss": 1.7100565433502197, + "eval_runtime": 28.1444, + "eval_samples_per_second": 17.766, + "eval_steps_per_second": 1.137, + "step": 19500 + }, + { + "epoch": 0.05905589272471357, + "grad_norm": 0.162877157330513, + "learning_rate": 0.00011797549830734899, + "loss": 1.7083593368530274, + "step": 19510 + }, + { + "epoch": 0.0590861622750594, + "grad_norm": 0.17814436554908752, + "learning_rate": 0.0001179717031257116, + "loss": 1.733251190185547, + "step": 19520 + }, + { + "epoch": 0.059116431825405234, + "grad_norm": 0.16682954132556915, + "learning_rate": 0.0001179679079440742, + "loss": 1.6940540313720702, + "step": 19530 + }, + { + "epoch": 0.059146701375751065, + "grad_norm": 0.18281960487365723, + "learning_rate": 0.00011796411276243681, + "loss": 1.6683359146118164, + "step": 19540 + }, + { + "epoch": 0.059176970926096896, + "grad_norm": 0.17594218254089355, + "learning_rate": 0.00011796031758079942, + "loss": 1.7578235626220704, + "step": 19550 + }, + { + "epoch": 0.05920724047644272, + "grad_norm": 0.15970981121063232, + "learning_rate": 0.00011795652239916202, + "loss": 1.6865541458129882, + "step": 19560 + }, + { + "epoch": 0.05923751002678855, + "grad_norm": 0.16737134754657745, + "learning_rate": 0.00011795272721752463, + "loss": 1.6737709045410156, + "step": 19570 + }, + { + "epoch": 0.05926777957713438, + "grad_norm": 0.16122788190841675, + "learning_rate": 0.00011794893203588725, + "loss": 1.6994342803955078, + "step": 19580 + }, + { + "epoch": 0.05929804912748021, + "grad_norm": 0.16562873125076294, + "learning_rate": 0.00011794513685424984, + "loss": 1.684098243713379, + "step": 19590 + }, + { + "epoch": 0.05932831867782604, + "grad_norm": 0.16285580396652222, + "learning_rate": 0.00011794134167261246, + "loss": 1.6830886840820312, + "step": 19600 + }, + { + "epoch": 0.05935858822817187, + "grad_norm": 0.15204225480556488, + "learning_rate": 0.00011793754649097506, + "loss": 1.685567855834961, + "step": 19610 + }, + { + "epoch": 0.0593888577785177, + "grad_norm": 0.17627684772014618, + "learning_rate": 0.00011793375130933767, + "loss": 1.7474903106689452, + "step": 19620 + }, + { + "epoch": 0.05941912732886353, + "grad_norm": 0.17402376234531403, + "learning_rate": 0.00011792995612770028, + "loss": 1.6984458923339845, + "step": 19630 + }, + { + "epoch": 0.05944939687920936, + "grad_norm": 0.19467271864414215, + "learning_rate": 0.00011792616094606288, + "loss": 1.7016197204589845, + "step": 19640 + }, + { + "epoch": 0.05947966642955519, + "grad_norm": 0.14997251331806183, + "learning_rate": 0.00011792236576442549, + "loss": 1.7364715576171874, + "step": 19650 + }, + { + "epoch": 0.05950993597990102, + "grad_norm": 0.1550562083721161, + "learning_rate": 0.0001179185705827881, + "loss": 1.717401885986328, + "step": 19660 + }, + { + "epoch": 0.059540205530246845, + "grad_norm": 0.15193428099155426, + "learning_rate": 0.0001179147754011507, + "loss": 1.7110511779785156, + "step": 19670 + }, + { + "epoch": 0.059570475080592676, + "grad_norm": 0.1484527289867401, + "learning_rate": 0.0001179109802195133, + "loss": 1.7281600952148437, + "step": 19680 + }, + { + "epoch": 0.05960074463093851, + "grad_norm": 0.17320778965950012, + "learning_rate": 0.00011790718503787591, + "loss": 1.7515161514282227, + "step": 19690 + }, + { + "epoch": 0.05963101418128434, + "grad_norm": 0.1610722839832306, + "learning_rate": 0.00011790338985623852, + "loss": 1.6908966064453126, + "step": 19700 + }, + { + "epoch": 0.05966128373163017, + "grad_norm": 0.15763390064239502, + "learning_rate": 0.00011789959467460112, + "loss": 1.6998584747314454, + "step": 19710 + }, + { + "epoch": 0.059691553281976, + "grad_norm": 0.16757553815841675, + "learning_rate": 0.00011789579949296374, + "loss": 1.6989299774169921, + "step": 19720 + }, + { + "epoch": 0.05972182283232182, + "grad_norm": 0.17305150628089905, + "learning_rate": 0.00011789200431132634, + "loss": 1.7258468627929688, + "step": 19730 + }, + { + "epoch": 0.059752092382667654, + "grad_norm": 0.15640461444854736, + "learning_rate": 0.00011788820912968896, + "loss": 1.7059642791748046, + "step": 19740 + }, + { + "epoch": 0.059782361933013485, + "grad_norm": 0.17383618652820587, + "learning_rate": 0.00011788441394805155, + "loss": 1.7210954666137694, + "step": 19750 + }, + { + "epoch": 0.059812631483359316, + "grad_norm": 0.16603413224220276, + "learning_rate": 0.00011788061876641417, + "loss": 1.7298553466796875, + "step": 19760 + }, + { + "epoch": 0.05984290103370515, + "grad_norm": 0.17092478275299072, + "learning_rate": 0.00011787682358477677, + "loss": 1.7085184097290038, + "step": 19770 + }, + { + "epoch": 0.05987317058405097, + "grad_norm": 0.1678321361541748, + "learning_rate": 0.00011787302840313938, + "loss": 1.703176498413086, + "step": 19780 + }, + { + "epoch": 0.0599034401343968, + "grad_norm": 0.12942622601985931, + "learning_rate": 0.00011786923322150199, + "loss": 1.7043394088745116, + "step": 19790 + }, + { + "epoch": 0.05993370968474263, + "grad_norm": 0.15815618634223938, + "learning_rate": 0.00011786543803986459, + "loss": 1.7055931091308594, + "step": 19800 + }, + { + "epoch": 0.05996397923508846, + "grad_norm": 0.17168591916561127, + "learning_rate": 0.0001178616428582272, + "loss": 1.6678892135620118, + "step": 19810 + }, + { + "epoch": 0.059994248785434294, + "grad_norm": 0.16610293090343475, + "learning_rate": 0.0001178578476765898, + "loss": 1.7321245193481445, + "step": 19820 + }, + { + "epoch": 0.060024518335780125, + "grad_norm": 0.17406171560287476, + "learning_rate": 0.00011785405249495241, + "loss": 1.7272445678710937, + "step": 19830 + }, + { + "epoch": 0.06005478788612595, + "grad_norm": 0.15964700281620026, + "learning_rate": 0.00011785025731331502, + "loss": 1.7150781631469727, + "step": 19840 + }, + { + "epoch": 0.06008505743647178, + "grad_norm": 0.14611263573169708, + "learning_rate": 0.00011784646213167763, + "loss": 1.7056802749633788, + "step": 19850 + }, + { + "epoch": 0.06011532698681761, + "grad_norm": 0.16025765240192413, + "learning_rate": 0.00011784266695004023, + "loss": 1.7000843048095704, + "step": 19860 + }, + { + "epoch": 0.06014559653716344, + "grad_norm": 0.1865224540233612, + "learning_rate": 0.00011783887176840285, + "loss": 1.6499771118164062, + "step": 19870 + }, + { + "epoch": 0.06017586608750927, + "grad_norm": 0.166392982006073, + "learning_rate": 0.00011783507658676544, + "loss": 1.6933233261108398, + "step": 19880 + }, + { + "epoch": 0.0602061356378551, + "grad_norm": 0.1670220047235489, + "learning_rate": 0.00011783128140512806, + "loss": 1.6858211517333985, + "step": 19890 + }, + { + "epoch": 0.06023640518820093, + "grad_norm": 0.18126438558101654, + "learning_rate": 0.00011782748622349065, + "loss": 1.7282682418823243, + "step": 19900 + }, + { + "epoch": 0.06026667473854676, + "grad_norm": 0.15282851457595825, + "learning_rate": 0.00011782369104185327, + "loss": 1.668781852722168, + "step": 19910 + }, + { + "epoch": 0.06029694428889259, + "grad_norm": 0.17081274092197418, + "learning_rate": 0.00011781989586021586, + "loss": 1.6936983108520507, + "step": 19920 + }, + { + "epoch": 0.06032721383923842, + "grad_norm": 0.21458806097507477, + "learning_rate": 0.00011781610067857848, + "loss": 1.7400562286376953, + "step": 19930 + }, + { + "epoch": 0.06035748338958425, + "grad_norm": 0.16301025450229645, + "learning_rate": 0.00011781230549694107, + "loss": 1.7350112915039062, + "step": 19940 + }, + { + "epoch": 0.060387752939930074, + "grad_norm": 0.16709503531455994, + "learning_rate": 0.0001178085103153037, + "loss": 1.7158119201660156, + "step": 19950 + }, + { + "epoch": 0.060418022490275905, + "grad_norm": 0.17641551792621613, + "learning_rate": 0.00011780471513366629, + "loss": 1.6499004364013672, + "step": 19960 + }, + { + "epoch": 0.060448292040621736, + "grad_norm": 0.14528654515743256, + "learning_rate": 0.0001178009199520289, + "loss": 1.7136417388916017, + "step": 19970 + }, + { + "epoch": 0.06047856159096757, + "grad_norm": 0.16516853868961334, + "learning_rate": 0.00011779712477039153, + "loss": 1.7039688110351563, + "step": 19980 + }, + { + "epoch": 0.0605088311413134, + "grad_norm": 0.1436363011598587, + "learning_rate": 0.00011779332958875412, + "loss": 1.6870216369628905, + "step": 19990 + }, + { + "epoch": 0.06053910069165923, + "grad_norm": 0.16597098112106323, + "learning_rate": 0.00011778953440711674, + "loss": 1.6720771789550781, + "step": 20000 + }, + { + "epoch": 0.06053910069165923, + "eval_loss": 1.6958779096603394, + "eval_runtime": 27.6704, + "eval_samples_per_second": 18.07, + "eval_steps_per_second": 1.156, + "step": 20000 + }, + { + "epoch": 0.06056937024200505, + "grad_norm": 0.17140275239944458, + "learning_rate": 0.00011778573922547933, + "loss": 1.6869203567504882, + "step": 20010 + }, + { + "epoch": 0.06059963979235088, + "grad_norm": 0.16698943078517914, + "learning_rate": 0.00011778194404384195, + "loss": 1.687807846069336, + "step": 20020 + }, + { + "epoch": 0.060629909342696714, + "grad_norm": 0.18769371509552002, + "learning_rate": 0.00011777814886220454, + "loss": 1.6614593505859374, + "step": 20030 + }, + { + "epoch": 0.060660178893042545, + "grad_norm": 0.1634249985218048, + "learning_rate": 0.00011777435368056716, + "loss": 1.7090389251708984, + "step": 20040 + }, + { + "epoch": 0.060690448443388376, + "grad_norm": 0.15225651860237122, + "learning_rate": 0.00011777055849892975, + "loss": 1.7140565872192384, + "step": 20050 + }, + { + "epoch": 0.060720717993734206, + "grad_norm": 0.1564854383468628, + "learning_rate": 0.00011776676331729237, + "loss": 1.681936264038086, + "step": 20060 + }, + { + "epoch": 0.06075098754408003, + "grad_norm": 0.18646585941314697, + "learning_rate": 0.00011776296813565497, + "loss": 1.7114572525024414, + "step": 20070 + }, + { + "epoch": 0.06078125709442586, + "grad_norm": 0.18880853056907654, + "learning_rate": 0.00011775917295401759, + "loss": 1.6787113189697265, + "step": 20080 + }, + { + "epoch": 0.06081152664477169, + "grad_norm": 0.1945522129535675, + "learning_rate": 0.00011775537777238019, + "loss": 1.7456567764282227, + "step": 20090 + }, + { + "epoch": 0.06084179619511752, + "grad_norm": 0.18316112458705902, + "learning_rate": 0.0001177515825907428, + "loss": 1.7274972915649414, + "step": 20100 + }, + { + "epoch": 0.060872065745463354, + "grad_norm": 0.17455808818340302, + "learning_rate": 0.0001177477874091054, + "loss": 1.6734756469726562, + "step": 20110 + }, + { + "epoch": 0.06090233529580918, + "grad_norm": 0.17359060049057007, + "learning_rate": 0.00011774399222746801, + "loss": 1.692220115661621, + "step": 20120 + }, + { + "epoch": 0.06093260484615501, + "grad_norm": 0.19951395690441132, + "learning_rate": 0.00011774019704583061, + "loss": 1.7248344421386719, + "step": 20130 + }, + { + "epoch": 0.06096287439650084, + "grad_norm": 0.15082648396492004, + "learning_rate": 0.00011773640186419322, + "loss": 1.666314697265625, + "step": 20140 + }, + { + "epoch": 0.06099314394684667, + "grad_norm": 0.19467328488826752, + "learning_rate": 0.00011773260668255583, + "loss": 1.6977951049804687, + "step": 20150 + }, + { + "epoch": 0.0610234134971925, + "grad_norm": 0.17828837037086487, + "learning_rate": 0.00011772881150091843, + "loss": 1.7009963989257812, + "step": 20160 + }, + { + "epoch": 0.06105368304753833, + "grad_norm": 0.14815959334373474, + "learning_rate": 0.00011772501631928104, + "loss": 1.6729139328002929, + "step": 20170 + }, + { + "epoch": 0.061083952597884156, + "grad_norm": 0.17241036891937256, + "learning_rate": 0.00011772122113764364, + "loss": 1.6862653732299804, + "step": 20180 + }, + { + "epoch": 0.06111422214822999, + "grad_norm": 0.15597772598266602, + "learning_rate": 0.00011771742595600626, + "loss": 1.7187294006347655, + "step": 20190 + }, + { + "epoch": 0.06114449169857582, + "grad_norm": 0.187378391623497, + "learning_rate": 0.00011771363077436886, + "loss": 1.7047763824462892, + "step": 20200 + }, + { + "epoch": 0.06117476124892165, + "grad_norm": 0.15555597841739655, + "learning_rate": 0.00011770983559273148, + "loss": 1.7003076553344727, + "step": 20210 + }, + { + "epoch": 0.06120503079926748, + "grad_norm": 0.16102342307567596, + "learning_rate": 0.00011770604041109408, + "loss": 1.722361946105957, + "step": 20220 + }, + { + "epoch": 0.0612353003496133, + "grad_norm": 0.1592041403055191, + "learning_rate": 0.00011770224522945669, + "loss": 1.6841901779174804, + "step": 20230 + }, + { + "epoch": 0.061265569899959134, + "grad_norm": 0.15349873900413513, + "learning_rate": 0.0001176984500478193, + "loss": 1.7188013076782227, + "step": 20240 + }, + { + "epoch": 0.061295839450304965, + "grad_norm": 0.151505246758461, + "learning_rate": 0.0001176946548661819, + "loss": 1.6999231338500977, + "step": 20250 + }, + { + "epoch": 0.061326109000650796, + "grad_norm": 0.1664222627878189, + "learning_rate": 0.0001176908596845445, + "loss": 1.6867359161376954, + "step": 20260 + }, + { + "epoch": 0.061356378550996626, + "grad_norm": 0.1564992219209671, + "learning_rate": 0.00011768706450290711, + "loss": 1.6987159729003907, + "step": 20270 + }, + { + "epoch": 0.06138664810134246, + "grad_norm": 0.1533757597208023, + "learning_rate": 0.00011768326932126972, + "loss": 1.6808197021484375, + "step": 20280 + }, + { + "epoch": 0.06141691765168828, + "grad_norm": 0.17420104146003723, + "learning_rate": 0.00011767947413963232, + "loss": 1.731307601928711, + "step": 20290 + }, + { + "epoch": 0.06144718720203411, + "grad_norm": 0.17209017276763916, + "learning_rate": 0.00011767567895799493, + "loss": 1.7433334350585938, + "step": 20300 + }, + { + "epoch": 0.06147745675237994, + "grad_norm": 0.16848105192184448, + "learning_rate": 0.00011767188377635754, + "loss": 1.7486705780029297, + "step": 20310 + }, + { + "epoch": 0.061507726302725774, + "grad_norm": 0.16443875432014465, + "learning_rate": 0.00011766808859472014, + "loss": 1.7365333557128906, + "step": 20320 + }, + { + "epoch": 0.061537995853071605, + "grad_norm": 0.17235279083251953, + "learning_rate": 0.00011766429341308276, + "loss": 1.6854818344116211, + "step": 20330 + }, + { + "epoch": 0.061568265403417435, + "grad_norm": 0.16834516823291779, + "learning_rate": 0.00011766049823144535, + "loss": 1.6724708557128907, + "step": 20340 + }, + { + "epoch": 0.06159853495376326, + "grad_norm": 0.15310895442962646, + "learning_rate": 0.00011765670304980797, + "loss": 1.712569236755371, + "step": 20350 + }, + { + "epoch": 0.06162880450410909, + "grad_norm": 0.17458420991897583, + "learning_rate": 0.00011765290786817057, + "loss": 1.7139171600341796, + "step": 20360 + }, + { + "epoch": 0.06165907405445492, + "grad_norm": 0.21162745356559753, + "learning_rate": 0.00011764911268653318, + "loss": 1.716507339477539, + "step": 20370 + }, + { + "epoch": 0.06168934360480075, + "grad_norm": 0.17663118243217468, + "learning_rate": 0.00011764531750489579, + "loss": 1.6581438064575196, + "step": 20380 + }, + { + "epoch": 0.06171961315514658, + "grad_norm": 0.16449587047100067, + "learning_rate": 0.0001176415223232584, + "loss": 1.699080467224121, + "step": 20390 + }, + { + "epoch": 0.06174988270549241, + "grad_norm": 0.15024970471858978, + "learning_rate": 0.000117637727141621, + "loss": 1.6968181610107422, + "step": 20400 + }, + { + "epoch": 0.06178015225583824, + "grad_norm": 0.19692039489746094, + "learning_rate": 0.00011763393195998361, + "loss": 1.692588996887207, + "step": 20410 + }, + { + "epoch": 0.06181042180618407, + "grad_norm": 0.1667410135269165, + "learning_rate": 0.00011763013677834621, + "loss": 1.7058279037475585, + "step": 20420 + }, + { + "epoch": 0.0618406913565299, + "grad_norm": 0.16283345222473145, + "learning_rate": 0.00011762634159670882, + "loss": 1.6922225952148438, + "step": 20430 + }, + { + "epoch": 0.06187096090687573, + "grad_norm": 0.1593266725540161, + "learning_rate": 0.00011762254641507143, + "loss": 1.746078109741211, + "step": 20440 + }, + { + "epoch": 0.06190123045722156, + "grad_norm": 0.18752385675907135, + "learning_rate": 0.00011761875123343403, + "loss": 1.676193618774414, + "step": 20450 + }, + { + "epoch": 0.061931500007567385, + "grad_norm": 0.16497235000133514, + "learning_rate": 0.00011761495605179665, + "loss": 1.7031177520751952, + "step": 20460 + }, + { + "epoch": 0.061961769557913216, + "grad_norm": 0.1716269552707672, + "learning_rate": 0.00011761116087015924, + "loss": 1.6835983276367188, + "step": 20470 + }, + { + "epoch": 0.061992039108259046, + "grad_norm": 0.16731347143650055, + "learning_rate": 0.00011760736568852186, + "loss": 1.7307985305786133, + "step": 20480 + }, + { + "epoch": 0.06202230865860488, + "grad_norm": 0.16097742319107056, + "learning_rate": 0.00011760357050688446, + "loss": 1.7024250030517578, + "step": 20490 + }, + { + "epoch": 0.06205257820895071, + "grad_norm": 0.13896267116069794, + "learning_rate": 0.00011759977532524708, + "loss": 1.7177867889404297, + "step": 20500 + }, + { + "epoch": 0.06205257820895071, + "eval_loss": 1.7089858055114746, + "eval_runtime": 28.0903, + "eval_samples_per_second": 17.8, + "eval_steps_per_second": 1.139, + "step": 20500 + }, + { + "epoch": 0.06208284775929654, + "grad_norm": 0.1464475691318512, + "learning_rate": 0.00011759598014360967, + "loss": 1.7234031677246093, + "step": 20510 + }, + { + "epoch": 0.06211311730964236, + "grad_norm": 0.1589069962501526, + "learning_rate": 0.00011759218496197229, + "loss": 1.6612861633300782, + "step": 20520 + }, + { + "epoch": 0.062143386859988194, + "grad_norm": 0.18496888875961304, + "learning_rate": 0.00011758838978033488, + "loss": 1.6635021209716796, + "step": 20530 + }, + { + "epoch": 0.062173656410334024, + "grad_norm": 0.17270272970199585, + "learning_rate": 0.0001175845945986975, + "loss": 1.6979419708251953, + "step": 20540 + }, + { + "epoch": 0.062203925960679855, + "grad_norm": 0.1728316843509674, + "learning_rate": 0.00011758079941706009, + "loss": 1.721168327331543, + "step": 20550 + }, + { + "epoch": 0.062234195511025686, + "grad_norm": 0.14694955945014954, + "learning_rate": 0.00011757700423542271, + "loss": 1.7243297576904297, + "step": 20560 + }, + { + "epoch": 0.06226446506137151, + "grad_norm": 0.15067873895168304, + "learning_rate": 0.0001175732090537853, + "loss": 1.6984291076660156, + "step": 20570 + }, + { + "epoch": 0.06229473461171734, + "grad_norm": 0.17012152075767517, + "learning_rate": 0.00011756941387214792, + "loss": 1.6870409011840821, + "step": 20580 + }, + { + "epoch": 0.06232500416206317, + "grad_norm": 0.1356968730688095, + "learning_rate": 0.00011756561869051054, + "loss": 1.6916427612304688, + "step": 20590 + }, + { + "epoch": 0.062355273712409, + "grad_norm": 0.15726503729820251, + "learning_rate": 0.00011756182350887314, + "loss": 1.6915483474731445, + "step": 20600 + }, + { + "epoch": 0.06238554326275483, + "grad_norm": 0.1692361980676651, + "learning_rate": 0.00011755802832723575, + "loss": 1.730562400817871, + "step": 20610 + }, + { + "epoch": 0.062415812813100664, + "grad_norm": 0.16671133041381836, + "learning_rate": 0.00011755423314559835, + "loss": 1.6569955825805665, + "step": 20620 + }, + { + "epoch": 0.06244608236344649, + "grad_norm": 0.1562786102294922, + "learning_rate": 0.00011755043796396097, + "loss": 1.717108154296875, + "step": 20630 + }, + { + "epoch": 0.06247635191379232, + "grad_norm": 0.1569002866744995, + "learning_rate": 0.00011754664278232356, + "loss": 1.7153360366821289, + "step": 20640 + }, + { + "epoch": 0.06250662146413816, + "grad_norm": 0.16250258684158325, + "learning_rate": 0.00011754284760068618, + "loss": 1.6805891036987304, + "step": 20650 + }, + { + "epoch": 0.06253689101448398, + "grad_norm": 0.16043826937675476, + "learning_rate": 0.00011753905241904877, + "loss": 1.7076162338256835, + "step": 20660 + }, + { + "epoch": 0.0625671605648298, + "grad_norm": 0.16308091580867767, + "learning_rate": 0.00011753525723741139, + "loss": 1.6233118057250977, + "step": 20670 + }, + { + "epoch": 0.06259743011517564, + "grad_norm": 0.16823934018611908, + "learning_rate": 0.00011753146205577398, + "loss": 1.680511474609375, + "step": 20680 + }, + { + "epoch": 0.06262769966552147, + "grad_norm": 0.13839587569236755, + "learning_rate": 0.0001175276668741366, + "loss": 1.704434585571289, + "step": 20690 + }, + { + "epoch": 0.0626579692158673, + "grad_norm": 0.14550255239009857, + "learning_rate": 0.0001175238716924992, + "loss": 1.680081558227539, + "step": 20700 + }, + { + "epoch": 0.06268823876621313, + "grad_norm": 0.1604062169790268, + "learning_rate": 0.00011752007651086181, + "loss": 1.7166976928710938, + "step": 20710 + }, + { + "epoch": 0.06271850831655895, + "grad_norm": 0.17313702404499054, + "learning_rate": 0.00011751628132922442, + "loss": 1.7132875442504882, + "step": 20720 + }, + { + "epoch": 0.06274877786690479, + "grad_norm": 0.15273509919643402, + "learning_rate": 0.00011751248614758703, + "loss": 1.7437166213989257, + "step": 20730 + }, + { + "epoch": 0.06277904741725061, + "grad_norm": 0.17128247022628784, + "learning_rate": 0.00011750869096594963, + "loss": 1.6727930068969727, + "step": 20740 + }, + { + "epoch": 0.06280931696759645, + "grad_norm": 0.16115376353263855, + "learning_rate": 0.00011750489578431224, + "loss": 1.7026786804199219, + "step": 20750 + }, + { + "epoch": 0.06283958651794228, + "grad_norm": 0.15917952358722687, + "learning_rate": 0.00011750110060267484, + "loss": 1.6910408020019532, + "step": 20760 + }, + { + "epoch": 0.0628698560682881, + "grad_norm": 0.1644868403673172, + "learning_rate": 0.00011749730542103745, + "loss": 1.6923629760742187, + "step": 20770 + }, + { + "epoch": 0.06290012561863394, + "grad_norm": 0.15499348938465118, + "learning_rate": 0.00011749351023940006, + "loss": 1.7028221130371093, + "step": 20780 + }, + { + "epoch": 0.06293039516897976, + "grad_norm": 0.15528655052185059, + "learning_rate": 0.00011748971505776266, + "loss": 1.6999412536621095, + "step": 20790 + }, + { + "epoch": 0.0629606647193256, + "grad_norm": 0.16043908894062042, + "learning_rate": 0.00011748591987612528, + "loss": 1.6666259765625, + "step": 20800 + }, + { + "epoch": 0.06299093426967142, + "grad_norm": 0.16012831032276154, + "learning_rate": 0.00011748212469448787, + "loss": 1.7196771621704101, + "step": 20810 + }, + { + "epoch": 0.06302120382001726, + "grad_norm": 0.14849629998207092, + "learning_rate": 0.00011747832951285049, + "loss": 1.6843856811523437, + "step": 20820 + }, + { + "epoch": 0.06305147337036308, + "grad_norm": 0.15788273513317108, + "learning_rate": 0.0001174745343312131, + "loss": 1.68347110748291, + "step": 20830 + }, + { + "epoch": 0.06308174292070891, + "grad_norm": 0.16449548304080963, + "learning_rate": 0.0001174707391495757, + "loss": 1.718161392211914, + "step": 20840 + }, + { + "epoch": 0.06311201247105475, + "grad_norm": 0.19070039689540863, + "learning_rate": 0.00011746694396793831, + "loss": 1.6909589767456055, + "step": 20850 + }, + { + "epoch": 0.06314228202140057, + "grad_norm": 0.18180924654006958, + "learning_rate": 0.00011746314878630092, + "loss": 1.7383403778076172, + "step": 20860 + }, + { + "epoch": 0.06317255157174641, + "grad_norm": 0.14326255023479462, + "learning_rate": 0.00011745935360466352, + "loss": 1.717071533203125, + "step": 20870 + }, + { + "epoch": 0.06320282112209223, + "grad_norm": 0.16925480961799622, + "learning_rate": 0.00011745555842302613, + "loss": 1.6864601135253907, + "step": 20880 + }, + { + "epoch": 0.06323309067243806, + "grad_norm": 0.16371440887451172, + "learning_rate": 0.00011745176324138873, + "loss": 1.6769187927246094, + "step": 20890 + }, + { + "epoch": 0.0632633602227839, + "grad_norm": 0.14557933807373047, + "learning_rate": 0.00011744796805975134, + "loss": 1.7253170013427734, + "step": 20900 + }, + { + "epoch": 0.06329362977312972, + "grad_norm": 0.15587054193019867, + "learning_rate": 0.00011744417287811395, + "loss": 1.71212158203125, + "step": 20910 + }, + { + "epoch": 0.06332389932347555, + "grad_norm": 0.1587415188550949, + "learning_rate": 0.00011744037769647655, + "loss": 1.691167449951172, + "step": 20920 + }, + { + "epoch": 0.06335416887382138, + "grad_norm": 0.17366941273212433, + "learning_rate": 0.00011743658251483916, + "loss": 1.7251092910766601, + "step": 20930 + }, + { + "epoch": 0.0633844384241672, + "grad_norm": 0.17775854468345642, + "learning_rate": 0.00011743278733320176, + "loss": 1.706576156616211, + "step": 20940 + }, + { + "epoch": 0.06341470797451304, + "grad_norm": 0.18037359416484833, + "learning_rate": 0.00011742899215156437, + "loss": 1.6842874526977538, + "step": 20950 + }, + { + "epoch": 0.06344497752485886, + "grad_norm": 0.17064014077186584, + "learning_rate": 0.00011742519696992699, + "loss": 1.73515625, + "step": 20960 + }, + { + "epoch": 0.0634752470752047, + "grad_norm": 0.16698065400123596, + "learning_rate": 0.00011742140178828958, + "loss": 1.7086923599243165, + "step": 20970 + }, + { + "epoch": 0.06350551662555053, + "grad_norm": 0.18444423377513885, + "learning_rate": 0.0001174176066066522, + "loss": 1.6802013397216797, + "step": 20980 + }, + { + "epoch": 0.06353578617589636, + "grad_norm": 0.15181398391723633, + "learning_rate": 0.00011741381142501481, + "loss": 1.7033273696899414, + "step": 20990 + }, + { + "epoch": 0.06356605572624219, + "grad_norm": 0.14304012060165405, + "learning_rate": 0.00011741001624337741, + "loss": 1.7260351181030273, + "step": 21000 + }, + { + "epoch": 0.06356605572624219, + "eval_loss": 1.6811699867248535, + "eval_runtime": 27.9799, + "eval_samples_per_second": 17.87, + "eval_steps_per_second": 1.144, + "step": 21000 + }, + { + "epoch": 0.06359632527658801, + "grad_norm": 0.16949504613876343, + "learning_rate": 0.00011740622106174002, + "loss": 1.6565319061279298, + "step": 21010 + }, + { + "epoch": 0.06362659482693385, + "grad_norm": 0.17792141437530518, + "learning_rate": 0.00011740242588010263, + "loss": 1.6452224731445313, + "step": 21020 + }, + { + "epoch": 0.06365686437727967, + "grad_norm": 0.18511143326759338, + "learning_rate": 0.00011739863069846523, + "loss": 1.639872169494629, + "step": 21030 + }, + { + "epoch": 0.06368713392762551, + "grad_norm": 0.18021784722805023, + "learning_rate": 0.00011739483551682784, + "loss": 1.7139102935791015, + "step": 21040 + }, + { + "epoch": 0.06371740347797133, + "grad_norm": 0.15169504284858704, + "learning_rate": 0.00011739104033519044, + "loss": 1.7089818954467773, + "step": 21050 + }, + { + "epoch": 0.06374767302831716, + "grad_norm": 0.16548210382461548, + "learning_rate": 0.00011738724515355305, + "loss": 1.7235557556152343, + "step": 21060 + }, + { + "epoch": 0.063777942578663, + "grad_norm": 0.16589374840259552, + "learning_rate": 0.00011738344997191567, + "loss": 1.6989492416381835, + "step": 21070 + }, + { + "epoch": 0.06380821212900882, + "grad_norm": 0.17445707321166992, + "learning_rate": 0.00011737965479027826, + "loss": 1.6546245574951173, + "step": 21080 + }, + { + "epoch": 0.06383848167935466, + "grad_norm": 0.1498548686504364, + "learning_rate": 0.00011737585960864088, + "loss": 1.7130151748657227, + "step": 21090 + }, + { + "epoch": 0.06386875122970048, + "grad_norm": 0.1852242797613144, + "learning_rate": 0.00011737206442700347, + "loss": 1.7398345947265625, + "step": 21100 + }, + { + "epoch": 0.0638990207800463, + "grad_norm": 0.16421352326869965, + "learning_rate": 0.00011736826924536609, + "loss": 1.7100635528564454, + "step": 21110 + }, + { + "epoch": 0.06392929033039214, + "grad_norm": 0.16103126108646393, + "learning_rate": 0.00011736447406372869, + "loss": 1.6667335510253907, + "step": 21120 + }, + { + "epoch": 0.06395955988073797, + "grad_norm": 0.1524035930633545, + "learning_rate": 0.0001173606788820913, + "loss": 1.7314434051513672, + "step": 21130 + }, + { + "epoch": 0.0639898294310838, + "grad_norm": 0.14443771541118622, + "learning_rate": 0.0001173568837004539, + "loss": 1.724017333984375, + "step": 21140 + }, + { + "epoch": 0.06402009898142963, + "grad_norm": 0.17540286481380463, + "learning_rate": 0.00011735308851881652, + "loss": 1.6986579895019531, + "step": 21150 + }, + { + "epoch": 0.06405036853177547, + "grad_norm": 0.18261469900608063, + "learning_rate": 0.00011734929333717911, + "loss": 1.713384246826172, + "step": 21160 + }, + { + "epoch": 0.06408063808212129, + "grad_norm": 0.22374680638313293, + "learning_rate": 0.00011734549815554173, + "loss": 1.6824012756347657, + "step": 21170 + }, + { + "epoch": 0.06411090763246712, + "grad_norm": 0.1620134562253952, + "learning_rate": 0.00011734170297390432, + "loss": 1.7043964385986328, + "step": 21180 + }, + { + "epoch": 0.06414117718281295, + "grad_norm": 0.18022295832633972, + "learning_rate": 0.00011733790779226694, + "loss": 1.6498790740966798, + "step": 21190 + }, + { + "epoch": 0.06417144673315878, + "grad_norm": 0.18645259737968445, + "learning_rate": 0.00011733411261062956, + "loss": 1.663800048828125, + "step": 21200 + }, + { + "epoch": 0.06420171628350461, + "grad_norm": 0.163804292678833, + "learning_rate": 0.00011733031742899215, + "loss": 1.67733154296875, + "step": 21210 + }, + { + "epoch": 0.06423198583385044, + "grad_norm": 0.17148403823375702, + "learning_rate": 0.00011732652224735477, + "loss": 1.6763837814331055, + "step": 21220 + }, + { + "epoch": 0.06426225538419626, + "grad_norm": 0.14999251067638397, + "learning_rate": 0.00011732272706571736, + "loss": 1.6988170623779297, + "step": 21230 + }, + { + "epoch": 0.0642925249345421, + "grad_norm": 0.17186184227466583, + "learning_rate": 0.00011731893188407998, + "loss": 1.7095869064331055, + "step": 21240 + }, + { + "epoch": 0.06432279448488792, + "grad_norm": 0.169167160987854, + "learning_rate": 0.00011731513670244258, + "loss": 1.7127464294433594, + "step": 21250 + }, + { + "epoch": 0.06435306403523376, + "grad_norm": 0.1572388857603073, + "learning_rate": 0.0001173113415208052, + "loss": 1.682317352294922, + "step": 21260 + }, + { + "epoch": 0.06438333358557959, + "grad_norm": 0.17050224542617798, + "learning_rate": 0.00011730754633916779, + "loss": 1.651481819152832, + "step": 21270 + }, + { + "epoch": 0.06441360313592541, + "grad_norm": 0.1652601957321167, + "learning_rate": 0.00011730375115753041, + "loss": 1.735546875, + "step": 21280 + }, + { + "epoch": 0.06444387268627125, + "grad_norm": 0.14782461524009705, + "learning_rate": 0.000117299955975893, + "loss": 1.7237422943115235, + "step": 21290 + }, + { + "epoch": 0.06447414223661707, + "grad_norm": 0.13092660903930664, + "learning_rate": 0.00011729616079425562, + "loss": 1.7013187408447266, + "step": 21300 + }, + { + "epoch": 0.06450441178696291, + "grad_norm": 0.15582363307476044, + "learning_rate": 0.00011729236561261821, + "loss": 1.6962924957275392, + "step": 21310 + }, + { + "epoch": 0.06453468133730873, + "grad_norm": 0.15769918262958527, + "learning_rate": 0.00011728857043098083, + "loss": 1.6895992279052734, + "step": 21320 + }, + { + "epoch": 0.06456495088765457, + "grad_norm": 0.16826754808425903, + "learning_rate": 0.00011728477524934344, + "loss": 1.6858495712280273, + "step": 21330 + }, + { + "epoch": 0.0645952204380004, + "grad_norm": 0.1598827838897705, + "learning_rate": 0.00011728098006770604, + "loss": 1.7060144424438477, + "step": 21340 + }, + { + "epoch": 0.06462548998834622, + "grad_norm": 0.1702897995710373, + "learning_rate": 0.00011727718488606865, + "loss": 1.6515708923339845, + "step": 21350 + }, + { + "epoch": 0.06465575953869206, + "grad_norm": 0.1751430481672287, + "learning_rate": 0.00011727338970443126, + "loss": 1.6668312072753906, + "step": 21360 + }, + { + "epoch": 0.06468602908903788, + "grad_norm": 0.1491347849369049, + "learning_rate": 0.00011726959452279386, + "loss": 1.6865848541259765, + "step": 21370 + }, + { + "epoch": 0.06471629863938372, + "grad_norm": 0.16572758555412292, + "learning_rate": 0.00011726579934115647, + "loss": 1.7519496917724608, + "step": 21380 + }, + { + "epoch": 0.06474656818972954, + "grad_norm": 0.18032921850681305, + "learning_rate": 0.00011726200415951909, + "loss": 1.7158332824707032, + "step": 21390 + }, + { + "epoch": 0.06477683774007537, + "grad_norm": 0.16970928013324738, + "learning_rate": 0.00011725820897788168, + "loss": 1.7791631698608399, + "step": 21400 + }, + { + "epoch": 0.0648071072904212, + "grad_norm": 0.1602131724357605, + "learning_rate": 0.0001172544137962443, + "loss": 1.6937215805053711, + "step": 21410 + }, + { + "epoch": 0.06483737684076703, + "grad_norm": 0.1443321406841278, + "learning_rate": 0.00011725061861460689, + "loss": 1.7143745422363281, + "step": 21420 + }, + { + "epoch": 0.06486764639111287, + "grad_norm": 0.1508011519908905, + "learning_rate": 0.00011724682343296951, + "loss": 1.6837638854980468, + "step": 21430 + }, + { + "epoch": 0.06489791594145869, + "grad_norm": 0.17042754590511322, + "learning_rate": 0.00011724302825133212, + "loss": 1.7324348449707032, + "step": 21440 + }, + { + "epoch": 0.06492818549180451, + "grad_norm": 0.17615947127342224, + "learning_rate": 0.00011723923306969472, + "loss": 1.6873998641967773, + "step": 21450 + }, + { + "epoch": 0.06495845504215035, + "grad_norm": 0.16217176616191864, + "learning_rate": 0.00011723543788805733, + "loss": 1.733627700805664, + "step": 21460 + }, + { + "epoch": 0.06498872459249617, + "grad_norm": 0.1706259399652481, + "learning_rate": 0.00011723164270641993, + "loss": 1.7153606414794922, + "step": 21470 + }, + { + "epoch": 0.06501899414284201, + "grad_norm": 0.16846472024917603, + "learning_rate": 0.00011722784752478254, + "loss": 1.7254783630371093, + "step": 21480 + }, + { + "epoch": 0.06504926369318784, + "grad_norm": 0.16657769680023193, + "learning_rate": 0.00011722405234314515, + "loss": 1.6699489593505858, + "step": 21490 + }, + { + "epoch": 0.06507953324353366, + "grad_norm": 0.15445482730865479, + "learning_rate": 0.00011722025716150775, + "loss": 1.6862760543823243, + "step": 21500 + }, + { + "epoch": 0.06507953324353366, + "eval_loss": 1.6742615699768066, + "eval_runtime": 28.1964, + "eval_samples_per_second": 17.733, + "eval_steps_per_second": 1.135, + "step": 21500 + }, + { + "epoch": 0.0651098027938795, + "grad_norm": 0.14418953657150269, + "learning_rate": 0.00011721646197987036, + "loss": 1.6993717193603515, + "step": 21510 + }, + { + "epoch": 0.06514007234422532, + "grad_norm": 0.14214175939559937, + "learning_rate": 0.00011721266679823296, + "loss": 1.7002555847167968, + "step": 21520 + }, + { + "epoch": 0.06517034189457116, + "grad_norm": 0.16484396159648895, + "learning_rate": 0.00011720887161659557, + "loss": 1.6853822708129882, + "step": 21530 + }, + { + "epoch": 0.06520061144491698, + "grad_norm": 0.14185184240341187, + "learning_rate": 0.00011720507643495818, + "loss": 1.6887550354003906, + "step": 21540 + }, + { + "epoch": 0.06523088099526282, + "grad_norm": 0.15738026797771454, + "learning_rate": 0.00011720128125332078, + "loss": 1.6792152404785157, + "step": 21550 + }, + { + "epoch": 0.06526115054560865, + "grad_norm": 0.17087331414222717, + "learning_rate": 0.00011719748607168339, + "loss": 1.673468017578125, + "step": 21560 + }, + { + "epoch": 0.06529142009595447, + "grad_norm": 0.15199780464172363, + "learning_rate": 0.00011719369089004601, + "loss": 1.6903121948242188, + "step": 21570 + }, + { + "epoch": 0.06532168964630031, + "grad_norm": 0.15311822295188904, + "learning_rate": 0.0001171898957084086, + "loss": 1.6862504959106446, + "step": 21580 + }, + { + "epoch": 0.06535195919664613, + "grad_norm": 0.17363324761390686, + "learning_rate": 0.00011718610052677122, + "loss": 1.7147064208984375, + "step": 21590 + }, + { + "epoch": 0.06538222874699197, + "grad_norm": 0.16480261087417603, + "learning_rate": 0.00011718230534513383, + "loss": 1.7176799774169922, + "step": 21600 + }, + { + "epoch": 0.06541249829733779, + "grad_norm": 0.17873859405517578, + "learning_rate": 0.00011717851016349643, + "loss": 1.7141151428222656, + "step": 21610 + }, + { + "epoch": 0.06544276784768362, + "grad_norm": 0.16286519169807434, + "learning_rate": 0.00011717471498185904, + "loss": 1.726565170288086, + "step": 21620 + }, + { + "epoch": 0.06547303739802945, + "grad_norm": 0.15705041587352753, + "learning_rate": 0.00011717091980022164, + "loss": 1.6972957611083985, + "step": 21630 + }, + { + "epoch": 0.06550330694837528, + "grad_norm": 0.16831959784030914, + "learning_rate": 0.00011716712461858425, + "loss": 1.6995845794677735, + "step": 21640 + }, + { + "epoch": 0.06553357649872112, + "grad_norm": 0.16246996819972992, + "learning_rate": 0.00011716332943694685, + "loss": 1.739769744873047, + "step": 21650 + }, + { + "epoch": 0.06556384604906694, + "grad_norm": 0.1650572270154953, + "learning_rate": 0.00011715953425530946, + "loss": 1.6758190155029298, + "step": 21660 + }, + { + "epoch": 0.06559411559941276, + "grad_norm": 0.168349027633667, + "learning_rate": 0.00011715573907367207, + "loss": 1.688013458251953, + "step": 21670 + }, + { + "epoch": 0.0656243851497586, + "grad_norm": 0.1633092612028122, + "learning_rate": 0.00011715194389203467, + "loss": 1.681191635131836, + "step": 21680 + }, + { + "epoch": 0.06565465470010443, + "grad_norm": 0.16612288355827332, + "learning_rate": 0.00011714814871039728, + "loss": 1.6674640655517579, + "step": 21690 + }, + { + "epoch": 0.06568492425045026, + "grad_norm": 0.15338820219039917, + "learning_rate": 0.0001171443535287599, + "loss": 1.6656225204467774, + "step": 21700 + }, + { + "epoch": 0.06571519380079609, + "grad_norm": 0.14473937451839447, + "learning_rate": 0.00011714055834712249, + "loss": 1.7036333084106445, + "step": 21710 + }, + { + "epoch": 0.06574546335114193, + "grad_norm": 0.17370246350765228, + "learning_rate": 0.00011713676316548511, + "loss": 1.704615020751953, + "step": 21720 + }, + { + "epoch": 0.06577573290148775, + "grad_norm": 0.18209940195083618, + "learning_rate": 0.0001171329679838477, + "loss": 1.697966957092285, + "step": 21730 + }, + { + "epoch": 0.06580600245183357, + "grad_norm": 0.1414414793252945, + "learning_rate": 0.00011712917280221032, + "loss": 1.6814353942871094, + "step": 21740 + }, + { + "epoch": 0.06583627200217941, + "grad_norm": 0.16655518114566803, + "learning_rate": 0.00011712537762057291, + "loss": 1.7310846328735352, + "step": 21750 + }, + { + "epoch": 0.06586654155252523, + "grad_norm": 0.2094985842704773, + "learning_rate": 0.00011712158243893553, + "loss": 1.6731880187988282, + "step": 21760 + }, + { + "epoch": 0.06589681110287107, + "grad_norm": 0.170808345079422, + "learning_rate": 0.00011711778725729813, + "loss": 1.6793291091918945, + "step": 21770 + }, + { + "epoch": 0.0659270806532169, + "grad_norm": 0.14107106626033783, + "learning_rate": 0.00011711399207566075, + "loss": 1.695378875732422, + "step": 21780 + }, + { + "epoch": 0.06595735020356272, + "grad_norm": 0.16325369477272034, + "learning_rate": 0.00011711019689402334, + "loss": 1.670426368713379, + "step": 21790 + }, + { + "epoch": 0.06598761975390856, + "grad_norm": 0.16841766238212585, + "learning_rate": 0.00011710640171238596, + "loss": 1.6540468215942383, + "step": 21800 + }, + { + "epoch": 0.06601788930425438, + "grad_norm": 0.15998154878616333, + "learning_rate": 0.00011710260653074858, + "loss": 1.6945627212524415, + "step": 21810 + }, + { + "epoch": 0.06604815885460022, + "grad_norm": 0.21276679635047913, + "learning_rate": 0.00011709881134911117, + "loss": 1.6806322097778321, + "step": 21820 + }, + { + "epoch": 0.06607842840494604, + "grad_norm": 0.1519971787929535, + "learning_rate": 0.00011709501616747379, + "loss": 1.6809772491455077, + "step": 21830 + }, + { + "epoch": 0.06610869795529187, + "grad_norm": 0.16227024793624878, + "learning_rate": 0.00011709122098583638, + "loss": 1.6958770751953125, + "step": 21840 + }, + { + "epoch": 0.0661389675056377, + "grad_norm": 0.14875808358192444, + "learning_rate": 0.000117087425804199, + "loss": 1.712874984741211, + "step": 21850 + }, + { + "epoch": 0.06616923705598353, + "grad_norm": 0.1540748029947281, + "learning_rate": 0.0001170836306225616, + "loss": 1.6885250091552735, + "step": 21860 + }, + { + "epoch": 0.06619950660632937, + "grad_norm": 0.1727815419435501, + "learning_rate": 0.00011707983544092421, + "loss": 1.6540342330932618, + "step": 21870 + }, + { + "epoch": 0.06622977615667519, + "grad_norm": 0.17773407697677612, + "learning_rate": 0.0001170760402592868, + "loss": 1.659247589111328, + "step": 21880 + }, + { + "epoch": 0.06626004570702103, + "grad_norm": 0.17500974237918854, + "learning_rate": 0.00011707224507764942, + "loss": 1.6463407516479491, + "step": 21890 + }, + { + "epoch": 0.06629031525736685, + "grad_norm": 0.1540602147579193, + "learning_rate": 0.00011706844989601202, + "loss": 1.6703477859497071, + "step": 21900 + }, + { + "epoch": 0.06632058480771268, + "grad_norm": 0.17307479679584503, + "learning_rate": 0.00011706465471437464, + "loss": 1.678122329711914, + "step": 21910 + }, + { + "epoch": 0.06635085435805851, + "grad_norm": 0.16133849322795868, + "learning_rate": 0.00011706085953273723, + "loss": 1.6952077865600585, + "step": 21920 + }, + { + "epoch": 0.06638112390840434, + "grad_norm": 0.15733851492404938, + "learning_rate": 0.00011705706435109985, + "loss": 1.67984619140625, + "step": 21930 + }, + { + "epoch": 0.06641139345875018, + "grad_norm": 0.14702774584293365, + "learning_rate": 0.00011705326916946245, + "loss": 1.7114738464355468, + "step": 21940 + }, + { + "epoch": 0.066441663009096, + "grad_norm": 0.16620422899723053, + "learning_rate": 0.00011704947398782506, + "loss": 1.7045391082763672, + "step": 21950 + }, + { + "epoch": 0.06647193255944182, + "grad_norm": 0.1661292016506195, + "learning_rate": 0.00011704567880618767, + "loss": 1.6846546173095702, + "step": 21960 + }, + { + "epoch": 0.06650220210978766, + "grad_norm": 0.14530746638774872, + "learning_rate": 0.00011704188362455027, + "loss": 1.676949691772461, + "step": 21970 + }, + { + "epoch": 0.06653247166013349, + "grad_norm": 0.15883012115955353, + "learning_rate": 0.00011703808844291288, + "loss": 1.7036468505859375, + "step": 21980 + }, + { + "epoch": 0.06656274121047932, + "grad_norm": 0.1824512928724289, + "learning_rate": 0.00011703429326127548, + "loss": 1.6720268249511718, + "step": 21990 + }, + { + "epoch": 0.06659301076082515, + "grad_norm": 0.15284821391105652, + "learning_rate": 0.0001170304980796381, + "loss": 1.6782629013061523, + "step": 22000 + }, + { + "epoch": 0.06659301076082515, + "eval_loss": 1.695688009262085, + "eval_runtime": 28.1327, + "eval_samples_per_second": 17.773, + "eval_steps_per_second": 1.137, + "step": 22000 + }, + { + "epoch": 0.06662328031117097, + "grad_norm": 0.14576047658920288, + "learning_rate": 0.0001170267028980007, + "loss": 1.7057018280029297, + "step": 22010 + }, + { + "epoch": 0.06665354986151681, + "grad_norm": 0.16564077138900757, + "learning_rate": 0.00011702290771636332, + "loss": 1.7045303344726563, + "step": 22020 + }, + { + "epoch": 0.06668381941186263, + "grad_norm": 0.18653994798660278, + "learning_rate": 0.00011701911253472591, + "loss": 1.660559844970703, + "step": 22030 + }, + { + "epoch": 0.06671408896220847, + "grad_norm": 0.15454979240894318, + "learning_rate": 0.00011701531735308853, + "loss": 1.681039047241211, + "step": 22040 + }, + { + "epoch": 0.0667443585125543, + "grad_norm": 0.15271243453025818, + "learning_rate": 0.00011701152217145112, + "loss": 1.7193790435791017, + "step": 22050 + }, + { + "epoch": 0.06677462806290013, + "grad_norm": 0.16977669298648834, + "learning_rate": 0.00011700772698981374, + "loss": 1.7104930877685547, + "step": 22060 + }, + { + "epoch": 0.06680489761324596, + "grad_norm": 0.18840959668159485, + "learning_rate": 0.00011700393180817635, + "loss": 1.68172607421875, + "step": 22070 + }, + { + "epoch": 0.06683516716359178, + "grad_norm": 0.15261423587799072, + "learning_rate": 0.00011700013662653895, + "loss": 1.711736297607422, + "step": 22080 + }, + { + "epoch": 0.06686543671393762, + "grad_norm": 0.14889074862003326, + "learning_rate": 0.00011699634144490156, + "loss": 1.6452234268188477, + "step": 22090 + }, + { + "epoch": 0.06689570626428344, + "grad_norm": 0.18181316554546356, + "learning_rate": 0.00011699254626326416, + "loss": 1.706229019165039, + "step": 22100 + }, + { + "epoch": 0.06692597581462928, + "grad_norm": 0.1644529551267624, + "learning_rate": 0.00011698875108162677, + "loss": 1.686193084716797, + "step": 22110 + }, + { + "epoch": 0.0669562453649751, + "grad_norm": 0.17932939529418945, + "learning_rate": 0.00011698495589998938, + "loss": 1.7092473983764649, + "step": 22120 + }, + { + "epoch": 0.06698651491532093, + "grad_norm": 0.17729534208774567, + "learning_rate": 0.00011698116071835198, + "loss": 1.7502883911132812, + "step": 22130 + }, + { + "epoch": 0.06701678446566676, + "grad_norm": 0.13749410212039948, + "learning_rate": 0.00011697736553671459, + "loss": 1.684381103515625, + "step": 22140 + }, + { + "epoch": 0.06704705401601259, + "grad_norm": 0.1577402800321579, + "learning_rate": 0.00011697357035507719, + "loss": 1.6902414321899415, + "step": 22150 + }, + { + "epoch": 0.06707732356635843, + "grad_norm": 0.183676615357399, + "learning_rate": 0.0001169697751734398, + "loss": 1.7086994171142578, + "step": 22160 + }, + { + "epoch": 0.06710759311670425, + "grad_norm": 0.16126862168312073, + "learning_rate": 0.0001169659799918024, + "loss": 1.6645242691040039, + "step": 22170 + }, + { + "epoch": 0.06713786266705007, + "grad_norm": 0.18138064444065094, + "learning_rate": 0.00011696218481016502, + "loss": 1.6935813903808594, + "step": 22180 + }, + { + "epoch": 0.06716813221739591, + "grad_norm": 0.16037513315677643, + "learning_rate": 0.00011695838962852762, + "loss": 1.6801267623901368, + "step": 22190 + }, + { + "epoch": 0.06719840176774174, + "grad_norm": 0.16078199446201324, + "learning_rate": 0.00011695459444689024, + "loss": 1.6673713684082032, + "step": 22200 + }, + { + "epoch": 0.06722867131808757, + "grad_norm": 0.17424680292606354, + "learning_rate": 0.00011695079926525284, + "loss": 1.6376888275146484, + "step": 22210 + }, + { + "epoch": 0.0672589408684334, + "grad_norm": 0.14978905022144318, + "learning_rate": 0.00011694700408361545, + "loss": 1.7152408599853515, + "step": 22220 + }, + { + "epoch": 0.06728921041877924, + "grad_norm": 0.1499917209148407, + "learning_rate": 0.00011694320890197805, + "loss": 1.685591697692871, + "step": 22230 + }, + { + "epoch": 0.06731947996912506, + "grad_norm": 0.15934069454669952, + "learning_rate": 0.00011693941372034066, + "loss": 1.6965763092041015, + "step": 22240 + }, + { + "epoch": 0.06734974951947088, + "grad_norm": 0.17369510233402252, + "learning_rate": 0.00011693561853870327, + "loss": 1.6989852905273437, + "step": 22250 + }, + { + "epoch": 0.06738001906981672, + "grad_norm": 0.1654369980096817, + "learning_rate": 0.00011693182335706587, + "loss": 1.642608642578125, + "step": 22260 + }, + { + "epoch": 0.06741028862016255, + "grad_norm": 0.17264112830162048, + "learning_rate": 0.00011692802817542848, + "loss": 1.6785148620605468, + "step": 22270 + }, + { + "epoch": 0.06744055817050838, + "grad_norm": 0.1529124528169632, + "learning_rate": 0.00011692423299379108, + "loss": 1.6834115982055664, + "step": 22280 + }, + { + "epoch": 0.0674708277208542, + "grad_norm": 0.2052718847990036, + "learning_rate": 0.00011692043781215369, + "loss": 1.7058475494384766, + "step": 22290 + }, + { + "epoch": 0.06750109727120003, + "grad_norm": 0.1543312817811966, + "learning_rate": 0.0001169166426305163, + "loss": 1.7019382476806642, + "step": 22300 + }, + { + "epoch": 0.06753136682154587, + "grad_norm": 0.16707611083984375, + "learning_rate": 0.00011691284744887892, + "loss": 1.6959537506103515, + "step": 22310 + }, + { + "epoch": 0.06756163637189169, + "grad_norm": 0.15467536449432373, + "learning_rate": 0.00011690905226724151, + "loss": 1.716252899169922, + "step": 22320 + }, + { + "epoch": 0.06759190592223753, + "grad_norm": 0.1684684157371521, + "learning_rate": 0.00011690525708560413, + "loss": 1.643073272705078, + "step": 22330 + }, + { + "epoch": 0.06762217547258335, + "grad_norm": 0.15317237377166748, + "learning_rate": 0.00011690146190396672, + "loss": 1.64339599609375, + "step": 22340 + }, + { + "epoch": 0.06765244502292918, + "grad_norm": 0.15047086775302887, + "learning_rate": 0.00011689766672232934, + "loss": 1.69544677734375, + "step": 22350 + }, + { + "epoch": 0.06768271457327502, + "grad_norm": 0.16099202632904053, + "learning_rate": 0.00011689387154069193, + "loss": 1.7068059921264649, + "step": 22360 + }, + { + "epoch": 0.06771298412362084, + "grad_norm": 0.15512816607952118, + "learning_rate": 0.00011689007635905455, + "loss": 1.7082950592041015, + "step": 22370 + }, + { + "epoch": 0.06774325367396668, + "grad_norm": 0.1961362659931183, + "learning_rate": 0.00011688628117741714, + "loss": 1.7371004104614258, + "step": 22380 + }, + { + "epoch": 0.0677735232243125, + "grad_norm": 0.15703055262565613, + "learning_rate": 0.00011688248599577976, + "loss": 1.6580862045288085, + "step": 22390 + }, + { + "epoch": 0.06780379277465834, + "grad_norm": 0.15132595598697662, + "learning_rate": 0.00011687869081414236, + "loss": 1.7059045791625977, + "step": 22400 + }, + { + "epoch": 0.06783406232500416, + "grad_norm": 0.17134280502796173, + "learning_rate": 0.00011687489563250497, + "loss": 1.6687446594238282, + "step": 22410 + }, + { + "epoch": 0.06786433187534999, + "grad_norm": 0.17267823219299316, + "learning_rate": 0.00011687110045086758, + "loss": 1.6763786315917968, + "step": 22420 + }, + { + "epoch": 0.06789460142569582, + "grad_norm": 0.15558567643165588, + "learning_rate": 0.00011686730526923019, + "loss": 1.6879907608032227, + "step": 22430 + }, + { + "epoch": 0.06792487097604165, + "grad_norm": 0.14652571082115173, + "learning_rate": 0.0001168635100875928, + "loss": 1.7020423889160157, + "step": 22440 + }, + { + "epoch": 0.06795514052638749, + "grad_norm": 0.161643385887146, + "learning_rate": 0.0001168597149059554, + "loss": 1.7096237182617187, + "step": 22450 + }, + { + "epoch": 0.06798541007673331, + "grad_norm": 0.16494019329547882, + "learning_rate": 0.00011685591972431802, + "loss": 1.6898227691650392, + "step": 22460 + }, + { + "epoch": 0.06801567962707913, + "grad_norm": 0.16020478308200836, + "learning_rate": 0.00011685212454268061, + "loss": 1.6578676223754882, + "step": 22470 + }, + { + "epoch": 0.06804594917742497, + "grad_norm": 0.1707494705915451, + "learning_rate": 0.00011684832936104323, + "loss": 1.665267562866211, + "step": 22480 + }, + { + "epoch": 0.0680762187277708, + "grad_norm": 0.17512305080890656, + "learning_rate": 0.00011684453417940582, + "loss": 1.7037811279296875, + "step": 22490 + }, + { + "epoch": 0.06810648827811663, + "grad_norm": 0.1494217813014984, + "learning_rate": 0.00011684073899776844, + "loss": 1.7117218017578124, + "step": 22500 + }, + { + "epoch": 0.06810648827811663, + "eval_loss": 1.7108707427978516, + "eval_runtime": 28.2476, + "eval_samples_per_second": 17.701, + "eval_steps_per_second": 1.133, + "step": 22500 + }, + { + "epoch": 0.06813675782846246, + "grad_norm": 0.16994044184684753, + "learning_rate": 0.00011683694381613103, + "loss": 1.6797975540161132, + "step": 22510 + }, + { + "epoch": 0.06816702737880828, + "grad_norm": 0.17796538770198822, + "learning_rate": 0.00011683314863449365, + "loss": 1.7002124786376953, + "step": 22520 + }, + { + "epoch": 0.06819729692915412, + "grad_norm": 0.147536501288414, + "learning_rate": 0.00011682935345285625, + "loss": 1.6507844924926758, + "step": 22530 + }, + { + "epoch": 0.06822756647949994, + "grad_norm": 0.19123564660549164, + "learning_rate": 0.00011682555827121887, + "loss": 1.6830760955810546, + "step": 22540 + }, + { + "epoch": 0.06825783602984578, + "grad_norm": 0.19485530257225037, + "learning_rate": 0.00011682176308958146, + "loss": 1.6753662109375, + "step": 22550 + }, + { + "epoch": 0.0682881055801916, + "grad_norm": 0.1556108593940735, + "learning_rate": 0.00011681796790794408, + "loss": 1.6953439712524414, + "step": 22560 + }, + { + "epoch": 0.06831837513053744, + "grad_norm": 0.16086767613887787, + "learning_rate": 0.00011681417272630668, + "loss": 1.638404083251953, + "step": 22570 + }, + { + "epoch": 0.06834864468088327, + "grad_norm": 0.16112947463989258, + "learning_rate": 0.00011681037754466929, + "loss": 1.706987190246582, + "step": 22580 + }, + { + "epoch": 0.06837891423122909, + "grad_norm": 0.15229076147079468, + "learning_rate": 0.0001168065823630319, + "loss": 1.6935707092285157, + "step": 22590 + }, + { + "epoch": 0.06840918378157493, + "grad_norm": 0.1794566810131073, + "learning_rate": 0.0001168027871813945, + "loss": 1.685091209411621, + "step": 22600 + }, + { + "epoch": 0.06843945333192075, + "grad_norm": 0.15417349338531494, + "learning_rate": 0.00011679899199975712, + "loss": 1.682177734375, + "step": 22610 + }, + { + "epoch": 0.06846972288226659, + "grad_norm": 0.1596122831106186, + "learning_rate": 0.00011679519681811971, + "loss": 1.7072830200195312, + "step": 22620 + }, + { + "epoch": 0.06849999243261241, + "grad_norm": 0.1769949197769165, + "learning_rate": 0.00011679140163648233, + "loss": 1.7218967437744142, + "step": 22630 + }, + { + "epoch": 0.06853026198295824, + "grad_norm": 0.14825166761875153, + "learning_rate": 0.00011678760645484493, + "loss": 1.676490020751953, + "step": 22640 + }, + { + "epoch": 0.06856053153330408, + "grad_norm": 0.17969994246959686, + "learning_rate": 0.00011678381127320754, + "loss": 1.7149192810058593, + "step": 22650 + }, + { + "epoch": 0.0685908010836499, + "grad_norm": 0.15293598175048828, + "learning_rate": 0.00011678001609157014, + "loss": 1.6664958953857423, + "step": 22660 + }, + { + "epoch": 0.06862107063399574, + "grad_norm": 0.1526223123073578, + "learning_rate": 0.00011677622090993276, + "loss": 1.6792011260986328, + "step": 22670 + }, + { + "epoch": 0.06865134018434156, + "grad_norm": 0.20451681315898895, + "learning_rate": 0.00011677242572829536, + "loss": 1.6756145477294921, + "step": 22680 + }, + { + "epoch": 0.06868160973468738, + "grad_norm": 0.18030954897403717, + "learning_rate": 0.00011676863054665797, + "loss": 1.6809436798095703, + "step": 22690 + }, + { + "epoch": 0.06871187928503322, + "grad_norm": 0.1479622721672058, + "learning_rate": 0.00011676483536502057, + "loss": 1.6729660034179688, + "step": 22700 + }, + { + "epoch": 0.06874214883537905, + "grad_norm": 0.1779422014951706, + "learning_rate": 0.00011676104018338318, + "loss": 1.7080732345581056, + "step": 22710 + }, + { + "epoch": 0.06877241838572488, + "grad_norm": 0.15481184422969818, + "learning_rate": 0.00011675724500174579, + "loss": 1.6863723754882813, + "step": 22720 + }, + { + "epoch": 0.06880268793607071, + "grad_norm": 0.17054210603237152, + "learning_rate": 0.00011675344982010839, + "loss": 1.6830673217773438, + "step": 22730 + }, + { + "epoch": 0.06883295748641655, + "grad_norm": 0.16262440383434296, + "learning_rate": 0.000116749654638471, + "loss": 1.7007770538330078, + "step": 22740 + }, + { + "epoch": 0.06886322703676237, + "grad_norm": 0.16862402856349945, + "learning_rate": 0.0001167458594568336, + "loss": 1.7053251266479492, + "step": 22750 + }, + { + "epoch": 0.0688934965871082, + "grad_norm": 0.15932594239711761, + "learning_rate": 0.00011674206427519621, + "loss": 1.660835075378418, + "step": 22760 + }, + { + "epoch": 0.06892376613745403, + "grad_norm": 0.16863960027694702, + "learning_rate": 0.00011673826909355882, + "loss": 1.6912666320800782, + "step": 22770 + }, + { + "epoch": 0.06895403568779986, + "grad_norm": 0.16167642176151276, + "learning_rate": 0.00011673447391192142, + "loss": 1.6790939331054688, + "step": 22780 + }, + { + "epoch": 0.0689843052381457, + "grad_norm": 0.17761607468128204, + "learning_rate": 0.00011673067873028403, + "loss": 1.6778770446777345, + "step": 22790 + }, + { + "epoch": 0.06901457478849152, + "grad_norm": 0.1540718972682953, + "learning_rate": 0.00011672688354864663, + "loss": 1.6379287719726563, + "step": 22800 + }, + { + "epoch": 0.06904484433883734, + "grad_norm": 0.168651282787323, + "learning_rate": 0.00011672308836700925, + "loss": 1.6566802978515625, + "step": 22810 + }, + { + "epoch": 0.06907511388918318, + "grad_norm": 0.15464775264263153, + "learning_rate": 0.00011671929318537186, + "loss": 1.6734809875488281, + "step": 22820 + }, + { + "epoch": 0.069105383439529, + "grad_norm": 0.14908131957054138, + "learning_rate": 0.00011671549800373447, + "loss": 1.6741146087646483, + "step": 22830 + }, + { + "epoch": 0.06913565298987484, + "grad_norm": 0.14909763634204865, + "learning_rate": 0.00011671170282209707, + "loss": 1.6656845092773438, + "step": 22840 + }, + { + "epoch": 0.06916592254022066, + "grad_norm": 0.14444679021835327, + "learning_rate": 0.00011670790764045968, + "loss": 1.6725332260131835, + "step": 22850 + }, + { + "epoch": 0.06919619209056649, + "grad_norm": 0.15716706216335297, + "learning_rate": 0.00011670411245882228, + "loss": 1.6918560028076173, + "step": 22860 + }, + { + "epoch": 0.06922646164091233, + "grad_norm": 0.14628832042217255, + "learning_rate": 0.00011670031727718489, + "loss": 1.6856197357177733, + "step": 22870 + }, + { + "epoch": 0.06925673119125815, + "grad_norm": 0.1430618166923523, + "learning_rate": 0.0001166965220955475, + "loss": 1.6903976440429687, + "step": 22880 + }, + { + "epoch": 0.06928700074160399, + "grad_norm": 0.14528217911720276, + "learning_rate": 0.0001166927269139101, + "loss": 1.7446607589721679, + "step": 22890 + }, + { + "epoch": 0.06931727029194981, + "grad_norm": 0.15149638056755066, + "learning_rate": 0.00011668893173227271, + "loss": 1.6740709304809571, + "step": 22900 + }, + { + "epoch": 0.06934753984229565, + "grad_norm": 0.16144393384456635, + "learning_rate": 0.00011668513655063531, + "loss": 1.7147069931030274, + "step": 22910 + }, + { + "epoch": 0.06937780939264147, + "grad_norm": 0.1477237194776535, + "learning_rate": 0.00011668134136899793, + "loss": 1.664227294921875, + "step": 22920 + }, + { + "epoch": 0.0694080789429873, + "grad_norm": 0.13016624748706818, + "learning_rate": 0.00011667754618736052, + "loss": 1.7173587799072265, + "step": 22930 + }, + { + "epoch": 0.06943834849333314, + "grad_norm": 0.15279309451580048, + "learning_rate": 0.00011667375100572314, + "loss": 1.6954299926757812, + "step": 22940 + }, + { + "epoch": 0.06946861804367896, + "grad_norm": 0.16384650766849518, + "learning_rate": 0.00011666995582408574, + "loss": 1.6942699432373047, + "step": 22950 + }, + { + "epoch": 0.0694988875940248, + "grad_norm": 0.13560515642166138, + "learning_rate": 0.00011666616064244836, + "loss": 1.6551143646240234, + "step": 22960 + }, + { + "epoch": 0.06952915714437062, + "grad_norm": 0.15230102837085724, + "learning_rate": 0.00011666236546081095, + "loss": 1.6867307662963866, + "step": 22970 + }, + { + "epoch": 0.06955942669471644, + "grad_norm": 0.1609395295381546, + "learning_rate": 0.00011665857027917357, + "loss": 1.703775405883789, + "step": 22980 + }, + { + "epoch": 0.06958969624506228, + "grad_norm": 0.1509232223033905, + "learning_rate": 0.00011665477509753616, + "loss": 1.6691375732421876, + "step": 22990 + }, + { + "epoch": 0.0696199657954081, + "grad_norm": 0.19104263186454773, + "learning_rate": 0.00011665097991589878, + "loss": 1.7469074249267578, + "step": 23000 + }, + { + "epoch": 0.0696199657954081, + "eval_loss": 1.7065246105194092, + "eval_runtime": 28.1158, + "eval_samples_per_second": 17.784, + "eval_steps_per_second": 1.138, + "step": 23000 + }, + { + "epoch": 0.06965023534575394, + "grad_norm": 0.15961916744709015, + "learning_rate": 0.00011664718473426137, + "loss": 1.6806180953979493, + "step": 23010 + }, + { + "epoch": 0.06968050489609977, + "grad_norm": 0.15418949723243713, + "learning_rate": 0.00011664338955262399, + "loss": 1.6728126525878906, + "step": 23020 + }, + { + "epoch": 0.06971077444644559, + "grad_norm": 0.14782249927520752, + "learning_rate": 0.0001166395943709866, + "loss": 1.7224288940429688, + "step": 23030 + }, + { + "epoch": 0.06974104399679143, + "grad_norm": 0.1763286143541336, + "learning_rate": 0.0001166357991893492, + "loss": 1.692483139038086, + "step": 23040 + }, + { + "epoch": 0.06977131354713725, + "grad_norm": 0.13710972666740417, + "learning_rate": 0.00011663200400771182, + "loss": 1.6473686218261718, + "step": 23050 + }, + { + "epoch": 0.06980158309748309, + "grad_norm": 0.1595286726951599, + "learning_rate": 0.00011662820882607442, + "loss": 1.6361965179443358, + "step": 23060 + }, + { + "epoch": 0.06983185264782892, + "grad_norm": 0.154521182179451, + "learning_rate": 0.00011662441364443704, + "loss": 1.6986757278442384, + "step": 23070 + }, + { + "epoch": 0.06986212219817474, + "grad_norm": 0.17627990245819092, + "learning_rate": 0.00011662061846279963, + "loss": 1.6719524383544921, + "step": 23080 + }, + { + "epoch": 0.06989239174852058, + "grad_norm": 0.17454802989959717, + "learning_rate": 0.00011661682328116225, + "loss": 1.7238523483276367, + "step": 23090 + }, + { + "epoch": 0.0699226612988664, + "grad_norm": 0.14800165593624115, + "learning_rate": 0.00011661302809952484, + "loss": 1.725368118286133, + "step": 23100 + }, + { + "epoch": 0.06995293084921224, + "grad_norm": 0.16224844753742218, + "learning_rate": 0.00011660923291788746, + "loss": 1.7248159408569337, + "step": 23110 + }, + { + "epoch": 0.06998320039955806, + "grad_norm": 0.14440959692001343, + "learning_rate": 0.00011660543773625005, + "loss": 1.6831485748291015, + "step": 23120 + }, + { + "epoch": 0.0700134699499039, + "grad_norm": 0.16064515709877014, + "learning_rate": 0.00011660164255461267, + "loss": 1.696713638305664, + "step": 23130 + }, + { + "epoch": 0.07004373950024972, + "grad_norm": 0.1347230076789856, + "learning_rate": 0.00011659784737297526, + "loss": 1.6857528686523438, + "step": 23140 + }, + { + "epoch": 0.07007400905059555, + "grad_norm": 0.16200867295265198, + "learning_rate": 0.00011659405219133788, + "loss": 1.6753740310668945, + "step": 23150 + }, + { + "epoch": 0.07010427860094139, + "grad_norm": 0.16846321523189545, + "learning_rate": 0.00011659025700970048, + "loss": 1.7091806411743165, + "step": 23160 + }, + { + "epoch": 0.07013454815128721, + "grad_norm": 0.1671486645936966, + "learning_rate": 0.0001165864618280631, + "loss": 1.6828676223754884, + "step": 23170 + }, + { + "epoch": 0.07016481770163305, + "grad_norm": 0.19213363528251648, + "learning_rate": 0.0001165826666464257, + "loss": 1.7140615463256836, + "step": 23180 + }, + { + "epoch": 0.07019508725197887, + "grad_norm": 0.16585518419742584, + "learning_rate": 0.0001165788714647883, + "loss": 1.670355987548828, + "step": 23190 + }, + { + "epoch": 0.0702253568023247, + "grad_norm": 0.15510953962802887, + "learning_rate": 0.00011657507628315091, + "loss": 1.7118547439575196, + "step": 23200 + }, + { + "epoch": 0.07025562635267053, + "grad_norm": 0.15749649703502655, + "learning_rate": 0.00011657128110151352, + "loss": 1.7013309478759766, + "step": 23210 + }, + { + "epoch": 0.07028589590301636, + "grad_norm": 0.15553103387355804, + "learning_rate": 0.00011656748591987614, + "loss": 1.6736980438232423, + "step": 23220 + }, + { + "epoch": 0.0703161654533622, + "grad_norm": 0.1595940738916397, + "learning_rate": 0.00011656369073823873, + "loss": 1.6922525405883788, + "step": 23230 + }, + { + "epoch": 0.07034643500370802, + "grad_norm": 0.15344353020191193, + "learning_rate": 0.00011655989555660135, + "loss": 1.656475830078125, + "step": 23240 + }, + { + "epoch": 0.07037670455405384, + "grad_norm": 0.16255177557468414, + "learning_rate": 0.00011655610037496394, + "loss": 1.6759464263916015, + "step": 23250 + }, + { + "epoch": 0.07040697410439968, + "grad_norm": 0.18538784980773926, + "learning_rate": 0.00011655230519332656, + "loss": 1.688070297241211, + "step": 23260 + }, + { + "epoch": 0.0704372436547455, + "grad_norm": 0.2023490071296692, + "learning_rate": 0.00011654851001168915, + "loss": 1.7007434844970704, + "step": 23270 + }, + { + "epoch": 0.07046751320509134, + "grad_norm": 0.17493604123592377, + "learning_rate": 0.00011654471483005177, + "loss": 1.670722198486328, + "step": 23280 + }, + { + "epoch": 0.07049778275543717, + "grad_norm": 0.15643177926540375, + "learning_rate": 0.00011654091964841438, + "loss": 1.6611976623535156, + "step": 23290 + }, + { + "epoch": 0.070528052305783, + "grad_norm": 0.1629738211631775, + "learning_rate": 0.00011653712446677699, + "loss": 1.66546630859375, + "step": 23300 + }, + { + "epoch": 0.07055832185612883, + "grad_norm": 0.15585525333881378, + "learning_rate": 0.00011653332928513959, + "loss": 1.675064468383789, + "step": 23310 + }, + { + "epoch": 0.07058859140647465, + "grad_norm": 0.16830874979496002, + "learning_rate": 0.0001165295341035022, + "loss": 1.6699886322021484, + "step": 23320 + }, + { + "epoch": 0.07061886095682049, + "grad_norm": 0.14203767478466034, + "learning_rate": 0.0001165257389218648, + "loss": 1.7088058471679688, + "step": 23330 + }, + { + "epoch": 0.07064913050716631, + "grad_norm": 0.15753565728664398, + "learning_rate": 0.00011652194374022741, + "loss": 1.6733358383178711, + "step": 23340 + }, + { + "epoch": 0.07067940005751215, + "grad_norm": 0.16782733798027039, + "learning_rate": 0.00011651814855859002, + "loss": 1.6519153594970704, + "step": 23350 + }, + { + "epoch": 0.07070966960785798, + "grad_norm": 0.18829220533370972, + "learning_rate": 0.00011651435337695262, + "loss": 1.6836380004882812, + "step": 23360 + }, + { + "epoch": 0.0707399391582038, + "grad_norm": 0.16264863312244415, + "learning_rate": 0.00011651055819531523, + "loss": 1.6845821380615233, + "step": 23370 + }, + { + "epoch": 0.07077020870854964, + "grad_norm": 0.14726610481739044, + "learning_rate": 0.00011650676301367783, + "loss": 1.6526390075683595, + "step": 23380 + }, + { + "epoch": 0.07080047825889546, + "grad_norm": 0.18635544180870056, + "learning_rate": 0.00011650296783204044, + "loss": 1.7156850814819335, + "step": 23390 + }, + { + "epoch": 0.0708307478092413, + "grad_norm": 0.15128681063652039, + "learning_rate": 0.00011649917265040305, + "loss": 1.727379035949707, + "step": 23400 + }, + { + "epoch": 0.07086101735958712, + "grad_norm": 0.16534893214702606, + "learning_rate": 0.00011649537746876565, + "loss": 1.6413045883178712, + "step": 23410 + }, + { + "epoch": 0.07089128690993295, + "grad_norm": 0.15934520959854126, + "learning_rate": 0.00011649158228712827, + "loss": 1.6806167602539062, + "step": 23420 + }, + { + "epoch": 0.07092155646027878, + "grad_norm": 0.14393207430839539, + "learning_rate": 0.00011648778710549088, + "loss": 1.7155242919921876, + "step": 23430 + }, + { + "epoch": 0.07095182601062461, + "grad_norm": 0.1518269032239914, + "learning_rate": 0.00011648399192385348, + "loss": 1.6548093795776366, + "step": 23440 + }, + { + "epoch": 0.07098209556097045, + "grad_norm": 0.15970027446746826, + "learning_rate": 0.00011648019674221609, + "loss": 1.6602638244628907, + "step": 23450 + }, + { + "epoch": 0.07101236511131627, + "grad_norm": 0.1611250638961792, + "learning_rate": 0.0001164764015605787, + "loss": 1.7010089874267578, + "step": 23460 + }, + { + "epoch": 0.07104263466166211, + "grad_norm": 0.17262926697731018, + "learning_rate": 0.0001164726063789413, + "loss": 1.689251708984375, + "step": 23470 + }, + { + "epoch": 0.07107290421200793, + "grad_norm": 0.15464039146900177, + "learning_rate": 0.0001164688111973039, + "loss": 1.7136796951293944, + "step": 23480 + }, + { + "epoch": 0.07110317376235376, + "grad_norm": 0.16253678500652313, + "learning_rate": 0.00011646501601566651, + "loss": 1.6890018463134766, + "step": 23490 + }, + { + "epoch": 0.07113344331269959, + "grad_norm": 0.1666143536567688, + "learning_rate": 0.00011646122083402912, + "loss": 1.6538339614868165, + "step": 23500 + }, + { + "epoch": 0.07113344331269959, + "eval_loss": 1.6816571950912476, + "eval_runtime": 27.9094, + "eval_samples_per_second": 17.915, + "eval_steps_per_second": 1.147, + "step": 23500 + }, + { + "epoch": 0.07116371286304542, + "grad_norm": 0.1539776623249054, + "learning_rate": 0.00011645742565239172, + "loss": 1.69971923828125, + "step": 23510 + }, + { + "epoch": 0.07119398241339125, + "grad_norm": 0.13469383120536804, + "learning_rate": 0.00011645363047075433, + "loss": 1.6870857238769532, + "step": 23520 + }, + { + "epoch": 0.07122425196373708, + "grad_norm": 0.1716221272945404, + "learning_rate": 0.00011644983528911694, + "loss": 1.6828264236450194, + "step": 23530 + }, + { + "epoch": 0.0712545215140829, + "grad_norm": 0.20908676087856293, + "learning_rate": 0.00011644604010747954, + "loss": 1.700922393798828, + "step": 23540 + }, + { + "epoch": 0.07128479106442874, + "grad_norm": 0.16277556121349335, + "learning_rate": 0.00011644224492584216, + "loss": 1.6894191741943358, + "step": 23550 + }, + { + "epoch": 0.07131506061477456, + "grad_norm": 0.18716484308242798, + "learning_rate": 0.00011643844974420475, + "loss": 1.6737604141235352, + "step": 23560 + }, + { + "epoch": 0.0713453301651204, + "grad_norm": 0.1557024121284485, + "learning_rate": 0.00011643465456256737, + "loss": 1.7014678955078124, + "step": 23570 + }, + { + "epoch": 0.07137559971546623, + "grad_norm": 0.15315985679626465, + "learning_rate": 0.00011643085938092997, + "loss": 1.6453577041625977, + "step": 23580 + }, + { + "epoch": 0.07140586926581205, + "grad_norm": 0.1747387796640396, + "learning_rate": 0.00011642706419929259, + "loss": 1.6460575103759765, + "step": 23590 + }, + { + "epoch": 0.07143613881615789, + "grad_norm": 0.17675133049488068, + "learning_rate": 0.00011642326901765518, + "loss": 1.7256134033203125, + "step": 23600 + }, + { + "epoch": 0.07146640836650371, + "grad_norm": 0.1839083582162857, + "learning_rate": 0.0001164194738360178, + "loss": 1.7233579635620118, + "step": 23610 + }, + { + "epoch": 0.07149667791684955, + "grad_norm": 0.1656116396188736, + "learning_rate": 0.00011641567865438039, + "loss": 1.659165382385254, + "step": 23620 + }, + { + "epoch": 0.07152694746719537, + "grad_norm": 0.16288359463214874, + "learning_rate": 0.00011641188347274301, + "loss": 1.6854385375976562, + "step": 23630 + }, + { + "epoch": 0.07155721701754121, + "grad_norm": 0.14881525933742523, + "learning_rate": 0.00011640808829110562, + "loss": 1.6540876388549806, + "step": 23640 + }, + { + "epoch": 0.07158748656788703, + "grad_norm": 0.17274326086044312, + "learning_rate": 0.00011640429310946822, + "loss": 1.7002113342285157, + "step": 23650 + }, + { + "epoch": 0.07161775611823286, + "grad_norm": 0.1502659171819687, + "learning_rate": 0.00011640049792783084, + "loss": 1.661540412902832, + "step": 23660 + }, + { + "epoch": 0.0716480256685787, + "grad_norm": 0.15725527703762054, + "learning_rate": 0.00011639670274619343, + "loss": 1.6672746658325195, + "step": 23670 + }, + { + "epoch": 0.07167829521892452, + "grad_norm": 0.17449240386486053, + "learning_rate": 0.00011639290756455605, + "loss": 1.678569221496582, + "step": 23680 + }, + { + "epoch": 0.07170856476927036, + "grad_norm": 0.1629231572151184, + "learning_rate": 0.00011638911238291864, + "loss": 1.6496353149414062, + "step": 23690 + }, + { + "epoch": 0.07173883431961618, + "grad_norm": 0.1676604002714157, + "learning_rate": 0.00011638531720128126, + "loss": 1.6593311309814454, + "step": 23700 + }, + { + "epoch": 0.071769103869962, + "grad_norm": 0.14736436307430267, + "learning_rate": 0.00011638152201964386, + "loss": 1.7186620712280274, + "step": 23710 + }, + { + "epoch": 0.07179937342030784, + "grad_norm": 0.13455730676651, + "learning_rate": 0.00011637772683800648, + "loss": 1.6758920669555664, + "step": 23720 + }, + { + "epoch": 0.07182964297065367, + "grad_norm": 0.13744649291038513, + "learning_rate": 0.00011637393165636907, + "loss": 1.680659866333008, + "step": 23730 + }, + { + "epoch": 0.0718599125209995, + "grad_norm": 0.16859179735183716, + "learning_rate": 0.00011637013647473169, + "loss": 1.6976314544677735, + "step": 23740 + }, + { + "epoch": 0.07189018207134533, + "grad_norm": 0.15496036410331726, + "learning_rate": 0.00011636634129309428, + "loss": 1.6649721145629883, + "step": 23750 + }, + { + "epoch": 0.07192045162169115, + "grad_norm": 0.14987552165985107, + "learning_rate": 0.0001163625461114569, + "loss": 1.663421630859375, + "step": 23760 + }, + { + "epoch": 0.07195072117203699, + "grad_norm": 0.15446598827838898, + "learning_rate": 0.00011635875092981949, + "loss": 1.686849594116211, + "step": 23770 + }, + { + "epoch": 0.07198099072238281, + "grad_norm": 0.14901192486286163, + "learning_rate": 0.00011635495574818211, + "loss": 1.6648311614990234, + "step": 23780 + }, + { + "epoch": 0.07201126027272865, + "grad_norm": 0.1554042398929596, + "learning_rate": 0.00011635116056654472, + "loss": 1.6465066909790038, + "step": 23790 + }, + { + "epoch": 0.07204152982307448, + "grad_norm": 0.15777571499347687, + "learning_rate": 0.00011634736538490732, + "loss": 1.6547077178955079, + "step": 23800 + }, + { + "epoch": 0.07207179937342031, + "grad_norm": 0.1707569807767868, + "learning_rate": 0.00011634357020326993, + "loss": 1.7026227951049804, + "step": 23810 + }, + { + "epoch": 0.07210206892376614, + "grad_norm": 0.15769045054912567, + "learning_rate": 0.00011633977502163254, + "loss": 1.6538028717041016, + "step": 23820 + }, + { + "epoch": 0.07213233847411196, + "grad_norm": 0.16683922708034515, + "learning_rate": 0.00011633597983999516, + "loss": 1.6797142028808594, + "step": 23830 + }, + { + "epoch": 0.0721626080244578, + "grad_norm": 0.16562455892562866, + "learning_rate": 0.00011633218465835775, + "loss": 1.6707359313964845, + "step": 23840 + }, + { + "epoch": 0.07219287757480362, + "grad_norm": 0.15923704206943512, + "learning_rate": 0.00011632838947672037, + "loss": 1.683365249633789, + "step": 23850 + }, + { + "epoch": 0.07222314712514946, + "grad_norm": 0.1743592470884323, + "learning_rate": 0.00011632459429508296, + "loss": 1.6655309677124024, + "step": 23860 + }, + { + "epoch": 0.07225341667549529, + "grad_norm": 0.1763850599527359, + "learning_rate": 0.00011632079911344558, + "loss": 1.685904312133789, + "step": 23870 + }, + { + "epoch": 0.07228368622584111, + "grad_norm": 0.14578920602798462, + "learning_rate": 0.00011631700393180817, + "loss": 1.7131357192993164, + "step": 23880 + }, + { + "epoch": 0.07231395577618695, + "grad_norm": 0.17674261331558228, + "learning_rate": 0.00011631320875017079, + "loss": 1.662721824645996, + "step": 23890 + }, + { + "epoch": 0.07234422532653277, + "grad_norm": 0.1784653663635254, + "learning_rate": 0.00011630941356853338, + "loss": 1.678046417236328, + "step": 23900 + }, + { + "epoch": 0.07237449487687861, + "grad_norm": 0.14737007021903992, + "learning_rate": 0.000116305618386896, + "loss": 1.6675228118896483, + "step": 23910 + }, + { + "epoch": 0.07240476442722443, + "grad_norm": 0.1354425549507141, + "learning_rate": 0.00011630182320525861, + "loss": 1.6756912231445313, + "step": 23920 + }, + { + "epoch": 0.07243503397757026, + "grad_norm": 0.15261250734329224, + "learning_rate": 0.00011629802802362121, + "loss": 1.6745206832885742, + "step": 23930 + }, + { + "epoch": 0.0724653035279161, + "grad_norm": 0.1529841423034668, + "learning_rate": 0.00011629423284198382, + "loss": 1.6702836990356444, + "step": 23940 + }, + { + "epoch": 0.07249557307826192, + "grad_norm": 0.17034535109996796, + "learning_rate": 0.00011629043766034643, + "loss": 1.6633308410644532, + "step": 23950 + }, + { + "epoch": 0.07252584262860776, + "grad_norm": 0.17701420187950134, + "learning_rate": 0.00011628664247870903, + "loss": 1.6608858108520508, + "step": 23960 + }, + { + "epoch": 0.07255611217895358, + "grad_norm": 0.14147542417049408, + "learning_rate": 0.00011628284729707164, + "loss": 1.6296016693115234, + "step": 23970 + }, + { + "epoch": 0.07258638172929942, + "grad_norm": 0.15897509455680847, + "learning_rate": 0.00011627905211543424, + "loss": 1.697835922241211, + "step": 23980 + }, + { + "epoch": 0.07261665127964524, + "grad_norm": 0.15955889225006104, + "learning_rate": 0.00011627525693379685, + "loss": 1.6896343231201172, + "step": 23990 + }, + { + "epoch": 0.07264692082999107, + "grad_norm": 0.1537739485502243, + "learning_rate": 0.00011627146175215946, + "loss": 1.6677864074707032, + "step": 24000 + }, + { + "epoch": 0.07264692082999107, + "eval_loss": 1.6840451955795288, + "eval_runtime": 27.5589, + "eval_samples_per_second": 18.143, + "eval_steps_per_second": 1.161, + "step": 24000 + }, + { + "epoch": 0.0726771903803369, + "grad_norm": 0.1490391343832016, + "learning_rate": 0.00011626766657052206, + "loss": 1.6854097366333007, + "step": 24010 + }, + { + "epoch": 0.07270745993068273, + "grad_norm": 0.150370791554451, + "learning_rate": 0.00011626387138888467, + "loss": 1.7034215927124023, + "step": 24020 + }, + { + "epoch": 0.07273772948102857, + "grad_norm": 0.14489121735095978, + "learning_rate": 0.00011626007620724729, + "loss": 1.6471363067626954, + "step": 24030 + }, + { + "epoch": 0.07276799903137439, + "grad_norm": 0.169664204120636, + "learning_rate": 0.0001162562810256099, + "loss": 1.696310806274414, + "step": 24040 + }, + { + "epoch": 0.07279826858172021, + "grad_norm": 0.15983986854553223, + "learning_rate": 0.0001162524858439725, + "loss": 1.6942493438720703, + "step": 24050 + }, + { + "epoch": 0.07282853813206605, + "grad_norm": 0.17796577513217926, + "learning_rate": 0.0001162486906623351, + "loss": 1.6934032440185547, + "step": 24060 + }, + { + "epoch": 0.07285880768241187, + "grad_norm": 0.1458820104598999, + "learning_rate": 0.00011624489548069771, + "loss": 1.6617952346801759, + "step": 24070 + }, + { + "epoch": 0.07288907723275771, + "grad_norm": 0.1701367199420929, + "learning_rate": 0.00011624110029906032, + "loss": 1.6767658233642577, + "step": 24080 + }, + { + "epoch": 0.07291934678310354, + "grad_norm": 0.13583140075206757, + "learning_rate": 0.00011623730511742292, + "loss": 1.6994499206542968, + "step": 24090 + }, + { + "epoch": 0.07294961633344936, + "grad_norm": 0.16118124127388, + "learning_rate": 0.00011623350993578553, + "loss": 1.6950307846069337, + "step": 24100 + }, + { + "epoch": 0.0729798858837952, + "grad_norm": 0.14472956955432892, + "learning_rate": 0.00011622971475414814, + "loss": 1.6584632873535157, + "step": 24110 + }, + { + "epoch": 0.07301015543414102, + "grad_norm": 0.15999774634838104, + "learning_rate": 0.00011622591957251074, + "loss": 1.6462120056152343, + "step": 24120 + }, + { + "epoch": 0.07304042498448686, + "grad_norm": 0.16197727620601654, + "learning_rate": 0.00011622212439087335, + "loss": 1.691817855834961, + "step": 24130 + }, + { + "epoch": 0.07307069453483268, + "grad_norm": 0.17114368081092834, + "learning_rate": 0.00011621832920923595, + "loss": 1.6795892715454102, + "step": 24140 + }, + { + "epoch": 0.07310096408517852, + "grad_norm": 0.1616445630788803, + "learning_rate": 0.00011621453402759856, + "loss": 1.7400314331054687, + "step": 24150 + }, + { + "epoch": 0.07313123363552435, + "grad_norm": 0.1675727814435959, + "learning_rate": 0.00011621073884596118, + "loss": 1.7247634887695313, + "step": 24160 + }, + { + "epoch": 0.07316150318587017, + "grad_norm": 0.17337311804294586, + "learning_rate": 0.00011620694366432377, + "loss": 1.6746200561523437, + "step": 24170 + }, + { + "epoch": 0.07319177273621601, + "grad_norm": 0.18734057247638702, + "learning_rate": 0.00011620314848268639, + "loss": 1.6965194702148438, + "step": 24180 + }, + { + "epoch": 0.07322204228656183, + "grad_norm": 0.15699295699596405, + "learning_rate": 0.00011619935330104898, + "loss": 1.6612768173217773, + "step": 24190 + }, + { + "epoch": 0.07325231183690767, + "grad_norm": 0.13252830505371094, + "learning_rate": 0.0001161955581194116, + "loss": 1.6478727340698243, + "step": 24200 + }, + { + "epoch": 0.07328258138725349, + "grad_norm": 0.14508181810379028, + "learning_rate": 0.0001161917629377742, + "loss": 1.6668121337890625, + "step": 24210 + }, + { + "epoch": 0.07331285093759932, + "grad_norm": 0.15240702033042908, + "learning_rate": 0.00011618796775613681, + "loss": 1.712371826171875, + "step": 24220 + }, + { + "epoch": 0.07334312048794515, + "grad_norm": 0.16405141353607178, + "learning_rate": 0.0001161841725744994, + "loss": 1.6923500061035157, + "step": 24230 + }, + { + "epoch": 0.07337339003829098, + "grad_norm": 0.15146568417549133, + "learning_rate": 0.00011618037739286203, + "loss": 1.666665267944336, + "step": 24240 + }, + { + "epoch": 0.07340365958863682, + "grad_norm": 0.14497271180152893, + "learning_rate": 0.00011617658221122463, + "loss": 1.678737258911133, + "step": 24250 + }, + { + "epoch": 0.07343392913898264, + "grad_norm": 0.15138305723667145, + "learning_rate": 0.00011617278702958724, + "loss": 1.6598644256591797, + "step": 24260 + }, + { + "epoch": 0.07346419868932846, + "grad_norm": 0.1811366230249405, + "learning_rate": 0.00011616899184794984, + "loss": 1.6214468002319335, + "step": 24270 + }, + { + "epoch": 0.0734944682396743, + "grad_norm": 0.18278026580810547, + "learning_rate": 0.00011616519666631245, + "loss": 1.6968097686767578, + "step": 24280 + }, + { + "epoch": 0.07352473779002013, + "grad_norm": 0.1753406524658203, + "learning_rate": 0.00011616140148467507, + "loss": 1.7061412811279297, + "step": 24290 + }, + { + "epoch": 0.07355500734036596, + "grad_norm": 0.1645461618900299, + "learning_rate": 0.00011615760630303766, + "loss": 1.665721321105957, + "step": 24300 + }, + { + "epoch": 0.07358527689071179, + "grad_norm": 0.1525343507528305, + "learning_rate": 0.00011615381112140028, + "loss": 1.6215618133544922, + "step": 24310 + }, + { + "epoch": 0.07361554644105762, + "grad_norm": 0.16541051864624023, + "learning_rate": 0.00011615001593976287, + "loss": 1.7034908294677735, + "step": 24320 + }, + { + "epoch": 0.07364581599140345, + "grad_norm": 0.14017149806022644, + "learning_rate": 0.0001161462207581255, + "loss": 1.731656265258789, + "step": 24330 + }, + { + "epoch": 0.07367608554174927, + "grad_norm": 0.17470958828926086, + "learning_rate": 0.00011614242557648809, + "loss": 1.6795215606689453, + "step": 24340 + }, + { + "epoch": 0.07370635509209511, + "grad_norm": 0.19056878983974457, + "learning_rate": 0.0001161386303948507, + "loss": 1.686225128173828, + "step": 24350 + }, + { + "epoch": 0.07373662464244093, + "grad_norm": 0.15496039390563965, + "learning_rate": 0.0001161348352132133, + "loss": 1.671906089782715, + "step": 24360 + }, + { + "epoch": 0.07376689419278677, + "grad_norm": 0.13728100061416626, + "learning_rate": 0.00011613104003157592, + "loss": 1.685468864440918, + "step": 24370 + }, + { + "epoch": 0.0737971637431326, + "grad_norm": 0.15486915409564972, + "learning_rate": 0.00011612724484993851, + "loss": 1.692782211303711, + "step": 24380 + }, + { + "epoch": 0.07382743329347842, + "grad_norm": 0.14527679979801178, + "learning_rate": 0.00011612344966830113, + "loss": 1.6910127639770507, + "step": 24390 + }, + { + "epoch": 0.07385770284382426, + "grad_norm": 0.15384134650230408, + "learning_rate": 0.00011611965448666374, + "loss": 1.7114059448242187, + "step": 24400 + }, + { + "epoch": 0.07388797239417008, + "grad_norm": 0.14402031898498535, + "learning_rate": 0.00011611585930502634, + "loss": 1.6878890991210938, + "step": 24410 + }, + { + "epoch": 0.07391824194451592, + "grad_norm": 0.14496713876724243, + "learning_rate": 0.00011611206412338895, + "loss": 1.6657444000244142, + "step": 24420 + }, + { + "epoch": 0.07394851149486174, + "grad_norm": 0.19452472031116486, + "learning_rate": 0.00011610826894175155, + "loss": 1.6797157287597657, + "step": 24430 + }, + { + "epoch": 0.07397878104520757, + "grad_norm": 0.1664484441280365, + "learning_rate": 0.00011610447376011417, + "loss": 1.6908208847045898, + "step": 24440 + }, + { + "epoch": 0.0740090505955534, + "grad_norm": 0.14851538836956024, + "learning_rate": 0.00011610067857847676, + "loss": 1.695409393310547, + "step": 24450 + }, + { + "epoch": 0.07403932014589923, + "grad_norm": 0.17345011234283447, + "learning_rate": 0.00011609688339683938, + "loss": 1.700838851928711, + "step": 24460 + }, + { + "epoch": 0.07406958969624507, + "grad_norm": 0.14567293226718903, + "learning_rate": 0.00011609308821520198, + "loss": 1.6628190994262695, + "step": 24470 + }, + { + "epoch": 0.07409985924659089, + "grad_norm": 0.16751138865947723, + "learning_rate": 0.0001160892930335646, + "loss": 1.6944019317626953, + "step": 24480 + }, + { + "epoch": 0.07413012879693673, + "grad_norm": 0.16101834177970886, + "learning_rate": 0.00011608549785192719, + "loss": 1.6567310333251952, + "step": 24490 + }, + { + "epoch": 0.07416039834728255, + "grad_norm": 0.1704079955816269, + "learning_rate": 0.00011608170267028981, + "loss": 1.684427261352539, + "step": 24500 + }, + { + "epoch": 0.07416039834728255, + "eval_loss": 1.6562964916229248, + "eval_runtime": 27.9311, + "eval_samples_per_second": 17.901, + "eval_steps_per_second": 1.146, + "step": 24500 + }, + { + "epoch": 0.07419066789762838, + "grad_norm": 0.14968754351139069, + "learning_rate": 0.0001160779074886524, + "loss": 1.6872108459472657, + "step": 24510 + }, + { + "epoch": 0.07422093744797421, + "grad_norm": 0.19533568620681763, + "learning_rate": 0.00011607411230701502, + "loss": 1.6594133377075195, + "step": 24520 + }, + { + "epoch": 0.07425120699832004, + "grad_norm": 0.1717483252286911, + "learning_rate": 0.00011607031712537763, + "loss": 1.686231803894043, + "step": 24530 + }, + { + "epoch": 0.07428147654866588, + "grad_norm": 0.14387381076812744, + "learning_rate": 0.00011606652194374023, + "loss": 1.680649185180664, + "step": 24540 + }, + { + "epoch": 0.0743117460990117, + "grad_norm": 0.18121455609798431, + "learning_rate": 0.00011606272676210284, + "loss": 1.6803565979003907, + "step": 24550 + }, + { + "epoch": 0.07434201564935752, + "grad_norm": 0.18255098164081573, + "learning_rate": 0.00011605893158046544, + "loss": 1.6946727752685546, + "step": 24560 + }, + { + "epoch": 0.07437228519970336, + "grad_norm": 0.18024380505084991, + "learning_rate": 0.00011605513639882805, + "loss": 1.6413587570190429, + "step": 24570 + }, + { + "epoch": 0.07440255475004919, + "grad_norm": 0.16907548904418945, + "learning_rate": 0.00011605134121719066, + "loss": 1.6769981384277344, + "step": 24580 + }, + { + "epoch": 0.07443282430039502, + "grad_norm": 0.1680530160665512, + "learning_rate": 0.00011604754603555326, + "loss": 1.6687253952026366, + "step": 24590 + }, + { + "epoch": 0.07446309385074085, + "grad_norm": 0.15768514573574066, + "learning_rate": 0.00011604375085391587, + "loss": 1.674140167236328, + "step": 24600 + }, + { + "epoch": 0.07449336340108667, + "grad_norm": 0.1741001456975937, + "learning_rate": 0.00011603995567227847, + "loss": 1.6861881256103515, + "step": 24610 + }, + { + "epoch": 0.07452363295143251, + "grad_norm": 0.1487557590007782, + "learning_rate": 0.00011603616049064108, + "loss": 1.6538652420043944, + "step": 24620 + }, + { + "epoch": 0.07455390250177833, + "grad_norm": 0.1550414115190506, + "learning_rate": 0.00011603236530900369, + "loss": 1.687643051147461, + "step": 24630 + }, + { + "epoch": 0.07458417205212417, + "grad_norm": 0.1583237648010254, + "learning_rate": 0.00011602857012736629, + "loss": 1.6286643981933593, + "step": 24640 + }, + { + "epoch": 0.07461444160247, + "grad_norm": 0.1639247089624405, + "learning_rate": 0.00011602477494572891, + "loss": 1.6774351119995117, + "step": 24650 + }, + { + "epoch": 0.07464471115281582, + "grad_norm": 0.16048386693000793, + "learning_rate": 0.00011602097976409152, + "loss": 1.6860904693603516, + "step": 24660 + }, + { + "epoch": 0.07467498070316166, + "grad_norm": 0.14268764853477478, + "learning_rate": 0.00011601718458245412, + "loss": 1.7102693557739257, + "step": 24670 + }, + { + "epoch": 0.07470525025350748, + "grad_norm": 0.1668379008769989, + "learning_rate": 0.00011601338940081673, + "loss": 1.6893413543701172, + "step": 24680 + }, + { + "epoch": 0.07473551980385332, + "grad_norm": 0.1665450483560562, + "learning_rate": 0.00011600959421917933, + "loss": 1.6845947265625, + "step": 24690 + }, + { + "epoch": 0.07476578935419914, + "grad_norm": 0.14590223133563995, + "learning_rate": 0.00011600579903754194, + "loss": 1.6976333618164063, + "step": 24700 + }, + { + "epoch": 0.07479605890454498, + "grad_norm": 0.15954720973968506, + "learning_rate": 0.00011600200385590455, + "loss": 1.6676742553710937, + "step": 24710 + }, + { + "epoch": 0.0748263284548908, + "grad_norm": 0.16234833002090454, + "learning_rate": 0.00011599820867426715, + "loss": 1.720818328857422, + "step": 24720 + }, + { + "epoch": 0.07485659800523663, + "grad_norm": 0.15550917387008667, + "learning_rate": 0.00011599441349262976, + "loss": 1.6868587493896485, + "step": 24730 + }, + { + "epoch": 0.07488686755558246, + "grad_norm": 0.1832134872674942, + "learning_rate": 0.00011599061831099236, + "loss": 1.6745304107666015, + "step": 24740 + }, + { + "epoch": 0.07491713710592829, + "grad_norm": 0.15015672147274017, + "learning_rate": 0.00011598682312935497, + "loss": 1.6691741943359375, + "step": 24750 + }, + { + "epoch": 0.07494740665627413, + "grad_norm": 0.1627894937992096, + "learning_rate": 0.00011598302794771758, + "loss": 1.664055633544922, + "step": 24760 + }, + { + "epoch": 0.07497767620661995, + "grad_norm": 0.14354461431503296, + "learning_rate": 0.0001159792327660802, + "loss": 1.6743669509887695, + "step": 24770 + }, + { + "epoch": 0.07500794575696577, + "grad_norm": 0.15045450627803802, + "learning_rate": 0.00011597543758444279, + "loss": 1.6914966583251954, + "step": 24780 + }, + { + "epoch": 0.07503821530731161, + "grad_norm": 0.12805640697479248, + "learning_rate": 0.00011597164240280541, + "loss": 1.7003978729248046, + "step": 24790 + }, + { + "epoch": 0.07506848485765744, + "grad_norm": 0.1499098688364029, + "learning_rate": 0.000115967847221168, + "loss": 1.6992362976074218, + "step": 24800 + }, + { + "epoch": 0.07509875440800327, + "grad_norm": 0.15870100259780884, + "learning_rate": 0.00011596405203953062, + "loss": 1.6985971450805664, + "step": 24810 + }, + { + "epoch": 0.0751290239583491, + "grad_norm": 0.14552298188209534, + "learning_rate": 0.00011596025685789321, + "loss": 1.7316635131835938, + "step": 24820 + }, + { + "epoch": 0.07515929350869492, + "grad_norm": 0.16268491744995117, + "learning_rate": 0.00011595646167625583, + "loss": 1.6609992980957031, + "step": 24830 + }, + { + "epoch": 0.07518956305904076, + "grad_norm": 0.16350775957107544, + "learning_rate": 0.00011595266649461844, + "loss": 1.6745250701904297, + "step": 24840 + }, + { + "epoch": 0.07521983260938658, + "grad_norm": 0.15235526859760284, + "learning_rate": 0.00011594887131298104, + "loss": 1.6658241271972656, + "step": 24850 + }, + { + "epoch": 0.07525010215973242, + "grad_norm": 0.18059085309505463, + "learning_rate": 0.00011594507613134365, + "loss": 1.6742929458618163, + "step": 24860 + }, + { + "epoch": 0.07528037171007824, + "grad_norm": 0.15796159207820892, + "learning_rate": 0.00011594128094970626, + "loss": 1.6374776840209961, + "step": 24870 + }, + { + "epoch": 0.07531064126042408, + "grad_norm": 0.14865446090698242, + "learning_rate": 0.00011593748576806886, + "loss": 1.6623428344726563, + "step": 24880 + }, + { + "epoch": 0.0753409108107699, + "grad_norm": 0.14556552469730377, + "learning_rate": 0.00011593369058643147, + "loss": 1.672923469543457, + "step": 24890 + }, + { + "epoch": 0.07537118036111573, + "grad_norm": 0.14488817751407623, + "learning_rate": 0.00011592989540479409, + "loss": 1.6728614807128905, + "step": 24900 + }, + { + "epoch": 0.07540144991146157, + "grad_norm": 0.17311017215251923, + "learning_rate": 0.00011592610022315668, + "loss": 1.6533475875854493, + "step": 24910 + }, + { + "epoch": 0.07543171946180739, + "grad_norm": 0.15813469886779785, + "learning_rate": 0.0001159223050415193, + "loss": 1.6678281784057618, + "step": 24920 + }, + { + "epoch": 0.07546198901215323, + "grad_norm": 0.1347973793745041, + "learning_rate": 0.00011591850985988189, + "loss": 1.6642257690429687, + "step": 24930 + }, + { + "epoch": 0.07549225856249905, + "grad_norm": 0.14740291237831116, + "learning_rate": 0.00011591471467824451, + "loss": 1.6569154739379883, + "step": 24940 + }, + { + "epoch": 0.07552252811284488, + "grad_norm": 0.16316330432891846, + "learning_rate": 0.0001159109194966071, + "loss": 1.669955062866211, + "step": 24950 + }, + { + "epoch": 0.07555279766319072, + "grad_norm": 0.16540642082691193, + "learning_rate": 0.00011590712431496972, + "loss": 1.6949260711669922, + "step": 24960 + }, + { + "epoch": 0.07558306721353654, + "grad_norm": 0.1600542813539505, + "learning_rate": 0.00011590332913333231, + "loss": 1.6831058502197265, + "step": 24970 + }, + { + "epoch": 0.07561333676388238, + "grad_norm": 0.19693568348884583, + "learning_rate": 0.00011589953395169493, + "loss": 1.6659385681152343, + "step": 24980 + }, + { + "epoch": 0.0756436063142282, + "grad_norm": 0.1500208079814911, + "learning_rate": 0.00011589573877005753, + "loss": 1.6773447036743163, + "step": 24990 + }, + { + "epoch": 0.07567387586457403, + "grad_norm": 0.18720871210098267, + "learning_rate": 0.00011589194358842015, + "loss": 1.6333892822265625, + "step": 25000 + }, + { + "epoch": 0.07567387586457403, + "eval_loss": 1.6762073040008545, + "eval_runtime": 28.0393, + "eval_samples_per_second": 17.832, + "eval_steps_per_second": 1.141, + "step": 25000 + }, + { + "epoch": 0.07570414541491986, + "grad_norm": 0.16682615876197815, + "learning_rate": 0.00011588814840678274, + "loss": 1.6607946395874023, + "step": 25010 + }, + { + "epoch": 0.07573441496526569, + "grad_norm": 0.16382457315921783, + "learning_rate": 0.00011588435322514536, + "loss": 1.6600061416625977, + "step": 25020 + }, + { + "epoch": 0.07576468451561152, + "grad_norm": 0.14938928186893463, + "learning_rate": 0.00011588055804350796, + "loss": 1.6909019470214843, + "step": 25030 + }, + { + "epoch": 0.07579495406595735, + "grad_norm": 0.19188977777957916, + "learning_rate": 0.00011587676286187057, + "loss": 1.7102149963378905, + "step": 25040 + }, + { + "epoch": 0.07582522361630319, + "grad_norm": 0.1685790717601776, + "learning_rate": 0.00011587296768023319, + "loss": 1.736658477783203, + "step": 25050 + }, + { + "epoch": 0.07585549316664901, + "grad_norm": 0.13827013969421387, + "learning_rate": 0.00011586917249859578, + "loss": 1.6762943267822266, + "step": 25060 + }, + { + "epoch": 0.07588576271699483, + "grad_norm": 0.14462484419345856, + "learning_rate": 0.0001158653773169584, + "loss": 1.7157657623291016, + "step": 25070 + }, + { + "epoch": 0.07591603226734067, + "grad_norm": 0.17407739162445068, + "learning_rate": 0.000115861582135321, + "loss": 1.6776161193847656, + "step": 25080 + }, + { + "epoch": 0.0759463018176865, + "grad_norm": 0.15053611993789673, + "learning_rate": 0.00011585778695368361, + "loss": 1.654344367980957, + "step": 25090 + }, + { + "epoch": 0.07597657136803233, + "grad_norm": 0.18362537026405334, + "learning_rate": 0.0001158539917720462, + "loss": 1.64873046875, + "step": 25100 + }, + { + "epoch": 0.07600684091837816, + "grad_norm": 0.16353587806224823, + "learning_rate": 0.00011585019659040883, + "loss": 1.6575611114501954, + "step": 25110 + }, + { + "epoch": 0.07603711046872398, + "grad_norm": 0.19137445092201233, + "learning_rate": 0.00011584640140877142, + "loss": 1.6953826904296876, + "step": 25120 + }, + { + "epoch": 0.07606738001906982, + "grad_norm": 0.18674850463867188, + "learning_rate": 0.00011584260622713404, + "loss": 1.6741334915161132, + "step": 25130 + }, + { + "epoch": 0.07609764956941564, + "grad_norm": 0.12683741748332977, + "learning_rate": 0.00011583881104549664, + "loss": 1.6836517333984375, + "step": 25140 + }, + { + "epoch": 0.07612791911976148, + "grad_norm": 0.14140696823596954, + "learning_rate": 0.00011583501586385925, + "loss": 1.6170814514160157, + "step": 25150 + }, + { + "epoch": 0.0761581886701073, + "grad_norm": 0.16048632562160492, + "learning_rate": 0.00011583122068222185, + "loss": 1.675438690185547, + "step": 25160 + }, + { + "epoch": 0.07618845822045313, + "grad_norm": 0.15748386085033417, + "learning_rate": 0.00011582742550058446, + "loss": 1.6917381286621094, + "step": 25170 + }, + { + "epoch": 0.07621872777079897, + "grad_norm": 0.14873641729354858, + "learning_rate": 0.00011582363031894707, + "loss": 1.6547979354858398, + "step": 25180 + }, + { + "epoch": 0.07624899732114479, + "grad_norm": 0.13180577754974365, + "learning_rate": 0.00011581983513730967, + "loss": 1.6981800079345704, + "step": 25190 + }, + { + "epoch": 0.07627926687149063, + "grad_norm": 0.13966284692287445, + "learning_rate": 0.00011581603995567228, + "loss": 1.7081871032714844, + "step": 25200 + }, + { + "epoch": 0.07630953642183645, + "grad_norm": 0.17762984335422516, + "learning_rate": 0.00011581224477403488, + "loss": 1.684016799926758, + "step": 25210 + }, + { + "epoch": 0.07633980597218229, + "grad_norm": 0.12395729869604111, + "learning_rate": 0.00011580844959239749, + "loss": 1.6795669555664063, + "step": 25220 + }, + { + "epoch": 0.07637007552252811, + "grad_norm": 0.14886727929115295, + "learning_rate": 0.0001158046544107601, + "loss": 1.6671075820922852, + "step": 25230 + }, + { + "epoch": 0.07640034507287394, + "grad_norm": 0.12474680691957474, + "learning_rate": 0.0001158008592291227, + "loss": 1.6478147506713867, + "step": 25240 + }, + { + "epoch": 0.07643061462321978, + "grad_norm": 0.1663755178451538, + "learning_rate": 0.00011579706404748531, + "loss": 1.6475688934326171, + "step": 25250 + }, + { + "epoch": 0.0764608841735656, + "grad_norm": 0.15036365389823914, + "learning_rate": 0.00011579326886584793, + "loss": 1.6279512405395509, + "step": 25260 + }, + { + "epoch": 0.07649115372391144, + "grad_norm": 0.1635393500328064, + "learning_rate": 0.00011578947368421053, + "loss": 1.6730907440185547, + "step": 25270 + }, + { + "epoch": 0.07652142327425726, + "grad_norm": 0.16081346571445465, + "learning_rate": 0.00011578567850257314, + "loss": 1.6834474563598634, + "step": 25280 + }, + { + "epoch": 0.07655169282460308, + "grad_norm": 0.15031179785728455, + "learning_rate": 0.00011578188332093575, + "loss": 1.7187154769897461, + "step": 25290 + }, + { + "epoch": 0.07658196237494892, + "grad_norm": 0.1473073661327362, + "learning_rate": 0.00011577808813929835, + "loss": 1.667561912536621, + "step": 25300 + }, + { + "epoch": 0.07661223192529475, + "grad_norm": 0.18325665593147278, + "learning_rate": 0.00011577429295766096, + "loss": 1.6652606964111327, + "step": 25310 + }, + { + "epoch": 0.07664250147564058, + "grad_norm": 0.16510742902755737, + "learning_rate": 0.00011577049777602356, + "loss": 1.6948495864868165, + "step": 25320 + }, + { + "epoch": 0.07667277102598641, + "grad_norm": 0.16875424981117249, + "learning_rate": 0.00011576670259438617, + "loss": 1.7142158508300782, + "step": 25330 + }, + { + "epoch": 0.07670304057633223, + "grad_norm": 0.1955258697271347, + "learning_rate": 0.00011576290741274878, + "loss": 1.628047561645508, + "step": 25340 + }, + { + "epoch": 0.07673331012667807, + "grad_norm": 0.16955780982971191, + "learning_rate": 0.00011575911223111138, + "loss": 1.6571300506591797, + "step": 25350 + }, + { + "epoch": 0.0767635796770239, + "grad_norm": 0.14975498616695404, + "learning_rate": 0.00011575531704947399, + "loss": 1.6727313995361328, + "step": 25360 + }, + { + "epoch": 0.07679384922736973, + "grad_norm": 0.15794141590595245, + "learning_rate": 0.0001157515218678366, + "loss": 1.651186752319336, + "step": 25370 + }, + { + "epoch": 0.07682411877771556, + "grad_norm": 0.13808056712150574, + "learning_rate": 0.0001157477266861992, + "loss": 1.70860595703125, + "step": 25380 + }, + { + "epoch": 0.0768543883280614, + "grad_norm": 0.15617620944976807, + "learning_rate": 0.0001157439315045618, + "loss": 1.6547409057617188, + "step": 25390 + }, + { + "epoch": 0.07688465787840722, + "grad_norm": 0.14300191402435303, + "learning_rate": 0.00011574013632292442, + "loss": 1.6657682418823243, + "step": 25400 + }, + { + "epoch": 0.07691492742875304, + "grad_norm": 0.17353294789791107, + "learning_rate": 0.00011573634114128702, + "loss": 1.6840080261230468, + "step": 25410 + }, + { + "epoch": 0.07694519697909888, + "grad_norm": 0.18182244896888733, + "learning_rate": 0.00011573254595964964, + "loss": 1.6888301849365235, + "step": 25420 + }, + { + "epoch": 0.0769754665294447, + "grad_norm": 0.17752228677272797, + "learning_rate": 0.00011572875077801223, + "loss": 1.6517868041992188, + "step": 25430 + }, + { + "epoch": 0.07700573607979054, + "grad_norm": 0.1642887145280838, + "learning_rate": 0.00011572495559637485, + "loss": 1.6813175201416015, + "step": 25440 + }, + { + "epoch": 0.07703600563013636, + "grad_norm": 0.15314309298992157, + "learning_rate": 0.00011572116041473745, + "loss": 1.666610336303711, + "step": 25450 + }, + { + "epoch": 0.07706627518048219, + "grad_norm": 0.15190696716308594, + "learning_rate": 0.00011571736523310006, + "loss": 1.700186538696289, + "step": 25460 + }, + { + "epoch": 0.07709654473082803, + "grad_norm": 0.1458640992641449, + "learning_rate": 0.00011571357005146267, + "loss": 1.6673152923583985, + "step": 25470 + }, + { + "epoch": 0.07712681428117385, + "grad_norm": 0.14924439787864685, + "learning_rate": 0.00011570977486982527, + "loss": 1.7032943725585938, + "step": 25480 + }, + { + "epoch": 0.07715708383151969, + "grad_norm": 0.15106754004955292, + "learning_rate": 0.00011570597968818788, + "loss": 1.6548255920410155, + "step": 25490 + }, + { + "epoch": 0.07718735338186551, + "grad_norm": 0.16070395708084106, + "learning_rate": 0.00011570218450655048, + "loss": 1.642713737487793, + "step": 25500 + }, + { + "epoch": 0.07718735338186551, + "eval_loss": 1.6969033479690552, + "eval_runtime": 28.4059, + "eval_samples_per_second": 17.602, + "eval_steps_per_second": 1.127, + "step": 25500 + }, + { + "epoch": 0.07721762293221134, + "grad_norm": 0.14764028787612915, + "learning_rate": 0.0001156983893249131, + "loss": 1.6509958267211915, + "step": 25510 + }, + { + "epoch": 0.07724789248255717, + "grad_norm": 0.15240676701068878, + "learning_rate": 0.0001156945941432757, + "loss": 1.6780929565429688, + "step": 25520 + }, + { + "epoch": 0.077278162032903, + "grad_norm": 0.15385641157627106, + "learning_rate": 0.00011569079896163832, + "loss": 1.6727630615234375, + "step": 25530 + }, + { + "epoch": 0.07730843158324884, + "grad_norm": 0.15475192666053772, + "learning_rate": 0.00011568700378000091, + "loss": 1.6527610778808595, + "step": 25540 + }, + { + "epoch": 0.07733870113359466, + "grad_norm": 0.13601462543010712, + "learning_rate": 0.00011568320859836353, + "loss": 1.653555679321289, + "step": 25550 + }, + { + "epoch": 0.0773689706839405, + "grad_norm": 0.14261704683303833, + "learning_rate": 0.00011567941341672612, + "loss": 1.6816566467285157, + "step": 25560 + }, + { + "epoch": 0.07739924023428632, + "grad_norm": 0.15338072180747986, + "learning_rate": 0.00011567561823508874, + "loss": 1.658034324645996, + "step": 25570 + }, + { + "epoch": 0.07742950978463214, + "grad_norm": 0.15995490550994873, + "learning_rate": 0.00011567182305345133, + "loss": 1.694549560546875, + "step": 25580 + }, + { + "epoch": 0.07745977933497798, + "grad_norm": 0.1701045036315918, + "learning_rate": 0.00011566802787181395, + "loss": 1.6471269607543946, + "step": 25590 + }, + { + "epoch": 0.0774900488853238, + "grad_norm": 0.1622561514377594, + "learning_rate": 0.00011566423269017654, + "loss": 1.6870157241821289, + "step": 25600 + }, + { + "epoch": 0.07752031843566964, + "grad_norm": 0.13741400837898254, + "learning_rate": 0.00011566043750853916, + "loss": 1.6581806182861327, + "step": 25610 + }, + { + "epoch": 0.07755058798601547, + "grad_norm": 0.15144208073616028, + "learning_rate": 0.00011565664232690176, + "loss": 1.6963535308837892, + "step": 25620 + }, + { + "epoch": 0.07758085753636129, + "grad_norm": 0.16512992978096008, + "learning_rate": 0.00011565284714526438, + "loss": 1.6802753448486327, + "step": 25630 + }, + { + "epoch": 0.07761112708670713, + "grad_norm": 0.16024932265281677, + "learning_rate": 0.00011564905196362698, + "loss": 1.65736083984375, + "step": 25640 + }, + { + "epoch": 0.07764139663705295, + "grad_norm": 0.1604354828596115, + "learning_rate": 0.00011564525678198959, + "loss": 1.656350326538086, + "step": 25650 + }, + { + "epoch": 0.07767166618739879, + "grad_norm": 0.15388163924217224, + "learning_rate": 0.0001156414616003522, + "loss": 1.6962919235229492, + "step": 25660 + }, + { + "epoch": 0.07770193573774462, + "grad_norm": 0.16305209696292877, + "learning_rate": 0.0001156376664187148, + "loss": 1.6889427185058594, + "step": 25670 + }, + { + "epoch": 0.07773220528809044, + "grad_norm": 0.14600588381290436, + "learning_rate": 0.00011563387123707742, + "loss": 1.6597099304199219, + "step": 25680 + }, + { + "epoch": 0.07776247483843628, + "grad_norm": 0.15095224976539612, + "learning_rate": 0.00011563007605544001, + "loss": 1.6841159820556642, + "step": 25690 + }, + { + "epoch": 0.0777927443887821, + "grad_norm": 0.13115911185741425, + "learning_rate": 0.00011562628087380263, + "loss": 1.605951690673828, + "step": 25700 + }, + { + "epoch": 0.07782301393912794, + "grad_norm": 0.15404783189296722, + "learning_rate": 0.00011562248569216522, + "loss": 1.6656463623046875, + "step": 25710 + }, + { + "epoch": 0.07785328348947376, + "grad_norm": 0.14522704482078552, + "learning_rate": 0.00011561869051052784, + "loss": 1.7330028533935546, + "step": 25720 + }, + { + "epoch": 0.0778835530398196, + "grad_norm": 0.1815081238746643, + "learning_rate": 0.00011561489532889043, + "loss": 1.6719253540039063, + "step": 25730 + }, + { + "epoch": 0.07791382259016542, + "grad_norm": 0.17056040465831757, + "learning_rate": 0.00011561110014725305, + "loss": 1.6528762817382812, + "step": 25740 + }, + { + "epoch": 0.07794409214051125, + "grad_norm": 0.13875985145568848, + "learning_rate": 0.00011560730496561565, + "loss": 1.6544971466064453, + "step": 25750 + }, + { + "epoch": 0.07797436169085709, + "grad_norm": 0.16037753224372864, + "learning_rate": 0.00011560350978397827, + "loss": 1.6312374114990233, + "step": 25760 + }, + { + "epoch": 0.07800463124120291, + "grad_norm": 0.1561613380908966, + "learning_rate": 0.00011559971460234087, + "loss": 1.695547103881836, + "step": 25770 + }, + { + "epoch": 0.07803490079154875, + "grad_norm": 0.1761149764060974, + "learning_rate": 0.00011559591942070348, + "loss": 1.6862619400024415, + "step": 25780 + }, + { + "epoch": 0.07806517034189457, + "grad_norm": 0.15866714715957642, + "learning_rate": 0.00011559212423906608, + "loss": 1.6534679412841797, + "step": 25790 + }, + { + "epoch": 0.0780954398922404, + "grad_norm": 0.1508786827325821, + "learning_rate": 0.00011558832905742869, + "loss": 1.6493175506591797, + "step": 25800 + }, + { + "epoch": 0.07812570944258623, + "grad_norm": 0.1476542055606842, + "learning_rate": 0.0001155845338757913, + "loss": 1.680655288696289, + "step": 25810 + }, + { + "epoch": 0.07815597899293206, + "grad_norm": 0.15935693681240082, + "learning_rate": 0.0001155807386941539, + "loss": 1.672088050842285, + "step": 25820 + }, + { + "epoch": 0.0781862485432779, + "grad_norm": 0.16848251223564148, + "learning_rate": 0.00011557694351251651, + "loss": 1.6930112838745117, + "step": 25830 + }, + { + "epoch": 0.07821651809362372, + "grad_norm": 0.1653020977973938, + "learning_rate": 0.00011557314833087911, + "loss": 1.6854000091552734, + "step": 25840 + }, + { + "epoch": 0.07824678764396954, + "grad_norm": 0.16933834552764893, + "learning_rate": 0.00011556935314924172, + "loss": 1.6725214004516602, + "step": 25850 + }, + { + "epoch": 0.07827705719431538, + "grad_norm": 0.1611092984676361, + "learning_rate": 0.00011556555796760433, + "loss": 1.687068557739258, + "step": 25860 + }, + { + "epoch": 0.0783073267446612, + "grad_norm": 0.1561400443315506, + "learning_rate": 0.00011556176278596695, + "loss": 1.7212305068969727, + "step": 25870 + }, + { + "epoch": 0.07833759629500704, + "grad_norm": 0.1571715772151947, + "learning_rate": 0.00011555796760432955, + "loss": 1.6614473342895508, + "step": 25880 + }, + { + "epoch": 0.07836786584535287, + "grad_norm": 0.15451236069202423, + "learning_rate": 0.00011555417242269216, + "loss": 1.6394216537475585, + "step": 25890 + }, + { + "epoch": 0.0783981353956987, + "grad_norm": 0.17282746732234955, + "learning_rate": 0.00011555037724105476, + "loss": 1.6750667572021485, + "step": 25900 + }, + { + "epoch": 0.07842840494604453, + "grad_norm": 0.16652178764343262, + "learning_rate": 0.00011554658205941737, + "loss": 1.670956802368164, + "step": 25910 + }, + { + "epoch": 0.07845867449639035, + "grad_norm": 0.14504766464233398, + "learning_rate": 0.00011554278687777997, + "loss": 1.693441390991211, + "step": 25920 + }, + { + "epoch": 0.07848894404673619, + "grad_norm": 0.1512710452079773, + "learning_rate": 0.00011553899169614258, + "loss": 1.66534481048584, + "step": 25930 + }, + { + "epoch": 0.07851921359708201, + "grad_norm": 0.1581840068101883, + "learning_rate": 0.00011553519651450519, + "loss": 1.6821605682373046, + "step": 25940 + }, + { + "epoch": 0.07854948314742785, + "grad_norm": 0.16565287113189697, + "learning_rate": 0.00011553140133286779, + "loss": 1.6651130676269532, + "step": 25950 + }, + { + "epoch": 0.07857975269777367, + "grad_norm": 0.18433617055416107, + "learning_rate": 0.0001155276061512304, + "loss": 1.709854507446289, + "step": 25960 + }, + { + "epoch": 0.0786100222481195, + "grad_norm": 0.16006141901016235, + "learning_rate": 0.000115523810969593, + "loss": 1.6656333923339843, + "step": 25970 + }, + { + "epoch": 0.07864029179846534, + "grad_norm": 0.17894354462623596, + "learning_rate": 0.00011552001578795561, + "loss": 1.6805059432983398, + "step": 25980 + }, + { + "epoch": 0.07867056134881116, + "grad_norm": 0.16040784120559692, + "learning_rate": 0.00011551622060631822, + "loss": 1.676681900024414, + "step": 25990 + }, + { + "epoch": 0.078700830899157, + "grad_norm": 0.1452459692955017, + "learning_rate": 0.00011551242542468082, + "loss": 1.6373430252075196, + "step": 26000 + }, + { + "epoch": 0.078700830899157, + "eval_loss": 1.696890115737915, + "eval_runtime": 28.2358, + "eval_samples_per_second": 17.708, + "eval_steps_per_second": 1.133, + "step": 26000 + }, + { + "epoch": 0.07873110044950282, + "grad_norm": 0.1778058409690857, + "learning_rate": 0.00011550863024304344, + "loss": 1.656972885131836, + "step": 26010 + }, + { + "epoch": 0.07876136999984865, + "grad_norm": 0.16705255210399628, + "learning_rate": 0.00011550483506140603, + "loss": 1.6701513290405274, + "step": 26020 + }, + { + "epoch": 0.07879163955019448, + "grad_norm": 0.1669156402349472, + "learning_rate": 0.00011550103987976865, + "loss": 1.678745651245117, + "step": 26030 + }, + { + "epoch": 0.07882190910054031, + "grad_norm": 0.15481121838092804, + "learning_rate": 0.00011549724469813125, + "loss": 1.6757797241210937, + "step": 26040 + }, + { + "epoch": 0.07885217865088615, + "grad_norm": 0.168777197599411, + "learning_rate": 0.00011549344951649387, + "loss": 1.701605987548828, + "step": 26050 + }, + { + "epoch": 0.07888244820123197, + "grad_norm": 0.19356925785541534, + "learning_rate": 0.00011548965433485647, + "loss": 1.6557331085205078, + "step": 26060 + }, + { + "epoch": 0.07891271775157781, + "grad_norm": 0.15059146285057068, + "learning_rate": 0.00011548585915321908, + "loss": 1.6655059814453126, + "step": 26070 + }, + { + "epoch": 0.07894298730192363, + "grad_norm": 0.14961683750152588, + "learning_rate": 0.00011548206397158168, + "loss": 1.6544219970703125, + "step": 26080 + }, + { + "epoch": 0.07897325685226946, + "grad_norm": 0.14784814417362213, + "learning_rate": 0.00011547826878994429, + "loss": 1.6765466690063477, + "step": 26090 + }, + { + "epoch": 0.07900352640261529, + "grad_norm": 0.14802899956703186, + "learning_rate": 0.0001154744736083069, + "loss": 1.6703189849853515, + "step": 26100 + }, + { + "epoch": 0.07903379595296112, + "grad_norm": 0.18129511177539825, + "learning_rate": 0.0001154706784266695, + "loss": 1.6358207702636718, + "step": 26110 + }, + { + "epoch": 0.07906406550330695, + "grad_norm": 0.16879138350486755, + "learning_rate": 0.00011546688324503212, + "loss": 1.7257625579833984, + "step": 26120 + }, + { + "epoch": 0.07909433505365278, + "grad_norm": 0.1454891860485077, + "learning_rate": 0.00011546308806339471, + "loss": 1.6920162200927735, + "step": 26130 + }, + { + "epoch": 0.0791246046039986, + "grad_norm": 0.15656453371047974, + "learning_rate": 0.00011545929288175733, + "loss": 1.6750877380371094, + "step": 26140 + }, + { + "epoch": 0.07915487415434444, + "grad_norm": 0.181965634226799, + "learning_rate": 0.00011545549770011993, + "loss": 1.698114776611328, + "step": 26150 + }, + { + "epoch": 0.07918514370469026, + "grad_norm": 0.15486344695091248, + "learning_rate": 0.00011545170251848254, + "loss": 1.7061065673828124, + "step": 26160 + }, + { + "epoch": 0.0792154132550361, + "grad_norm": 0.1561138927936554, + "learning_rate": 0.00011544790733684514, + "loss": 1.6286685943603516, + "step": 26170 + }, + { + "epoch": 0.07924568280538193, + "grad_norm": 0.15909704566001892, + "learning_rate": 0.00011544411215520776, + "loss": 1.6794784545898438, + "step": 26180 + }, + { + "epoch": 0.07927595235572775, + "grad_norm": 0.1598353385925293, + "learning_rate": 0.00011544031697357035, + "loss": 1.6955375671386719, + "step": 26190 + }, + { + "epoch": 0.07930622190607359, + "grad_norm": 0.15805479884147644, + "learning_rate": 0.00011543652179193297, + "loss": 1.6066558837890625, + "step": 26200 + }, + { + "epoch": 0.07933649145641941, + "grad_norm": 0.1607433408498764, + "learning_rate": 0.00011543272661029556, + "loss": 1.6679922103881837, + "step": 26210 + }, + { + "epoch": 0.07936676100676525, + "grad_norm": 0.15256674587726593, + "learning_rate": 0.00011542893142865818, + "loss": 1.723966407775879, + "step": 26220 + }, + { + "epoch": 0.07939703055711107, + "grad_norm": 0.15007257461547852, + "learning_rate": 0.00011542513624702077, + "loss": 1.6768386840820313, + "step": 26230 + }, + { + "epoch": 0.0794273001074569, + "grad_norm": 0.13838961720466614, + "learning_rate": 0.00011542134106538339, + "loss": 1.6712797164916993, + "step": 26240 + }, + { + "epoch": 0.07945756965780273, + "grad_norm": 0.1714291274547577, + "learning_rate": 0.000115417545883746, + "loss": 1.7264381408691407, + "step": 26250 + }, + { + "epoch": 0.07948783920814856, + "grad_norm": 0.1562446653842926, + "learning_rate": 0.0001154137507021086, + "loss": 1.665269660949707, + "step": 26260 + }, + { + "epoch": 0.0795181087584944, + "grad_norm": 0.15636077523231506, + "learning_rate": 0.00011540995552047122, + "loss": 1.6917036056518555, + "step": 26270 + }, + { + "epoch": 0.07954837830884022, + "grad_norm": 0.1651841551065445, + "learning_rate": 0.00011540616033883382, + "loss": 1.6496921539306642, + "step": 26280 + }, + { + "epoch": 0.07957864785918606, + "grad_norm": 0.16837163269519806, + "learning_rate": 0.00011540236515719644, + "loss": 1.698674774169922, + "step": 26290 + }, + { + "epoch": 0.07960891740953188, + "grad_norm": 0.15744978189468384, + "learning_rate": 0.00011539856997555903, + "loss": 1.6854129791259767, + "step": 26300 + }, + { + "epoch": 0.0796391869598777, + "grad_norm": 0.16994871199131012, + "learning_rate": 0.00011539477479392165, + "loss": 1.6743963241577149, + "step": 26310 + }, + { + "epoch": 0.07966945651022354, + "grad_norm": 0.1528705358505249, + "learning_rate": 0.00011539097961228424, + "loss": 1.714396858215332, + "step": 26320 + }, + { + "epoch": 0.07969972606056937, + "grad_norm": 0.16388039290905, + "learning_rate": 0.00011538718443064686, + "loss": 1.6302221298217774, + "step": 26330 + }, + { + "epoch": 0.0797299956109152, + "grad_norm": 0.16264642775058746, + "learning_rate": 0.00011538338924900945, + "loss": 1.6673768997192382, + "step": 26340 + }, + { + "epoch": 0.07976026516126103, + "grad_norm": 0.13563799858093262, + "learning_rate": 0.00011537959406737207, + "loss": 1.6908760070800781, + "step": 26350 + }, + { + "epoch": 0.07979053471160685, + "grad_norm": 0.15242841839790344, + "learning_rate": 0.00011537579888573466, + "loss": 1.6327028274536133, + "step": 26360 + }, + { + "epoch": 0.07982080426195269, + "grad_norm": 0.17114229500293732, + "learning_rate": 0.00011537200370409728, + "loss": 1.6875860214233398, + "step": 26370 + }, + { + "epoch": 0.07985107381229851, + "grad_norm": 0.14031918346881866, + "learning_rate": 0.00011536820852245989, + "loss": 1.7047470092773438, + "step": 26380 + }, + { + "epoch": 0.07988134336264435, + "grad_norm": 0.14811092615127563, + "learning_rate": 0.0001153644133408225, + "loss": 1.6771909713745117, + "step": 26390 + }, + { + "epoch": 0.07991161291299018, + "grad_norm": 0.15516716241836548, + "learning_rate": 0.0001153606181591851, + "loss": 1.6351627349853515, + "step": 26400 + }, + { + "epoch": 0.079941882463336, + "grad_norm": 0.1471954882144928, + "learning_rate": 0.00011535682297754771, + "loss": 1.6792652130126953, + "step": 26410 + }, + { + "epoch": 0.07997215201368184, + "grad_norm": 0.17732377350330353, + "learning_rate": 0.00011535302779591031, + "loss": 1.6331958770751953, + "step": 26420 + }, + { + "epoch": 0.08000242156402766, + "grad_norm": 0.1426965594291687, + "learning_rate": 0.00011534923261427292, + "loss": 1.698907470703125, + "step": 26430 + }, + { + "epoch": 0.0800326911143735, + "grad_norm": 0.15449409186840057, + "learning_rate": 0.00011534543743263553, + "loss": 1.688160514831543, + "step": 26440 + }, + { + "epoch": 0.08006296066471932, + "grad_norm": 0.14884375035762787, + "learning_rate": 0.00011534164225099813, + "loss": 1.6707160949707032, + "step": 26450 + }, + { + "epoch": 0.08009323021506516, + "grad_norm": 0.1578306257724762, + "learning_rate": 0.00011533784706936074, + "loss": 1.68273983001709, + "step": 26460 + }, + { + "epoch": 0.08012349976541099, + "grad_norm": 0.1566983014345169, + "learning_rate": 0.00011533405188772334, + "loss": 1.644806480407715, + "step": 26470 + }, + { + "epoch": 0.08015376931575681, + "grad_norm": 0.1544102281332016, + "learning_rate": 0.00011533025670608596, + "loss": 1.6664283752441407, + "step": 26480 + }, + { + "epoch": 0.08018403886610265, + "grad_norm": 0.13252100348472595, + "learning_rate": 0.00011532646152444855, + "loss": 1.6721967697143554, + "step": 26490 + }, + { + "epoch": 0.08021430841644847, + "grad_norm": 0.1622590571641922, + "learning_rate": 0.00011532266634281117, + "loss": 1.6684606552124024, + "step": 26500 + }, + { + "epoch": 0.08021430841644847, + "eval_loss": 1.6524780988693237, + "eval_runtime": 27.9665, + "eval_samples_per_second": 17.879, + "eval_steps_per_second": 1.144, + "step": 26500 + }, + { + "epoch": 0.08024457796679431, + "grad_norm": 0.17679809033870697, + "learning_rate": 0.00011531887116117378, + "loss": 1.6602985382080078, + "step": 26510 + }, + { + "epoch": 0.08027484751714013, + "grad_norm": 0.16049152612686157, + "learning_rate": 0.00011531507597953639, + "loss": 1.698455238342285, + "step": 26520 + }, + { + "epoch": 0.08030511706748596, + "grad_norm": 0.1505085825920105, + "learning_rate": 0.00011531128079789899, + "loss": 1.6547744750976563, + "step": 26530 + }, + { + "epoch": 0.0803353866178318, + "grad_norm": 0.1611393541097641, + "learning_rate": 0.0001153074856162616, + "loss": 1.660873031616211, + "step": 26540 + }, + { + "epoch": 0.08036565616817762, + "grad_norm": 0.17286422848701477, + "learning_rate": 0.0001153036904346242, + "loss": 1.6731361389160155, + "step": 26550 + }, + { + "epoch": 0.08039592571852346, + "grad_norm": 0.15165811777114868, + "learning_rate": 0.00011529989525298681, + "loss": 1.686190414428711, + "step": 26560 + }, + { + "epoch": 0.08042619526886928, + "grad_norm": 0.16622957587242126, + "learning_rate": 0.00011529610007134942, + "loss": 1.6764358520507812, + "step": 26570 + }, + { + "epoch": 0.0804564648192151, + "grad_norm": 0.1695806384086609, + "learning_rate": 0.00011529230488971202, + "loss": 1.6514974594116212, + "step": 26580 + }, + { + "epoch": 0.08048673436956094, + "grad_norm": 0.1589728742837906, + "learning_rate": 0.00011528850970807463, + "loss": 1.6476465225219727, + "step": 26590 + }, + { + "epoch": 0.08051700391990677, + "grad_norm": 0.1466977447271347, + "learning_rate": 0.00011528471452643723, + "loss": 1.6504426956176759, + "step": 26600 + }, + { + "epoch": 0.0805472734702526, + "grad_norm": 0.16427618265151978, + "learning_rate": 0.00011528091934479984, + "loss": 1.6752357482910156, + "step": 26610 + }, + { + "epoch": 0.08057754302059843, + "grad_norm": 0.15500406920909882, + "learning_rate": 0.00011527712416316246, + "loss": 1.6556726455688477, + "step": 26620 + }, + { + "epoch": 0.08060781257094426, + "grad_norm": 0.1537162810564041, + "learning_rate": 0.00011527332898152505, + "loss": 1.6794626235961914, + "step": 26630 + }, + { + "epoch": 0.08063808212129009, + "grad_norm": 0.18765190243721008, + "learning_rate": 0.00011526953379988767, + "loss": 1.6800920486450195, + "step": 26640 + }, + { + "epoch": 0.08066835167163591, + "grad_norm": 0.14847269654273987, + "learning_rate": 0.00011526573861825026, + "loss": 1.6726940155029297, + "step": 26650 + }, + { + "epoch": 0.08069862122198175, + "grad_norm": 0.1867280751466751, + "learning_rate": 0.00011526194343661288, + "loss": 1.647671890258789, + "step": 26660 + }, + { + "epoch": 0.08072889077232757, + "grad_norm": 0.14616651833057404, + "learning_rate": 0.00011525814825497549, + "loss": 1.6783245086669922, + "step": 26670 + }, + { + "epoch": 0.08075916032267341, + "grad_norm": 0.15457655489444733, + "learning_rate": 0.0001152543530733381, + "loss": 1.6732601165771483, + "step": 26680 + }, + { + "epoch": 0.08078942987301924, + "grad_norm": 0.15674501657485962, + "learning_rate": 0.0001152505578917007, + "loss": 1.692775344848633, + "step": 26690 + }, + { + "epoch": 0.08081969942336506, + "grad_norm": 0.1631256341934204, + "learning_rate": 0.00011524676271006331, + "loss": 1.6599567413330079, + "step": 26700 + }, + { + "epoch": 0.0808499689737109, + "grad_norm": 0.1631229668855667, + "learning_rate": 0.00011524296752842591, + "loss": 1.6592704772949218, + "step": 26710 + }, + { + "epoch": 0.08088023852405672, + "grad_norm": 0.13255302608013153, + "learning_rate": 0.00011523917234678852, + "loss": 1.6572372436523437, + "step": 26720 + }, + { + "epoch": 0.08091050807440256, + "grad_norm": 0.14280147850513458, + "learning_rate": 0.00011523537716515112, + "loss": 1.7186573028564454, + "step": 26730 + }, + { + "epoch": 0.08094077762474838, + "grad_norm": 0.15994836390018463, + "learning_rate": 0.00011523158198351373, + "loss": 1.664723014831543, + "step": 26740 + }, + { + "epoch": 0.08097104717509421, + "grad_norm": 0.13820092380046844, + "learning_rate": 0.00011522778680187635, + "loss": 1.6844774246215821, + "step": 26750 + }, + { + "epoch": 0.08100131672544005, + "grad_norm": 0.14959563314914703, + "learning_rate": 0.00011522399162023894, + "loss": 1.6754421234130858, + "step": 26760 + }, + { + "epoch": 0.08103158627578587, + "grad_norm": 0.1552136242389679, + "learning_rate": 0.00011522019643860156, + "loss": 1.647218894958496, + "step": 26770 + }, + { + "epoch": 0.0810618558261317, + "grad_norm": 0.14717085659503937, + "learning_rate": 0.00011521640125696415, + "loss": 1.674758529663086, + "step": 26780 + }, + { + "epoch": 0.08109212537647753, + "grad_norm": 0.15245750546455383, + "learning_rate": 0.00011521260607532677, + "loss": 1.665658950805664, + "step": 26790 + }, + { + "epoch": 0.08112239492682337, + "grad_norm": 0.163434699177742, + "learning_rate": 0.00011520881089368937, + "loss": 1.686025047302246, + "step": 26800 + }, + { + "epoch": 0.08115266447716919, + "grad_norm": 0.13962972164154053, + "learning_rate": 0.00011520501571205199, + "loss": 1.6647808074951171, + "step": 26810 + }, + { + "epoch": 0.08118293402751502, + "grad_norm": 0.15052969753742218, + "learning_rate": 0.00011520122053041458, + "loss": 1.6411584854125976, + "step": 26820 + }, + { + "epoch": 0.08121320357786085, + "grad_norm": 0.14175772666931152, + "learning_rate": 0.0001151974253487772, + "loss": 1.713568878173828, + "step": 26830 + }, + { + "epoch": 0.08124347312820668, + "grad_norm": 0.16143850982189178, + "learning_rate": 0.00011519363016713979, + "loss": 1.6308652877807617, + "step": 26840 + }, + { + "epoch": 0.08127374267855252, + "grad_norm": 0.17227761447429657, + "learning_rate": 0.00011518983498550241, + "loss": 1.6964216232299805, + "step": 26850 + }, + { + "epoch": 0.08130401222889834, + "grad_norm": 0.1345893144607544, + "learning_rate": 0.000115186039803865, + "loss": 1.689309310913086, + "step": 26860 + }, + { + "epoch": 0.08133428177924416, + "grad_norm": 0.16419266164302826, + "learning_rate": 0.00011518224462222762, + "loss": 1.668701171875, + "step": 26870 + }, + { + "epoch": 0.08136455132959, + "grad_norm": 0.13849398493766785, + "learning_rate": 0.00011517844944059024, + "loss": 1.7291980743408204, + "step": 26880 + }, + { + "epoch": 0.08139482087993583, + "grad_norm": 0.1655762642621994, + "learning_rate": 0.00011517465425895283, + "loss": 1.6683012008666993, + "step": 26890 + }, + { + "epoch": 0.08142509043028166, + "grad_norm": 0.16182714700698853, + "learning_rate": 0.00011517085907731545, + "loss": 1.6616550445556642, + "step": 26900 + }, + { + "epoch": 0.08145535998062749, + "grad_norm": 0.15403929352760315, + "learning_rate": 0.00011516706389567805, + "loss": 1.714151954650879, + "step": 26910 + }, + { + "epoch": 0.08148562953097331, + "grad_norm": 0.14275017380714417, + "learning_rate": 0.00011516326871404066, + "loss": 1.63935546875, + "step": 26920 + }, + { + "epoch": 0.08151589908131915, + "grad_norm": 0.12156350165605545, + "learning_rate": 0.00011515947353240326, + "loss": 1.7025192260742188, + "step": 26930 + }, + { + "epoch": 0.08154616863166497, + "grad_norm": 0.17252160608768463, + "learning_rate": 0.00011515567835076588, + "loss": 1.6746231079101563, + "step": 26940 + }, + { + "epoch": 0.08157643818201081, + "grad_norm": 0.15868809819221497, + "learning_rate": 0.00011515188316912847, + "loss": 1.6745546340942383, + "step": 26950 + }, + { + "epoch": 0.08160670773235663, + "grad_norm": 0.15155592560768127, + "learning_rate": 0.00011514808798749109, + "loss": 1.6562162399291993, + "step": 26960 + }, + { + "epoch": 0.08163697728270247, + "grad_norm": 0.1569071114063263, + "learning_rate": 0.00011514429280585368, + "loss": 1.6606401443481444, + "step": 26970 + }, + { + "epoch": 0.0816672468330483, + "grad_norm": 0.1439349353313446, + "learning_rate": 0.0001151404976242163, + "loss": 1.663377571105957, + "step": 26980 + }, + { + "epoch": 0.08169751638339412, + "grad_norm": 0.14814548194408417, + "learning_rate": 0.0001151367024425789, + "loss": 1.6500734329223632, + "step": 26990 + }, + { + "epoch": 0.08172778593373996, + "grad_norm": 0.15270498394966125, + "learning_rate": 0.00011513290726094151, + "loss": 1.6723541259765624, + "step": 27000 + }, + { + "epoch": 0.08172778593373996, + "eval_loss": 1.6726460456848145, + "eval_runtime": 28.2933, + "eval_samples_per_second": 17.672, + "eval_steps_per_second": 1.131, + "step": 27000 + }, + { + "epoch": 0.08175805548408578, + "grad_norm": 0.1597924679517746, + "learning_rate": 0.00011512911207930412, + "loss": 1.6630407333374024, + "step": 27010 + }, + { + "epoch": 0.08178832503443162, + "grad_norm": 0.1605612188577652, + "learning_rate": 0.00011512531689766672, + "loss": 1.6897947311401367, + "step": 27020 + }, + { + "epoch": 0.08181859458477744, + "grad_norm": 0.1452198624610901, + "learning_rate": 0.00011512152171602933, + "loss": 1.631245994567871, + "step": 27030 + }, + { + "epoch": 0.08184886413512327, + "grad_norm": 0.15444810688495636, + "learning_rate": 0.00011511772653439194, + "loss": 1.677816390991211, + "step": 27040 + }, + { + "epoch": 0.0818791336854691, + "grad_norm": 0.148000568151474, + "learning_rate": 0.00011511393135275454, + "loss": 1.6582334518432618, + "step": 27050 + }, + { + "epoch": 0.08190940323581493, + "grad_norm": 0.14752981066703796, + "learning_rate": 0.00011511013617111715, + "loss": 1.6822887420654298, + "step": 27060 + }, + { + "epoch": 0.08193967278616077, + "grad_norm": 0.1438269019126892, + "learning_rate": 0.00011510634098947975, + "loss": 1.6161584854125977, + "step": 27070 + }, + { + "epoch": 0.08196994233650659, + "grad_norm": 0.16794076561927795, + "learning_rate": 0.00011510254580784236, + "loss": 1.651010513305664, + "step": 27080 + }, + { + "epoch": 0.08200021188685241, + "grad_norm": 0.20220859348773956, + "learning_rate": 0.00011509875062620498, + "loss": 1.6790401458740234, + "step": 27090 + }, + { + "epoch": 0.08203048143719825, + "grad_norm": 0.14820608496665955, + "learning_rate": 0.00011509495544456757, + "loss": 1.6706954956054687, + "step": 27100 + }, + { + "epoch": 0.08206075098754408, + "grad_norm": 0.15940624475479126, + "learning_rate": 0.00011509116026293019, + "loss": 1.652259063720703, + "step": 27110 + }, + { + "epoch": 0.08209102053788991, + "grad_norm": 0.1571408361196518, + "learning_rate": 0.0001150873650812928, + "loss": 1.6582019805908204, + "step": 27120 + }, + { + "epoch": 0.08212129008823574, + "grad_norm": 0.1574816107749939, + "learning_rate": 0.0001150835698996554, + "loss": 1.6987163543701171, + "step": 27130 + }, + { + "epoch": 0.08215155963858158, + "grad_norm": 0.1386736035346985, + "learning_rate": 0.00011507977471801801, + "loss": 1.6606864929199219, + "step": 27140 + }, + { + "epoch": 0.0821818291889274, + "grad_norm": 0.15879885852336884, + "learning_rate": 0.00011507597953638062, + "loss": 1.7238544464111327, + "step": 27150 + }, + { + "epoch": 0.08221209873927322, + "grad_norm": 0.15278008580207825, + "learning_rate": 0.00011507218435474322, + "loss": 1.6566152572631836, + "step": 27160 + }, + { + "epoch": 0.08224236828961906, + "grad_norm": 0.15444165468215942, + "learning_rate": 0.00011506838917310583, + "loss": 1.6449392318725586, + "step": 27170 + }, + { + "epoch": 0.08227263783996488, + "grad_norm": 0.16026440262794495, + "learning_rate": 0.00011506459399146843, + "loss": 1.6842781066894532, + "step": 27180 + }, + { + "epoch": 0.08230290739031072, + "grad_norm": 0.1570613980293274, + "learning_rate": 0.00011506079880983104, + "loss": 1.6727834701538087, + "step": 27190 + }, + { + "epoch": 0.08233317694065655, + "grad_norm": 0.16303910315036774, + "learning_rate": 0.00011505700362819365, + "loss": 1.6525157928466796, + "step": 27200 + }, + { + "epoch": 0.08236344649100237, + "grad_norm": 0.1685134619474411, + "learning_rate": 0.00011505320844655625, + "loss": 1.6325691223144532, + "step": 27210 + }, + { + "epoch": 0.08239371604134821, + "grad_norm": 0.15362054109573364, + "learning_rate": 0.00011504941326491886, + "loss": 1.6673992156982422, + "step": 27220 + }, + { + "epoch": 0.08242398559169403, + "grad_norm": 0.15904416143894196, + "learning_rate": 0.00011504561808328148, + "loss": 1.6045886993408203, + "step": 27230 + }, + { + "epoch": 0.08245425514203987, + "grad_norm": 0.1273253709077835, + "learning_rate": 0.00011504182290164407, + "loss": 1.67996826171875, + "step": 27240 + }, + { + "epoch": 0.0824845246923857, + "grad_norm": 0.18761536478996277, + "learning_rate": 0.00011503802772000669, + "loss": 1.6529195785522461, + "step": 27250 + }, + { + "epoch": 0.08251479424273152, + "grad_norm": 0.15907637774944305, + "learning_rate": 0.00011503423253836928, + "loss": 1.6411100387573243, + "step": 27260 + }, + { + "epoch": 0.08254506379307736, + "grad_norm": 0.16792526841163635, + "learning_rate": 0.0001150304373567319, + "loss": 1.690157699584961, + "step": 27270 + }, + { + "epoch": 0.08257533334342318, + "grad_norm": 0.1511419117450714, + "learning_rate": 0.0001150266421750945, + "loss": 1.6396251678466798, + "step": 27280 + }, + { + "epoch": 0.08260560289376902, + "grad_norm": 0.15792660415172577, + "learning_rate": 0.00011502284699345711, + "loss": 1.6454687118530273, + "step": 27290 + }, + { + "epoch": 0.08263587244411484, + "grad_norm": 0.14988020062446594, + "learning_rate": 0.00011501905181181972, + "loss": 1.6282405853271484, + "step": 27300 + }, + { + "epoch": 0.08266614199446068, + "grad_norm": 0.1450231969356537, + "learning_rate": 0.00011501525663018232, + "loss": 1.6505556106567383, + "step": 27310 + }, + { + "epoch": 0.0826964115448065, + "grad_norm": 0.1644052416086197, + "learning_rate": 0.00011501146144854493, + "loss": 1.6460615158081056, + "step": 27320 + }, + { + "epoch": 0.08272668109515233, + "grad_norm": 0.1577591747045517, + "learning_rate": 0.00011500766626690754, + "loss": 1.6527572631835938, + "step": 27330 + }, + { + "epoch": 0.08275695064549816, + "grad_norm": 0.15449248254299164, + "learning_rate": 0.00011500387108527014, + "loss": 1.6924263000488282, + "step": 27340 + }, + { + "epoch": 0.08278722019584399, + "grad_norm": 0.16830994188785553, + "learning_rate": 0.00011500007590363275, + "loss": 1.6510784149169921, + "step": 27350 + }, + { + "epoch": 0.08281748974618983, + "grad_norm": 0.16391444206237793, + "learning_rate": 0.00011499628072199537, + "loss": 1.6417293548583984, + "step": 27360 + }, + { + "epoch": 0.08284775929653565, + "grad_norm": 0.17897368967533112, + "learning_rate": 0.00011499248554035796, + "loss": 1.6946205139160155, + "step": 27370 + }, + { + "epoch": 0.08287802884688147, + "grad_norm": 0.1739729940891266, + "learning_rate": 0.00011498869035872058, + "loss": 1.6726205825805665, + "step": 27380 + }, + { + "epoch": 0.08290829839722731, + "grad_norm": 0.1596188098192215, + "learning_rate": 0.00011498489517708317, + "loss": 1.7114566802978515, + "step": 27390 + }, + { + "epoch": 0.08293856794757314, + "grad_norm": 0.14796702563762665, + "learning_rate": 0.00011498109999544579, + "loss": 1.6091915130615235, + "step": 27400 + }, + { + "epoch": 0.08296883749791897, + "grad_norm": 0.1555376946926117, + "learning_rate": 0.00011497730481380838, + "loss": 1.706261444091797, + "step": 27410 + }, + { + "epoch": 0.0829991070482648, + "grad_norm": 0.1557374745607376, + "learning_rate": 0.000114973509632171, + "loss": 1.6776742935180664, + "step": 27420 + }, + { + "epoch": 0.08302937659861062, + "grad_norm": 0.1575349122285843, + "learning_rate": 0.0001149697144505336, + "loss": 1.6361364364624023, + "step": 27430 + }, + { + "epoch": 0.08305964614895646, + "grad_norm": 0.1663890779018402, + "learning_rate": 0.00011496591926889621, + "loss": 1.6565881729125977, + "step": 27440 + }, + { + "epoch": 0.08308991569930228, + "grad_norm": 0.16801674664020538, + "learning_rate": 0.00011496212408725881, + "loss": 1.6706153869628906, + "step": 27450 + }, + { + "epoch": 0.08312018524964812, + "grad_norm": 0.15902389585971832, + "learning_rate": 0.00011495832890562143, + "loss": 1.6748821258544921, + "step": 27460 + }, + { + "epoch": 0.08315045479999394, + "grad_norm": 0.16508454084396362, + "learning_rate": 0.00011495453372398402, + "loss": 1.6992225646972656, + "step": 27470 + }, + { + "epoch": 0.08318072435033978, + "grad_norm": 0.14619585871696472, + "learning_rate": 0.00011495073854234664, + "loss": 1.670550537109375, + "step": 27480 + }, + { + "epoch": 0.0832109939006856, + "grad_norm": 0.14650855958461761, + "learning_rate": 0.00011494694336070926, + "loss": 1.70206298828125, + "step": 27490 + }, + { + "epoch": 0.08324126345103143, + "grad_norm": 0.15378929674625397, + "learning_rate": 0.00011494314817907185, + "loss": 1.651116180419922, + "step": 27500 + }, + { + "epoch": 0.08324126345103143, + "eval_loss": 1.6644455194473267, + "eval_runtime": 28.1527, + "eval_samples_per_second": 17.76, + "eval_steps_per_second": 1.137, + "step": 27500 + }, + { + "epoch": 0.08327153300137727, + "grad_norm": 0.15008649230003357, + "learning_rate": 0.00011493935299743447, + "loss": 1.68682861328125, + "step": 27510 + }, + { + "epoch": 0.08330180255172309, + "grad_norm": 0.13992300629615784, + "learning_rate": 0.00011493555781579706, + "loss": 1.6337345123291016, + "step": 27520 + }, + { + "epoch": 0.08333207210206893, + "grad_norm": 0.15794090926647186, + "learning_rate": 0.00011493176263415968, + "loss": 1.6301294326782227, + "step": 27530 + }, + { + "epoch": 0.08336234165241475, + "grad_norm": 0.1851130723953247, + "learning_rate": 0.00011492796745252227, + "loss": 1.7294055938720703, + "step": 27540 + }, + { + "epoch": 0.08339261120276058, + "grad_norm": 0.14528857171535492, + "learning_rate": 0.0001149241722708849, + "loss": 1.674455451965332, + "step": 27550 + }, + { + "epoch": 0.08342288075310642, + "grad_norm": 0.16202443838119507, + "learning_rate": 0.00011492037708924749, + "loss": 1.644156265258789, + "step": 27560 + }, + { + "epoch": 0.08345315030345224, + "grad_norm": 0.1517709344625473, + "learning_rate": 0.0001149165819076101, + "loss": 1.6755983352661132, + "step": 27570 + }, + { + "epoch": 0.08348341985379808, + "grad_norm": 0.14987124502658844, + "learning_rate": 0.0001149127867259727, + "loss": 1.6839302062988282, + "step": 27580 + }, + { + "epoch": 0.0835136894041439, + "grad_norm": 0.17333874106407166, + "learning_rate": 0.00011490899154433532, + "loss": 1.7225151062011719, + "step": 27590 + }, + { + "epoch": 0.08354395895448972, + "grad_norm": 0.14900998771190643, + "learning_rate": 0.00011490519636269791, + "loss": 1.6773269653320313, + "step": 27600 + }, + { + "epoch": 0.08357422850483556, + "grad_norm": 0.1497289538383484, + "learning_rate": 0.00011490140118106053, + "loss": 1.6610847473144532, + "step": 27610 + }, + { + "epoch": 0.08360449805518139, + "grad_norm": 0.14567053318023682, + "learning_rate": 0.00011489760599942314, + "loss": 1.6619115829467774, + "step": 27620 + }, + { + "epoch": 0.08363476760552722, + "grad_norm": 0.1760406643152237, + "learning_rate": 0.00011489381081778574, + "loss": 1.6482303619384766, + "step": 27630 + }, + { + "epoch": 0.08366503715587305, + "grad_norm": 0.142800971865654, + "learning_rate": 0.00011489001563614835, + "loss": 1.6964502334594727, + "step": 27640 + }, + { + "epoch": 0.08369530670621889, + "grad_norm": 0.1545894891023636, + "learning_rate": 0.00011488622045451095, + "loss": 1.7122358322143554, + "step": 27650 + }, + { + "epoch": 0.08372557625656471, + "grad_norm": 0.17087863385677338, + "learning_rate": 0.00011488242527287356, + "loss": 1.6379407882690429, + "step": 27660 + }, + { + "epoch": 0.08375584580691053, + "grad_norm": 0.15671409666538239, + "learning_rate": 0.00011487863009123617, + "loss": 1.6734085083007812, + "step": 27670 + }, + { + "epoch": 0.08378611535725637, + "grad_norm": 0.1564333736896515, + "learning_rate": 0.00011487483490959878, + "loss": 1.6178251266479493, + "step": 27680 + }, + { + "epoch": 0.0838163849076022, + "grad_norm": 0.15916909277439117, + "learning_rate": 0.00011487103972796138, + "loss": 1.6623292922973634, + "step": 27690 + }, + { + "epoch": 0.08384665445794803, + "grad_norm": 0.14108961820602417, + "learning_rate": 0.000114867244546324, + "loss": 1.687495803833008, + "step": 27700 + }, + { + "epoch": 0.08387692400829386, + "grad_norm": 0.1278107464313507, + "learning_rate": 0.00011486344936468659, + "loss": 1.6758693695068358, + "step": 27710 + }, + { + "epoch": 0.08390719355863968, + "grad_norm": 0.1425829529762268, + "learning_rate": 0.00011485965418304921, + "loss": 1.624370002746582, + "step": 27720 + }, + { + "epoch": 0.08393746310898552, + "grad_norm": 0.15549178421497345, + "learning_rate": 0.00011485585900141181, + "loss": 1.679698371887207, + "step": 27730 + }, + { + "epoch": 0.08396773265933134, + "grad_norm": 0.15398471057415009, + "learning_rate": 0.00011485206381977442, + "loss": 1.6679075241088868, + "step": 27740 + }, + { + "epoch": 0.08399800220967718, + "grad_norm": 0.1710062026977539, + "learning_rate": 0.00011484826863813703, + "loss": 1.7042659759521483, + "step": 27750 + }, + { + "epoch": 0.084028271760023, + "grad_norm": 0.14516766369342804, + "learning_rate": 0.00011484447345649963, + "loss": 1.666623306274414, + "step": 27760 + }, + { + "epoch": 0.08405854131036883, + "grad_norm": 0.14865516126155853, + "learning_rate": 0.00011484067827486224, + "loss": 1.7004104614257813, + "step": 27770 + }, + { + "epoch": 0.08408881086071467, + "grad_norm": 0.16905266046524048, + "learning_rate": 0.00011483688309322484, + "loss": 1.654852294921875, + "step": 27780 + }, + { + "epoch": 0.08411908041106049, + "grad_norm": 0.16058073937892914, + "learning_rate": 0.00011483308791158745, + "loss": 1.6620040893554688, + "step": 27790 + }, + { + "epoch": 0.08414934996140633, + "grad_norm": 0.15538552403450012, + "learning_rate": 0.00011482929272995006, + "loss": 1.698248291015625, + "step": 27800 + }, + { + "epoch": 0.08417961951175215, + "grad_norm": 0.1516352742910385, + "learning_rate": 0.00011482549754831266, + "loss": 1.6907211303710938, + "step": 27810 + }, + { + "epoch": 0.08420988906209798, + "grad_norm": 0.1620863527059555, + "learning_rate": 0.00011482170236667527, + "loss": 1.7076919555664063, + "step": 27820 + }, + { + "epoch": 0.08424015861244381, + "grad_norm": 0.14629532396793365, + "learning_rate": 0.00011481790718503787, + "loss": 1.6266807556152343, + "step": 27830 + }, + { + "epoch": 0.08427042816278964, + "grad_norm": 0.14791175723075867, + "learning_rate": 0.00011481411200340048, + "loss": 1.6617511749267577, + "step": 27840 + }, + { + "epoch": 0.08430069771313548, + "grad_norm": 0.14408810436725616, + "learning_rate": 0.00011481031682176309, + "loss": 1.6612857818603515, + "step": 27850 + }, + { + "epoch": 0.0843309672634813, + "grad_norm": 0.14206482470035553, + "learning_rate": 0.0001148065216401257, + "loss": 1.6592960357666016, + "step": 27860 + }, + { + "epoch": 0.08436123681382714, + "grad_norm": 0.14971651136875153, + "learning_rate": 0.0001148027264584883, + "loss": 1.7137996673583984, + "step": 27870 + }, + { + "epoch": 0.08439150636417296, + "grad_norm": 0.13340537250041962, + "learning_rate": 0.00011479893127685092, + "loss": 1.656502914428711, + "step": 27880 + }, + { + "epoch": 0.08442177591451878, + "grad_norm": 0.15275032818317413, + "learning_rate": 0.00011479513609521352, + "loss": 1.6149768829345703, + "step": 27890 + }, + { + "epoch": 0.08445204546486462, + "grad_norm": 0.14913907647132874, + "learning_rate": 0.00011479134091357613, + "loss": 1.668144989013672, + "step": 27900 + }, + { + "epoch": 0.08448231501521045, + "grad_norm": 0.14682908356189728, + "learning_rate": 0.00011478754573193874, + "loss": 1.6373783111572267, + "step": 27910 + }, + { + "epoch": 0.08451258456555628, + "grad_norm": 0.15722841024398804, + "learning_rate": 0.00011478375055030134, + "loss": 1.6572065353393555, + "step": 27920 + }, + { + "epoch": 0.08454285411590211, + "grad_norm": 0.16288721561431885, + "learning_rate": 0.00011477995536866395, + "loss": 1.6600849151611328, + "step": 27930 + }, + { + "epoch": 0.08457312366624793, + "grad_norm": 0.1705029308795929, + "learning_rate": 0.00011477616018702655, + "loss": 1.6571159362792969, + "step": 27940 + }, + { + "epoch": 0.08460339321659377, + "grad_norm": 0.1514250487089157, + "learning_rate": 0.00011477236500538916, + "loss": 1.6963573455810548, + "step": 27950 + }, + { + "epoch": 0.0846336627669396, + "grad_norm": 0.1595304310321808, + "learning_rate": 0.00011476856982375177, + "loss": 1.6526708602905273, + "step": 27960 + }, + { + "epoch": 0.08466393231728543, + "grad_norm": 0.18175072968006134, + "learning_rate": 0.00011476477464211438, + "loss": 1.6607637405395508, + "step": 27970 + }, + { + "epoch": 0.08469420186763126, + "grad_norm": 0.14828380942344666, + "learning_rate": 0.00011476097946047698, + "loss": 1.6840526580810546, + "step": 27980 + }, + { + "epoch": 0.08472447141797708, + "grad_norm": 0.16104213893413544, + "learning_rate": 0.0001147571842788396, + "loss": 1.6758363723754883, + "step": 27990 + }, + { + "epoch": 0.08475474096832292, + "grad_norm": 0.1635177880525589, + "learning_rate": 0.00011475338909720219, + "loss": 1.687931442260742, + "step": 28000 + }, + { + "epoch": 0.08475474096832292, + "eval_loss": 1.6729086637496948, + "eval_runtime": 28.1612, + "eval_samples_per_second": 17.755, + "eval_steps_per_second": 1.136, + "step": 28000 + }, + { + "epoch": 0.08478501051866874, + "grad_norm": 0.13304826617240906, + "learning_rate": 0.00011474959391556481, + "loss": 1.6747867584228515, + "step": 28010 + }, + { + "epoch": 0.08481528006901458, + "grad_norm": 0.14529438316822052, + "learning_rate": 0.0001147457987339274, + "loss": 1.6663379669189453, + "step": 28020 + }, + { + "epoch": 0.0848455496193604, + "grad_norm": 0.16904228925704956, + "learning_rate": 0.00011474200355229002, + "loss": 1.671421432495117, + "step": 28030 + }, + { + "epoch": 0.08487581916970624, + "grad_norm": 0.16246233880519867, + "learning_rate": 0.00011473820837065261, + "loss": 1.6552078247070312, + "step": 28040 + }, + { + "epoch": 0.08490608872005206, + "grad_norm": 0.15492761135101318, + "learning_rate": 0.00011473441318901523, + "loss": 1.680556297302246, + "step": 28050 + }, + { + "epoch": 0.08493635827039789, + "grad_norm": 0.1655411571264267, + "learning_rate": 0.00011473061800737782, + "loss": 1.7108867645263672, + "step": 28060 + }, + { + "epoch": 0.08496662782074373, + "grad_norm": 0.13834676146507263, + "learning_rate": 0.00011472682282574044, + "loss": 1.670877456665039, + "step": 28070 + }, + { + "epoch": 0.08499689737108955, + "grad_norm": 0.17714479565620422, + "learning_rate": 0.00011472302764410304, + "loss": 1.6480117797851563, + "step": 28080 + }, + { + "epoch": 0.08502716692143539, + "grad_norm": 0.14303000271320343, + "learning_rate": 0.00011471923246246566, + "loss": 1.6609638214111329, + "step": 28090 + }, + { + "epoch": 0.08505743647178121, + "grad_norm": 0.13281843066215515, + "learning_rate": 0.00011471543728082828, + "loss": 1.6710187911987304, + "step": 28100 + }, + { + "epoch": 0.08508770602212704, + "grad_norm": 0.14311924576759338, + "learning_rate": 0.00011471164209919087, + "loss": 1.6875110626220704, + "step": 28110 + }, + { + "epoch": 0.08511797557247287, + "grad_norm": 0.1846441626548767, + "learning_rate": 0.00011470784691755349, + "loss": 1.6590568542480468, + "step": 28120 + }, + { + "epoch": 0.0851482451228187, + "grad_norm": 0.15161709487438202, + "learning_rate": 0.00011470405173591608, + "loss": 1.6474859237670898, + "step": 28130 + }, + { + "epoch": 0.08517851467316453, + "grad_norm": 0.14268529415130615, + "learning_rate": 0.0001147002565542787, + "loss": 1.65179386138916, + "step": 28140 + }, + { + "epoch": 0.08520878422351036, + "grad_norm": 0.13560751080513, + "learning_rate": 0.00011469646137264129, + "loss": 1.6527597427368164, + "step": 28150 + }, + { + "epoch": 0.08523905377385618, + "grad_norm": 0.16997791826725006, + "learning_rate": 0.00011469266619100391, + "loss": 1.6750167846679687, + "step": 28160 + }, + { + "epoch": 0.08526932332420202, + "grad_norm": 0.15121187269687653, + "learning_rate": 0.0001146888710093665, + "loss": 1.668297576904297, + "step": 28170 + }, + { + "epoch": 0.08529959287454784, + "grad_norm": 0.16250485181808472, + "learning_rate": 0.00011468507582772912, + "loss": 1.6447391510009766, + "step": 28180 + }, + { + "epoch": 0.08532986242489368, + "grad_norm": 0.1442384272813797, + "learning_rate": 0.00011468128064609172, + "loss": 1.726052474975586, + "step": 28190 + }, + { + "epoch": 0.0853601319752395, + "grad_norm": 0.16288188099861145, + "learning_rate": 0.00011467748546445433, + "loss": 1.6584856033325195, + "step": 28200 + }, + { + "epoch": 0.08539040152558534, + "grad_norm": 0.179617241024971, + "learning_rate": 0.00011467369028281693, + "loss": 1.688040542602539, + "step": 28210 + }, + { + "epoch": 0.08542067107593117, + "grad_norm": 0.15480875968933105, + "learning_rate": 0.00011466989510117955, + "loss": 1.6650135040283203, + "step": 28220 + }, + { + "epoch": 0.08545094062627699, + "grad_norm": 0.1522962599992752, + "learning_rate": 0.00011466609991954215, + "loss": 1.6515159606933594, + "step": 28230 + }, + { + "epoch": 0.08548121017662283, + "grad_norm": 0.1527717262506485, + "learning_rate": 0.00011466230473790476, + "loss": 1.6771394729614257, + "step": 28240 + }, + { + "epoch": 0.08551147972696865, + "grad_norm": 0.14515669643878937, + "learning_rate": 0.00011465850955626736, + "loss": 1.6841770172119142, + "step": 28250 + }, + { + "epoch": 0.08554174927731449, + "grad_norm": 0.17035718262195587, + "learning_rate": 0.00011465471437462997, + "loss": 1.7107908248901367, + "step": 28260 + }, + { + "epoch": 0.08557201882766031, + "grad_norm": 0.15306925773620605, + "learning_rate": 0.00011465091919299258, + "loss": 1.6416942596435546, + "step": 28270 + }, + { + "epoch": 0.08560228837800614, + "grad_norm": 0.16267900168895721, + "learning_rate": 0.00011464712401135518, + "loss": 1.7063255310058594, + "step": 28280 + }, + { + "epoch": 0.08563255792835198, + "grad_norm": 0.17185243964195251, + "learning_rate": 0.0001146433288297178, + "loss": 1.6313121795654297, + "step": 28290 + }, + { + "epoch": 0.0856628274786978, + "grad_norm": 0.1507766991853714, + "learning_rate": 0.0001146395336480804, + "loss": 1.6688501358032226, + "step": 28300 + }, + { + "epoch": 0.08569309702904364, + "grad_norm": 0.1782677173614502, + "learning_rate": 0.00011463573846644301, + "loss": 1.6532150268554688, + "step": 28310 + }, + { + "epoch": 0.08572336657938946, + "grad_norm": 0.15402112901210785, + "learning_rate": 0.0001146319432848056, + "loss": 1.6244756698608398, + "step": 28320 + }, + { + "epoch": 0.08575363612973529, + "grad_norm": 0.15360446274280548, + "learning_rate": 0.00011462814810316823, + "loss": 1.639266586303711, + "step": 28330 + }, + { + "epoch": 0.08578390568008112, + "grad_norm": 0.17938826978206635, + "learning_rate": 0.00011462435292153083, + "loss": 1.6537071228027345, + "step": 28340 + }, + { + "epoch": 0.08581417523042695, + "grad_norm": 0.15356339514255524, + "learning_rate": 0.00011462055773989344, + "loss": 1.6753316879272462, + "step": 28350 + }, + { + "epoch": 0.08584444478077279, + "grad_norm": 0.17240573465824127, + "learning_rate": 0.00011461676255825604, + "loss": 1.666956901550293, + "step": 28360 + }, + { + "epoch": 0.08587471433111861, + "grad_norm": 0.14813968539237976, + "learning_rate": 0.00011461296737661865, + "loss": 1.6204931259155273, + "step": 28370 + }, + { + "epoch": 0.08590498388146445, + "grad_norm": 0.1479506939649582, + "learning_rate": 0.00011460917219498126, + "loss": 1.636960220336914, + "step": 28380 + }, + { + "epoch": 0.08593525343181027, + "grad_norm": 0.15915068984031677, + "learning_rate": 0.00011460537701334386, + "loss": 1.682563018798828, + "step": 28390 + }, + { + "epoch": 0.0859655229821561, + "grad_norm": 0.17069873213768005, + "learning_rate": 0.00011460158183170647, + "loss": 1.644197654724121, + "step": 28400 + }, + { + "epoch": 0.08599579253250193, + "grad_norm": 0.18459933996200562, + "learning_rate": 0.00011459778665006907, + "loss": 1.6509246826171875, + "step": 28410 + }, + { + "epoch": 0.08602606208284776, + "grad_norm": 0.15275514125823975, + "learning_rate": 0.00011459399146843168, + "loss": 1.6287933349609376, + "step": 28420 + }, + { + "epoch": 0.0860563316331936, + "grad_norm": 0.16727599501609802, + "learning_rate": 0.00011459019628679429, + "loss": 1.664846420288086, + "step": 28430 + }, + { + "epoch": 0.08608660118353942, + "grad_norm": 0.1465480625629425, + "learning_rate": 0.00011458640110515689, + "loss": 1.6800056457519532, + "step": 28440 + }, + { + "epoch": 0.08611687073388524, + "grad_norm": 0.1724613904953003, + "learning_rate": 0.0001145826059235195, + "loss": 1.7136276245117188, + "step": 28450 + }, + { + "epoch": 0.08614714028423108, + "grad_norm": 0.13659748435020447, + "learning_rate": 0.0001145788107418821, + "loss": 1.633184814453125, + "step": 28460 + }, + { + "epoch": 0.0861774098345769, + "grad_norm": 0.141034334897995, + "learning_rate": 0.00011457501556024472, + "loss": 1.6459388732910156, + "step": 28470 + }, + { + "epoch": 0.08620767938492274, + "grad_norm": 0.146161749958992, + "learning_rate": 0.00011457122037860732, + "loss": 1.656640625, + "step": 28480 + }, + { + "epoch": 0.08623794893526857, + "grad_norm": 0.14031223952770233, + "learning_rate": 0.00011456742519696993, + "loss": 1.6034814834594726, + "step": 28490 + }, + { + "epoch": 0.08626821848561439, + "grad_norm": 0.1538466215133667, + "learning_rate": 0.00011456363001533254, + "loss": 1.6673473358154296, + "step": 28500 + }, + { + "epoch": 0.08626821848561439, + "eval_loss": 1.6660054922103882, + "eval_runtime": 28.1321, + "eval_samples_per_second": 17.773, + "eval_steps_per_second": 1.137, + "step": 28500 + }, + { + "epoch": 0.08629848803596023, + "grad_norm": 0.15605351328849792, + "learning_rate": 0.00011455983483369515, + "loss": 1.711993408203125, + "step": 28510 + }, + { + "epoch": 0.08632875758630605, + "grad_norm": 0.14343848824501038, + "learning_rate": 0.00011455603965205775, + "loss": 1.6685321807861329, + "step": 28520 + }, + { + "epoch": 0.08635902713665189, + "grad_norm": 0.14812973141670227, + "learning_rate": 0.00011455224447042036, + "loss": 1.6610424041748046, + "step": 28530 + }, + { + "epoch": 0.08638929668699771, + "grad_norm": 0.1519182324409485, + "learning_rate": 0.00011454844928878296, + "loss": 1.7109682083129882, + "step": 28540 + }, + { + "epoch": 0.08641956623734355, + "grad_norm": 0.1452246606349945, + "learning_rate": 0.00011454465410714557, + "loss": 1.659261131286621, + "step": 28550 + }, + { + "epoch": 0.08644983578768937, + "grad_norm": 0.17627033591270447, + "learning_rate": 0.00011454085892550818, + "loss": 1.714009666442871, + "step": 28560 + }, + { + "epoch": 0.0864801053380352, + "grad_norm": 0.15900984406471252, + "learning_rate": 0.00011453706374387078, + "loss": 1.69254150390625, + "step": 28570 + }, + { + "epoch": 0.08651037488838104, + "grad_norm": 0.1503356248140335, + "learning_rate": 0.00011453326856223339, + "loss": 1.6624311447143554, + "step": 28580 + }, + { + "epoch": 0.08654064443872686, + "grad_norm": 0.13667921721935272, + "learning_rate": 0.000114529473380596, + "loss": 1.6701774597167969, + "step": 28590 + }, + { + "epoch": 0.0865709139890727, + "grad_norm": 0.1289921998977661, + "learning_rate": 0.00011452567819895861, + "loss": 1.6650615692138673, + "step": 28600 + }, + { + "epoch": 0.08660118353941852, + "grad_norm": 0.1423911154270172, + "learning_rate": 0.0001145218830173212, + "loss": 1.6588537216186523, + "step": 28610 + }, + { + "epoch": 0.08663145308976435, + "grad_norm": 0.14504940807819366, + "learning_rate": 0.00011451808783568383, + "loss": 1.6493778228759766, + "step": 28620 + }, + { + "epoch": 0.08666172264011018, + "grad_norm": 0.15366800129413605, + "learning_rate": 0.00011451429265404642, + "loss": 1.6978790283203125, + "step": 28630 + }, + { + "epoch": 0.08669199219045601, + "grad_norm": 0.15939567983150482, + "learning_rate": 0.00011451049747240904, + "loss": 1.6876506805419922, + "step": 28640 + }, + { + "epoch": 0.08672226174080185, + "grad_norm": 0.16861183941364288, + "learning_rate": 0.00011450670229077163, + "loss": 1.7112171173095703, + "step": 28650 + }, + { + "epoch": 0.08675253129114767, + "grad_norm": 0.14555935561656952, + "learning_rate": 0.00011450290710913425, + "loss": 1.6331243515014648, + "step": 28660 + }, + { + "epoch": 0.0867828008414935, + "grad_norm": 0.14066094160079956, + "learning_rate": 0.00011449911192749684, + "loss": 1.6623050689697265, + "step": 28670 + }, + { + "epoch": 0.08681307039183933, + "grad_norm": 0.15299125015735626, + "learning_rate": 0.00011449531674585946, + "loss": 1.6438961029052734, + "step": 28680 + }, + { + "epoch": 0.08684333994218515, + "grad_norm": 0.17125964164733887, + "learning_rate": 0.00011449152156422205, + "loss": 1.6705892562866211, + "step": 28690 + }, + { + "epoch": 0.08687360949253099, + "grad_norm": 0.12336236983537674, + "learning_rate": 0.00011448772638258467, + "loss": 1.6675899505615235, + "step": 28700 + }, + { + "epoch": 0.08690387904287682, + "grad_norm": 0.15140552818775177, + "learning_rate": 0.00011448393120094729, + "loss": 1.6059215545654297, + "step": 28710 + }, + { + "epoch": 0.08693414859322265, + "grad_norm": 0.14407871663570404, + "learning_rate": 0.00011448013601930988, + "loss": 1.654647445678711, + "step": 28720 + }, + { + "epoch": 0.08696441814356848, + "grad_norm": 0.1520383208990097, + "learning_rate": 0.0001144763408376725, + "loss": 1.6176982879638673, + "step": 28730 + }, + { + "epoch": 0.0869946876939143, + "grad_norm": 0.16536161303520203, + "learning_rate": 0.0001144725456560351, + "loss": 1.6288448333740235, + "step": 28740 + }, + { + "epoch": 0.08702495724426014, + "grad_norm": 0.19765518605709076, + "learning_rate": 0.00011446875047439772, + "loss": 1.6470108032226562, + "step": 28750 + }, + { + "epoch": 0.08705522679460596, + "grad_norm": 0.1447584480047226, + "learning_rate": 0.00011446495529276031, + "loss": 1.6728094100952149, + "step": 28760 + }, + { + "epoch": 0.0870854963449518, + "grad_norm": 0.18395721912384033, + "learning_rate": 0.00011446116011112293, + "loss": 1.630833625793457, + "step": 28770 + }, + { + "epoch": 0.08711576589529763, + "grad_norm": 0.18052953481674194, + "learning_rate": 0.00011445736492948552, + "loss": 1.6656154632568358, + "step": 28780 + }, + { + "epoch": 0.08714603544564345, + "grad_norm": 0.15997116267681122, + "learning_rate": 0.00011445356974784814, + "loss": 1.6594980239868165, + "step": 28790 + }, + { + "epoch": 0.08717630499598929, + "grad_norm": 0.1629808247089386, + "learning_rate": 0.00011444977456621073, + "loss": 1.6639644622802734, + "step": 28800 + }, + { + "epoch": 0.08720657454633511, + "grad_norm": 0.16131490468978882, + "learning_rate": 0.00011444597938457335, + "loss": 1.7086442947387694, + "step": 28810 + }, + { + "epoch": 0.08723684409668095, + "grad_norm": 0.15121310949325562, + "learning_rate": 0.00011444218420293594, + "loss": 1.6594799041748047, + "step": 28820 + }, + { + "epoch": 0.08726711364702677, + "grad_norm": 0.1504298746585846, + "learning_rate": 0.00011443838902129856, + "loss": 1.6744003295898438, + "step": 28830 + }, + { + "epoch": 0.0872973831973726, + "grad_norm": 0.15675826370716095, + "learning_rate": 0.00011443459383966117, + "loss": 1.6646240234375, + "step": 28840 + }, + { + "epoch": 0.08732765274771843, + "grad_norm": 0.16264840960502625, + "learning_rate": 0.00011443079865802378, + "loss": 1.6398277282714844, + "step": 28850 + }, + { + "epoch": 0.08735792229806426, + "grad_norm": 0.14266207814216614, + "learning_rate": 0.00011442700347638638, + "loss": 1.680569839477539, + "step": 28860 + }, + { + "epoch": 0.0873881918484101, + "grad_norm": 0.1359476000070572, + "learning_rate": 0.00011442320829474899, + "loss": 1.636316680908203, + "step": 28870 + }, + { + "epoch": 0.08741846139875592, + "grad_norm": 0.162380650639534, + "learning_rate": 0.0001144194131131116, + "loss": 1.6178060531616212, + "step": 28880 + }, + { + "epoch": 0.08744873094910176, + "grad_norm": 0.14236152172088623, + "learning_rate": 0.0001144156179314742, + "loss": 1.7050718307495116, + "step": 28890 + }, + { + "epoch": 0.08747900049944758, + "grad_norm": 0.16596783697605133, + "learning_rate": 0.00011441182274983682, + "loss": 1.6641258239746093, + "step": 28900 + }, + { + "epoch": 0.0875092700497934, + "grad_norm": 0.15443815290927887, + "learning_rate": 0.00011440802756819941, + "loss": 1.6803367614746094, + "step": 28910 + }, + { + "epoch": 0.08753953960013924, + "grad_norm": 0.1441749483346939, + "learning_rate": 0.00011440423238656203, + "loss": 1.6651119232177733, + "step": 28920 + }, + { + "epoch": 0.08756980915048507, + "grad_norm": 0.1607062965631485, + "learning_rate": 0.00011440043720492462, + "loss": 1.6277910232543946, + "step": 28930 + }, + { + "epoch": 0.0876000787008309, + "grad_norm": 0.1408030241727829, + "learning_rate": 0.00011439664202328724, + "loss": 1.6426403045654296, + "step": 28940 + }, + { + "epoch": 0.08763034825117673, + "grad_norm": 0.16106189787387848, + "learning_rate": 0.00011439284684164984, + "loss": 1.6969194412231445, + "step": 28950 + }, + { + "epoch": 0.08766061780152255, + "grad_norm": 0.15661542117595673, + "learning_rate": 0.00011438905166001245, + "loss": 1.68529052734375, + "step": 28960 + }, + { + "epoch": 0.08769088735186839, + "grad_norm": 0.14363262057304382, + "learning_rate": 0.00011438525647837506, + "loss": 1.6428070068359375, + "step": 28970 + }, + { + "epoch": 0.08772115690221421, + "grad_norm": 0.1396205574274063, + "learning_rate": 0.00011438146129673767, + "loss": 1.6511249542236328, + "step": 28980 + }, + { + "epoch": 0.08775142645256005, + "grad_norm": 0.14582104980945587, + "learning_rate": 0.00011437766611510027, + "loss": 1.669993019104004, + "step": 28990 + }, + { + "epoch": 0.08778169600290588, + "grad_norm": 0.16524668037891388, + "learning_rate": 0.00011437387093346288, + "loss": 1.6680248260498047, + "step": 29000 + }, + { + "epoch": 0.08778169600290588, + "eval_loss": 1.6653835773468018, + "eval_runtime": 28.2787, + "eval_samples_per_second": 17.681, + "eval_steps_per_second": 1.132, + "step": 29000 + }, + { + "epoch": 0.0878119655532517, + "grad_norm": 0.15089844167232513, + "learning_rate": 0.00011437007575182548, + "loss": 1.6479316711425782, + "step": 29010 + }, + { + "epoch": 0.08784223510359754, + "grad_norm": 0.13207237422466278, + "learning_rate": 0.00011436628057018809, + "loss": 1.6672183990478515, + "step": 29020 + }, + { + "epoch": 0.08787250465394336, + "grad_norm": 0.1807243525981903, + "learning_rate": 0.0001143624853885507, + "loss": 1.6571525573730468, + "step": 29030 + }, + { + "epoch": 0.0879027742042892, + "grad_norm": 0.16522842645645142, + "learning_rate": 0.0001143586902069133, + "loss": 1.6418264389038086, + "step": 29040 + }, + { + "epoch": 0.08793304375463502, + "grad_norm": 0.13507986068725586, + "learning_rate": 0.00011435489502527591, + "loss": 1.6641857147216796, + "step": 29050 + }, + { + "epoch": 0.08796331330498086, + "grad_norm": 0.15115173161029816, + "learning_rate": 0.00011435109984363851, + "loss": 1.6939079284667968, + "step": 29060 + }, + { + "epoch": 0.08799358285532669, + "grad_norm": 0.14937935769557953, + "learning_rate": 0.00011434730466200112, + "loss": 1.605303955078125, + "step": 29070 + }, + { + "epoch": 0.08802385240567251, + "grad_norm": 0.16584596037864685, + "learning_rate": 0.00011434350948036374, + "loss": 1.6912843704223632, + "step": 29080 + }, + { + "epoch": 0.08805412195601835, + "grad_norm": 0.15832577645778656, + "learning_rate": 0.00011433971429872633, + "loss": 1.660537338256836, + "step": 29090 + }, + { + "epoch": 0.08808439150636417, + "grad_norm": 0.15948523581027985, + "learning_rate": 0.00011433591911708895, + "loss": 1.6804315567016601, + "step": 29100 + }, + { + "epoch": 0.08811466105671001, + "grad_norm": 0.1524662971496582, + "learning_rate": 0.00011433212393545156, + "loss": 1.6749967575073241, + "step": 29110 + }, + { + "epoch": 0.08814493060705583, + "grad_norm": 0.13060332834720612, + "learning_rate": 0.00011432832875381416, + "loss": 1.6573627471923829, + "step": 29120 + }, + { + "epoch": 0.08817520015740166, + "grad_norm": 0.13718587160110474, + "learning_rate": 0.00011432453357217677, + "loss": 1.6734996795654298, + "step": 29130 + }, + { + "epoch": 0.0882054697077475, + "grad_norm": 0.15746355056762695, + "learning_rate": 0.00011432073839053938, + "loss": 1.6628961563110352, + "step": 29140 + }, + { + "epoch": 0.08823573925809332, + "grad_norm": 0.15051743388175964, + "learning_rate": 0.00011431694320890198, + "loss": 1.6690292358398438, + "step": 29150 + }, + { + "epoch": 0.08826600880843916, + "grad_norm": 0.15053044259548187, + "learning_rate": 0.00011431314802726459, + "loss": 1.6315635681152343, + "step": 29160 + }, + { + "epoch": 0.08829627835878498, + "grad_norm": 0.14438985288143158, + "learning_rate": 0.0001143093528456272, + "loss": 1.6859432220458985, + "step": 29170 + }, + { + "epoch": 0.0883265479091308, + "grad_norm": 0.18900907039642334, + "learning_rate": 0.0001143055576639898, + "loss": 1.6430986404418946, + "step": 29180 + }, + { + "epoch": 0.08835681745947664, + "grad_norm": 0.14777351915836334, + "learning_rate": 0.0001143017624823524, + "loss": 1.6707563400268555, + "step": 29190 + }, + { + "epoch": 0.08838708700982247, + "grad_norm": 0.16686204075813293, + "learning_rate": 0.00011429796730071501, + "loss": 1.6579219818115234, + "step": 29200 + }, + { + "epoch": 0.0884173565601683, + "grad_norm": 0.18015573918819427, + "learning_rate": 0.00011429417211907763, + "loss": 1.6154033660888671, + "step": 29210 + }, + { + "epoch": 0.08844762611051413, + "grad_norm": 0.16030704975128174, + "learning_rate": 0.00011429037693744022, + "loss": 1.6496898651123046, + "step": 29220 + }, + { + "epoch": 0.08847789566085996, + "grad_norm": 0.14314748346805573, + "learning_rate": 0.00011428658175580284, + "loss": 1.6897598266601563, + "step": 29230 + }, + { + "epoch": 0.08850816521120579, + "grad_norm": 0.165372833609581, + "learning_rate": 0.00011428278657416544, + "loss": 1.6673147201538085, + "step": 29240 + }, + { + "epoch": 0.08853843476155161, + "grad_norm": 0.16088220477104187, + "learning_rate": 0.00011427899139252805, + "loss": 1.6864614486694336, + "step": 29250 + }, + { + "epoch": 0.08856870431189745, + "grad_norm": 0.13419948518276215, + "learning_rate": 0.00011427519621089065, + "loss": 1.6361166000366212, + "step": 29260 + }, + { + "epoch": 0.08859897386224327, + "grad_norm": 0.14977316558361053, + "learning_rate": 0.00011427140102925327, + "loss": 1.6661972045898437, + "step": 29270 + }, + { + "epoch": 0.08862924341258911, + "grad_norm": 0.1453743577003479, + "learning_rate": 0.00011426760584761586, + "loss": 1.6329023361206054, + "step": 29280 + }, + { + "epoch": 0.08865951296293494, + "grad_norm": 0.15422390401363373, + "learning_rate": 0.00011426381066597848, + "loss": 1.6663341522216797, + "step": 29290 + }, + { + "epoch": 0.08868978251328076, + "grad_norm": 0.1463342308998108, + "learning_rate": 0.00011426001548434107, + "loss": 1.6883991241455079, + "step": 29300 + }, + { + "epoch": 0.0887200520636266, + "grad_norm": 0.15726417303085327, + "learning_rate": 0.00011425622030270369, + "loss": 1.6269800186157226, + "step": 29310 + }, + { + "epoch": 0.08875032161397242, + "grad_norm": 0.1617286503314972, + "learning_rate": 0.0001142524251210663, + "loss": 1.6413202285766602, + "step": 29320 + }, + { + "epoch": 0.08878059116431826, + "grad_norm": 0.1340027004480362, + "learning_rate": 0.0001142486299394289, + "loss": 1.6748701095581056, + "step": 29330 + }, + { + "epoch": 0.08881086071466408, + "grad_norm": 0.13711589574813843, + "learning_rate": 0.00011424483475779152, + "loss": 1.6465747833251954, + "step": 29340 + }, + { + "epoch": 0.08884113026500991, + "grad_norm": 0.1514117568731308, + "learning_rate": 0.00011424103957615411, + "loss": 1.658592414855957, + "step": 29350 + }, + { + "epoch": 0.08887139981535574, + "grad_norm": 0.15166988968849182, + "learning_rate": 0.00011423724439451673, + "loss": 1.638180923461914, + "step": 29360 + }, + { + "epoch": 0.08890166936570157, + "grad_norm": 0.17074494063854218, + "learning_rate": 0.00011423344921287933, + "loss": 1.6430423736572266, + "step": 29370 + }, + { + "epoch": 0.0889319389160474, + "grad_norm": 0.15032903850078583, + "learning_rate": 0.00011422965403124195, + "loss": 1.6504243850708007, + "step": 29380 + }, + { + "epoch": 0.08896220846639323, + "grad_norm": 0.1674249917268753, + "learning_rate": 0.00011422585884960454, + "loss": 1.6746931076049805, + "step": 29390 + }, + { + "epoch": 0.08899247801673905, + "grad_norm": 0.14816659688949585, + "learning_rate": 0.00011422206366796716, + "loss": 1.649407958984375, + "step": 29400 + }, + { + "epoch": 0.08902274756708489, + "grad_norm": 0.17581205070018768, + "learning_rate": 0.00011421826848632975, + "loss": 1.696384811401367, + "step": 29410 + }, + { + "epoch": 0.08905301711743072, + "grad_norm": 0.15057459473609924, + "learning_rate": 0.00011421447330469237, + "loss": 1.6295921325683593, + "step": 29420 + }, + { + "epoch": 0.08908328666777655, + "grad_norm": 0.16352783143520355, + "learning_rate": 0.00011421067812305496, + "loss": 1.673184585571289, + "step": 29430 + }, + { + "epoch": 0.08911355621812238, + "grad_norm": 0.13905112445354462, + "learning_rate": 0.00011420688294141758, + "loss": 1.6450933456420898, + "step": 29440 + }, + { + "epoch": 0.08914382576846822, + "grad_norm": 0.14734867215156555, + "learning_rate": 0.00011420308775978019, + "loss": 1.637274932861328, + "step": 29450 + }, + { + "epoch": 0.08917409531881404, + "grad_norm": 0.15651936829090118, + "learning_rate": 0.00011419929257814279, + "loss": 1.6769020080566406, + "step": 29460 + }, + { + "epoch": 0.08920436486915986, + "grad_norm": 0.17198680341243744, + "learning_rate": 0.0001141954973965054, + "loss": 1.6237459182739258, + "step": 29470 + }, + { + "epoch": 0.0892346344195057, + "grad_norm": 0.1767844408750534, + "learning_rate": 0.000114191702214868, + "loss": 1.6143327713012696, + "step": 29480 + }, + { + "epoch": 0.08926490396985153, + "grad_norm": 0.14699050784111023, + "learning_rate": 0.00011418790703323061, + "loss": 1.6671215057373048, + "step": 29490 + }, + { + "epoch": 0.08929517352019736, + "grad_norm": 0.13679105043411255, + "learning_rate": 0.00011418411185159322, + "loss": 1.6664972305297852, + "step": 29500 + }, + { + "epoch": 0.08929517352019736, + "eval_loss": 1.6572250127792358, + "eval_runtime": 27.8758, + "eval_samples_per_second": 17.937, + "eval_steps_per_second": 1.148, + "step": 29500 + }, + { + "epoch": 0.08932544307054319, + "grad_norm": 0.15701249241828918, + "learning_rate": 0.00011418031666995584, + "loss": 1.7137611389160157, + "step": 29510 + }, + { + "epoch": 0.08935571262088901, + "grad_norm": 0.14470010995864868, + "learning_rate": 0.00011417652148831843, + "loss": 1.6651020050048828, + "step": 29520 + }, + { + "epoch": 0.08938598217123485, + "grad_norm": 0.1538514643907547, + "learning_rate": 0.00011417272630668105, + "loss": 1.6568778991699218, + "step": 29530 + }, + { + "epoch": 0.08941625172158067, + "grad_norm": 0.17006492614746094, + "learning_rate": 0.00011416893112504364, + "loss": 1.6714345932006835, + "step": 29540 + }, + { + "epoch": 0.08944652127192651, + "grad_norm": 0.14763547480106354, + "learning_rate": 0.00011416513594340626, + "loss": 1.665421485900879, + "step": 29550 + }, + { + "epoch": 0.08947679082227233, + "grad_norm": 0.15580938756465912, + "learning_rate": 0.00011416134076176885, + "loss": 1.660784912109375, + "step": 29560 + }, + { + "epoch": 0.08950706037261816, + "grad_norm": 0.13858337700366974, + "learning_rate": 0.00011415754558013147, + "loss": 1.6775726318359374, + "step": 29570 + }, + { + "epoch": 0.089537329922964, + "grad_norm": 0.14079280197620392, + "learning_rate": 0.00011415375039849408, + "loss": 1.6428565979003906, + "step": 29580 + }, + { + "epoch": 0.08956759947330982, + "grad_norm": 0.15784034132957458, + "learning_rate": 0.00011414995521685668, + "loss": 1.6501148223876954, + "step": 29590 + }, + { + "epoch": 0.08959786902365566, + "grad_norm": 0.16515614092350006, + "learning_rate": 0.00011414616003521929, + "loss": 1.652474594116211, + "step": 29600 + }, + { + "epoch": 0.08962813857400148, + "grad_norm": 0.14523592591285706, + "learning_rate": 0.0001141423648535819, + "loss": 1.651153564453125, + "step": 29610 + }, + { + "epoch": 0.08965840812434732, + "grad_norm": 0.15654927492141724, + "learning_rate": 0.0001141385696719445, + "loss": 1.6129409790039062, + "step": 29620 + }, + { + "epoch": 0.08968867767469314, + "grad_norm": 0.1434876024723053, + "learning_rate": 0.00011413477449030711, + "loss": 1.6867813110351562, + "step": 29630 + }, + { + "epoch": 0.08971894722503897, + "grad_norm": 0.1555895209312439, + "learning_rate": 0.00011413097930866971, + "loss": 1.6237415313720702, + "step": 29640 + }, + { + "epoch": 0.0897492167753848, + "grad_norm": 0.16356034576892853, + "learning_rate": 0.00011412718412703232, + "loss": 1.6328617095947267, + "step": 29650 + }, + { + "epoch": 0.08977948632573063, + "grad_norm": 0.14194907248020172, + "learning_rate": 0.00011412338894539493, + "loss": 1.6934522628784179, + "step": 29660 + }, + { + "epoch": 0.08980975587607647, + "grad_norm": 0.15621529519557953, + "learning_rate": 0.00011411959376375753, + "loss": 1.6598855972290039, + "step": 29670 + }, + { + "epoch": 0.08984002542642229, + "grad_norm": 0.14594484865665436, + "learning_rate": 0.00011411579858212014, + "loss": 1.6385009765625, + "step": 29680 + }, + { + "epoch": 0.08987029497676811, + "grad_norm": 0.14313529431819916, + "learning_rate": 0.00011411200340048274, + "loss": 1.676431655883789, + "step": 29690 + }, + { + "epoch": 0.08990056452711395, + "grad_norm": 0.14352962374687195, + "learning_rate": 0.00011410820821884535, + "loss": 1.6646507263183594, + "step": 29700 + }, + { + "epoch": 0.08993083407745978, + "grad_norm": 0.13356353342533112, + "learning_rate": 0.00011410441303720797, + "loss": 1.6306640625, + "step": 29710 + }, + { + "epoch": 0.08996110362780561, + "grad_norm": 0.1445050686597824, + "learning_rate": 0.00011410061785557057, + "loss": 1.6281829833984376, + "step": 29720 + }, + { + "epoch": 0.08999137317815144, + "grad_norm": 0.14434970915317535, + "learning_rate": 0.00011409682267393318, + "loss": 1.6597412109375, + "step": 29730 + }, + { + "epoch": 0.09002164272849726, + "grad_norm": 0.15939132869243622, + "learning_rate": 0.00011409302749229579, + "loss": 1.6200056076049805, + "step": 29740 + }, + { + "epoch": 0.0900519122788431, + "grad_norm": 0.13657890260219574, + "learning_rate": 0.00011408923231065839, + "loss": 1.6685914993286133, + "step": 29750 + }, + { + "epoch": 0.09008218182918892, + "grad_norm": 0.16663287580013275, + "learning_rate": 0.000114085437129021, + "loss": 1.6707849502563477, + "step": 29760 + }, + { + "epoch": 0.09011245137953476, + "grad_norm": 0.1382029801607132, + "learning_rate": 0.0001140816419473836, + "loss": 1.6532974243164062, + "step": 29770 + }, + { + "epoch": 0.09014272092988058, + "grad_norm": 0.13977327942848206, + "learning_rate": 0.00011407784676574621, + "loss": 1.6523195266723634, + "step": 29780 + }, + { + "epoch": 0.09017299048022642, + "grad_norm": 0.14937539398670197, + "learning_rate": 0.00011407405158410882, + "loss": 1.635312843322754, + "step": 29790 + }, + { + "epoch": 0.09020326003057225, + "grad_norm": 0.15198521316051483, + "learning_rate": 0.00011407025640247142, + "loss": 1.688167190551758, + "step": 29800 + }, + { + "epoch": 0.09023352958091807, + "grad_norm": 0.146878182888031, + "learning_rate": 0.00011406646122083403, + "loss": 1.6545093536376954, + "step": 29810 + }, + { + "epoch": 0.09026379913126391, + "grad_norm": 0.1463272124528885, + "learning_rate": 0.00011406266603919665, + "loss": 1.6928108215332032, + "step": 29820 + }, + { + "epoch": 0.09029406868160973, + "grad_norm": 0.15243473649024963, + "learning_rate": 0.00011405887085755924, + "loss": 1.6353343963623046, + "step": 29830 + }, + { + "epoch": 0.09032433823195557, + "grad_norm": 0.15107177197933197, + "learning_rate": 0.00011405507567592186, + "loss": 1.656447982788086, + "step": 29840 + }, + { + "epoch": 0.0903546077823014, + "grad_norm": 0.15987396240234375, + "learning_rate": 0.00011405128049428445, + "loss": 1.675210189819336, + "step": 29850 + }, + { + "epoch": 0.09038487733264722, + "grad_norm": 0.15785372257232666, + "learning_rate": 0.00011404748531264707, + "loss": 1.6880306243896483, + "step": 29860 + }, + { + "epoch": 0.09041514688299306, + "grad_norm": 0.14548490941524506, + "learning_rate": 0.00011404369013100966, + "loss": 1.6688003540039062, + "step": 29870 + }, + { + "epoch": 0.09044541643333888, + "grad_norm": 0.14242805540561676, + "learning_rate": 0.00011403989494937228, + "loss": 1.632525634765625, + "step": 29880 + }, + { + "epoch": 0.09047568598368472, + "grad_norm": 0.15651346743106842, + "learning_rate": 0.00011403609976773488, + "loss": 1.6367101669311523, + "step": 29890 + }, + { + "epoch": 0.09050595553403054, + "grad_norm": 0.15116246044635773, + "learning_rate": 0.0001140323045860975, + "loss": 1.6259212493896484, + "step": 29900 + }, + { + "epoch": 0.09053622508437636, + "grad_norm": 0.13817399740219116, + "learning_rate": 0.00011402850940446009, + "loss": 1.6817066192626953, + "step": 29910 + }, + { + "epoch": 0.0905664946347222, + "grad_norm": 0.14810267090797424, + "learning_rate": 0.00011402471422282271, + "loss": 1.697130012512207, + "step": 29920 + }, + { + "epoch": 0.09059676418506803, + "grad_norm": 0.15106886625289917, + "learning_rate": 0.00011402091904118531, + "loss": 1.6234882354736329, + "step": 29930 + }, + { + "epoch": 0.09062703373541386, + "grad_norm": 0.14006559550762177, + "learning_rate": 0.00011401712385954792, + "loss": 1.667977523803711, + "step": 29940 + }, + { + "epoch": 0.09065730328575969, + "grad_norm": 0.17565415799617767, + "learning_rate": 0.00011401332867791054, + "loss": 1.6346502304077148, + "step": 29950 + }, + { + "epoch": 0.09068757283610553, + "grad_norm": 0.16648046672344208, + "learning_rate": 0.00011400953349627313, + "loss": 1.690793800354004, + "step": 29960 + }, + { + "epoch": 0.09071784238645135, + "grad_norm": 0.1377379447221756, + "learning_rate": 0.00011400573831463575, + "loss": 1.6446514129638672, + "step": 29970 + }, + { + "epoch": 0.09074811193679717, + "grad_norm": 0.1397591531276703, + "learning_rate": 0.00011400194313299834, + "loss": 1.6591716766357423, + "step": 29980 + }, + { + "epoch": 0.09077838148714301, + "grad_norm": 0.15017123520374298, + "learning_rate": 0.00011399814795136096, + "loss": 1.6680301666259765, + "step": 29990 + }, + { + "epoch": 0.09080865103748884, + "grad_norm": 0.13247382640838623, + "learning_rate": 0.00011399435276972356, + "loss": 1.664698028564453, + "step": 30000 + }, + { + "epoch": 0.09080865103748884, + "eval_loss": 1.6703104972839355, + "eval_runtime": 28.3618, + "eval_samples_per_second": 17.629, + "eval_steps_per_second": 1.128, + "step": 30000 + }, + { + "epoch": 0.09083892058783467, + "grad_norm": 0.14577637612819672, + "learning_rate": 0.00011399055758808617, + "loss": 1.6974491119384765, + "step": 30010 + }, + { + "epoch": 0.0908691901381805, + "grad_norm": 0.14795802533626556, + "learning_rate": 0.00011398676240644877, + "loss": 1.660435676574707, + "step": 30020 + }, + { + "epoch": 0.09089945968852632, + "grad_norm": 0.1574062556028366, + "learning_rate": 0.00011398296722481139, + "loss": 1.6605504989624023, + "step": 30030 + }, + { + "epoch": 0.09092972923887216, + "grad_norm": 0.15152454376220703, + "learning_rate": 0.00011397917204317398, + "loss": 1.66104736328125, + "step": 30040 + }, + { + "epoch": 0.09095999878921798, + "grad_norm": 0.16297096014022827, + "learning_rate": 0.0001139753768615366, + "loss": 1.6860942840576172, + "step": 30050 + }, + { + "epoch": 0.09099026833956382, + "grad_norm": 0.14905840158462524, + "learning_rate": 0.00011397158167989919, + "loss": 1.6726297378540038, + "step": 30060 + }, + { + "epoch": 0.09102053788990964, + "grad_norm": 0.1700976938009262, + "learning_rate": 0.00011396778649826181, + "loss": 1.6501972198486328, + "step": 30070 + }, + { + "epoch": 0.09105080744025547, + "grad_norm": 0.14854490756988525, + "learning_rate": 0.00011396399131662442, + "loss": 1.6794994354248047, + "step": 30080 + }, + { + "epoch": 0.0910810769906013, + "grad_norm": 0.13413968682289124, + "learning_rate": 0.00011396019613498702, + "loss": 1.6864086151123048, + "step": 30090 + }, + { + "epoch": 0.09111134654094713, + "grad_norm": 0.162059023976326, + "learning_rate": 0.00011395640095334963, + "loss": 1.6611698150634766, + "step": 30100 + }, + { + "epoch": 0.09114161609129297, + "grad_norm": 0.16336825489997864, + "learning_rate": 0.00011395260577171223, + "loss": 1.6534984588623047, + "step": 30110 + }, + { + "epoch": 0.09117188564163879, + "grad_norm": 0.1531449854373932, + "learning_rate": 0.00011394881059007485, + "loss": 1.6784059524536132, + "step": 30120 + }, + { + "epoch": 0.09120215519198463, + "grad_norm": 0.13043062388896942, + "learning_rate": 0.00011394501540843745, + "loss": 1.6790916442871093, + "step": 30130 + }, + { + "epoch": 0.09123242474233045, + "grad_norm": 0.15488331019878387, + "learning_rate": 0.00011394122022680007, + "loss": 1.688736343383789, + "step": 30140 + }, + { + "epoch": 0.09126269429267628, + "grad_norm": 0.17275764048099518, + "learning_rate": 0.00011393742504516266, + "loss": 1.667038345336914, + "step": 30150 + }, + { + "epoch": 0.09129296384302212, + "grad_norm": 0.15414631366729736, + "learning_rate": 0.00011393362986352528, + "loss": 1.6336843490600585, + "step": 30160 + }, + { + "epoch": 0.09132323339336794, + "grad_norm": 0.16485966742038727, + "learning_rate": 0.00011392983468188787, + "loss": 1.6729610443115235, + "step": 30170 + }, + { + "epoch": 0.09135350294371378, + "grad_norm": 0.14442382752895355, + "learning_rate": 0.00011392603950025049, + "loss": 1.648015594482422, + "step": 30180 + }, + { + "epoch": 0.0913837724940596, + "grad_norm": 0.15881647169589996, + "learning_rate": 0.0001139222443186131, + "loss": 1.6701828002929688, + "step": 30190 + }, + { + "epoch": 0.09141404204440542, + "grad_norm": 0.1440943479537964, + "learning_rate": 0.0001139184491369757, + "loss": 1.6462518692016601, + "step": 30200 + }, + { + "epoch": 0.09144431159475126, + "grad_norm": 0.15127362310886383, + "learning_rate": 0.00011391465395533831, + "loss": 1.6481046676635742, + "step": 30210 + }, + { + "epoch": 0.09147458114509709, + "grad_norm": 0.1629503220319748, + "learning_rate": 0.00011391085877370091, + "loss": 1.6451190948486327, + "step": 30220 + }, + { + "epoch": 0.09150485069544292, + "grad_norm": 0.1550600230693817, + "learning_rate": 0.00011390706359206352, + "loss": 1.6659053802490233, + "step": 30230 + }, + { + "epoch": 0.09153512024578875, + "grad_norm": 0.1456466019153595, + "learning_rate": 0.00011390326841042612, + "loss": 1.6296703338623046, + "step": 30240 + }, + { + "epoch": 0.09156538979613457, + "grad_norm": 0.14495374262332916, + "learning_rate": 0.00011389947322878873, + "loss": 1.6405765533447265, + "step": 30250 + }, + { + "epoch": 0.09159565934648041, + "grad_norm": 0.1378823220729828, + "learning_rate": 0.00011389567804715134, + "loss": 1.662770652770996, + "step": 30260 + }, + { + "epoch": 0.09162592889682623, + "grad_norm": 0.1407078504562378, + "learning_rate": 0.00011389188286551394, + "loss": 1.663919448852539, + "step": 30270 + }, + { + "epoch": 0.09165619844717207, + "grad_norm": 0.13887521624565125, + "learning_rate": 0.00011388808768387655, + "loss": 1.6449893951416015, + "step": 30280 + }, + { + "epoch": 0.0916864679975179, + "grad_norm": 0.14903318881988525, + "learning_rate": 0.00011388429250223915, + "loss": 1.6495372772216796, + "step": 30290 + }, + { + "epoch": 0.09171673754786373, + "grad_norm": 0.1483898162841797, + "learning_rate": 0.00011388049732060176, + "loss": 1.6602188110351563, + "step": 30300 + }, + { + "epoch": 0.09174700709820956, + "grad_norm": 0.13890548050403595, + "learning_rate": 0.00011387670213896437, + "loss": 1.6774662017822266, + "step": 30310 + }, + { + "epoch": 0.09177727664855538, + "grad_norm": 0.1548394113779068, + "learning_rate": 0.00011387290695732699, + "loss": 1.6265453338623046, + "step": 30320 + }, + { + "epoch": 0.09180754619890122, + "grad_norm": 0.13332389295101166, + "learning_rate": 0.00011386911177568959, + "loss": 1.66085205078125, + "step": 30330 + }, + { + "epoch": 0.09183781574924704, + "grad_norm": 0.14607395231723785, + "learning_rate": 0.0001138653165940522, + "loss": 1.676454734802246, + "step": 30340 + }, + { + "epoch": 0.09186808529959288, + "grad_norm": 0.14787019789218903, + "learning_rate": 0.0001138615214124148, + "loss": 1.6724153518676759, + "step": 30350 + }, + { + "epoch": 0.0918983548499387, + "grad_norm": 0.15453065931797028, + "learning_rate": 0.00011385772623077741, + "loss": 1.6483652114868164, + "step": 30360 + }, + { + "epoch": 0.09192862440028453, + "grad_norm": 0.15210376679897308, + "learning_rate": 0.00011385393104914002, + "loss": 1.6343650817871094, + "step": 30370 + }, + { + "epoch": 0.09195889395063037, + "grad_norm": 0.18160493671894073, + "learning_rate": 0.00011385013586750262, + "loss": 1.6473461151123048, + "step": 30380 + }, + { + "epoch": 0.09198916350097619, + "grad_norm": 0.17473915219306946, + "learning_rate": 0.00011384634068586523, + "loss": 1.6643169403076172, + "step": 30390 + }, + { + "epoch": 0.09201943305132203, + "grad_norm": 0.14988721907138824, + "learning_rate": 0.00011384254550422783, + "loss": 1.625545120239258, + "step": 30400 + }, + { + "epoch": 0.09204970260166785, + "grad_norm": 0.14336426556110382, + "learning_rate": 0.00011383875032259044, + "loss": 1.6436552047729491, + "step": 30410 + }, + { + "epoch": 0.09207997215201368, + "grad_norm": 0.14335471391677856, + "learning_rate": 0.00011383495514095305, + "loss": 1.6620323181152343, + "step": 30420 + }, + { + "epoch": 0.09211024170235951, + "grad_norm": 0.14478805661201477, + "learning_rate": 0.00011383115995931565, + "loss": 1.6873817443847656, + "step": 30430 + }, + { + "epoch": 0.09214051125270534, + "grad_norm": 0.14692702889442444, + "learning_rate": 0.00011382736477767826, + "loss": 1.6680332183837892, + "step": 30440 + }, + { + "epoch": 0.09217078080305117, + "grad_norm": 0.13180027902126312, + "learning_rate": 0.00011382356959604088, + "loss": 1.6409780502319335, + "step": 30450 + }, + { + "epoch": 0.092201050353397, + "grad_norm": 0.15080903470516205, + "learning_rate": 0.00011381977441440347, + "loss": 1.6307388305664063, + "step": 30460 + }, + { + "epoch": 0.09223131990374284, + "grad_norm": 0.15452629327774048, + "learning_rate": 0.00011381597923276609, + "loss": 1.610791015625, + "step": 30470 + }, + { + "epoch": 0.09226158945408866, + "grad_norm": 0.15471351146697998, + "learning_rate": 0.00011381218405112868, + "loss": 1.6202919006347656, + "step": 30480 + }, + { + "epoch": 0.09229185900443448, + "grad_norm": 0.1516381800174713, + "learning_rate": 0.0001138083888694913, + "loss": 1.6621536254882812, + "step": 30490 + }, + { + "epoch": 0.09232212855478032, + "grad_norm": 0.15025481581687927, + "learning_rate": 0.00011380459368785389, + "loss": 1.6525419235229493, + "step": 30500 + }, + { + "epoch": 0.09232212855478032, + "eval_loss": 1.6481376886367798, + "eval_runtime": 28.0384, + "eval_samples_per_second": 17.833, + "eval_steps_per_second": 1.141, + "step": 30500 + }, + { + "epoch": 0.09235239810512615, + "grad_norm": 0.14806503057479858, + "learning_rate": 0.00011380079850621651, + "loss": 1.6568408966064454, + "step": 30510 + }, + { + "epoch": 0.09238266765547198, + "grad_norm": 0.16153199970722198, + "learning_rate": 0.00011379700332457912, + "loss": 1.6587209701538086, + "step": 30520 + }, + { + "epoch": 0.09241293720581781, + "grad_norm": 0.1445678025484085, + "learning_rate": 0.00011379320814294172, + "loss": 1.6447925567626953, + "step": 30530 + }, + { + "epoch": 0.09244320675616363, + "grad_norm": 0.14402814209461212, + "learning_rate": 0.00011378941296130433, + "loss": 1.6810508728027345, + "step": 30540 + }, + { + "epoch": 0.09247347630650947, + "grad_norm": 0.14341798424720764, + "learning_rate": 0.00011378561777966694, + "loss": 1.6591598510742187, + "step": 30550 + }, + { + "epoch": 0.0925037458568553, + "grad_norm": 0.12986090779304504, + "learning_rate": 0.00011378182259802956, + "loss": 1.654776382446289, + "step": 30560 + }, + { + "epoch": 0.09253401540720113, + "grad_norm": 0.1459386646747589, + "learning_rate": 0.00011377802741639215, + "loss": 1.6789011001586913, + "step": 30570 + }, + { + "epoch": 0.09256428495754696, + "grad_norm": 0.1605425924062729, + "learning_rate": 0.00011377423223475477, + "loss": 1.6487224578857422, + "step": 30580 + }, + { + "epoch": 0.09259455450789278, + "grad_norm": 0.1412368267774582, + "learning_rate": 0.00011377043705311736, + "loss": 1.6957719802856446, + "step": 30590 + }, + { + "epoch": 0.09262482405823862, + "grad_norm": 0.15843553841114044, + "learning_rate": 0.00011376664187147998, + "loss": 1.6985048294067382, + "step": 30600 + }, + { + "epoch": 0.09265509360858444, + "grad_norm": 0.13053904473781586, + "learning_rate": 0.00011376284668984257, + "loss": 1.637564468383789, + "step": 30610 + }, + { + "epoch": 0.09268536315893028, + "grad_norm": 0.1524025797843933, + "learning_rate": 0.00011375905150820519, + "loss": 1.6930355072021483, + "step": 30620 + }, + { + "epoch": 0.0927156327092761, + "grad_norm": 0.13579680025577545, + "learning_rate": 0.00011375525632656778, + "loss": 1.6449440002441407, + "step": 30630 + }, + { + "epoch": 0.09274590225962194, + "grad_norm": 0.16094659268856049, + "learning_rate": 0.0001137514611449304, + "loss": 1.6480186462402344, + "step": 30640 + }, + { + "epoch": 0.09277617180996776, + "grad_norm": 0.1659449189901352, + "learning_rate": 0.000113747665963293, + "loss": 1.6486814498901368, + "step": 30650 + }, + { + "epoch": 0.09280644136031359, + "grad_norm": 0.1351683884859085, + "learning_rate": 0.00011374387078165562, + "loss": 1.720527458190918, + "step": 30660 + }, + { + "epoch": 0.09283671091065943, + "grad_norm": 0.1583687663078308, + "learning_rate": 0.00011374007560001821, + "loss": 1.6749088287353515, + "step": 30670 + }, + { + "epoch": 0.09286698046100525, + "grad_norm": 0.13323917984962463, + "learning_rate": 0.00011373628041838083, + "loss": 1.616550827026367, + "step": 30680 + }, + { + "epoch": 0.09289725001135109, + "grad_norm": 0.1363806426525116, + "learning_rate": 0.00011373248523674343, + "loss": 1.7003395080566406, + "step": 30690 + }, + { + "epoch": 0.09292751956169691, + "grad_norm": 0.14569242298603058, + "learning_rate": 0.00011372869005510604, + "loss": 1.657490348815918, + "step": 30700 + }, + { + "epoch": 0.09295778911204274, + "grad_norm": 0.15688633918762207, + "learning_rate": 0.00011372489487346865, + "loss": 1.651373291015625, + "step": 30710 + }, + { + "epoch": 0.09298805866238857, + "grad_norm": 0.15738454461097717, + "learning_rate": 0.00011372109969183125, + "loss": 1.6021553039550782, + "step": 30720 + }, + { + "epoch": 0.0930183282127344, + "grad_norm": 0.15373744070529938, + "learning_rate": 0.00011371730451019387, + "loss": 1.7203304290771484, + "step": 30730 + }, + { + "epoch": 0.09304859776308023, + "grad_norm": 0.1363813281059265, + "learning_rate": 0.00011371350932855646, + "loss": 1.6590927124023438, + "step": 30740 + }, + { + "epoch": 0.09307886731342606, + "grad_norm": 0.15770946443080902, + "learning_rate": 0.00011370971414691908, + "loss": 1.678877067565918, + "step": 30750 + }, + { + "epoch": 0.09310913686377188, + "grad_norm": 0.16443993151187897, + "learning_rate": 0.00011370591896528168, + "loss": 1.6503978729248048, + "step": 30760 + }, + { + "epoch": 0.09313940641411772, + "grad_norm": 0.1604442447423935, + "learning_rate": 0.0001137021237836443, + "loss": 1.6589532852172852, + "step": 30770 + }, + { + "epoch": 0.09316967596446354, + "grad_norm": 0.16044823825359344, + "learning_rate": 0.00011369832860200689, + "loss": 1.675198745727539, + "step": 30780 + }, + { + "epoch": 0.09319994551480938, + "grad_norm": 0.16156665980815887, + "learning_rate": 0.0001136945334203695, + "loss": 1.6582542419433595, + "step": 30790 + }, + { + "epoch": 0.0932302150651552, + "grad_norm": 0.1688823699951172, + "learning_rate": 0.0001136907382387321, + "loss": 1.6671993255615234, + "step": 30800 + }, + { + "epoch": 0.09326048461550104, + "grad_norm": 0.1561322659254074, + "learning_rate": 0.00011368694305709472, + "loss": 1.6186708450317382, + "step": 30810 + }, + { + "epoch": 0.09329075416584687, + "grad_norm": 0.15742383897304535, + "learning_rate": 0.00011368314787545732, + "loss": 1.6514286041259765, + "step": 30820 + }, + { + "epoch": 0.09332102371619269, + "grad_norm": 0.1565748155117035, + "learning_rate": 0.00011367935269381993, + "loss": 1.6488327026367187, + "step": 30830 + }, + { + "epoch": 0.09335129326653853, + "grad_norm": 0.1279226541519165, + "learning_rate": 0.00011367555751218254, + "loss": 1.6411331176757813, + "step": 30840 + }, + { + "epoch": 0.09338156281688435, + "grad_norm": 0.1457800567150116, + "learning_rate": 0.00011367176233054514, + "loss": 1.6677759170532227, + "step": 30850 + }, + { + "epoch": 0.09341183236723019, + "grad_norm": 0.1669599860906601, + "learning_rate": 0.00011366796714890775, + "loss": 1.645693588256836, + "step": 30860 + }, + { + "epoch": 0.09344210191757601, + "grad_norm": 0.1640859842300415, + "learning_rate": 0.00011366417196727035, + "loss": 1.6572887420654296, + "step": 30870 + }, + { + "epoch": 0.09347237146792184, + "grad_norm": 0.1715160310268402, + "learning_rate": 0.00011366037678563296, + "loss": 1.6499040603637696, + "step": 30880 + }, + { + "epoch": 0.09350264101826768, + "grad_norm": 0.1428753286600113, + "learning_rate": 0.00011365658160399557, + "loss": 1.6577520370483398, + "step": 30890 + }, + { + "epoch": 0.0935329105686135, + "grad_norm": 0.18148303031921387, + "learning_rate": 0.00011365278642235817, + "loss": 1.6685287475585937, + "step": 30900 + }, + { + "epoch": 0.09356318011895934, + "grad_norm": 0.14056894183158875, + "learning_rate": 0.00011364899124072078, + "loss": 1.6457084655761718, + "step": 30910 + }, + { + "epoch": 0.09359344966930516, + "grad_norm": 0.15413156151771545, + "learning_rate": 0.00011364519605908338, + "loss": 1.670430564880371, + "step": 30920 + }, + { + "epoch": 0.09362371921965099, + "grad_norm": 0.17475871741771698, + "learning_rate": 0.000113641400877446, + "loss": 1.6078807830810546, + "step": 30930 + }, + { + "epoch": 0.09365398876999682, + "grad_norm": 0.13983263075351715, + "learning_rate": 0.00011363760569580861, + "loss": 1.6345996856689453, + "step": 30940 + }, + { + "epoch": 0.09368425832034265, + "grad_norm": 0.15137998759746552, + "learning_rate": 0.00011363381051417122, + "loss": 1.6664335250854492, + "step": 30950 + }, + { + "epoch": 0.09371452787068849, + "grad_norm": 0.14832548797130585, + "learning_rate": 0.00011363001533253382, + "loss": 1.655288314819336, + "step": 30960 + }, + { + "epoch": 0.09374479742103431, + "grad_norm": 0.14421293139457703, + "learning_rate": 0.00011362622015089643, + "loss": 1.6509437561035156, + "step": 30970 + }, + { + "epoch": 0.09377506697138013, + "grad_norm": 0.14857909083366394, + "learning_rate": 0.00011362242496925903, + "loss": 1.6402423858642579, + "step": 30980 + }, + { + "epoch": 0.09380533652172597, + "grad_norm": 0.14064347743988037, + "learning_rate": 0.00011361862978762164, + "loss": 1.6713855743408204, + "step": 30990 + }, + { + "epoch": 0.0938356060720718, + "grad_norm": 0.13871048390865326, + "learning_rate": 0.00011361483460598424, + "loss": 1.6425395965576173, + "step": 31000 + }, + { + "epoch": 0.0938356060720718, + "eval_loss": 1.6532245874404907, + "eval_runtime": 28.0915, + "eval_samples_per_second": 17.799, + "eval_steps_per_second": 1.139, + "step": 31000 + }, + { + "epoch": 0.09386587562241763, + "grad_norm": 0.1363358199596405, + "learning_rate": 0.00011361103942434685, + "loss": 1.6817508697509767, + "step": 31010 + }, + { + "epoch": 0.09389614517276346, + "grad_norm": 0.15086282789707184, + "learning_rate": 0.00011360724424270946, + "loss": 1.6489707946777343, + "step": 31020 + }, + { + "epoch": 0.0939264147231093, + "grad_norm": 0.13868188858032227, + "learning_rate": 0.00011360344906107206, + "loss": 1.637383270263672, + "step": 31030 + }, + { + "epoch": 0.09395668427345512, + "grad_norm": 0.15138192474842072, + "learning_rate": 0.00011359965387943467, + "loss": 1.6561925888061524, + "step": 31040 + }, + { + "epoch": 0.09398695382380094, + "grad_norm": 0.14689484238624573, + "learning_rate": 0.00011359585869779727, + "loss": 1.6484888076782227, + "step": 31050 + }, + { + "epoch": 0.09401722337414678, + "grad_norm": 0.1440790891647339, + "learning_rate": 0.0001135920635161599, + "loss": 1.7057197570800782, + "step": 31060 + }, + { + "epoch": 0.0940474929244926, + "grad_norm": 0.15283013880252838, + "learning_rate": 0.00011358826833452249, + "loss": 1.6353309631347657, + "step": 31070 + }, + { + "epoch": 0.09407776247483844, + "grad_norm": 0.14303500950336456, + "learning_rate": 0.0001135844731528851, + "loss": 1.650672149658203, + "step": 31080 + }, + { + "epoch": 0.09410803202518427, + "grad_norm": 0.15647532045841217, + "learning_rate": 0.0001135806779712477, + "loss": 1.6651132583618165, + "step": 31090 + }, + { + "epoch": 0.09413830157553009, + "grad_norm": 0.13925088942050934, + "learning_rate": 0.00011357688278961032, + "loss": 1.6368030548095702, + "step": 31100 + }, + { + "epoch": 0.09416857112587593, + "grad_norm": 0.15934878587722778, + "learning_rate": 0.00011357308760797291, + "loss": 1.6385690689086914, + "step": 31110 + }, + { + "epoch": 0.09419884067622175, + "grad_norm": 0.1541873663663864, + "learning_rate": 0.00011356929242633553, + "loss": 1.684708023071289, + "step": 31120 + }, + { + "epoch": 0.09422911022656759, + "grad_norm": 0.17442065477371216, + "learning_rate": 0.00011356549724469814, + "loss": 1.6332563400268554, + "step": 31130 + }, + { + "epoch": 0.09425937977691341, + "grad_norm": 0.14568816125392914, + "learning_rate": 0.00011356170206306074, + "loss": 1.6776100158691407, + "step": 31140 + }, + { + "epoch": 0.09428964932725924, + "grad_norm": 0.13275152444839478, + "learning_rate": 0.00011355790688142335, + "loss": 1.6801084518432616, + "step": 31150 + }, + { + "epoch": 0.09431991887760507, + "grad_norm": 0.14942777156829834, + "learning_rate": 0.00011355411169978595, + "loss": 1.6253841400146485, + "step": 31160 + }, + { + "epoch": 0.0943501884279509, + "grad_norm": 0.13731694221496582, + "learning_rate": 0.00011355031651814857, + "loss": 1.6389152526855468, + "step": 31170 + }, + { + "epoch": 0.09438045797829674, + "grad_norm": 0.1404530256986618, + "learning_rate": 0.00011354652133651117, + "loss": 1.702629852294922, + "step": 31180 + }, + { + "epoch": 0.09441072752864256, + "grad_norm": 0.1591528058052063, + "learning_rate": 0.00011354272615487379, + "loss": 1.6724483489990234, + "step": 31190 + }, + { + "epoch": 0.0944409970789884, + "grad_norm": 0.15999017655849457, + "learning_rate": 0.00011353893097323638, + "loss": 1.6602806091308593, + "step": 31200 + }, + { + "epoch": 0.09447126662933422, + "grad_norm": 0.1569231152534485, + "learning_rate": 0.000113535135791599, + "loss": 1.6686031341552734, + "step": 31210 + }, + { + "epoch": 0.09450153617968005, + "grad_norm": 0.15104329586029053, + "learning_rate": 0.00011353134060996159, + "loss": 1.6730680465698242, + "step": 31220 + }, + { + "epoch": 0.09453180573002588, + "grad_norm": 0.14610926806926727, + "learning_rate": 0.00011352754542832421, + "loss": 1.6667594909667969, + "step": 31230 + }, + { + "epoch": 0.09456207528037171, + "grad_norm": 0.1532888114452362, + "learning_rate": 0.0001135237502466868, + "loss": 1.614293670654297, + "step": 31240 + }, + { + "epoch": 0.09459234483071755, + "grad_norm": 0.1450500339269638, + "learning_rate": 0.00011351995506504942, + "loss": 1.658024215698242, + "step": 31250 + }, + { + "epoch": 0.09462261438106337, + "grad_norm": 0.14362917840480804, + "learning_rate": 0.00011351615988341201, + "loss": 1.6423912048339844, + "step": 31260 + }, + { + "epoch": 0.09465288393140919, + "grad_norm": 0.15773876011371613, + "learning_rate": 0.00011351236470177463, + "loss": 1.61610107421875, + "step": 31270 + }, + { + "epoch": 0.09468315348175503, + "grad_norm": 0.14618995785713196, + "learning_rate": 0.00011350856952013723, + "loss": 1.7032686233520509, + "step": 31280 + }, + { + "epoch": 0.09471342303210085, + "grad_norm": 0.14468756318092346, + "learning_rate": 0.00011350477433849984, + "loss": 1.6540016174316405, + "step": 31290 + }, + { + "epoch": 0.09474369258244669, + "grad_norm": 0.13692022860050201, + "learning_rate": 0.00011350097915686245, + "loss": 1.6529716491699218, + "step": 31300 + }, + { + "epoch": 0.09477396213279252, + "grad_norm": 0.13868258893489838, + "learning_rate": 0.00011349718397522506, + "loss": 1.6629524230957031, + "step": 31310 + }, + { + "epoch": 0.09480423168313834, + "grad_norm": 0.1653079241514206, + "learning_rate": 0.00011349338879358766, + "loss": 1.613797378540039, + "step": 31320 + }, + { + "epoch": 0.09483450123348418, + "grad_norm": 0.16930511593818665, + "learning_rate": 0.00011348959361195027, + "loss": 1.6535213470458985, + "step": 31330 + }, + { + "epoch": 0.09486477078383, + "grad_norm": 0.16470427811145782, + "learning_rate": 0.00011348579843031289, + "loss": 1.6557144165039062, + "step": 31340 + }, + { + "epoch": 0.09489504033417584, + "grad_norm": 0.17867136001586914, + "learning_rate": 0.00011348200324867548, + "loss": 1.6488517761230468, + "step": 31350 + }, + { + "epoch": 0.09492530988452166, + "grad_norm": 0.15737058222293854, + "learning_rate": 0.0001134782080670381, + "loss": 1.6709892272949218, + "step": 31360 + }, + { + "epoch": 0.0949555794348675, + "grad_norm": 0.16063998639583588, + "learning_rate": 0.00011347441288540069, + "loss": 1.6514236450195312, + "step": 31370 + }, + { + "epoch": 0.09498584898521333, + "grad_norm": 0.16638672351837158, + "learning_rate": 0.00011347061770376331, + "loss": 1.6732660293579102, + "step": 31380 + }, + { + "epoch": 0.09501611853555915, + "grad_norm": 0.1407250463962555, + "learning_rate": 0.0001134668225221259, + "loss": 1.6580278396606445, + "step": 31390 + }, + { + "epoch": 0.09504638808590499, + "grad_norm": 0.14902696013450623, + "learning_rate": 0.00011346302734048852, + "loss": 1.6402275085449218, + "step": 31400 + }, + { + "epoch": 0.09507665763625081, + "grad_norm": 0.15110242366790771, + "learning_rate": 0.00011345923215885112, + "loss": 1.640414047241211, + "step": 31410 + }, + { + "epoch": 0.09510692718659665, + "grad_norm": 0.159250408411026, + "learning_rate": 0.00011345543697721374, + "loss": 1.6440898895263671, + "step": 31420 + }, + { + "epoch": 0.09513719673694247, + "grad_norm": 0.15328507125377655, + "learning_rate": 0.00011345164179557634, + "loss": 1.6480094909667968, + "step": 31430 + }, + { + "epoch": 0.0951674662872883, + "grad_norm": 0.1405993402004242, + "learning_rate": 0.00011344784661393895, + "loss": 1.6315404891967773, + "step": 31440 + }, + { + "epoch": 0.09519773583763413, + "grad_norm": 0.1901964545249939, + "learning_rate": 0.00011344405143230155, + "loss": 1.7193466186523438, + "step": 31450 + }, + { + "epoch": 0.09522800538797996, + "grad_norm": 0.15092375874519348, + "learning_rate": 0.00011344025625066416, + "loss": 1.6130496978759765, + "step": 31460 + }, + { + "epoch": 0.0952582749383258, + "grad_norm": 0.14715981483459473, + "learning_rate": 0.00011343646106902677, + "loss": 1.7134012222290038, + "step": 31470 + }, + { + "epoch": 0.09528854448867162, + "grad_norm": 0.15519553422927856, + "learning_rate": 0.00011343266588738937, + "loss": 1.6598167419433594, + "step": 31480 + }, + { + "epoch": 0.09531881403901744, + "grad_norm": 0.15460918843746185, + "learning_rate": 0.00011342887070575198, + "loss": 1.6696245193481445, + "step": 31490 + }, + { + "epoch": 0.09534908358936328, + "grad_norm": 0.15059798955917358, + "learning_rate": 0.00011342507552411458, + "loss": 1.647848129272461, + "step": 31500 + }, + { + "epoch": 0.09534908358936328, + "eval_loss": 1.6557867527008057, + "eval_runtime": 28.0088, + "eval_samples_per_second": 17.852, + "eval_steps_per_second": 1.142, + "step": 31500 + }, + { + "epoch": 0.0953793531397091, + "grad_norm": 0.16777916252613068, + "learning_rate": 0.00011342128034247719, + "loss": 1.668746566772461, + "step": 31510 + }, + { + "epoch": 0.09540962269005494, + "grad_norm": 0.13832253217697144, + "learning_rate": 0.0001134174851608398, + "loss": 1.6735158920288087, + "step": 31520 + }, + { + "epoch": 0.09543989224040077, + "grad_norm": 0.1487002819776535, + "learning_rate": 0.0001134136899792024, + "loss": 1.6297037124633789, + "step": 31530 + }, + { + "epoch": 0.0954701617907466, + "grad_norm": 0.14592944085597992, + "learning_rate": 0.00011340989479756501, + "loss": 1.6393733978271485, + "step": 31540 + }, + { + "epoch": 0.09550043134109243, + "grad_norm": 0.1606953889131546, + "learning_rate": 0.00011340609961592763, + "loss": 1.6602489471435546, + "step": 31550 + }, + { + "epoch": 0.09553070089143825, + "grad_norm": 0.1285005360841751, + "learning_rate": 0.00011340230443429023, + "loss": 1.6198936462402345, + "step": 31560 + }, + { + "epoch": 0.09556097044178409, + "grad_norm": 0.1607784926891327, + "learning_rate": 0.00011339850925265284, + "loss": 1.615654182434082, + "step": 31570 + }, + { + "epoch": 0.09559123999212991, + "grad_norm": 0.13709791004657745, + "learning_rate": 0.00011339471407101544, + "loss": 1.6646629333496095, + "step": 31580 + }, + { + "epoch": 0.09562150954247575, + "grad_norm": 0.14912202954292297, + "learning_rate": 0.00011339091888937805, + "loss": 1.6353759765625, + "step": 31590 + }, + { + "epoch": 0.09565177909282158, + "grad_norm": 0.1477195769548416, + "learning_rate": 0.00011338712370774066, + "loss": 1.6696496963500977, + "step": 31600 + }, + { + "epoch": 0.0956820486431674, + "grad_norm": 0.14599530398845673, + "learning_rate": 0.00011338332852610326, + "loss": 1.6220737457275392, + "step": 31610 + }, + { + "epoch": 0.09571231819351324, + "grad_norm": 0.13868246972560883, + "learning_rate": 0.00011337953334446587, + "loss": 1.627913475036621, + "step": 31620 + }, + { + "epoch": 0.09574258774385906, + "grad_norm": 0.16967390477657318, + "learning_rate": 0.00011337573816282847, + "loss": 1.6959197998046875, + "step": 31630 + }, + { + "epoch": 0.0957728572942049, + "grad_norm": 0.1488361805677414, + "learning_rate": 0.00011337194298119108, + "loss": 1.6300151824951172, + "step": 31640 + }, + { + "epoch": 0.09580312684455072, + "grad_norm": 0.19502153992652893, + "learning_rate": 0.00011336814779955369, + "loss": 1.6720893859863282, + "step": 31650 + }, + { + "epoch": 0.09583339639489655, + "grad_norm": 0.16269470751285553, + "learning_rate": 0.00011336435261791629, + "loss": 1.6877220153808594, + "step": 31660 + }, + { + "epoch": 0.09586366594524239, + "grad_norm": 0.1322861909866333, + "learning_rate": 0.00011336055743627891, + "loss": 1.6920753479003907, + "step": 31670 + }, + { + "epoch": 0.09589393549558821, + "grad_norm": 0.13298308849334717, + "learning_rate": 0.0001133567622546415, + "loss": 1.6861568450927735, + "step": 31680 + }, + { + "epoch": 0.09592420504593405, + "grad_norm": 0.14776475727558136, + "learning_rate": 0.00011335296707300412, + "loss": 1.640275001525879, + "step": 31690 + }, + { + "epoch": 0.09595447459627987, + "grad_norm": 0.12963895499706268, + "learning_rate": 0.00011334917189136672, + "loss": 1.6444053649902344, + "step": 31700 + }, + { + "epoch": 0.09598474414662571, + "grad_norm": 0.15216369926929474, + "learning_rate": 0.00011334537670972934, + "loss": 1.6257965087890625, + "step": 31710 + }, + { + "epoch": 0.09601501369697153, + "grad_norm": 0.1387435793876648, + "learning_rate": 0.00011334158152809193, + "loss": 1.6200454711914063, + "step": 31720 + }, + { + "epoch": 0.09604528324731736, + "grad_norm": 0.14495719969272614, + "learning_rate": 0.00011333778634645455, + "loss": 1.605026626586914, + "step": 31730 + }, + { + "epoch": 0.0960755527976632, + "grad_norm": 0.1537863165140152, + "learning_rate": 0.00011333399116481715, + "loss": 1.657906723022461, + "step": 31740 + }, + { + "epoch": 0.09610582234800902, + "grad_norm": 0.14009979367256165, + "learning_rate": 0.00011333019598317976, + "loss": 1.6751710891723632, + "step": 31750 + }, + { + "epoch": 0.09613609189835486, + "grad_norm": 0.15210218727588654, + "learning_rate": 0.00011332640080154236, + "loss": 1.6506477355957032, + "step": 31760 + }, + { + "epoch": 0.09616636144870068, + "grad_norm": 0.130696102976799, + "learning_rate": 0.00011332260561990497, + "loss": 1.6321041107177734, + "step": 31770 + }, + { + "epoch": 0.0961966309990465, + "grad_norm": 0.16315367817878723, + "learning_rate": 0.00011331881043826758, + "loss": 1.6650327682495116, + "step": 31780 + }, + { + "epoch": 0.09622690054939234, + "grad_norm": 0.14592817425727844, + "learning_rate": 0.00011331501525663018, + "loss": 1.617745590209961, + "step": 31790 + }, + { + "epoch": 0.09625717009973817, + "grad_norm": 0.1569720357656479, + "learning_rate": 0.0001133112200749928, + "loss": 1.6618371963500977, + "step": 31800 + }, + { + "epoch": 0.096287439650084, + "grad_norm": 0.1357439160346985, + "learning_rate": 0.0001133074248933554, + "loss": 1.674957847595215, + "step": 31810 + }, + { + "epoch": 0.09631770920042983, + "grad_norm": 0.14353932440280914, + "learning_rate": 0.00011330362971171801, + "loss": 1.6354545593261718, + "step": 31820 + }, + { + "epoch": 0.09634797875077565, + "grad_norm": 0.14296455681324005, + "learning_rate": 0.0001132998345300806, + "loss": 1.6466030120849608, + "step": 31830 + }, + { + "epoch": 0.09637824830112149, + "grad_norm": 0.16719043254852295, + "learning_rate": 0.00011329603934844323, + "loss": 1.658255386352539, + "step": 31840 + }, + { + "epoch": 0.09640851785146731, + "grad_norm": 0.15012741088867188, + "learning_rate": 0.00011329224416680582, + "loss": 1.6547691345214843, + "step": 31850 + }, + { + "epoch": 0.09643878740181315, + "grad_norm": 0.16774074733257294, + "learning_rate": 0.00011328844898516844, + "loss": 1.66334228515625, + "step": 31860 + }, + { + "epoch": 0.09646905695215897, + "grad_norm": 0.13611984252929688, + "learning_rate": 0.00011328465380353103, + "loss": 1.6408720016479492, + "step": 31870 + }, + { + "epoch": 0.09649932650250481, + "grad_norm": 0.15251456201076508, + "learning_rate": 0.00011328085862189365, + "loss": 1.670168685913086, + "step": 31880 + }, + { + "epoch": 0.09652959605285064, + "grad_norm": 0.14075912535190582, + "learning_rate": 0.00011327706344025624, + "loss": 1.6520519256591797, + "step": 31890 + }, + { + "epoch": 0.09655986560319646, + "grad_norm": 0.16330277919769287, + "learning_rate": 0.00011327326825861886, + "loss": 1.6485960006713867, + "step": 31900 + }, + { + "epoch": 0.0965901351535423, + "grad_norm": 0.1597418189048767, + "learning_rate": 0.00011326947307698145, + "loss": 1.6677608489990234, + "step": 31910 + }, + { + "epoch": 0.09662040470388812, + "grad_norm": 0.13745950162410736, + "learning_rate": 0.00011326567789534407, + "loss": 1.6449478149414063, + "step": 31920 + }, + { + "epoch": 0.09665067425423396, + "grad_norm": 0.15487536787986755, + "learning_rate": 0.00011326188271370668, + "loss": 1.607427215576172, + "step": 31930 + }, + { + "epoch": 0.09668094380457978, + "grad_norm": 0.15317897498607635, + "learning_rate": 0.00011325808753206929, + "loss": 1.645858383178711, + "step": 31940 + }, + { + "epoch": 0.09671121335492561, + "grad_norm": 0.16194027662277222, + "learning_rate": 0.0001132542923504319, + "loss": 1.6289680480957032, + "step": 31950 + }, + { + "epoch": 0.09674148290527144, + "grad_norm": 0.1500679850578308, + "learning_rate": 0.0001132504971687945, + "loss": 1.598310661315918, + "step": 31960 + }, + { + "epoch": 0.09677175245561727, + "grad_norm": 0.14378221333026886, + "learning_rate": 0.00011324670198715712, + "loss": 1.6848155975341796, + "step": 31970 + }, + { + "epoch": 0.0968020220059631, + "grad_norm": 0.1697116494178772, + "learning_rate": 0.00011324290680551971, + "loss": 1.6479742050170898, + "step": 31980 + }, + { + "epoch": 0.09683229155630893, + "grad_norm": 0.1444941610097885, + "learning_rate": 0.00011323911162388233, + "loss": 1.6046483993530274, + "step": 31990 + }, + { + "epoch": 0.09686256110665475, + "grad_norm": 0.1356077790260315, + "learning_rate": 0.00011323531644224492, + "loss": 1.6240571975708007, + "step": 32000 + }, + { + "epoch": 0.09686256110665475, + "eval_loss": 1.6644760370254517, + "eval_runtime": 27.7971, + "eval_samples_per_second": 17.988, + "eval_steps_per_second": 1.151, + "step": 32000 + }, + { + "epoch": 0.09689283065700059, + "grad_norm": 0.13814319670200348, + "learning_rate": 0.00011323152126060754, + "loss": 1.6407741546630858, + "step": 32010 + }, + { + "epoch": 0.09692310020734642, + "grad_norm": 0.17210884392261505, + "learning_rate": 0.00011322772607897013, + "loss": 1.6697254180908203, + "step": 32020 + }, + { + "epoch": 0.09695336975769225, + "grad_norm": 0.14433586597442627, + "learning_rate": 0.00011322393089733275, + "loss": 1.6082523345947266, + "step": 32030 + }, + { + "epoch": 0.09698363930803808, + "grad_norm": 0.15328305959701538, + "learning_rate": 0.00011322013571569536, + "loss": 1.6259685516357423, + "step": 32040 + }, + { + "epoch": 0.09701390885838392, + "grad_norm": 0.14544008672237396, + "learning_rate": 0.00011321634053405796, + "loss": 1.6494461059570313, + "step": 32050 + }, + { + "epoch": 0.09704417840872974, + "grad_norm": 0.16647738218307495, + "learning_rate": 0.00011321254535242057, + "loss": 1.6190174102783204, + "step": 32060 + }, + { + "epoch": 0.09707444795907556, + "grad_norm": 0.15249668061733246, + "learning_rate": 0.00011320875017078318, + "loss": 1.6299530029296876, + "step": 32070 + }, + { + "epoch": 0.0971047175094214, + "grad_norm": 0.1427941918373108, + "learning_rate": 0.00011320495498914578, + "loss": 1.6490856170654298, + "step": 32080 + }, + { + "epoch": 0.09713498705976722, + "grad_norm": 0.13762260973453522, + "learning_rate": 0.00011320115980750839, + "loss": 1.6782562255859375, + "step": 32090 + }, + { + "epoch": 0.09716525661011306, + "grad_norm": 0.14035990834236145, + "learning_rate": 0.000113197364625871, + "loss": 1.6609146118164062, + "step": 32100 + }, + { + "epoch": 0.09719552616045889, + "grad_norm": 0.14312605559825897, + "learning_rate": 0.0001131935694442336, + "loss": 1.6671871185302733, + "step": 32110 + }, + { + "epoch": 0.09722579571080471, + "grad_norm": 0.13490000367164612, + "learning_rate": 0.0001131897742625962, + "loss": 1.68529052734375, + "step": 32120 + }, + { + "epoch": 0.09725606526115055, + "grad_norm": 0.1534557193517685, + "learning_rate": 0.00011318597908095881, + "loss": 1.6573558807373048, + "step": 32130 + }, + { + "epoch": 0.09728633481149637, + "grad_norm": 0.14948412775993347, + "learning_rate": 0.00011318218389932142, + "loss": 1.6198953628540038, + "step": 32140 + }, + { + "epoch": 0.09731660436184221, + "grad_norm": 0.1389797031879425, + "learning_rate": 0.00011317838871768402, + "loss": 1.6569316864013672, + "step": 32150 + }, + { + "epoch": 0.09734687391218803, + "grad_norm": 0.1642504632472992, + "learning_rate": 0.00011317459353604664, + "loss": 1.6589542388916017, + "step": 32160 + }, + { + "epoch": 0.09737714346253386, + "grad_norm": 0.14909787476062775, + "learning_rate": 0.00011317079835440925, + "loss": 1.6696453094482422, + "step": 32170 + }, + { + "epoch": 0.0974074130128797, + "grad_norm": 0.14818817377090454, + "learning_rate": 0.00011316700317277186, + "loss": 1.6758598327636718, + "step": 32180 + }, + { + "epoch": 0.09743768256322552, + "grad_norm": 0.16161929070949554, + "learning_rate": 0.00011316320799113446, + "loss": 1.642378807067871, + "step": 32190 + }, + { + "epoch": 0.09746795211357136, + "grad_norm": 0.16332797706127167, + "learning_rate": 0.00011315941280949707, + "loss": 1.6254766464233399, + "step": 32200 + }, + { + "epoch": 0.09749822166391718, + "grad_norm": 0.1448783576488495, + "learning_rate": 0.00011315561762785967, + "loss": 1.635736846923828, + "step": 32210 + }, + { + "epoch": 0.09752849121426302, + "grad_norm": 0.14296792447566986, + "learning_rate": 0.00011315182244622228, + "loss": 1.6349889755249023, + "step": 32220 + }, + { + "epoch": 0.09755876076460884, + "grad_norm": 0.15217825770378113, + "learning_rate": 0.00011314802726458489, + "loss": 1.577756690979004, + "step": 32230 + }, + { + "epoch": 0.09758903031495467, + "grad_norm": 0.13404831290245056, + "learning_rate": 0.00011314423208294749, + "loss": 1.6367834091186524, + "step": 32240 + }, + { + "epoch": 0.0976192998653005, + "grad_norm": 0.14775794744491577, + "learning_rate": 0.0001131404369013101, + "loss": 1.6853771209716797, + "step": 32250 + }, + { + "epoch": 0.09764956941564633, + "grad_norm": 0.14291925728321075, + "learning_rate": 0.0001131366417196727, + "loss": 1.641876220703125, + "step": 32260 + }, + { + "epoch": 0.09767983896599217, + "grad_norm": 0.15091462433338165, + "learning_rate": 0.00011313284653803531, + "loss": 1.6530033111572267, + "step": 32270 + }, + { + "epoch": 0.09771010851633799, + "grad_norm": 0.1578206568956375, + "learning_rate": 0.00011312905135639793, + "loss": 1.6287445068359374, + "step": 32280 + }, + { + "epoch": 0.09774037806668381, + "grad_norm": 0.16752706468105316, + "learning_rate": 0.00011312525617476052, + "loss": 1.621335220336914, + "step": 32290 + }, + { + "epoch": 0.09777064761702965, + "grad_norm": 0.16161608695983887, + "learning_rate": 0.00011312146099312314, + "loss": 1.6476839065551758, + "step": 32300 + }, + { + "epoch": 0.09780091716737548, + "grad_norm": 0.1475532203912735, + "learning_rate": 0.00011311766581148573, + "loss": 1.6293581008911133, + "step": 32310 + }, + { + "epoch": 0.09783118671772131, + "grad_norm": 0.15494440495967865, + "learning_rate": 0.00011311387062984835, + "loss": 1.6629493713378907, + "step": 32320 + }, + { + "epoch": 0.09786145626806714, + "grad_norm": 0.14763090014457703, + "learning_rate": 0.00011311007544821094, + "loss": 1.6702905654907227, + "step": 32330 + }, + { + "epoch": 0.09789172581841296, + "grad_norm": 0.1633177399635315, + "learning_rate": 0.00011310628026657356, + "loss": 1.6475204467773437, + "step": 32340 + }, + { + "epoch": 0.0979219953687588, + "grad_norm": 0.15533579885959625, + "learning_rate": 0.00011310248508493617, + "loss": 1.6001926422119142, + "step": 32350 + }, + { + "epoch": 0.09795226491910462, + "grad_norm": 0.15487751364707947, + "learning_rate": 0.00011309868990329878, + "loss": 1.6194602966308593, + "step": 32360 + }, + { + "epoch": 0.09798253446945046, + "grad_norm": 0.14388276636600494, + "learning_rate": 0.00011309489472166138, + "loss": 1.6568500518798828, + "step": 32370 + }, + { + "epoch": 0.09801280401979628, + "grad_norm": 0.1625034064054489, + "learning_rate": 0.00011309109954002399, + "loss": 1.6146181106567383, + "step": 32380 + }, + { + "epoch": 0.09804307357014212, + "grad_norm": 0.16075670719146729, + "learning_rate": 0.0001130873043583866, + "loss": 1.6566688537597656, + "step": 32390 + }, + { + "epoch": 0.09807334312048795, + "grad_norm": 0.13318882882595062, + "learning_rate": 0.0001130835091767492, + "loss": 1.6195318222045898, + "step": 32400 + }, + { + "epoch": 0.09810361267083377, + "grad_norm": 0.15017937123775482, + "learning_rate": 0.00011307971399511182, + "loss": 1.657169723510742, + "step": 32410 + }, + { + "epoch": 0.09813388222117961, + "grad_norm": 0.1500580906867981, + "learning_rate": 0.00011307591881347441, + "loss": 1.6331281661987305, + "step": 32420 + }, + { + "epoch": 0.09816415177152543, + "grad_norm": 0.1492200791835785, + "learning_rate": 0.00011307212363183703, + "loss": 1.6676025390625, + "step": 32430 + }, + { + "epoch": 0.09819442132187127, + "grad_norm": 0.14856570959091187, + "learning_rate": 0.00011306832845019962, + "loss": 1.6248735427856444, + "step": 32440 + }, + { + "epoch": 0.0982246908722171, + "grad_norm": 0.16300927102565765, + "learning_rate": 0.00011306453326856224, + "loss": 1.6339124679565429, + "step": 32450 + }, + { + "epoch": 0.09825496042256292, + "grad_norm": 0.1658310443162918, + "learning_rate": 0.00011306073808692484, + "loss": 1.659804916381836, + "step": 32460 + }, + { + "epoch": 0.09828522997290876, + "grad_norm": 0.14427582919597626, + "learning_rate": 0.00011305694290528746, + "loss": 1.6501956939697267, + "step": 32470 + }, + { + "epoch": 0.09831549952325458, + "grad_norm": 0.14729991555213928, + "learning_rate": 0.00011305314772365005, + "loss": 1.6772058486938477, + "step": 32480 + }, + { + "epoch": 0.09834576907360042, + "grad_norm": 0.15078817307949066, + "learning_rate": 0.00011304935254201267, + "loss": 1.6818925857543945, + "step": 32490 + }, + { + "epoch": 0.09837603862394624, + "grad_norm": 0.13095107674598694, + "learning_rate": 0.00011304555736037526, + "loss": 1.6387710571289062, + "step": 32500 + }, + { + "epoch": 0.09837603862394624, + "eval_loss": 1.6638312339782715, + "eval_runtime": 28.0992, + "eval_samples_per_second": 17.794, + "eval_steps_per_second": 1.139, + "step": 32500 + }, + { + "epoch": 0.09840630817429206, + "grad_norm": 0.13295213878154755, + "learning_rate": 0.00011304176217873788, + "loss": 1.6853111267089844, + "step": 32510 + }, + { + "epoch": 0.0984365777246379, + "grad_norm": 0.1426849067211151, + "learning_rate": 0.00011303796699710047, + "loss": 1.6521644592285156, + "step": 32520 + }, + { + "epoch": 0.09846684727498373, + "grad_norm": 0.15024293959140778, + "learning_rate": 0.00011303417181546309, + "loss": 1.685079574584961, + "step": 32530 + }, + { + "epoch": 0.09849711682532956, + "grad_norm": 0.16171111166477203, + "learning_rate": 0.0001130303766338257, + "loss": 1.6945032119750976, + "step": 32540 + }, + { + "epoch": 0.09852738637567539, + "grad_norm": 0.13771672546863556, + "learning_rate": 0.0001130265814521883, + "loss": 1.6158435821533204, + "step": 32550 + }, + { + "epoch": 0.09855765592602121, + "grad_norm": 0.17164428532123566, + "learning_rate": 0.00011302278627055092, + "loss": 1.662203025817871, + "step": 32560 + }, + { + "epoch": 0.09858792547636705, + "grad_norm": 0.1525900512933731, + "learning_rate": 0.00011301899108891351, + "loss": 1.6765764236450196, + "step": 32570 + }, + { + "epoch": 0.09861819502671287, + "grad_norm": 0.13764196634292603, + "learning_rate": 0.00011301519590727613, + "loss": 1.6442283630371093, + "step": 32580 + }, + { + "epoch": 0.09864846457705871, + "grad_norm": 0.16614624857902527, + "learning_rate": 0.00011301140072563873, + "loss": 1.6502960205078125, + "step": 32590 + }, + { + "epoch": 0.09867873412740454, + "grad_norm": 0.1440773457288742, + "learning_rate": 0.00011300760554400135, + "loss": 1.6150894165039062, + "step": 32600 + }, + { + "epoch": 0.09870900367775037, + "grad_norm": 0.15377886593341827, + "learning_rate": 0.00011300381036236394, + "loss": 1.6822383880615235, + "step": 32610 + }, + { + "epoch": 0.0987392732280962, + "grad_norm": 0.1380310207605362, + "learning_rate": 0.00011300001518072656, + "loss": 1.6537616729736329, + "step": 32620 + }, + { + "epoch": 0.09876954277844202, + "grad_norm": 0.1421002447605133, + "learning_rate": 0.00011299621999908915, + "loss": 1.6240589141845703, + "step": 32630 + }, + { + "epoch": 0.09879981232878786, + "grad_norm": 0.14773505926132202, + "learning_rate": 0.00011299242481745177, + "loss": 1.6430408477783203, + "step": 32640 + }, + { + "epoch": 0.09883008187913368, + "grad_norm": 0.15508119761943817, + "learning_rate": 0.00011298862963581436, + "loss": 1.6346975326538087, + "step": 32650 + }, + { + "epoch": 0.09886035142947952, + "grad_norm": 0.149263396859169, + "learning_rate": 0.00011298483445417698, + "loss": 1.6503700256347655, + "step": 32660 + }, + { + "epoch": 0.09889062097982534, + "grad_norm": 0.14137831330299377, + "learning_rate": 0.00011298103927253959, + "loss": 1.6283660888671876, + "step": 32670 + }, + { + "epoch": 0.09892089053017117, + "grad_norm": 0.18203160166740417, + "learning_rate": 0.0001129772440909022, + "loss": 1.7112293243408203, + "step": 32680 + }, + { + "epoch": 0.098951160080517, + "grad_norm": 0.1404915452003479, + "learning_rate": 0.0001129734489092648, + "loss": 1.6714559555053712, + "step": 32690 + }, + { + "epoch": 0.09898142963086283, + "grad_norm": 0.1350311040878296, + "learning_rate": 0.0001129696537276274, + "loss": 1.6421844482421875, + "step": 32700 + }, + { + "epoch": 0.09901169918120867, + "grad_norm": 0.16318748891353607, + "learning_rate": 0.00011296585854599001, + "loss": 1.6347030639648437, + "step": 32710 + }, + { + "epoch": 0.09904196873155449, + "grad_norm": 0.1429361253976822, + "learning_rate": 0.00011296206336435262, + "loss": 1.6654804229736329, + "step": 32720 + }, + { + "epoch": 0.09907223828190032, + "grad_norm": 0.1527228057384491, + "learning_rate": 0.00011295826818271522, + "loss": 1.6682735443115235, + "step": 32730 + }, + { + "epoch": 0.09910250783224615, + "grad_norm": 0.15682189166545868, + "learning_rate": 0.00011295447300107783, + "loss": 1.6495494842529297, + "step": 32740 + }, + { + "epoch": 0.09913277738259198, + "grad_norm": 0.16960158944129944, + "learning_rate": 0.00011295067781944044, + "loss": 1.6617765426635742, + "step": 32750 + }, + { + "epoch": 0.09916304693293782, + "grad_norm": 0.16086351871490479, + "learning_rate": 0.00011294688263780304, + "loss": 1.7018375396728516, + "step": 32760 + }, + { + "epoch": 0.09919331648328364, + "grad_norm": 0.14412496984004974, + "learning_rate": 0.00011294308745616566, + "loss": 1.6522861480712892, + "step": 32770 + }, + { + "epoch": 0.09922358603362948, + "grad_norm": 0.14432594180107117, + "learning_rate": 0.00011293929227452827, + "loss": 1.6640853881835938, + "step": 32780 + }, + { + "epoch": 0.0992538555839753, + "grad_norm": 0.1247229352593422, + "learning_rate": 0.00011293549709289087, + "loss": 1.6271652221679687, + "step": 32790 + }, + { + "epoch": 0.09928412513432112, + "grad_norm": 0.14553803205490112, + "learning_rate": 0.00011293170191125348, + "loss": 1.6263416290283204, + "step": 32800 + }, + { + "epoch": 0.09931439468466696, + "grad_norm": 0.14530903100967407, + "learning_rate": 0.00011292790672961608, + "loss": 1.666787338256836, + "step": 32810 + }, + { + "epoch": 0.09934466423501279, + "grad_norm": 0.12821631133556366, + "learning_rate": 0.00011292411154797869, + "loss": 1.6446578979492188, + "step": 32820 + }, + { + "epoch": 0.09937493378535862, + "grad_norm": 0.13252541422843933, + "learning_rate": 0.0001129203163663413, + "loss": 1.673870849609375, + "step": 32830 + }, + { + "epoch": 0.09940520333570445, + "grad_norm": 0.13832108676433563, + "learning_rate": 0.0001129165211847039, + "loss": 1.6403644561767579, + "step": 32840 + }, + { + "epoch": 0.09943547288605027, + "grad_norm": 0.1612962782382965, + "learning_rate": 0.00011291272600306651, + "loss": 1.6449600219726563, + "step": 32850 + }, + { + "epoch": 0.09946574243639611, + "grad_norm": 0.1583915799856186, + "learning_rate": 0.00011290893082142911, + "loss": 1.665889358520508, + "step": 32860 + }, + { + "epoch": 0.09949601198674193, + "grad_norm": 0.1429886370897293, + "learning_rate": 0.00011290513563979172, + "loss": 1.621544647216797, + "step": 32870 + }, + { + "epoch": 0.09952628153708777, + "grad_norm": 0.15624772012233734, + "learning_rate": 0.00011290134045815433, + "loss": 1.653326416015625, + "step": 32880 + }, + { + "epoch": 0.0995565510874336, + "grad_norm": 0.15949057042598724, + "learning_rate": 0.00011289754527651693, + "loss": 1.6352134704589845, + "step": 32890 + }, + { + "epoch": 0.09958682063777942, + "grad_norm": 0.15595969557762146, + "learning_rate": 0.00011289375009487954, + "loss": 1.5876729011535644, + "step": 32900 + }, + { + "epoch": 0.09961709018812526, + "grad_norm": 0.15940599143505096, + "learning_rate": 0.00011288995491324216, + "loss": 1.651295280456543, + "step": 32910 + }, + { + "epoch": 0.09964735973847108, + "grad_norm": 0.1679789274930954, + "learning_rate": 0.00011288615973160475, + "loss": 1.6803205490112305, + "step": 32920 + }, + { + "epoch": 0.09967762928881692, + "grad_norm": 0.15789705514907837, + "learning_rate": 0.00011288236454996737, + "loss": 1.6346660614013673, + "step": 32930 + }, + { + "epoch": 0.09970789883916274, + "grad_norm": 0.1549530327320099, + "learning_rate": 0.00011287856936832996, + "loss": 1.6637939453125, + "step": 32940 + }, + { + "epoch": 0.09973816838950858, + "grad_norm": 0.1505516618490219, + "learning_rate": 0.00011287477418669258, + "loss": 1.6018674850463868, + "step": 32950 + }, + { + "epoch": 0.0997684379398544, + "grad_norm": 0.15666164457798004, + "learning_rate": 0.00011287097900505519, + "loss": 1.6599124908447265, + "step": 32960 + }, + { + "epoch": 0.09979870749020023, + "grad_norm": 0.1460713893175125, + "learning_rate": 0.0001128671838234178, + "loss": 1.6695497512817383, + "step": 32970 + }, + { + "epoch": 0.09982897704054607, + "grad_norm": 0.1508805751800537, + "learning_rate": 0.0001128633886417804, + "loss": 1.6434751510620118, + "step": 32980 + }, + { + "epoch": 0.09985924659089189, + "grad_norm": 0.13779982924461365, + "learning_rate": 0.000112859593460143, + "loss": 1.63902587890625, + "step": 32990 + }, + { + "epoch": 0.09988951614123773, + "grad_norm": 0.13959775865077972, + "learning_rate": 0.00011285579827850561, + "loss": 1.6975963592529297, + "step": 33000 + }, + { + "epoch": 0.09988951614123773, + "eval_loss": 1.6496213674545288, + "eval_runtime": 28.3724, + "eval_samples_per_second": 17.623, + "eval_steps_per_second": 1.128, + "step": 33000 + }, + { + "epoch": 0.09991978569158355, + "grad_norm": 0.14988788962364197, + "learning_rate": 0.00011285200309686822, + "loss": 1.6107397079467773, + "step": 33010 + }, + { + "epoch": 0.09995005524192938, + "grad_norm": 0.14635968208312988, + "learning_rate": 0.00011284820791523084, + "loss": 1.6716802597045899, + "step": 33020 + }, + { + "epoch": 0.09998032479227521, + "grad_norm": 0.14350591599941254, + "learning_rate": 0.00011284441273359343, + "loss": 1.6558696746826171, + "step": 33030 + }, + { + "epoch": 0.10001059434262104, + "grad_norm": 0.1429956555366516, + "learning_rate": 0.00011284061755195605, + "loss": 1.650408935546875, + "step": 33040 + }, + { + "epoch": 0.10004086389296687, + "grad_norm": 0.1632968634366989, + "learning_rate": 0.00011283682237031864, + "loss": 1.6453571319580078, + "step": 33050 + }, + { + "epoch": 0.1000711334433127, + "grad_norm": 0.14867828786373138, + "learning_rate": 0.00011283302718868126, + "loss": 1.6442142486572267, + "step": 33060 + }, + { + "epoch": 0.10010140299365852, + "grad_norm": 0.1575733721256256, + "learning_rate": 0.00011282923200704385, + "loss": 1.6387973785400392, + "step": 33070 + }, + { + "epoch": 0.10013167254400436, + "grad_norm": 0.14436446130275726, + "learning_rate": 0.00011282543682540647, + "loss": 1.6598331451416015, + "step": 33080 + }, + { + "epoch": 0.10016194209435018, + "grad_norm": 0.1541822999715805, + "learning_rate": 0.00011282164164376906, + "loss": 1.632723045349121, + "step": 33090 + }, + { + "epoch": 0.10019221164469602, + "grad_norm": 0.1440499871969223, + "learning_rate": 0.00011281784646213168, + "loss": 1.6497711181640624, + "step": 33100 + }, + { + "epoch": 0.10022248119504185, + "grad_norm": 0.1462579220533371, + "learning_rate": 0.00011281405128049428, + "loss": 1.6143001556396483, + "step": 33110 + }, + { + "epoch": 0.10025275074538768, + "grad_norm": 0.14033083617687225, + "learning_rate": 0.0001128102560988569, + "loss": 1.6558195114135743, + "step": 33120 + }, + { + "epoch": 0.10028302029573351, + "grad_norm": 0.14095108211040497, + "learning_rate": 0.00011280646091721949, + "loss": 1.6802659988403321, + "step": 33130 + }, + { + "epoch": 0.10031328984607933, + "grad_norm": 0.1474648267030716, + "learning_rate": 0.00011280266573558211, + "loss": 1.6546789169311524, + "step": 33140 + }, + { + "epoch": 0.10034355939642517, + "grad_norm": 0.13982339203357697, + "learning_rate": 0.00011279887055394471, + "loss": 1.636746597290039, + "step": 33150 + }, + { + "epoch": 0.100373828946771, + "grad_norm": 0.13630065321922302, + "learning_rate": 0.00011279507537230732, + "loss": 1.6224472045898437, + "step": 33160 + }, + { + "epoch": 0.10040409849711683, + "grad_norm": 0.1395498663187027, + "learning_rate": 0.00011279128019066994, + "loss": 1.6814098358154297, + "step": 33170 + }, + { + "epoch": 0.10043436804746265, + "grad_norm": 0.13390521705150604, + "learning_rate": 0.00011278748500903253, + "loss": 1.6907285690307616, + "step": 33180 + }, + { + "epoch": 0.10046463759780848, + "grad_norm": 0.16124644875526428, + "learning_rate": 0.00011278368982739515, + "loss": 1.6485584259033204, + "step": 33190 + }, + { + "epoch": 0.10049490714815432, + "grad_norm": 0.1351238340139389, + "learning_rate": 0.00011277989464575774, + "loss": 1.6657773971557617, + "step": 33200 + }, + { + "epoch": 0.10052517669850014, + "grad_norm": 0.14045622944831848, + "learning_rate": 0.00011277609946412036, + "loss": 1.6496902465820313, + "step": 33210 + }, + { + "epoch": 0.10055544624884598, + "grad_norm": 0.14977528154850006, + "learning_rate": 0.00011277230428248296, + "loss": 1.6703054428100585, + "step": 33220 + }, + { + "epoch": 0.1005857157991918, + "grad_norm": 0.14727549254894257, + "learning_rate": 0.00011276850910084558, + "loss": 1.6536670684814454, + "step": 33230 + }, + { + "epoch": 0.10061598534953763, + "grad_norm": 0.147766575217247, + "learning_rate": 0.00011276471391920817, + "loss": 1.620859146118164, + "step": 33240 + }, + { + "epoch": 0.10064625489988346, + "grad_norm": 0.13336923718452454, + "learning_rate": 0.00011276091873757079, + "loss": 1.621772003173828, + "step": 33250 + }, + { + "epoch": 0.10067652445022929, + "grad_norm": 0.14144623279571533, + "learning_rate": 0.00011275712355593338, + "loss": 1.6826723098754883, + "step": 33260 + }, + { + "epoch": 0.10070679400057513, + "grad_norm": 0.1405179500579834, + "learning_rate": 0.000112753328374296, + "loss": 1.6476499557495117, + "step": 33270 + }, + { + "epoch": 0.10073706355092095, + "grad_norm": 0.12423109263181686, + "learning_rate": 0.0001127495331926586, + "loss": 1.6501726150512694, + "step": 33280 + }, + { + "epoch": 0.10076733310126679, + "grad_norm": 0.13140976428985596, + "learning_rate": 0.00011274573801102121, + "loss": 1.6611442565917969, + "step": 33290 + }, + { + "epoch": 0.10079760265161261, + "grad_norm": 0.1483583003282547, + "learning_rate": 0.00011274194282938382, + "loss": 1.6440694808959961, + "step": 33300 + }, + { + "epoch": 0.10082787220195844, + "grad_norm": 0.1521255075931549, + "learning_rate": 0.00011273814764774642, + "loss": 1.6488414764404298, + "step": 33310 + }, + { + "epoch": 0.10085814175230427, + "grad_norm": 0.15569362044334412, + "learning_rate": 0.00011273435246610903, + "loss": 1.6727865219116211, + "step": 33320 + }, + { + "epoch": 0.1008884113026501, + "grad_norm": 0.1502412110567093, + "learning_rate": 0.00011273055728447163, + "loss": 1.6472232818603516, + "step": 33330 + }, + { + "epoch": 0.10091868085299593, + "grad_norm": 0.14088937640190125, + "learning_rate": 0.00011272676210283424, + "loss": 1.6667377471923828, + "step": 33340 + }, + { + "epoch": 0.10094895040334176, + "grad_norm": 0.129018172621727, + "learning_rate": 0.00011272296692119685, + "loss": 1.6505090713500976, + "step": 33350 + }, + { + "epoch": 0.10097921995368758, + "grad_norm": 0.15279455482959747, + "learning_rate": 0.00011271917173955947, + "loss": 1.6819101333618165, + "step": 33360 + }, + { + "epoch": 0.10100948950403342, + "grad_norm": 0.14125214517116547, + "learning_rate": 0.00011271537655792206, + "loss": 1.643756103515625, + "step": 33370 + }, + { + "epoch": 0.10103975905437924, + "grad_norm": 0.14292864501476288, + "learning_rate": 0.00011271158137628468, + "loss": 1.6402742385864257, + "step": 33380 + }, + { + "epoch": 0.10107002860472508, + "grad_norm": 0.1472243070602417, + "learning_rate": 0.00011270778619464728, + "loss": 1.6660995483398438, + "step": 33390 + }, + { + "epoch": 0.1011002981550709, + "grad_norm": 0.15228337049484253, + "learning_rate": 0.00011270399101300989, + "loss": 1.669980239868164, + "step": 33400 + }, + { + "epoch": 0.10113056770541673, + "grad_norm": 0.14379017055034637, + "learning_rate": 0.0001127001958313725, + "loss": 1.6423410415649413, + "step": 33410 + }, + { + "epoch": 0.10116083725576257, + "grad_norm": 0.13682392239570618, + "learning_rate": 0.0001126964006497351, + "loss": 1.6678691864013673, + "step": 33420 + }, + { + "epoch": 0.10119110680610839, + "grad_norm": 0.1642504632472992, + "learning_rate": 0.00011269260546809771, + "loss": 1.613103485107422, + "step": 33430 + }, + { + "epoch": 0.10122137635645423, + "grad_norm": 0.13689130544662476, + "learning_rate": 0.00011268881028646031, + "loss": 1.661156463623047, + "step": 33440 + }, + { + "epoch": 0.10125164590680005, + "grad_norm": 0.15552091598510742, + "learning_rate": 0.00011268501510482292, + "loss": 1.6543052673339844, + "step": 33450 + }, + { + "epoch": 0.10128191545714589, + "grad_norm": 0.12631374597549438, + "learning_rate": 0.00011268121992318553, + "loss": 1.628596305847168, + "step": 33460 + }, + { + "epoch": 0.10131218500749171, + "grad_norm": 0.13470755517482758, + "learning_rate": 0.00011267742474154813, + "loss": 1.6668428421020507, + "step": 33470 + }, + { + "epoch": 0.10134245455783754, + "grad_norm": 0.14132510125637054, + "learning_rate": 0.00011267362955991074, + "loss": 1.6482357025146483, + "step": 33480 + }, + { + "epoch": 0.10137272410818338, + "grad_norm": 0.16826684772968292, + "learning_rate": 0.00011266983437827334, + "loss": 1.6333248138427734, + "step": 33490 + }, + { + "epoch": 0.1014029936585292, + "grad_norm": 0.14785297214984894, + "learning_rate": 0.00011266603919663595, + "loss": 1.675290298461914, + "step": 33500 + }, + { + "epoch": 0.1014029936585292, + "eval_loss": 1.6165785789489746, + "eval_runtime": 27.9936, + "eval_samples_per_second": 17.861, + "eval_steps_per_second": 1.143, + "step": 33500 + }, + { + "epoch": 0.10143326320887504, + "grad_norm": 0.1355317085981369, + "learning_rate": 0.00011266224401499856, + "loss": 1.6223392486572266, + "step": 33510 + }, + { + "epoch": 0.10146353275922086, + "grad_norm": 0.15240295231342316, + "learning_rate": 0.00011265844883336117, + "loss": 1.6793247222900392, + "step": 33520 + }, + { + "epoch": 0.10149380230956669, + "grad_norm": 0.14629164338111877, + "learning_rate": 0.00011265465365172377, + "loss": 1.6843425750732421, + "step": 33530 + }, + { + "epoch": 0.10152407185991252, + "grad_norm": 0.1477922648191452, + "learning_rate": 0.00011265085847008639, + "loss": 1.6311811447143554, + "step": 33540 + }, + { + "epoch": 0.10155434141025835, + "grad_norm": 0.1412120759487152, + "learning_rate": 0.00011264706328844898, + "loss": 1.6934865951538085, + "step": 33550 + }, + { + "epoch": 0.10158461096060419, + "grad_norm": 0.15532732009887695, + "learning_rate": 0.0001126432681068116, + "loss": 1.6105924606323243, + "step": 33560 + }, + { + "epoch": 0.10161488051095001, + "grad_norm": 0.14730095863342285, + "learning_rate": 0.0001126394729251742, + "loss": 1.667079544067383, + "step": 33570 + }, + { + "epoch": 0.10164515006129583, + "grad_norm": 0.165104940533638, + "learning_rate": 0.00011263567774353681, + "loss": 1.6688861846923828, + "step": 33580 + }, + { + "epoch": 0.10167541961164167, + "grad_norm": 0.152533158659935, + "learning_rate": 0.00011263188256189942, + "loss": 1.6622447967529297, + "step": 33590 + }, + { + "epoch": 0.1017056891619875, + "grad_norm": 0.13495847582817078, + "learning_rate": 0.00011262808738026202, + "loss": 1.678451919555664, + "step": 33600 + }, + { + "epoch": 0.10173595871233333, + "grad_norm": 0.14996862411499023, + "learning_rate": 0.00011262429219862463, + "loss": 1.6451942443847656, + "step": 33610 + }, + { + "epoch": 0.10176622826267916, + "grad_norm": 0.13275054097175598, + "learning_rate": 0.00011262049701698723, + "loss": 1.6278493881225586, + "step": 33620 + }, + { + "epoch": 0.101796497813025, + "grad_norm": 0.15542958676815033, + "learning_rate": 0.00011261670183534984, + "loss": 1.6448348999023437, + "step": 33630 + }, + { + "epoch": 0.10182676736337082, + "grad_norm": 0.14988061785697937, + "learning_rate": 0.00011261290665371245, + "loss": 1.6315670013427734, + "step": 33640 + }, + { + "epoch": 0.10185703691371664, + "grad_norm": 0.14415301382541656, + "learning_rate": 0.00011260911147207507, + "loss": 1.656670379638672, + "step": 33650 + }, + { + "epoch": 0.10188730646406248, + "grad_norm": 0.1253730207681656, + "learning_rate": 0.00011260531629043766, + "loss": 1.6611217498779296, + "step": 33660 + }, + { + "epoch": 0.1019175760144083, + "grad_norm": 0.16017617285251617, + "learning_rate": 0.00011260152110880028, + "loss": 1.6613752365112304, + "step": 33670 + }, + { + "epoch": 0.10194784556475414, + "grad_norm": 0.16172389686107635, + "learning_rate": 0.00011259772592716287, + "loss": 1.639653205871582, + "step": 33680 + }, + { + "epoch": 0.10197811511509997, + "grad_norm": 0.13327035307884216, + "learning_rate": 0.00011259393074552549, + "loss": 1.6382701873779297, + "step": 33690 + }, + { + "epoch": 0.10200838466544579, + "grad_norm": 0.15133629739284515, + "learning_rate": 0.00011259013556388808, + "loss": 1.6542039871215821, + "step": 33700 + }, + { + "epoch": 0.10203865421579163, + "grad_norm": 0.156050443649292, + "learning_rate": 0.0001125863403822507, + "loss": 1.6220390319824218, + "step": 33710 + }, + { + "epoch": 0.10206892376613745, + "grad_norm": 0.14606349170207977, + "learning_rate": 0.0001125825452006133, + "loss": 1.6640806198120117, + "step": 33720 + }, + { + "epoch": 0.10209919331648329, + "grad_norm": 0.15416237711906433, + "learning_rate": 0.00011257875001897591, + "loss": 1.6203706741333008, + "step": 33730 + }, + { + "epoch": 0.10212946286682911, + "grad_norm": 0.13946393132209778, + "learning_rate": 0.0001125749548373385, + "loss": 1.6347793579101562, + "step": 33740 + }, + { + "epoch": 0.10215973241717494, + "grad_norm": 0.14403413236141205, + "learning_rate": 0.00011257115965570113, + "loss": 1.5941282272338868, + "step": 33750 + }, + { + "epoch": 0.10219000196752077, + "grad_norm": 0.14233645796775818, + "learning_rate": 0.00011256736447406372, + "loss": 1.642073631286621, + "step": 33760 + }, + { + "epoch": 0.1022202715178666, + "grad_norm": 0.13848385214805603, + "learning_rate": 0.00011256356929242634, + "loss": 1.6681388854980468, + "step": 33770 + }, + { + "epoch": 0.10225054106821244, + "grad_norm": 0.16350045800209045, + "learning_rate": 0.00011255977411078896, + "loss": 1.675099754333496, + "step": 33780 + }, + { + "epoch": 0.10228081061855826, + "grad_norm": 0.16760244965553284, + "learning_rate": 0.00011255597892915155, + "loss": 1.649832534790039, + "step": 33790 + }, + { + "epoch": 0.1023110801689041, + "grad_norm": 0.1589493751525879, + "learning_rate": 0.00011255218374751417, + "loss": 1.6208192825317382, + "step": 33800 + }, + { + "epoch": 0.10234134971924992, + "grad_norm": 0.13402503728866577, + "learning_rate": 0.00011254838856587676, + "loss": 1.6712154388427733, + "step": 33810 + }, + { + "epoch": 0.10237161926959575, + "grad_norm": 0.1641789972782135, + "learning_rate": 0.00011254459338423938, + "loss": 1.6346534729003905, + "step": 33820 + }, + { + "epoch": 0.10240188881994158, + "grad_norm": 0.14086487889289856, + "learning_rate": 0.00011254079820260197, + "loss": 1.6263650894165038, + "step": 33830 + }, + { + "epoch": 0.10243215837028741, + "grad_norm": 0.15330654382705688, + "learning_rate": 0.00011253700302096459, + "loss": 1.5996614456176759, + "step": 33840 + }, + { + "epoch": 0.10246242792063325, + "grad_norm": 0.1589469313621521, + "learning_rate": 0.00011253320783932718, + "loss": 1.5994930267333984, + "step": 33850 + }, + { + "epoch": 0.10249269747097907, + "grad_norm": 0.15329977869987488, + "learning_rate": 0.0001125294126576898, + "loss": 1.6357940673828124, + "step": 33860 + }, + { + "epoch": 0.10252296702132489, + "grad_norm": 0.1395512819290161, + "learning_rate": 0.0001125256174760524, + "loss": 1.686207389831543, + "step": 33870 + }, + { + "epoch": 0.10255323657167073, + "grad_norm": 0.14248572289943695, + "learning_rate": 0.00011252182229441502, + "loss": 1.6169387817382812, + "step": 33880 + }, + { + "epoch": 0.10258350612201655, + "grad_norm": 0.13209542632102966, + "learning_rate": 0.00011251802711277762, + "loss": 1.6105670928955078, + "step": 33890 + }, + { + "epoch": 0.10261377567236239, + "grad_norm": 0.1453617960214615, + "learning_rate": 0.00011251423193114023, + "loss": 1.6375274658203125, + "step": 33900 + }, + { + "epoch": 0.10264404522270822, + "grad_norm": 0.1567075103521347, + "learning_rate": 0.00011251043674950283, + "loss": 1.6550331115722656, + "step": 33910 + }, + { + "epoch": 0.10267431477305404, + "grad_norm": 0.15657980740070343, + "learning_rate": 0.00011250664156786544, + "loss": 1.6373722076416015, + "step": 33920 + }, + { + "epoch": 0.10270458432339988, + "grad_norm": 0.15417805314064026, + "learning_rate": 0.00011250284638622805, + "loss": 1.6552650451660156, + "step": 33930 + }, + { + "epoch": 0.1027348538737457, + "grad_norm": 0.1630830466747284, + "learning_rate": 0.00011249905120459065, + "loss": 1.6271602630615234, + "step": 33940 + }, + { + "epoch": 0.10276512342409154, + "grad_norm": 0.14922095835208893, + "learning_rate": 0.00011249525602295326, + "loss": 1.68267879486084, + "step": 33950 + }, + { + "epoch": 0.10279539297443736, + "grad_norm": 0.15904362499713898, + "learning_rate": 0.00011249146084131586, + "loss": 1.6682491302490234, + "step": 33960 + }, + { + "epoch": 0.1028256625247832, + "grad_norm": 0.1484304666519165, + "learning_rate": 0.00011248766565967848, + "loss": 1.6285720825195313, + "step": 33970 + }, + { + "epoch": 0.10285593207512903, + "grad_norm": 0.1564205288887024, + "learning_rate": 0.00011248387047804108, + "loss": 1.6517223358154296, + "step": 33980 + }, + { + "epoch": 0.10288620162547485, + "grad_norm": 0.13318133354187012, + "learning_rate": 0.0001124800752964037, + "loss": 1.6584196090698242, + "step": 33990 + }, + { + "epoch": 0.10291647117582069, + "grad_norm": 0.14513911306858063, + "learning_rate": 0.00011247628011476629, + "loss": 1.6202899932861328, + "step": 34000 + }, + { + "epoch": 0.10291647117582069, + "eval_loss": 1.6609654426574707, + "eval_runtime": 28.1628, + "eval_samples_per_second": 17.754, + "eval_steps_per_second": 1.136, + "step": 34000 + }, + { + "epoch": 0.10294674072616651, + "grad_norm": 0.1372794210910797, + "learning_rate": 0.00011247248493312891, + "loss": 1.7034835815429688, + "step": 34010 + }, + { + "epoch": 0.10297701027651235, + "grad_norm": 0.1618950366973877, + "learning_rate": 0.00011246868975149151, + "loss": 1.6268260955810547, + "step": 34020 + }, + { + "epoch": 0.10300727982685817, + "grad_norm": 0.1475875973701477, + "learning_rate": 0.00011246489456985412, + "loss": 1.6004199981689453, + "step": 34030 + }, + { + "epoch": 0.103037549377204, + "grad_norm": 0.13756680488586426, + "learning_rate": 0.00011246109938821672, + "loss": 1.5834754943847655, + "step": 34040 + }, + { + "epoch": 0.10306781892754983, + "grad_norm": 0.1620245724916458, + "learning_rate": 0.00011245730420657933, + "loss": 1.6483478546142578, + "step": 34050 + }, + { + "epoch": 0.10309808847789566, + "grad_norm": 0.147023543715477, + "learning_rate": 0.00011245350902494194, + "loss": 1.6259109497070312, + "step": 34060 + }, + { + "epoch": 0.1031283580282415, + "grad_norm": 0.16006888449192047, + "learning_rate": 0.00011244971384330454, + "loss": 1.6494525909423827, + "step": 34070 + }, + { + "epoch": 0.10315862757858732, + "grad_norm": 0.17000263929367065, + "learning_rate": 0.00011244591866166715, + "loss": 1.631841278076172, + "step": 34080 + }, + { + "epoch": 0.10318889712893314, + "grad_norm": 0.13839901983737946, + "learning_rate": 0.00011244212348002975, + "loss": 1.7318130493164063, + "step": 34090 + }, + { + "epoch": 0.10321916667927898, + "grad_norm": 0.17113952338695526, + "learning_rate": 0.00011243832829839236, + "loss": 1.650693130493164, + "step": 34100 + }, + { + "epoch": 0.1032494362296248, + "grad_norm": 0.1405460387468338, + "learning_rate": 0.00011243453311675497, + "loss": 1.6572120666503907, + "step": 34110 + }, + { + "epoch": 0.10327970577997064, + "grad_norm": 0.13722379505634308, + "learning_rate": 0.00011243073793511757, + "loss": 1.626579475402832, + "step": 34120 + }, + { + "epoch": 0.10330997533031647, + "grad_norm": 0.15137295424938202, + "learning_rate": 0.00011242694275348019, + "loss": 1.6158910751342774, + "step": 34130 + }, + { + "epoch": 0.10334024488066229, + "grad_norm": 0.14891129732131958, + "learning_rate": 0.00011242314757184278, + "loss": 1.642736053466797, + "step": 34140 + }, + { + "epoch": 0.10337051443100813, + "grad_norm": 0.14477461576461792, + "learning_rate": 0.0001124193523902054, + "loss": 1.6200820922851562, + "step": 34150 + }, + { + "epoch": 0.10340078398135395, + "grad_norm": 0.13855907320976257, + "learning_rate": 0.000112415557208568, + "loss": 1.6524658203125, + "step": 34160 + }, + { + "epoch": 0.10343105353169979, + "grad_norm": 0.14841605722904205, + "learning_rate": 0.00011241176202693062, + "loss": 1.6194444656372071, + "step": 34170 + }, + { + "epoch": 0.10346132308204561, + "grad_norm": 0.1417478621006012, + "learning_rate": 0.00011240796684529322, + "loss": 1.65173397064209, + "step": 34180 + }, + { + "epoch": 0.10349159263239145, + "grad_norm": 0.1477864384651184, + "learning_rate": 0.00011240417166365583, + "loss": 1.6421310424804687, + "step": 34190 + }, + { + "epoch": 0.10352186218273728, + "grad_norm": 0.13454709947109222, + "learning_rate": 0.00011240037648201843, + "loss": 1.6421649932861329, + "step": 34200 + }, + { + "epoch": 0.1035521317330831, + "grad_norm": 0.15609627962112427, + "learning_rate": 0.00011239658130038104, + "loss": 1.6288883209228515, + "step": 34210 + }, + { + "epoch": 0.10358240128342894, + "grad_norm": 0.13469356298446655, + "learning_rate": 0.00011239278611874365, + "loss": 1.6497783660888672, + "step": 34220 + }, + { + "epoch": 0.10361267083377476, + "grad_norm": 0.1519378274679184, + "learning_rate": 0.00011238899093710625, + "loss": 1.6330379486083983, + "step": 34230 + }, + { + "epoch": 0.1036429403841206, + "grad_norm": 0.1630910485982895, + "learning_rate": 0.00011238519575546886, + "loss": 1.617498016357422, + "step": 34240 + }, + { + "epoch": 0.10367320993446642, + "grad_norm": 0.15124274790287018, + "learning_rate": 0.00011238140057383146, + "loss": 1.6663482666015625, + "step": 34250 + }, + { + "epoch": 0.10370347948481225, + "grad_norm": 0.17757853865623474, + "learning_rate": 0.00011237760539219408, + "loss": 1.666414451599121, + "step": 34260 + }, + { + "epoch": 0.10373374903515808, + "grad_norm": 0.1743311583995819, + "learning_rate": 0.00011237381021055668, + "loss": 1.6320171356201172, + "step": 34270 + }, + { + "epoch": 0.10376401858550391, + "grad_norm": 0.13806802034378052, + "learning_rate": 0.0001123700150289193, + "loss": 1.6335693359375, + "step": 34280 + }, + { + "epoch": 0.10379428813584975, + "grad_norm": 0.1397782415151596, + "learning_rate": 0.00011236621984728189, + "loss": 1.6541587829589843, + "step": 34290 + }, + { + "epoch": 0.10382455768619557, + "grad_norm": 0.13900701701641083, + "learning_rate": 0.0001123624246656445, + "loss": 1.627831268310547, + "step": 34300 + }, + { + "epoch": 0.1038548272365414, + "grad_norm": 0.1467514932155609, + "learning_rate": 0.0001123586294840071, + "loss": 1.6497303009033204, + "step": 34310 + }, + { + "epoch": 0.10388509678688723, + "grad_norm": 0.16348640620708466, + "learning_rate": 0.00011235483430236972, + "loss": 1.6480812072753905, + "step": 34320 + }, + { + "epoch": 0.10391536633723306, + "grad_norm": 0.13982878625392914, + "learning_rate": 0.00011235103912073231, + "loss": 1.6396358489990235, + "step": 34330 + }, + { + "epoch": 0.1039456358875789, + "grad_norm": 0.1404147893190384, + "learning_rate": 0.00011234724393909493, + "loss": 1.6170974731445313, + "step": 34340 + }, + { + "epoch": 0.10397590543792472, + "grad_norm": 0.16558745503425598, + "learning_rate": 0.00011234344875745752, + "loss": 1.625259017944336, + "step": 34350 + }, + { + "epoch": 0.10400617498827056, + "grad_norm": 0.1535813957452774, + "learning_rate": 0.00011233965357582014, + "loss": 1.6091642379760742, + "step": 34360 + }, + { + "epoch": 0.10403644453861638, + "grad_norm": 0.15539468824863434, + "learning_rate": 0.00011233585839418273, + "loss": 1.647437286376953, + "step": 34370 + }, + { + "epoch": 0.1040667140889622, + "grad_norm": 0.1480962187051773, + "learning_rate": 0.00011233206321254535, + "loss": 1.6569091796875, + "step": 34380 + }, + { + "epoch": 0.10409698363930804, + "grad_norm": 0.13368919491767883, + "learning_rate": 0.00011232826803090797, + "loss": 1.631005096435547, + "step": 34390 + }, + { + "epoch": 0.10412725318965387, + "grad_norm": 0.14502264559268951, + "learning_rate": 0.00011232447284927057, + "loss": 1.6610177993774413, + "step": 34400 + }, + { + "epoch": 0.1041575227399997, + "grad_norm": 0.13721247017383575, + "learning_rate": 0.00011232067766763319, + "loss": 1.6154422760009766, + "step": 34410 + }, + { + "epoch": 0.10418779229034553, + "grad_norm": 0.1649245321750641, + "learning_rate": 0.00011231688248599578, + "loss": 1.6340625762939454, + "step": 34420 + }, + { + "epoch": 0.10421806184069135, + "grad_norm": 0.1419145166873932, + "learning_rate": 0.0001123130873043584, + "loss": 1.655128288269043, + "step": 34430 + }, + { + "epoch": 0.10424833139103719, + "grad_norm": 0.14302468299865723, + "learning_rate": 0.00011230929212272099, + "loss": 1.6130001068115234, + "step": 34440 + }, + { + "epoch": 0.10427860094138301, + "grad_norm": 0.1405562460422516, + "learning_rate": 0.00011230549694108361, + "loss": 1.6425468444824218, + "step": 34450 + }, + { + "epoch": 0.10430887049172885, + "grad_norm": 0.13255485892295837, + "learning_rate": 0.0001123017017594462, + "loss": 1.6349407196044923, + "step": 34460 + }, + { + "epoch": 0.10433914004207467, + "grad_norm": 0.1411626636981964, + "learning_rate": 0.00011229790657780882, + "loss": 1.6471118927001953, + "step": 34470 + }, + { + "epoch": 0.1043694095924205, + "grad_norm": 0.16704700887203217, + "learning_rate": 0.00011229411139617141, + "loss": 1.6339126586914063, + "step": 34480 + }, + { + "epoch": 0.10439967914276634, + "grad_norm": 0.15002596378326416, + "learning_rate": 0.00011229031621453403, + "loss": 1.6143661499023438, + "step": 34490 + }, + { + "epoch": 0.10442994869311216, + "grad_norm": 0.16791574656963348, + "learning_rate": 0.00011228652103289664, + "loss": 1.650190544128418, + "step": 34500 + }, + { + "epoch": 0.10442994869311216, + "eval_loss": 1.6393214464187622, + "eval_runtime": 28.084, + "eval_samples_per_second": 17.804, + "eval_steps_per_second": 1.139, + "step": 34500 + }, + { + "epoch": 0.104460218243458, + "grad_norm": 0.13759469985961914, + "learning_rate": 0.00011228272585125925, + "loss": 1.645674705505371, + "step": 34510 + }, + { + "epoch": 0.10449048779380382, + "grad_norm": 0.13670125603675842, + "learning_rate": 0.00011227893066962185, + "loss": 1.6299467086791992, + "step": 34520 + }, + { + "epoch": 0.10452075734414966, + "grad_norm": 0.16297794878482819, + "learning_rate": 0.00011227513548798446, + "loss": 1.6172382354736328, + "step": 34530 + }, + { + "epoch": 0.10455102689449548, + "grad_norm": 0.14612828195095062, + "learning_rate": 0.00011227134030634706, + "loss": 1.6297683715820312, + "step": 34540 + }, + { + "epoch": 0.1045812964448413, + "grad_norm": 0.1557205766439438, + "learning_rate": 0.00011226754512470967, + "loss": 1.669368362426758, + "step": 34550 + }, + { + "epoch": 0.10461156599518714, + "grad_norm": 0.15277674794197083, + "learning_rate": 0.00011226374994307227, + "loss": 1.6750930786132812, + "step": 34560 + }, + { + "epoch": 0.10464183554553297, + "grad_norm": 0.14282189309597015, + "learning_rate": 0.00011225995476143488, + "loss": 1.6614002227783202, + "step": 34570 + }, + { + "epoch": 0.1046721050958788, + "grad_norm": 0.14483417570590973, + "learning_rate": 0.0001122561595797975, + "loss": 1.640038299560547, + "step": 34580 + }, + { + "epoch": 0.10470237464622463, + "grad_norm": 0.1467052400112152, + "learning_rate": 0.00011225236439816009, + "loss": 1.6458118438720704, + "step": 34590 + }, + { + "epoch": 0.10473264419657045, + "grad_norm": 0.15061888098716736, + "learning_rate": 0.00011224856921652271, + "loss": 1.6555757522583008, + "step": 34600 + }, + { + "epoch": 0.10476291374691629, + "grad_norm": 0.16176658868789673, + "learning_rate": 0.0001122447740348853, + "loss": 1.6243545532226562, + "step": 34610 + }, + { + "epoch": 0.10479318329726212, + "grad_norm": 0.1323048174381256, + "learning_rate": 0.00011224097885324792, + "loss": 1.6178741455078125, + "step": 34620 + }, + { + "epoch": 0.10482345284760795, + "grad_norm": 0.13621766865253448, + "learning_rate": 0.00011223718367161053, + "loss": 1.6344877243041993, + "step": 34630 + }, + { + "epoch": 0.10485372239795378, + "grad_norm": 0.14234685897827148, + "learning_rate": 0.00011223338848997314, + "loss": 1.621432113647461, + "step": 34640 + }, + { + "epoch": 0.1048839919482996, + "grad_norm": 0.14248411357402802, + "learning_rate": 0.00011222959330833574, + "loss": 1.6186050415039062, + "step": 34650 + }, + { + "epoch": 0.10491426149864544, + "grad_norm": 0.1546676605939865, + "learning_rate": 0.00011222579812669835, + "loss": 1.6733694076538086, + "step": 34660 + }, + { + "epoch": 0.10494453104899126, + "grad_norm": 0.13440488278865814, + "learning_rate": 0.00011222200294506095, + "loss": 1.6381708145141602, + "step": 34670 + }, + { + "epoch": 0.1049748005993371, + "grad_norm": 0.1308482438325882, + "learning_rate": 0.00011221820776342356, + "loss": 1.6165115356445312, + "step": 34680 + }, + { + "epoch": 0.10500507014968292, + "grad_norm": 0.14585377275943756, + "learning_rate": 0.00011221441258178617, + "loss": 1.6333633422851563, + "step": 34690 + }, + { + "epoch": 0.10503533970002876, + "grad_norm": 0.14896787703037262, + "learning_rate": 0.00011221061740014877, + "loss": 1.6394430160522462, + "step": 34700 + }, + { + "epoch": 0.10506560925037459, + "grad_norm": 0.12905479967594147, + "learning_rate": 0.00011220682221851138, + "loss": 1.6500396728515625, + "step": 34710 + }, + { + "epoch": 0.10509587880072041, + "grad_norm": 0.1376631110906601, + "learning_rate": 0.00011220302703687398, + "loss": 1.654184913635254, + "step": 34720 + }, + { + "epoch": 0.10512614835106625, + "grad_norm": 0.15405243635177612, + "learning_rate": 0.00011219923185523659, + "loss": 1.6379989624023437, + "step": 34730 + }, + { + "epoch": 0.10515641790141207, + "grad_norm": 0.16239483654499054, + "learning_rate": 0.0001121954366735992, + "loss": 1.627100944519043, + "step": 34740 + }, + { + "epoch": 0.10518668745175791, + "grad_norm": 0.15524893999099731, + "learning_rate": 0.0001121916414919618, + "loss": 1.6065145492553712, + "step": 34750 + }, + { + "epoch": 0.10521695700210373, + "grad_norm": 0.142926886677742, + "learning_rate": 0.00011218784631032442, + "loss": 1.6659820556640625, + "step": 34760 + }, + { + "epoch": 0.10524722655244956, + "grad_norm": 0.18307504057884216, + "learning_rate": 0.00011218405112868701, + "loss": 1.6680700302124023, + "step": 34770 + }, + { + "epoch": 0.1052774961027954, + "grad_norm": 0.16032488644123077, + "learning_rate": 0.00011218025594704963, + "loss": 1.672430419921875, + "step": 34780 + }, + { + "epoch": 0.10530776565314122, + "grad_norm": 0.1481086015701294, + "learning_rate": 0.00011217646076541224, + "loss": 1.6438703536987305, + "step": 34790 + }, + { + "epoch": 0.10533803520348706, + "grad_norm": 0.17775441706180573, + "learning_rate": 0.00011217266558377484, + "loss": 1.673503303527832, + "step": 34800 + }, + { + "epoch": 0.10536830475383288, + "grad_norm": 0.1597248911857605, + "learning_rate": 0.00011216887040213745, + "loss": 1.644986343383789, + "step": 34810 + }, + { + "epoch": 0.1053985743041787, + "grad_norm": 0.13632506132125854, + "learning_rate": 0.00011216507522050006, + "loss": 1.6113243103027344, + "step": 34820 + }, + { + "epoch": 0.10542884385452454, + "grad_norm": 0.13472509384155273, + "learning_rate": 0.00011216128003886266, + "loss": 1.65814208984375, + "step": 34830 + }, + { + "epoch": 0.10545911340487037, + "grad_norm": 0.14129525423049927, + "learning_rate": 0.00011215748485722527, + "loss": 1.6474294662475586, + "step": 34840 + }, + { + "epoch": 0.1054893829552162, + "grad_norm": 0.14955627918243408, + "learning_rate": 0.00011215368967558787, + "loss": 1.6258636474609376, + "step": 34850 + }, + { + "epoch": 0.10551965250556203, + "grad_norm": 0.1737411469221115, + "learning_rate": 0.00011214989449395048, + "loss": 1.660024642944336, + "step": 34860 + }, + { + "epoch": 0.10554992205590787, + "grad_norm": 0.14280688762664795, + "learning_rate": 0.0001121460993123131, + "loss": 1.6223220825195312, + "step": 34870 + }, + { + "epoch": 0.10558019160625369, + "grad_norm": 0.1426953673362732, + "learning_rate": 0.00011214230413067569, + "loss": 1.6664154052734375, + "step": 34880 + }, + { + "epoch": 0.10561046115659951, + "grad_norm": 0.14519742131233215, + "learning_rate": 0.00011213850894903831, + "loss": 1.599080467224121, + "step": 34890 + }, + { + "epoch": 0.10564073070694535, + "grad_norm": 0.13907968997955322, + "learning_rate": 0.0001121347137674009, + "loss": 1.6353641510009767, + "step": 34900 + }, + { + "epoch": 0.10567100025729118, + "grad_norm": 0.13126197457313538, + "learning_rate": 0.00011213091858576352, + "loss": 1.6426897048950195, + "step": 34910 + }, + { + "epoch": 0.10570126980763701, + "grad_norm": 0.1397596001625061, + "learning_rate": 0.00011212712340412612, + "loss": 1.6263168334960938, + "step": 34920 + }, + { + "epoch": 0.10573153935798284, + "grad_norm": 0.15446878969669342, + "learning_rate": 0.00011212332822248874, + "loss": 1.6611692428588867, + "step": 34930 + }, + { + "epoch": 0.10576180890832866, + "grad_norm": 0.13362787663936615, + "learning_rate": 0.00011211953304085133, + "loss": 1.613206672668457, + "step": 34940 + }, + { + "epoch": 0.1057920784586745, + "grad_norm": 0.15627343952655792, + "learning_rate": 0.00011211573785921395, + "loss": 1.635908317565918, + "step": 34950 + }, + { + "epoch": 0.10582234800902032, + "grad_norm": 0.13120625913143158, + "learning_rate": 0.00011211194267757654, + "loss": 1.6401601791381837, + "step": 34960 + }, + { + "epoch": 0.10585261755936616, + "grad_norm": 0.15781846642494202, + "learning_rate": 0.00011210814749593916, + "loss": 1.61682071685791, + "step": 34970 + }, + { + "epoch": 0.10588288710971198, + "grad_norm": 0.15678073465824127, + "learning_rate": 0.00011210435231430175, + "loss": 1.6777517318725585, + "step": 34980 + }, + { + "epoch": 0.10591315666005781, + "grad_norm": 0.15119370818138123, + "learning_rate": 0.00011210055713266437, + "loss": 1.6204231262207032, + "step": 34990 + }, + { + "epoch": 0.10594342621040365, + "grad_norm": 0.14390257000923157, + "learning_rate": 0.00011209676195102699, + "loss": 1.6607288360595702, + "step": 35000 + }, + { + "epoch": 0.10594342621040365, + "eval_loss": 1.6722761392593384, + "eval_runtime": 27.9305, + "eval_samples_per_second": 17.902, + "eval_steps_per_second": 1.146, + "step": 35000 + }, + { + "epoch": 0.10597369576074947, + "grad_norm": 0.15430696308612823, + "learning_rate": 0.00011209296676938958, + "loss": 1.6310304641723632, + "step": 35010 + }, + { + "epoch": 0.10600396531109531, + "grad_norm": 0.1519296020269394, + "learning_rate": 0.0001120891715877522, + "loss": 1.6338090896606445, + "step": 35020 + }, + { + "epoch": 0.10603423486144113, + "grad_norm": 0.15083982050418854, + "learning_rate": 0.0001120853764061148, + "loss": 1.664984130859375, + "step": 35030 + }, + { + "epoch": 0.10606450441178697, + "grad_norm": 0.14116011559963226, + "learning_rate": 0.00011208158122447741, + "loss": 1.6450836181640625, + "step": 35040 + }, + { + "epoch": 0.1060947739621328, + "grad_norm": 0.13973526656627655, + "learning_rate": 0.00011207778604284001, + "loss": 1.6158958435058595, + "step": 35050 + }, + { + "epoch": 0.10612504351247862, + "grad_norm": 0.13603749871253967, + "learning_rate": 0.00011207399086120263, + "loss": 1.6432605743408204, + "step": 35060 + }, + { + "epoch": 0.10615531306282446, + "grad_norm": 0.15358881652355194, + "learning_rate": 0.00011207019567956522, + "loss": 1.6673103332519532, + "step": 35070 + }, + { + "epoch": 0.10618558261317028, + "grad_norm": 0.14134623110294342, + "learning_rate": 0.00011206640049792784, + "loss": 1.6483779907226563, + "step": 35080 + }, + { + "epoch": 0.10621585216351612, + "grad_norm": 0.1388549953699112, + "learning_rate": 0.00011206260531629043, + "loss": 1.623944091796875, + "step": 35090 + }, + { + "epoch": 0.10624612171386194, + "grad_norm": 0.15167927742004395, + "learning_rate": 0.00011205881013465305, + "loss": 1.6714607238769532, + "step": 35100 + }, + { + "epoch": 0.10627639126420776, + "grad_norm": 0.14175571501255035, + "learning_rate": 0.00011205501495301564, + "loss": 1.692274284362793, + "step": 35110 + }, + { + "epoch": 0.1063066608145536, + "grad_norm": 0.15929152071475983, + "learning_rate": 0.00011205121977137826, + "loss": 1.6876754760742188, + "step": 35120 + }, + { + "epoch": 0.10633693036489943, + "grad_norm": 0.14351584017276764, + "learning_rate": 0.00011204742458974087, + "loss": 1.6505958557128906, + "step": 35130 + }, + { + "epoch": 0.10636719991524526, + "grad_norm": 0.15078522264957428, + "learning_rate": 0.00011204362940810347, + "loss": 1.6362480163574218, + "step": 35140 + }, + { + "epoch": 0.10639746946559109, + "grad_norm": 0.15259012579917908, + "learning_rate": 0.00011203983422646608, + "loss": 1.5972578048706054, + "step": 35150 + }, + { + "epoch": 0.10642773901593691, + "grad_norm": 0.12687751650810242, + "learning_rate": 0.00011203603904482869, + "loss": 1.644660758972168, + "step": 35160 + }, + { + "epoch": 0.10645800856628275, + "grad_norm": 0.15624862909317017, + "learning_rate": 0.00011203224386319129, + "loss": 1.65576229095459, + "step": 35170 + }, + { + "epoch": 0.10648827811662857, + "grad_norm": 0.14449818432331085, + "learning_rate": 0.0001120284486815539, + "loss": 1.6290424346923829, + "step": 35180 + }, + { + "epoch": 0.10651854766697441, + "grad_norm": 0.14975421130657196, + "learning_rate": 0.00011202465349991652, + "loss": 1.6546531677246095, + "step": 35190 + }, + { + "epoch": 0.10654881721732024, + "grad_norm": 0.15074092149734497, + "learning_rate": 0.00011202085831827911, + "loss": 1.6504051208496093, + "step": 35200 + }, + { + "epoch": 0.10657908676766607, + "grad_norm": 0.13864471018314362, + "learning_rate": 0.00011201706313664173, + "loss": 1.650563621520996, + "step": 35210 + }, + { + "epoch": 0.1066093563180119, + "grad_norm": 0.1490841656923294, + "learning_rate": 0.00011201326795500432, + "loss": 1.6058799743652343, + "step": 35220 + }, + { + "epoch": 0.10663962586835772, + "grad_norm": 0.15187957882881165, + "learning_rate": 0.00011200947277336694, + "loss": 1.6589544296264649, + "step": 35230 + }, + { + "epoch": 0.10666989541870356, + "grad_norm": 0.15347415208816528, + "learning_rate": 0.00011200567759172955, + "loss": 1.604549026489258, + "step": 35240 + }, + { + "epoch": 0.10670016496904938, + "grad_norm": 0.14110079407691956, + "learning_rate": 0.00011200188241009215, + "loss": 1.6023853302001954, + "step": 35250 + }, + { + "epoch": 0.10673043451939522, + "grad_norm": 0.14657534658908844, + "learning_rate": 0.00011199808722845476, + "loss": 1.6194154739379882, + "step": 35260 + }, + { + "epoch": 0.10676070406974104, + "grad_norm": 0.14959542453289032, + "learning_rate": 0.00011199429204681737, + "loss": 1.674990463256836, + "step": 35270 + }, + { + "epoch": 0.10679097362008687, + "grad_norm": 0.1669190227985382, + "learning_rate": 0.00011199049686517997, + "loss": 1.6512775421142578, + "step": 35280 + }, + { + "epoch": 0.1068212431704327, + "grad_norm": 0.14839646220207214, + "learning_rate": 0.00011198670168354258, + "loss": 1.6420469284057617, + "step": 35290 + }, + { + "epoch": 0.10685151272077853, + "grad_norm": 0.1278868019580841, + "learning_rate": 0.00011198290650190518, + "loss": 1.6360107421875, + "step": 35300 + }, + { + "epoch": 0.10688178227112437, + "grad_norm": 0.1357768326997757, + "learning_rate": 0.00011197911132026779, + "loss": 1.5995869636535645, + "step": 35310 + }, + { + "epoch": 0.10691205182147019, + "grad_norm": 0.1440259963274002, + "learning_rate": 0.0001119753161386304, + "loss": 1.6080615997314454, + "step": 35320 + }, + { + "epoch": 0.10694232137181602, + "grad_norm": 0.14788641035556793, + "learning_rate": 0.000111971520956993, + "loss": 1.675675392150879, + "step": 35330 + }, + { + "epoch": 0.10697259092216185, + "grad_norm": 0.14318586885929108, + "learning_rate": 0.00011196772577535561, + "loss": 1.6123207092285157, + "step": 35340 + }, + { + "epoch": 0.10700286047250768, + "grad_norm": 0.14611579477787018, + "learning_rate": 0.00011196393059371821, + "loss": 1.6449468612670899, + "step": 35350 + }, + { + "epoch": 0.10703313002285351, + "grad_norm": 0.1518898457288742, + "learning_rate": 0.00011196013541208082, + "loss": 1.6744701385498046, + "step": 35360 + }, + { + "epoch": 0.10706339957319934, + "grad_norm": 0.14575426280498505, + "learning_rate": 0.00011195634023044344, + "loss": 1.7326492309570312, + "step": 35370 + }, + { + "epoch": 0.10709366912354518, + "grad_norm": 0.14042025804519653, + "learning_rate": 0.00011195254504880603, + "loss": 1.6568405151367187, + "step": 35380 + }, + { + "epoch": 0.107123938673891, + "grad_norm": 0.1543692648410797, + "learning_rate": 0.00011194874986716865, + "loss": 1.6228206634521485, + "step": 35390 + }, + { + "epoch": 0.10715420822423682, + "grad_norm": 0.12641406059265137, + "learning_rate": 0.00011194495468553126, + "loss": 1.6084182739257813, + "step": 35400 + }, + { + "epoch": 0.10718447777458266, + "grad_norm": 0.15704691410064697, + "learning_rate": 0.00011194115950389386, + "loss": 1.6356128692626952, + "step": 35410 + }, + { + "epoch": 0.10721474732492849, + "grad_norm": 0.1313195675611496, + "learning_rate": 0.00011193736432225647, + "loss": 1.6422954559326173, + "step": 35420 + }, + { + "epoch": 0.10724501687527432, + "grad_norm": 0.15976707637310028, + "learning_rate": 0.00011193356914061907, + "loss": 1.6756874084472657, + "step": 35430 + }, + { + "epoch": 0.10727528642562015, + "grad_norm": 0.14142771065235138, + "learning_rate": 0.00011192977395898168, + "loss": 1.6319271087646485, + "step": 35440 + }, + { + "epoch": 0.10730555597596597, + "grad_norm": 0.14166593551635742, + "learning_rate": 0.00011192597877734429, + "loss": 1.6165946960449218, + "step": 35450 + }, + { + "epoch": 0.10733582552631181, + "grad_norm": 0.15039120614528656, + "learning_rate": 0.00011192218359570689, + "loss": 1.6412666320800782, + "step": 35460 + }, + { + "epoch": 0.10736609507665763, + "grad_norm": 0.13518337905406952, + "learning_rate": 0.0001119183884140695, + "loss": 1.6183631896972657, + "step": 35470 + }, + { + "epoch": 0.10739636462700347, + "grad_norm": 0.14816191792488098, + "learning_rate": 0.0001119145932324321, + "loss": 1.6506008148193358, + "step": 35480 + }, + { + "epoch": 0.1074266341773493, + "grad_norm": 0.12258058786392212, + "learning_rate": 0.00011191079805079471, + "loss": 1.6408828735351562, + "step": 35490 + }, + { + "epoch": 0.10745690372769512, + "grad_norm": 0.14883460104465485, + "learning_rate": 0.00011190700286915733, + "loss": 1.5990528106689452, + "step": 35500 + }, + { + "epoch": 0.10745690372769512, + "eval_loss": 1.6332793235778809, + "eval_runtime": 27.8821, + "eval_samples_per_second": 17.933, + "eval_steps_per_second": 1.148, + "step": 35500 + }, + { + "epoch": 0.10748717327804096, + "grad_norm": 0.13310325145721436, + "learning_rate": 0.00011190320768751992, + "loss": 1.6277481079101563, + "step": 35510 + }, + { + "epoch": 0.10751744282838678, + "grad_norm": 0.13055728375911713, + "learning_rate": 0.00011189941250588254, + "loss": 1.659903335571289, + "step": 35520 + }, + { + "epoch": 0.10754771237873262, + "grad_norm": 0.13186272978782654, + "learning_rate": 0.00011189561732424513, + "loss": 1.6259004592895507, + "step": 35530 + }, + { + "epoch": 0.10757798192907844, + "grad_norm": 0.15266825258731842, + "learning_rate": 0.00011189182214260775, + "loss": 1.623000144958496, + "step": 35540 + }, + { + "epoch": 0.10760825147942428, + "grad_norm": 0.13122689723968506, + "learning_rate": 0.00011188802696097035, + "loss": 1.6554733276367188, + "step": 35550 + }, + { + "epoch": 0.1076385210297701, + "grad_norm": 0.1432933360338211, + "learning_rate": 0.00011188423177933296, + "loss": 1.6510028839111328, + "step": 35560 + }, + { + "epoch": 0.10766879058011593, + "grad_norm": 0.1517258584499359, + "learning_rate": 0.00011188043659769556, + "loss": 1.634674072265625, + "step": 35570 + }, + { + "epoch": 0.10769906013046177, + "grad_norm": 0.1466984897851944, + "learning_rate": 0.00011187664141605818, + "loss": 1.6158891677856446, + "step": 35580 + }, + { + "epoch": 0.10772932968080759, + "grad_norm": 0.16199469566345215, + "learning_rate": 0.00011187284623442077, + "loss": 1.564813995361328, + "step": 35590 + }, + { + "epoch": 0.10775959923115343, + "grad_norm": 0.14268551766872406, + "learning_rate": 0.00011186905105278339, + "loss": 1.6409666061401367, + "step": 35600 + }, + { + "epoch": 0.10778986878149925, + "grad_norm": 0.14059771597385406, + "learning_rate": 0.00011186525587114601, + "loss": 1.673590087890625, + "step": 35610 + }, + { + "epoch": 0.10782013833184508, + "grad_norm": 0.1415848433971405, + "learning_rate": 0.0001118614606895086, + "loss": 1.6854360580444336, + "step": 35620 + }, + { + "epoch": 0.10785040788219091, + "grad_norm": 0.12729552388191223, + "learning_rate": 0.00011185766550787122, + "loss": 1.649312973022461, + "step": 35630 + }, + { + "epoch": 0.10788067743253674, + "grad_norm": 0.15339502692222595, + "learning_rate": 0.00011185387032623381, + "loss": 1.6170961380004882, + "step": 35640 + }, + { + "epoch": 0.10791094698288257, + "grad_norm": 0.14813460409641266, + "learning_rate": 0.00011185007514459643, + "loss": 1.6889005661010743, + "step": 35650 + }, + { + "epoch": 0.1079412165332284, + "grad_norm": 0.14651374518871307, + "learning_rate": 0.00011184627996295902, + "loss": 1.625208282470703, + "step": 35660 + }, + { + "epoch": 0.10797148608357422, + "grad_norm": 0.12832534313201904, + "learning_rate": 0.00011184248478132164, + "loss": 1.6598949432373047, + "step": 35670 + }, + { + "epoch": 0.10800175563392006, + "grad_norm": 0.1402389258146286, + "learning_rate": 0.00011183868959968424, + "loss": 1.667642593383789, + "step": 35680 + }, + { + "epoch": 0.10803202518426588, + "grad_norm": 0.13960321247577667, + "learning_rate": 0.00011183489441804686, + "loss": 1.656711959838867, + "step": 35690 + }, + { + "epoch": 0.10806229473461172, + "grad_norm": 0.1512753963470459, + "learning_rate": 0.00011183109923640945, + "loss": 1.6015966415405274, + "step": 35700 + }, + { + "epoch": 0.10809256428495755, + "grad_norm": 0.13330310583114624, + "learning_rate": 0.00011182730405477207, + "loss": 1.6686275482177735, + "step": 35710 + }, + { + "epoch": 0.10812283383530337, + "grad_norm": 0.13304857909679413, + "learning_rate": 0.00011182350887313466, + "loss": 1.619797706604004, + "step": 35720 + }, + { + "epoch": 0.10815310338564921, + "grad_norm": 0.1649560183286667, + "learning_rate": 0.00011181971369149728, + "loss": 1.6368118286132813, + "step": 35730 + }, + { + "epoch": 0.10818337293599503, + "grad_norm": 0.13266848027706146, + "learning_rate": 0.00011181591850985989, + "loss": 1.6063776016235352, + "step": 35740 + }, + { + "epoch": 0.10821364248634087, + "grad_norm": 0.15548264980316162, + "learning_rate": 0.00011181212332822249, + "loss": 1.6618669509887696, + "step": 35750 + }, + { + "epoch": 0.10824391203668669, + "grad_norm": 0.12504346668720245, + "learning_rate": 0.0001118083281465851, + "loss": 1.6415271759033203, + "step": 35760 + }, + { + "epoch": 0.10827418158703253, + "grad_norm": 0.13487999141216278, + "learning_rate": 0.0001118045329649477, + "loss": 1.6359407424926757, + "step": 35770 + }, + { + "epoch": 0.10830445113737835, + "grad_norm": 0.13517993688583374, + "learning_rate": 0.00011180073778331031, + "loss": 1.664137077331543, + "step": 35780 + }, + { + "epoch": 0.10833472068772418, + "grad_norm": 0.13341794908046722, + "learning_rate": 0.00011179694260167292, + "loss": 1.6265241622924804, + "step": 35790 + }, + { + "epoch": 0.10836499023807002, + "grad_norm": 0.14531534910202026, + "learning_rate": 0.00011179314742003553, + "loss": 1.6579383850097655, + "step": 35800 + }, + { + "epoch": 0.10839525978841584, + "grad_norm": 0.1383945792913437, + "learning_rate": 0.00011178935223839813, + "loss": 1.6103704452514649, + "step": 35810 + }, + { + "epoch": 0.10842552933876168, + "grad_norm": 0.12511704862117767, + "learning_rate": 0.00011178555705676075, + "loss": 1.6437480926513672, + "step": 35820 + }, + { + "epoch": 0.1084557988891075, + "grad_norm": 0.12805260717868805, + "learning_rate": 0.00011178176187512334, + "loss": 1.6373023986816406, + "step": 35830 + }, + { + "epoch": 0.10848606843945333, + "grad_norm": 0.14073990285396576, + "learning_rate": 0.00011177796669348596, + "loss": 1.620033073425293, + "step": 35840 + }, + { + "epoch": 0.10851633798979916, + "grad_norm": 0.13002808392047882, + "learning_rate": 0.00011177417151184855, + "loss": 1.6464521408081054, + "step": 35850 + }, + { + "epoch": 0.10854660754014499, + "grad_norm": 0.12643183767795563, + "learning_rate": 0.00011177037633021117, + "loss": 1.6534446716308593, + "step": 35860 + }, + { + "epoch": 0.10857687709049083, + "grad_norm": 0.14522692561149597, + "learning_rate": 0.00011176658114857378, + "loss": 1.6982812881469727, + "step": 35870 + }, + { + "epoch": 0.10860714664083665, + "grad_norm": 0.1424257904291153, + "learning_rate": 0.00011176278596693638, + "loss": 1.6463115692138672, + "step": 35880 + }, + { + "epoch": 0.10863741619118247, + "grad_norm": 0.15548475086688995, + "learning_rate": 0.00011175899078529899, + "loss": 1.6286945343017578, + "step": 35890 + }, + { + "epoch": 0.10866768574152831, + "grad_norm": 0.140954852104187, + "learning_rate": 0.0001117551956036616, + "loss": 1.6514835357666016, + "step": 35900 + }, + { + "epoch": 0.10869795529187413, + "grad_norm": 0.1479293555021286, + "learning_rate": 0.0001117514004220242, + "loss": 1.6425848007202148, + "step": 35910 + }, + { + "epoch": 0.10872822484221997, + "grad_norm": 0.14207826554775238, + "learning_rate": 0.0001117476052403868, + "loss": 1.640634536743164, + "step": 35920 + }, + { + "epoch": 0.1087584943925658, + "grad_norm": 0.15585121512413025, + "learning_rate": 0.00011174381005874941, + "loss": 1.6428155899047852, + "step": 35930 + }, + { + "epoch": 0.10878876394291163, + "grad_norm": 0.1377694457769394, + "learning_rate": 0.00011174001487711202, + "loss": 1.6649890899658204, + "step": 35940 + }, + { + "epoch": 0.10881903349325746, + "grad_norm": 0.12993420660495758, + "learning_rate": 0.00011173621969547462, + "loss": 1.6612144470214845, + "step": 35950 + }, + { + "epoch": 0.10884930304360328, + "grad_norm": 0.13988876342773438, + "learning_rate": 0.00011173242451383723, + "loss": 1.633350944519043, + "step": 35960 + }, + { + "epoch": 0.10887957259394912, + "grad_norm": 0.16540785133838654, + "learning_rate": 0.00011172862933219984, + "loss": 1.645932960510254, + "step": 35970 + }, + { + "epoch": 0.10890984214429494, + "grad_norm": 0.16725115478038788, + "learning_rate": 0.00011172483415056246, + "loss": 1.649850845336914, + "step": 35980 + }, + { + "epoch": 0.10894011169464078, + "grad_norm": 0.1415415108203888, + "learning_rate": 0.00011172103896892505, + "loss": 1.630221176147461, + "step": 35990 + }, + { + "epoch": 0.1089703812449866, + "grad_norm": 0.13795973360538483, + "learning_rate": 0.00011171724378728767, + "loss": 1.6663902282714844, + "step": 36000 + }, + { + "epoch": 0.1089703812449866, + "eval_loss": 1.6322003602981567, + "eval_runtime": 27.7894, + "eval_samples_per_second": 17.993, + "eval_steps_per_second": 1.152, + "step": 36000 + }, + { + "epoch": 0.10900065079533243, + "grad_norm": 0.1335534155368805, + "learning_rate": 0.00011171344860565027, + "loss": 1.5965551376342773, + "step": 36010 + }, + { + "epoch": 0.10903092034567827, + "grad_norm": 0.16803395748138428, + "learning_rate": 0.00011170965342401288, + "loss": 1.639350128173828, + "step": 36020 + }, + { + "epoch": 0.10906118989602409, + "grad_norm": 0.16876332461833954, + "learning_rate": 0.00011170585824237549, + "loss": 1.634193229675293, + "step": 36030 + }, + { + "epoch": 0.10909145944636993, + "grad_norm": 0.15574708580970764, + "learning_rate": 0.00011170206306073809, + "loss": 1.6633415222167969, + "step": 36040 + }, + { + "epoch": 0.10912172899671575, + "grad_norm": 0.14016102254390717, + "learning_rate": 0.0001116982678791007, + "loss": 1.6524198532104493, + "step": 36050 + }, + { + "epoch": 0.10915199854706158, + "grad_norm": 0.1399191915988922, + "learning_rate": 0.0001116944726974633, + "loss": 1.663821029663086, + "step": 36060 + }, + { + "epoch": 0.10918226809740741, + "grad_norm": 0.14064191281795502, + "learning_rate": 0.00011169067751582591, + "loss": 1.6575937271118164, + "step": 36070 + }, + { + "epoch": 0.10921253764775324, + "grad_norm": 0.14763517677783966, + "learning_rate": 0.00011168688233418851, + "loss": 1.6485408782958983, + "step": 36080 + }, + { + "epoch": 0.10924280719809908, + "grad_norm": 0.14491181075572968, + "learning_rate": 0.00011168308715255112, + "loss": 1.6625186920166015, + "step": 36090 + }, + { + "epoch": 0.1092730767484449, + "grad_norm": 0.1356671303510666, + "learning_rate": 0.00011167929197091373, + "loss": 1.6171443939208985, + "step": 36100 + }, + { + "epoch": 0.10930334629879074, + "grad_norm": 0.12593531608581543, + "learning_rate": 0.00011167549678927635, + "loss": 1.6493255615234375, + "step": 36110 + }, + { + "epoch": 0.10933361584913656, + "grad_norm": 0.16276106238365173, + "learning_rate": 0.00011167170160763894, + "loss": 1.6327096939086914, + "step": 36120 + }, + { + "epoch": 0.10936388539948239, + "grad_norm": 0.12934143841266632, + "learning_rate": 0.00011166790642600156, + "loss": 1.6757801055908204, + "step": 36130 + }, + { + "epoch": 0.10939415494982822, + "grad_norm": 0.1777825504541397, + "learning_rate": 0.00011166411124436415, + "loss": 1.6237335205078125, + "step": 36140 + }, + { + "epoch": 0.10942442450017405, + "grad_norm": 0.12579345703125, + "learning_rate": 0.00011166031606272677, + "loss": 1.6710758209228516, + "step": 36150 + }, + { + "epoch": 0.10945469405051989, + "grad_norm": 0.14327770471572876, + "learning_rate": 0.00011165652088108936, + "loss": 1.670481300354004, + "step": 36160 + }, + { + "epoch": 0.10948496360086571, + "grad_norm": 0.13794617354869843, + "learning_rate": 0.00011165272569945198, + "loss": 1.6380517959594727, + "step": 36170 + }, + { + "epoch": 0.10951523315121153, + "grad_norm": 0.1402171105146408, + "learning_rate": 0.00011164893051781457, + "loss": 1.6478515625, + "step": 36180 + }, + { + "epoch": 0.10954550270155737, + "grad_norm": 0.13967037200927734, + "learning_rate": 0.0001116451353361772, + "loss": 1.6437156677246094, + "step": 36190 + }, + { + "epoch": 0.1095757722519032, + "grad_norm": 0.15172912180423737, + "learning_rate": 0.0001116413401545398, + "loss": 1.6063013076782227, + "step": 36200 + }, + { + "epoch": 0.10960604180224903, + "grad_norm": 0.13782472908496857, + "learning_rate": 0.0001116375449729024, + "loss": 1.623825454711914, + "step": 36210 + }, + { + "epoch": 0.10963631135259486, + "grad_norm": 0.14045235514640808, + "learning_rate": 0.00011163374979126501, + "loss": 1.6222772598266602, + "step": 36220 + }, + { + "epoch": 0.10966658090294068, + "grad_norm": 0.15645073354244232, + "learning_rate": 0.00011162995460962762, + "loss": 1.6150115966796874, + "step": 36230 + }, + { + "epoch": 0.10969685045328652, + "grad_norm": 0.14880798757076263, + "learning_rate": 0.00011162615942799024, + "loss": 1.6373830795288087, + "step": 36240 + }, + { + "epoch": 0.10972712000363234, + "grad_norm": 0.15536260604858398, + "learning_rate": 0.00011162236424635283, + "loss": 1.6207855224609375, + "step": 36250 + }, + { + "epoch": 0.10975738955397818, + "grad_norm": 0.14090335369110107, + "learning_rate": 0.00011161856906471545, + "loss": 1.6782497406005858, + "step": 36260 + }, + { + "epoch": 0.109787659104324, + "grad_norm": 0.1260191947221756, + "learning_rate": 0.00011161477388307804, + "loss": 1.6685369491577149, + "step": 36270 + }, + { + "epoch": 0.10981792865466984, + "grad_norm": 0.14060181379318237, + "learning_rate": 0.00011161097870144066, + "loss": 1.6259658813476563, + "step": 36280 + }, + { + "epoch": 0.10984819820501567, + "grad_norm": 0.13112609088420868, + "learning_rate": 0.00011160718351980325, + "loss": 1.6508539199829102, + "step": 36290 + }, + { + "epoch": 0.10987846775536149, + "grad_norm": 0.14768102765083313, + "learning_rate": 0.00011160338833816587, + "loss": 1.6342658996582031, + "step": 36300 + }, + { + "epoch": 0.10990873730570733, + "grad_norm": 0.13915947079658508, + "learning_rate": 0.00011159959315652847, + "loss": 1.6266546249389648, + "step": 36310 + }, + { + "epoch": 0.10993900685605315, + "grad_norm": 0.13700689375400543, + "learning_rate": 0.00011159579797489108, + "loss": 1.685420608520508, + "step": 36320 + }, + { + "epoch": 0.10996927640639899, + "grad_norm": 0.147710919380188, + "learning_rate": 0.00011159200279325368, + "loss": 1.6207159042358399, + "step": 36330 + }, + { + "epoch": 0.10999954595674481, + "grad_norm": 0.14229950308799744, + "learning_rate": 0.0001115882076116163, + "loss": 1.628446388244629, + "step": 36340 + }, + { + "epoch": 0.11002981550709064, + "grad_norm": 0.12249547243118286, + "learning_rate": 0.0001115844124299789, + "loss": 1.648960494995117, + "step": 36350 + }, + { + "epoch": 0.11006008505743647, + "grad_norm": 0.14281417429447174, + "learning_rate": 0.00011158061724834151, + "loss": 1.637774658203125, + "step": 36360 + }, + { + "epoch": 0.1100903546077823, + "grad_norm": 0.14286234974861145, + "learning_rate": 0.00011157682206670411, + "loss": 1.6516462326049806, + "step": 36370 + }, + { + "epoch": 0.11012062415812814, + "grad_norm": 0.12997043132781982, + "learning_rate": 0.00011157302688506672, + "loss": 1.613154983520508, + "step": 36380 + }, + { + "epoch": 0.11015089370847396, + "grad_norm": 0.15597915649414062, + "learning_rate": 0.00011156923170342933, + "loss": 1.628568458557129, + "step": 36390 + }, + { + "epoch": 0.11018116325881978, + "grad_norm": 0.1398627907037735, + "learning_rate": 0.00011156543652179193, + "loss": 1.647807502746582, + "step": 36400 + }, + { + "epoch": 0.11021143280916562, + "grad_norm": 0.1569763422012329, + "learning_rate": 0.00011156164134015455, + "loss": 1.647171401977539, + "step": 36410 + }, + { + "epoch": 0.11024170235951145, + "grad_norm": 0.14941485226154327, + "learning_rate": 0.00011155784615851714, + "loss": 1.6231098175048828, + "step": 36420 + }, + { + "epoch": 0.11027197190985728, + "grad_norm": 0.15889321267604828, + "learning_rate": 0.00011155405097687976, + "loss": 1.6464990615844726, + "step": 36430 + }, + { + "epoch": 0.11030224146020311, + "grad_norm": 0.1671402007341385, + "learning_rate": 0.00011155025579524236, + "loss": 1.6021751403808593, + "step": 36440 + }, + { + "epoch": 0.11033251101054894, + "grad_norm": 0.13304544985294342, + "learning_rate": 0.00011154646061360498, + "loss": 1.6663951873779297, + "step": 36450 + }, + { + "epoch": 0.11036278056089477, + "grad_norm": 0.13842090964317322, + "learning_rate": 0.00011154266543196757, + "loss": 1.6046756744384765, + "step": 36460 + }, + { + "epoch": 0.11039305011124059, + "grad_norm": 0.16493377089500427, + "learning_rate": 0.00011153887025033019, + "loss": 1.6688957214355469, + "step": 36470 + }, + { + "epoch": 0.11042331966158643, + "grad_norm": 0.14070044457912445, + "learning_rate": 0.0001115350750686928, + "loss": 1.6267375946044922, + "step": 36480 + }, + { + "epoch": 0.11045358921193225, + "grad_norm": 0.12716904282569885, + "learning_rate": 0.0001115312798870554, + "loss": 1.6585329055786133, + "step": 36490 + }, + { + "epoch": 0.11048385876227809, + "grad_norm": 0.1730252504348755, + "learning_rate": 0.000111527484705418, + "loss": 1.634134292602539, + "step": 36500 + }, + { + "epoch": 0.11048385876227809, + "eval_loss": 1.6549475193023682, + "eval_runtime": 27.8289, + "eval_samples_per_second": 17.967, + "eval_steps_per_second": 1.15, + "step": 36500 + }, + { + "epoch": 0.11051412831262392, + "grad_norm": 0.1634204238653183, + "learning_rate": 0.00011152368952378061, + "loss": 1.6677631378173827, + "step": 36510 + }, + { + "epoch": 0.11054439786296974, + "grad_norm": 0.14659830927848816, + "learning_rate": 0.00011151989434214322, + "loss": 1.6246055603027343, + "step": 36520 + }, + { + "epoch": 0.11057466741331558, + "grad_norm": 0.1404706835746765, + "learning_rate": 0.00011151609916050582, + "loss": 1.6448698043823242, + "step": 36530 + }, + { + "epoch": 0.1106049369636614, + "grad_norm": 0.15511168539524078, + "learning_rate": 0.00011151230397886843, + "loss": 1.5996070861816407, + "step": 36540 + }, + { + "epoch": 0.11063520651400724, + "grad_norm": 0.15714602172374725, + "learning_rate": 0.00011150850879723104, + "loss": 1.6356552124023438, + "step": 36550 + }, + { + "epoch": 0.11066547606435306, + "grad_norm": 0.14712147414684296, + "learning_rate": 0.00011150471361559364, + "loss": 1.6159097671508789, + "step": 36560 + }, + { + "epoch": 0.11069574561469889, + "grad_norm": 0.13948959112167358, + "learning_rate": 0.00011150091843395625, + "loss": 1.618312454223633, + "step": 36570 + }, + { + "epoch": 0.11072601516504472, + "grad_norm": 0.1403181552886963, + "learning_rate": 0.00011149712325231885, + "loss": 1.644430923461914, + "step": 36580 + }, + { + "epoch": 0.11075628471539055, + "grad_norm": 0.1506219208240509, + "learning_rate": 0.00011149332807068146, + "loss": 1.624049758911133, + "step": 36590 + }, + { + "epoch": 0.11078655426573639, + "grad_norm": 0.13942070305347443, + "learning_rate": 0.00011148953288904406, + "loss": 1.6602888107299805, + "step": 36600 + }, + { + "epoch": 0.11081682381608221, + "grad_norm": 0.13538675010204315, + "learning_rate": 0.00011148573770740668, + "loss": 1.6282335281372071, + "step": 36610 + }, + { + "epoch": 0.11084709336642805, + "grad_norm": 0.15106633305549622, + "learning_rate": 0.00011148194252576929, + "loss": 1.6204004287719727, + "step": 36620 + }, + { + "epoch": 0.11087736291677387, + "grad_norm": 0.14113016426563263, + "learning_rate": 0.0001114781473441319, + "loss": 1.6268932342529296, + "step": 36630 + }, + { + "epoch": 0.1109076324671197, + "grad_norm": 0.13580958545207977, + "learning_rate": 0.0001114743521624945, + "loss": 1.6344051361083984, + "step": 36640 + }, + { + "epoch": 0.11093790201746553, + "grad_norm": 0.13692985475063324, + "learning_rate": 0.00011147055698085711, + "loss": 1.6520368576049804, + "step": 36650 + }, + { + "epoch": 0.11096817156781136, + "grad_norm": 0.13024309277534485, + "learning_rate": 0.00011146676179921971, + "loss": 1.6568618774414063, + "step": 36660 + }, + { + "epoch": 0.1109984411181572, + "grad_norm": 0.1346009075641632, + "learning_rate": 0.00011146296661758232, + "loss": 1.6431289672851563, + "step": 36670 + }, + { + "epoch": 0.11102871066850302, + "grad_norm": 0.15704220533370972, + "learning_rate": 0.00011145917143594493, + "loss": 1.6382966995239259, + "step": 36680 + }, + { + "epoch": 0.11105898021884884, + "grad_norm": 0.15041321516036987, + "learning_rate": 0.00011145537625430753, + "loss": 1.6327564239501953, + "step": 36690 + }, + { + "epoch": 0.11108924976919468, + "grad_norm": 0.15094135701656342, + "learning_rate": 0.00011145158107267014, + "loss": 1.6363590240478516, + "step": 36700 + }, + { + "epoch": 0.1111195193195405, + "grad_norm": 0.13411705195903778, + "learning_rate": 0.00011144778589103274, + "loss": 1.6322561264038087, + "step": 36710 + }, + { + "epoch": 0.11114978886988634, + "grad_norm": 0.15692377090454102, + "learning_rate": 0.00011144399070939536, + "loss": 1.6079566955566407, + "step": 36720 + }, + { + "epoch": 0.11118005842023217, + "grad_norm": 0.1504000872373581, + "learning_rate": 0.00011144019552775796, + "loss": 1.6404178619384766, + "step": 36730 + }, + { + "epoch": 0.11121032797057799, + "grad_norm": 0.14574530720710754, + "learning_rate": 0.00011143640034612058, + "loss": 1.6708162307739258, + "step": 36740 + }, + { + "epoch": 0.11124059752092383, + "grad_norm": 0.1264961063861847, + "learning_rate": 0.00011143260516448317, + "loss": 1.674047088623047, + "step": 36750 + }, + { + "epoch": 0.11127086707126965, + "grad_norm": 0.16333959996700287, + "learning_rate": 0.00011142880998284579, + "loss": 1.7100553512573242, + "step": 36760 + }, + { + "epoch": 0.11130113662161549, + "grad_norm": 0.1513887345790863, + "learning_rate": 0.00011142501480120838, + "loss": 1.6215999603271485, + "step": 36770 + }, + { + "epoch": 0.11133140617196131, + "grad_norm": 0.15448707342147827, + "learning_rate": 0.000111421219619571, + "loss": 1.5845009803771972, + "step": 36780 + }, + { + "epoch": 0.11136167572230715, + "grad_norm": 0.15002979338169098, + "learning_rate": 0.00011141742443793359, + "loss": 1.6116981506347656, + "step": 36790 + }, + { + "epoch": 0.11139194527265298, + "grad_norm": 0.12515418231487274, + "learning_rate": 0.00011141362925629621, + "loss": 1.6420379638671876, + "step": 36800 + }, + { + "epoch": 0.1114222148229988, + "grad_norm": 0.17247477173805237, + "learning_rate": 0.00011140983407465882, + "loss": 1.6422550201416015, + "step": 36810 + }, + { + "epoch": 0.11145248437334464, + "grad_norm": 0.13229556381702423, + "learning_rate": 0.00011140603889302142, + "loss": 1.6220340728759766, + "step": 36820 + }, + { + "epoch": 0.11148275392369046, + "grad_norm": 0.1732223480939865, + "learning_rate": 0.00011140224371138403, + "loss": 1.624894905090332, + "step": 36830 + }, + { + "epoch": 0.1115130234740363, + "grad_norm": 0.13966473937034607, + "learning_rate": 0.00011139844852974663, + "loss": 1.6474843978881837, + "step": 36840 + }, + { + "epoch": 0.11154329302438212, + "grad_norm": 0.1697293072938919, + "learning_rate": 0.00011139465334810925, + "loss": 1.6355785369873046, + "step": 36850 + }, + { + "epoch": 0.11157356257472795, + "grad_norm": 0.14941932260990143, + "learning_rate": 0.00011139085816647185, + "loss": 1.5973803520202636, + "step": 36860 + }, + { + "epoch": 0.11160383212507378, + "grad_norm": 0.13185366988182068, + "learning_rate": 0.00011138706298483447, + "loss": 1.6792062759399413, + "step": 36870 + }, + { + "epoch": 0.11163410167541961, + "grad_norm": 0.15073010325431824, + "learning_rate": 0.00011138326780319706, + "loss": 1.6394554138183595, + "step": 36880 + }, + { + "epoch": 0.11166437122576545, + "grad_norm": 0.1566293090581894, + "learning_rate": 0.00011137947262155968, + "loss": 1.6307897567749023, + "step": 36890 + }, + { + "epoch": 0.11169464077611127, + "grad_norm": 0.144432932138443, + "learning_rate": 0.00011137567743992227, + "loss": 1.6422378540039062, + "step": 36900 + }, + { + "epoch": 0.1117249103264571, + "grad_norm": 0.15231086313724518, + "learning_rate": 0.00011137188225828489, + "loss": 1.6521440505981446, + "step": 36910 + }, + { + "epoch": 0.11175517987680293, + "grad_norm": 0.14975018799304962, + "learning_rate": 0.00011136808707664748, + "loss": 1.651773452758789, + "step": 36920 + }, + { + "epoch": 0.11178544942714876, + "grad_norm": 0.13167521357536316, + "learning_rate": 0.0001113642918950101, + "loss": 1.64654483795166, + "step": 36930 + }, + { + "epoch": 0.1118157189774946, + "grad_norm": 0.1381334811449051, + "learning_rate": 0.0001113604967133727, + "loss": 1.6233963012695312, + "step": 36940 + }, + { + "epoch": 0.11184598852784042, + "grad_norm": 0.15224722027778625, + "learning_rate": 0.00011135670153173531, + "loss": 1.6250072479248048, + "step": 36950 + }, + { + "epoch": 0.11187625807818626, + "grad_norm": 0.1346813440322876, + "learning_rate": 0.0001113529063500979, + "loss": 1.706853485107422, + "step": 36960 + }, + { + "epoch": 0.11190652762853208, + "grad_norm": 0.13159382343292236, + "learning_rate": 0.00011134911116846053, + "loss": 1.661466407775879, + "step": 36970 + }, + { + "epoch": 0.1119367971788779, + "grad_norm": 0.13322347402572632, + "learning_rate": 0.00011134531598682313, + "loss": 1.6321660995483398, + "step": 36980 + }, + { + "epoch": 0.11196706672922374, + "grad_norm": 0.1257960945367813, + "learning_rate": 0.00011134152080518574, + "loss": 1.6468193054199218, + "step": 36990 + }, + { + "epoch": 0.11199733627956956, + "grad_norm": 0.1416335552930832, + "learning_rate": 0.00011133772562354834, + "loss": 1.6401416778564453, + "step": 37000 + }, + { + "epoch": 0.11199733627956956, + "eval_loss": 1.6451520919799805, + "eval_runtime": 27.9985, + "eval_samples_per_second": 17.858, + "eval_steps_per_second": 1.143, + "step": 37000 + }, + { + "epoch": 0.1120276058299154, + "grad_norm": 0.1710992455482483, + "learning_rate": 0.00011133393044191095, + "loss": 1.665513038635254, + "step": 37010 + }, + { + "epoch": 0.11205787538026123, + "grad_norm": 0.1322363317012787, + "learning_rate": 0.00011133013526027357, + "loss": 1.6504257202148438, + "step": 37020 + }, + { + "epoch": 0.11208814493060705, + "grad_norm": 0.14637106657028198, + "learning_rate": 0.00011132634007863616, + "loss": 1.6546131134033204, + "step": 37030 + }, + { + "epoch": 0.11211841448095289, + "grad_norm": 0.1468953639268875, + "learning_rate": 0.00011132254489699878, + "loss": 1.6126602172851563, + "step": 37040 + }, + { + "epoch": 0.11214868403129871, + "grad_norm": 0.1424204707145691, + "learning_rate": 0.00011131874971536137, + "loss": 1.6101137161254884, + "step": 37050 + }, + { + "epoch": 0.11217895358164455, + "grad_norm": 0.14184194803237915, + "learning_rate": 0.00011131495453372399, + "loss": 1.6680744171142579, + "step": 37060 + }, + { + "epoch": 0.11220922313199037, + "grad_norm": 0.13175629079341888, + "learning_rate": 0.00011131115935208659, + "loss": 1.6585212707519532, + "step": 37070 + }, + { + "epoch": 0.1122394926823362, + "grad_norm": 0.15409573912620544, + "learning_rate": 0.0001113073641704492, + "loss": 1.634376335144043, + "step": 37080 + }, + { + "epoch": 0.11226976223268204, + "grad_norm": 0.1588420867919922, + "learning_rate": 0.00011130356898881181, + "loss": 1.6433589935302735, + "step": 37090 + }, + { + "epoch": 0.11230003178302786, + "grad_norm": 0.16640067100524902, + "learning_rate": 0.00011129977380717442, + "loss": 1.6235271453857423, + "step": 37100 + }, + { + "epoch": 0.1123303013333737, + "grad_norm": 0.15426048636436462, + "learning_rate": 0.00011129597862553702, + "loss": 1.6675403594970704, + "step": 37110 + }, + { + "epoch": 0.11236057088371952, + "grad_norm": 0.14531660079956055, + "learning_rate": 0.00011129218344389963, + "loss": 1.647895050048828, + "step": 37120 + }, + { + "epoch": 0.11239084043406536, + "grad_norm": 0.1462382674217224, + "learning_rate": 0.00011128838826226223, + "loss": 1.6356040954589843, + "step": 37130 + }, + { + "epoch": 0.11242110998441118, + "grad_norm": 0.13785181939601898, + "learning_rate": 0.00011128459308062484, + "loss": 1.6504383087158203, + "step": 37140 + }, + { + "epoch": 0.112451379534757, + "grad_norm": 0.15966300666332245, + "learning_rate": 0.00011128079789898745, + "loss": 1.6643394470214843, + "step": 37150 + }, + { + "epoch": 0.11248164908510284, + "grad_norm": 0.16094395518302917, + "learning_rate": 0.00011127700271735005, + "loss": 1.6722505569458008, + "step": 37160 + }, + { + "epoch": 0.11251191863544867, + "grad_norm": 0.141749307513237, + "learning_rate": 0.00011127320753571266, + "loss": 1.6328014373779296, + "step": 37170 + }, + { + "epoch": 0.1125421881857945, + "grad_norm": 0.14910444617271423, + "learning_rate": 0.00011126941235407526, + "loss": 1.5975321769714355, + "step": 37180 + }, + { + "epoch": 0.11257245773614033, + "grad_norm": 0.1423795372247696, + "learning_rate": 0.00011126561717243787, + "loss": 1.6025081634521485, + "step": 37190 + }, + { + "epoch": 0.11260272728648615, + "grad_norm": 0.16126315295696259, + "learning_rate": 0.00011126182199080048, + "loss": 1.6479547500610352, + "step": 37200 + }, + { + "epoch": 0.11263299683683199, + "grad_norm": 0.15079344809055328, + "learning_rate": 0.00011125802680916308, + "loss": 1.6209928512573242, + "step": 37210 + }, + { + "epoch": 0.11266326638717782, + "grad_norm": 0.15201634168624878, + "learning_rate": 0.0001112542316275257, + "loss": 1.6131898880004882, + "step": 37220 + }, + { + "epoch": 0.11269353593752365, + "grad_norm": 0.15071457624435425, + "learning_rate": 0.00011125043644588831, + "loss": 1.6093324661254882, + "step": 37230 + }, + { + "epoch": 0.11272380548786948, + "grad_norm": 0.13289757072925568, + "learning_rate": 0.00011124664126425091, + "loss": 1.6309152603149415, + "step": 37240 + }, + { + "epoch": 0.1127540750382153, + "grad_norm": 0.12951786816120148, + "learning_rate": 0.00011124284608261352, + "loss": 1.613290023803711, + "step": 37250 + }, + { + "epoch": 0.11278434458856114, + "grad_norm": 0.16887247562408447, + "learning_rate": 0.00011123905090097613, + "loss": 1.6078994750976563, + "step": 37260 + }, + { + "epoch": 0.11281461413890696, + "grad_norm": 0.1328016221523285, + "learning_rate": 0.00011123525571933873, + "loss": 1.6183338165283203, + "step": 37270 + }, + { + "epoch": 0.1128448836892528, + "grad_norm": 0.13979828357696533, + "learning_rate": 0.00011123146053770134, + "loss": 1.6488155364990233, + "step": 37280 + }, + { + "epoch": 0.11287515323959862, + "grad_norm": 0.14092782139778137, + "learning_rate": 0.00011122766535606394, + "loss": 1.6385627746582032, + "step": 37290 + }, + { + "epoch": 0.11290542278994445, + "grad_norm": 0.15708553791046143, + "learning_rate": 0.00011122387017442655, + "loss": 1.6128999710083007, + "step": 37300 + }, + { + "epoch": 0.11293569234029029, + "grad_norm": 0.12707079946994781, + "learning_rate": 0.00011122007499278916, + "loss": 1.6496768951416017, + "step": 37310 + }, + { + "epoch": 0.11296596189063611, + "grad_norm": 0.12657402455806732, + "learning_rate": 0.00011121627981115176, + "loss": 1.6841318130493164, + "step": 37320 + }, + { + "epoch": 0.11299623144098195, + "grad_norm": 0.14919911324977875, + "learning_rate": 0.00011121248462951437, + "loss": 1.6349809646606446, + "step": 37330 + }, + { + "epoch": 0.11302650099132777, + "grad_norm": 0.14013677835464478, + "learning_rate": 0.00011120868944787697, + "loss": 1.6635387420654297, + "step": 37340 + }, + { + "epoch": 0.11305677054167361, + "grad_norm": 0.13546894490718842, + "learning_rate": 0.00011120489426623959, + "loss": 1.6507087707519532, + "step": 37350 + }, + { + "epoch": 0.11308704009201943, + "grad_norm": 0.13704165816307068, + "learning_rate": 0.00011120109908460218, + "loss": 1.691901969909668, + "step": 37360 + }, + { + "epoch": 0.11311730964236526, + "grad_norm": 0.13104192912578583, + "learning_rate": 0.0001111973039029648, + "loss": 1.646840476989746, + "step": 37370 + }, + { + "epoch": 0.1131475791927111, + "grad_norm": 0.1425362080335617, + "learning_rate": 0.0001111935087213274, + "loss": 1.6263559341430665, + "step": 37380 + }, + { + "epoch": 0.11317784874305692, + "grad_norm": 0.1435733139514923, + "learning_rate": 0.00011118971353969002, + "loss": 1.6201019287109375, + "step": 37390 + }, + { + "epoch": 0.11320811829340276, + "grad_norm": 0.13535894453525543, + "learning_rate": 0.00011118591835805261, + "loss": 1.673004722595215, + "step": 37400 + }, + { + "epoch": 0.11323838784374858, + "grad_norm": 0.13170333206653595, + "learning_rate": 0.00011118212317641523, + "loss": 1.600912094116211, + "step": 37410 + }, + { + "epoch": 0.1132686573940944, + "grad_norm": 0.1548815667629242, + "learning_rate": 0.00011117832799477783, + "loss": 1.6191125869750977, + "step": 37420 + }, + { + "epoch": 0.11329892694444024, + "grad_norm": 0.1521953046321869, + "learning_rate": 0.00011117453281314044, + "loss": 1.613785171508789, + "step": 37430 + }, + { + "epoch": 0.11332919649478607, + "grad_norm": 0.16882261633872986, + "learning_rate": 0.00011117073763150305, + "loss": 1.6524692535400392, + "step": 37440 + }, + { + "epoch": 0.1133594660451319, + "grad_norm": 0.13208922743797302, + "learning_rate": 0.00011116694244986565, + "loss": 1.6340702056884766, + "step": 37450 + }, + { + "epoch": 0.11338973559547773, + "grad_norm": 0.13876192271709442, + "learning_rate": 0.00011116314726822827, + "loss": 1.6326480865478517, + "step": 37460 + }, + { + "epoch": 0.11342000514582355, + "grad_norm": 0.1465439647436142, + "learning_rate": 0.00011115935208659086, + "loss": 1.6326065063476562, + "step": 37470 + }, + { + "epoch": 0.11345027469616939, + "grad_norm": 0.1291520595550537, + "learning_rate": 0.00011115555690495348, + "loss": 1.618848991394043, + "step": 37480 + }, + { + "epoch": 0.11348054424651521, + "grad_norm": 0.14717556536197662, + "learning_rate": 0.00011115176172331608, + "loss": 1.5825157165527344, + "step": 37490 + }, + { + "epoch": 0.11351081379686105, + "grad_norm": 0.12304560840129852, + "learning_rate": 0.0001111479665416787, + "loss": 1.642893409729004, + "step": 37500 + }, + { + "epoch": 0.11351081379686105, + "eval_loss": 1.6366686820983887, + "eval_runtime": 27.7779, + "eval_samples_per_second": 18.0, + "eval_steps_per_second": 1.152, + "step": 37500 + }, + { + "epoch": 0.11354108334720688, + "grad_norm": 0.14860236644744873, + "learning_rate": 0.00011114417136004129, + "loss": 1.59735107421875, + "step": 37510 + }, + { + "epoch": 0.11357135289755271, + "grad_norm": 0.13843569159507751, + "learning_rate": 0.00011114037617840391, + "loss": 1.6253705978393556, + "step": 37520 + }, + { + "epoch": 0.11360162244789854, + "grad_norm": 0.13568530976772308, + "learning_rate": 0.0001111365809967665, + "loss": 1.6332914352416992, + "step": 37530 + }, + { + "epoch": 0.11363189199824436, + "grad_norm": 0.12717914581298828, + "learning_rate": 0.00011113278581512912, + "loss": 1.6234050750732423, + "step": 37540 + }, + { + "epoch": 0.1136621615485902, + "grad_norm": 0.14998973906040192, + "learning_rate": 0.00011112899063349171, + "loss": 1.632335090637207, + "step": 37550 + }, + { + "epoch": 0.11369243109893602, + "grad_norm": 0.15157616138458252, + "learning_rate": 0.00011112519545185433, + "loss": 1.6359966278076172, + "step": 37560 + }, + { + "epoch": 0.11372270064928186, + "grad_norm": 0.14243310689926147, + "learning_rate": 0.00011112140027021692, + "loss": 1.6359903335571289, + "step": 37570 + }, + { + "epoch": 0.11375297019962768, + "grad_norm": 0.14448347687721252, + "learning_rate": 0.00011111760508857954, + "loss": 1.6623575210571289, + "step": 37580 + }, + { + "epoch": 0.11378323974997351, + "grad_norm": 0.13829709589481354, + "learning_rate": 0.00011111380990694215, + "loss": 1.652749252319336, + "step": 37590 + }, + { + "epoch": 0.11381350930031935, + "grad_norm": 0.14601832628250122, + "learning_rate": 0.00011111001472530475, + "loss": 1.6463760375976562, + "step": 37600 + }, + { + "epoch": 0.11384377885066517, + "grad_norm": 0.1439613550901413, + "learning_rate": 0.00011110621954366736, + "loss": 1.6348655700683594, + "step": 37610 + }, + { + "epoch": 0.11387404840101101, + "grad_norm": 0.15701442956924438, + "learning_rate": 0.00011110242436202997, + "loss": 1.6468048095703125, + "step": 37620 + }, + { + "epoch": 0.11390431795135683, + "grad_norm": 0.1635928750038147, + "learning_rate": 0.00011109862918039259, + "loss": 1.6510515213012695, + "step": 37630 + }, + { + "epoch": 0.11393458750170266, + "grad_norm": 0.16254836320877075, + "learning_rate": 0.00011109483399875518, + "loss": 1.6728206634521485, + "step": 37640 + }, + { + "epoch": 0.1139648570520485, + "grad_norm": 0.1348692774772644, + "learning_rate": 0.0001110910388171178, + "loss": 1.6466487884521483, + "step": 37650 + }, + { + "epoch": 0.11399512660239432, + "grad_norm": 0.13164103031158447, + "learning_rate": 0.00011108724363548039, + "loss": 1.6280433654785156, + "step": 37660 + }, + { + "epoch": 0.11402539615274015, + "grad_norm": 0.13807764649391174, + "learning_rate": 0.00011108344845384301, + "loss": 1.6564952850341796, + "step": 37670 + }, + { + "epoch": 0.11405566570308598, + "grad_norm": 0.11729904264211655, + "learning_rate": 0.0001110796532722056, + "loss": 1.6299510955810548, + "step": 37680 + }, + { + "epoch": 0.11408593525343182, + "grad_norm": 0.1378263235092163, + "learning_rate": 0.00011107585809056822, + "loss": 1.6102245330810547, + "step": 37690 + }, + { + "epoch": 0.11411620480377764, + "grad_norm": 0.16037434339523315, + "learning_rate": 0.00011107206290893081, + "loss": 1.6446722030639649, + "step": 37700 + }, + { + "epoch": 0.11414647435412346, + "grad_norm": 0.13973239064216614, + "learning_rate": 0.00011106826772729343, + "loss": 1.6556879043579102, + "step": 37710 + }, + { + "epoch": 0.1141767439044693, + "grad_norm": 0.1380636841058731, + "learning_rate": 0.00011106447254565604, + "loss": 1.6434440612792969, + "step": 37720 + }, + { + "epoch": 0.11420701345481513, + "grad_norm": 0.13721175491809845, + "learning_rate": 0.00011106067736401865, + "loss": 1.6347621917724608, + "step": 37730 + }, + { + "epoch": 0.11423728300516096, + "grad_norm": 0.1503579169511795, + "learning_rate": 0.00011105688218238125, + "loss": 1.621327781677246, + "step": 37740 + }, + { + "epoch": 0.11426755255550679, + "grad_norm": 0.15772973001003265, + "learning_rate": 0.00011105308700074386, + "loss": 1.6142967224121094, + "step": 37750 + }, + { + "epoch": 0.11429782210585261, + "grad_norm": 0.13941873610019684, + "learning_rate": 0.00011104929181910646, + "loss": 1.6373640060424806, + "step": 37760 + }, + { + "epoch": 0.11432809165619845, + "grad_norm": 0.1342151165008545, + "learning_rate": 0.00011104549663746907, + "loss": 1.6607110977172852, + "step": 37770 + }, + { + "epoch": 0.11435836120654427, + "grad_norm": 0.14656679332256317, + "learning_rate": 0.00011104170145583168, + "loss": 1.5982925415039062, + "step": 37780 + }, + { + "epoch": 0.11438863075689011, + "grad_norm": 0.1445535570383072, + "learning_rate": 0.00011103790627419428, + "loss": 1.5927934646606445, + "step": 37790 + }, + { + "epoch": 0.11441890030723594, + "grad_norm": 0.13671688735485077, + "learning_rate": 0.00011103411109255689, + "loss": 1.592750358581543, + "step": 37800 + }, + { + "epoch": 0.11444916985758176, + "grad_norm": 0.15627911686897278, + "learning_rate": 0.0001110303159109195, + "loss": 1.6548595428466797, + "step": 37810 + }, + { + "epoch": 0.1144794394079276, + "grad_norm": 0.15523026883602142, + "learning_rate": 0.0001110265207292821, + "loss": 1.6310907363891602, + "step": 37820 + }, + { + "epoch": 0.11450970895827342, + "grad_norm": 0.16508495807647705, + "learning_rate": 0.00011102272554764472, + "loss": 1.5993562698364259, + "step": 37830 + }, + { + "epoch": 0.11453997850861926, + "grad_norm": 0.13357169926166534, + "learning_rate": 0.00011101893036600732, + "loss": 1.6254112243652343, + "step": 37840 + }, + { + "epoch": 0.11457024805896508, + "grad_norm": 0.12975311279296875, + "learning_rate": 0.00011101513518436993, + "loss": 1.6852684020996094, + "step": 37850 + }, + { + "epoch": 0.11460051760931092, + "grad_norm": 0.1298532634973526, + "learning_rate": 0.00011101134000273254, + "loss": 1.674179458618164, + "step": 37860 + }, + { + "epoch": 0.11463078715965674, + "grad_norm": 0.15960507094860077, + "learning_rate": 0.00011100754482109514, + "loss": 1.5974700927734375, + "step": 37870 + }, + { + "epoch": 0.11466105671000257, + "grad_norm": 0.15909484028816223, + "learning_rate": 0.00011100374963945775, + "loss": 1.6056858062744142, + "step": 37880 + }, + { + "epoch": 0.1146913262603484, + "grad_norm": 0.1613510400056839, + "learning_rate": 0.00011099995445782035, + "loss": 1.635561752319336, + "step": 37890 + }, + { + "epoch": 0.11472159581069423, + "grad_norm": 0.14852496981620789, + "learning_rate": 0.00011099615927618296, + "loss": 1.601515007019043, + "step": 37900 + }, + { + "epoch": 0.11475186536104007, + "grad_norm": 0.123896025121212, + "learning_rate": 0.00011099236409454557, + "loss": 1.6088489532470702, + "step": 37910 + }, + { + "epoch": 0.11478213491138589, + "grad_norm": 0.13321559131145477, + "learning_rate": 0.00011098856891290817, + "loss": 1.6641475677490234, + "step": 37920 + }, + { + "epoch": 0.11481240446173172, + "grad_norm": 0.13732457160949707, + "learning_rate": 0.00011098477373127078, + "loss": 1.625795555114746, + "step": 37930 + }, + { + "epoch": 0.11484267401207755, + "grad_norm": 0.15679273009300232, + "learning_rate": 0.00011098097854963338, + "loss": 1.646634292602539, + "step": 37940 + }, + { + "epoch": 0.11487294356242338, + "grad_norm": 0.15307222306728363, + "learning_rate": 0.00011097718336799599, + "loss": 1.6109413146972655, + "step": 37950 + }, + { + "epoch": 0.11490321311276921, + "grad_norm": 0.15123730897903442, + "learning_rate": 0.00011097338818635861, + "loss": 1.6490522384643556, + "step": 37960 + }, + { + "epoch": 0.11493348266311504, + "grad_norm": 0.15041621029376984, + "learning_rate": 0.0001109695930047212, + "loss": 1.6579875946044922, + "step": 37970 + }, + { + "epoch": 0.11496375221346086, + "grad_norm": 0.12440071254968643, + "learning_rate": 0.00011096579782308382, + "loss": 1.636191177368164, + "step": 37980 + }, + { + "epoch": 0.1149940217638067, + "grad_norm": 0.1366558074951172, + "learning_rate": 0.00011096200264144641, + "loss": 1.6133167266845703, + "step": 37990 + }, + { + "epoch": 0.11502429131415252, + "grad_norm": 0.1507655531167984, + "learning_rate": 0.00011095820745980903, + "loss": 1.6759576797485352, + "step": 38000 + }, + { + "epoch": 0.11502429131415252, + "eval_loss": 1.6279205083847046, + "eval_runtime": 28.2772, + "eval_samples_per_second": 17.682, + "eval_steps_per_second": 1.132, + "step": 38000 + }, + { + "epoch": 0.11505456086449836, + "grad_norm": 0.13548599183559418, + "learning_rate": 0.00011095441227817163, + "loss": 1.6380485534667968, + "step": 38010 + }, + { + "epoch": 0.11508483041484419, + "grad_norm": 0.14832453429698944, + "learning_rate": 0.00011095061709653425, + "loss": 1.6417243957519532, + "step": 38020 + }, + { + "epoch": 0.11511509996519002, + "grad_norm": 0.17521268129348755, + "learning_rate": 0.00011094682191489685, + "loss": 1.6307327270507812, + "step": 38030 + }, + { + "epoch": 0.11514536951553585, + "grad_norm": 0.15322570502758026, + "learning_rate": 0.00011094302673325946, + "loss": 1.5771488189697265, + "step": 38040 + }, + { + "epoch": 0.11517563906588167, + "grad_norm": 0.1314355581998825, + "learning_rate": 0.00011093923155162206, + "loss": 1.6028827667236327, + "step": 38050 + }, + { + "epoch": 0.11520590861622751, + "grad_norm": 0.1446547955274582, + "learning_rate": 0.00011093543636998467, + "loss": 1.6406558990478515, + "step": 38060 + }, + { + "epoch": 0.11523617816657333, + "grad_norm": 0.13299460709095, + "learning_rate": 0.00011093164118834729, + "loss": 1.6355369567871094, + "step": 38070 + }, + { + "epoch": 0.11526644771691917, + "grad_norm": 0.1430647224187851, + "learning_rate": 0.00011092784600670988, + "loss": 1.6174795150756835, + "step": 38080 + }, + { + "epoch": 0.115296717267265, + "grad_norm": 0.13251300156116486, + "learning_rate": 0.0001109240508250725, + "loss": 1.6420963287353516, + "step": 38090 + }, + { + "epoch": 0.11532698681761082, + "grad_norm": 0.14607790112495422, + "learning_rate": 0.00011092025564343509, + "loss": 1.6066856384277344, + "step": 38100 + }, + { + "epoch": 0.11535725636795666, + "grad_norm": 0.14323875308036804, + "learning_rate": 0.00011091646046179771, + "loss": 1.6266530990600585, + "step": 38110 + }, + { + "epoch": 0.11538752591830248, + "grad_norm": 0.15551145374774933, + "learning_rate": 0.0001109126652801603, + "loss": 1.6244520187377929, + "step": 38120 + }, + { + "epoch": 0.11541779546864832, + "grad_norm": 0.14182235300540924, + "learning_rate": 0.00011090887009852292, + "loss": 1.650680160522461, + "step": 38130 + }, + { + "epoch": 0.11544806501899414, + "grad_norm": 0.1605839729309082, + "learning_rate": 0.00011090507491688552, + "loss": 1.667588233947754, + "step": 38140 + }, + { + "epoch": 0.11547833456933997, + "grad_norm": 0.14049024879932404, + "learning_rate": 0.00011090127973524814, + "loss": 1.5896556854248047, + "step": 38150 + }, + { + "epoch": 0.1155086041196858, + "grad_norm": 0.1477675884962082, + "learning_rate": 0.00011089748455361073, + "loss": 1.6165618896484375, + "step": 38160 + }, + { + "epoch": 0.11553887367003163, + "grad_norm": 0.14180566370487213, + "learning_rate": 0.00011089368937197335, + "loss": 1.6306718826293944, + "step": 38170 + }, + { + "epoch": 0.11556914322037747, + "grad_norm": 0.15923939645290375, + "learning_rate": 0.00011088989419033594, + "loss": 1.6646783828735352, + "step": 38180 + }, + { + "epoch": 0.11559941277072329, + "grad_norm": 0.14207632839679718, + "learning_rate": 0.00011088609900869856, + "loss": 1.5864540100097657, + "step": 38190 + }, + { + "epoch": 0.11562968232106913, + "grad_norm": 0.12837207317352295, + "learning_rate": 0.00011088230382706117, + "loss": 1.632959747314453, + "step": 38200 + }, + { + "epoch": 0.11565995187141495, + "grad_norm": 0.14154623448848724, + "learning_rate": 0.00011087850864542377, + "loss": 1.6380870819091797, + "step": 38210 + }, + { + "epoch": 0.11569022142176077, + "grad_norm": 0.12885458767414093, + "learning_rate": 0.00011087471346378638, + "loss": 1.5911765098571777, + "step": 38220 + }, + { + "epoch": 0.11572049097210661, + "grad_norm": 0.14290057122707367, + "learning_rate": 0.00011087091828214898, + "loss": 1.589427375793457, + "step": 38230 + }, + { + "epoch": 0.11575076052245244, + "grad_norm": 0.15541736781597137, + "learning_rate": 0.0001108671231005116, + "loss": 1.6483152389526368, + "step": 38240 + }, + { + "epoch": 0.11578103007279827, + "grad_norm": 0.1393197774887085, + "learning_rate": 0.0001108633279188742, + "loss": 1.6110733032226563, + "step": 38250 + }, + { + "epoch": 0.1158112996231441, + "grad_norm": 0.14205560088157654, + "learning_rate": 0.00011085953273723682, + "loss": 1.6705055236816406, + "step": 38260 + }, + { + "epoch": 0.11584156917348992, + "grad_norm": 0.13573046028614044, + "learning_rate": 0.00011085573755559941, + "loss": 1.6170852661132813, + "step": 38270 + }, + { + "epoch": 0.11587183872383576, + "grad_norm": 0.14108413457870483, + "learning_rate": 0.00011085194237396203, + "loss": 1.6642471313476563, + "step": 38280 + }, + { + "epoch": 0.11590210827418158, + "grad_norm": 0.13787990808486938, + "learning_rate": 0.00011084814719232462, + "loss": 1.6376541137695313, + "step": 38290 + }, + { + "epoch": 0.11593237782452742, + "grad_norm": 0.1490883082151413, + "learning_rate": 0.00011084435201068724, + "loss": 1.611574363708496, + "step": 38300 + }, + { + "epoch": 0.11596264737487325, + "grad_norm": 0.14000585675239563, + "learning_rate": 0.00011084055682904983, + "loss": 1.6303627014160156, + "step": 38310 + }, + { + "epoch": 0.11599291692521907, + "grad_norm": 0.154290571808815, + "learning_rate": 0.00011083676164741245, + "loss": 1.6341938018798827, + "step": 38320 + }, + { + "epoch": 0.11602318647556491, + "grad_norm": 0.12524260580539703, + "learning_rate": 0.00011083296646577506, + "loss": 1.6441329956054687, + "step": 38330 + }, + { + "epoch": 0.11605345602591073, + "grad_norm": 0.14573058485984802, + "learning_rate": 0.00011082917128413766, + "loss": 1.6441593170166016, + "step": 38340 + }, + { + "epoch": 0.11608372557625657, + "grad_norm": 0.14400655031204224, + "learning_rate": 0.00011082537610250027, + "loss": 1.614035415649414, + "step": 38350 + }, + { + "epoch": 0.11611399512660239, + "grad_norm": 0.12722450494766235, + "learning_rate": 0.00011082158092086287, + "loss": 1.5902334213256837, + "step": 38360 + }, + { + "epoch": 0.11614426467694823, + "grad_norm": 0.14054930210113525, + "learning_rate": 0.00011081778573922548, + "loss": 1.6048664093017577, + "step": 38370 + }, + { + "epoch": 0.11617453422729405, + "grad_norm": 0.15935534238815308, + "learning_rate": 0.00011081399055758809, + "loss": 1.6426368713378907, + "step": 38380 + }, + { + "epoch": 0.11620480377763988, + "grad_norm": 0.15581031143665314, + "learning_rate": 0.00011081019537595069, + "loss": 1.6277515411376953, + "step": 38390 + }, + { + "epoch": 0.11623507332798572, + "grad_norm": 0.14791014790534973, + "learning_rate": 0.0001108064001943133, + "loss": 1.6342498779296875, + "step": 38400 + }, + { + "epoch": 0.11626534287833154, + "grad_norm": 0.14828811585903168, + "learning_rate": 0.0001108026050126759, + "loss": 1.6424774169921874, + "step": 38410 + }, + { + "epoch": 0.11629561242867738, + "grad_norm": 0.13367116451263428, + "learning_rate": 0.00011079880983103851, + "loss": 1.5964405059814453, + "step": 38420 + }, + { + "epoch": 0.1163258819790232, + "grad_norm": 0.15733088552951813, + "learning_rate": 0.00011079501464940112, + "loss": 1.6377418518066407, + "step": 38430 + }, + { + "epoch": 0.11635615152936903, + "grad_norm": 0.14810480177402496, + "learning_rate": 0.00011079121946776372, + "loss": 1.6415470123291016, + "step": 38440 + }, + { + "epoch": 0.11638642107971486, + "grad_norm": 0.13593970239162445, + "learning_rate": 0.00011078742428612634, + "loss": 1.66156005859375, + "step": 38450 + }, + { + "epoch": 0.11641669063006069, + "grad_norm": 0.13765321671962738, + "learning_rate": 0.00011078362910448895, + "loss": 1.6596782684326172, + "step": 38460 + }, + { + "epoch": 0.11644696018040653, + "grad_norm": 0.13529595732688904, + "learning_rate": 0.00011077983392285155, + "loss": 1.663189697265625, + "step": 38470 + }, + { + "epoch": 0.11647722973075235, + "grad_norm": 0.14300934970378876, + "learning_rate": 0.00011077603874121416, + "loss": 1.594254207611084, + "step": 38480 + }, + { + "epoch": 0.11650749928109817, + "grad_norm": 0.15003854036331177, + "learning_rate": 0.00011077224355957677, + "loss": 1.6121440887451173, + "step": 38490 + }, + { + "epoch": 0.11653776883144401, + "grad_norm": 0.13142873346805573, + "learning_rate": 0.00011076844837793937, + "loss": 1.6621496200561523, + "step": 38500 + }, + { + "epoch": 0.11653776883144401, + "eval_loss": 1.6609597206115723, + "eval_runtime": 28.0851, + "eval_samples_per_second": 17.803, + "eval_steps_per_second": 1.139, + "step": 38500 + }, + { + "epoch": 0.11656803838178983, + "grad_norm": 0.14106741547584534, + "learning_rate": 0.00011076465319630198, + "loss": 1.6213821411132812, + "step": 38510 + }, + { + "epoch": 0.11659830793213567, + "grad_norm": 0.14332225918769836, + "learning_rate": 0.00011076085801466458, + "loss": 1.5670158386230468, + "step": 38520 + }, + { + "epoch": 0.1166285774824815, + "grad_norm": 0.15915025770664215, + "learning_rate": 0.00011075706283302719, + "loss": 1.6229972839355469, + "step": 38530 + }, + { + "epoch": 0.11665884703282733, + "grad_norm": 0.1495729684829712, + "learning_rate": 0.0001107532676513898, + "loss": 1.6496070861816405, + "step": 38540 + }, + { + "epoch": 0.11668911658317316, + "grad_norm": 0.1346307098865509, + "learning_rate": 0.0001107494724697524, + "loss": 1.588394832611084, + "step": 38550 + }, + { + "epoch": 0.11671938613351898, + "grad_norm": 0.1536579132080078, + "learning_rate": 0.00011074567728811501, + "loss": 1.6697704315185546, + "step": 38560 + }, + { + "epoch": 0.11674965568386482, + "grad_norm": 0.17035330832004547, + "learning_rate": 0.00011074188210647763, + "loss": 1.6351795196533203, + "step": 38570 + }, + { + "epoch": 0.11677992523421064, + "grad_norm": 0.1448252946138382, + "learning_rate": 0.00011073808692484022, + "loss": 1.651005172729492, + "step": 38580 + }, + { + "epoch": 0.11681019478455648, + "grad_norm": 0.13538512587547302, + "learning_rate": 0.00011073429174320284, + "loss": 1.6068342208862305, + "step": 38590 + }, + { + "epoch": 0.1168404643349023, + "grad_norm": 0.13901914656162262, + "learning_rate": 0.00011073049656156543, + "loss": 1.6407403945922852, + "step": 38600 + }, + { + "epoch": 0.11687073388524813, + "grad_norm": 0.14003422856330872, + "learning_rate": 0.00011072670137992805, + "loss": 1.6413494110107423, + "step": 38610 + }, + { + "epoch": 0.11690100343559397, + "grad_norm": 0.13984999060630798, + "learning_rate": 0.00011072290619829064, + "loss": 1.6430660247802735, + "step": 38620 + }, + { + "epoch": 0.11693127298593979, + "grad_norm": 0.15655028820037842, + "learning_rate": 0.00011071911101665326, + "loss": 1.6122608184814453, + "step": 38630 + }, + { + "epoch": 0.11696154253628563, + "grad_norm": 0.14503420889377594, + "learning_rate": 0.00011071531583501587, + "loss": 1.6747039794921874, + "step": 38640 + }, + { + "epoch": 0.11699181208663145, + "grad_norm": 0.1268751472234726, + "learning_rate": 0.00011071152065337847, + "loss": 1.5917234420776367, + "step": 38650 + }, + { + "epoch": 0.11702208163697728, + "grad_norm": 0.1410631388425827, + "learning_rate": 0.00011070772547174108, + "loss": 1.6167131423950196, + "step": 38660 + }, + { + "epoch": 0.11705235118732311, + "grad_norm": 0.14290666580200195, + "learning_rate": 0.00011070393029010369, + "loss": 1.6198602676391602, + "step": 38670 + }, + { + "epoch": 0.11708262073766894, + "grad_norm": 0.12617991864681244, + "learning_rate": 0.00011070013510846629, + "loss": 1.6239416122436523, + "step": 38680 + }, + { + "epoch": 0.11711289028801478, + "grad_norm": 0.13793975114822388, + "learning_rate": 0.0001106963399268289, + "loss": 1.597467041015625, + "step": 38690 + }, + { + "epoch": 0.1171431598383606, + "grad_norm": 0.14499224722385406, + "learning_rate": 0.00011069254474519152, + "loss": 1.6247320175170898, + "step": 38700 + }, + { + "epoch": 0.11717342938870644, + "grad_norm": 0.12246394157409668, + "learning_rate": 0.00011068874956355411, + "loss": 1.6335289001464843, + "step": 38710 + }, + { + "epoch": 0.11720369893905226, + "grad_norm": 0.15803571045398712, + "learning_rate": 0.00011068495438191673, + "loss": 1.6103256225585938, + "step": 38720 + }, + { + "epoch": 0.11723396848939809, + "grad_norm": 0.1453765630722046, + "learning_rate": 0.00011068115920027932, + "loss": 1.6435989379882812, + "step": 38730 + }, + { + "epoch": 0.11726423803974392, + "grad_norm": 0.14890791475772858, + "learning_rate": 0.00011067736401864194, + "loss": 1.632814598083496, + "step": 38740 + }, + { + "epoch": 0.11729450759008975, + "grad_norm": 0.12559323012828827, + "learning_rate": 0.00011067356883700453, + "loss": 1.6284181594848632, + "step": 38750 + }, + { + "epoch": 0.11732477714043558, + "grad_norm": 0.13407450914382935, + "learning_rate": 0.00011066977365536715, + "loss": 1.643665885925293, + "step": 38760 + }, + { + "epoch": 0.11735504669078141, + "grad_norm": 0.15248656272888184, + "learning_rate": 0.00011066597847372975, + "loss": 1.633949089050293, + "step": 38770 + }, + { + "epoch": 0.11738531624112723, + "grad_norm": 0.13604679703712463, + "learning_rate": 0.00011066218329209237, + "loss": 1.653426170349121, + "step": 38780 + }, + { + "epoch": 0.11741558579147307, + "grad_norm": 0.12733276188373566, + "learning_rate": 0.00011065838811045496, + "loss": 1.6704647064208984, + "step": 38790 + }, + { + "epoch": 0.1174458553418189, + "grad_norm": 0.14092904329299927, + "learning_rate": 0.00011065459292881758, + "loss": 1.6328025817871095, + "step": 38800 + }, + { + "epoch": 0.11747612489216473, + "grad_norm": 0.13144904375076294, + "learning_rate": 0.00011065079774718017, + "loss": 1.6201805114746093, + "step": 38810 + }, + { + "epoch": 0.11750639444251056, + "grad_norm": 0.15566736459732056, + "learning_rate": 0.00011064700256554279, + "loss": 1.6424016952514648, + "step": 38820 + }, + { + "epoch": 0.11753666399285638, + "grad_norm": 0.1317291408777237, + "learning_rate": 0.0001106432073839054, + "loss": 1.5879545211791992, + "step": 38830 + }, + { + "epoch": 0.11756693354320222, + "grad_norm": 0.15783603489398956, + "learning_rate": 0.000110639412202268, + "loss": 1.6179702758789063, + "step": 38840 + }, + { + "epoch": 0.11759720309354804, + "grad_norm": 0.14811429381370544, + "learning_rate": 0.00011063561702063062, + "loss": 1.607947540283203, + "step": 38850 + }, + { + "epoch": 0.11762747264389388, + "grad_norm": 0.1297062337398529, + "learning_rate": 0.00011063182183899321, + "loss": 1.629459762573242, + "step": 38860 + }, + { + "epoch": 0.1176577421942397, + "grad_norm": 0.13826175034046173, + "learning_rate": 0.00011062802665735583, + "loss": 1.6478382110595704, + "step": 38870 + }, + { + "epoch": 0.11768801174458553, + "grad_norm": 0.13983145356178284, + "learning_rate": 0.00011062423147571842, + "loss": 1.610438919067383, + "step": 38880 + }, + { + "epoch": 0.11771828129493137, + "grad_norm": 0.14506547152996063, + "learning_rate": 0.00011062043629408104, + "loss": 1.6009288787841798, + "step": 38890 + }, + { + "epoch": 0.11774855084527719, + "grad_norm": 0.15877269208431244, + "learning_rate": 0.00011061664111244364, + "loss": 1.6445480346679688, + "step": 38900 + }, + { + "epoch": 0.11777882039562303, + "grad_norm": 0.12719032168388367, + "learning_rate": 0.00011061284593080626, + "loss": 1.6425167083740235, + "step": 38910 + }, + { + "epoch": 0.11780908994596885, + "grad_norm": 0.12463374435901642, + "learning_rate": 0.00011060905074916885, + "loss": 1.6074089050292968, + "step": 38920 + }, + { + "epoch": 0.11783935949631469, + "grad_norm": 0.1343872845172882, + "learning_rate": 0.00011060525556753147, + "loss": 1.655887985229492, + "step": 38930 + }, + { + "epoch": 0.11786962904666051, + "grad_norm": 0.13222181797027588, + "learning_rate": 0.00011060146038589407, + "loss": 1.6424335479736327, + "step": 38940 + }, + { + "epoch": 0.11789989859700634, + "grad_norm": 0.13878409564495087, + "learning_rate": 0.00011059766520425668, + "loss": 1.6061967849731444, + "step": 38950 + }, + { + "epoch": 0.11793016814735217, + "grad_norm": 0.12713143229484558, + "learning_rate": 0.00011059387002261929, + "loss": 1.6138731002807618, + "step": 38960 + }, + { + "epoch": 0.117960437697698, + "grad_norm": 0.15237875282764435, + "learning_rate": 0.00011059007484098189, + "loss": 1.6053321838378907, + "step": 38970 + }, + { + "epoch": 0.11799070724804384, + "grad_norm": 0.1433531641960144, + "learning_rate": 0.0001105862796593445, + "loss": 1.6169076919555665, + "step": 38980 + }, + { + "epoch": 0.11802097679838966, + "grad_norm": 0.1445121020078659, + "learning_rate": 0.0001105824844777071, + "loss": 1.6456451416015625, + "step": 38990 + }, + { + "epoch": 0.11805124634873548, + "grad_norm": 0.1582903414964676, + "learning_rate": 0.00011057868929606971, + "loss": 1.6437149047851562, + "step": 39000 + }, + { + "epoch": 0.11805124634873548, + "eval_loss": 1.6219011545181274, + "eval_runtime": 27.8698, + "eval_samples_per_second": 17.941, + "eval_steps_per_second": 1.148, + "step": 39000 + }, + { + "epoch": 0.11808151589908132, + "grad_norm": 0.1374429166316986, + "learning_rate": 0.00011057489411443232, + "loss": 1.648343276977539, + "step": 39010 + }, + { + "epoch": 0.11811178544942715, + "grad_norm": 0.12467261403799057, + "learning_rate": 0.00011057109893279492, + "loss": 1.6203205108642578, + "step": 39020 + }, + { + "epoch": 0.11814205499977298, + "grad_norm": 0.15039288997650146, + "learning_rate": 0.00011056730375115753, + "loss": 1.5685067176818848, + "step": 39030 + }, + { + "epoch": 0.1181723245501188, + "grad_norm": 0.1453588455915451, + "learning_rate": 0.00011056350856952015, + "loss": 1.6456111907958983, + "step": 39040 + }, + { + "epoch": 0.11820259410046463, + "grad_norm": 0.13666793704032898, + "learning_rate": 0.00011055971338788274, + "loss": 1.634619140625, + "step": 39050 + }, + { + "epoch": 0.11823286365081047, + "grad_norm": 0.12931150197982788, + "learning_rate": 0.00011055591820624536, + "loss": 1.621063232421875, + "step": 39060 + }, + { + "epoch": 0.11826313320115629, + "grad_norm": 0.1332535743713379, + "learning_rate": 0.00011055212302460797, + "loss": 1.6234657287597656, + "step": 39070 + }, + { + "epoch": 0.11829340275150213, + "grad_norm": 0.1502522975206375, + "learning_rate": 0.00011054832784297057, + "loss": 1.6305496215820312, + "step": 39080 + }, + { + "epoch": 0.11832367230184795, + "grad_norm": 0.14526762068271637, + "learning_rate": 0.00011054453266133318, + "loss": 1.6267324447631837, + "step": 39090 + }, + { + "epoch": 0.11835394185219379, + "grad_norm": 0.13637247681617737, + "learning_rate": 0.00011054073747969578, + "loss": 1.600061798095703, + "step": 39100 + }, + { + "epoch": 0.11838421140253962, + "grad_norm": 0.11907241493463516, + "learning_rate": 0.00011053694229805839, + "loss": 1.5964563369750977, + "step": 39110 + }, + { + "epoch": 0.11841448095288544, + "grad_norm": 0.14153960347175598, + "learning_rate": 0.000110533147116421, + "loss": 1.6436660766601563, + "step": 39120 + }, + { + "epoch": 0.11844475050323128, + "grad_norm": 0.13342060148715973, + "learning_rate": 0.0001105293519347836, + "loss": 1.6709129333496093, + "step": 39130 + }, + { + "epoch": 0.1184750200535771, + "grad_norm": 0.14724615216255188, + "learning_rate": 0.0001105255567531462, + "loss": 1.6487354278564452, + "step": 39140 + }, + { + "epoch": 0.11850528960392294, + "grad_norm": 0.13079850375652313, + "learning_rate": 0.00011052176157150881, + "loss": 1.5766084671020508, + "step": 39150 + }, + { + "epoch": 0.11853555915426876, + "grad_norm": 0.1653294414281845, + "learning_rate": 0.00011051796638987142, + "loss": 1.6082094192504883, + "step": 39160 + }, + { + "epoch": 0.11856582870461459, + "grad_norm": 0.15533579885959625, + "learning_rate": 0.00011051417120823402, + "loss": 1.6055397033691405, + "step": 39170 + }, + { + "epoch": 0.11859609825496042, + "grad_norm": 0.14742091298103333, + "learning_rate": 0.00011051037602659664, + "loss": 1.6009750366210938, + "step": 39180 + }, + { + "epoch": 0.11862636780530625, + "grad_norm": 0.13224738836288452, + "learning_rate": 0.00011050658084495924, + "loss": 1.6764480590820312, + "step": 39190 + }, + { + "epoch": 0.11865663735565209, + "grad_norm": 0.14375369250774384, + "learning_rate": 0.00011050278566332186, + "loss": 1.5874689102172852, + "step": 39200 + }, + { + "epoch": 0.11868690690599791, + "grad_norm": 0.1288384050130844, + "learning_rate": 0.00011049899048168445, + "loss": 1.6560783386230469, + "step": 39210 + }, + { + "epoch": 0.11871717645634373, + "grad_norm": 0.1360829621553421, + "learning_rate": 0.00011049519530004707, + "loss": 1.6434093475341798, + "step": 39220 + }, + { + "epoch": 0.11874744600668957, + "grad_norm": 0.13148324191570282, + "learning_rate": 0.00011049140011840966, + "loss": 1.6579254150390625, + "step": 39230 + }, + { + "epoch": 0.1187777155570354, + "grad_norm": 0.14428773522377014, + "learning_rate": 0.00011048760493677228, + "loss": 1.6358200073242188, + "step": 39240 + }, + { + "epoch": 0.11880798510738123, + "grad_norm": 0.14321692287921906, + "learning_rate": 0.00011048380975513489, + "loss": 1.6398283004760743, + "step": 39250 + }, + { + "epoch": 0.11883825465772706, + "grad_norm": 0.14481347799301147, + "learning_rate": 0.00011048001457349749, + "loss": 1.6405769348144532, + "step": 39260 + }, + { + "epoch": 0.1188685242080729, + "grad_norm": 0.13116222620010376, + "learning_rate": 0.0001104762193918601, + "loss": 1.6318513870239257, + "step": 39270 + }, + { + "epoch": 0.11889879375841872, + "grad_norm": 0.13956192135810852, + "learning_rate": 0.0001104724242102227, + "loss": 1.6297834396362305, + "step": 39280 + }, + { + "epoch": 0.11892906330876454, + "grad_norm": 0.13636621832847595, + "learning_rate": 0.00011046862902858531, + "loss": 1.6086164474487306, + "step": 39290 + }, + { + "epoch": 0.11895933285911038, + "grad_norm": 0.14099939167499542, + "learning_rate": 0.00011046483384694792, + "loss": 1.6448047637939454, + "step": 39300 + }, + { + "epoch": 0.1189896024094562, + "grad_norm": 0.13044136762619019, + "learning_rate": 0.00011046103866531053, + "loss": 1.6355958938598634, + "step": 39310 + }, + { + "epoch": 0.11901987195980204, + "grad_norm": 0.13094136118888855, + "learning_rate": 0.00011045724348367313, + "loss": 1.6411602020263671, + "step": 39320 + }, + { + "epoch": 0.11905014151014787, + "grad_norm": 0.1466827243566513, + "learning_rate": 0.00011045344830203575, + "loss": 1.6584465026855468, + "step": 39330 + }, + { + "epoch": 0.11908041106049369, + "grad_norm": 0.13943001627922058, + "learning_rate": 0.00011044965312039834, + "loss": 1.6726825714111329, + "step": 39340 + }, + { + "epoch": 0.11911068061083953, + "grad_norm": 0.134866401553154, + "learning_rate": 0.00011044585793876096, + "loss": 1.6294910430908203, + "step": 39350 + }, + { + "epoch": 0.11914095016118535, + "grad_norm": 0.13806720077991486, + "learning_rate": 0.00011044206275712355, + "loss": 1.662150001525879, + "step": 39360 + }, + { + "epoch": 0.11917121971153119, + "grad_norm": 0.1478094756603241, + "learning_rate": 0.00011043826757548617, + "loss": 1.6232114791870118, + "step": 39370 + }, + { + "epoch": 0.11920148926187701, + "grad_norm": 0.12663592398166656, + "learning_rate": 0.00011043447239384876, + "loss": 1.6038658142089843, + "step": 39380 + }, + { + "epoch": 0.11923175881222284, + "grad_norm": 0.15276607871055603, + "learning_rate": 0.00011043067721221138, + "loss": 1.604257583618164, + "step": 39390 + }, + { + "epoch": 0.11926202836256868, + "grad_norm": 0.12766033411026, + "learning_rate": 0.00011042688203057397, + "loss": 1.636976432800293, + "step": 39400 + }, + { + "epoch": 0.1192922979129145, + "grad_norm": 0.13613034784793854, + "learning_rate": 0.0001104230868489366, + "loss": 1.6607751846313477, + "step": 39410 + }, + { + "epoch": 0.11932256746326034, + "grad_norm": 0.13557086884975433, + "learning_rate": 0.00011041929166729919, + "loss": 1.5905221939086913, + "step": 39420 + }, + { + "epoch": 0.11935283701360616, + "grad_norm": 0.13802717626094818, + "learning_rate": 0.0001104154964856618, + "loss": 1.6287601470947266, + "step": 39430 + }, + { + "epoch": 0.119383106563952, + "grad_norm": 0.13212402164936066, + "learning_rate": 0.00011041170130402441, + "loss": 1.6403995513916017, + "step": 39440 + }, + { + "epoch": 0.11941337611429782, + "grad_norm": 0.14492173492908478, + "learning_rate": 0.00011040790612238702, + "loss": 1.6537752151489258, + "step": 39450 + }, + { + "epoch": 0.11944364566464365, + "grad_norm": 0.15273430943489075, + "learning_rate": 0.00011040411094074964, + "loss": 1.609361457824707, + "step": 39460 + }, + { + "epoch": 0.11947391521498948, + "grad_norm": 0.15101824700832367, + "learning_rate": 0.00011040031575911223, + "loss": 1.6758323669433595, + "step": 39470 + }, + { + "epoch": 0.11950418476533531, + "grad_norm": 0.17679695785045624, + "learning_rate": 0.00011039652057747485, + "loss": 1.5915050506591797, + "step": 39480 + }, + { + "epoch": 0.11953445431568115, + "grad_norm": 0.15056641399860382, + "learning_rate": 0.00011039272539583744, + "loss": 1.6162790298461913, + "step": 39490 + }, + { + "epoch": 0.11956472386602697, + "grad_norm": 0.14354540407657623, + "learning_rate": 0.00011038893021420006, + "loss": 1.6306425094604493, + "step": 39500 + }, + { + "epoch": 0.11956472386602697, + "eval_loss": 1.6399623155593872, + "eval_runtime": 28.4259, + "eval_samples_per_second": 17.59, + "eval_steps_per_second": 1.126, + "step": 39500 + }, + { + "epoch": 0.1195949934163728, + "grad_norm": 0.13695573806762695, + "learning_rate": 0.00011038513503256265, + "loss": 1.627371597290039, + "step": 39510 + }, + { + "epoch": 0.11962526296671863, + "grad_norm": 0.1399320363998413, + "learning_rate": 0.00011038133985092527, + "loss": 1.6339851379394532, + "step": 39520 + }, + { + "epoch": 0.11965553251706446, + "grad_norm": 0.13247467577457428, + "learning_rate": 0.00011037754466928787, + "loss": 1.6395063400268555, + "step": 39530 + }, + { + "epoch": 0.1196858020674103, + "grad_norm": 0.14647939801216125, + "learning_rate": 0.00011037374948765049, + "loss": 1.6282009124755858, + "step": 39540 + }, + { + "epoch": 0.11971607161775612, + "grad_norm": 0.15852642059326172, + "learning_rate": 0.00011036995430601308, + "loss": 1.5976866722106933, + "step": 39550 + }, + { + "epoch": 0.11974634116810194, + "grad_norm": 0.15155881643295288, + "learning_rate": 0.0001103661591243757, + "loss": 1.643179702758789, + "step": 39560 + }, + { + "epoch": 0.11977661071844778, + "grad_norm": 0.1555330455303192, + "learning_rate": 0.0001103623639427383, + "loss": 1.6069793701171875, + "step": 39570 + }, + { + "epoch": 0.1198068802687936, + "grad_norm": 0.15717464685440063, + "learning_rate": 0.00011035856876110091, + "loss": 1.627654457092285, + "step": 39580 + }, + { + "epoch": 0.11983714981913944, + "grad_norm": 0.14945991337299347, + "learning_rate": 0.00011035477357946352, + "loss": 1.659639549255371, + "step": 39590 + }, + { + "epoch": 0.11986741936948526, + "grad_norm": 0.1402890682220459, + "learning_rate": 0.00011035097839782612, + "loss": 1.6282020568847657, + "step": 39600 + }, + { + "epoch": 0.1198976889198311, + "grad_norm": 0.1318565011024475, + "learning_rate": 0.00011034718321618873, + "loss": 1.6193130493164063, + "step": 39610 + }, + { + "epoch": 0.11992795847017693, + "grad_norm": 0.13893628120422363, + "learning_rate": 0.00011034338803455133, + "loss": 1.6788658142089843, + "step": 39620 + }, + { + "epoch": 0.11995822802052275, + "grad_norm": 0.13543176651000977, + "learning_rate": 0.00011033959285291394, + "loss": 1.6280986785888671, + "step": 39630 + }, + { + "epoch": 0.11998849757086859, + "grad_norm": 0.151985764503479, + "learning_rate": 0.00011033579767127654, + "loss": 1.6326492309570313, + "step": 39640 + }, + { + "epoch": 0.12001876712121441, + "grad_norm": 0.1496872752904892, + "learning_rate": 0.00011033200248963916, + "loss": 1.6472740173339844, + "step": 39650 + }, + { + "epoch": 0.12004903667156025, + "grad_norm": 0.14059679210186005, + "learning_rate": 0.00011032820730800176, + "loss": 1.6223064422607423, + "step": 39660 + }, + { + "epoch": 0.12007930622190607, + "grad_norm": 0.13665097951889038, + "learning_rate": 0.00011032441212636438, + "loss": 1.6506034851074218, + "step": 39670 + }, + { + "epoch": 0.1201095757722519, + "grad_norm": 0.1347370147705078, + "learning_rate": 0.00011032061694472698, + "loss": 1.66009521484375, + "step": 39680 + }, + { + "epoch": 0.12013984532259774, + "grad_norm": 0.1490640938282013, + "learning_rate": 0.00011031682176308959, + "loss": 1.6236381530761719, + "step": 39690 + }, + { + "epoch": 0.12017011487294356, + "grad_norm": 0.15553632378578186, + "learning_rate": 0.0001103130265814522, + "loss": 1.5982866287231445, + "step": 39700 + }, + { + "epoch": 0.1202003844232894, + "grad_norm": 0.1248253583908081, + "learning_rate": 0.0001103092313998148, + "loss": 1.6258434295654296, + "step": 39710 + }, + { + "epoch": 0.12023065397363522, + "grad_norm": 0.1249435544013977, + "learning_rate": 0.0001103054362181774, + "loss": 1.6434745788574219, + "step": 39720 + }, + { + "epoch": 0.12026092352398104, + "grad_norm": 0.1338450163602829, + "learning_rate": 0.00011030164103654001, + "loss": 1.6387439727783204, + "step": 39730 + }, + { + "epoch": 0.12029119307432688, + "grad_norm": 0.13941362500190735, + "learning_rate": 0.00011029784585490262, + "loss": 1.6284805297851563, + "step": 39740 + }, + { + "epoch": 0.1203214626246727, + "grad_norm": 0.1483360230922699, + "learning_rate": 0.00011029405067326522, + "loss": 1.6705469131469726, + "step": 39750 + }, + { + "epoch": 0.12035173217501854, + "grad_norm": 0.14684082567691803, + "learning_rate": 0.00011029025549162783, + "loss": 1.606988525390625, + "step": 39760 + }, + { + "epoch": 0.12038200172536437, + "grad_norm": 0.1304865926504135, + "learning_rate": 0.00011028646030999044, + "loss": 1.6337512969970702, + "step": 39770 + }, + { + "epoch": 0.1204122712757102, + "grad_norm": 0.14052346348762512, + "learning_rate": 0.00011028266512835304, + "loss": 1.6367469787597657, + "step": 39780 + }, + { + "epoch": 0.12044254082605603, + "grad_norm": 0.13869024813175201, + "learning_rate": 0.00011027886994671565, + "loss": 1.655364418029785, + "step": 39790 + }, + { + "epoch": 0.12047281037640185, + "grad_norm": 0.14206573367118835, + "learning_rate": 0.00011027507476507825, + "loss": 1.6632120132446289, + "step": 39800 + }, + { + "epoch": 0.12050307992674769, + "grad_norm": 0.1393343061208725, + "learning_rate": 0.00011027127958344087, + "loss": 1.6591758728027344, + "step": 39810 + }, + { + "epoch": 0.12053334947709352, + "grad_norm": 0.1406344324350357, + "learning_rate": 0.00011026748440180347, + "loss": 1.5932214736938477, + "step": 39820 + }, + { + "epoch": 0.12056361902743935, + "grad_norm": 0.13652363419532776, + "learning_rate": 0.00011026368922016609, + "loss": 1.6487092971801758, + "step": 39830 + }, + { + "epoch": 0.12059388857778518, + "grad_norm": 0.1348930299282074, + "learning_rate": 0.00011025989403852868, + "loss": 1.6538040161132812, + "step": 39840 + }, + { + "epoch": 0.120624158128131, + "grad_norm": 0.15427732467651367, + "learning_rate": 0.0001102560988568913, + "loss": 1.5895238876342774, + "step": 39850 + }, + { + "epoch": 0.12065442767847684, + "grad_norm": 0.14436957240104675, + "learning_rate": 0.0001102523036752539, + "loss": 1.6327512741088868, + "step": 39860 + }, + { + "epoch": 0.12068469722882266, + "grad_norm": 0.13581138849258423, + "learning_rate": 0.00011024850849361651, + "loss": 1.642601203918457, + "step": 39870 + }, + { + "epoch": 0.1207149667791685, + "grad_norm": 0.13969644904136658, + "learning_rate": 0.00011024471331197911, + "loss": 1.653955841064453, + "step": 39880 + }, + { + "epoch": 0.12074523632951432, + "grad_norm": 0.14021609723567963, + "learning_rate": 0.00011024091813034172, + "loss": 1.6262868881225585, + "step": 39890 + }, + { + "epoch": 0.12077550587986015, + "grad_norm": 0.13751628994941711, + "learning_rate": 0.00011023712294870433, + "loss": 1.6120067596435548, + "step": 39900 + }, + { + "epoch": 0.12080577543020599, + "grad_norm": 0.13251449167728424, + "learning_rate": 0.00011023332776706693, + "loss": 1.6384693145751954, + "step": 39910 + }, + { + "epoch": 0.12083604498055181, + "grad_norm": 0.14322592318058014, + "learning_rate": 0.00011022953258542955, + "loss": 1.6560800552368165, + "step": 39920 + }, + { + "epoch": 0.12086631453089765, + "grad_norm": 0.15788203477859497, + "learning_rate": 0.00011022573740379214, + "loss": 1.624046516418457, + "step": 39930 + }, + { + "epoch": 0.12089658408124347, + "grad_norm": 0.133340522646904, + "learning_rate": 0.00011022194222215476, + "loss": 1.6303535461425782, + "step": 39940 + }, + { + "epoch": 0.12092685363158931, + "grad_norm": 0.14714166522026062, + "learning_rate": 0.00011021814704051736, + "loss": 1.638787841796875, + "step": 39950 + }, + { + "epoch": 0.12095712318193513, + "grad_norm": 0.15010549128055573, + "learning_rate": 0.00011021435185887998, + "loss": 1.5844141960144043, + "step": 39960 + }, + { + "epoch": 0.12098739273228096, + "grad_norm": 0.1415908932685852, + "learning_rate": 0.00011021055667724257, + "loss": 1.6240493774414062, + "step": 39970 + }, + { + "epoch": 0.1210176622826268, + "grad_norm": 0.1537216454744339, + "learning_rate": 0.00011020676149560519, + "loss": 1.632636260986328, + "step": 39980 + }, + { + "epoch": 0.12104793183297262, + "grad_norm": 0.145690456032753, + "learning_rate": 0.00011020296631396778, + "loss": 1.6543052673339844, + "step": 39990 + }, + { + "epoch": 0.12107820138331846, + "grad_norm": 0.16533805429935455, + "learning_rate": 0.0001101991711323304, + "loss": 1.627050018310547, + "step": 40000 + }, + { + "epoch": 0.12107820138331846, + "eval_loss": 1.5965251922607422, + "eval_runtime": 27.6472, + "eval_samples_per_second": 18.085, + "eval_steps_per_second": 1.157, + "step": 40000 + }, + { + "epoch": 0.12110847093366428, + "grad_norm": 0.1522308886051178, + "learning_rate": 0.00011019537595069299, + "loss": 1.6259021759033203, + "step": 40010 + }, + { + "epoch": 0.1211387404840101, + "grad_norm": 0.18109218776226044, + "learning_rate": 0.00011019158076905561, + "loss": 1.6069755554199219, + "step": 40020 + }, + { + "epoch": 0.12116901003435594, + "grad_norm": 0.14158056676387787, + "learning_rate": 0.0001101877855874182, + "loss": 1.596226119995117, + "step": 40030 + }, + { + "epoch": 0.12119927958470177, + "grad_norm": 0.15174968540668488, + "learning_rate": 0.00011018399040578082, + "loss": 1.6412141799926758, + "step": 40040 + }, + { + "epoch": 0.1212295491350476, + "grad_norm": 0.1496388465166092, + "learning_rate": 0.00011018019522414343, + "loss": 1.6558080673217774, + "step": 40050 + }, + { + "epoch": 0.12125981868539343, + "grad_norm": 0.14255133271217346, + "learning_rate": 0.00011017640004250604, + "loss": 1.636002540588379, + "step": 40060 + }, + { + "epoch": 0.12129008823573925, + "grad_norm": 0.1424453854560852, + "learning_rate": 0.00011017260486086865, + "loss": 1.645989227294922, + "step": 40070 + }, + { + "epoch": 0.12132035778608509, + "grad_norm": 0.16167916357517242, + "learning_rate": 0.00011016880967923125, + "loss": 1.642243003845215, + "step": 40080 + }, + { + "epoch": 0.12135062733643091, + "grad_norm": 0.14333891868591309, + "learning_rate": 0.00011016501449759387, + "loss": 1.6220857620239257, + "step": 40090 + }, + { + "epoch": 0.12138089688677675, + "grad_norm": 0.15165114402770996, + "learning_rate": 0.00011016121931595646, + "loss": 1.6496011734008789, + "step": 40100 + }, + { + "epoch": 0.12141116643712258, + "grad_norm": 0.1403682678937912, + "learning_rate": 0.00011015742413431908, + "loss": 1.6140312194824218, + "step": 40110 + }, + { + "epoch": 0.12144143598746841, + "grad_norm": 0.14986619353294373, + "learning_rate": 0.00011015362895268167, + "loss": 1.6007783889770508, + "step": 40120 + }, + { + "epoch": 0.12147170553781424, + "grad_norm": 0.13426800072193146, + "learning_rate": 0.00011014983377104429, + "loss": 1.62677059173584, + "step": 40130 + }, + { + "epoch": 0.12150197508816006, + "grad_norm": 0.1296129673719406, + "learning_rate": 0.00011014603858940688, + "loss": 1.6243574142456054, + "step": 40140 + }, + { + "epoch": 0.1215322446385059, + "grad_norm": 0.14411136507987976, + "learning_rate": 0.0001101422434077695, + "loss": 1.6171049118041991, + "step": 40150 + }, + { + "epoch": 0.12156251418885172, + "grad_norm": 0.1414037048816681, + "learning_rate": 0.0001101384482261321, + "loss": 1.6280719757080078, + "step": 40160 + }, + { + "epoch": 0.12159278373919756, + "grad_norm": 0.1479933112859726, + "learning_rate": 0.00011013465304449471, + "loss": 1.6813232421875, + "step": 40170 + }, + { + "epoch": 0.12162305328954338, + "grad_norm": 0.14544856548309326, + "learning_rate": 0.00011013085786285732, + "loss": 1.620064926147461, + "step": 40180 + }, + { + "epoch": 0.12165332283988921, + "grad_norm": 0.13262642920017242, + "learning_rate": 0.00011012706268121993, + "loss": 1.6443416595458984, + "step": 40190 + }, + { + "epoch": 0.12168359239023505, + "grad_norm": 0.1396198570728302, + "learning_rate": 0.00011012326749958253, + "loss": 1.6260553359985352, + "step": 40200 + }, + { + "epoch": 0.12171386194058087, + "grad_norm": 0.1487874984741211, + "learning_rate": 0.00011011947231794514, + "loss": 1.6414512634277343, + "step": 40210 + }, + { + "epoch": 0.12174413149092671, + "grad_norm": 0.1454562246799469, + "learning_rate": 0.00011011567713630774, + "loss": 1.664850616455078, + "step": 40220 + }, + { + "epoch": 0.12177440104127253, + "grad_norm": 0.14477397501468658, + "learning_rate": 0.00011011188195467035, + "loss": 1.6625005722045898, + "step": 40230 + }, + { + "epoch": 0.12180467059161836, + "grad_norm": 0.14909252524375916, + "learning_rate": 0.00011010808677303296, + "loss": 1.6195293426513673, + "step": 40240 + }, + { + "epoch": 0.1218349401419642, + "grad_norm": 0.13157224655151367, + "learning_rate": 0.00011010429159139556, + "loss": 1.6297983169555663, + "step": 40250 + }, + { + "epoch": 0.12186520969231002, + "grad_norm": 0.13408075273036957, + "learning_rate": 0.00011010049640975818, + "loss": 1.6471452713012695, + "step": 40260 + }, + { + "epoch": 0.12189547924265585, + "grad_norm": 0.13148027658462524, + "learning_rate": 0.00011009670122812077, + "loss": 1.6535446166992187, + "step": 40270 + }, + { + "epoch": 0.12192574879300168, + "grad_norm": 0.13048481941223145, + "learning_rate": 0.0001100929060464834, + "loss": 1.6631610870361329, + "step": 40280 + }, + { + "epoch": 0.12195601834334752, + "grad_norm": 0.1376458704471588, + "learning_rate": 0.000110089110864846, + "loss": 1.6473445892333984, + "step": 40290 + }, + { + "epoch": 0.12198628789369334, + "grad_norm": 0.12545841932296753, + "learning_rate": 0.0001100853156832086, + "loss": 1.6450468063354493, + "step": 40300 + }, + { + "epoch": 0.12201655744403916, + "grad_norm": 0.14642152190208435, + "learning_rate": 0.00011008152050157121, + "loss": 1.634202003479004, + "step": 40310 + }, + { + "epoch": 0.122046826994385, + "grad_norm": 0.13095475733280182, + "learning_rate": 0.00011007772531993382, + "loss": 1.635678482055664, + "step": 40320 + }, + { + "epoch": 0.12207709654473083, + "grad_norm": 0.15171916782855988, + "learning_rate": 0.00011007393013829642, + "loss": 1.6281646728515624, + "step": 40330 + }, + { + "epoch": 0.12210736609507666, + "grad_norm": 0.15342164039611816, + "learning_rate": 0.00011007013495665903, + "loss": 1.609295654296875, + "step": 40340 + }, + { + "epoch": 0.12213763564542249, + "grad_norm": 0.14759458601474762, + "learning_rate": 0.00011006633977502164, + "loss": 1.6083580017089845, + "step": 40350 + }, + { + "epoch": 0.12216790519576831, + "grad_norm": 0.1314408779144287, + "learning_rate": 0.00011006254459338424, + "loss": 1.6358016967773437, + "step": 40360 + }, + { + "epoch": 0.12219817474611415, + "grad_norm": 0.15432173013687134, + "learning_rate": 0.00011005874941174685, + "loss": 1.6006168365478515, + "step": 40370 + }, + { + "epoch": 0.12222844429645997, + "grad_norm": 0.14741730690002441, + "learning_rate": 0.00011005495423010945, + "loss": 1.6206506729125976, + "step": 40380 + }, + { + "epoch": 0.12225871384680581, + "grad_norm": 0.13516175746917725, + "learning_rate": 0.00011005115904847206, + "loss": 1.6023345947265626, + "step": 40390 + }, + { + "epoch": 0.12228898339715163, + "grad_norm": 0.13776050508022308, + "learning_rate": 0.00011004736386683466, + "loss": 1.6296882629394531, + "step": 40400 + }, + { + "epoch": 0.12231925294749746, + "grad_norm": 0.1581951528787613, + "learning_rate": 0.00011004356868519727, + "loss": 1.6108543395996093, + "step": 40410 + }, + { + "epoch": 0.1223495224978433, + "grad_norm": 0.13436739146709442, + "learning_rate": 0.00011003977350355989, + "loss": 1.6305465698242188, + "step": 40420 + }, + { + "epoch": 0.12237979204818912, + "grad_norm": 0.12735655903816223, + "learning_rate": 0.00011003597832192248, + "loss": 1.5865216255187988, + "step": 40430 + }, + { + "epoch": 0.12241006159853496, + "grad_norm": 0.12530192732810974, + "learning_rate": 0.0001100321831402851, + "loss": 1.6110641479492187, + "step": 40440 + }, + { + "epoch": 0.12244033114888078, + "grad_norm": 0.13960014283657074, + "learning_rate": 0.0001100283879586477, + "loss": 1.6690574645996095, + "step": 40450 + }, + { + "epoch": 0.1224706006992266, + "grad_norm": 0.133758544921875, + "learning_rate": 0.00011002459277701031, + "loss": 1.624757957458496, + "step": 40460 + }, + { + "epoch": 0.12250087024957244, + "grad_norm": 0.12751007080078125, + "learning_rate": 0.00011002079759537292, + "loss": 1.612969207763672, + "step": 40470 + }, + { + "epoch": 0.12253113979991827, + "grad_norm": 0.14617185294628143, + "learning_rate": 0.00011001700241373553, + "loss": 1.6528606414794922, + "step": 40480 + }, + { + "epoch": 0.1225614093502641, + "grad_norm": 0.14673683047294617, + "learning_rate": 0.00011001320723209813, + "loss": 1.6440185546875, + "step": 40490 + }, + { + "epoch": 0.12259167890060993, + "grad_norm": 0.15387682616710663, + "learning_rate": 0.00011000941205046074, + "loss": 1.5912650108337403, + "step": 40500 + }, + { + "epoch": 0.12259167890060993, + "eval_loss": 1.6392128467559814, + "eval_runtime": 27.6903, + "eval_samples_per_second": 18.057, + "eval_steps_per_second": 1.156, + "step": 40500 + }, + { + "epoch": 0.12262194845095577, + "grad_norm": 0.13033263385295868, + "learning_rate": 0.00011000561686882334, + "loss": 1.612900161743164, + "step": 40510 + }, + { + "epoch": 0.12265221800130159, + "grad_norm": 0.14639224112033844, + "learning_rate": 0.00011000182168718595, + "loss": 1.6215316772460937, + "step": 40520 + }, + { + "epoch": 0.12268248755164742, + "grad_norm": 0.16174978017807007, + "learning_rate": 0.00010999802650554856, + "loss": 1.6103960037231446, + "step": 40530 + }, + { + "epoch": 0.12271275710199325, + "grad_norm": 0.12868806719779968, + "learning_rate": 0.00010999423132391116, + "loss": 1.6116718292236327, + "step": 40540 + }, + { + "epoch": 0.12274302665233908, + "grad_norm": 0.12454783916473389, + "learning_rate": 0.00010999043614227378, + "loss": 1.6181814193725585, + "step": 40550 + }, + { + "epoch": 0.12277329620268491, + "grad_norm": 0.12913921475410461, + "learning_rate": 0.00010998664096063637, + "loss": 1.6441940307617187, + "step": 40560 + }, + { + "epoch": 0.12280356575303074, + "grad_norm": 0.14830292761325836, + "learning_rate": 0.00010998284577899899, + "loss": 1.6232593536376954, + "step": 40570 + }, + { + "epoch": 0.12283383530337656, + "grad_norm": 0.15072114765644073, + "learning_rate": 0.00010997905059736159, + "loss": 1.649362564086914, + "step": 40580 + }, + { + "epoch": 0.1228641048537224, + "grad_norm": 0.11982211470603943, + "learning_rate": 0.0001099752554157242, + "loss": 1.5882109642028808, + "step": 40590 + }, + { + "epoch": 0.12289437440406822, + "grad_norm": 0.13179616630077362, + "learning_rate": 0.0001099714602340868, + "loss": 1.6154193878173828, + "step": 40600 + }, + { + "epoch": 0.12292464395441406, + "grad_norm": 0.1447441428899765, + "learning_rate": 0.00010996766505244942, + "loss": 1.6262224197387696, + "step": 40610 + }, + { + "epoch": 0.12295491350475989, + "grad_norm": 0.13426926732063293, + "learning_rate": 0.00010996386987081201, + "loss": 1.633066177368164, + "step": 40620 + }, + { + "epoch": 0.12298518305510571, + "grad_norm": 0.1460207998752594, + "learning_rate": 0.00010996007468917463, + "loss": 1.577811050415039, + "step": 40630 + }, + { + "epoch": 0.12301545260545155, + "grad_norm": 0.13141249120235443, + "learning_rate": 0.00010995627950753722, + "loss": 1.6760480880737305, + "step": 40640 + }, + { + "epoch": 0.12304572215579737, + "grad_norm": 0.13341303169727325, + "learning_rate": 0.00010995248432589984, + "loss": 1.639712142944336, + "step": 40650 + }, + { + "epoch": 0.12307599170614321, + "grad_norm": 0.12705758213996887, + "learning_rate": 0.00010994868914426243, + "loss": 1.6171863555908204, + "step": 40660 + }, + { + "epoch": 0.12310626125648903, + "grad_norm": 0.16016265749931335, + "learning_rate": 0.00010994489396262505, + "loss": 1.6270828247070312, + "step": 40670 + }, + { + "epoch": 0.12313653080683487, + "grad_norm": 0.1446055769920349, + "learning_rate": 0.00010994109878098767, + "loss": 1.621792984008789, + "step": 40680 + }, + { + "epoch": 0.1231668003571807, + "grad_norm": 0.14969728887081146, + "learning_rate": 0.00010993730359935026, + "loss": 1.6025588989257813, + "step": 40690 + }, + { + "epoch": 0.12319706990752652, + "grad_norm": 0.1511041820049286, + "learning_rate": 0.00010993350841771288, + "loss": 1.6222326278686523, + "step": 40700 + }, + { + "epoch": 0.12322733945787236, + "grad_norm": 0.12905462086200714, + "learning_rate": 0.00010992971323607548, + "loss": 1.6460771560668945, + "step": 40710 + }, + { + "epoch": 0.12325760900821818, + "grad_norm": 0.14818957448005676, + "learning_rate": 0.0001099259180544381, + "loss": 1.5825674057006835, + "step": 40720 + }, + { + "epoch": 0.12328787855856402, + "grad_norm": 0.14450529217720032, + "learning_rate": 0.00010992212287280069, + "loss": 1.6361568450927735, + "step": 40730 + }, + { + "epoch": 0.12331814810890984, + "grad_norm": 0.1400623470544815, + "learning_rate": 0.00010991832769116331, + "loss": 1.6794593811035157, + "step": 40740 + }, + { + "epoch": 0.12334841765925567, + "grad_norm": 0.14485587179660797, + "learning_rate": 0.0001099145325095259, + "loss": 1.6212932586669921, + "step": 40750 + }, + { + "epoch": 0.1233786872096015, + "grad_norm": 0.1278696209192276, + "learning_rate": 0.00010991073732788852, + "loss": 1.6484806060791015, + "step": 40760 + }, + { + "epoch": 0.12340895675994733, + "grad_norm": 0.1279991865158081, + "learning_rate": 0.00010990694214625111, + "loss": 1.618414306640625, + "step": 40770 + }, + { + "epoch": 0.12343922631029317, + "grad_norm": 0.12059763073921204, + "learning_rate": 0.00010990314696461373, + "loss": 1.65069637298584, + "step": 40780 + }, + { + "epoch": 0.12346949586063899, + "grad_norm": 0.15163789689540863, + "learning_rate": 0.00010989935178297634, + "loss": 1.6647382736206056, + "step": 40790 + }, + { + "epoch": 0.12349976541098481, + "grad_norm": 0.13705188035964966, + "learning_rate": 0.00010989555660133894, + "loss": 1.661387825012207, + "step": 40800 + }, + { + "epoch": 0.12353003496133065, + "grad_norm": 0.12456528097391129, + "learning_rate": 0.00010989176141970155, + "loss": 1.6531038284301758, + "step": 40810 + }, + { + "epoch": 0.12356030451167647, + "grad_norm": 0.15310263633728027, + "learning_rate": 0.00010988796623806416, + "loss": 1.6498506546020508, + "step": 40820 + }, + { + "epoch": 0.12359057406202231, + "grad_norm": 0.14896522462368011, + "learning_rate": 0.00010988417105642676, + "loss": 1.6025245666503907, + "step": 40830 + }, + { + "epoch": 0.12362084361236814, + "grad_norm": 0.12469589710235596, + "learning_rate": 0.00010988037587478937, + "loss": 1.577909278869629, + "step": 40840 + }, + { + "epoch": 0.12365111316271397, + "grad_norm": 0.1516575962305069, + "learning_rate": 0.00010987658069315197, + "loss": 1.5877562522888184, + "step": 40850 + }, + { + "epoch": 0.1236813827130598, + "grad_norm": 0.13306760787963867, + "learning_rate": 0.00010987278551151458, + "loss": 1.6543182373046874, + "step": 40860 + }, + { + "epoch": 0.12371165226340562, + "grad_norm": 0.13878609240055084, + "learning_rate": 0.0001098689903298772, + "loss": 1.606937789916992, + "step": 40870 + }, + { + "epoch": 0.12374192181375146, + "grad_norm": 0.1406795233488083, + "learning_rate": 0.00010986519514823979, + "loss": 1.63375244140625, + "step": 40880 + }, + { + "epoch": 0.12377219136409728, + "grad_norm": 0.13731347024440765, + "learning_rate": 0.00010986139996660241, + "loss": 1.6748546600341796, + "step": 40890 + }, + { + "epoch": 0.12380246091444312, + "grad_norm": 0.15804125368595123, + "learning_rate": 0.000109857604784965, + "loss": 1.626971435546875, + "step": 40900 + }, + { + "epoch": 0.12383273046478895, + "grad_norm": 0.1315164566040039, + "learning_rate": 0.00010985380960332762, + "loss": 1.6745185852050781, + "step": 40910 + }, + { + "epoch": 0.12386300001513477, + "grad_norm": 0.13509772717952728, + "learning_rate": 0.00010985001442169023, + "loss": 1.6295024871826171, + "step": 40920 + }, + { + "epoch": 0.12389326956548061, + "grad_norm": 0.13915324211120605, + "learning_rate": 0.00010984621924005283, + "loss": 1.6352081298828125, + "step": 40930 + }, + { + "epoch": 0.12392353911582643, + "grad_norm": 0.13059163093566895, + "learning_rate": 0.00010984242405841544, + "loss": 1.5960628509521484, + "step": 40940 + }, + { + "epoch": 0.12395380866617227, + "grad_norm": 0.1253858506679535, + "learning_rate": 0.00010983862887677805, + "loss": 1.6548768997192382, + "step": 40950 + }, + { + "epoch": 0.12398407821651809, + "grad_norm": 0.13546524941921234, + "learning_rate": 0.00010983483369514065, + "loss": 1.6016670227050782, + "step": 40960 + }, + { + "epoch": 0.12401434776686392, + "grad_norm": 0.13245490193367004, + "learning_rate": 0.00010983103851350326, + "loss": 1.6100460052490235, + "step": 40970 + }, + { + "epoch": 0.12404461731720975, + "grad_norm": 0.1372687667608261, + "learning_rate": 0.00010982724333186586, + "loss": 1.6198247909545898, + "step": 40980 + }, + { + "epoch": 0.12407488686755558, + "grad_norm": 0.14254707098007202, + "learning_rate": 0.00010982344815022847, + "loss": 1.6594640731811523, + "step": 40990 + }, + { + "epoch": 0.12410515641790142, + "grad_norm": 0.12657856941223145, + "learning_rate": 0.00010981965296859108, + "loss": 1.6581914901733399, + "step": 41000 + }, + { + "epoch": 0.12410515641790142, + "eval_loss": 1.6230593919754028, + "eval_runtime": 28.1448, + "eval_samples_per_second": 17.765, + "eval_steps_per_second": 1.137, + "step": 41000 + }, + { + "epoch": 0.12413542596824724, + "grad_norm": 0.14058227837085724, + "learning_rate": 0.00010981585778695368, + "loss": 1.650217628479004, + "step": 41010 + }, + { + "epoch": 0.12416569551859308, + "grad_norm": 0.1387016326189041, + "learning_rate": 0.00010981206260531629, + "loss": 1.612245750427246, + "step": 41020 + }, + { + "epoch": 0.1241959650689389, + "grad_norm": 0.13434408605098724, + "learning_rate": 0.00010980826742367891, + "loss": 1.6220775604248048, + "step": 41030 + }, + { + "epoch": 0.12422623461928473, + "grad_norm": 0.15048716962337494, + "learning_rate": 0.0001098044722420415, + "loss": 1.61999568939209, + "step": 41040 + }, + { + "epoch": 0.12425650416963056, + "grad_norm": 0.1348404437303543, + "learning_rate": 0.00010980067706040412, + "loss": 1.6508880615234376, + "step": 41050 + }, + { + "epoch": 0.12428677371997639, + "grad_norm": 0.14708521962165833, + "learning_rate": 0.00010979688187876671, + "loss": 1.6524635314941407, + "step": 41060 + }, + { + "epoch": 0.12431704327032223, + "grad_norm": 0.1574539840221405, + "learning_rate": 0.00010979308669712933, + "loss": 1.6222801208496094, + "step": 41070 + }, + { + "epoch": 0.12434731282066805, + "grad_norm": 0.1360493153333664, + "learning_rate": 0.00010978929151549194, + "loss": 1.6334978103637696, + "step": 41080 + }, + { + "epoch": 0.12437758237101387, + "grad_norm": 0.1237308531999588, + "learning_rate": 0.00010978549633385454, + "loss": 1.6212745666503907, + "step": 41090 + }, + { + "epoch": 0.12440785192135971, + "grad_norm": 0.14073000848293304, + "learning_rate": 0.00010978170115221715, + "loss": 1.5942937850952148, + "step": 41100 + }, + { + "epoch": 0.12443812147170553, + "grad_norm": 0.1502731591463089, + "learning_rate": 0.00010977790597057976, + "loss": 1.5668377876281738, + "step": 41110 + }, + { + "epoch": 0.12446839102205137, + "grad_norm": 0.13248248398303986, + "learning_rate": 0.00010977411078894236, + "loss": 1.6720756530761718, + "step": 41120 + }, + { + "epoch": 0.1244986605723972, + "grad_norm": 0.14862193167209625, + "learning_rate": 0.00010977031560730497, + "loss": 1.5980340003967286, + "step": 41130 + }, + { + "epoch": 0.12452893012274302, + "grad_norm": 0.14639389514923096, + "learning_rate": 0.00010976652042566757, + "loss": 1.6413286209106446, + "step": 41140 + }, + { + "epoch": 0.12455919967308886, + "grad_norm": 0.13997696340084076, + "learning_rate": 0.00010976272524403018, + "loss": 1.5944084167480468, + "step": 41150 + }, + { + "epoch": 0.12458946922343468, + "grad_norm": 0.13860616087913513, + "learning_rate": 0.0001097589300623928, + "loss": 1.616744613647461, + "step": 41160 + }, + { + "epoch": 0.12461973877378052, + "grad_norm": 0.12781503796577454, + "learning_rate": 0.00010975513488075539, + "loss": 1.6240234375, + "step": 41170 + }, + { + "epoch": 0.12465000832412634, + "grad_norm": 0.13992206752300262, + "learning_rate": 0.00010975133969911801, + "loss": 1.650282096862793, + "step": 41180 + }, + { + "epoch": 0.12468027787447218, + "grad_norm": 0.13140293955802917, + "learning_rate": 0.0001097475445174806, + "loss": 1.6143976211547852, + "step": 41190 + }, + { + "epoch": 0.124710547424818, + "grad_norm": 0.13911615312099457, + "learning_rate": 0.00010974374933584322, + "loss": 1.628251838684082, + "step": 41200 + }, + { + "epoch": 0.12474081697516383, + "grad_norm": 0.13437294960021973, + "learning_rate": 0.00010973995415420581, + "loss": 1.6528079986572266, + "step": 41210 + }, + { + "epoch": 0.12477108652550967, + "grad_norm": 0.13786464929580688, + "learning_rate": 0.00010973615897256843, + "loss": 1.651983642578125, + "step": 41220 + }, + { + "epoch": 0.12480135607585549, + "grad_norm": 0.12631306052207947, + "learning_rate": 0.00010973236379093103, + "loss": 1.6629714965820312, + "step": 41230 + }, + { + "epoch": 0.12483162562620133, + "grad_norm": 0.12679389119148254, + "learning_rate": 0.00010972856860929365, + "loss": 1.643347930908203, + "step": 41240 + }, + { + "epoch": 0.12486189517654715, + "grad_norm": 0.13270846009254456, + "learning_rate": 0.00010972477342765624, + "loss": 1.5866172790527344, + "step": 41250 + }, + { + "epoch": 0.12489216472689298, + "grad_norm": 0.14930002391338348, + "learning_rate": 0.00010972097824601886, + "loss": 1.6544483184814454, + "step": 41260 + }, + { + "epoch": 0.12492243427723881, + "grad_norm": 0.12680168449878693, + "learning_rate": 0.00010971718306438145, + "loss": 1.647599983215332, + "step": 41270 + }, + { + "epoch": 0.12495270382758464, + "grad_norm": 0.1297338902950287, + "learning_rate": 0.00010971338788274407, + "loss": 1.6248106002807616, + "step": 41280 + }, + { + "epoch": 0.12498297337793048, + "grad_norm": 0.14578546583652496, + "learning_rate": 0.00010970959270110669, + "loss": 1.6690933227539062, + "step": 41290 + }, + { + "epoch": 0.1250132429282763, + "grad_norm": 0.15696972608566284, + "learning_rate": 0.00010970579751946928, + "loss": 1.6542829513549804, + "step": 41300 + }, + { + "epoch": 0.12504351247862214, + "grad_norm": 0.14840079843997955, + "learning_rate": 0.0001097020023378319, + "loss": 1.6206802368164062, + "step": 41310 + }, + { + "epoch": 0.12507378202896796, + "grad_norm": 0.1534077227115631, + "learning_rate": 0.0001096982071561945, + "loss": 1.628097152709961, + "step": 41320 + }, + { + "epoch": 0.12510405157931379, + "grad_norm": 0.14409111440181732, + "learning_rate": 0.00010969441197455711, + "loss": 1.6107017517089843, + "step": 41330 + }, + { + "epoch": 0.1251343211296596, + "grad_norm": 0.13656260073184967, + "learning_rate": 0.0001096906167929197, + "loss": 1.6020742416381837, + "step": 41340 + }, + { + "epoch": 0.12516459068000546, + "grad_norm": 0.13224153220653534, + "learning_rate": 0.00010968682161128232, + "loss": 1.619390106201172, + "step": 41350 + }, + { + "epoch": 0.12519486023035128, + "grad_norm": 0.1519973874092102, + "learning_rate": 0.00010968302642964492, + "loss": 1.6493518829345704, + "step": 41360 + }, + { + "epoch": 0.1252251297806971, + "grad_norm": 0.14213033020496368, + "learning_rate": 0.00010967923124800754, + "loss": 1.657684326171875, + "step": 41370 + }, + { + "epoch": 0.12525539933104293, + "grad_norm": 0.12971477210521698, + "learning_rate": 0.00010967543606637013, + "loss": 1.611539077758789, + "step": 41380 + }, + { + "epoch": 0.12528566888138876, + "grad_norm": 0.13990838825702667, + "learning_rate": 0.00010967164088473275, + "loss": 1.6101524353027343, + "step": 41390 + }, + { + "epoch": 0.1253159384317346, + "grad_norm": 0.12808217108249664, + "learning_rate": 0.00010966784570309535, + "loss": 1.6106590270996093, + "step": 41400 + }, + { + "epoch": 0.12534620798208043, + "grad_norm": 0.14169475436210632, + "learning_rate": 0.00010966405052145796, + "loss": 1.6262887954711913, + "step": 41410 + }, + { + "epoch": 0.12537647753242626, + "grad_norm": 0.12728209793567657, + "learning_rate": 0.00010966025533982057, + "loss": 1.6454547882080077, + "step": 41420 + }, + { + "epoch": 0.12540674708277208, + "grad_norm": 0.14737483859062195, + "learning_rate": 0.00010965646015818317, + "loss": 1.5741939544677734, + "step": 41430 + }, + { + "epoch": 0.1254370166331179, + "grad_norm": 0.15067467093467712, + "learning_rate": 0.00010965266497654578, + "loss": 1.6484169006347655, + "step": 41440 + }, + { + "epoch": 0.12546728618346376, + "grad_norm": 0.1525532752275467, + "learning_rate": 0.00010964886979490838, + "loss": 1.638812255859375, + "step": 41450 + }, + { + "epoch": 0.12549755573380958, + "grad_norm": 0.13905036449432373, + "learning_rate": 0.00010964507461327099, + "loss": 1.6419706344604492, + "step": 41460 + }, + { + "epoch": 0.1255278252841554, + "grad_norm": 0.13998979330062866, + "learning_rate": 0.0001096412794316336, + "loss": 1.6754207611083984, + "step": 41470 + }, + { + "epoch": 0.12555809483450123, + "grad_norm": 0.15136194229125977, + "learning_rate": 0.00010963748424999622, + "loss": 1.5862407684326172, + "step": 41480 + }, + { + "epoch": 0.12558836438484705, + "grad_norm": 0.12852057814598083, + "learning_rate": 0.00010963368906835881, + "loss": 1.5953468322753905, + "step": 41490 + }, + { + "epoch": 0.1256186339351929, + "grad_norm": 0.12070894986391068, + "learning_rate": 0.00010962989388672143, + "loss": 1.628166389465332, + "step": 41500 + }, + { + "epoch": 0.1256186339351929, + "eval_loss": 1.6464712619781494, + "eval_runtime": 28.6267, + "eval_samples_per_second": 17.466, + "eval_steps_per_second": 1.118, + "step": 41500 + }, + { + "epoch": 0.12564890348553873, + "grad_norm": 0.14500339329242706, + "learning_rate": 0.00010962609870508402, + "loss": 1.6199760437011719, + "step": 41510 + }, + { + "epoch": 0.12567917303588455, + "grad_norm": 0.14909419417381287, + "learning_rate": 0.00010962230352344664, + "loss": 1.655720329284668, + "step": 41520 + }, + { + "epoch": 0.12570944258623037, + "grad_norm": 0.1799120157957077, + "learning_rate": 0.00010961850834180925, + "loss": 1.600640869140625, + "step": 41530 + }, + { + "epoch": 0.1257397121365762, + "grad_norm": 0.15246422588825226, + "learning_rate": 0.00010961471316017185, + "loss": 1.6147979736328124, + "step": 41540 + }, + { + "epoch": 0.12576998168692205, + "grad_norm": 0.14680634438991547, + "learning_rate": 0.00010961091797853446, + "loss": 1.6318363189697265, + "step": 41550 + }, + { + "epoch": 0.12580025123726787, + "grad_norm": 0.14031927287578583, + "learning_rate": 0.00010960712279689706, + "loss": 1.6644025802612306, + "step": 41560 + }, + { + "epoch": 0.1258305207876137, + "grad_norm": 0.13002583384513855, + "learning_rate": 0.00010960332761525967, + "loss": 1.6430368423461914, + "step": 41570 + }, + { + "epoch": 0.12586079033795952, + "grad_norm": 0.15676963329315186, + "learning_rate": 0.00010959953243362228, + "loss": 1.5865219116210938, + "step": 41580 + }, + { + "epoch": 0.12589105988830535, + "grad_norm": 0.134613499045372, + "learning_rate": 0.00010959573725198488, + "loss": 1.5915593147277831, + "step": 41590 + }, + { + "epoch": 0.1259213294386512, + "grad_norm": 0.14711007475852966, + "learning_rate": 0.00010959194207034749, + "loss": 1.6516040802001952, + "step": 41600 + }, + { + "epoch": 0.12595159898899702, + "grad_norm": 0.15534062683582306, + "learning_rate": 0.0001095881468887101, + "loss": 1.6073604583740235, + "step": 41610 + }, + { + "epoch": 0.12598186853934285, + "grad_norm": 0.14408715069293976, + "learning_rate": 0.0001095843517070727, + "loss": 1.6039857864379883, + "step": 41620 + }, + { + "epoch": 0.12601213808968867, + "grad_norm": 0.158005490899086, + "learning_rate": 0.0001095805565254353, + "loss": 1.6567981719970704, + "step": 41630 + }, + { + "epoch": 0.12604240764003452, + "grad_norm": 0.12322672456502914, + "learning_rate": 0.00010957676134379791, + "loss": 1.6087226867675781, + "step": 41640 + }, + { + "epoch": 0.12607267719038034, + "grad_norm": 0.14465296268463135, + "learning_rate": 0.00010957296616216052, + "loss": 1.6218002319335938, + "step": 41650 + }, + { + "epoch": 0.12610294674072617, + "grad_norm": 0.1319049447774887, + "learning_rate": 0.00010956917098052314, + "loss": 1.6278026580810547, + "step": 41660 + }, + { + "epoch": 0.126133216291072, + "grad_norm": 0.1559010148048401, + "learning_rate": 0.00010956537579888573, + "loss": 1.6515846252441406, + "step": 41670 + }, + { + "epoch": 0.12616348584141782, + "grad_norm": 0.12545886635780334, + "learning_rate": 0.00010956158061724835, + "loss": 1.6353309631347657, + "step": 41680 + }, + { + "epoch": 0.12619375539176367, + "grad_norm": 0.13278663158416748, + "learning_rate": 0.00010955778543561095, + "loss": 1.635124397277832, + "step": 41690 + }, + { + "epoch": 0.1262240249421095, + "grad_norm": 0.12856730818748474, + "learning_rate": 0.00010955399025397356, + "loss": 1.5983882904052735, + "step": 41700 + }, + { + "epoch": 0.12625429449245532, + "grad_norm": 0.1323273777961731, + "learning_rate": 0.00010955019507233617, + "loss": 1.584986972808838, + "step": 41710 + }, + { + "epoch": 0.12628456404280114, + "grad_norm": 0.12801846861839294, + "learning_rate": 0.00010954639989069877, + "loss": 1.6128900527954102, + "step": 41720 + }, + { + "epoch": 0.12631483359314696, + "grad_norm": 0.15283343195915222, + "learning_rate": 0.00010954260470906138, + "loss": 1.6511064529418946, + "step": 41730 + }, + { + "epoch": 0.12634510314349282, + "grad_norm": 0.14116846024990082, + "learning_rate": 0.00010953880952742398, + "loss": 1.6460966110229491, + "step": 41740 + }, + { + "epoch": 0.12637537269383864, + "grad_norm": 0.13944457471370697, + "learning_rate": 0.00010953501434578659, + "loss": 1.6413108825683593, + "step": 41750 + }, + { + "epoch": 0.12640564224418446, + "grad_norm": 0.13071702420711517, + "learning_rate": 0.0001095312191641492, + "loss": 1.6664056777954102, + "step": 41760 + }, + { + "epoch": 0.1264359117945303, + "grad_norm": 0.13900640606880188, + "learning_rate": 0.00010952742398251182, + "loss": 1.6274932861328124, + "step": 41770 + }, + { + "epoch": 0.1264661813448761, + "grad_norm": 0.13440614938735962, + "learning_rate": 0.00010952362880087441, + "loss": 1.6307743072509766, + "step": 41780 + }, + { + "epoch": 0.12649645089522196, + "grad_norm": 0.12495823204517365, + "learning_rate": 0.00010951983361923703, + "loss": 1.617950439453125, + "step": 41790 + }, + { + "epoch": 0.1265267204455678, + "grad_norm": 0.13177940249443054, + "learning_rate": 0.00010951603843759962, + "loss": 1.6302350997924804, + "step": 41800 + }, + { + "epoch": 0.1265569899959136, + "grad_norm": 0.16062186658382416, + "learning_rate": 0.00010951224325596224, + "loss": 1.6233436584472656, + "step": 41810 + }, + { + "epoch": 0.12658725954625943, + "grad_norm": 0.12955333292484283, + "learning_rate": 0.00010950844807432483, + "loss": 1.6169523239135741, + "step": 41820 + }, + { + "epoch": 0.12661752909660526, + "grad_norm": 0.14504633843898773, + "learning_rate": 0.00010950465289268745, + "loss": 1.6472885131835937, + "step": 41830 + }, + { + "epoch": 0.1266477986469511, + "grad_norm": 0.14168277382850647, + "learning_rate": 0.00010950085771105004, + "loss": 1.6342948913574218, + "step": 41840 + }, + { + "epoch": 0.12667806819729693, + "grad_norm": 0.14638319611549377, + "learning_rate": 0.00010949706252941266, + "loss": 1.6265928268432617, + "step": 41850 + }, + { + "epoch": 0.12670833774764276, + "grad_norm": 0.15403904020786285, + "learning_rate": 0.00010949326734777526, + "loss": 1.6636816024780274, + "step": 41860 + }, + { + "epoch": 0.12673860729798858, + "grad_norm": 0.14840008318424225, + "learning_rate": 0.00010948947216613788, + "loss": 1.6159259796142578, + "step": 41870 + }, + { + "epoch": 0.1267688768483344, + "grad_norm": 0.1282159835100174, + "learning_rate": 0.00010948567698450048, + "loss": 1.665998649597168, + "step": 41880 + }, + { + "epoch": 0.12679914639868026, + "grad_norm": 0.14250826835632324, + "learning_rate": 0.00010948188180286309, + "loss": 1.6493946075439454, + "step": 41890 + }, + { + "epoch": 0.12682941594902608, + "grad_norm": 0.14246445894241333, + "learning_rate": 0.0001094780866212257, + "loss": 1.6578765869140626, + "step": 41900 + }, + { + "epoch": 0.1268596854993719, + "grad_norm": 0.13467787206172943, + "learning_rate": 0.0001094742914395883, + "loss": 1.6231555938720703, + "step": 41910 + }, + { + "epoch": 0.12688995504971773, + "grad_norm": 0.15044240653514862, + "learning_rate": 0.00010947049625795092, + "loss": 1.627889060974121, + "step": 41920 + }, + { + "epoch": 0.12692022460006355, + "grad_norm": 0.12388388067483902, + "learning_rate": 0.00010946670107631351, + "loss": 1.6054821014404297, + "step": 41930 + }, + { + "epoch": 0.1269504941504094, + "grad_norm": 0.15708027780056, + "learning_rate": 0.00010946290589467613, + "loss": 1.6577508926391602, + "step": 41940 + }, + { + "epoch": 0.12698076370075523, + "grad_norm": 0.1390896588563919, + "learning_rate": 0.00010945911071303872, + "loss": 1.6685287475585937, + "step": 41950 + }, + { + "epoch": 0.12701103325110105, + "grad_norm": 0.13343247771263123, + "learning_rate": 0.00010945531553140134, + "loss": 1.6124347686767577, + "step": 41960 + }, + { + "epoch": 0.12704130280144688, + "grad_norm": 0.14486443996429443, + "learning_rate": 0.00010945152034976393, + "loss": 1.6075990676879883, + "step": 41970 + }, + { + "epoch": 0.12707157235179273, + "grad_norm": 0.13607558608055115, + "learning_rate": 0.00010944772516812655, + "loss": 1.608839225769043, + "step": 41980 + }, + { + "epoch": 0.12710184190213855, + "grad_norm": 0.1699160635471344, + "learning_rate": 0.00010944392998648915, + "loss": 1.6895925521850585, + "step": 41990 + }, + { + "epoch": 0.12713211145248438, + "grad_norm": 0.14012637734413147, + "learning_rate": 0.00010944013480485177, + "loss": 1.6060630798339843, + "step": 42000 + }, + { + "epoch": 0.12713211145248438, + "eval_loss": 1.62443208694458, + "eval_runtime": 28.2839, + "eval_samples_per_second": 17.678, + "eval_steps_per_second": 1.131, + "step": 42000 + }, + { + "epoch": 0.1271623810028302, + "grad_norm": 0.1337386816740036, + "learning_rate": 0.00010943633962321436, + "loss": 1.6326726913452148, + "step": 42010 + }, + { + "epoch": 0.12719265055317602, + "grad_norm": 0.1326654702425003, + "learning_rate": 0.00010943254444157698, + "loss": 1.6610794067382812, + "step": 42020 + }, + { + "epoch": 0.12722292010352187, + "grad_norm": 0.1371884047985077, + "learning_rate": 0.00010942874925993958, + "loss": 1.639328384399414, + "step": 42030 + }, + { + "epoch": 0.1272531896538677, + "grad_norm": 0.1308259516954422, + "learning_rate": 0.00010942495407830219, + "loss": 1.6521568298339844, + "step": 42040 + }, + { + "epoch": 0.12728345920421352, + "grad_norm": 0.14784109592437744, + "learning_rate": 0.0001094211588966648, + "loss": 1.6195581436157227, + "step": 42050 + }, + { + "epoch": 0.12731372875455935, + "grad_norm": 0.14227551221847534, + "learning_rate": 0.0001094173637150274, + "loss": 1.5970868110656737, + "step": 42060 + }, + { + "epoch": 0.12734399830490517, + "grad_norm": 0.14668038487434387, + "learning_rate": 0.00010941356853339001, + "loss": 1.6286643981933593, + "step": 42070 + }, + { + "epoch": 0.12737426785525102, + "grad_norm": 0.142417311668396, + "learning_rate": 0.00010940977335175261, + "loss": 1.652838897705078, + "step": 42080 + }, + { + "epoch": 0.12740453740559685, + "grad_norm": 0.14883248507976532, + "learning_rate": 0.00010940597817011523, + "loss": 1.6192281723022461, + "step": 42090 + }, + { + "epoch": 0.12743480695594267, + "grad_norm": 0.1343364268541336, + "learning_rate": 0.00010940218298847783, + "loss": 1.5910856246948242, + "step": 42100 + }, + { + "epoch": 0.1274650765062885, + "grad_norm": 0.13860705494880676, + "learning_rate": 0.00010939838780684044, + "loss": 1.6208835601806642, + "step": 42110 + }, + { + "epoch": 0.12749534605663432, + "grad_norm": 0.13561023771762848, + "learning_rate": 0.00010939459262520304, + "loss": 1.6292243957519532, + "step": 42120 + }, + { + "epoch": 0.12752561560698017, + "grad_norm": 0.14385052025318146, + "learning_rate": 0.00010939079744356566, + "loss": 1.62445011138916, + "step": 42130 + }, + { + "epoch": 0.127555885157326, + "grad_norm": 0.13303114473819733, + "learning_rate": 0.00010938700226192826, + "loss": 1.5974766731262207, + "step": 42140 + }, + { + "epoch": 0.12758615470767182, + "grad_norm": 0.14686983823776245, + "learning_rate": 0.00010938320708029087, + "loss": 1.5991393089294434, + "step": 42150 + }, + { + "epoch": 0.12761642425801764, + "grad_norm": 0.14288881421089172, + "learning_rate": 0.00010937941189865347, + "loss": 1.6002948760986329, + "step": 42160 + }, + { + "epoch": 0.12764669380836347, + "grad_norm": 0.13219161331653595, + "learning_rate": 0.00010937561671701608, + "loss": 1.660354995727539, + "step": 42170 + }, + { + "epoch": 0.12767696335870932, + "grad_norm": 0.13466410338878632, + "learning_rate": 0.00010937182153537869, + "loss": 1.612156867980957, + "step": 42180 + }, + { + "epoch": 0.12770723290905514, + "grad_norm": 0.14376157522201538, + "learning_rate": 0.00010936802635374129, + "loss": 1.5943443298339843, + "step": 42190 + }, + { + "epoch": 0.12773750245940096, + "grad_norm": 0.14631953835487366, + "learning_rate": 0.0001093642311721039, + "loss": 1.5985942840576173, + "step": 42200 + }, + { + "epoch": 0.1277677720097468, + "grad_norm": 0.12716174125671387, + "learning_rate": 0.0001093604359904665, + "loss": 1.6175518035888672, + "step": 42210 + }, + { + "epoch": 0.1277980415600926, + "grad_norm": 0.14645400643348694, + "learning_rate": 0.00010935664080882911, + "loss": 1.6036849975585938, + "step": 42220 + }, + { + "epoch": 0.12782831111043846, + "grad_norm": 0.14562514424324036, + "learning_rate": 0.00010935284562719172, + "loss": 1.6691608428955078, + "step": 42230 + }, + { + "epoch": 0.1278585806607843, + "grad_norm": 0.12734076380729675, + "learning_rate": 0.00010934905044555432, + "loss": 1.6288436889648437, + "step": 42240 + }, + { + "epoch": 0.1278888502111301, + "grad_norm": 0.14630769193172455, + "learning_rate": 0.00010934525526391693, + "loss": 1.5803656578063965, + "step": 42250 + }, + { + "epoch": 0.12791911976147594, + "grad_norm": 0.14478915929794312, + "learning_rate": 0.00010934146008227953, + "loss": 1.6192964553833007, + "step": 42260 + }, + { + "epoch": 0.12794938931182176, + "grad_norm": 0.1509561538696289, + "learning_rate": 0.00010933766490064215, + "loss": 1.6261056900024413, + "step": 42270 + }, + { + "epoch": 0.1279796588621676, + "grad_norm": 0.12338361144065857, + "learning_rate": 0.00010933386971900475, + "loss": 1.624078369140625, + "step": 42280 + }, + { + "epoch": 0.12800992841251344, + "grad_norm": 0.1521591991186142, + "learning_rate": 0.00010933007453736737, + "loss": 1.6587234497070313, + "step": 42290 + }, + { + "epoch": 0.12804019796285926, + "grad_norm": 0.1221848800778389, + "learning_rate": 0.00010932627935572997, + "loss": 1.6287622451782227, + "step": 42300 + }, + { + "epoch": 0.12807046751320508, + "grad_norm": 0.13936613500118256, + "learning_rate": 0.00010932248417409258, + "loss": 1.6239852905273438, + "step": 42310 + }, + { + "epoch": 0.12810073706355093, + "grad_norm": 0.12728697061538696, + "learning_rate": 0.00010931868899245518, + "loss": 1.6502315521240234, + "step": 42320 + }, + { + "epoch": 0.12813100661389676, + "grad_norm": 0.13808751106262207, + "learning_rate": 0.00010931489381081779, + "loss": 1.6399471282958984, + "step": 42330 + }, + { + "epoch": 0.12816127616424258, + "grad_norm": 0.12904301285743713, + "learning_rate": 0.0001093110986291804, + "loss": 1.6216949462890624, + "step": 42340 + }, + { + "epoch": 0.1281915457145884, + "grad_norm": 0.13429757952690125, + "learning_rate": 0.000109307303447543, + "loss": 1.6512422561645508, + "step": 42350 + }, + { + "epoch": 0.12822181526493423, + "grad_norm": 0.14785845577716827, + "learning_rate": 0.00010930350826590561, + "loss": 1.6461711883544923, + "step": 42360 + }, + { + "epoch": 0.12825208481528008, + "grad_norm": 0.15122739970684052, + "learning_rate": 0.00010929971308426821, + "loss": 1.6473037719726562, + "step": 42370 + }, + { + "epoch": 0.1282823543656259, + "grad_norm": 0.14520317316055298, + "learning_rate": 0.00010929591790263082, + "loss": 1.6837390899658202, + "step": 42380 + }, + { + "epoch": 0.12831262391597173, + "grad_norm": 0.1462959498167038, + "learning_rate": 0.00010929212272099343, + "loss": 1.603509521484375, + "step": 42390 + }, + { + "epoch": 0.12834289346631755, + "grad_norm": 0.1626748889684677, + "learning_rate": 0.00010928832753935604, + "loss": 1.643590545654297, + "step": 42400 + }, + { + "epoch": 0.12837316301666338, + "grad_norm": 0.1302793025970459, + "learning_rate": 0.00010928453235771864, + "loss": 1.639963150024414, + "step": 42410 + }, + { + "epoch": 0.12840343256700923, + "grad_norm": 0.14374494552612305, + "learning_rate": 0.00010928073717608126, + "loss": 1.5939191818237304, + "step": 42420 + }, + { + "epoch": 0.12843370211735505, + "grad_norm": 0.13342979550361633, + "learning_rate": 0.00010927694199444385, + "loss": 1.613346481323242, + "step": 42430 + }, + { + "epoch": 0.12846397166770088, + "grad_norm": 0.1590629667043686, + "learning_rate": 0.00010927314681280647, + "loss": 1.6375102996826172, + "step": 42440 + }, + { + "epoch": 0.1284942412180467, + "grad_norm": 0.14752815663814545, + "learning_rate": 0.00010926935163116906, + "loss": 1.6417011260986327, + "step": 42450 + }, + { + "epoch": 0.12852451076839252, + "grad_norm": 0.14103178679943085, + "learning_rate": 0.00010926555644953168, + "loss": 1.6255481719970704, + "step": 42460 + }, + { + "epoch": 0.12855478031873838, + "grad_norm": 0.13387727737426758, + "learning_rate": 0.00010926176126789427, + "loss": 1.6080684661865234, + "step": 42470 + }, + { + "epoch": 0.1285850498690842, + "grad_norm": 0.14949534833431244, + "learning_rate": 0.00010925796608625689, + "loss": 1.6781368255615234, + "step": 42480 + }, + { + "epoch": 0.12861531941943002, + "grad_norm": 0.14760561287403107, + "learning_rate": 0.0001092541709046195, + "loss": 1.6455087661743164, + "step": 42490 + }, + { + "epoch": 0.12864558896977585, + "grad_norm": 0.13887281715869904, + "learning_rate": 0.0001092503757229821, + "loss": 1.6036079406738282, + "step": 42500 + }, + { + "epoch": 0.12864558896977585, + "eval_loss": 1.6146230697631836, + "eval_runtime": 28.2291, + "eval_samples_per_second": 17.712, + "eval_steps_per_second": 1.134, + "step": 42500 + }, + { + "epoch": 0.12867585852012167, + "grad_norm": 0.14820095896720886, + "learning_rate": 0.00010924658054134472, + "loss": 1.5806764602661132, + "step": 42510 + }, + { + "epoch": 0.12870612807046752, + "grad_norm": 0.12855640053749084, + "learning_rate": 0.00010924278535970732, + "loss": 1.641514205932617, + "step": 42520 + }, + { + "epoch": 0.12873639762081335, + "grad_norm": 0.14353181421756744, + "learning_rate": 0.00010923899017806994, + "loss": 1.638612937927246, + "step": 42530 + }, + { + "epoch": 0.12876666717115917, + "grad_norm": 0.14110694825649261, + "learning_rate": 0.00010923519499643253, + "loss": 1.658578109741211, + "step": 42540 + }, + { + "epoch": 0.128796936721505, + "grad_norm": 0.1488867998123169, + "learning_rate": 0.00010923139981479515, + "loss": 1.6241416931152344, + "step": 42550 + }, + { + "epoch": 0.12882720627185082, + "grad_norm": 0.13166800141334534, + "learning_rate": 0.00010922760463315774, + "loss": 1.6050006866455078, + "step": 42560 + }, + { + "epoch": 0.12885747582219667, + "grad_norm": 0.11589530110359192, + "learning_rate": 0.00010922380945152036, + "loss": 1.630122756958008, + "step": 42570 + }, + { + "epoch": 0.1288877453725425, + "grad_norm": 0.12493986636400223, + "learning_rate": 0.00010922001426988295, + "loss": 1.620551872253418, + "step": 42580 + }, + { + "epoch": 0.12891801492288832, + "grad_norm": 0.12474576383829117, + "learning_rate": 0.00010921621908824557, + "loss": 1.620347023010254, + "step": 42590 + }, + { + "epoch": 0.12894828447323414, + "grad_norm": 0.13347865641117096, + "learning_rate": 0.00010921242390660816, + "loss": 1.6570030212402345, + "step": 42600 + }, + { + "epoch": 0.12897855402357997, + "grad_norm": 0.1408914029598236, + "learning_rate": 0.00010920862872497078, + "loss": 1.6043403625488282, + "step": 42610 + }, + { + "epoch": 0.12900882357392582, + "grad_norm": 0.14203576743602753, + "learning_rate": 0.00010920483354333338, + "loss": 1.65931396484375, + "step": 42620 + }, + { + "epoch": 0.12903909312427164, + "grad_norm": 0.13732530176639557, + "learning_rate": 0.000109201038361696, + "loss": 1.6196479797363281, + "step": 42630 + }, + { + "epoch": 0.12906936267461747, + "grad_norm": 0.1255672574043274, + "learning_rate": 0.0001091972431800586, + "loss": 1.6283329010009766, + "step": 42640 + }, + { + "epoch": 0.1290996322249633, + "grad_norm": 0.14999531209468842, + "learning_rate": 0.00010919344799842121, + "loss": 1.6051986694335938, + "step": 42650 + }, + { + "epoch": 0.12912990177530914, + "grad_norm": 0.12213832885026932, + "learning_rate": 0.00010918965281678381, + "loss": 1.6129877090454101, + "step": 42660 + }, + { + "epoch": 0.12916017132565497, + "grad_norm": 0.13882632553577423, + "learning_rate": 0.00010918585763514642, + "loss": 1.6110462188720702, + "step": 42670 + }, + { + "epoch": 0.1291904408760008, + "grad_norm": 0.12948442995548248, + "learning_rate": 0.00010918206245350902, + "loss": 1.5912835121154785, + "step": 42680 + }, + { + "epoch": 0.1292207104263466, + "grad_norm": 0.14008669555187225, + "learning_rate": 0.00010917826727187163, + "loss": 1.6070058822631836, + "step": 42690 + }, + { + "epoch": 0.12925097997669244, + "grad_norm": 0.13462677597999573, + "learning_rate": 0.00010917447209023425, + "loss": 1.6418014526367188, + "step": 42700 + }, + { + "epoch": 0.1292812495270383, + "grad_norm": 0.1388729214668274, + "learning_rate": 0.00010917067690859684, + "loss": 1.659792709350586, + "step": 42710 + }, + { + "epoch": 0.1293115190773841, + "grad_norm": 0.12664483487606049, + "learning_rate": 0.00010916688172695946, + "loss": 1.6074913024902344, + "step": 42720 + }, + { + "epoch": 0.12934178862772994, + "grad_norm": 0.14578983187675476, + "learning_rate": 0.00010916308654532205, + "loss": 1.6099092483520507, + "step": 42730 + }, + { + "epoch": 0.12937205817807576, + "grad_norm": 0.13167895376682281, + "learning_rate": 0.00010915929136368467, + "loss": 1.5976799964904784, + "step": 42740 + }, + { + "epoch": 0.12940232772842158, + "grad_norm": 0.12346866726875305, + "learning_rate": 0.00010915549618204727, + "loss": 1.6133747100830078, + "step": 42750 + }, + { + "epoch": 0.12943259727876744, + "grad_norm": 0.13208235800266266, + "learning_rate": 0.00010915170100040989, + "loss": 1.6056182861328125, + "step": 42760 + }, + { + "epoch": 0.12946286682911326, + "grad_norm": 0.1387878954410553, + "learning_rate": 0.00010914790581877249, + "loss": 1.6472366333007813, + "step": 42770 + }, + { + "epoch": 0.12949313637945908, + "grad_norm": 0.1511135697364807, + "learning_rate": 0.0001091441106371351, + "loss": 1.6119930267333984, + "step": 42780 + }, + { + "epoch": 0.1295234059298049, + "grad_norm": 0.1300230324268341, + "learning_rate": 0.0001091403154554977, + "loss": 1.6328136444091796, + "step": 42790 + }, + { + "epoch": 0.12955367548015073, + "grad_norm": 0.14616543054580688, + "learning_rate": 0.00010913652027386031, + "loss": 1.5913934707641602, + "step": 42800 + }, + { + "epoch": 0.12958394503049658, + "grad_norm": 0.13721780478954315, + "learning_rate": 0.00010913272509222292, + "loss": 1.5871081352233887, + "step": 42810 + }, + { + "epoch": 0.1296142145808424, + "grad_norm": 0.1297849714756012, + "learning_rate": 0.00010912892991058552, + "loss": 1.6459476470947265, + "step": 42820 + }, + { + "epoch": 0.12964448413118823, + "grad_norm": 0.1503724902868271, + "learning_rate": 0.00010912513472894813, + "loss": 1.6536441802978517, + "step": 42830 + }, + { + "epoch": 0.12967475368153406, + "grad_norm": 0.13790424168109894, + "learning_rate": 0.00010912133954731073, + "loss": 1.6291818618774414, + "step": 42840 + }, + { + "epoch": 0.12970502323187988, + "grad_norm": 0.14560936391353607, + "learning_rate": 0.00010911754436567334, + "loss": 1.6398218154907227, + "step": 42850 + }, + { + "epoch": 0.12973529278222573, + "grad_norm": 0.13274793326854706, + "learning_rate": 0.00010911374918403595, + "loss": 1.6286283493041993, + "step": 42860 + }, + { + "epoch": 0.12976556233257155, + "grad_norm": 0.12704351544380188, + "learning_rate": 0.00010910995400239855, + "loss": 1.5978344917297362, + "step": 42870 + }, + { + "epoch": 0.12979583188291738, + "grad_norm": 0.12989500164985657, + "learning_rate": 0.00010910615882076117, + "loss": 1.6466377258300782, + "step": 42880 + }, + { + "epoch": 0.1298261014332632, + "grad_norm": 0.1313013732433319, + "learning_rate": 0.00010910236363912376, + "loss": 1.6394660949707032, + "step": 42890 + }, + { + "epoch": 0.12985637098360903, + "grad_norm": 0.14722466468811035, + "learning_rate": 0.00010909856845748638, + "loss": 1.6040157318115233, + "step": 42900 + }, + { + "epoch": 0.12988664053395488, + "grad_norm": 0.14212535321712494, + "learning_rate": 0.00010909477327584899, + "loss": 1.6494056701660156, + "step": 42910 + }, + { + "epoch": 0.1299169100843007, + "grad_norm": 0.14978301525115967, + "learning_rate": 0.0001090909780942116, + "loss": 1.6336399078369142, + "step": 42920 + }, + { + "epoch": 0.12994717963464653, + "grad_norm": 0.15107597410678864, + "learning_rate": 0.0001090871829125742, + "loss": 1.5991600036621094, + "step": 42930 + }, + { + "epoch": 0.12997744918499235, + "grad_norm": 0.13456475734710693, + "learning_rate": 0.0001090833877309368, + "loss": 1.612665557861328, + "step": 42940 + }, + { + "epoch": 0.13000771873533817, + "grad_norm": 0.14339452981948853, + "learning_rate": 0.00010907959254929941, + "loss": 1.602762222290039, + "step": 42950 + }, + { + "epoch": 0.13003798828568403, + "grad_norm": 0.13040359318256378, + "learning_rate": 0.00010907579736766202, + "loss": 1.6089231491088867, + "step": 42960 + }, + { + "epoch": 0.13006825783602985, + "grad_norm": 0.12068332731723785, + "learning_rate": 0.00010907200218602462, + "loss": 1.6358810424804688, + "step": 42970 + }, + { + "epoch": 0.13009852738637567, + "grad_norm": 0.1287400722503662, + "learning_rate": 0.00010906820700438723, + "loss": 1.6259590148925782, + "step": 42980 + }, + { + "epoch": 0.1301287969367215, + "grad_norm": 0.1352817714214325, + "learning_rate": 0.00010906441182274984, + "loss": 1.6168792724609375, + "step": 42990 + }, + { + "epoch": 0.13015906648706732, + "grad_norm": 0.1340949386358261, + "learning_rate": 0.00010906061664111244, + "loss": 1.6392723083496095, + "step": 43000 + }, + { + "epoch": 0.13015906648706732, + "eval_loss": 1.6322946548461914, + "eval_runtime": 27.9589, + "eval_samples_per_second": 17.883, + "eval_steps_per_second": 1.145, + "step": 43000 + }, + { + "epoch": 0.13018933603741317, + "grad_norm": 0.1388545036315918, + "learning_rate": 0.00010905682145947506, + "loss": 1.6208030700683593, + "step": 43010 + }, + { + "epoch": 0.130219605587759, + "grad_norm": 0.14492405951023102, + "learning_rate": 0.00010905302627783765, + "loss": 1.5842708587646483, + "step": 43020 + }, + { + "epoch": 0.13024987513810482, + "grad_norm": 0.1480240374803543, + "learning_rate": 0.00010904923109620027, + "loss": 1.6439956665039062, + "step": 43030 + }, + { + "epoch": 0.13028014468845064, + "grad_norm": 0.15636426210403442, + "learning_rate": 0.00010904543591456287, + "loss": 1.6199119567871094, + "step": 43040 + }, + { + "epoch": 0.1303104142387965, + "grad_norm": 0.11900615692138672, + "learning_rate": 0.00010904164073292549, + "loss": 1.613088035583496, + "step": 43050 + }, + { + "epoch": 0.13034068378914232, + "grad_norm": 0.14425423741340637, + "learning_rate": 0.00010903784555128808, + "loss": 1.6261428833007812, + "step": 43060 + }, + { + "epoch": 0.13037095333948814, + "grad_norm": 0.14074255526065826, + "learning_rate": 0.0001090340503696507, + "loss": 1.6077919006347656, + "step": 43070 + }, + { + "epoch": 0.13040122288983397, + "grad_norm": 0.13875646889209747, + "learning_rate": 0.00010903025518801329, + "loss": 1.5706151962280273, + "step": 43080 + }, + { + "epoch": 0.1304314924401798, + "grad_norm": 0.12828218936920166, + "learning_rate": 0.00010902646000637591, + "loss": 1.6375650405883788, + "step": 43090 + }, + { + "epoch": 0.13046176199052564, + "grad_norm": 0.14489497244358063, + "learning_rate": 0.00010902266482473852, + "loss": 1.5931161880493163, + "step": 43100 + }, + { + "epoch": 0.13049203154087147, + "grad_norm": 0.1568707674741745, + "learning_rate": 0.00010901886964310112, + "loss": 1.6117446899414063, + "step": 43110 + }, + { + "epoch": 0.1305223010912173, + "grad_norm": 0.14170609414577484, + "learning_rate": 0.00010901507446146374, + "loss": 1.6671369552612305, + "step": 43120 + }, + { + "epoch": 0.13055257064156311, + "grad_norm": 0.1243351474404335, + "learning_rate": 0.00010901127927982633, + "loss": 1.6377372741699219, + "step": 43130 + }, + { + "epoch": 0.13058284019190894, + "grad_norm": 0.14787954092025757, + "learning_rate": 0.00010900748409818895, + "loss": 1.660463523864746, + "step": 43140 + }, + { + "epoch": 0.1306131097422548, + "grad_norm": 0.12551353871822357, + "learning_rate": 0.00010900368891655155, + "loss": 1.6247350692749023, + "step": 43150 + }, + { + "epoch": 0.13064337929260061, + "grad_norm": 0.1358576864004135, + "learning_rate": 0.00010899989373491416, + "loss": 1.62421875, + "step": 43160 + }, + { + "epoch": 0.13067364884294644, + "grad_norm": 0.13685095310211182, + "learning_rate": 0.00010899609855327676, + "loss": 1.6098297119140625, + "step": 43170 + }, + { + "epoch": 0.13070391839329226, + "grad_norm": 0.13561008870601654, + "learning_rate": 0.00010899230337163938, + "loss": 1.5961907386779786, + "step": 43180 + }, + { + "epoch": 0.13073418794363809, + "grad_norm": 0.12995173037052155, + "learning_rate": 0.00010898850819000197, + "loss": 1.6418142318725586, + "step": 43190 + }, + { + "epoch": 0.13076445749398394, + "grad_norm": 0.13516363501548767, + "learning_rate": 0.00010898471300836459, + "loss": 1.6041536331176758, + "step": 43200 + }, + { + "epoch": 0.13079472704432976, + "grad_norm": 0.14422985911369324, + "learning_rate": 0.00010898091782672718, + "loss": 1.6082595825195312, + "step": 43210 + }, + { + "epoch": 0.13082499659467559, + "grad_norm": 0.1309143751859665, + "learning_rate": 0.0001089771226450898, + "loss": 1.6421459197998047, + "step": 43220 + }, + { + "epoch": 0.1308552661450214, + "grad_norm": 0.14682337641716003, + "learning_rate": 0.00010897332746345239, + "loss": 1.6152305603027344, + "step": 43230 + }, + { + "epoch": 0.13088553569536723, + "grad_norm": 0.14182670414447784, + "learning_rate": 0.00010896953228181501, + "loss": 1.5886159896850587, + "step": 43240 + }, + { + "epoch": 0.13091580524571308, + "grad_norm": 0.13481555879116058, + "learning_rate": 0.00010896573710017762, + "loss": 1.6277841567993163, + "step": 43250 + }, + { + "epoch": 0.1309460747960589, + "grad_norm": 0.12804298102855682, + "learning_rate": 0.00010896194191854022, + "loss": 1.6271797180175782, + "step": 43260 + }, + { + "epoch": 0.13097634434640473, + "grad_norm": 0.11508705466985703, + "learning_rate": 0.00010895814673690283, + "loss": 1.6180551528930665, + "step": 43270 + }, + { + "epoch": 0.13100661389675056, + "grad_norm": 0.1285332590341568, + "learning_rate": 0.00010895435155526544, + "loss": 1.6225679397583008, + "step": 43280 + }, + { + "epoch": 0.13103688344709638, + "grad_norm": 0.13540202379226685, + "learning_rate": 0.00010895055637362804, + "loss": 1.5765769958496094, + "step": 43290 + }, + { + "epoch": 0.13106715299744223, + "grad_norm": 0.14214465022087097, + "learning_rate": 0.00010894676119199065, + "loss": 1.6494060516357423, + "step": 43300 + }, + { + "epoch": 0.13109742254778806, + "grad_norm": 0.13050831854343414, + "learning_rate": 0.00010894296601035327, + "loss": 1.6391399383544922, + "step": 43310 + }, + { + "epoch": 0.13112769209813388, + "grad_norm": 0.12678790092468262, + "learning_rate": 0.00010893917082871586, + "loss": 1.6152528762817382, + "step": 43320 + }, + { + "epoch": 0.1311579616484797, + "grad_norm": 0.15274351835250854, + "learning_rate": 0.00010893537564707848, + "loss": 1.5957252502441406, + "step": 43330 + }, + { + "epoch": 0.13118823119882553, + "grad_norm": 0.1522519439458847, + "learning_rate": 0.00010893158046544107, + "loss": 1.5949182510375977, + "step": 43340 + }, + { + "epoch": 0.13121850074917138, + "grad_norm": 0.14253760874271393, + "learning_rate": 0.00010892778528380369, + "loss": 1.6425098419189452, + "step": 43350 + }, + { + "epoch": 0.1312487702995172, + "grad_norm": 0.1355362981557846, + "learning_rate": 0.00010892399010216628, + "loss": 1.6605072021484375, + "step": 43360 + }, + { + "epoch": 0.13127903984986303, + "grad_norm": 0.14115439355373383, + "learning_rate": 0.0001089201949205289, + "loss": 1.607905960083008, + "step": 43370 + }, + { + "epoch": 0.13130930940020885, + "grad_norm": 0.15655897557735443, + "learning_rate": 0.00010891639973889151, + "loss": 1.5840118408203125, + "step": 43380 + }, + { + "epoch": 0.1313395789505547, + "grad_norm": 0.15469884872436523, + "learning_rate": 0.00010891260455725412, + "loss": 1.6304134368896483, + "step": 43390 + }, + { + "epoch": 0.13136984850090053, + "grad_norm": 0.13063913583755493, + "learning_rate": 0.00010890880937561672, + "loss": 1.6008174896240235, + "step": 43400 + }, + { + "epoch": 0.13140011805124635, + "grad_norm": 0.15494020283222198, + "learning_rate": 0.00010890501419397933, + "loss": 1.592050552368164, + "step": 43410 + }, + { + "epoch": 0.13143038760159217, + "grad_norm": 0.15022653341293335, + "learning_rate": 0.00010890121901234193, + "loss": 1.6407821655273438, + "step": 43420 + }, + { + "epoch": 0.131460657151938, + "grad_norm": 0.13591352105140686, + "learning_rate": 0.00010889742383070454, + "loss": 1.6478057861328126, + "step": 43430 + }, + { + "epoch": 0.13149092670228385, + "grad_norm": 0.13904450833797455, + "learning_rate": 0.00010889362864906714, + "loss": 1.6691574096679687, + "step": 43440 + }, + { + "epoch": 0.13152119625262967, + "grad_norm": 0.15329597890377045, + "learning_rate": 0.00010888983346742975, + "loss": 1.6262351989746093, + "step": 43450 + }, + { + "epoch": 0.1315514658029755, + "grad_norm": 0.1520300954580307, + "learning_rate": 0.00010888603828579236, + "loss": 1.6045312881469727, + "step": 43460 + }, + { + "epoch": 0.13158173535332132, + "grad_norm": 0.1407891809940338, + "learning_rate": 0.00010888224310415496, + "loss": 1.622452163696289, + "step": 43470 + }, + { + "epoch": 0.13161200490366715, + "grad_norm": 0.14810296893119812, + "learning_rate": 0.00010887844792251757, + "loss": 1.662886619567871, + "step": 43480 + }, + { + "epoch": 0.131642274454013, + "grad_norm": 0.1243213340640068, + "learning_rate": 0.00010887465274088017, + "loss": 1.6363422393798828, + "step": 43490 + }, + { + "epoch": 0.13167254400435882, + "grad_norm": 0.13582397997379303, + "learning_rate": 0.00010887085755924278, + "loss": 1.5873522758483887, + "step": 43500 + }, + { + "epoch": 0.13167254400435882, + "eval_loss": 1.6212031841278076, + "eval_runtime": 28.0317, + "eval_samples_per_second": 17.837, + "eval_steps_per_second": 1.142, + "step": 43500 + }, + { + "epoch": 0.13170281355470465, + "grad_norm": 0.14470458030700684, + "learning_rate": 0.0001088670623776054, + "loss": 1.605870246887207, + "step": 43510 + }, + { + "epoch": 0.13173308310505047, + "grad_norm": 0.13554416596889496, + "learning_rate": 0.000108863267195968, + "loss": 1.6045400619506835, + "step": 43520 + }, + { + "epoch": 0.1317633526553963, + "grad_norm": 0.1333262324333191, + "learning_rate": 0.00010885947201433061, + "loss": 1.6098236083984374, + "step": 43530 + }, + { + "epoch": 0.13179362220574214, + "grad_norm": 0.14145301282405853, + "learning_rate": 0.00010885567683269322, + "loss": 1.594810962677002, + "step": 43540 + }, + { + "epoch": 0.13182389175608797, + "grad_norm": 0.15412001311779022, + "learning_rate": 0.00010885188165105582, + "loss": 1.6446348190307618, + "step": 43550 + }, + { + "epoch": 0.1318541613064338, + "grad_norm": 0.15572263300418854, + "learning_rate": 0.00010884808646941843, + "loss": 1.5735315322875976, + "step": 43560 + }, + { + "epoch": 0.13188443085677962, + "grad_norm": 0.14529119431972504, + "learning_rate": 0.00010884429128778104, + "loss": 1.6123012542724608, + "step": 43570 + }, + { + "epoch": 0.13191470040712544, + "grad_norm": 0.13289090991020203, + "learning_rate": 0.00010884049610614364, + "loss": 1.6133047103881837, + "step": 43580 + }, + { + "epoch": 0.1319449699574713, + "grad_norm": 0.14043831825256348, + "learning_rate": 0.00010883670092450625, + "loss": 1.6281562805175782, + "step": 43590 + }, + { + "epoch": 0.13197523950781712, + "grad_norm": 0.1362410932779312, + "learning_rate": 0.00010883290574286885, + "loss": 1.6315898895263672, + "step": 43600 + }, + { + "epoch": 0.13200550905816294, + "grad_norm": 0.12471230328083038, + "learning_rate": 0.00010882911056123146, + "loss": 1.639044952392578, + "step": 43610 + }, + { + "epoch": 0.13203577860850876, + "grad_norm": 0.1362195909023285, + "learning_rate": 0.00010882531537959408, + "loss": 1.6165857315063477, + "step": 43620 + }, + { + "epoch": 0.1320660481588546, + "grad_norm": 0.1355208158493042, + "learning_rate": 0.00010882152019795667, + "loss": 1.5746116638183594, + "step": 43630 + }, + { + "epoch": 0.13209631770920044, + "grad_norm": 0.1385618895292282, + "learning_rate": 0.00010881772501631929, + "loss": 1.6465702056884766, + "step": 43640 + }, + { + "epoch": 0.13212658725954626, + "grad_norm": 0.1338602751493454, + "learning_rate": 0.00010881392983468188, + "loss": 1.5972207069396973, + "step": 43650 + }, + { + "epoch": 0.1321568568098921, + "grad_norm": 0.14858794212341309, + "learning_rate": 0.0001088101346530445, + "loss": 1.6437889099121095, + "step": 43660 + }, + { + "epoch": 0.1321871263602379, + "grad_norm": 0.12718798220157623, + "learning_rate": 0.0001088063394714071, + "loss": 1.657559585571289, + "step": 43670 + }, + { + "epoch": 0.13221739591058373, + "grad_norm": 0.13897965848445892, + "learning_rate": 0.00010880254428976971, + "loss": 1.6107517242431642, + "step": 43680 + }, + { + "epoch": 0.1322476654609296, + "grad_norm": 0.12158074975013733, + "learning_rate": 0.00010879874910813231, + "loss": 1.6427446365356446, + "step": 43690 + }, + { + "epoch": 0.1322779350112754, + "grad_norm": 0.14136533439159393, + "learning_rate": 0.00010879495392649493, + "loss": 1.6449203491210938, + "step": 43700 + }, + { + "epoch": 0.13230820456162123, + "grad_norm": 0.14702241122722626, + "learning_rate": 0.00010879115874485753, + "loss": 1.5879384994506835, + "step": 43710 + }, + { + "epoch": 0.13233847411196706, + "grad_norm": 0.12307281047105789, + "learning_rate": 0.00010878736356322014, + "loss": 1.646508026123047, + "step": 43720 + }, + { + "epoch": 0.1323687436623129, + "grad_norm": 0.14416591823101044, + "learning_rate": 0.00010878356838158274, + "loss": 1.6172124862670898, + "step": 43730 + }, + { + "epoch": 0.13239901321265873, + "grad_norm": 0.12514829635620117, + "learning_rate": 0.00010877977319994535, + "loss": 1.6498340606689452, + "step": 43740 + }, + { + "epoch": 0.13242928276300456, + "grad_norm": 0.13032518327236176, + "learning_rate": 0.00010877597801830797, + "loss": 1.5907517433166505, + "step": 43750 + }, + { + "epoch": 0.13245955231335038, + "grad_norm": 0.15083593130111694, + "learning_rate": 0.00010877218283667056, + "loss": 1.624049186706543, + "step": 43760 + }, + { + "epoch": 0.1324898218636962, + "grad_norm": 0.1359640210866928, + "learning_rate": 0.00010876838765503318, + "loss": 1.610417366027832, + "step": 43770 + }, + { + "epoch": 0.13252009141404206, + "grad_norm": 0.12895840406417847, + "learning_rate": 0.00010876459247339577, + "loss": 1.6397834777832032, + "step": 43780 + }, + { + "epoch": 0.13255036096438788, + "grad_norm": 0.13749125599861145, + "learning_rate": 0.0001087607972917584, + "loss": 1.645502471923828, + "step": 43790 + }, + { + "epoch": 0.1325806305147337, + "grad_norm": 0.1249711737036705, + "learning_rate": 0.00010875700211012099, + "loss": 1.6334104537963867, + "step": 43800 + }, + { + "epoch": 0.13261090006507953, + "grad_norm": 0.16080963611602783, + "learning_rate": 0.0001087532069284836, + "loss": 1.6102260589599608, + "step": 43810 + }, + { + "epoch": 0.13264116961542535, + "grad_norm": 0.1608210653066635, + "learning_rate": 0.0001087494117468462, + "loss": 1.595141315460205, + "step": 43820 + }, + { + "epoch": 0.1326714391657712, + "grad_norm": 0.13854750990867615, + "learning_rate": 0.00010874561656520882, + "loss": 1.635495376586914, + "step": 43830 + }, + { + "epoch": 0.13270170871611703, + "grad_norm": 0.12818105518817902, + "learning_rate": 0.00010874182138357141, + "loss": 1.6178600311279296, + "step": 43840 + }, + { + "epoch": 0.13273197826646285, + "grad_norm": 0.13069233298301697, + "learning_rate": 0.00010873802620193403, + "loss": 1.6013679504394531, + "step": 43850 + }, + { + "epoch": 0.13276224781680868, + "grad_norm": 0.13142666220664978, + "learning_rate": 0.00010873423102029662, + "loss": 1.6300376892089843, + "step": 43860 + }, + { + "epoch": 0.1327925173671545, + "grad_norm": 0.13208864629268646, + "learning_rate": 0.00010873043583865924, + "loss": 1.6316530227661132, + "step": 43870 + }, + { + "epoch": 0.13282278691750035, + "grad_norm": 0.1320326328277588, + "learning_rate": 0.00010872664065702185, + "loss": 1.6021427154541015, + "step": 43880 + }, + { + "epoch": 0.13285305646784618, + "grad_norm": 0.142772376537323, + "learning_rate": 0.00010872284547538445, + "loss": 1.625267791748047, + "step": 43890 + }, + { + "epoch": 0.132883326018192, + "grad_norm": 0.13205450773239136, + "learning_rate": 0.00010871905029374706, + "loss": 1.62003173828125, + "step": 43900 + }, + { + "epoch": 0.13291359556853782, + "grad_norm": 0.12656912207603455, + "learning_rate": 0.00010871525511210967, + "loss": 1.5981330871582031, + "step": 43910 + }, + { + "epoch": 0.13294386511888365, + "grad_norm": 0.13605637848377228, + "learning_rate": 0.00010871145993047228, + "loss": 1.6457759857177734, + "step": 43920 + }, + { + "epoch": 0.1329741346692295, + "grad_norm": 0.13512611389160156, + "learning_rate": 0.00010870766474883488, + "loss": 1.6333747863769532, + "step": 43930 + }, + { + "epoch": 0.13300440421957532, + "grad_norm": 0.13845954835414886, + "learning_rate": 0.0001087038695671975, + "loss": 1.6529827117919922, + "step": 43940 + }, + { + "epoch": 0.13303467376992115, + "grad_norm": 0.14445577561855316, + "learning_rate": 0.00010870007438556009, + "loss": 1.6114620208740233, + "step": 43950 + }, + { + "epoch": 0.13306494332026697, + "grad_norm": 0.12894022464752197, + "learning_rate": 0.00010869627920392271, + "loss": 1.6266883850097655, + "step": 43960 + }, + { + "epoch": 0.1330952128706128, + "grad_norm": 0.14514386653900146, + "learning_rate": 0.0001086924840222853, + "loss": 1.6191585540771485, + "step": 43970 + }, + { + "epoch": 0.13312548242095865, + "grad_norm": 0.14019329845905304, + "learning_rate": 0.00010868868884064792, + "loss": 1.6364320755004882, + "step": 43980 + }, + { + "epoch": 0.13315575197130447, + "grad_norm": 0.13593506813049316, + "learning_rate": 0.00010868489365901053, + "loss": 1.6498809814453126, + "step": 43990 + }, + { + "epoch": 0.1331860215216503, + "grad_norm": 0.14024600386619568, + "learning_rate": 0.00010868109847737313, + "loss": 1.600600814819336, + "step": 44000 + }, + { + "epoch": 0.1331860215216503, + "eval_loss": 1.6366928815841675, + "eval_runtime": 27.9171, + "eval_samples_per_second": 17.91, + "eval_steps_per_second": 1.146, + "step": 44000 + }, + { + "epoch": 0.13321629107199612, + "grad_norm": 0.12252578139305115, + "learning_rate": 0.00010867730329573574, + "loss": 1.645198440551758, + "step": 44010 + }, + { + "epoch": 0.13324656062234194, + "grad_norm": 0.155439093708992, + "learning_rate": 0.00010867350811409834, + "loss": 1.6081245422363282, + "step": 44020 + }, + { + "epoch": 0.1332768301726878, + "grad_norm": 0.1303028166294098, + "learning_rate": 0.00010866971293246095, + "loss": 1.6724365234375, + "step": 44030 + }, + { + "epoch": 0.13330709972303362, + "grad_norm": 0.1393376886844635, + "learning_rate": 0.00010866591775082356, + "loss": 1.606770133972168, + "step": 44040 + }, + { + "epoch": 0.13333736927337944, + "grad_norm": 0.13423104584217072, + "learning_rate": 0.00010866212256918616, + "loss": 1.6288429260253907, + "step": 44050 + }, + { + "epoch": 0.13336763882372527, + "grad_norm": 0.13502486050128937, + "learning_rate": 0.00010865832738754877, + "loss": 1.6194339752197267, + "step": 44060 + }, + { + "epoch": 0.13339790837407112, + "grad_norm": 0.12750361859798431, + "learning_rate": 0.00010865453220591137, + "loss": 1.624302291870117, + "step": 44070 + }, + { + "epoch": 0.13342817792441694, + "grad_norm": 0.15850192308425903, + "learning_rate": 0.00010865073702427398, + "loss": 1.6434934616088868, + "step": 44080 + }, + { + "epoch": 0.13345844747476276, + "grad_norm": 0.11933235824108124, + "learning_rate": 0.00010864694184263659, + "loss": 1.6056344985961915, + "step": 44090 + }, + { + "epoch": 0.1334887170251086, + "grad_norm": 0.1294836401939392, + "learning_rate": 0.00010864314666099919, + "loss": 1.625412368774414, + "step": 44100 + }, + { + "epoch": 0.1335189865754544, + "grad_norm": 0.14506736397743225, + "learning_rate": 0.0001086393514793618, + "loss": 1.586203670501709, + "step": 44110 + }, + { + "epoch": 0.13354925612580026, + "grad_norm": 0.13702014088630676, + "learning_rate": 0.00010863555629772442, + "loss": 1.6656116485595702, + "step": 44120 + }, + { + "epoch": 0.1335795256761461, + "grad_norm": 0.13370856642723083, + "learning_rate": 0.00010863176111608702, + "loss": 1.6089134216308594, + "step": 44130 + }, + { + "epoch": 0.1336097952264919, + "grad_norm": 0.13882261514663696, + "learning_rate": 0.00010862796593444963, + "loss": 1.5939084053039552, + "step": 44140 + }, + { + "epoch": 0.13364006477683774, + "grad_norm": 0.13801808655261993, + "learning_rate": 0.00010862417075281224, + "loss": 1.61804256439209, + "step": 44150 + }, + { + "epoch": 0.13367033432718356, + "grad_norm": 0.15165837109088898, + "learning_rate": 0.00010862037557117484, + "loss": 1.6505781173706056, + "step": 44160 + }, + { + "epoch": 0.1337006038775294, + "grad_norm": 0.12902595102787018, + "learning_rate": 0.00010861658038953745, + "loss": 1.6336894989013673, + "step": 44170 + }, + { + "epoch": 0.13373087342787524, + "grad_norm": 0.11925309896469116, + "learning_rate": 0.00010861278520790005, + "loss": 1.575846290588379, + "step": 44180 + }, + { + "epoch": 0.13376114297822106, + "grad_norm": 0.1304980218410492, + "learning_rate": 0.00010860899002626266, + "loss": 1.6405218124389649, + "step": 44190 + }, + { + "epoch": 0.13379141252856688, + "grad_norm": 0.14764617383480072, + "learning_rate": 0.00010860519484462526, + "loss": 1.6212635040283203, + "step": 44200 + }, + { + "epoch": 0.1338216820789127, + "grad_norm": 0.16824518144130707, + "learning_rate": 0.00010860139966298787, + "loss": 1.5663771629333496, + "step": 44210 + }, + { + "epoch": 0.13385195162925856, + "grad_norm": 0.12630625069141388, + "learning_rate": 0.00010859760448135048, + "loss": 1.6355985641479491, + "step": 44220 + }, + { + "epoch": 0.13388222117960438, + "grad_norm": 0.14271096885204315, + "learning_rate": 0.0001085938092997131, + "loss": 1.6618106842041016, + "step": 44230 + }, + { + "epoch": 0.1339124907299502, + "grad_norm": 0.1314135491847992, + "learning_rate": 0.00010859001411807569, + "loss": 1.618692398071289, + "step": 44240 + }, + { + "epoch": 0.13394276028029603, + "grad_norm": 0.13233351707458496, + "learning_rate": 0.00010858621893643831, + "loss": 1.6119558334350585, + "step": 44250 + }, + { + "epoch": 0.13397302983064185, + "grad_norm": 0.12719811499118805, + "learning_rate": 0.0001085824237548009, + "loss": 1.6856693267822265, + "step": 44260 + }, + { + "epoch": 0.1340032993809877, + "grad_norm": 0.13604192435741425, + "learning_rate": 0.00010857862857316352, + "loss": 1.6153350830078126, + "step": 44270 + }, + { + "epoch": 0.13403356893133353, + "grad_norm": 0.14237797260284424, + "learning_rate": 0.00010857483339152611, + "loss": 1.614263153076172, + "step": 44280 + }, + { + "epoch": 0.13406383848167935, + "grad_norm": 0.13290052115917206, + "learning_rate": 0.00010857103820988873, + "loss": 1.6170108795166016, + "step": 44290 + }, + { + "epoch": 0.13409410803202518, + "grad_norm": 0.13644079864025116, + "learning_rate": 0.00010856724302825132, + "loss": 1.5715883255004883, + "step": 44300 + }, + { + "epoch": 0.134124377582371, + "grad_norm": 0.12899920344352722, + "learning_rate": 0.00010856344784661394, + "loss": 1.6311660766601563, + "step": 44310 + }, + { + "epoch": 0.13415464713271685, + "grad_norm": 0.12757447361946106, + "learning_rate": 0.00010855965266497655, + "loss": 1.6145074844360352, + "step": 44320 + }, + { + "epoch": 0.13418491668306268, + "grad_norm": 0.13144218921661377, + "learning_rate": 0.00010855585748333916, + "loss": 1.6246309280395508, + "step": 44330 + }, + { + "epoch": 0.1342151862334085, + "grad_norm": 0.13776077330112457, + "learning_rate": 0.00010855206230170176, + "loss": 1.6190706253051759, + "step": 44340 + }, + { + "epoch": 0.13424545578375432, + "grad_norm": 0.15161828696727753, + "learning_rate": 0.00010854826712006437, + "loss": 1.6186189651489258, + "step": 44350 + }, + { + "epoch": 0.13427572533410015, + "grad_norm": 0.15621647238731384, + "learning_rate": 0.00010854447193842699, + "loss": 1.6446250915527343, + "step": 44360 + }, + { + "epoch": 0.134305994884446, + "grad_norm": 0.15699957311153412, + "learning_rate": 0.00010854067675678958, + "loss": 1.611697006225586, + "step": 44370 + }, + { + "epoch": 0.13433626443479182, + "grad_norm": 0.12972134351730347, + "learning_rate": 0.0001085368815751522, + "loss": 1.6228286743164062, + "step": 44380 + }, + { + "epoch": 0.13436653398513765, + "grad_norm": 0.1387633979320526, + "learning_rate": 0.00010853308639351479, + "loss": 1.6508256912231445, + "step": 44390 + }, + { + "epoch": 0.13439680353548347, + "grad_norm": 0.12842972576618195, + "learning_rate": 0.00010852929121187741, + "loss": 1.6048721313476562, + "step": 44400 + }, + { + "epoch": 0.1344270730858293, + "grad_norm": 0.14244090020656586, + "learning_rate": 0.00010852549603024, + "loss": 1.6263267517089843, + "step": 44410 + }, + { + "epoch": 0.13445734263617515, + "grad_norm": 0.1262374073266983, + "learning_rate": 0.00010852170084860262, + "loss": 1.6294342041015626, + "step": 44420 + }, + { + "epoch": 0.13448761218652097, + "grad_norm": 0.1490219682455063, + "learning_rate": 0.00010851790566696522, + "loss": 1.6181110382080077, + "step": 44430 + }, + { + "epoch": 0.1345178817368668, + "grad_norm": 0.13318422436714172, + "learning_rate": 0.00010851411048532783, + "loss": 1.6639123916625977, + "step": 44440 + }, + { + "epoch": 0.13454815128721262, + "grad_norm": 0.1320783793926239, + "learning_rate": 0.00010851031530369043, + "loss": 1.6031768798828125, + "step": 44450 + }, + { + "epoch": 0.13457842083755847, + "grad_norm": 0.13184064626693726, + "learning_rate": 0.00010850652012205305, + "loss": 1.581019401550293, + "step": 44460 + }, + { + "epoch": 0.1346086903879043, + "grad_norm": 0.13579411804676056, + "learning_rate": 0.00010850272494041564, + "loss": 1.6443321228027343, + "step": 44470 + }, + { + "epoch": 0.13463895993825012, + "grad_norm": 0.1376044750213623, + "learning_rate": 0.00010849892975877826, + "loss": 1.6183063507080078, + "step": 44480 + }, + { + "epoch": 0.13466922948859594, + "grad_norm": 0.15115201473236084, + "learning_rate": 0.00010849513457714086, + "loss": 1.6176748275756836, + "step": 44490 + }, + { + "epoch": 0.13469949903894177, + "grad_norm": 0.13665597140789032, + "learning_rate": 0.00010849133939550347, + "loss": 1.6458396911621094, + "step": 44500 + }, + { + "epoch": 0.13469949903894177, + "eval_loss": 1.6299494504928589, + "eval_runtime": 28.067, + "eval_samples_per_second": 17.815, + "eval_steps_per_second": 1.14, + "step": 44500 + }, + { + "epoch": 0.13472976858928762, + "grad_norm": 0.15544185042381287, + "learning_rate": 0.00010848754421386608, + "loss": 1.6167724609375, + "step": 44510 + }, + { + "epoch": 0.13476003813963344, + "grad_norm": 0.12868770956993103, + "learning_rate": 0.00010848374903222868, + "loss": 1.6253387451171875, + "step": 44520 + }, + { + "epoch": 0.13479030768997927, + "grad_norm": 0.14900974929332733, + "learning_rate": 0.0001084799538505913, + "loss": 1.635706901550293, + "step": 44530 + }, + { + "epoch": 0.1348205772403251, + "grad_norm": 0.16230198740959167, + "learning_rate": 0.0001084761586689539, + "loss": 1.6004501342773438, + "step": 44540 + }, + { + "epoch": 0.13485084679067091, + "grad_norm": 0.138033926486969, + "learning_rate": 0.00010847236348731651, + "loss": 1.6488992691040039, + "step": 44550 + }, + { + "epoch": 0.13488111634101677, + "grad_norm": 0.129909485578537, + "learning_rate": 0.0001084685683056791, + "loss": 1.632196807861328, + "step": 44560 + }, + { + "epoch": 0.1349113858913626, + "grad_norm": 0.12430249154567719, + "learning_rate": 0.00010846477312404173, + "loss": 1.6349143981933594, + "step": 44570 + }, + { + "epoch": 0.1349416554417084, + "grad_norm": 0.15111854672431946, + "learning_rate": 0.00010846097794240432, + "loss": 1.6220512390136719, + "step": 44580 + }, + { + "epoch": 0.13497192499205424, + "grad_norm": 0.1255291849374771, + "learning_rate": 0.00010845718276076694, + "loss": 1.6352703094482421, + "step": 44590 + }, + { + "epoch": 0.13500219454240006, + "grad_norm": 0.15820203721523285, + "learning_rate": 0.00010845338757912953, + "loss": 1.621255874633789, + "step": 44600 + }, + { + "epoch": 0.1350324640927459, + "grad_norm": 0.15457725524902344, + "learning_rate": 0.00010844959239749215, + "loss": 1.5984418869018555, + "step": 44610 + }, + { + "epoch": 0.13506273364309174, + "grad_norm": 0.14543017745018005, + "learning_rate": 0.00010844579721585476, + "loss": 1.6246475219726562, + "step": 44620 + }, + { + "epoch": 0.13509300319343756, + "grad_norm": 0.13499519228935242, + "learning_rate": 0.00010844200203421736, + "loss": 1.630345344543457, + "step": 44630 + }, + { + "epoch": 0.13512327274378338, + "grad_norm": 0.14122670888900757, + "learning_rate": 0.00010843820685257997, + "loss": 1.6158700942993165, + "step": 44640 + }, + { + "epoch": 0.1351535422941292, + "grad_norm": 0.15036599338054657, + "learning_rate": 0.00010843441167094257, + "loss": 1.6005220413208008, + "step": 44650 + }, + { + "epoch": 0.13518381184447506, + "grad_norm": 0.13464903831481934, + "learning_rate": 0.00010843061648930518, + "loss": 1.6255859375, + "step": 44660 + }, + { + "epoch": 0.13521408139482088, + "grad_norm": 0.1339176893234253, + "learning_rate": 0.00010842682130766779, + "loss": 1.6313074111938477, + "step": 44670 + }, + { + "epoch": 0.1352443509451667, + "grad_norm": 0.14223818480968475, + "learning_rate": 0.00010842302612603039, + "loss": 1.5744637489318847, + "step": 44680 + }, + { + "epoch": 0.13527462049551253, + "grad_norm": 0.1335325539112091, + "learning_rate": 0.000108419230944393, + "loss": 1.6096412658691406, + "step": 44690 + }, + { + "epoch": 0.13530489004585836, + "grad_norm": 0.12860272824764252, + "learning_rate": 0.0001084154357627556, + "loss": 1.5926508903503418, + "step": 44700 + }, + { + "epoch": 0.1353351595962042, + "grad_norm": 0.12787088751792908, + "learning_rate": 0.00010841164058111821, + "loss": 1.6353923797607421, + "step": 44710 + }, + { + "epoch": 0.13536542914655003, + "grad_norm": 0.1351873278617859, + "learning_rate": 0.00010840784539948081, + "loss": 1.6203060150146484, + "step": 44720 + }, + { + "epoch": 0.13539569869689586, + "grad_norm": 0.15316849946975708, + "learning_rate": 0.00010840405021784343, + "loss": 1.5830325126647948, + "step": 44730 + }, + { + "epoch": 0.13542596824724168, + "grad_norm": 0.14756609499454498, + "learning_rate": 0.00010840025503620604, + "loss": 1.6290948867797852, + "step": 44740 + }, + { + "epoch": 0.1354562377975875, + "grad_norm": 0.13174408674240112, + "learning_rate": 0.00010839645985456865, + "loss": 1.6228061676025392, + "step": 44750 + }, + { + "epoch": 0.13548650734793335, + "grad_norm": 0.1371503621339798, + "learning_rate": 0.00010839266467293125, + "loss": 1.5881678581237793, + "step": 44760 + }, + { + "epoch": 0.13551677689827918, + "grad_norm": 0.14978700876235962, + "learning_rate": 0.00010838886949129386, + "loss": 1.6052679061889648, + "step": 44770 + }, + { + "epoch": 0.135547046448625, + "grad_norm": 0.14584340155124664, + "learning_rate": 0.00010838507430965646, + "loss": 1.6424474716186523, + "step": 44780 + }, + { + "epoch": 0.13557731599897083, + "grad_norm": 0.12309075146913528, + "learning_rate": 0.00010838127912801907, + "loss": 1.5966880798339844, + "step": 44790 + }, + { + "epoch": 0.13560758554931668, + "grad_norm": 0.14466841518878937, + "learning_rate": 0.00010837748394638168, + "loss": 1.6047203063964843, + "step": 44800 + }, + { + "epoch": 0.1356378550996625, + "grad_norm": 0.12462716549634933, + "learning_rate": 0.00010837368876474428, + "loss": 1.634955596923828, + "step": 44810 + }, + { + "epoch": 0.13566812465000833, + "grad_norm": 0.12428233027458191, + "learning_rate": 0.00010836989358310689, + "loss": 1.6040069580078125, + "step": 44820 + }, + { + "epoch": 0.13569839420035415, + "grad_norm": 0.12759770452976227, + "learning_rate": 0.0001083660984014695, + "loss": 1.6393850326538086, + "step": 44830 + }, + { + "epoch": 0.13572866375069997, + "grad_norm": 0.1478327065706253, + "learning_rate": 0.0001083623032198321, + "loss": 1.5903857231140137, + "step": 44840 + }, + { + "epoch": 0.13575893330104583, + "grad_norm": 0.15893881022930145, + "learning_rate": 0.0001083585080381947, + "loss": 1.6273920059204101, + "step": 44850 + }, + { + "epoch": 0.13578920285139165, + "grad_norm": 0.13494406640529633, + "learning_rate": 0.00010835471285655733, + "loss": 1.5760611534118651, + "step": 44860 + }, + { + "epoch": 0.13581947240173747, + "grad_norm": 0.1295001357793808, + "learning_rate": 0.00010835091767491992, + "loss": 1.632411003112793, + "step": 44870 + }, + { + "epoch": 0.1358497419520833, + "grad_norm": 0.14637653529644012, + "learning_rate": 0.00010834712249328254, + "loss": 1.6265071868896483, + "step": 44880 + }, + { + "epoch": 0.13588001150242912, + "grad_norm": 0.13853605091571808, + "learning_rate": 0.00010834332731164513, + "loss": 1.5794163703918458, + "step": 44890 + }, + { + "epoch": 0.13591028105277497, + "grad_norm": 0.1341148018836975, + "learning_rate": 0.00010833953213000775, + "loss": 1.644917869567871, + "step": 44900 + }, + { + "epoch": 0.1359405506031208, + "grad_norm": 0.13687124848365784, + "learning_rate": 0.00010833573694837034, + "loss": 1.6080717086791991, + "step": 44910 + }, + { + "epoch": 0.13597082015346662, + "grad_norm": 0.12533552944660187, + "learning_rate": 0.00010833194176673296, + "loss": 1.6319236755371094, + "step": 44920 + }, + { + "epoch": 0.13600108970381244, + "grad_norm": 0.139448881149292, + "learning_rate": 0.00010832814658509557, + "loss": 1.6367864608764648, + "step": 44930 + }, + { + "epoch": 0.13603135925415827, + "grad_norm": 0.15207158029079437, + "learning_rate": 0.00010832435140345817, + "loss": 1.611320686340332, + "step": 44940 + }, + { + "epoch": 0.13606162880450412, + "grad_norm": 0.1546914279460907, + "learning_rate": 0.00010832055622182078, + "loss": 1.6176090240478516, + "step": 44950 + }, + { + "epoch": 0.13609189835484994, + "grad_norm": 0.16184674203395844, + "learning_rate": 0.00010831676104018338, + "loss": 1.6134401321411134, + "step": 44960 + }, + { + "epoch": 0.13612216790519577, + "grad_norm": 0.12894926965236664, + "learning_rate": 0.000108312965858546, + "loss": 1.627404022216797, + "step": 44970 + }, + { + "epoch": 0.1361524374555416, + "grad_norm": 0.14050643146038055, + "learning_rate": 0.0001083091706769086, + "loss": 1.5958009719848634, + "step": 44980 + }, + { + "epoch": 0.13618270700588742, + "grad_norm": 0.1333867460489273, + "learning_rate": 0.00010830537549527122, + "loss": 1.6422893524169921, + "step": 44990 + }, + { + "epoch": 0.13621297655623327, + "grad_norm": 0.13113926351070404, + "learning_rate": 0.00010830158031363381, + "loss": 1.6560352325439454, + "step": 45000 + }, + { + "epoch": 0.13621297655623327, + "eval_loss": 1.637918472290039, + "eval_runtime": 28.3542, + "eval_samples_per_second": 17.634, + "eval_steps_per_second": 1.129, + "step": 45000 + }, + { + "epoch": 0.1362432461065791, + "grad_norm": 0.1485910415649414, + "learning_rate": 0.00010829778513199643, + "loss": 1.602077102661133, + "step": 45010 + }, + { + "epoch": 0.13627351565692492, + "grad_norm": 0.14485326409339905, + "learning_rate": 0.00010829398995035902, + "loss": 1.634927749633789, + "step": 45020 + }, + { + "epoch": 0.13630378520727074, + "grad_norm": 0.1266547292470932, + "learning_rate": 0.00010829019476872164, + "loss": 1.6299020767211914, + "step": 45030 + }, + { + "epoch": 0.13633405475761656, + "grad_norm": 0.12984974682331085, + "learning_rate": 0.00010828639958708423, + "loss": 1.5807588577270508, + "step": 45040 + }, + { + "epoch": 0.13636432430796241, + "grad_norm": 0.1349581629037857, + "learning_rate": 0.00010828260440544685, + "loss": 1.6199073791503906, + "step": 45050 + }, + { + "epoch": 0.13639459385830824, + "grad_norm": 0.13351835310459137, + "learning_rate": 0.00010827880922380944, + "loss": 1.626858901977539, + "step": 45060 + }, + { + "epoch": 0.13642486340865406, + "grad_norm": 0.12843620777130127, + "learning_rate": 0.00010827501404217206, + "loss": 1.6322154998779297, + "step": 45070 + }, + { + "epoch": 0.1364551329589999, + "grad_norm": 0.14605334401130676, + "learning_rate": 0.00010827121886053466, + "loss": 1.5595443725585938, + "step": 45080 + }, + { + "epoch": 0.1364854025093457, + "grad_norm": 0.14531531929969788, + "learning_rate": 0.00010826742367889728, + "loss": 1.6288017272949218, + "step": 45090 + }, + { + "epoch": 0.13651567205969156, + "grad_norm": 0.1380733996629715, + "learning_rate": 0.00010826362849725988, + "loss": 1.6230756759643554, + "step": 45100 + }, + { + "epoch": 0.13654594161003739, + "grad_norm": 0.13366110622882843, + "learning_rate": 0.00010825983331562249, + "loss": 1.6049907684326172, + "step": 45110 + }, + { + "epoch": 0.1365762111603832, + "grad_norm": 0.14229637384414673, + "learning_rate": 0.0001082560381339851, + "loss": 1.655129623413086, + "step": 45120 + }, + { + "epoch": 0.13660648071072903, + "grad_norm": 0.12776169180870056, + "learning_rate": 0.0001082522429523477, + "loss": 1.6322624206542968, + "step": 45130 + }, + { + "epoch": 0.13663675026107489, + "grad_norm": 0.12445221096277237, + "learning_rate": 0.00010824844777071032, + "loss": 1.6339561462402343, + "step": 45140 + }, + { + "epoch": 0.1366670198114207, + "grad_norm": 0.13336122035980225, + "learning_rate": 0.00010824465258907291, + "loss": 1.6475387573242188, + "step": 45150 + }, + { + "epoch": 0.13669728936176653, + "grad_norm": 0.125754714012146, + "learning_rate": 0.00010824085740743553, + "loss": 1.659596061706543, + "step": 45160 + }, + { + "epoch": 0.13672755891211236, + "grad_norm": 0.13397516310214996, + "learning_rate": 0.00010823706222579812, + "loss": 1.5875299453735352, + "step": 45170 + }, + { + "epoch": 0.13675782846245818, + "grad_norm": 0.12568166851997375, + "learning_rate": 0.00010823326704416074, + "loss": 1.6596389770507813, + "step": 45180 + }, + { + "epoch": 0.13678809801280403, + "grad_norm": 0.15227943658828735, + "learning_rate": 0.00010822947186252334, + "loss": 1.6739536285400392, + "step": 45190 + }, + { + "epoch": 0.13681836756314986, + "grad_norm": 0.11816427111625671, + "learning_rate": 0.00010822567668088595, + "loss": 1.6221858978271484, + "step": 45200 + }, + { + "epoch": 0.13684863711349568, + "grad_norm": 0.12515795230865479, + "learning_rate": 0.00010822188149924855, + "loss": 1.5992844581604004, + "step": 45210 + }, + { + "epoch": 0.1368789066638415, + "grad_norm": 0.1392187923192978, + "learning_rate": 0.00010821808631761117, + "loss": 1.620839500427246, + "step": 45220 + }, + { + "epoch": 0.13690917621418733, + "grad_norm": 0.15497471392154694, + "learning_rate": 0.00010821429113597377, + "loss": 1.6171072006225586, + "step": 45230 + }, + { + "epoch": 0.13693944576453318, + "grad_norm": 0.1297508031129837, + "learning_rate": 0.00010821049595433638, + "loss": 1.6046318054199218, + "step": 45240 + }, + { + "epoch": 0.136969715314879, + "grad_norm": 0.14039911329746246, + "learning_rate": 0.00010820670077269898, + "loss": 1.5970046043395996, + "step": 45250 + }, + { + "epoch": 0.13699998486522483, + "grad_norm": 0.13476505875587463, + "learning_rate": 0.00010820290559106159, + "loss": 1.6319398880004883, + "step": 45260 + }, + { + "epoch": 0.13703025441557065, + "grad_norm": 0.1301865428686142, + "learning_rate": 0.0001081991104094242, + "loss": 1.62012939453125, + "step": 45270 + }, + { + "epoch": 0.13706052396591648, + "grad_norm": 0.1235354095697403, + "learning_rate": 0.0001081953152277868, + "loss": 1.6519718170166016, + "step": 45280 + }, + { + "epoch": 0.13709079351626233, + "grad_norm": 0.1491881161928177, + "learning_rate": 0.00010819152004614941, + "loss": 1.6601205825805665, + "step": 45290 + }, + { + "epoch": 0.13712106306660815, + "grad_norm": 0.12476520240306854, + "learning_rate": 0.00010818772486451201, + "loss": 1.6406402587890625, + "step": 45300 + }, + { + "epoch": 0.13715133261695397, + "grad_norm": 0.12694580852985382, + "learning_rate": 0.00010818392968287462, + "loss": 1.6557857513427734, + "step": 45310 + }, + { + "epoch": 0.1371816021672998, + "grad_norm": 0.13316114246845245, + "learning_rate": 0.00010818013450123723, + "loss": 1.6030677795410155, + "step": 45320 + }, + { + "epoch": 0.13721187171764562, + "grad_norm": 0.14860346913337708, + "learning_rate": 0.00010817633931959985, + "loss": 1.6183076858520509, + "step": 45330 + }, + { + "epoch": 0.13724214126799147, + "grad_norm": 0.13361743092536926, + "learning_rate": 0.00010817254413796245, + "loss": 1.6596199035644532, + "step": 45340 + }, + { + "epoch": 0.1372724108183373, + "grad_norm": 0.1263893097639084, + "learning_rate": 0.00010816874895632506, + "loss": 1.6456024169921875, + "step": 45350 + }, + { + "epoch": 0.13730268036868312, + "grad_norm": 0.1458812952041626, + "learning_rate": 0.00010816495377468766, + "loss": 1.604237937927246, + "step": 45360 + }, + { + "epoch": 0.13733294991902895, + "grad_norm": 0.1442635953426361, + "learning_rate": 0.00010816115859305027, + "loss": 1.6175655364990233, + "step": 45370 + }, + { + "epoch": 0.13736321946937477, + "grad_norm": 0.1438862830400467, + "learning_rate": 0.00010815736341141288, + "loss": 1.6185462951660157, + "step": 45380 + }, + { + "epoch": 0.13739348901972062, + "grad_norm": 0.1363944262266159, + "learning_rate": 0.00010815356822977548, + "loss": 1.6527843475341797, + "step": 45390 + }, + { + "epoch": 0.13742375857006645, + "grad_norm": 0.10784871876239777, + "learning_rate": 0.00010814977304813809, + "loss": 1.6447677612304688, + "step": 45400 + }, + { + "epoch": 0.13745402812041227, + "grad_norm": 0.12440065294504166, + "learning_rate": 0.00010814597786650069, + "loss": 1.6310291290283203, + "step": 45410 + }, + { + "epoch": 0.1374842976707581, + "grad_norm": 0.14497746527194977, + "learning_rate": 0.0001081421826848633, + "loss": 1.6933242797851562, + "step": 45420 + }, + { + "epoch": 0.13751456722110392, + "grad_norm": 0.13406163454055786, + "learning_rate": 0.0001081383875032259, + "loss": 1.5581562042236328, + "step": 45430 + }, + { + "epoch": 0.13754483677144977, + "grad_norm": 0.14365290105342865, + "learning_rate": 0.00010813459232158851, + "loss": 1.6349174499511718, + "step": 45440 + }, + { + "epoch": 0.1375751063217956, + "grad_norm": 0.1414984166622162, + "learning_rate": 0.00010813079713995112, + "loss": 1.6387937545776368, + "step": 45450 + }, + { + "epoch": 0.13760537587214142, + "grad_norm": 0.13366268575191498, + "learning_rate": 0.00010812700195831372, + "loss": 1.5913116455078125, + "step": 45460 + }, + { + "epoch": 0.13763564542248724, + "grad_norm": 0.12910151481628418, + "learning_rate": 0.00010812320677667634, + "loss": 1.6391889572143554, + "step": 45470 + }, + { + "epoch": 0.1376659149728331, + "grad_norm": 0.12833714485168457, + "learning_rate": 0.00010811941159503893, + "loss": 1.5917617797851562, + "step": 45480 + }, + { + "epoch": 0.13769618452317892, + "grad_norm": 0.13403485715389252, + "learning_rate": 0.00010811561641340155, + "loss": 1.6261894226074218, + "step": 45490 + }, + { + "epoch": 0.13772645407352474, + "grad_norm": 0.1200350672006607, + "learning_rate": 0.00010811182123176415, + "loss": 1.624125099182129, + "step": 45500 + }, + { + "epoch": 0.13772645407352474, + "eval_loss": 1.6077405214309692, + "eval_runtime": 27.9377, + "eval_samples_per_second": 17.897, + "eval_steps_per_second": 1.145, + "step": 45500 + }, + { + "epoch": 0.13775672362387056, + "grad_norm": 0.1465732306241989, + "learning_rate": 0.00010810802605012677, + "loss": 1.628554916381836, + "step": 45510 + }, + { + "epoch": 0.1377869931742164, + "grad_norm": 0.14678046107292175, + "learning_rate": 0.00010810423086848936, + "loss": 1.6166790008544922, + "step": 45520 + }, + { + "epoch": 0.13781726272456224, + "grad_norm": 0.13218790292739868, + "learning_rate": 0.00010810043568685198, + "loss": 1.6104248046875, + "step": 45530 + }, + { + "epoch": 0.13784753227490806, + "grad_norm": 0.16517239809036255, + "learning_rate": 0.00010809664050521458, + "loss": 1.6553001403808594, + "step": 45540 + }, + { + "epoch": 0.1378778018252539, + "grad_norm": 0.1336441934108734, + "learning_rate": 0.00010809284532357719, + "loss": 1.6401268005371095, + "step": 45550 + }, + { + "epoch": 0.1379080713755997, + "grad_norm": 0.12677544355392456, + "learning_rate": 0.0001080890501419398, + "loss": 1.5967554092407226, + "step": 45560 + }, + { + "epoch": 0.13793834092594554, + "grad_norm": 0.12961527705192566, + "learning_rate": 0.0001080852549603024, + "loss": 1.5873937606811523, + "step": 45570 + }, + { + "epoch": 0.1379686104762914, + "grad_norm": 0.14158056676387787, + "learning_rate": 0.00010808145977866501, + "loss": 1.6187429428100586, + "step": 45580 + }, + { + "epoch": 0.1379988800266372, + "grad_norm": 0.13728153705596924, + "learning_rate": 0.00010807766459702761, + "loss": 1.5978349685668944, + "step": 45590 + }, + { + "epoch": 0.13802914957698303, + "grad_norm": 0.16662749648094177, + "learning_rate": 0.00010807386941539023, + "loss": 1.6254213333129883, + "step": 45600 + }, + { + "epoch": 0.13805941912732886, + "grad_norm": 0.16515807807445526, + "learning_rate": 0.00010807007423375283, + "loss": 1.6456649780273438, + "step": 45610 + }, + { + "epoch": 0.13808968867767468, + "grad_norm": 0.14767704904079437, + "learning_rate": 0.00010806627905211545, + "loss": 1.6468782424926758, + "step": 45620 + }, + { + "epoch": 0.13811995822802053, + "grad_norm": 0.15210269391536713, + "learning_rate": 0.00010806248387047804, + "loss": 1.582040023803711, + "step": 45630 + }, + { + "epoch": 0.13815022777836636, + "grad_norm": 0.12257269769906998, + "learning_rate": 0.00010805868868884066, + "loss": 1.6064090728759766, + "step": 45640 + }, + { + "epoch": 0.13818049732871218, + "grad_norm": 0.13965944945812225, + "learning_rate": 0.00010805489350720325, + "loss": 1.6340465545654297, + "step": 45650 + }, + { + "epoch": 0.138210766879058, + "grad_norm": 0.15163986384868622, + "learning_rate": 0.00010805109832556587, + "loss": 1.612843894958496, + "step": 45660 + }, + { + "epoch": 0.13824103642940383, + "grad_norm": 0.13797931373119354, + "learning_rate": 0.00010804730314392846, + "loss": 1.623988723754883, + "step": 45670 + }, + { + "epoch": 0.13827130597974968, + "grad_norm": 0.1409238576889038, + "learning_rate": 0.00010804350796229108, + "loss": 1.5954174041748046, + "step": 45680 + }, + { + "epoch": 0.1383015755300955, + "grad_norm": 0.1462019979953766, + "learning_rate": 0.00010803971278065367, + "loss": 1.5894041061401367, + "step": 45690 + }, + { + "epoch": 0.13833184508044133, + "grad_norm": 0.14636816084384918, + "learning_rate": 0.00010803591759901629, + "loss": 1.6016132354736328, + "step": 45700 + }, + { + "epoch": 0.13836211463078715, + "grad_norm": 0.13234859704971313, + "learning_rate": 0.00010803212241737889, + "loss": 1.6323991775512696, + "step": 45710 + }, + { + "epoch": 0.13839238418113298, + "grad_norm": 0.15573646128177643, + "learning_rate": 0.0001080283272357415, + "loss": 1.587025260925293, + "step": 45720 + }, + { + "epoch": 0.13842265373147883, + "grad_norm": 0.125428706407547, + "learning_rate": 0.00010802453205410411, + "loss": 1.633189582824707, + "step": 45730 + }, + { + "epoch": 0.13845292328182465, + "grad_norm": 0.1485753357410431, + "learning_rate": 0.00010802073687246672, + "loss": 1.58831787109375, + "step": 45740 + }, + { + "epoch": 0.13848319283217048, + "grad_norm": 0.13000133633613586, + "learning_rate": 0.00010801694169082934, + "loss": 1.5976223945617676, + "step": 45750 + }, + { + "epoch": 0.1385134623825163, + "grad_norm": 0.13760486245155334, + "learning_rate": 0.00010801314650919193, + "loss": 1.6266563415527344, + "step": 45760 + }, + { + "epoch": 0.13854373193286212, + "grad_norm": 0.11401385068893433, + "learning_rate": 0.00010800935132755455, + "loss": 1.6271553039550781, + "step": 45770 + }, + { + "epoch": 0.13857400148320798, + "grad_norm": 0.13263601064682007, + "learning_rate": 0.00010800555614591714, + "loss": 1.5987516403198243, + "step": 45780 + }, + { + "epoch": 0.1386042710335538, + "grad_norm": 0.13915738463401794, + "learning_rate": 0.00010800176096427976, + "loss": 1.5718702316284179, + "step": 45790 + }, + { + "epoch": 0.13863454058389962, + "grad_norm": 0.13221041858196259, + "learning_rate": 0.00010799796578264235, + "loss": 1.6119377136230468, + "step": 45800 + }, + { + "epoch": 0.13866481013424545, + "grad_norm": 0.13210538029670715, + "learning_rate": 0.00010799417060100497, + "loss": 1.620753860473633, + "step": 45810 + }, + { + "epoch": 0.1386950796845913, + "grad_norm": 0.1302587240934372, + "learning_rate": 0.00010799037541936756, + "loss": 1.6481288909912108, + "step": 45820 + }, + { + "epoch": 0.13872534923493712, + "grad_norm": 0.11372624337673187, + "learning_rate": 0.00010798658023773018, + "loss": 1.5829400062561034, + "step": 45830 + }, + { + "epoch": 0.13875561878528295, + "grad_norm": 0.1251242458820343, + "learning_rate": 0.00010798278505609279, + "loss": 1.6204727172851563, + "step": 45840 + }, + { + "epoch": 0.13878588833562877, + "grad_norm": 0.16088594496250153, + "learning_rate": 0.0001079789898744554, + "loss": 1.5881508827209472, + "step": 45850 + }, + { + "epoch": 0.1388161578859746, + "grad_norm": 0.134470596909523, + "learning_rate": 0.000107975194692818, + "loss": 1.6498577117919921, + "step": 45860 + }, + { + "epoch": 0.13884642743632045, + "grad_norm": 0.1411161571741104, + "learning_rate": 0.00010797139951118061, + "loss": 1.626907730102539, + "step": 45870 + }, + { + "epoch": 0.13887669698666627, + "grad_norm": 0.15711544454097748, + "learning_rate": 0.00010796760432954321, + "loss": 1.6504377365112304, + "step": 45880 + }, + { + "epoch": 0.1389069665370121, + "grad_norm": 0.1425490528345108, + "learning_rate": 0.00010796380914790582, + "loss": 1.6128580093383789, + "step": 45890 + }, + { + "epoch": 0.13893723608735792, + "grad_norm": 0.11876732856035233, + "learning_rate": 0.00010796001396626843, + "loss": 1.5904530525207519, + "step": 45900 + }, + { + "epoch": 0.13896750563770374, + "grad_norm": 0.13838286697864532, + "learning_rate": 0.00010795621878463103, + "loss": 1.6133243560791015, + "step": 45910 + }, + { + "epoch": 0.1389977751880496, + "grad_norm": 0.13273519277572632, + "learning_rate": 0.00010795242360299364, + "loss": 1.630651092529297, + "step": 45920 + }, + { + "epoch": 0.13902804473839542, + "grad_norm": 0.13680878281593323, + "learning_rate": 0.00010794862842135624, + "loss": 1.6165748596191407, + "step": 45930 + }, + { + "epoch": 0.13905831428874124, + "grad_norm": 0.13620994985103607, + "learning_rate": 0.00010794483323971886, + "loss": 1.600986862182617, + "step": 45940 + }, + { + "epoch": 0.13908858383908707, + "grad_norm": 0.14528588950634003, + "learning_rate": 0.00010794103805808146, + "loss": 1.614170455932617, + "step": 45950 + }, + { + "epoch": 0.1391188533894329, + "grad_norm": 0.13819560408592224, + "learning_rate": 0.00010793724287644407, + "loss": 1.6628147125244142, + "step": 45960 + }, + { + "epoch": 0.13914912293977874, + "grad_norm": 0.12480335682630539, + "learning_rate": 0.00010793344769480668, + "loss": 1.5818735122680665, + "step": 45970 + }, + { + "epoch": 0.13917939249012456, + "grad_norm": 0.1335807889699936, + "learning_rate": 0.00010792965251316929, + "loss": 1.6122440338134765, + "step": 45980 + }, + { + "epoch": 0.1392096620404704, + "grad_norm": 0.13685008883476257, + "learning_rate": 0.00010792585733153189, + "loss": 1.6383684158325196, + "step": 45990 + }, + { + "epoch": 0.1392399315908162, + "grad_norm": 0.14182579517364502, + "learning_rate": 0.0001079220621498945, + "loss": 1.6309207916259765, + "step": 46000 + }, + { + "epoch": 0.1392399315908162, + "eval_loss": 1.6121797561645508, + "eval_runtime": 28.2079, + "eval_samples_per_second": 17.726, + "eval_steps_per_second": 1.134, + "step": 46000 + }, + { + "epoch": 0.13927020114116204, + "grad_norm": 0.14580918848514557, + "learning_rate": 0.0001079182669682571, + "loss": 1.609151268005371, + "step": 46010 + }, + { + "epoch": 0.1393004706915079, + "grad_norm": 0.12843263149261475, + "learning_rate": 0.00010791447178661971, + "loss": 1.6102697372436523, + "step": 46020 + }, + { + "epoch": 0.1393307402418537, + "grad_norm": 0.12941144406795502, + "learning_rate": 0.00010791067660498232, + "loss": 1.6612640380859376, + "step": 46030 + }, + { + "epoch": 0.13936100979219954, + "grad_norm": 0.13181202113628387, + "learning_rate": 0.00010790688142334492, + "loss": 1.6010774612426757, + "step": 46040 + }, + { + "epoch": 0.13939127934254536, + "grad_norm": 0.13998647034168243, + "learning_rate": 0.00010790308624170753, + "loss": 1.618697738647461, + "step": 46050 + }, + { + "epoch": 0.13942154889289118, + "grad_norm": 0.1327090561389923, + "learning_rate": 0.00010789929106007013, + "loss": 1.6227869033813476, + "step": 46060 + }, + { + "epoch": 0.13945181844323704, + "grad_norm": 0.13294361531734467, + "learning_rate": 0.00010789549587843274, + "loss": 1.5792710304260253, + "step": 46070 + }, + { + "epoch": 0.13948208799358286, + "grad_norm": 0.14515210688114166, + "learning_rate": 0.00010789170069679536, + "loss": 1.615147590637207, + "step": 46080 + }, + { + "epoch": 0.13951235754392868, + "grad_norm": 0.13597992062568665, + "learning_rate": 0.00010788790551515795, + "loss": 1.6464942932128905, + "step": 46090 + }, + { + "epoch": 0.1395426270942745, + "grad_norm": 0.1318339854478836, + "learning_rate": 0.00010788411033352057, + "loss": 1.6363649368286133, + "step": 46100 + }, + { + "epoch": 0.13957289664462033, + "grad_norm": 0.12903736531734467, + "learning_rate": 0.00010788031515188316, + "loss": 1.6183935165405274, + "step": 46110 + }, + { + "epoch": 0.13960316619496618, + "grad_norm": 0.14828693866729736, + "learning_rate": 0.00010787651997024578, + "loss": 1.6098434448242187, + "step": 46120 + }, + { + "epoch": 0.139633435745312, + "grad_norm": 0.1378592997789383, + "learning_rate": 0.00010787272478860838, + "loss": 1.604250717163086, + "step": 46130 + }, + { + "epoch": 0.13966370529565783, + "grad_norm": 0.14536963403224945, + "learning_rate": 0.000107868929606971, + "loss": 1.6323993682861329, + "step": 46140 + }, + { + "epoch": 0.13969397484600365, + "grad_norm": 0.1453002542257309, + "learning_rate": 0.0001078651344253336, + "loss": 1.613121795654297, + "step": 46150 + }, + { + "epoch": 0.13972424439634948, + "grad_norm": 0.136931911110878, + "learning_rate": 0.00010786133924369621, + "loss": 1.6119956970214844, + "step": 46160 + }, + { + "epoch": 0.13975451394669533, + "grad_norm": 0.13639108836650848, + "learning_rate": 0.00010785754406205881, + "loss": 1.6122812271118163, + "step": 46170 + }, + { + "epoch": 0.13978478349704115, + "grad_norm": 0.12984541058540344, + "learning_rate": 0.00010785374888042142, + "loss": 1.6341842651367187, + "step": 46180 + }, + { + "epoch": 0.13981505304738698, + "grad_norm": 0.12868820130825043, + "learning_rate": 0.00010784995369878403, + "loss": 1.6092029571533204, + "step": 46190 + }, + { + "epoch": 0.1398453225977328, + "grad_norm": 0.13817831873893738, + "learning_rate": 0.00010784615851714663, + "loss": 1.579905891418457, + "step": 46200 + }, + { + "epoch": 0.13987559214807865, + "grad_norm": 0.12157844007015228, + "learning_rate": 0.00010784236333550925, + "loss": 1.6042015075683593, + "step": 46210 + }, + { + "epoch": 0.13990586169842448, + "grad_norm": 0.14184251427650452, + "learning_rate": 0.00010783856815387184, + "loss": 1.6203475952148438, + "step": 46220 + }, + { + "epoch": 0.1399361312487703, + "grad_norm": 0.14499548077583313, + "learning_rate": 0.00010783477297223446, + "loss": 1.6303756713867188, + "step": 46230 + }, + { + "epoch": 0.13996640079911613, + "grad_norm": 0.13327570259571075, + "learning_rate": 0.00010783097779059705, + "loss": 1.6233665466308593, + "step": 46240 + }, + { + "epoch": 0.13999667034946195, + "grad_norm": 0.14332462847232819, + "learning_rate": 0.00010782718260895967, + "loss": 1.6282638549804687, + "step": 46250 + }, + { + "epoch": 0.1400269398998078, + "grad_norm": 0.1280924379825592, + "learning_rate": 0.00010782338742732227, + "loss": 1.6082462310791015, + "step": 46260 + }, + { + "epoch": 0.14005720945015362, + "grad_norm": 0.14336149394512177, + "learning_rate": 0.00010781959224568489, + "loss": 1.5957098007202148, + "step": 46270 + }, + { + "epoch": 0.14008747900049945, + "grad_norm": 0.1349504441022873, + "learning_rate": 0.00010781579706404748, + "loss": 1.6394603729248047, + "step": 46280 + }, + { + "epoch": 0.14011774855084527, + "grad_norm": 0.14375151693820953, + "learning_rate": 0.0001078120018824101, + "loss": 1.573044776916504, + "step": 46290 + }, + { + "epoch": 0.1401480181011911, + "grad_norm": 0.13403362035751343, + "learning_rate": 0.00010780820670077269, + "loss": 1.6327428817749023, + "step": 46300 + }, + { + "epoch": 0.14017828765153695, + "grad_norm": 0.13059961795806885, + "learning_rate": 0.00010780441151913531, + "loss": 1.6232242584228516, + "step": 46310 + }, + { + "epoch": 0.14020855720188277, + "grad_norm": 0.12242323905229568, + "learning_rate": 0.0001078006163374979, + "loss": 1.5930305480957032, + "step": 46320 + }, + { + "epoch": 0.1402388267522286, + "grad_norm": 0.14379850029945374, + "learning_rate": 0.00010779682115586052, + "loss": 1.6255908966064454, + "step": 46330 + }, + { + "epoch": 0.14026909630257442, + "grad_norm": 0.13627295196056366, + "learning_rate": 0.00010779302597422313, + "loss": 1.6119293212890624, + "step": 46340 + }, + { + "epoch": 0.14029936585292024, + "grad_norm": 0.13818687200546265, + "learning_rate": 0.00010778923079258573, + "loss": 1.6134933471679687, + "step": 46350 + }, + { + "epoch": 0.1403296354032661, + "grad_norm": 0.13225920498371124, + "learning_rate": 0.00010778543561094835, + "loss": 1.5662689208984375, + "step": 46360 + }, + { + "epoch": 0.14035990495361192, + "grad_norm": 0.14473120868206024, + "learning_rate": 0.00010778164042931095, + "loss": 1.6105987548828125, + "step": 46370 + }, + { + "epoch": 0.14039017450395774, + "grad_norm": 0.12817159295082092, + "learning_rate": 0.00010777784524767357, + "loss": 1.5986794471740722, + "step": 46380 + }, + { + "epoch": 0.14042044405430357, + "grad_norm": 0.1359196901321411, + "learning_rate": 0.00010777405006603616, + "loss": 1.6104536056518555, + "step": 46390 + }, + { + "epoch": 0.1404507136046494, + "grad_norm": 0.13737304508686066, + "learning_rate": 0.00010777025488439878, + "loss": 1.6045974731445312, + "step": 46400 + }, + { + "epoch": 0.14048098315499524, + "grad_norm": 0.13903093338012695, + "learning_rate": 0.00010776645970276137, + "loss": 1.6220964431762694, + "step": 46410 + }, + { + "epoch": 0.14051125270534107, + "grad_norm": 0.12966161966323853, + "learning_rate": 0.00010776266452112399, + "loss": 1.621268081665039, + "step": 46420 + }, + { + "epoch": 0.1405415222556869, + "grad_norm": 0.15698152780532837, + "learning_rate": 0.00010775886933948658, + "loss": 1.5999614715576171, + "step": 46430 + }, + { + "epoch": 0.14057179180603271, + "grad_norm": 0.13766667246818542, + "learning_rate": 0.0001077550741578492, + "loss": 1.6162096023559571, + "step": 46440 + }, + { + "epoch": 0.14060206135637854, + "grad_norm": 0.1250748187303543, + "learning_rate": 0.00010775127897621181, + "loss": 1.6076631546020508, + "step": 46450 + }, + { + "epoch": 0.1406323309067244, + "grad_norm": 0.1253008246421814, + "learning_rate": 0.00010774748379457441, + "loss": 1.625647735595703, + "step": 46460 + }, + { + "epoch": 0.1406626004570702, + "grad_norm": 0.14679259061813354, + "learning_rate": 0.00010774368861293702, + "loss": 1.6167564392089844, + "step": 46470 + }, + { + "epoch": 0.14069287000741604, + "grad_norm": 0.13113628327846527, + "learning_rate": 0.00010773989343129962, + "loss": 1.635598373413086, + "step": 46480 + }, + { + "epoch": 0.14072313955776186, + "grad_norm": 0.12438688427209854, + "learning_rate": 0.00010773609824966223, + "loss": 1.6069555282592773, + "step": 46490 + }, + { + "epoch": 0.14075340910810769, + "grad_norm": 0.1433326005935669, + "learning_rate": 0.00010773230306802484, + "loss": 1.619949722290039, + "step": 46500 + }, + { + "epoch": 0.14075340910810769, + "eval_loss": 1.6337827444076538, + "eval_runtime": 28.3387, + "eval_samples_per_second": 17.644, + "eval_steps_per_second": 1.129, + "step": 46500 + }, + { + "epoch": 0.14078367865845354, + "grad_norm": 0.14953428506851196, + "learning_rate": 0.00010772850788638744, + "loss": 1.609354019165039, + "step": 46510 + }, + { + "epoch": 0.14081394820879936, + "grad_norm": 0.13306541740894318, + "learning_rate": 0.00010772471270475005, + "loss": 1.6299825668334962, + "step": 46520 + }, + { + "epoch": 0.14084421775914518, + "grad_norm": 0.13075338304042816, + "learning_rate": 0.00010772091752311265, + "loss": 1.6292057037353516, + "step": 46530 + }, + { + "epoch": 0.140874487309491, + "grad_norm": 0.13312798738479614, + "learning_rate": 0.00010771712234147526, + "loss": 1.6248796463012696, + "step": 46540 + }, + { + "epoch": 0.14090475685983686, + "grad_norm": 0.1284542679786682, + "learning_rate": 0.00010771332715983788, + "loss": 1.597705078125, + "step": 46550 + }, + { + "epoch": 0.14093502641018268, + "grad_norm": 0.13981866836547852, + "learning_rate": 0.00010770953197820047, + "loss": 1.5840595245361329, + "step": 46560 + }, + { + "epoch": 0.1409652959605285, + "grad_norm": 0.11811327934265137, + "learning_rate": 0.00010770573679656309, + "loss": 1.645320510864258, + "step": 46570 + }, + { + "epoch": 0.14099556551087433, + "grad_norm": 0.12537828087806702, + "learning_rate": 0.0001077019416149257, + "loss": 1.6354894638061523, + "step": 46580 + }, + { + "epoch": 0.14102583506122016, + "grad_norm": 0.15063677728176117, + "learning_rate": 0.0001076981464332883, + "loss": 1.6305801391601562, + "step": 46590 + }, + { + "epoch": 0.141056104611566, + "grad_norm": 0.12246806919574738, + "learning_rate": 0.00010769435125165091, + "loss": 1.6432428359985352, + "step": 46600 + }, + { + "epoch": 0.14108637416191183, + "grad_norm": 0.13520807027816772, + "learning_rate": 0.00010769055607001352, + "loss": 1.6058555603027345, + "step": 46610 + }, + { + "epoch": 0.14111664371225766, + "grad_norm": 0.14320972561836243, + "learning_rate": 0.00010768676088837612, + "loss": 1.6044471740722657, + "step": 46620 + }, + { + "epoch": 0.14114691326260348, + "grad_norm": 0.11837390065193176, + "learning_rate": 0.00010768296570673873, + "loss": 1.614456558227539, + "step": 46630 + }, + { + "epoch": 0.1411771828129493, + "grad_norm": 0.12839986383914948, + "learning_rate": 0.00010767917052510133, + "loss": 1.6020034790039062, + "step": 46640 + }, + { + "epoch": 0.14120745236329516, + "grad_norm": 0.14695347845554352, + "learning_rate": 0.00010767537534346394, + "loss": 1.6115503311157227, + "step": 46650 + }, + { + "epoch": 0.14123772191364098, + "grad_norm": 0.12386012077331543, + "learning_rate": 0.00010767158016182655, + "loss": 1.6122074127197266, + "step": 46660 + }, + { + "epoch": 0.1412679914639868, + "grad_norm": 0.1396339386701584, + "learning_rate": 0.00010766778498018915, + "loss": 1.5949451446533203, + "step": 46670 + }, + { + "epoch": 0.14129826101433263, + "grad_norm": 0.12668690085411072, + "learning_rate": 0.00010766398979855176, + "loss": 1.6162380218505858, + "step": 46680 + }, + { + "epoch": 0.14132853056467845, + "grad_norm": 0.16521798074245453, + "learning_rate": 0.00010766019461691436, + "loss": 1.6354124069213867, + "step": 46690 + }, + { + "epoch": 0.1413588001150243, + "grad_norm": 0.1387658566236496, + "learning_rate": 0.00010765639943527697, + "loss": 1.6301185607910156, + "step": 46700 + }, + { + "epoch": 0.14138906966537013, + "grad_norm": 0.1302364468574524, + "learning_rate": 0.00010765260425363959, + "loss": 1.6368053436279297, + "step": 46710 + }, + { + "epoch": 0.14141933921571595, + "grad_norm": 0.12137189507484436, + "learning_rate": 0.00010764880907200218, + "loss": 1.6083492279052733, + "step": 46720 + }, + { + "epoch": 0.14144960876606177, + "grad_norm": 0.13196244835853577, + "learning_rate": 0.0001076450138903648, + "loss": 1.6390066146850586, + "step": 46730 + }, + { + "epoch": 0.1414798783164076, + "grad_norm": 0.15349265933036804, + "learning_rate": 0.00010764121870872739, + "loss": 1.6161785125732422, + "step": 46740 + }, + { + "epoch": 0.14151014786675345, + "grad_norm": 0.1259961724281311, + "learning_rate": 0.00010763742352709001, + "loss": 1.638962173461914, + "step": 46750 + }, + { + "epoch": 0.14154041741709927, + "grad_norm": 0.14177265763282776, + "learning_rate": 0.00010763362834545262, + "loss": 1.5986236572265624, + "step": 46760 + }, + { + "epoch": 0.1415706869674451, + "grad_norm": 0.1310613751411438, + "learning_rate": 0.00010762983316381522, + "loss": 1.6583028793334962, + "step": 46770 + }, + { + "epoch": 0.14160095651779092, + "grad_norm": 0.12805265188217163, + "learning_rate": 0.00010762603798217783, + "loss": 1.6248321533203125, + "step": 46780 + }, + { + "epoch": 0.14163122606813675, + "grad_norm": 0.12427284568548203, + "learning_rate": 0.00010762224280054044, + "loss": 1.6079042434692383, + "step": 46790 + }, + { + "epoch": 0.1416614956184826, + "grad_norm": 0.14079150557518005, + "learning_rate": 0.00010761844761890304, + "loss": 1.6536125183105468, + "step": 46800 + }, + { + "epoch": 0.14169176516882842, + "grad_norm": 0.13795106112957, + "learning_rate": 0.00010761465243726565, + "loss": 1.5993905067443848, + "step": 46810 + }, + { + "epoch": 0.14172203471917424, + "grad_norm": 0.1272938847541809, + "learning_rate": 0.00010761085725562827, + "loss": 1.6426656723022461, + "step": 46820 + }, + { + "epoch": 0.14175230426952007, + "grad_norm": 0.12965236604213715, + "learning_rate": 0.00010760706207399086, + "loss": 1.6294471740722656, + "step": 46830 + }, + { + "epoch": 0.1417825738198659, + "grad_norm": 0.13455404341220856, + "learning_rate": 0.00010760326689235348, + "loss": 1.6205352783203124, + "step": 46840 + }, + { + "epoch": 0.14181284337021174, + "grad_norm": 0.1490262746810913, + "learning_rate": 0.00010759947171071607, + "loss": 1.62705078125, + "step": 46850 + }, + { + "epoch": 0.14184311292055757, + "grad_norm": 0.12418176233768463, + "learning_rate": 0.00010759567652907869, + "loss": 1.6163373947143556, + "step": 46860 + }, + { + "epoch": 0.1418733824709034, + "grad_norm": 0.11749562621116638, + "learning_rate": 0.00010759188134744128, + "loss": 1.5862417221069336, + "step": 46870 + }, + { + "epoch": 0.14190365202124922, + "grad_norm": 0.13826411962509155, + "learning_rate": 0.0001075880861658039, + "loss": 1.590908908843994, + "step": 46880 + }, + { + "epoch": 0.14193392157159507, + "grad_norm": 0.1223822757601738, + "learning_rate": 0.0001075842909841665, + "loss": 1.591209602355957, + "step": 46890 + }, + { + "epoch": 0.1419641911219409, + "grad_norm": 0.12774331867694855, + "learning_rate": 0.00010758049580252912, + "loss": 1.6048206329345702, + "step": 46900 + }, + { + "epoch": 0.14199446067228672, + "grad_norm": 0.1315617561340332, + "learning_rate": 0.00010757670062089171, + "loss": 1.5873955726623534, + "step": 46910 + }, + { + "epoch": 0.14202473022263254, + "grad_norm": 0.127470463514328, + "learning_rate": 0.00010757290543925433, + "loss": 1.6297378540039062, + "step": 46920 + }, + { + "epoch": 0.14205499977297836, + "grad_norm": 0.12728269398212433, + "learning_rate": 0.00010756911025761692, + "loss": 1.6291183471679687, + "step": 46930 + }, + { + "epoch": 0.14208526932332421, + "grad_norm": 0.1343509405851364, + "learning_rate": 0.00010756531507597954, + "loss": 1.6254756927490235, + "step": 46940 + }, + { + "epoch": 0.14211553887367004, + "grad_norm": 0.12924133241176605, + "learning_rate": 0.00010756151989434215, + "loss": 1.6183305740356446, + "step": 46950 + }, + { + "epoch": 0.14214580842401586, + "grad_norm": 0.14721855521202087, + "learning_rate": 0.00010755772471270475, + "loss": 1.6057563781738282, + "step": 46960 + }, + { + "epoch": 0.1421760779743617, + "grad_norm": 0.14757174253463745, + "learning_rate": 0.00010755392953106737, + "loss": 1.6508893966674805, + "step": 46970 + }, + { + "epoch": 0.1422063475247075, + "grad_norm": 0.16627921164035797, + "learning_rate": 0.00010755013434942996, + "loss": 1.636509895324707, + "step": 46980 + }, + { + "epoch": 0.14223661707505336, + "grad_norm": 0.14820601046085358, + "learning_rate": 0.00010754633916779258, + "loss": 1.6312847137451172, + "step": 46990 + }, + { + "epoch": 0.14226688662539919, + "grad_norm": 0.1429329663515091, + "learning_rate": 0.00010754254398615517, + "loss": 1.6105390548706056, + "step": 47000 + }, + { + "epoch": 0.14226688662539919, + "eval_loss": 1.6125653982162476, + "eval_runtime": 28.3206, + "eval_samples_per_second": 17.655, + "eval_steps_per_second": 1.13, + "step": 47000 + }, + { + "epoch": 0.142297156175745, + "grad_norm": 0.13899271190166473, + "learning_rate": 0.0001075387488045178, + "loss": 1.5912424087524415, + "step": 47010 + }, + { + "epoch": 0.14232742572609083, + "grad_norm": 0.12418108433485031, + "learning_rate": 0.00010753495362288039, + "loss": 1.6160995483398437, + "step": 47020 + }, + { + "epoch": 0.14235769527643666, + "grad_norm": 0.14047905802726746, + "learning_rate": 0.000107531158441243, + "loss": 1.6028680801391602, + "step": 47030 + }, + { + "epoch": 0.1423879648267825, + "grad_norm": 0.14393705129623413, + "learning_rate": 0.0001075273632596056, + "loss": 1.62976131439209, + "step": 47040 + }, + { + "epoch": 0.14241823437712833, + "grad_norm": 0.14769597351551056, + "learning_rate": 0.00010752356807796822, + "loss": 1.625874137878418, + "step": 47050 + }, + { + "epoch": 0.14244850392747416, + "grad_norm": 0.1371387094259262, + "learning_rate": 0.00010751977289633081, + "loss": 1.588331699371338, + "step": 47060 + }, + { + "epoch": 0.14247877347781998, + "grad_norm": 0.12857837975025177, + "learning_rate": 0.00010751597771469343, + "loss": 1.6336219787597657, + "step": 47070 + }, + { + "epoch": 0.1425090430281658, + "grad_norm": 0.13208290934562683, + "learning_rate": 0.00010751218253305604, + "loss": 1.585866641998291, + "step": 47080 + }, + { + "epoch": 0.14253931257851166, + "grad_norm": 0.15524472296237946, + "learning_rate": 0.00010750838735141864, + "loss": 1.6389881134033204, + "step": 47090 + }, + { + "epoch": 0.14256958212885748, + "grad_norm": 0.13362860679626465, + "learning_rate": 0.00010750459216978125, + "loss": 1.635301399230957, + "step": 47100 + }, + { + "epoch": 0.1425998516792033, + "grad_norm": 0.1385907083749771, + "learning_rate": 0.00010750079698814385, + "loss": 1.6031679153442382, + "step": 47110 + }, + { + "epoch": 0.14263012122954913, + "grad_norm": 0.13473893702030182, + "learning_rate": 0.00010749700180650646, + "loss": 1.6388740539550781, + "step": 47120 + }, + { + "epoch": 0.14266039077989495, + "grad_norm": 0.13697780668735504, + "learning_rate": 0.00010749320662486907, + "loss": 1.605117416381836, + "step": 47130 + }, + { + "epoch": 0.1426906603302408, + "grad_norm": 0.1545863151550293, + "learning_rate": 0.00010748941144323167, + "loss": 1.5854710578918456, + "step": 47140 + }, + { + "epoch": 0.14272092988058663, + "grad_norm": 0.1457107961177826, + "learning_rate": 0.00010748561626159428, + "loss": 1.5815111160278321, + "step": 47150 + }, + { + "epoch": 0.14275119943093245, + "grad_norm": 0.11549140512943268, + "learning_rate": 0.0001074818210799569, + "loss": 1.6374513626098632, + "step": 47160 + }, + { + "epoch": 0.14278146898127828, + "grad_norm": 0.16083857417106628, + "learning_rate": 0.00010747802589831949, + "loss": 1.5867266654968262, + "step": 47170 + }, + { + "epoch": 0.1428117385316241, + "grad_norm": 0.11163903027772903, + "learning_rate": 0.00010747423071668211, + "loss": 1.6562688827514649, + "step": 47180 + }, + { + "epoch": 0.14284200808196995, + "grad_norm": 0.12701328098773956, + "learning_rate": 0.00010747043553504471, + "loss": 1.6179235458374024, + "step": 47190 + }, + { + "epoch": 0.14287227763231578, + "grad_norm": 0.1514185518026352, + "learning_rate": 0.00010746664035340732, + "loss": 1.6040273666381837, + "step": 47200 + }, + { + "epoch": 0.1429025471826616, + "grad_norm": 0.1286754459142685, + "learning_rate": 0.00010746284517176993, + "loss": 1.6149654388427734, + "step": 47210 + }, + { + "epoch": 0.14293281673300742, + "grad_norm": 0.12422341853380203, + "learning_rate": 0.00010745904999013253, + "loss": 1.6217287063598633, + "step": 47220 + }, + { + "epoch": 0.14296308628335327, + "grad_norm": 0.14807218313217163, + "learning_rate": 0.00010745525480849514, + "loss": 1.6266839981079102, + "step": 47230 + }, + { + "epoch": 0.1429933558336991, + "grad_norm": 0.12872928380966187, + "learning_rate": 0.00010745145962685774, + "loss": 1.59559326171875, + "step": 47240 + }, + { + "epoch": 0.14302362538404492, + "grad_norm": 0.12102628499269485, + "learning_rate": 0.00010744766444522035, + "loss": 1.622041893005371, + "step": 47250 + }, + { + "epoch": 0.14305389493439075, + "grad_norm": 0.1545492559671402, + "learning_rate": 0.00010744386926358296, + "loss": 1.6417943954467773, + "step": 47260 + }, + { + "epoch": 0.14308416448473657, + "grad_norm": 0.12478173524141312, + "learning_rate": 0.00010744007408194556, + "loss": 1.6119016647338866, + "step": 47270 + }, + { + "epoch": 0.14311443403508242, + "grad_norm": 0.12088593095541, + "learning_rate": 0.00010743627890030817, + "loss": 1.605525588989258, + "step": 47280 + }, + { + "epoch": 0.14314470358542825, + "grad_norm": 0.13809353113174438, + "learning_rate": 0.00010743248371867077, + "loss": 1.5961678504943848, + "step": 47290 + }, + { + "epoch": 0.14317497313577407, + "grad_norm": 0.13745935261249542, + "learning_rate": 0.00010742868853703338, + "loss": 1.60925350189209, + "step": 47300 + }, + { + "epoch": 0.1432052426861199, + "grad_norm": 0.14345820248126984, + "learning_rate": 0.00010742489335539599, + "loss": 1.6059005737304688, + "step": 47310 + }, + { + "epoch": 0.14323551223646572, + "grad_norm": 0.12683641910552979, + "learning_rate": 0.0001074210981737586, + "loss": 1.604964828491211, + "step": 47320 + }, + { + "epoch": 0.14326578178681157, + "grad_norm": 0.1360968053340912, + "learning_rate": 0.0001074173029921212, + "loss": 1.639938735961914, + "step": 47330 + }, + { + "epoch": 0.1432960513371574, + "grad_norm": 0.12948408722877502, + "learning_rate": 0.00010741350781048382, + "loss": 1.607183074951172, + "step": 47340 + }, + { + "epoch": 0.14332632088750322, + "grad_norm": 0.12280899286270142, + "learning_rate": 0.00010740971262884641, + "loss": 1.6437719345092774, + "step": 47350 + }, + { + "epoch": 0.14335659043784904, + "grad_norm": 0.1444225311279297, + "learning_rate": 0.00010740591744720903, + "loss": 1.6204353332519532, + "step": 47360 + }, + { + "epoch": 0.14338685998819486, + "grad_norm": 0.14055709540843964, + "learning_rate": 0.00010740212226557164, + "loss": 1.6153457641601563, + "step": 47370 + }, + { + "epoch": 0.14341712953854072, + "grad_norm": 0.12789708375930786, + "learning_rate": 0.00010739832708393424, + "loss": 1.6268173217773438, + "step": 47380 + }, + { + "epoch": 0.14344739908888654, + "grad_norm": 0.11350449174642563, + "learning_rate": 0.00010739453190229685, + "loss": 1.6118597030639648, + "step": 47390 + }, + { + "epoch": 0.14347766863923236, + "grad_norm": 0.12586621940135956, + "learning_rate": 0.00010739073672065945, + "loss": 1.5944867134094238, + "step": 47400 + }, + { + "epoch": 0.1435079381895782, + "grad_norm": 0.1370086818933487, + "learning_rate": 0.00010738694153902206, + "loss": 1.6116500854492188, + "step": 47410 + }, + { + "epoch": 0.143538207739924, + "grad_norm": 0.13460266590118408, + "learning_rate": 0.00010738314635738467, + "loss": 1.5731424331665038, + "step": 47420 + }, + { + "epoch": 0.14356847729026986, + "grad_norm": 0.13751265406608582, + "learning_rate": 0.00010737935117574727, + "loss": 1.6269027709960937, + "step": 47430 + }, + { + "epoch": 0.1435987468406157, + "grad_norm": 0.1301899403333664, + "learning_rate": 0.00010737555599410988, + "loss": 1.6073963165283203, + "step": 47440 + }, + { + "epoch": 0.1436290163909615, + "grad_norm": 0.13258585333824158, + "learning_rate": 0.0001073717608124725, + "loss": 1.6355422973632812, + "step": 47450 + }, + { + "epoch": 0.14365928594130734, + "grad_norm": 0.13196955621242523, + "learning_rate": 0.00010736796563083509, + "loss": 1.5952028274536132, + "step": 47460 + }, + { + "epoch": 0.14368955549165316, + "grad_norm": 0.137176513671875, + "learning_rate": 0.00010736417044919771, + "loss": 1.6098854064941406, + "step": 47470 + }, + { + "epoch": 0.143719825041999, + "grad_norm": 0.15411242842674255, + "learning_rate": 0.0001073603752675603, + "loss": 1.6344430923461915, + "step": 47480 + }, + { + "epoch": 0.14375009459234483, + "grad_norm": 0.13007745146751404, + "learning_rate": 0.00010735658008592292, + "loss": 1.6510976791381835, + "step": 47490 + }, + { + "epoch": 0.14378036414269066, + "grad_norm": 0.12560983002185822, + "learning_rate": 0.00010735278490428551, + "loss": 1.6665599822998047, + "step": 47500 + }, + { + "epoch": 0.14378036414269066, + "eval_loss": 1.604326605796814, + "eval_runtime": 28.3863, + "eval_samples_per_second": 17.614, + "eval_steps_per_second": 1.127, + "step": 47500 + }, + { + "epoch": 0.14381063369303648, + "grad_norm": 0.13645534217357635, + "learning_rate": 0.00010734898972264813, + "loss": 1.5998344421386719, + "step": 47510 + }, + { + "epoch": 0.1438409032433823, + "grad_norm": 0.14153191447257996, + "learning_rate": 0.00010734519454101072, + "loss": 1.6307884216308595, + "step": 47520 + }, + { + "epoch": 0.14387117279372816, + "grad_norm": 0.14088194072246552, + "learning_rate": 0.00010734139935937334, + "loss": 1.624613380432129, + "step": 47530 + }, + { + "epoch": 0.14390144234407398, + "grad_norm": 0.16062356531620026, + "learning_rate": 0.00010733760417773594, + "loss": 1.6070877075195313, + "step": 47540 + }, + { + "epoch": 0.1439317118944198, + "grad_norm": 0.12104494869709015, + "learning_rate": 0.00010733380899609856, + "loss": 1.6073444366455079, + "step": 47550 + }, + { + "epoch": 0.14396198144476563, + "grad_norm": 0.13816556334495544, + "learning_rate": 0.00010733001381446116, + "loss": 1.600653076171875, + "step": 47560 + }, + { + "epoch": 0.14399225099511145, + "grad_norm": 0.12461850792169571, + "learning_rate": 0.00010732621863282377, + "loss": 1.6115690231323243, + "step": 47570 + }, + { + "epoch": 0.1440225205454573, + "grad_norm": 0.12479289621114731, + "learning_rate": 0.00010732242345118639, + "loss": 1.6393321990966796, + "step": 47580 + }, + { + "epoch": 0.14405279009580313, + "grad_norm": 0.12551067769527435, + "learning_rate": 0.00010731862826954898, + "loss": 1.6485733032226562, + "step": 47590 + }, + { + "epoch": 0.14408305964614895, + "grad_norm": 0.1466420590877533, + "learning_rate": 0.0001073148330879116, + "loss": 1.6076141357421876, + "step": 47600 + }, + { + "epoch": 0.14411332919649478, + "grad_norm": 0.12493077665567398, + "learning_rate": 0.00010731103790627419, + "loss": 1.6365957260131836, + "step": 47610 + }, + { + "epoch": 0.14414359874684063, + "grad_norm": 0.13457514345645905, + "learning_rate": 0.00010730724272463681, + "loss": 1.6127685546875, + "step": 47620 + }, + { + "epoch": 0.14417386829718645, + "grad_norm": 0.11469823122024536, + "learning_rate": 0.0001073034475429994, + "loss": 1.638430404663086, + "step": 47630 + }, + { + "epoch": 0.14420413784753228, + "grad_norm": 0.1334000676870346, + "learning_rate": 0.00010729965236136202, + "loss": 1.6175739288330078, + "step": 47640 + }, + { + "epoch": 0.1442344073978781, + "grad_norm": 0.13494616746902466, + "learning_rate": 0.00010729585717972462, + "loss": 1.598836326599121, + "step": 47650 + }, + { + "epoch": 0.14426467694822392, + "grad_norm": 0.1397581249475479, + "learning_rate": 0.00010729206199808724, + "loss": 1.6428674697875976, + "step": 47660 + }, + { + "epoch": 0.14429494649856978, + "grad_norm": 0.1611756831407547, + "learning_rate": 0.00010728826681644983, + "loss": 1.583964729309082, + "step": 47670 + }, + { + "epoch": 0.1443252160489156, + "grad_norm": 0.12354674190282822, + "learning_rate": 0.00010728447163481245, + "loss": 1.5792394638061524, + "step": 47680 + }, + { + "epoch": 0.14435548559926142, + "grad_norm": 0.13161416351795197, + "learning_rate": 0.00010728067645317505, + "loss": 1.6097925186157227, + "step": 47690 + }, + { + "epoch": 0.14438575514960725, + "grad_norm": 0.13251444697380066, + "learning_rate": 0.00010727688127153766, + "loss": 1.5875370979309082, + "step": 47700 + }, + { + "epoch": 0.14441602469995307, + "grad_norm": 0.13684730231761932, + "learning_rate": 0.00010727308608990026, + "loss": 1.6365747451782227, + "step": 47710 + }, + { + "epoch": 0.14444629425029892, + "grad_norm": 0.15778771042823792, + "learning_rate": 0.00010726929090826287, + "loss": 1.6040328979492187, + "step": 47720 + }, + { + "epoch": 0.14447656380064475, + "grad_norm": 0.11939609050750732, + "learning_rate": 0.00010726549572662548, + "loss": 1.6519269943237305, + "step": 47730 + }, + { + "epoch": 0.14450683335099057, + "grad_norm": 0.13468578457832336, + "learning_rate": 0.00010726170054498808, + "loss": 1.5724279403686523, + "step": 47740 + }, + { + "epoch": 0.1445371029013364, + "grad_norm": 0.1395060122013092, + "learning_rate": 0.00010725790536335069, + "loss": 1.6165420532226562, + "step": 47750 + }, + { + "epoch": 0.14456737245168222, + "grad_norm": 0.12981629371643066, + "learning_rate": 0.0001072541101817133, + "loss": 1.6021486282348634, + "step": 47760 + }, + { + "epoch": 0.14459764200202807, + "grad_norm": 0.12607446312904358, + "learning_rate": 0.00010725031500007591, + "loss": 1.5963747024536132, + "step": 47770 + }, + { + "epoch": 0.1446279115523739, + "grad_norm": 0.13060873746871948, + "learning_rate": 0.0001072465198184385, + "loss": 1.6018524169921875, + "step": 47780 + }, + { + "epoch": 0.14465818110271972, + "grad_norm": 0.12988822162151337, + "learning_rate": 0.00010724272463680113, + "loss": 1.634360122680664, + "step": 47790 + }, + { + "epoch": 0.14468845065306554, + "grad_norm": 0.13969974219799042, + "learning_rate": 0.00010723892945516372, + "loss": 1.6168621063232422, + "step": 47800 + }, + { + "epoch": 0.14471872020341137, + "grad_norm": 0.13093307614326477, + "learning_rate": 0.00010723513427352634, + "loss": 1.6122333526611328, + "step": 47810 + }, + { + "epoch": 0.14474898975375722, + "grad_norm": 0.13643303513526917, + "learning_rate": 0.00010723133909188894, + "loss": 1.5866186141967773, + "step": 47820 + }, + { + "epoch": 0.14477925930410304, + "grad_norm": 0.15034449100494385, + "learning_rate": 0.00010722754391025155, + "loss": 1.6389368057250977, + "step": 47830 + }, + { + "epoch": 0.14480952885444887, + "grad_norm": 0.13699999451637268, + "learning_rate": 0.00010722374872861416, + "loss": 1.6285688400268554, + "step": 47840 + }, + { + "epoch": 0.1448397984047947, + "grad_norm": 0.14025232195854187, + "learning_rate": 0.00010721995354697676, + "loss": 1.6049190521240235, + "step": 47850 + }, + { + "epoch": 0.1448700679551405, + "grad_norm": 0.12958194315433502, + "learning_rate": 0.00010721615836533937, + "loss": 1.6173030853271484, + "step": 47860 + }, + { + "epoch": 0.14490033750548637, + "grad_norm": 0.11921104788780212, + "learning_rate": 0.00010721236318370197, + "loss": 1.6097709655761718, + "step": 47870 + }, + { + "epoch": 0.1449306070558322, + "grad_norm": 0.15066856145858765, + "learning_rate": 0.00010720856800206458, + "loss": 1.6192834854125977, + "step": 47880 + }, + { + "epoch": 0.144960876606178, + "grad_norm": 0.13362447917461395, + "learning_rate": 0.00010720477282042719, + "loss": 1.6367559432983398, + "step": 47890 + }, + { + "epoch": 0.14499114615652384, + "grad_norm": 0.11820293962955475, + "learning_rate": 0.00010720097763878979, + "loss": 1.5730573654174804, + "step": 47900 + }, + { + "epoch": 0.14502141570686966, + "grad_norm": 0.12326756119728088, + "learning_rate": 0.0001071971824571524, + "loss": 1.626141357421875, + "step": 47910 + }, + { + "epoch": 0.1450516852572155, + "grad_norm": 0.1304941028356552, + "learning_rate": 0.000107193387275515, + "loss": 1.6099641799926758, + "step": 47920 + }, + { + "epoch": 0.14508195480756134, + "grad_norm": 0.13200441002845764, + "learning_rate": 0.00010718959209387762, + "loss": 1.5918424606323243, + "step": 47930 + }, + { + "epoch": 0.14511222435790716, + "grad_norm": 0.13454477488994598, + "learning_rate": 0.00010718579691224022, + "loss": 1.6210922241210937, + "step": 47940 + }, + { + "epoch": 0.14514249390825298, + "grad_norm": 0.12400112301111221, + "learning_rate": 0.00010718200173060283, + "loss": 1.6151338577270509, + "step": 47950 + }, + { + "epoch": 0.14517276345859884, + "grad_norm": 0.12670667469501495, + "learning_rate": 0.00010717820654896543, + "loss": 1.6149951934814453, + "step": 47960 + }, + { + "epoch": 0.14520303300894466, + "grad_norm": 0.12433747202157974, + "learning_rate": 0.00010717441136732805, + "loss": 1.6374969482421875, + "step": 47970 + }, + { + "epoch": 0.14523330255929048, + "grad_norm": 0.12831076979637146, + "learning_rate": 0.00010717061618569065, + "loss": 1.6570205688476562, + "step": 47980 + }, + { + "epoch": 0.1452635721096363, + "grad_norm": 0.14132173359394073, + "learning_rate": 0.00010716682100405326, + "loss": 1.5604576110839843, + "step": 47990 + }, + { + "epoch": 0.14529384165998213, + "grad_norm": 0.128562331199646, + "learning_rate": 0.00010716302582241586, + "loss": 1.609891128540039, + "step": 48000 + }, + { + "epoch": 0.14529384165998213, + "eval_loss": 1.6161860227584839, + "eval_runtime": 27.8895, + "eval_samples_per_second": 17.928, + "eval_steps_per_second": 1.147, + "step": 48000 + }, + { + "epoch": 0.14532411121032798, + "grad_norm": 0.1116117388010025, + "learning_rate": 0.00010715923064077847, + "loss": 1.6187450408935546, + "step": 48010 + }, + { + "epoch": 0.1453543807606738, + "grad_norm": 0.14161840081214905, + "learning_rate": 0.00010715543545914108, + "loss": 1.5882661819458008, + "step": 48020 + }, + { + "epoch": 0.14538465031101963, + "grad_norm": 0.14027617871761322, + "learning_rate": 0.00010715164027750368, + "loss": 1.6005918502807617, + "step": 48030 + }, + { + "epoch": 0.14541491986136545, + "grad_norm": 0.12871938943862915, + "learning_rate": 0.00010714784509586629, + "loss": 1.6088428497314453, + "step": 48040 + }, + { + "epoch": 0.14544518941171128, + "grad_norm": 0.13456706702709198, + "learning_rate": 0.0001071440499142289, + "loss": 1.6458106994628907, + "step": 48050 + }, + { + "epoch": 0.14547545896205713, + "grad_norm": 0.12372169643640518, + "learning_rate": 0.00010714025473259151, + "loss": 1.6168895721435548, + "step": 48060 + }, + { + "epoch": 0.14550572851240295, + "grad_norm": 0.14712435007095337, + "learning_rate": 0.0001071364595509541, + "loss": 1.618771743774414, + "step": 48070 + }, + { + "epoch": 0.14553599806274878, + "grad_norm": 0.1357496678829193, + "learning_rate": 0.00010713266436931673, + "loss": 1.6150344848632812, + "step": 48080 + }, + { + "epoch": 0.1455662676130946, + "grad_norm": 0.15027570724487305, + "learning_rate": 0.00010712886918767932, + "loss": 1.5713987350463867, + "step": 48090 + }, + { + "epoch": 0.14559653716344043, + "grad_norm": 0.13392513990402222, + "learning_rate": 0.00010712507400604194, + "loss": 1.5570640563964844, + "step": 48100 + }, + { + "epoch": 0.14562680671378628, + "grad_norm": 0.1179814487695694, + "learning_rate": 0.00010712127882440453, + "loss": 1.5752909660339356, + "step": 48110 + }, + { + "epoch": 0.1456570762641321, + "grad_norm": 0.12910979986190796, + "learning_rate": 0.00010711748364276715, + "loss": 1.6524375915527343, + "step": 48120 + }, + { + "epoch": 0.14568734581447793, + "grad_norm": 0.1248561292886734, + "learning_rate": 0.00010711368846112974, + "loss": 1.6043821334838868, + "step": 48130 + }, + { + "epoch": 0.14571761536482375, + "grad_norm": 0.13351993262767792, + "learning_rate": 0.00010710989327949236, + "loss": 1.6241806030273438, + "step": 48140 + }, + { + "epoch": 0.14574788491516957, + "grad_norm": 0.1221163347363472, + "learning_rate": 0.00010710609809785495, + "loss": 1.5804821014404298, + "step": 48150 + }, + { + "epoch": 0.14577815446551542, + "grad_norm": 0.14034102857112885, + "learning_rate": 0.00010710230291621757, + "loss": 1.6914474487304687, + "step": 48160 + }, + { + "epoch": 0.14580842401586125, + "grad_norm": 0.12221558392047882, + "learning_rate": 0.00010709850773458019, + "loss": 1.6351690292358398, + "step": 48170 + }, + { + "epoch": 0.14583869356620707, + "grad_norm": 0.1365627944469452, + "learning_rate": 0.00010709471255294279, + "loss": 1.6215038299560547, + "step": 48180 + }, + { + "epoch": 0.1458689631165529, + "grad_norm": 0.13422052562236786, + "learning_rate": 0.0001070909173713054, + "loss": 1.6225120544433593, + "step": 48190 + }, + { + "epoch": 0.14589923266689872, + "grad_norm": 0.15019968152046204, + "learning_rate": 0.000107087122189668, + "loss": 1.6204719543457031, + "step": 48200 + }, + { + "epoch": 0.14592950221724457, + "grad_norm": 0.1417224109172821, + "learning_rate": 0.00010708332700803062, + "loss": 1.619598960876465, + "step": 48210 + }, + { + "epoch": 0.1459597717675904, + "grad_norm": 0.13358783721923828, + "learning_rate": 0.00010707953182639321, + "loss": 1.6077058792114258, + "step": 48220 + }, + { + "epoch": 0.14599004131793622, + "grad_norm": 0.1120697408914566, + "learning_rate": 0.00010707573664475583, + "loss": 1.6344182968139649, + "step": 48230 + }, + { + "epoch": 0.14602031086828204, + "grad_norm": 0.154598668217659, + "learning_rate": 0.00010707194146311842, + "loss": 1.6420385360717773, + "step": 48240 + }, + { + "epoch": 0.14605058041862787, + "grad_norm": 0.12441089004278183, + "learning_rate": 0.00010706814628148104, + "loss": 1.6611209869384767, + "step": 48250 + }, + { + "epoch": 0.14608084996897372, + "grad_norm": 0.14111770689487457, + "learning_rate": 0.00010706435109984363, + "loss": 1.607879638671875, + "step": 48260 + }, + { + "epoch": 0.14611111951931954, + "grad_norm": 0.12240491062402725, + "learning_rate": 0.00010706055591820625, + "loss": 1.6291948318481446, + "step": 48270 + }, + { + "epoch": 0.14614138906966537, + "grad_norm": 0.1187773048877716, + "learning_rate": 0.00010705676073656884, + "loss": 1.6426782608032227, + "step": 48280 + }, + { + "epoch": 0.1461716586200112, + "grad_norm": 0.13534076511859894, + "learning_rate": 0.00010705296555493146, + "loss": 1.6132556915283203, + "step": 48290 + }, + { + "epoch": 0.14620192817035704, + "grad_norm": 0.12372955679893494, + "learning_rate": 0.00010704917037329407, + "loss": 1.6012069702148437, + "step": 48300 + }, + { + "epoch": 0.14623219772070287, + "grad_norm": 0.12636853754520416, + "learning_rate": 0.00010704537519165668, + "loss": 1.6121692657470703, + "step": 48310 + }, + { + "epoch": 0.1462624672710487, + "grad_norm": 0.1349499523639679, + "learning_rate": 0.00010704158001001928, + "loss": 1.6026214599609374, + "step": 48320 + }, + { + "epoch": 0.14629273682139451, + "grad_norm": 0.14282745122909546, + "learning_rate": 0.00010703778482838189, + "loss": 1.6008211135864259, + "step": 48330 + }, + { + "epoch": 0.14632300637174034, + "grad_norm": 0.12498846650123596, + "learning_rate": 0.0001070339896467445, + "loss": 1.6289928436279297, + "step": 48340 + }, + { + "epoch": 0.1463532759220862, + "grad_norm": 0.12590430676937103, + "learning_rate": 0.0001070301944651071, + "loss": 1.5735723495483398, + "step": 48350 + }, + { + "epoch": 0.14638354547243201, + "grad_norm": 0.12957794964313507, + "learning_rate": 0.0001070263992834697, + "loss": 1.6014820098876954, + "step": 48360 + }, + { + "epoch": 0.14641381502277784, + "grad_norm": 0.14390824735164642, + "learning_rate": 0.00010702260410183231, + "loss": 1.6496566772460937, + "step": 48370 + }, + { + "epoch": 0.14644408457312366, + "grad_norm": 0.14962008595466614, + "learning_rate": 0.00010701880892019493, + "loss": 1.5566194534301758, + "step": 48380 + }, + { + "epoch": 0.14647435412346949, + "grad_norm": 0.1423662304878235, + "learning_rate": 0.00010701501373855752, + "loss": 1.634013557434082, + "step": 48390 + }, + { + "epoch": 0.14650462367381534, + "grad_norm": 0.13626742362976074, + "learning_rate": 0.00010701121855692014, + "loss": 1.5950567245483398, + "step": 48400 + }, + { + "epoch": 0.14653489322416116, + "grad_norm": 0.13506941497325897, + "learning_rate": 0.00010700742337528274, + "loss": 1.60827693939209, + "step": 48410 + }, + { + "epoch": 0.14656516277450699, + "grad_norm": 0.14408323168754578, + "learning_rate": 0.00010700362819364536, + "loss": 1.5845788955688476, + "step": 48420 + }, + { + "epoch": 0.1465954323248528, + "grad_norm": 0.11804350465536118, + "learning_rate": 0.00010699983301200796, + "loss": 1.6248113632202148, + "step": 48430 + }, + { + "epoch": 0.14662570187519863, + "grad_norm": 0.11502651125192642, + "learning_rate": 0.00010699603783037057, + "loss": 1.6344011306762696, + "step": 48440 + }, + { + "epoch": 0.14665597142554448, + "grad_norm": 0.14084096252918243, + "learning_rate": 0.00010699224264873317, + "loss": 1.578073501586914, + "step": 48450 + }, + { + "epoch": 0.1466862409758903, + "grad_norm": 0.14358185231685638, + "learning_rate": 0.00010698844746709578, + "loss": 1.6129890441894532, + "step": 48460 + }, + { + "epoch": 0.14671651052623613, + "grad_norm": 0.13205930590629578, + "learning_rate": 0.00010698465228545838, + "loss": 1.624801254272461, + "step": 48470 + }, + { + "epoch": 0.14674678007658196, + "grad_norm": 0.12282788008451462, + "learning_rate": 0.00010698085710382099, + "loss": 1.63187255859375, + "step": 48480 + }, + { + "epoch": 0.14677704962692778, + "grad_norm": 0.1262788027524948, + "learning_rate": 0.0001069770619221836, + "loss": 1.6259410858154297, + "step": 48490 + }, + { + "epoch": 0.14680731917727363, + "grad_norm": 0.13468582928180695, + "learning_rate": 0.0001069732667405462, + "loss": 1.6263595581054688, + "step": 48500 + }, + { + "epoch": 0.14680731917727363, + "eval_loss": 1.6318918466567993, + "eval_runtime": 28.5231, + "eval_samples_per_second": 17.53, + "eval_steps_per_second": 1.122, + "step": 48500 + }, + { + "epoch": 0.14683758872761946, + "grad_norm": 0.1311231255531311, + "learning_rate": 0.00010696947155890881, + "loss": 1.591154193878174, + "step": 48510 + }, + { + "epoch": 0.14686785827796528, + "grad_norm": 0.1283271461725235, + "learning_rate": 0.00010696567637727141, + "loss": 1.61322021484375, + "step": 48520 + }, + { + "epoch": 0.1468981278283111, + "grad_norm": 0.13310551643371582, + "learning_rate": 0.00010696188119563402, + "loss": 1.6449172973632813, + "step": 48530 + }, + { + "epoch": 0.14692839737865693, + "grad_norm": 0.13085223734378815, + "learning_rate": 0.00010695808601399663, + "loss": 1.6510606765747071, + "step": 48540 + }, + { + "epoch": 0.14695866692900278, + "grad_norm": 0.13725438714027405, + "learning_rate": 0.00010695429083235923, + "loss": 1.6240480422973633, + "step": 48550 + }, + { + "epoch": 0.1469889364793486, + "grad_norm": 0.1301555335521698, + "learning_rate": 0.00010695049565072185, + "loss": 1.6273628234863282, + "step": 48560 + }, + { + "epoch": 0.14701920602969443, + "grad_norm": 0.1214822307229042, + "learning_rate": 0.00010694670046908444, + "loss": 1.59872465133667, + "step": 48570 + }, + { + "epoch": 0.14704947558004025, + "grad_norm": 0.12119609117507935, + "learning_rate": 0.00010694290528744706, + "loss": 1.6068614959716796, + "step": 48580 + }, + { + "epoch": 0.14707974513038607, + "grad_norm": 0.1444167047739029, + "learning_rate": 0.00010693911010580967, + "loss": 1.6140880584716797, + "step": 48590 + }, + { + "epoch": 0.14711001468073193, + "grad_norm": 0.14933964610099792, + "learning_rate": 0.00010693531492417228, + "loss": 1.5883015632629394, + "step": 48600 + }, + { + "epoch": 0.14714028423107775, + "grad_norm": 0.1439373791217804, + "learning_rate": 0.00010693151974253488, + "loss": 1.5996981620788575, + "step": 48610 + }, + { + "epoch": 0.14717055378142357, + "grad_norm": 0.12001528590917587, + "learning_rate": 0.00010692772456089749, + "loss": 1.611081314086914, + "step": 48620 + }, + { + "epoch": 0.1472008233317694, + "grad_norm": 0.14069309830665588, + "learning_rate": 0.0001069239293792601, + "loss": 1.6038440704345702, + "step": 48630 + }, + { + "epoch": 0.14723109288211525, + "grad_norm": 0.13208511471748352, + "learning_rate": 0.0001069201341976227, + "loss": 1.6192697525024413, + "step": 48640 + }, + { + "epoch": 0.14726136243246107, + "grad_norm": 0.14108428359031677, + "learning_rate": 0.0001069163390159853, + "loss": 1.6331090927124023, + "step": 48650 + }, + { + "epoch": 0.1472916319828069, + "grad_norm": 0.1349346935749054, + "learning_rate": 0.00010691254383434791, + "loss": 1.5934482574462892, + "step": 48660 + }, + { + "epoch": 0.14732190153315272, + "grad_norm": 0.1327781230211258, + "learning_rate": 0.00010690874865271053, + "loss": 1.6243320465087892, + "step": 48670 + }, + { + "epoch": 0.14735217108349855, + "grad_norm": 0.14482107758522034, + "learning_rate": 0.00010690495347107312, + "loss": 1.6410203933715821, + "step": 48680 + }, + { + "epoch": 0.1473824406338444, + "grad_norm": 0.1444086730480194, + "learning_rate": 0.00010690115828943574, + "loss": 1.614260482788086, + "step": 48690 + }, + { + "epoch": 0.14741271018419022, + "grad_norm": 0.13029000163078308, + "learning_rate": 0.00010689736310779834, + "loss": 1.5878564834594726, + "step": 48700 + }, + { + "epoch": 0.14744297973453604, + "grad_norm": 0.1298060566186905, + "learning_rate": 0.00010689356792616095, + "loss": 1.6419864654541017, + "step": 48710 + }, + { + "epoch": 0.14747324928488187, + "grad_norm": 0.13168904185295105, + "learning_rate": 0.00010688977274452355, + "loss": 1.5947759628295899, + "step": 48720 + }, + { + "epoch": 0.1475035188352277, + "grad_norm": 0.1489674150943756, + "learning_rate": 0.00010688597756288617, + "loss": 1.6154109954833984, + "step": 48730 + }, + { + "epoch": 0.14753378838557354, + "grad_norm": 0.14573156833648682, + "learning_rate": 0.00010688218238124876, + "loss": 1.6345863342285156, + "step": 48740 + }, + { + "epoch": 0.14756405793591937, + "grad_norm": 0.14362813532352448, + "learning_rate": 0.00010687838719961138, + "loss": 1.614237403869629, + "step": 48750 + }, + { + "epoch": 0.1475943274862652, + "grad_norm": 0.13760393857955933, + "learning_rate": 0.00010687459201797397, + "loss": 1.5823493003845215, + "step": 48760 + }, + { + "epoch": 0.14762459703661102, + "grad_norm": 0.1397860050201416, + "learning_rate": 0.00010687079683633659, + "loss": 1.634438705444336, + "step": 48770 + }, + { + "epoch": 0.14765486658695684, + "grad_norm": 0.12426138669252396, + "learning_rate": 0.0001068670016546992, + "loss": 1.668858528137207, + "step": 48780 + }, + { + "epoch": 0.1476851361373027, + "grad_norm": 0.12117888778448105, + "learning_rate": 0.0001068632064730618, + "loss": 1.6034761428833009, + "step": 48790 + }, + { + "epoch": 0.14771540568764852, + "grad_norm": 0.15115925669670105, + "learning_rate": 0.00010685941129142442, + "loss": 1.670062255859375, + "step": 48800 + }, + { + "epoch": 0.14774567523799434, + "grad_norm": 0.142647385597229, + "learning_rate": 0.00010685561610978701, + "loss": 1.6284637451171875, + "step": 48810 + }, + { + "epoch": 0.14777594478834016, + "grad_norm": 0.14148439466953278, + "learning_rate": 0.00010685182092814963, + "loss": 1.6317873001098633, + "step": 48820 + }, + { + "epoch": 0.147806214338686, + "grad_norm": 0.12628492712974548, + "learning_rate": 0.00010684802574651223, + "loss": 1.575554656982422, + "step": 48830 + }, + { + "epoch": 0.14783648388903184, + "grad_norm": 0.1227564588189125, + "learning_rate": 0.00010684423056487485, + "loss": 1.5978455543518066, + "step": 48840 + }, + { + "epoch": 0.14786675343937766, + "grad_norm": 0.12454581260681152, + "learning_rate": 0.00010684043538323744, + "loss": 1.6292381286621094, + "step": 48850 + }, + { + "epoch": 0.1478970229897235, + "grad_norm": 0.1522718369960785, + "learning_rate": 0.00010683664020160006, + "loss": 1.6312616348266602, + "step": 48860 + }, + { + "epoch": 0.1479272925400693, + "grad_norm": 0.12755414843559265, + "learning_rate": 0.00010683284501996265, + "loss": 1.6204566955566406, + "step": 48870 + }, + { + "epoch": 0.14795756209041513, + "grad_norm": 0.14753276109695435, + "learning_rate": 0.00010682904983832527, + "loss": 1.6121971130371093, + "step": 48880 + }, + { + "epoch": 0.147987831640761, + "grad_norm": 0.12358905375003815, + "learning_rate": 0.00010682525465668786, + "loss": 1.6066011428833007, + "step": 48890 + }, + { + "epoch": 0.1480181011911068, + "grad_norm": 0.13792124390602112, + "learning_rate": 0.00010682145947505048, + "loss": 1.6436838150024413, + "step": 48900 + }, + { + "epoch": 0.14804837074145263, + "grad_norm": 0.1515696495771408, + "learning_rate": 0.00010681766429341307, + "loss": 1.5957228660583496, + "step": 48910 + }, + { + "epoch": 0.14807864029179846, + "grad_norm": 0.1542457789182663, + "learning_rate": 0.0001068138691117757, + "loss": 1.6159112930297852, + "step": 48920 + }, + { + "epoch": 0.14810890984214428, + "grad_norm": 0.12737977504730225, + "learning_rate": 0.0001068100739301383, + "loss": 1.6616912841796876, + "step": 48930 + }, + { + "epoch": 0.14813917939249013, + "grad_norm": 0.11638175696134567, + "learning_rate": 0.0001068062787485009, + "loss": 1.614016342163086, + "step": 48940 + }, + { + "epoch": 0.14816944894283596, + "grad_norm": 0.13153016567230225, + "learning_rate": 0.00010680248356686351, + "loss": 1.5942358016967773, + "step": 48950 + }, + { + "epoch": 0.14819971849318178, + "grad_norm": 0.13704587519168854, + "learning_rate": 0.00010679868838522612, + "loss": 1.659186553955078, + "step": 48960 + }, + { + "epoch": 0.1482299880435276, + "grad_norm": 0.13686074316501617, + "learning_rate": 0.00010679489320358872, + "loss": 1.597928237915039, + "step": 48970 + }, + { + "epoch": 0.14826025759387346, + "grad_norm": 0.13985034823417664, + "learning_rate": 0.00010679109802195133, + "loss": 1.5807676315307617, + "step": 48980 + }, + { + "epoch": 0.14829052714421928, + "grad_norm": 0.11868799477815628, + "learning_rate": 0.00010678730284031395, + "loss": 1.6187313079833985, + "step": 48990 + }, + { + "epoch": 0.1483207966945651, + "grad_norm": 0.13911744952201843, + "learning_rate": 0.00010678350765867654, + "loss": 1.6207971572875977, + "step": 49000 + }, + { + "epoch": 0.1483207966945651, + "eval_loss": 1.610211968421936, + "eval_runtime": 28.1226, + "eval_samples_per_second": 17.779, + "eval_steps_per_second": 1.138, + "step": 49000 + }, + { + "epoch": 0.14835106624491093, + "grad_norm": 0.12947577238082886, + "learning_rate": 0.00010677971247703916, + "loss": 1.6237972259521485, + "step": 49010 + }, + { + "epoch": 0.14838133579525675, + "grad_norm": 0.12946732342243195, + "learning_rate": 0.00010677591729540175, + "loss": 1.5654129028320312, + "step": 49020 + }, + { + "epoch": 0.1484116053456026, + "grad_norm": 0.17780838906764984, + "learning_rate": 0.00010677212211376437, + "loss": 1.6251672744750976, + "step": 49030 + }, + { + "epoch": 0.14844187489594843, + "grad_norm": 0.13933861255645752, + "learning_rate": 0.00010676832693212698, + "loss": 1.5727425575256349, + "step": 49040 + }, + { + "epoch": 0.14847214444629425, + "grad_norm": 0.12479838728904724, + "learning_rate": 0.00010676453175048958, + "loss": 1.5902128219604492, + "step": 49050 + }, + { + "epoch": 0.14850241399664008, + "grad_norm": 0.13817152380943298, + "learning_rate": 0.00010676073656885219, + "loss": 1.5940306663513184, + "step": 49060 + }, + { + "epoch": 0.1485326835469859, + "grad_norm": 0.149787038564682, + "learning_rate": 0.0001067569413872148, + "loss": 1.612788772583008, + "step": 49070 + }, + { + "epoch": 0.14856295309733175, + "grad_norm": 0.1517002135515213, + "learning_rate": 0.0001067531462055774, + "loss": 1.6312326431274413, + "step": 49080 + }, + { + "epoch": 0.14859322264767758, + "grad_norm": 0.12342012673616409, + "learning_rate": 0.00010674935102394001, + "loss": 1.621322250366211, + "step": 49090 + }, + { + "epoch": 0.1486234921980234, + "grad_norm": 0.1374868005514145, + "learning_rate": 0.00010674555584230261, + "loss": 1.5966447830200194, + "step": 49100 + }, + { + "epoch": 0.14865376174836922, + "grad_norm": 0.14282701909542084, + "learning_rate": 0.00010674176066066522, + "loss": 1.5912742614746094, + "step": 49110 + }, + { + "epoch": 0.14868403129871505, + "grad_norm": 0.14672957360744476, + "learning_rate": 0.00010673796547902783, + "loss": 1.609145736694336, + "step": 49120 + }, + { + "epoch": 0.1487143008490609, + "grad_norm": 0.12182924151420593, + "learning_rate": 0.00010673417029739043, + "loss": 1.5699352264404296, + "step": 49130 + }, + { + "epoch": 0.14874457039940672, + "grad_norm": 0.12977731227874756, + "learning_rate": 0.00010673037511575304, + "loss": 1.6146007537841798, + "step": 49140 + }, + { + "epoch": 0.14877483994975255, + "grad_norm": 0.127190500497818, + "learning_rate": 0.00010672657993411564, + "loss": 1.6044876098632812, + "step": 49150 + }, + { + "epoch": 0.14880510950009837, + "grad_norm": 0.13400012254714966, + "learning_rate": 0.00010672278475247825, + "loss": 1.634070587158203, + "step": 49160 + }, + { + "epoch": 0.1488353790504442, + "grad_norm": 0.13335759937763214, + "learning_rate": 0.00010671898957084087, + "loss": 1.614248275756836, + "step": 49170 + }, + { + "epoch": 0.14886564860079005, + "grad_norm": 0.12901534140110016, + "learning_rate": 0.00010671519438920346, + "loss": 1.5700056076049804, + "step": 49180 + }, + { + "epoch": 0.14889591815113587, + "grad_norm": 0.13090896606445312, + "learning_rate": 0.00010671139920756608, + "loss": 1.575363826751709, + "step": 49190 + }, + { + "epoch": 0.1489261877014817, + "grad_norm": 0.13429157435894012, + "learning_rate": 0.00010670760402592869, + "loss": 1.6205638885498046, + "step": 49200 + }, + { + "epoch": 0.14895645725182752, + "grad_norm": 0.12926355004310608, + "learning_rate": 0.00010670380884429129, + "loss": 1.6445213317871095, + "step": 49210 + }, + { + "epoch": 0.14898672680217334, + "grad_norm": 0.14830337464809418, + "learning_rate": 0.0001067000136626539, + "loss": 1.6101835250854493, + "step": 49220 + }, + { + "epoch": 0.1490169963525192, + "grad_norm": 0.14957118034362793, + "learning_rate": 0.0001066962184810165, + "loss": 1.606619644165039, + "step": 49230 + }, + { + "epoch": 0.14904726590286502, + "grad_norm": 0.12494141608476639, + "learning_rate": 0.00010669242329937911, + "loss": 1.6124202728271484, + "step": 49240 + }, + { + "epoch": 0.14907753545321084, + "grad_norm": 0.1391746997833252, + "learning_rate": 0.00010668862811774172, + "loss": 1.6447914123535157, + "step": 49250 + }, + { + "epoch": 0.14910780500355666, + "grad_norm": 0.14083069562911987, + "learning_rate": 0.00010668483293610432, + "loss": 1.6603776931762695, + "step": 49260 + }, + { + "epoch": 0.1491380745539025, + "grad_norm": 0.12290970981121063, + "learning_rate": 0.00010668103775446693, + "loss": 1.6213871002197267, + "step": 49270 + }, + { + "epoch": 0.14916834410424834, + "grad_norm": 0.12685173749923706, + "learning_rate": 0.00010667724257282953, + "loss": 1.6070507049560547, + "step": 49280 + }, + { + "epoch": 0.14919861365459416, + "grad_norm": 0.13317495584487915, + "learning_rate": 0.00010667344739119214, + "loss": 1.603818130493164, + "step": 49290 + }, + { + "epoch": 0.14922888320494, + "grad_norm": 0.12971791625022888, + "learning_rate": 0.00010666965220955476, + "loss": 1.6054779052734376, + "step": 49300 + }, + { + "epoch": 0.1492591527552858, + "grad_norm": 0.1198020875453949, + "learning_rate": 0.00010666585702791735, + "loss": 1.618218994140625, + "step": 49310 + }, + { + "epoch": 0.14928942230563164, + "grad_norm": 0.13395452499389648, + "learning_rate": 0.00010666206184627997, + "loss": 1.6042610168457032, + "step": 49320 + }, + { + "epoch": 0.1493196918559775, + "grad_norm": 0.14211472868919373, + "learning_rate": 0.00010665826666464256, + "loss": 1.6124654769897462, + "step": 49330 + }, + { + "epoch": 0.1493499614063233, + "grad_norm": 0.13260243833065033, + "learning_rate": 0.00010665447148300518, + "loss": 1.6264575958251952, + "step": 49340 + }, + { + "epoch": 0.14938023095666914, + "grad_norm": 0.12359706312417984, + "learning_rate": 0.00010665067630136778, + "loss": 1.6017051696777345, + "step": 49350 + }, + { + "epoch": 0.14941050050701496, + "grad_norm": 0.13339219987392426, + "learning_rate": 0.0001066468811197304, + "loss": 1.6245006561279296, + "step": 49360 + }, + { + "epoch": 0.1494407700573608, + "grad_norm": 0.14289382100105286, + "learning_rate": 0.00010664308593809299, + "loss": 1.6136043548583985, + "step": 49370 + }, + { + "epoch": 0.14947103960770663, + "grad_norm": 0.14477743208408356, + "learning_rate": 0.00010663929075645561, + "loss": 1.6401985168457032, + "step": 49380 + }, + { + "epoch": 0.14950130915805246, + "grad_norm": 0.14448370039463043, + "learning_rate": 0.00010663549557481821, + "loss": 1.594312858581543, + "step": 49390 + }, + { + "epoch": 0.14953157870839828, + "grad_norm": 0.13566958904266357, + "learning_rate": 0.00010663170039318082, + "loss": 1.6280086517333985, + "step": 49400 + }, + { + "epoch": 0.1495618482587441, + "grad_norm": 0.13817857205867767, + "learning_rate": 0.00010662790521154344, + "loss": 1.6135406494140625, + "step": 49410 + }, + { + "epoch": 0.14959211780908996, + "grad_norm": 0.13890111446380615, + "learning_rate": 0.00010662411002990603, + "loss": 1.61193904876709, + "step": 49420 + }, + { + "epoch": 0.14962238735943578, + "grad_norm": 0.13092133402824402, + "learning_rate": 0.00010662031484826865, + "loss": 1.6062202453613281, + "step": 49430 + }, + { + "epoch": 0.1496526569097816, + "grad_norm": 0.13420559465885162, + "learning_rate": 0.00010661651966663124, + "loss": 1.6127447128295898, + "step": 49440 + }, + { + "epoch": 0.14968292646012743, + "grad_norm": 0.1341347098350525, + "learning_rate": 0.00010661272448499386, + "loss": 1.6405755996704101, + "step": 49450 + }, + { + "epoch": 0.14971319601047325, + "grad_norm": 0.12384860962629318, + "learning_rate": 0.00010660892930335646, + "loss": 1.5889562606811523, + "step": 49460 + }, + { + "epoch": 0.1497434655608191, + "grad_norm": 0.13925832509994507, + "learning_rate": 0.00010660513412171907, + "loss": 1.5916871070861816, + "step": 49470 + }, + { + "epoch": 0.14977373511116493, + "grad_norm": 0.12585866451263428, + "learning_rate": 0.00010660133894008167, + "loss": 1.5980361938476562, + "step": 49480 + }, + { + "epoch": 0.14980400466151075, + "grad_norm": 0.14508451521396637, + "learning_rate": 0.00010659754375844429, + "loss": 1.611966896057129, + "step": 49490 + }, + { + "epoch": 0.14983427421185658, + "grad_norm": 0.1272895336151123, + "learning_rate": 0.00010659374857680688, + "loss": 1.5851256370544433, + "step": 49500 + }, + { + "epoch": 0.14983427421185658, + "eval_loss": 1.6107107400894165, + "eval_runtime": 28.2277, + "eval_samples_per_second": 17.713, + "eval_steps_per_second": 1.134, + "step": 49500 + }, + { + "epoch": 0.1498645437622024, + "grad_norm": 0.1333940476179123, + "learning_rate": 0.0001065899533951695, + "loss": 1.6435619354248048, + "step": 49510 + }, + { + "epoch": 0.14989481331254825, + "grad_norm": 0.13713379204273224, + "learning_rate": 0.00010658615821353209, + "loss": 1.6034896850585938, + "step": 49520 + }, + { + "epoch": 0.14992508286289408, + "grad_norm": 0.13074912130832672, + "learning_rate": 0.00010658236303189471, + "loss": 1.63255615234375, + "step": 49530 + }, + { + "epoch": 0.1499553524132399, + "grad_norm": 0.1459258645772934, + "learning_rate": 0.00010657856785025732, + "loss": 1.6172273635864258, + "step": 49540 + }, + { + "epoch": 0.14998562196358572, + "grad_norm": 0.14176149666309357, + "learning_rate": 0.00010657477266861992, + "loss": 1.6396678924560546, + "step": 49550 + }, + { + "epoch": 0.15001589151393155, + "grad_norm": 0.13358889520168304, + "learning_rate": 0.00010657097748698253, + "loss": 1.600986862182617, + "step": 49560 + }, + { + "epoch": 0.1500461610642774, + "grad_norm": 0.1388716697692871, + "learning_rate": 0.00010656718230534513, + "loss": 1.627035140991211, + "step": 49570 + }, + { + "epoch": 0.15007643061462322, + "grad_norm": 0.15249697864055634, + "learning_rate": 0.00010656338712370774, + "loss": 1.63248291015625, + "step": 49580 + }, + { + "epoch": 0.15010670016496905, + "grad_norm": 0.1316218078136444, + "learning_rate": 0.00010655959194207035, + "loss": 1.6230045318603517, + "step": 49590 + }, + { + "epoch": 0.15013696971531487, + "grad_norm": 0.1428549885749817, + "learning_rate": 0.00010655579676043297, + "loss": 1.6164436340332031, + "step": 49600 + }, + { + "epoch": 0.1501672392656607, + "grad_norm": 0.13285645842552185, + "learning_rate": 0.00010655200157879556, + "loss": 1.574795913696289, + "step": 49610 + }, + { + "epoch": 0.15019750881600655, + "grad_norm": 0.1332576423883438, + "learning_rate": 0.00010654820639715818, + "loss": 1.6203109741210937, + "step": 49620 + }, + { + "epoch": 0.15022777836635237, + "grad_norm": 0.14581897854804993, + "learning_rate": 0.00010654441121552077, + "loss": 1.6151557922363282, + "step": 49630 + }, + { + "epoch": 0.1502580479166982, + "grad_norm": 0.13740761578083038, + "learning_rate": 0.00010654061603388339, + "loss": 1.6431608200073242, + "step": 49640 + }, + { + "epoch": 0.15028831746704402, + "grad_norm": 0.12793003022670746, + "learning_rate": 0.00010653682085224598, + "loss": 1.5911767959594727, + "step": 49650 + }, + { + "epoch": 0.15031858701738984, + "grad_norm": 0.11927346885204315, + "learning_rate": 0.0001065330256706086, + "loss": 1.63199462890625, + "step": 49660 + }, + { + "epoch": 0.1503488565677357, + "grad_norm": 0.12931902706623077, + "learning_rate": 0.00010652923048897121, + "loss": 1.5971552848815918, + "step": 49670 + }, + { + "epoch": 0.15037912611808152, + "grad_norm": 0.1268589049577713, + "learning_rate": 0.00010652543530733381, + "loss": 1.6288322448730468, + "step": 49680 + }, + { + "epoch": 0.15040939566842734, + "grad_norm": 0.12995052337646484, + "learning_rate": 0.00010652164012569642, + "loss": 1.5901241302490234, + "step": 49690 + }, + { + "epoch": 0.15043966521877317, + "grad_norm": 0.1335935741662979, + "learning_rate": 0.00010651784494405903, + "loss": 1.6446449279785156, + "step": 49700 + }, + { + "epoch": 0.15046993476911902, + "grad_norm": 0.1359298676252365, + "learning_rate": 0.00010651404976242163, + "loss": 1.6120975494384766, + "step": 49710 + }, + { + "epoch": 0.15050020431946484, + "grad_norm": 0.12586724758148193, + "learning_rate": 0.00010651025458078424, + "loss": 1.5884601593017578, + "step": 49720 + }, + { + "epoch": 0.15053047386981067, + "grad_norm": 0.12444696575403214, + "learning_rate": 0.00010650645939914684, + "loss": 1.5861364364624024, + "step": 49730 + }, + { + "epoch": 0.1505607434201565, + "grad_norm": 0.12811265885829926, + "learning_rate": 0.00010650266421750945, + "loss": 1.5849433898925782, + "step": 49740 + }, + { + "epoch": 0.1505910129705023, + "grad_norm": 0.13234350085258484, + "learning_rate": 0.00010649886903587206, + "loss": 1.602254867553711, + "step": 49750 + }, + { + "epoch": 0.15062128252084817, + "grad_norm": 0.15801629424095154, + "learning_rate": 0.00010649507385423466, + "loss": 1.5965278625488282, + "step": 49760 + }, + { + "epoch": 0.150651552071194, + "grad_norm": 0.12684914469718933, + "learning_rate": 0.00010649127867259727, + "loss": 1.644423484802246, + "step": 49770 + }, + { + "epoch": 0.1506818216215398, + "grad_norm": 0.13844464719295502, + "learning_rate": 0.00010648748349095989, + "loss": 1.6491817474365233, + "step": 49780 + }, + { + "epoch": 0.15071209117188564, + "grad_norm": 0.13024410605430603, + "learning_rate": 0.00010648368830932248, + "loss": 1.6049718856811523, + "step": 49790 + }, + { + "epoch": 0.15074236072223146, + "grad_norm": 0.12355981767177582, + "learning_rate": 0.0001064798931276851, + "loss": 1.6046314239501953, + "step": 49800 + }, + { + "epoch": 0.1507726302725773, + "grad_norm": 0.12936624884605408, + "learning_rate": 0.0001064760979460477, + "loss": 1.5908515930175782, + "step": 49810 + }, + { + "epoch": 0.15080289982292314, + "grad_norm": 0.13606655597686768, + "learning_rate": 0.00010647230276441031, + "loss": 1.5974311828613281, + "step": 49820 + }, + { + "epoch": 0.15083316937326896, + "grad_norm": 0.11944949626922607, + "learning_rate": 0.00010646850758277292, + "loss": 1.639094924926758, + "step": 49830 + }, + { + "epoch": 0.15086343892361478, + "grad_norm": 0.1355586051940918, + "learning_rate": 0.00010646471240113552, + "loss": 1.6428594589233398, + "step": 49840 + }, + { + "epoch": 0.1508937084739606, + "grad_norm": 0.13081464171409607, + "learning_rate": 0.00010646091721949813, + "loss": 1.620157241821289, + "step": 49850 + }, + { + "epoch": 0.15092397802430646, + "grad_norm": 0.14573314785957336, + "learning_rate": 0.00010645712203786073, + "loss": 1.6094545364379882, + "step": 49860 + }, + { + "epoch": 0.15095424757465228, + "grad_norm": 0.14290854334831238, + "learning_rate": 0.00010645332685622334, + "loss": 1.607797622680664, + "step": 49870 + }, + { + "epoch": 0.1509845171249981, + "grad_norm": 0.13314752280712128, + "learning_rate": 0.00010644953167458595, + "loss": 1.6177186965942383, + "step": 49880 + }, + { + "epoch": 0.15101478667534393, + "grad_norm": 0.14142155647277832, + "learning_rate": 0.00010644573649294855, + "loss": 1.6073247909545898, + "step": 49890 + }, + { + "epoch": 0.15104505622568976, + "grad_norm": 0.13520830869674683, + "learning_rate": 0.00010644194131131116, + "loss": 1.6350425720214843, + "step": 49900 + }, + { + "epoch": 0.1510753257760356, + "grad_norm": 0.12863925099372864, + "learning_rate": 0.00010643814612967378, + "loss": 1.5786112785339355, + "step": 49910 + }, + { + "epoch": 0.15110559532638143, + "grad_norm": 0.11954151839017868, + "learning_rate": 0.00010643435094803637, + "loss": 1.5625112533569336, + "step": 49920 + }, + { + "epoch": 0.15113586487672726, + "grad_norm": 0.14071255922317505, + "learning_rate": 0.00010643055576639899, + "loss": 1.5975326538085937, + "step": 49930 + }, + { + "epoch": 0.15116613442707308, + "grad_norm": 0.1275099515914917, + "learning_rate": 0.00010642676058476158, + "loss": 1.602315902709961, + "step": 49940 + }, + { + "epoch": 0.1511964039774189, + "grad_norm": 0.12883274257183075, + "learning_rate": 0.0001064229654031242, + "loss": 1.5806500434875488, + "step": 49950 + }, + { + "epoch": 0.15122667352776475, + "grad_norm": 0.127792090177536, + "learning_rate": 0.0001064191702214868, + "loss": 1.618448257446289, + "step": 49960 + }, + { + "epoch": 0.15125694307811058, + "grad_norm": 0.1388750672340393, + "learning_rate": 0.00010641537503984941, + "loss": 1.5989755630493163, + "step": 49970 + }, + { + "epoch": 0.1512872126284564, + "grad_norm": 0.12985524535179138, + "learning_rate": 0.000106411579858212, + "loss": 1.6173198699951172, + "step": 49980 + }, + { + "epoch": 0.15131748217880223, + "grad_norm": 0.14651793241500854, + "learning_rate": 0.00010640778467657462, + "loss": 1.624899673461914, + "step": 49990 + }, + { + "epoch": 0.15134775172914805, + "grad_norm": 0.14499148726463318, + "learning_rate": 0.00010640398949493723, + "loss": 1.6282388687133789, + "step": 50000 + }, + { + "epoch": 0.15134775172914805, + "eval_loss": 1.6230305433273315, + "eval_runtime": 28.5102, + "eval_samples_per_second": 17.538, + "eval_steps_per_second": 1.122, + "step": 50000 + }, + { + "epoch": 0.1513780212794939, + "grad_norm": 0.14562614262104034, + "learning_rate": 0.00010640019431329984, + "loss": 1.6143951416015625, + "step": 50010 + }, + { + "epoch": 0.15140829082983973, + "grad_norm": 0.12328340858221054, + "learning_rate": 0.00010639639913166246, + "loss": 1.6084423065185547, + "step": 50020 + }, + { + "epoch": 0.15143856038018555, + "grad_norm": 0.1344257891178131, + "learning_rate": 0.00010639260395002505, + "loss": 1.6152666091918946, + "step": 50030 + }, + { + "epoch": 0.15146882993053137, + "grad_norm": 0.12482559680938721, + "learning_rate": 0.00010638880876838767, + "loss": 1.6169368743896484, + "step": 50040 + }, + { + "epoch": 0.15149909948087723, + "grad_norm": 0.13507091999053955, + "learning_rate": 0.00010638501358675026, + "loss": 1.6174919128417968, + "step": 50050 + }, + { + "epoch": 0.15152936903122305, + "grad_norm": 0.1389417201280594, + "learning_rate": 0.00010638121840511288, + "loss": 1.5921339988708496, + "step": 50060 + }, + { + "epoch": 0.15155963858156887, + "grad_norm": 0.13121509552001953, + "learning_rate": 0.00010637742322347547, + "loss": 1.6028343200683595, + "step": 50070 + }, + { + "epoch": 0.1515899081319147, + "grad_norm": 0.14304780960083008, + "learning_rate": 0.00010637362804183809, + "loss": 1.6017566680908204, + "step": 50080 + }, + { + "epoch": 0.15162017768226052, + "grad_norm": 0.1289886087179184, + "learning_rate": 0.00010636983286020068, + "loss": 1.5847486495971679, + "step": 50090 + }, + { + "epoch": 0.15165044723260637, + "grad_norm": 0.12648452818393707, + "learning_rate": 0.0001063660376785633, + "loss": 1.611343002319336, + "step": 50100 + }, + { + "epoch": 0.1516807167829522, + "grad_norm": 0.12926386296749115, + "learning_rate": 0.0001063622424969259, + "loss": 1.6210594177246094, + "step": 50110 + }, + { + "epoch": 0.15171098633329802, + "grad_norm": 0.13130159676074982, + "learning_rate": 0.00010635844731528852, + "loss": 1.6128421783447267, + "step": 50120 + }, + { + "epoch": 0.15174125588364384, + "grad_norm": 0.12546436488628387, + "learning_rate": 0.00010635465213365111, + "loss": 1.5957921981811523, + "step": 50130 + }, + { + "epoch": 0.15177152543398967, + "grad_norm": 0.12691128253936768, + "learning_rate": 0.00010635085695201373, + "loss": 1.604266357421875, + "step": 50140 + }, + { + "epoch": 0.15180179498433552, + "grad_norm": 0.13623325526714325, + "learning_rate": 0.00010634706177037633, + "loss": 1.6412309646606444, + "step": 50150 + }, + { + "epoch": 0.15183206453468134, + "grad_norm": 0.11840939521789551, + "learning_rate": 0.00010634326658873894, + "loss": 1.6684860229492187, + "step": 50160 + }, + { + "epoch": 0.15186233408502717, + "grad_norm": 0.12970364093780518, + "learning_rate": 0.00010633947140710155, + "loss": 1.6303922653198242, + "step": 50170 + }, + { + "epoch": 0.151892603635373, + "grad_norm": 0.11404827982187271, + "learning_rate": 0.00010633567622546415, + "loss": 1.6216911315917968, + "step": 50180 + }, + { + "epoch": 0.15192287318571882, + "grad_norm": 0.13113120198249817, + "learning_rate": 0.00010633188104382676, + "loss": 1.6291839599609375, + "step": 50190 + }, + { + "epoch": 0.15195314273606467, + "grad_norm": 0.1394766867160797, + "learning_rate": 0.00010632808586218936, + "loss": 1.618187713623047, + "step": 50200 + }, + { + "epoch": 0.1519834122864105, + "grad_norm": 0.153174489736557, + "learning_rate": 0.00010632429068055198, + "loss": 1.577022171020508, + "step": 50210 + }, + { + "epoch": 0.15201368183675631, + "grad_norm": 0.12691690027713776, + "learning_rate": 0.00010632049549891458, + "loss": 1.6092737197875977, + "step": 50220 + }, + { + "epoch": 0.15204395138710214, + "grad_norm": 0.14269080758094788, + "learning_rate": 0.0001063167003172772, + "loss": 1.5918592453002929, + "step": 50230 + }, + { + "epoch": 0.15207422093744796, + "grad_norm": 0.14756402373313904, + "learning_rate": 0.00010631290513563979, + "loss": 1.6106758117675781, + "step": 50240 + }, + { + "epoch": 0.15210449048779381, + "grad_norm": 0.14240379631519318, + "learning_rate": 0.0001063091099540024, + "loss": 1.635793685913086, + "step": 50250 + }, + { + "epoch": 0.15213476003813964, + "grad_norm": 0.1363179236650467, + "learning_rate": 0.000106305314772365, + "loss": 1.6289480209350586, + "step": 50260 + }, + { + "epoch": 0.15216502958848546, + "grad_norm": 0.1307298094034195, + "learning_rate": 0.00010630151959072762, + "loss": 1.6084936141967774, + "step": 50270 + }, + { + "epoch": 0.15219529913883129, + "grad_norm": 0.13375943899154663, + "learning_rate": 0.00010629772440909022, + "loss": 1.6364625930786132, + "step": 50280 + }, + { + "epoch": 0.1522255686891771, + "grad_norm": 0.1247037798166275, + "learning_rate": 0.00010629392922745283, + "loss": 1.629939651489258, + "step": 50290 + }, + { + "epoch": 0.15225583823952296, + "grad_norm": 0.12102916836738586, + "learning_rate": 0.00010629013404581544, + "loss": 1.6104007720947267, + "step": 50300 + }, + { + "epoch": 0.15228610778986879, + "grad_norm": 0.13021618127822876, + "learning_rate": 0.00010628633886417804, + "loss": 1.625913429260254, + "step": 50310 + }, + { + "epoch": 0.1523163773402146, + "grad_norm": 0.12633726000785828, + "learning_rate": 0.00010628254368254065, + "loss": 1.6196355819702148, + "step": 50320 + }, + { + "epoch": 0.15234664689056043, + "grad_norm": 0.13830845057964325, + "learning_rate": 0.00010627874850090325, + "loss": 1.5961867332458497, + "step": 50330 + }, + { + "epoch": 0.15237691644090626, + "grad_norm": 0.1420169174671173, + "learning_rate": 0.00010627495331926586, + "loss": 1.6110054016113282, + "step": 50340 + }, + { + "epoch": 0.1524071859912521, + "grad_norm": 0.13442587852478027, + "learning_rate": 0.00010627115813762847, + "loss": 1.640472412109375, + "step": 50350 + }, + { + "epoch": 0.15243745554159793, + "grad_norm": 0.1478811800479889, + "learning_rate": 0.00010626736295599107, + "loss": 1.560027503967285, + "step": 50360 + }, + { + "epoch": 0.15246772509194376, + "grad_norm": 0.1281873732805252, + "learning_rate": 0.00010626356777435368, + "loss": 1.6238662719726562, + "step": 50370 + }, + { + "epoch": 0.15249799464228958, + "grad_norm": 0.12799550592899323, + "learning_rate": 0.00010625977259271628, + "loss": 1.6128307342529298, + "step": 50380 + }, + { + "epoch": 0.15252826419263543, + "grad_norm": 0.1272624135017395, + "learning_rate": 0.00010625597741107889, + "loss": 1.6016580581665039, + "step": 50390 + }, + { + "epoch": 0.15255853374298126, + "grad_norm": 0.11678604036569595, + "learning_rate": 0.0001062521822294415, + "loss": 1.665163803100586, + "step": 50400 + }, + { + "epoch": 0.15258880329332708, + "grad_norm": 0.11529343575239182, + "learning_rate": 0.00010624838704780412, + "loss": 1.6060295104980469, + "step": 50410 + }, + { + "epoch": 0.1526190728436729, + "grad_norm": 0.12833434343338013, + "learning_rate": 0.00010624459186616672, + "loss": 1.623765754699707, + "step": 50420 + }, + { + "epoch": 0.15264934239401873, + "grad_norm": 0.13162493705749512, + "learning_rate": 0.00010624079668452933, + "loss": 1.6161148071289062, + "step": 50430 + }, + { + "epoch": 0.15267961194436458, + "grad_norm": 0.14824751019477844, + "learning_rate": 0.00010623700150289193, + "loss": 1.6220808029174805, + "step": 50440 + }, + { + "epoch": 0.1527098814947104, + "grad_norm": 0.12890228629112244, + "learning_rate": 0.00010623320632125454, + "loss": 1.6320384979248046, + "step": 50450 + }, + { + "epoch": 0.15274015104505623, + "grad_norm": 0.11916013807058334, + "learning_rate": 0.00010622941113961715, + "loss": 1.5830701828002929, + "step": 50460 + }, + { + "epoch": 0.15277042059540205, + "grad_norm": 0.1141551062464714, + "learning_rate": 0.00010622561595797975, + "loss": 1.593877410888672, + "step": 50470 + }, + { + "epoch": 0.15280069014574788, + "grad_norm": 0.1258206069469452, + "learning_rate": 0.00010622182077634236, + "loss": 1.5987180709838866, + "step": 50480 + }, + { + "epoch": 0.15283095969609373, + "grad_norm": 0.12658220529556274, + "learning_rate": 0.00010621802559470496, + "loss": 1.6449293136596679, + "step": 50490 + }, + { + "epoch": 0.15286122924643955, + "grad_norm": 0.13797834515571594, + "learning_rate": 0.00010621423041306757, + "loss": 1.5587175369262696, + "step": 50500 + }, + { + "epoch": 0.15286122924643955, + "eval_loss": 1.6126205921173096, + "eval_runtime": 28.0514, + "eval_samples_per_second": 17.824, + "eval_steps_per_second": 1.141, + "step": 50500 + }, + { + "epoch": 0.15289149879678537, + "grad_norm": 0.1292905956506729, + "learning_rate": 0.00010621043523143018, + "loss": 1.587060546875, + "step": 50510 + }, + { + "epoch": 0.1529217683471312, + "grad_norm": 0.15189418196678162, + "learning_rate": 0.0001062066400497928, + "loss": 1.6174306869506836, + "step": 50520 + }, + { + "epoch": 0.15295203789747702, + "grad_norm": 0.12403862923383713, + "learning_rate": 0.00010620284486815539, + "loss": 1.6193641662597655, + "step": 50530 + }, + { + "epoch": 0.15298230744782287, + "grad_norm": 0.12638498842716217, + "learning_rate": 0.000106199049686518, + "loss": 1.5939023971557618, + "step": 50540 + }, + { + "epoch": 0.1530125769981687, + "grad_norm": 0.13530227541923523, + "learning_rate": 0.0001061952545048806, + "loss": 1.5896677017211913, + "step": 50550 + }, + { + "epoch": 0.15304284654851452, + "grad_norm": 0.14591827988624573, + "learning_rate": 0.00010619145932324322, + "loss": 1.6364681243896484, + "step": 50560 + }, + { + "epoch": 0.15307311609886035, + "grad_norm": 0.13452015817165375, + "learning_rate": 0.00010618766414160581, + "loss": 1.60953369140625, + "step": 50570 + }, + { + "epoch": 0.15310338564920617, + "grad_norm": 0.13987624645233154, + "learning_rate": 0.00010618386895996843, + "loss": 1.6562789916992187, + "step": 50580 + }, + { + "epoch": 0.15313365519955202, + "grad_norm": 0.13554833829402924, + "learning_rate": 0.00010618007377833102, + "loss": 1.619493293762207, + "step": 50590 + }, + { + "epoch": 0.15316392474989785, + "grad_norm": 0.1262538880109787, + "learning_rate": 0.00010617627859669364, + "loss": 1.624110221862793, + "step": 50600 + }, + { + "epoch": 0.15319419430024367, + "grad_norm": 0.128061443567276, + "learning_rate": 0.00010617248341505625, + "loss": 1.5764503479003906, + "step": 50610 + }, + { + "epoch": 0.1532244638505895, + "grad_norm": 0.134828582406044, + "learning_rate": 0.00010616868823341885, + "loss": 1.6123781204223633, + "step": 50620 + }, + { + "epoch": 0.15325473340093532, + "grad_norm": 0.1449786275625229, + "learning_rate": 0.00010616489305178146, + "loss": 1.616006088256836, + "step": 50630 + }, + { + "epoch": 0.15328500295128117, + "grad_norm": 0.14452782273292542, + "learning_rate": 0.00010616109787014407, + "loss": 1.5979604721069336, + "step": 50640 + }, + { + "epoch": 0.153315272501627, + "grad_norm": 0.12802265584468842, + "learning_rate": 0.00010615730268850669, + "loss": 1.586691188812256, + "step": 50650 + }, + { + "epoch": 0.15334554205197282, + "grad_norm": 0.12964139878749847, + "learning_rate": 0.00010615350750686928, + "loss": 1.599344825744629, + "step": 50660 + }, + { + "epoch": 0.15337581160231864, + "grad_norm": 0.13929682970046997, + "learning_rate": 0.0001061497123252319, + "loss": 1.582326889038086, + "step": 50670 + }, + { + "epoch": 0.15340608115266446, + "grad_norm": 0.12787310779094696, + "learning_rate": 0.00010614591714359449, + "loss": 1.5920549392700196, + "step": 50680 + }, + { + "epoch": 0.15343635070301032, + "grad_norm": 0.13989807665348053, + "learning_rate": 0.00010614212196195711, + "loss": 1.600054931640625, + "step": 50690 + }, + { + "epoch": 0.15346662025335614, + "grad_norm": 0.1336323767900467, + "learning_rate": 0.0001061383267803197, + "loss": 1.6226360321044921, + "step": 50700 + }, + { + "epoch": 0.15349688980370196, + "grad_norm": 0.1221652701497078, + "learning_rate": 0.00010613453159868232, + "loss": 1.5586860656738282, + "step": 50710 + }, + { + "epoch": 0.1535271593540478, + "grad_norm": 0.11984819173812866, + "learning_rate": 0.00010613073641704491, + "loss": 1.6430965423583985, + "step": 50720 + }, + { + "epoch": 0.1535574289043936, + "grad_norm": 0.12076810002326965, + "learning_rate": 0.00010612694123540753, + "loss": 1.574099349975586, + "step": 50730 + }, + { + "epoch": 0.15358769845473946, + "grad_norm": 0.1274062991142273, + "learning_rate": 0.00010612314605377013, + "loss": 1.6054954528808594, + "step": 50740 + }, + { + "epoch": 0.1536179680050853, + "grad_norm": 0.13889676332473755, + "learning_rate": 0.00010611935087213274, + "loss": 1.5825970649719239, + "step": 50750 + }, + { + "epoch": 0.1536482375554311, + "grad_norm": 0.10896041244268417, + "learning_rate": 0.00010611555569049534, + "loss": 1.6366456985473632, + "step": 50760 + }, + { + "epoch": 0.15367850710577693, + "grad_norm": 0.11636090278625488, + "learning_rate": 0.00010611176050885796, + "loss": 1.6295440673828125, + "step": 50770 + }, + { + "epoch": 0.1537087766561228, + "grad_norm": 0.12875178456306458, + "learning_rate": 0.00010610796532722056, + "loss": 1.6182868957519532, + "step": 50780 + }, + { + "epoch": 0.1537390462064686, + "grad_norm": 0.14596346020698547, + "learning_rate": 0.00010610417014558317, + "loss": 1.6711936950683595, + "step": 50790 + }, + { + "epoch": 0.15376931575681443, + "grad_norm": 0.12457408010959625, + "learning_rate": 0.00010610037496394577, + "loss": 1.5989849090576171, + "step": 50800 + }, + { + "epoch": 0.15379958530716026, + "grad_norm": 0.12115134298801422, + "learning_rate": 0.00010609657978230838, + "loss": 1.6107780456542968, + "step": 50810 + }, + { + "epoch": 0.15382985485750608, + "grad_norm": 0.1260155737400055, + "learning_rate": 0.000106092784600671, + "loss": 1.6108219146728515, + "step": 50820 + }, + { + "epoch": 0.15386012440785193, + "grad_norm": 0.13496996462345123, + "learning_rate": 0.00010608898941903359, + "loss": 1.6356208801269532, + "step": 50830 + }, + { + "epoch": 0.15389039395819776, + "grad_norm": 0.13793352246284485, + "learning_rate": 0.00010608519423739621, + "loss": 1.6123338699340821, + "step": 50840 + }, + { + "epoch": 0.15392066350854358, + "grad_norm": 0.1418723165988922, + "learning_rate": 0.0001060813990557588, + "loss": 1.5979894638061523, + "step": 50850 + }, + { + "epoch": 0.1539509330588894, + "grad_norm": 0.1527472734451294, + "learning_rate": 0.00010607760387412142, + "loss": 1.6014404296875, + "step": 50860 + }, + { + "epoch": 0.15398120260923523, + "grad_norm": 0.12355388700962067, + "learning_rate": 0.00010607380869248402, + "loss": 1.619539451599121, + "step": 50870 + }, + { + "epoch": 0.15401147215958108, + "grad_norm": 0.13931627571582794, + "learning_rate": 0.00010607001351084664, + "loss": 1.6000198364257812, + "step": 50880 + }, + { + "epoch": 0.1540417417099269, + "grad_norm": 0.11681853979825974, + "learning_rate": 0.00010606621832920924, + "loss": 1.618514633178711, + "step": 50890 + }, + { + "epoch": 0.15407201126027273, + "grad_norm": 0.14039021730422974, + "learning_rate": 0.00010606242314757185, + "loss": 1.5977571487426758, + "step": 50900 + }, + { + "epoch": 0.15410228081061855, + "grad_norm": 0.1667398363351822, + "learning_rate": 0.00010605862796593445, + "loss": 1.6334426879882813, + "step": 50910 + }, + { + "epoch": 0.15413255036096438, + "grad_norm": 0.13634145259857178, + "learning_rate": 0.00010605483278429706, + "loss": 1.6345470428466797, + "step": 50920 + }, + { + "epoch": 0.15416281991131023, + "grad_norm": 0.12308723479509354, + "learning_rate": 0.00010605103760265967, + "loss": 1.619102668762207, + "step": 50930 + }, + { + "epoch": 0.15419308946165605, + "grad_norm": 0.1370759904384613, + "learning_rate": 0.00010604724242102227, + "loss": 1.5993005752563476, + "step": 50940 + }, + { + "epoch": 0.15422335901200188, + "grad_norm": 0.12838846445083618, + "learning_rate": 0.00010604344723938488, + "loss": 1.5989507675170898, + "step": 50950 + }, + { + "epoch": 0.1542536285623477, + "grad_norm": 0.11964241415262222, + "learning_rate": 0.00010603965205774748, + "loss": 1.6303535461425782, + "step": 50960 + }, + { + "epoch": 0.15428389811269352, + "grad_norm": 0.13413558900356293, + "learning_rate": 0.00010603585687611009, + "loss": 1.5738924980163573, + "step": 50970 + }, + { + "epoch": 0.15431416766303938, + "grad_norm": 0.12871605157852173, + "learning_rate": 0.0001060320616944727, + "loss": 1.5896055221557617, + "step": 50980 + }, + { + "epoch": 0.1543444372133852, + "grad_norm": 0.12127666175365448, + "learning_rate": 0.0001060282665128353, + "loss": 1.5936132431030274, + "step": 50990 + }, + { + "epoch": 0.15437470676373102, + "grad_norm": 0.13947460055351257, + "learning_rate": 0.00010602447133119791, + "loss": 1.584012794494629, + "step": 51000 + }, + { + "epoch": 0.15437470676373102, + "eval_loss": 1.621618628501892, + "eval_runtime": 28.1703, + "eval_samples_per_second": 17.749, + "eval_steps_per_second": 1.136, + "step": 51000 + }, + { + "epoch": 0.15440497631407685, + "grad_norm": 0.13521960377693176, + "learning_rate": 0.00010602067614956053, + "loss": 1.6243003845214843, + "step": 51010 + }, + { + "epoch": 0.15443524586442267, + "grad_norm": 0.13605108857154846, + "learning_rate": 0.00010601688096792313, + "loss": 1.627787208557129, + "step": 51020 + }, + { + "epoch": 0.15446551541476852, + "grad_norm": 0.14941729605197906, + "learning_rate": 0.00010601308578628574, + "loss": 1.590443992614746, + "step": 51030 + }, + { + "epoch": 0.15449578496511435, + "grad_norm": 0.12809491157531738, + "learning_rate": 0.00010600929060464834, + "loss": 1.566036319732666, + "step": 51040 + }, + { + "epoch": 0.15452605451546017, + "grad_norm": 0.1485578715801239, + "learning_rate": 0.00010600549542301095, + "loss": 1.5959491729736328, + "step": 51050 + }, + { + "epoch": 0.154556324065806, + "grad_norm": 0.1358618140220642, + "learning_rate": 0.00010600170024137356, + "loss": 1.631222915649414, + "step": 51060 + }, + { + "epoch": 0.15458659361615182, + "grad_norm": 0.12768889963626862, + "learning_rate": 0.00010599790505973616, + "loss": 1.5957691192626953, + "step": 51070 + }, + { + "epoch": 0.15461686316649767, + "grad_norm": 0.13132528960704803, + "learning_rate": 0.00010599410987809877, + "loss": 1.609979248046875, + "step": 51080 + }, + { + "epoch": 0.1546471327168435, + "grad_norm": 0.1504647433757782, + "learning_rate": 0.00010599031469646137, + "loss": 1.5910110473632812, + "step": 51090 + }, + { + "epoch": 0.15467740226718932, + "grad_norm": 0.1584380716085434, + "learning_rate": 0.00010598651951482398, + "loss": 1.6057207107543945, + "step": 51100 + }, + { + "epoch": 0.15470767181753514, + "grad_norm": 0.13918863236904144, + "learning_rate": 0.00010598272433318659, + "loss": 1.6243114471435547, + "step": 51110 + }, + { + "epoch": 0.154737941367881, + "grad_norm": 0.1521454155445099, + "learning_rate": 0.00010597892915154919, + "loss": 1.6402971267700195, + "step": 51120 + }, + { + "epoch": 0.15476821091822682, + "grad_norm": 0.13640475273132324, + "learning_rate": 0.00010597513396991181, + "loss": 1.6127601623535157, + "step": 51130 + }, + { + "epoch": 0.15479848046857264, + "grad_norm": 0.12889565527439117, + "learning_rate": 0.0001059713387882744, + "loss": 1.5978650093078612, + "step": 51140 + }, + { + "epoch": 0.15482875001891847, + "grad_norm": 0.13814900815486908, + "learning_rate": 0.00010596754360663702, + "loss": 1.6320079803466796, + "step": 51150 + }, + { + "epoch": 0.1548590195692643, + "grad_norm": 0.13794247806072235, + "learning_rate": 0.00010596374842499962, + "loss": 1.5981555938720704, + "step": 51160 + }, + { + "epoch": 0.15488928911961014, + "grad_norm": 0.13628782331943512, + "learning_rate": 0.00010595995324336224, + "loss": 1.6228313446044922, + "step": 51170 + }, + { + "epoch": 0.15491955866995596, + "grad_norm": 0.12144716084003448, + "learning_rate": 0.00010595615806172483, + "loss": 1.629431915283203, + "step": 51180 + }, + { + "epoch": 0.1549498282203018, + "grad_norm": 0.1394611895084381, + "learning_rate": 0.00010595236288008745, + "loss": 1.6069137573242187, + "step": 51190 + }, + { + "epoch": 0.1549800977706476, + "grad_norm": 0.14218857884407043, + "learning_rate": 0.00010594856769845004, + "loss": 1.6026624679565429, + "step": 51200 + }, + { + "epoch": 0.15501036732099344, + "grad_norm": 0.1261620819568634, + "learning_rate": 0.00010594477251681266, + "loss": 1.616531753540039, + "step": 51210 + }, + { + "epoch": 0.1550406368713393, + "grad_norm": 0.1301330327987671, + "learning_rate": 0.00010594097733517527, + "loss": 1.608795166015625, + "step": 51220 + }, + { + "epoch": 0.1550709064216851, + "grad_norm": 0.12878462672233582, + "learning_rate": 0.00010593718215353787, + "loss": 1.619207000732422, + "step": 51230 + }, + { + "epoch": 0.15510117597203094, + "grad_norm": 0.12800085544586182, + "learning_rate": 0.00010593338697190048, + "loss": 1.5815293312072753, + "step": 51240 + }, + { + "epoch": 0.15513144552237676, + "grad_norm": 0.12307921051979065, + "learning_rate": 0.00010592959179026308, + "loss": 1.5922707557678222, + "step": 51250 + }, + { + "epoch": 0.15516171507272258, + "grad_norm": 0.13765229284763336, + "learning_rate": 0.0001059257966086257, + "loss": 1.6210941314697265, + "step": 51260 + }, + { + "epoch": 0.15519198462306844, + "grad_norm": 0.12165264040231705, + "learning_rate": 0.0001059220014269883, + "loss": 1.5662595748901367, + "step": 51270 + }, + { + "epoch": 0.15522225417341426, + "grad_norm": 0.11893048137426376, + "learning_rate": 0.00010591820624535091, + "loss": 1.627919578552246, + "step": 51280 + }, + { + "epoch": 0.15525252372376008, + "grad_norm": 0.1256275177001953, + "learning_rate": 0.00010591441106371351, + "loss": 1.6295682907104492, + "step": 51290 + }, + { + "epoch": 0.1552827932741059, + "grad_norm": 0.14684423804283142, + "learning_rate": 0.00010591061588207613, + "loss": 1.6393823623657227, + "step": 51300 + }, + { + "epoch": 0.15531306282445173, + "grad_norm": 0.11281944811344147, + "learning_rate": 0.00010590682070043872, + "loss": 1.6036230087280274, + "step": 51310 + }, + { + "epoch": 0.15534333237479758, + "grad_norm": 0.13426806032657623, + "learning_rate": 0.00010590302551880134, + "loss": 1.627950668334961, + "step": 51320 + }, + { + "epoch": 0.1553736019251434, + "grad_norm": 0.14958509802818298, + "learning_rate": 0.00010589923033716393, + "loss": 1.5988740921020508, + "step": 51330 + }, + { + "epoch": 0.15540387147548923, + "grad_norm": 0.14555571973323822, + "learning_rate": 0.00010589543515552655, + "loss": 1.5770094871520997, + "step": 51340 + }, + { + "epoch": 0.15543414102583505, + "grad_norm": 0.13424718379974365, + "learning_rate": 0.00010589163997388914, + "loss": 1.6028717041015625, + "step": 51350 + }, + { + "epoch": 0.15546441057618088, + "grad_norm": 0.13256622850894928, + "learning_rate": 0.00010588784479225176, + "loss": 1.6066923141479492, + "step": 51360 + }, + { + "epoch": 0.15549468012652673, + "grad_norm": 0.1333952248096466, + "learning_rate": 0.00010588404961061435, + "loss": 1.617019271850586, + "step": 51370 + }, + { + "epoch": 0.15552494967687255, + "grad_norm": 0.13285884261131287, + "learning_rate": 0.00010588025442897697, + "loss": 1.5725640296936034, + "step": 51380 + }, + { + "epoch": 0.15555521922721838, + "grad_norm": 0.14296038448810577, + "learning_rate": 0.00010587645924733958, + "loss": 1.6357093811035157, + "step": 51390 + }, + { + "epoch": 0.1555854887775642, + "grad_norm": 0.13709735870361328, + "learning_rate": 0.00010587266406570219, + "loss": 1.6096504211425782, + "step": 51400 + }, + { + "epoch": 0.15561575832791003, + "grad_norm": 0.1413615345954895, + "learning_rate": 0.00010586886888406479, + "loss": 1.568459415435791, + "step": 51410 + }, + { + "epoch": 0.15564602787825588, + "grad_norm": 0.1316085010766983, + "learning_rate": 0.0001058650737024274, + "loss": 1.5744709014892577, + "step": 51420 + }, + { + "epoch": 0.1556762974286017, + "grad_norm": 0.14336878061294556, + "learning_rate": 0.00010586127852079002, + "loss": 1.547941017150879, + "step": 51430 + }, + { + "epoch": 0.15570656697894752, + "grad_norm": 0.1420290768146515, + "learning_rate": 0.00010585748333915261, + "loss": 1.5758472442626954, + "step": 51440 + }, + { + "epoch": 0.15573683652929335, + "grad_norm": 0.14632092416286469, + "learning_rate": 0.00010585368815751523, + "loss": 1.5773364067077638, + "step": 51450 + }, + { + "epoch": 0.1557671060796392, + "grad_norm": 0.13969476521015167, + "learning_rate": 0.00010584989297587782, + "loss": 1.629816436767578, + "step": 51460 + }, + { + "epoch": 0.15579737562998502, + "grad_norm": 0.13216319680213928, + "learning_rate": 0.00010584609779424044, + "loss": 1.5723501205444337, + "step": 51470 + }, + { + "epoch": 0.15582764518033085, + "grad_norm": 0.12523578107357025, + "learning_rate": 0.00010584230261260303, + "loss": 1.6241514205932617, + "step": 51480 + }, + { + "epoch": 0.15585791473067667, + "grad_norm": 0.13103623688220978, + "learning_rate": 0.00010583850743096565, + "loss": 1.6226877212524413, + "step": 51490 + }, + { + "epoch": 0.1558881842810225, + "grad_norm": 0.12112020701169968, + "learning_rate": 0.00010583471224932825, + "loss": 1.6158878326416015, + "step": 51500 + }, + { + "epoch": 0.1558881842810225, + "eval_loss": 1.640334963798523, + "eval_runtime": 28.1003, + "eval_samples_per_second": 17.793, + "eval_steps_per_second": 1.139, + "step": 51500 + }, + { + "epoch": 0.15591845383136835, + "grad_norm": 0.12273459881544113, + "learning_rate": 0.00010583091706769086, + "loss": 1.5916786193847656, + "step": 51510 + }, + { + "epoch": 0.15594872338171417, + "grad_norm": 0.11691901087760925, + "learning_rate": 0.00010582712188605347, + "loss": 1.5676007270812988, + "step": 51520 + }, + { + "epoch": 0.15597899293206, + "grad_norm": 0.13712987303733826, + "learning_rate": 0.00010582332670441608, + "loss": 1.6222816467285157, + "step": 51530 + }, + { + "epoch": 0.15600926248240582, + "grad_norm": 0.12815089523792267, + "learning_rate": 0.00010581953152277868, + "loss": 1.6449695587158204, + "step": 51540 + }, + { + "epoch": 0.15603953203275164, + "grad_norm": 0.15349024534225464, + "learning_rate": 0.00010581573634114129, + "loss": 1.6090829849243165, + "step": 51550 + }, + { + "epoch": 0.1560698015830975, + "grad_norm": 0.13867804408073425, + "learning_rate": 0.0001058119411595039, + "loss": 1.5884610176086427, + "step": 51560 + }, + { + "epoch": 0.15610007113344332, + "grad_norm": 0.14343610405921936, + "learning_rate": 0.0001058081459778665, + "loss": 1.6552623748779296, + "step": 51570 + }, + { + "epoch": 0.15613034068378914, + "grad_norm": 0.14156962931156158, + "learning_rate": 0.0001058043507962291, + "loss": 1.576881504058838, + "step": 51580 + }, + { + "epoch": 0.15616061023413497, + "grad_norm": 0.12478455901145935, + "learning_rate": 0.00010580055561459171, + "loss": 1.605187225341797, + "step": 51590 + }, + { + "epoch": 0.1561908797844808, + "grad_norm": 0.13032762706279755, + "learning_rate": 0.00010579676043295432, + "loss": 1.584305763244629, + "step": 51600 + }, + { + "epoch": 0.15622114933482664, + "grad_norm": 0.14065049588680267, + "learning_rate": 0.00010579296525131692, + "loss": 1.5911584854125977, + "step": 51610 + }, + { + "epoch": 0.15625141888517247, + "grad_norm": 0.14593389630317688, + "learning_rate": 0.00010578917006967954, + "loss": 1.621713638305664, + "step": 51620 + }, + { + "epoch": 0.1562816884355183, + "grad_norm": 0.13489654660224915, + "learning_rate": 0.00010578537488804215, + "loss": 1.5890405654907227, + "step": 51630 + }, + { + "epoch": 0.15631195798586411, + "grad_norm": 0.1259622573852539, + "learning_rate": 0.00010578157970640476, + "loss": 1.566220474243164, + "step": 51640 + }, + { + "epoch": 0.15634222753620994, + "grad_norm": 0.12478058785200119, + "learning_rate": 0.00010577778452476736, + "loss": 1.624025535583496, + "step": 51650 + }, + { + "epoch": 0.1563724970865558, + "grad_norm": 0.13116423785686493, + "learning_rate": 0.00010577398934312997, + "loss": 1.6268810272216796, + "step": 51660 + }, + { + "epoch": 0.1564027666369016, + "grad_norm": 0.12040108442306519, + "learning_rate": 0.00010577019416149257, + "loss": 1.5818913459777832, + "step": 51670 + }, + { + "epoch": 0.15643303618724744, + "grad_norm": 0.13458868861198425, + "learning_rate": 0.00010576639897985518, + "loss": 1.6019273757934571, + "step": 51680 + }, + { + "epoch": 0.15646330573759326, + "grad_norm": 0.14566536247730255, + "learning_rate": 0.00010576260379821779, + "loss": 1.6388797760009766, + "step": 51690 + }, + { + "epoch": 0.15649357528793909, + "grad_norm": 0.13293704390525818, + "learning_rate": 0.00010575880861658039, + "loss": 1.5686424255371094, + "step": 51700 + }, + { + "epoch": 0.15652384483828494, + "grad_norm": 0.15100671350955963, + "learning_rate": 0.000105755013434943, + "loss": 1.5980148315429688, + "step": 51710 + }, + { + "epoch": 0.15655411438863076, + "grad_norm": 0.15217356383800507, + "learning_rate": 0.0001057512182533056, + "loss": 1.5935125350952148, + "step": 51720 + }, + { + "epoch": 0.15658438393897658, + "grad_norm": 0.13377952575683594, + "learning_rate": 0.00010574742307166821, + "loss": 1.6085613250732422, + "step": 51730 + }, + { + "epoch": 0.1566146534893224, + "grad_norm": 0.13676419854164124, + "learning_rate": 0.00010574362789003082, + "loss": 1.581364059448242, + "step": 51740 + }, + { + "epoch": 0.15664492303966823, + "grad_norm": 0.13556015491485596, + "learning_rate": 0.00010573983270839342, + "loss": 1.559183692932129, + "step": 51750 + }, + { + "epoch": 0.15667519259001408, + "grad_norm": 0.12010656297206879, + "learning_rate": 0.00010573603752675604, + "loss": 1.6023967742919922, + "step": 51760 + }, + { + "epoch": 0.1567054621403599, + "grad_norm": 0.14419159293174744, + "learning_rate": 0.00010573224234511863, + "loss": 1.6005964279174805, + "step": 51770 + }, + { + "epoch": 0.15673573169070573, + "grad_norm": 0.1314844936132431, + "learning_rate": 0.00010572844716348125, + "loss": 1.6092552185058593, + "step": 51780 + }, + { + "epoch": 0.15676600124105156, + "grad_norm": 0.1293395310640335, + "learning_rate": 0.00010572465198184385, + "loss": 1.6398468017578125, + "step": 51790 + }, + { + "epoch": 0.1567962707913974, + "grad_norm": 0.1128309965133667, + "learning_rate": 0.00010572085680020646, + "loss": 1.6261140823364257, + "step": 51800 + }, + { + "epoch": 0.15682654034174323, + "grad_norm": 0.11997434496879578, + "learning_rate": 0.00010571706161856906, + "loss": 1.5678479194641113, + "step": 51810 + }, + { + "epoch": 0.15685680989208906, + "grad_norm": 0.15564675629138947, + "learning_rate": 0.00010571326643693168, + "loss": 1.6313766479492187, + "step": 51820 + }, + { + "epoch": 0.15688707944243488, + "grad_norm": 0.11584719270467758, + "learning_rate": 0.00010570947125529428, + "loss": 1.6633142471313476, + "step": 51830 + }, + { + "epoch": 0.1569173489927807, + "grad_norm": 0.11661617457866669, + "learning_rate": 0.00010570567607365689, + "loss": 1.5960402488708496, + "step": 51840 + }, + { + "epoch": 0.15694761854312655, + "grad_norm": 0.1414097249507904, + "learning_rate": 0.0001057018808920195, + "loss": 1.5998111724853517, + "step": 51850 + }, + { + "epoch": 0.15697788809347238, + "grad_norm": 0.13417592644691467, + "learning_rate": 0.0001056980857103821, + "loss": 1.6187553405761719, + "step": 51860 + }, + { + "epoch": 0.1570081576438182, + "grad_norm": 0.14341461658477783, + "learning_rate": 0.00010569429052874472, + "loss": 1.628958511352539, + "step": 51870 + }, + { + "epoch": 0.15703842719416403, + "grad_norm": 0.1435953974723816, + "learning_rate": 0.00010569049534710731, + "loss": 1.6005908966064453, + "step": 51880 + }, + { + "epoch": 0.15706869674450985, + "grad_norm": 0.13538777828216553, + "learning_rate": 0.00010568670016546993, + "loss": 1.5959028244018554, + "step": 51890 + }, + { + "epoch": 0.1570989662948557, + "grad_norm": 0.14132094383239746, + "learning_rate": 0.00010568290498383252, + "loss": 1.5924372673034668, + "step": 51900 + }, + { + "epoch": 0.15712923584520153, + "grad_norm": 0.12732845544815063, + "learning_rate": 0.00010567910980219514, + "loss": 1.6060226440429688, + "step": 51910 + }, + { + "epoch": 0.15715950539554735, + "grad_norm": 0.12343720346689224, + "learning_rate": 0.00010567531462055774, + "loss": 1.6151100158691407, + "step": 51920 + }, + { + "epoch": 0.15718977494589317, + "grad_norm": 0.12588152289390564, + "learning_rate": 0.00010567151943892036, + "loss": 1.6100784301757813, + "step": 51930 + }, + { + "epoch": 0.157220044496239, + "grad_norm": 0.12413804978132248, + "learning_rate": 0.00010566772425728295, + "loss": 1.5902290344238281, + "step": 51940 + }, + { + "epoch": 0.15725031404658485, + "grad_norm": 0.1236460879445076, + "learning_rate": 0.00010566392907564557, + "loss": 1.5944799423217773, + "step": 51950 + }, + { + "epoch": 0.15728058359693067, + "grad_norm": 0.11485081911087036, + "learning_rate": 0.00010566013389400816, + "loss": 1.617863655090332, + "step": 51960 + }, + { + "epoch": 0.1573108531472765, + "grad_norm": 0.13504812121391296, + "learning_rate": 0.00010565633871237078, + "loss": 1.6295553207397462, + "step": 51970 + }, + { + "epoch": 0.15734112269762232, + "grad_norm": 0.1371319741010666, + "learning_rate": 0.00010565254353073337, + "loss": 1.5929805755615234, + "step": 51980 + }, + { + "epoch": 0.15737139224796814, + "grad_norm": 0.14085838198661804, + "learning_rate": 0.00010564874834909599, + "loss": 1.6095809936523438, + "step": 51990 + }, + { + "epoch": 0.157401661798314, + "grad_norm": 0.13439726829528809, + "learning_rate": 0.0001056449531674586, + "loss": 1.6307960510253907, + "step": 52000 + }, + { + "epoch": 0.157401661798314, + "eval_loss": 1.611993670463562, + "eval_runtime": 27.9859, + "eval_samples_per_second": 17.866, + "eval_steps_per_second": 1.143, + "step": 52000 + }, + { + "epoch": 0.15743193134865982, + "grad_norm": 0.13497786223888397, + "learning_rate": 0.0001056411579858212, + "loss": 1.5786301612854003, + "step": 52010 + }, + { + "epoch": 0.15746220089900564, + "grad_norm": 0.11759132146835327, + "learning_rate": 0.00010563736280418381, + "loss": 1.6334630966186523, + "step": 52020 + }, + { + "epoch": 0.15749247044935147, + "grad_norm": 0.14396090805530548, + "learning_rate": 0.00010563356762254641, + "loss": 1.5951459884643555, + "step": 52030 + }, + { + "epoch": 0.1575227399996973, + "grad_norm": 0.1540922075510025, + "learning_rate": 0.00010562977244090903, + "loss": 1.5857112884521485, + "step": 52040 + }, + { + "epoch": 0.15755300955004314, + "grad_norm": 0.12981101870536804, + "learning_rate": 0.00010562597725927163, + "loss": 1.6057088851928711, + "step": 52050 + }, + { + "epoch": 0.15758327910038897, + "grad_norm": 0.13311704993247986, + "learning_rate": 0.00010562218207763425, + "loss": 1.605636215209961, + "step": 52060 + }, + { + "epoch": 0.1576135486507348, + "grad_norm": 0.12376552075147629, + "learning_rate": 0.00010561838689599684, + "loss": 1.6010393142700194, + "step": 52070 + }, + { + "epoch": 0.15764381820108062, + "grad_norm": 0.1333167999982834, + "learning_rate": 0.00010561459171435946, + "loss": 1.603378677368164, + "step": 52080 + }, + { + "epoch": 0.15767408775142644, + "grad_norm": 0.12303363531827927, + "learning_rate": 0.00010561079653272205, + "loss": 1.6015365600585938, + "step": 52090 + }, + { + "epoch": 0.1577043573017723, + "grad_norm": 0.13845983147621155, + "learning_rate": 0.00010560700135108467, + "loss": 1.6137922286987305, + "step": 52100 + }, + { + "epoch": 0.15773462685211811, + "grad_norm": 0.1495903879404068, + "learning_rate": 0.00010560320616944726, + "loss": 1.6527639389038087, + "step": 52110 + }, + { + "epoch": 0.15776489640246394, + "grad_norm": 0.13777025043964386, + "learning_rate": 0.00010559941098780988, + "loss": 1.5953948974609375, + "step": 52120 + }, + { + "epoch": 0.15779516595280976, + "grad_norm": 0.12039880454540253, + "learning_rate": 0.00010559561580617249, + "loss": 1.6423320770263672, + "step": 52130 + }, + { + "epoch": 0.15782543550315561, + "grad_norm": 0.12153397500514984, + "learning_rate": 0.0001055918206245351, + "loss": 1.5913092613220214, + "step": 52140 + }, + { + "epoch": 0.15785570505350144, + "grad_norm": 0.1438911259174347, + "learning_rate": 0.0001055880254428977, + "loss": 1.5891565322875976, + "step": 52150 + }, + { + "epoch": 0.15788597460384726, + "grad_norm": 0.12492410838603973, + "learning_rate": 0.0001055842302612603, + "loss": 1.6197200775146485, + "step": 52160 + }, + { + "epoch": 0.1579162441541931, + "grad_norm": 0.15001726150512695, + "learning_rate": 0.00010558043507962291, + "loss": 1.5523988723754882, + "step": 52170 + }, + { + "epoch": 0.1579465137045389, + "grad_norm": 0.12242032587528229, + "learning_rate": 0.00010557663989798552, + "loss": 1.624136734008789, + "step": 52180 + }, + { + "epoch": 0.15797678325488476, + "grad_norm": 0.13208341598510742, + "learning_rate": 0.00010557284471634812, + "loss": 1.565713119506836, + "step": 52190 + }, + { + "epoch": 0.15800705280523059, + "grad_norm": 0.12209567427635193, + "learning_rate": 0.00010556904953471073, + "loss": 1.5829237937927245, + "step": 52200 + }, + { + "epoch": 0.1580373223555764, + "grad_norm": 0.13080927729606628, + "learning_rate": 0.00010556525435307334, + "loss": 1.6079004287719727, + "step": 52210 + }, + { + "epoch": 0.15806759190592223, + "grad_norm": 0.13399682939052582, + "learning_rate": 0.00010556145917143594, + "loss": 1.648886489868164, + "step": 52220 + }, + { + "epoch": 0.15809786145626806, + "grad_norm": 0.13775630295276642, + "learning_rate": 0.00010555766398979856, + "loss": 1.6274484634399413, + "step": 52230 + }, + { + "epoch": 0.1581281310066139, + "grad_norm": 0.13629259169101715, + "learning_rate": 0.00010555386880816117, + "loss": 1.6261301040649414, + "step": 52240 + }, + { + "epoch": 0.15815840055695973, + "grad_norm": 0.1522008776664734, + "learning_rate": 0.00010555007362652377, + "loss": 1.5725547790527343, + "step": 52250 + }, + { + "epoch": 0.15818867010730556, + "grad_norm": 0.14801259338855743, + "learning_rate": 0.00010554627844488638, + "loss": 1.5986108779907227, + "step": 52260 + }, + { + "epoch": 0.15821893965765138, + "grad_norm": 0.1377347856760025, + "learning_rate": 0.00010554248326324898, + "loss": 1.6266159057617187, + "step": 52270 + }, + { + "epoch": 0.1582492092079972, + "grad_norm": 0.12444281578063965, + "learning_rate": 0.00010553868808161159, + "loss": 1.6122211456298827, + "step": 52280 + }, + { + "epoch": 0.15827947875834306, + "grad_norm": 0.11831314116716385, + "learning_rate": 0.0001055348928999742, + "loss": 1.594261360168457, + "step": 52290 + }, + { + "epoch": 0.15830974830868888, + "grad_norm": 0.12522125244140625, + "learning_rate": 0.0001055310977183368, + "loss": 1.6531558990478517, + "step": 52300 + }, + { + "epoch": 0.1583400178590347, + "grad_norm": 0.12109418213367462, + "learning_rate": 0.00010552730253669941, + "loss": 1.604269790649414, + "step": 52310 + }, + { + "epoch": 0.15837028740938053, + "grad_norm": 0.11799684911966324, + "learning_rate": 0.00010552350735506201, + "loss": 1.5838937759399414, + "step": 52320 + }, + { + "epoch": 0.15840055695972635, + "grad_norm": 0.13690970838069916, + "learning_rate": 0.00010551971217342462, + "loss": 1.5984762191772461, + "step": 52330 + }, + { + "epoch": 0.1584308265100722, + "grad_norm": 0.12302903085947037, + "learning_rate": 0.00010551591699178723, + "loss": 1.6625728607177734, + "step": 52340 + }, + { + "epoch": 0.15846109606041803, + "grad_norm": 0.15812796354293823, + "learning_rate": 0.00010551212181014983, + "loss": 1.6227170944213867, + "step": 52350 + }, + { + "epoch": 0.15849136561076385, + "grad_norm": 0.13279645144939423, + "learning_rate": 0.00010550832662851244, + "loss": 1.605929183959961, + "step": 52360 + }, + { + "epoch": 0.15852163516110968, + "grad_norm": 0.11018494516611099, + "learning_rate": 0.00010550453144687506, + "loss": 1.6236753463745117, + "step": 52370 + }, + { + "epoch": 0.1585519047114555, + "grad_norm": 0.138521209359169, + "learning_rate": 0.00010550073626523765, + "loss": 1.6062454223632812, + "step": 52380 + }, + { + "epoch": 0.15858217426180135, + "grad_norm": 0.1351199746131897, + "learning_rate": 0.00010549694108360027, + "loss": 1.6319545745849608, + "step": 52390 + }, + { + "epoch": 0.15861244381214717, + "grad_norm": 0.13718600571155548, + "learning_rate": 0.00010549314590196286, + "loss": 1.5907156944274903, + "step": 52400 + }, + { + "epoch": 0.158642713362493, + "grad_norm": 0.1546591818332672, + "learning_rate": 0.00010548935072032548, + "loss": 1.6224853515625, + "step": 52410 + }, + { + "epoch": 0.15867298291283882, + "grad_norm": 0.12668564915657043, + "learning_rate": 0.00010548555553868807, + "loss": 1.6358949661254882, + "step": 52420 + }, + { + "epoch": 0.15870325246318465, + "grad_norm": 0.12427156418561935, + "learning_rate": 0.0001054817603570507, + "loss": 1.5948066711425781, + "step": 52430 + }, + { + "epoch": 0.1587335220135305, + "grad_norm": 0.1358661949634552, + "learning_rate": 0.0001054779651754133, + "loss": 1.618067169189453, + "step": 52440 + }, + { + "epoch": 0.15876379156387632, + "grad_norm": 0.1309536248445511, + "learning_rate": 0.0001054741699937759, + "loss": 1.6194969177246095, + "step": 52450 + }, + { + "epoch": 0.15879406111422215, + "grad_norm": 0.13063503801822662, + "learning_rate": 0.00010547037481213851, + "loss": 1.6378063201904296, + "step": 52460 + }, + { + "epoch": 0.15882433066456797, + "grad_norm": 0.12364863604307175, + "learning_rate": 0.00010546657963050112, + "loss": 1.6081403732299804, + "step": 52470 + }, + { + "epoch": 0.1588546002149138, + "grad_norm": 0.13959042727947235, + "learning_rate": 0.00010546278444886372, + "loss": 1.6050838470458983, + "step": 52480 + }, + { + "epoch": 0.15888486976525965, + "grad_norm": 0.12739527225494385, + "learning_rate": 0.00010545898926722633, + "loss": 1.59194974899292, + "step": 52490 + }, + { + "epoch": 0.15891513931560547, + "grad_norm": 0.1379755735397339, + "learning_rate": 0.00010545519408558895, + "loss": 1.5778047561645507, + "step": 52500 + }, + { + "epoch": 0.15891513931560547, + "eval_loss": 1.6177690029144287, + "eval_runtime": 28.4022, + "eval_samples_per_second": 17.604, + "eval_steps_per_second": 1.127, + "step": 52500 + }, + { + "epoch": 0.1589454088659513, + "grad_norm": 0.14907944202423096, + "learning_rate": 0.00010545139890395154, + "loss": 1.6238979339599608, + "step": 52510 + }, + { + "epoch": 0.15897567841629712, + "grad_norm": 0.12293080985546112, + "learning_rate": 0.00010544760372231416, + "loss": 1.6058015823364258, + "step": 52520 + }, + { + "epoch": 0.15900594796664297, + "grad_norm": 0.11965017765760422, + "learning_rate": 0.00010544380854067675, + "loss": 1.6358282089233398, + "step": 52530 + }, + { + "epoch": 0.1590362175169888, + "grad_norm": 0.1266048699617386, + "learning_rate": 0.00010544001335903937, + "loss": 1.6149274826049804, + "step": 52540 + }, + { + "epoch": 0.15906648706733462, + "grad_norm": 0.1238023191690445, + "learning_rate": 0.00010543621817740197, + "loss": 1.620953369140625, + "step": 52550 + }, + { + "epoch": 0.15909675661768044, + "grad_norm": 0.1348736435174942, + "learning_rate": 0.00010543242299576458, + "loss": 1.664535903930664, + "step": 52560 + }, + { + "epoch": 0.15912702616802626, + "grad_norm": 0.13722822070121765, + "learning_rate": 0.00010542862781412718, + "loss": 1.6132423400878906, + "step": 52570 + }, + { + "epoch": 0.15915729571837212, + "grad_norm": 0.11719657480716705, + "learning_rate": 0.0001054248326324898, + "loss": 1.6039213180541991, + "step": 52580 + }, + { + "epoch": 0.15918756526871794, + "grad_norm": 0.12660078704357147, + "learning_rate": 0.00010542103745085239, + "loss": 1.6033824920654296, + "step": 52590 + }, + { + "epoch": 0.15921783481906376, + "grad_norm": 0.13088564574718475, + "learning_rate": 0.00010541724226921501, + "loss": 1.6184246063232421, + "step": 52600 + }, + { + "epoch": 0.1592481043694096, + "grad_norm": 0.12303774803876877, + "learning_rate": 0.0001054134470875776, + "loss": 1.615241813659668, + "step": 52610 + }, + { + "epoch": 0.1592783739197554, + "grad_norm": 0.13135115802288055, + "learning_rate": 0.00010540965190594022, + "loss": 1.606656265258789, + "step": 52620 + }, + { + "epoch": 0.15930864347010126, + "grad_norm": 0.13505586981773376, + "learning_rate": 0.00010540585672430283, + "loss": 1.5810598373413085, + "step": 52630 + }, + { + "epoch": 0.1593389130204471, + "grad_norm": 0.1461726427078247, + "learning_rate": 0.00010540206154266543, + "loss": 1.6332313537597656, + "step": 52640 + }, + { + "epoch": 0.1593691825707929, + "grad_norm": 0.14271105825901031, + "learning_rate": 0.00010539826636102805, + "loss": 1.632695198059082, + "step": 52650 + }, + { + "epoch": 0.15939945212113873, + "grad_norm": 0.15707828104496002, + "learning_rate": 0.00010539447117939064, + "loss": 1.6342416763305665, + "step": 52660 + }, + { + "epoch": 0.15942972167148456, + "grad_norm": 0.12638162076473236, + "learning_rate": 0.00010539067599775326, + "loss": 1.6295475006103515, + "step": 52670 + }, + { + "epoch": 0.1594599912218304, + "grad_norm": 0.12651844322681427, + "learning_rate": 0.00010538688081611586, + "loss": 1.6522445678710938, + "step": 52680 + }, + { + "epoch": 0.15949026077217623, + "grad_norm": 0.12180297821760178, + "learning_rate": 0.00010538308563447848, + "loss": 1.5902767181396484, + "step": 52690 + }, + { + "epoch": 0.15952053032252206, + "grad_norm": 0.12628424167633057, + "learning_rate": 0.00010537929045284107, + "loss": 1.595463180541992, + "step": 52700 + }, + { + "epoch": 0.15955079987286788, + "grad_norm": 0.12949427962303162, + "learning_rate": 0.00010537549527120369, + "loss": 1.6273700714111328, + "step": 52710 + }, + { + "epoch": 0.1595810694232137, + "grad_norm": 0.1291598081588745, + "learning_rate": 0.00010537170008956628, + "loss": 1.598182201385498, + "step": 52720 + }, + { + "epoch": 0.15961133897355956, + "grad_norm": 0.1303849220275879, + "learning_rate": 0.0001053679049079289, + "loss": 1.6159832000732421, + "step": 52730 + }, + { + "epoch": 0.15964160852390538, + "grad_norm": 0.11668530106544495, + "learning_rate": 0.0001053641097262915, + "loss": 1.582744026184082, + "step": 52740 + }, + { + "epoch": 0.1596718780742512, + "grad_norm": 0.1340399831533432, + "learning_rate": 0.00010536031454465411, + "loss": 1.6422691345214844, + "step": 52750 + }, + { + "epoch": 0.15970214762459703, + "grad_norm": 0.1282450556755066, + "learning_rate": 0.00010535651936301672, + "loss": 1.6006940841674804, + "step": 52760 + }, + { + "epoch": 0.15973241717494285, + "grad_norm": 0.13048364222049713, + "learning_rate": 0.00010535272418137932, + "loss": 1.5904650688171387, + "step": 52770 + }, + { + "epoch": 0.1597626867252887, + "grad_norm": 0.13968774676322937, + "learning_rate": 0.00010534892899974193, + "loss": 1.547640609741211, + "step": 52780 + }, + { + "epoch": 0.15979295627563453, + "grad_norm": 0.141809344291687, + "learning_rate": 0.00010534513381810453, + "loss": 1.6318052291870118, + "step": 52790 + }, + { + "epoch": 0.15982322582598035, + "grad_norm": 0.12780308723449707, + "learning_rate": 0.00010534133863646714, + "loss": 1.5925171852111817, + "step": 52800 + }, + { + "epoch": 0.15985349537632618, + "grad_norm": 0.11347094923257828, + "learning_rate": 0.00010533754345482975, + "loss": 1.617755126953125, + "step": 52810 + }, + { + "epoch": 0.159883764926672, + "grad_norm": 0.14133509993553162, + "learning_rate": 0.00010533374827319235, + "loss": 1.6134010314941407, + "step": 52820 + }, + { + "epoch": 0.15991403447701785, + "grad_norm": 0.14518700540065765, + "learning_rate": 0.00010532995309155496, + "loss": 1.5990327835083007, + "step": 52830 + }, + { + "epoch": 0.15994430402736368, + "grad_norm": 0.14566552639007568, + "learning_rate": 0.00010532615790991758, + "loss": 1.6005125045776367, + "step": 52840 + }, + { + "epoch": 0.1599745735777095, + "grad_norm": 0.12547743320465088, + "learning_rate": 0.00010532236272828017, + "loss": 1.6008529663085938, + "step": 52850 + }, + { + "epoch": 0.16000484312805532, + "grad_norm": 0.13056236505508423, + "learning_rate": 0.00010531856754664279, + "loss": 1.6021625518798828, + "step": 52860 + }, + { + "epoch": 0.16003511267840118, + "grad_norm": 0.1306896209716797, + "learning_rate": 0.0001053147723650054, + "loss": 1.6165973663330078, + "step": 52870 + }, + { + "epoch": 0.160065382228747, + "grad_norm": 0.1184791699051857, + "learning_rate": 0.000105310977183368, + "loss": 1.5645152091979981, + "step": 52880 + }, + { + "epoch": 0.16009565177909282, + "grad_norm": 0.12627798318862915, + "learning_rate": 0.00010530718200173061, + "loss": 1.6287853240966796, + "step": 52890 + }, + { + "epoch": 0.16012592132943865, + "grad_norm": 0.13235324621200562, + "learning_rate": 0.00010530338682009321, + "loss": 1.5947351455688477, + "step": 52900 + }, + { + "epoch": 0.16015619087978447, + "grad_norm": 0.1655278205871582, + "learning_rate": 0.00010529959163845582, + "loss": 1.6255578994750977, + "step": 52910 + }, + { + "epoch": 0.16018646043013032, + "grad_norm": 0.1346757411956787, + "learning_rate": 0.00010529579645681843, + "loss": 1.5580827713012695, + "step": 52920 + }, + { + "epoch": 0.16021672998047615, + "grad_norm": 0.13196854293346405, + "learning_rate": 0.00010529200127518103, + "loss": 1.6022983551025392, + "step": 52930 + }, + { + "epoch": 0.16024699953082197, + "grad_norm": 0.12734651565551758, + "learning_rate": 0.00010528820609354364, + "loss": 1.6152591705322266, + "step": 52940 + }, + { + "epoch": 0.1602772690811678, + "grad_norm": 0.12137750536203384, + "learning_rate": 0.00010528441091190624, + "loss": 1.6040451049804687, + "step": 52950 + }, + { + "epoch": 0.16030753863151362, + "grad_norm": 0.12774565815925598, + "learning_rate": 0.00010528061573026885, + "loss": 1.5862926483154296, + "step": 52960 + }, + { + "epoch": 0.16033780818185947, + "grad_norm": 0.1234564334154129, + "learning_rate": 0.00010527682054863146, + "loss": 1.6357135772705078, + "step": 52970 + }, + { + "epoch": 0.1603680777322053, + "grad_norm": 0.12411145865917206, + "learning_rate": 0.00010527302536699408, + "loss": 1.5954307556152343, + "step": 52980 + }, + { + "epoch": 0.16039834728255112, + "grad_norm": 0.12609770894050598, + "learning_rate": 0.00010526923018535667, + "loss": 1.6168220520019532, + "step": 52990 + }, + { + "epoch": 0.16042861683289694, + "grad_norm": 0.13724790513515472, + "learning_rate": 0.00010526543500371929, + "loss": 1.5908300399780273, + "step": 53000 + }, + { + "epoch": 0.16042861683289694, + "eval_loss": 1.5963630676269531, + "eval_runtime": 28.0571, + "eval_samples_per_second": 17.821, + "eval_steps_per_second": 1.141, + "step": 53000 + }, + { + "epoch": 0.16045888638324277, + "grad_norm": 0.1312258094549179, + "learning_rate": 0.00010526163982208188, + "loss": 1.5862539291381836, + "step": 53010 + }, + { + "epoch": 0.16048915593358862, + "grad_norm": 0.12287651002407074, + "learning_rate": 0.0001052578446404445, + "loss": 1.6382591247558593, + "step": 53020 + }, + { + "epoch": 0.16051942548393444, + "grad_norm": 0.12425912916660309, + "learning_rate": 0.00010525404945880709, + "loss": 1.6133617401123046, + "step": 53030 + }, + { + "epoch": 0.16054969503428027, + "grad_norm": 0.11403702199459076, + "learning_rate": 0.00010525025427716971, + "loss": 1.5754801750183105, + "step": 53040 + }, + { + "epoch": 0.1605799645846261, + "grad_norm": 0.13755159080028534, + "learning_rate": 0.00010524645909553232, + "loss": 1.6043218612670898, + "step": 53050 + }, + { + "epoch": 0.1606102341349719, + "grad_norm": 0.12566733360290527, + "learning_rate": 0.00010524266391389492, + "loss": 1.6033889770507812, + "step": 53060 + }, + { + "epoch": 0.16064050368531776, + "grad_norm": 0.12013720721006393, + "learning_rate": 0.00010523886873225753, + "loss": 1.626692008972168, + "step": 53070 + }, + { + "epoch": 0.1606707732356636, + "grad_norm": 0.14793677628040314, + "learning_rate": 0.00010523507355062013, + "loss": 1.6263893127441407, + "step": 53080 + }, + { + "epoch": 0.1607010427860094, + "grad_norm": 0.13774509727954865, + "learning_rate": 0.00010523127836898274, + "loss": 1.5999395370483398, + "step": 53090 + }, + { + "epoch": 0.16073131233635524, + "grad_norm": 0.1277347207069397, + "learning_rate": 0.00010522748318734535, + "loss": 1.638555145263672, + "step": 53100 + }, + { + "epoch": 0.16076158188670106, + "grad_norm": 0.14884153008460999, + "learning_rate": 0.00010522368800570797, + "loss": 1.6220117568969727, + "step": 53110 + }, + { + "epoch": 0.1607918514370469, + "grad_norm": 0.1337011456489563, + "learning_rate": 0.00010521989282407056, + "loss": 1.595512294769287, + "step": 53120 + }, + { + "epoch": 0.16082212098739274, + "grad_norm": 0.1334289163351059, + "learning_rate": 0.00010521609764243318, + "loss": 1.6303165435791016, + "step": 53130 + }, + { + "epoch": 0.16085239053773856, + "grad_norm": 0.12555500864982605, + "learning_rate": 0.00010521230246079577, + "loss": 1.624455451965332, + "step": 53140 + }, + { + "epoch": 0.16088266008808438, + "grad_norm": 0.12458686530590057, + "learning_rate": 0.00010520850727915839, + "loss": 1.6040557861328124, + "step": 53150 + }, + { + "epoch": 0.1609129296384302, + "grad_norm": 0.1280858814716339, + "learning_rate": 0.00010520471209752098, + "loss": 1.6210506439208985, + "step": 53160 + }, + { + "epoch": 0.16094319918877606, + "grad_norm": 0.11817187815904617, + "learning_rate": 0.0001052009169158836, + "loss": 1.6047571182250977, + "step": 53170 + }, + { + "epoch": 0.16097346873912188, + "grad_norm": 0.132892444729805, + "learning_rate": 0.0001051971217342462, + "loss": 1.637842559814453, + "step": 53180 + }, + { + "epoch": 0.1610037382894677, + "grad_norm": 0.13973259925842285, + "learning_rate": 0.00010519332655260881, + "loss": 1.566697597503662, + "step": 53190 + }, + { + "epoch": 0.16103400783981353, + "grad_norm": 0.1276763677597046, + "learning_rate": 0.0001051895313709714, + "loss": 1.6300037384033204, + "step": 53200 + }, + { + "epoch": 0.16106427739015938, + "grad_norm": 0.13105370104312897, + "learning_rate": 0.00010518573618933403, + "loss": 1.6070327758789062, + "step": 53210 + }, + { + "epoch": 0.1610945469405052, + "grad_norm": 0.1465027779340744, + "learning_rate": 0.00010518194100769662, + "loss": 1.6596515655517579, + "step": 53220 + }, + { + "epoch": 0.16112481649085103, + "grad_norm": 0.13003405928611755, + "learning_rate": 0.00010517814582605924, + "loss": 1.5986730575561523, + "step": 53230 + }, + { + "epoch": 0.16115508604119685, + "grad_norm": 0.1327715367078781, + "learning_rate": 0.00010517435064442184, + "loss": 1.6046859741210937, + "step": 53240 + }, + { + "epoch": 0.16118535559154268, + "grad_norm": 0.12169934064149857, + "learning_rate": 0.00010517055546278445, + "loss": 1.5406709671020509, + "step": 53250 + }, + { + "epoch": 0.16121562514188853, + "grad_norm": 0.1398506760597229, + "learning_rate": 0.00010516676028114707, + "loss": 1.6345718383789063, + "step": 53260 + }, + { + "epoch": 0.16124589469223435, + "grad_norm": 0.132168710231781, + "learning_rate": 0.00010516296509950966, + "loss": 1.6242094039916992, + "step": 53270 + }, + { + "epoch": 0.16127616424258018, + "grad_norm": 0.15263791382312775, + "learning_rate": 0.00010515916991787228, + "loss": 1.6150196075439454, + "step": 53280 + }, + { + "epoch": 0.161306433792926, + "grad_norm": 0.13224002718925476, + "learning_rate": 0.00010515537473623487, + "loss": 1.5715078353881835, + "step": 53290 + }, + { + "epoch": 0.16133670334327183, + "grad_norm": 0.1504206359386444, + "learning_rate": 0.00010515157955459749, + "loss": 1.6058399200439453, + "step": 53300 + }, + { + "epoch": 0.16136697289361768, + "grad_norm": 0.13644830882549286, + "learning_rate": 0.00010514778437296009, + "loss": 1.5609432220458985, + "step": 53310 + }, + { + "epoch": 0.1613972424439635, + "grad_norm": 0.12059711664915085, + "learning_rate": 0.0001051439891913227, + "loss": 1.6364723205566407, + "step": 53320 + }, + { + "epoch": 0.16142751199430933, + "grad_norm": 0.14238782227039337, + "learning_rate": 0.0001051401940096853, + "loss": 1.5997698783874512, + "step": 53330 + }, + { + "epoch": 0.16145778154465515, + "grad_norm": 0.11157534271478653, + "learning_rate": 0.00010513639882804792, + "loss": 1.6138923645019532, + "step": 53340 + }, + { + "epoch": 0.16148805109500097, + "grad_norm": 0.14334794878959656, + "learning_rate": 0.00010513260364641052, + "loss": 1.620110321044922, + "step": 53350 + }, + { + "epoch": 0.16151832064534682, + "grad_norm": 0.13198553025722504, + "learning_rate": 0.00010512880846477313, + "loss": 1.587249183654785, + "step": 53360 + }, + { + "epoch": 0.16154859019569265, + "grad_norm": 0.13972418010234833, + "learning_rate": 0.00010512501328313573, + "loss": 1.6120454788208007, + "step": 53370 + }, + { + "epoch": 0.16157885974603847, + "grad_norm": 0.11997322738170624, + "learning_rate": 0.00010512121810149834, + "loss": 1.590773582458496, + "step": 53380 + }, + { + "epoch": 0.1616091292963843, + "grad_norm": 0.12456092983484268, + "learning_rate": 0.00010511742291986095, + "loss": 1.6323375701904297, + "step": 53390 + }, + { + "epoch": 0.16163939884673012, + "grad_norm": 0.12504009902477264, + "learning_rate": 0.00010511362773822355, + "loss": 1.5886054992675782, + "step": 53400 + }, + { + "epoch": 0.16166966839707597, + "grad_norm": 0.1306217759847641, + "learning_rate": 0.00010510983255658616, + "loss": 1.601497268676758, + "step": 53410 + }, + { + "epoch": 0.1616999379474218, + "grad_norm": 0.1208869069814682, + "learning_rate": 0.00010510603737494876, + "loss": 1.5782861709594727, + "step": 53420 + }, + { + "epoch": 0.16173020749776762, + "grad_norm": 0.1283119022846222, + "learning_rate": 0.00010510224219331137, + "loss": 1.5966291427612305, + "step": 53430 + }, + { + "epoch": 0.16176047704811344, + "grad_norm": 0.12318532913923264, + "learning_rate": 0.00010509844701167398, + "loss": 1.600830078125, + "step": 53440 + }, + { + "epoch": 0.16179074659845927, + "grad_norm": 0.13710172474384308, + "learning_rate": 0.0001050946518300366, + "loss": 1.6222034454345704, + "step": 53450 + }, + { + "epoch": 0.16182101614880512, + "grad_norm": 0.1328335553407669, + "learning_rate": 0.00010509085664839919, + "loss": 1.5962244033813477, + "step": 53460 + }, + { + "epoch": 0.16185128569915094, + "grad_norm": 0.11522941291332245, + "learning_rate": 0.00010508706146676181, + "loss": 1.601961326599121, + "step": 53470 + }, + { + "epoch": 0.16188155524949677, + "grad_norm": 0.13074283301830292, + "learning_rate": 0.00010508326628512441, + "loss": 1.5542606353759765, + "step": 53480 + }, + { + "epoch": 0.1619118247998426, + "grad_norm": 0.1172068789601326, + "learning_rate": 0.00010507947110348702, + "loss": 1.599320411682129, + "step": 53490 + }, + { + "epoch": 0.16194209435018841, + "grad_norm": 0.12927645444869995, + "learning_rate": 0.00010507567592184963, + "loss": 1.6148950576782226, + "step": 53500 + }, + { + "epoch": 0.16194209435018841, + "eval_loss": 1.6052649021148682, + "eval_runtime": 28.0116, + "eval_samples_per_second": 17.85, + "eval_steps_per_second": 1.142, + "step": 53500 + }, + { + "epoch": 0.16197236390053427, + "grad_norm": 0.13721933960914612, + "learning_rate": 0.00010507188074021223, + "loss": 1.5943912506103515, + "step": 53510 + }, + { + "epoch": 0.1620026334508801, + "grad_norm": 0.12013190239667892, + "learning_rate": 0.00010506808555857484, + "loss": 1.608133316040039, + "step": 53520 + }, + { + "epoch": 0.16203290300122591, + "grad_norm": 0.13971269130706787, + "learning_rate": 0.00010506429037693744, + "loss": 1.6154163360595704, + "step": 53530 + }, + { + "epoch": 0.16206317255157174, + "grad_norm": 0.1263599544763565, + "learning_rate": 0.00010506049519530005, + "loss": 1.5820316314697265, + "step": 53540 + }, + { + "epoch": 0.1620934421019176, + "grad_norm": 0.12800665199756622, + "learning_rate": 0.00010505670001366265, + "loss": 1.6392065048217774, + "step": 53550 + }, + { + "epoch": 0.1621237116522634, + "grad_norm": 0.12239375710487366, + "learning_rate": 0.00010505290483202526, + "loss": 1.6373416900634765, + "step": 53560 + }, + { + "epoch": 0.16215398120260924, + "grad_norm": 0.13284805417060852, + "learning_rate": 0.00010504910965038787, + "loss": 1.6420116424560547, + "step": 53570 + }, + { + "epoch": 0.16218425075295506, + "grad_norm": 0.14106588065624237, + "learning_rate": 0.00010504531446875047, + "loss": 1.6257850646972656, + "step": 53580 + }, + { + "epoch": 0.16221452030330089, + "grad_norm": 0.12471708655357361, + "learning_rate": 0.00010504151928711308, + "loss": 1.5905282974243165, + "step": 53590 + }, + { + "epoch": 0.16224478985364674, + "grad_norm": 0.13436144590377808, + "learning_rate": 0.00010503772410547568, + "loss": 1.629483985900879, + "step": 53600 + }, + { + "epoch": 0.16227505940399256, + "grad_norm": 0.127651646733284, + "learning_rate": 0.0001050339289238383, + "loss": 1.5722671508789063, + "step": 53610 + }, + { + "epoch": 0.16230532895433838, + "grad_norm": 0.12337994575500488, + "learning_rate": 0.0001050301337422009, + "loss": 1.6238748550415039, + "step": 53620 + }, + { + "epoch": 0.1623355985046842, + "grad_norm": 0.1248752772808075, + "learning_rate": 0.00010502633856056352, + "loss": 1.6013582229614258, + "step": 53630 + }, + { + "epoch": 0.16236586805503003, + "grad_norm": 0.13466818630695343, + "learning_rate": 0.00010502254337892611, + "loss": 1.6053033828735352, + "step": 53640 + }, + { + "epoch": 0.16239613760537588, + "grad_norm": 0.13018079102039337, + "learning_rate": 0.00010501874819728873, + "loss": 1.6096004486083983, + "step": 53650 + }, + { + "epoch": 0.1624264071557217, + "grad_norm": 0.123440220952034, + "learning_rate": 0.00010501495301565133, + "loss": 1.638495635986328, + "step": 53660 + }, + { + "epoch": 0.16245667670606753, + "grad_norm": 0.11943180859088898, + "learning_rate": 0.00010501115783401394, + "loss": 1.6152755737304687, + "step": 53670 + }, + { + "epoch": 0.16248694625641336, + "grad_norm": 0.1411026418209076, + "learning_rate": 0.00010500736265237655, + "loss": 1.5850254058837892, + "step": 53680 + }, + { + "epoch": 0.16251721580675918, + "grad_norm": 0.13499419391155243, + "learning_rate": 0.00010500356747073915, + "loss": 1.5809282302856444, + "step": 53690 + }, + { + "epoch": 0.16254748535710503, + "grad_norm": 0.12450970709323883, + "learning_rate": 0.00010499977228910176, + "loss": 1.6116619110107422, + "step": 53700 + }, + { + "epoch": 0.16257775490745086, + "grad_norm": 0.13084878027439117, + "learning_rate": 0.00010499597710746436, + "loss": 1.6319692611694336, + "step": 53710 + }, + { + "epoch": 0.16260802445779668, + "grad_norm": 0.12571530044078827, + "learning_rate": 0.00010499218192582698, + "loss": 1.6647029876708985, + "step": 53720 + }, + { + "epoch": 0.1626382940081425, + "grad_norm": 0.13272491097450256, + "learning_rate": 0.00010498838674418958, + "loss": 1.5913966178894043, + "step": 53730 + }, + { + "epoch": 0.16266856355848833, + "grad_norm": 0.1284348964691162, + "learning_rate": 0.0001049845915625522, + "loss": 1.586258316040039, + "step": 53740 + }, + { + "epoch": 0.16269883310883418, + "grad_norm": 0.13880975544452667, + "learning_rate": 0.00010498079638091479, + "loss": 1.645552635192871, + "step": 53750 + }, + { + "epoch": 0.16272910265918, + "grad_norm": 0.13347367942333221, + "learning_rate": 0.00010497700119927741, + "loss": 1.592587661743164, + "step": 53760 + }, + { + "epoch": 0.16275937220952583, + "grad_norm": 0.1322673112154007, + "learning_rate": 0.00010497320601764, + "loss": 1.6111324310302735, + "step": 53770 + }, + { + "epoch": 0.16278964175987165, + "grad_norm": 0.1395423859357834, + "learning_rate": 0.00010496941083600262, + "loss": 1.5934589385986329, + "step": 53780 + }, + { + "epoch": 0.16281991131021747, + "grad_norm": 0.12524625658988953, + "learning_rate": 0.00010496561565436521, + "loss": 1.595794105529785, + "step": 53790 + }, + { + "epoch": 0.16285018086056333, + "grad_norm": 0.12578235566616058, + "learning_rate": 0.00010496182047272783, + "loss": 1.6101556777954102, + "step": 53800 + }, + { + "epoch": 0.16288045041090915, + "grad_norm": 0.13633058965206146, + "learning_rate": 0.00010495802529109042, + "loss": 1.6231063842773437, + "step": 53810 + }, + { + "epoch": 0.16291071996125497, + "grad_norm": 0.1483645886182785, + "learning_rate": 0.00010495423010945304, + "loss": 1.5926532745361328, + "step": 53820 + }, + { + "epoch": 0.1629409895116008, + "grad_norm": 0.1251031756401062, + "learning_rate": 0.00010495043492781564, + "loss": 1.6383674621582032, + "step": 53830 + }, + { + "epoch": 0.16297125906194662, + "grad_norm": 0.12533524632453918, + "learning_rate": 0.00010494663974617825, + "loss": 1.6563392639160157, + "step": 53840 + }, + { + "epoch": 0.16300152861229247, + "grad_norm": 0.1383861005306244, + "learning_rate": 0.00010494284456454087, + "loss": 1.5971670150756836, + "step": 53850 + }, + { + "epoch": 0.1630317981626383, + "grad_norm": 0.1355842798948288, + "learning_rate": 0.00010493904938290347, + "loss": 1.599382495880127, + "step": 53860 + }, + { + "epoch": 0.16306206771298412, + "grad_norm": 0.13413618505001068, + "learning_rate": 0.00010493525420126609, + "loss": 1.5833654403686523, + "step": 53870 + }, + { + "epoch": 0.16309233726332995, + "grad_norm": 0.1366376280784607, + "learning_rate": 0.00010493145901962868, + "loss": 1.5923477172851563, + "step": 53880 + }, + { + "epoch": 0.16312260681367577, + "grad_norm": 0.12078405171632767, + "learning_rate": 0.0001049276638379913, + "loss": 1.6549007415771484, + "step": 53890 + }, + { + "epoch": 0.16315287636402162, + "grad_norm": 0.13485576212406158, + "learning_rate": 0.00010492386865635389, + "loss": 1.588131809234619, + "step": 53900 + }, + { + "epoch": 0.16318314591436744, + "grad_norm": 0.13540975749492645, + "learning_rate": 0.00010492007347471651, + "loss": 1.5547119140625, + "step": 53910 + }, + { + "epoch": 0.16321341546471327, + "grad_norm": 0.1304693967103958, + "learning_rate": 0.0001049162782930791, + "loss": 1.594460678100586, + "step": 53920 + }, + { + "epoch": 0.1632436850150591, + "grad_norm": 0.1246645599603653, + "learning_rate": 0.00010491248311144172, + "loss": 1.5672534942626952, + "step": 53930 + }, + { + "epoch": 0.16327395456540494, + "grad_norm": 0.12495748698711395, + "learning_rate": 0.00010490868792980431, + "loss": 1.575190258026123, + "step": 53940 + }, + { + "epoch": 0.16330422411575077, + "grad_norm": 0.13169901072978973, + "learning_rate": 0.00010490489274816693, + "loss": 1.5850486755371094, + "step": 53950 + }, + { + "epoch": 0.1633344936660966, + "grad_norm": 0.15210416913032532, + "learning_rate": 0.00010490109756652953, + "loss": 1.6087127685546876, + "step": 53960 + }, + { + "epoch": 0.16336476321644242, + "grad_norm": 0.14011803269386292, + "learning_rate": 0.00010489730238489215, + "loss": 1.6256805419921876, + "step": 53970 + }, + { + "epoch": 0.16339503276678824, + "grad_norm": 0.14012624323368073, + "learning_rate": 0.00010489350720325475, + "loss": 1.5707260131835938, + "step": 53980 + }, + { + "epoch": 0.1634253023171341, + "grad_norm": 0.12867838144302368, + "learning_rate": 0.00010488971202161736, + "loss": 1.585586929321289, + "step": 53990 + }, + { + "epoch": 0.16345557186747992, + "grad_norm": 0.15185779333114624, + "learning_rate": 0.00010488591683997996, + "loss": 1.6266353607177735, + "step": 54000 + }, + { + "epoch": 0.16345557186747992, + "eval_loss": 1.5923221111297607, + "eval_runtime": 28.4943, + "eval_samples_per_second": 17.547, + "eval_steps_per_second": 1.123, + "step": 54000 + }, + { + "epoch": 0.16348584141782574, + "grad_norm": 0.12586115300655365, + "learning_rate": 0.00010488212165834257, + "loss": 1.6186790466308594, + "step": 54010 + }, + { + "epoch": 0.16351611096817156, + "grad_norm": 0.15741164982318878, + "learning_rate": 0.00010487832647670518, + "loss": 1.640771484375, + "step": 54020 + }, + { + "epoch": 0.1635463805185174, + "grad_norm": 0.1438615471124649, + "learning_rate": 0.00010487453129506778, + "loss": 1.6260787963867187, + "step": 54030 + }, + { + "epoch": 0.16357665006886324, + "grad_norm": 0.14184509217739105, + "learning_rate": 0.00010487073611343039, + "loss": 1.630499267578125, + "step": 54040 + }, + { + "epoch": 0.16360691961920906, + "grad_norm": 0.13455630838871002, + "learning_rate": 0.00010486694093179299, + "loss": 1.587113380432129, + "step": 54050 + }, + { + "epoch": 0.1636371891695549, + "grad_norm": 0.1374368667602539, + "learning_rate": 0.00010486314575015561, + "loss": 1.6313133239746094, + "step": 54060 + }, + { + "epoch": 0.1636674587199007, + "grad_norm": 0.1426924169063568, + "learning_rate": 0.0001048593505685182, + "loss": 1.629480743408203, + "step": 54070 + }, + { + "epoch": 0.16369772827024653, + "grad_norm": 0.12990880012512207, + "learning_rate": 0.00010485555538688082, + "loss": 1.577085590362549, + "step": 54080 + }, + { + "epoch": 0.16372799782059239, + "grad_norm": 0.13655900955200195, + "learning_rate": 0.00010485176020524343, + "loss": 1.5965201377868652, + "step": 54090 + }, + { + "epoch": 0.1637582673709382, + "grad_norm": 0.13483451306819916, + "learning_rate": 0.00010484796502360604, + "loss": 1.6150659561157226, + "step": 54100 + }, + { + "epoch": 0.16378853692128403, + "grad_norm": 0.14715148508548737, + "learning_rate": 0.00010484416984196864, + "loss": 1.605635643005371, + "step": 54110 + }, + { + "epoch": 0.16381880647162986, + "grad_norm": 0.12361413985490799, + "learning_rate": 0.00010484037466033125, + "loss": 1.6031122207641602, + "step": 54120 + }, + { + "epoch": 0.16384907602197568, + "grad_norm": 0.12656168639659882, + "learning_rate": 0.00010483657947869385, + "loss": 1.6211864471435546, + "step": 54130 + }, + { + "epoch": 0.16387934557232153, + "grad_norm": 0.13430556654930115, + "learning_rate": 0.00010483278429705646, + "loss": 1.6019979476928712, + "step": 54140 + }, + { + "epoch": 0.16390961512266736, + "grad_norm": 0.1380249261856079, + "learning_rate": 0.00010482898911541907, + "loss": 1.6083013534545898, + "step": 54150 + }, + { + "epoch": 0.16393988467301318, + "grad_norm": 0.13682985305786133, + "learning_rate": 0.00010482519393378167, + "loss": 1.6174423217773437, + "step": 54160 + }, + { + "epoch": 0.163970154223359, + "grad_norm": 0.12742862105369568, + "learning_rate": 0.00010482139875214428, + "loss": 1.6301429748535157, + "step": 54170 + }, + { + "epoch": 0.16400042377370483, + "grad_norm": 0.1306757777929306, + "learning_rate": 0.00010481760357050688, + "loss": 1.6154241561889648, + "step": 54180 + }, + { + "epoch": 0.16403069332405068, + "grad_norm": 0.13797545433044434, + "learning_rate": 0.00010481380838886949, + "loss": 1.5915159225463866, + "step": 54190 + }, + { + "epoch": 0.1640609628743965, + "grad_norm": 0.12714605033397675, + "learning_rate": 0.0001048100132072321, + "loss": 1.559494400024414, + "step": 54200 + }, + { + "epoch": 0.16409123242474233, + "grad_norm": 0.13551533222198486, + "learning_rate": 0.0001048062180255947, + "loss": 1.6162599563598632, + "step": 54210 + }, + { + "epoch": 0.16412150197508815, + "grad_norm": 0.12734465301036835, + "learning_rate": 0.00010480242284395732, + "loss": 1.5950443267822265, + "step": 54220 + }, + { + "epoch": 0.16415177152543398, + "grad_norm": 0.13721846044063568, + "learning_rate": 0.00010479862766231991, + "loss": 1.6304353713989257, + "step": 54230 + }, + { + "epoch": 0.16418204107577983, + "grad_norm": 0.11713677644729614, + "learning_rate": 0.00010479483248068253, + "loss": 1.650149154663086, + "step": 54240 + }, + { + "epoch": 0.16421231062612565, + "grad_norm": 0.13073715567588806, + "learning_rate": 0.00010479103729904513, + "loss": 1.5764408111572266, + "step": 54250 + }, + { + "epoch": 0.16424258017647148, + "grad_norm": 0.14873652160167694, + "learning_rate": 0.00010478724211740775, + "loss": 1.5974549293518066, + "step": 54260 + }, + { + "epoch": 0.1642728497268173, + "grad_norm": 0.12714779376983643, + "learning_rate": 0.00010478344693577035, + "loss": 1.6501544952392577, + "step": 54270 + }, + { + "epoch": 0.16430311927716315, + "grad_norm": 0.14203156530857086, + "learning_rate": 0.00010477965175413296, + "loss": 1.5957510948181153, + "step": 54280 + }, + { + "epoch": 0.16433338882750897, + "grad_norm": 0.139139324426651, + "learning_rate": 0.00010477585657249556, + "loss": 1.6007938385009766, + "step": 54290 + }, + { + "epoch": 0.1643636583778548, + "grad_norm": 0.13477212190628052, + "learning_rate": 0.00010477206139085817, + "loss": 1.5936304092407227, + "step": 54300 + }, + { + "epoch": 0.16439392792820062, + "grad_norm": 0.13562726974487305, + "learning_rate": 0.00010476826620922077, + "loss": 1.6031442642211915, + "step": 54310 + }, + { + "epoch": 0.16442419747854645, + "grad_norm": 0.14531241357326508, + "learning_rate": 0.00010476447102758338, + "loss": 1.6357707977294922, + "step": 54320 + }, + { + "epoch": 0.1644544670288923, + "grad_norm": 0.12827351689338684, + "learning_rate": 0.00010476067584594599, + "loss": 1.6127601623535157, + "step": 54330 + }, + { + "epoch": 0.16448473657923812, + "grad_norm": 0.12303178757429123, + "learning_rate": 0.00010475688066430859, + "loss": 1.581064796447754, + "step": 54340 + }, + { + "epoch": 0.16451500612958395, + "grad_norm": 0.14443334937095642, + "learning_rate": 0.00010475308548267121, + "loss": 1.627345657348633, + "step": 54350 + }, + { + "epoch": 0.16454527567992977, + "grad_norm": 0.12848785519599915, + "learning_rate": 0.0001047492903010338, + "loss": 1.625225830078125, + "step": 54360 + }, + { + "epoch": 0.1645755452302756, + "grad_norm": 0.11336079984903336, + "learning_rate": 0.00010474549511939642, + "loss": 1.6013710021972656, + "step": 54370 + }, + { + "epoch": 0.16460581478062145, + "grad_norm": 0.13426397740840912, + "learning_rate": 0.00010474169993775902, + "loss": 1.5860190391540527, + "step": 54380 + }, + { + "epoch": 0.16463608433096727, + "grad_norm": 0.13441763818264008, + "learning_rate": 0.00010473790475612164, + "loss": 1.5978111267089843, + "step": 54390 + }, + { + "epoch": 0.1646663538813131, + "grad_norm": 0.1314314752817154, + "learning_rate": 0.00010473410957448423, + "loss": 1.6489713668823243, + "step": 54400 + }, + { + "epoch": 0.16469662343165892, + "grad_norm": 0.15726324915885925, + "learning_rate": 0.00010473031439284685, + "loss": 1.5586402893066407, + "step": 54410 + }, + { + "epoch": 0.16472689298200474, + "grad_norm": 0.14118531346321106, + "learning_rate": 0.00010472651921120944, + "loss": 1.5762948989868164, + "step": 54420 + }, + { + "epoch": 0.1647571625323506, + "grad_norm": 0.12975671887397766, + "learning_rate": 0.00010472272402957206, + "loss": 1.5919334411621093, + "step": 54430 + }, + { + "epoch": 0.16478743208269642, + "grad_norm": 0.1499468982219696, + "learning_rate": 0.00010471892884793465, + "loss": 1.6165029525756835, + "step": 54440 + }, + { + "epoch": 0.16481770163304224, + "grad_norm": 0.1345716416835785, + "learning_rate": 0.00010471513366629727, + "loss": 1.5776290893554688, + "step": 54450 + }, + { + "epoch": 0.16484797118338806, + "grad_norm": 0.13334263861179352, + "learning_rate": 0.00010471133848465989, + "loss": 1.589129638671875, + "step": 54460 + }, + { + "epoch": 0.1648782407337339, + "grad_norm": 0.12176588922739029, + "learning_rate": 0.00010470754330302248, + "loss": 1.593080997467041, + "step": 54470 + }, + { + "epoch": 0.16490851028407974, + "grad_norm": 0.12796200811862946, + "learning_rate": 0.0001047037481213851, + "loss": 1.6030546188354493, + "step": 54480 + }, + { + "epoch": 0.16493877983442556, + "grad_norm": 0.12246726453304291, + "learning_rate": 0.0001046999529397477, + "loss": 1.5693370819091796, + "step": 54490 + }, + { + "epoch": 0.1649690493847714, + "grad_norm": 0.1294613629579544, + "learning_rate": 0.00010469615775811032, + "loss": 1.6182632446289062, + "step": 54500 + }, + { + "epoch": 0.1649690493847714, + "eval_loss": 1.6314125061035156, + "eval_runtime": 28.3129, + "eval_samples_per_second": 17.66, + "eval_steps_per_second": 1.13, + "step": 54500 + }, + { + "epoch": 0.1649993189351172, + "grad_norm": 0.12968167662620544, + "learning_rate": 0.00010469236257647291, + "loss": 1.6216190338134766, + "step": 54510 + }, + { + "epoch": 0.16502958848546304, + "grad_norm": 0.1418495774269104, + "learning_rate": 0.00010468856739483553, + "loss": 1.6095529556274415, + "step": 54520 + }, + { + "epoch": 0.1650598580358089, + "grad_norm": 0.14466917514801025, + "learning_rate": 0.00010468477221319812, + "loss": 1.6147823333740234, + "step": 54530 + }, + { + "epoch": 0.1650901275861547, + "grad_norm": 0.14120177924633026, + "learning_rate": 0.00010468097703156074, + "loss": 1.6639463424682617, + "step": 54540 + }, + { + "epoch": 0.16512039713650054, + "grad_norm": 0.14647138118743896, + "learning_rate": 0.00010467718184992333, + "loss": 1.5867748260498047, + "step": 54550 + }, + { + "epoch": 0.16515066668684636, + "grad_norm": 0.13280466198921204, + "learning_rate": 0.00010467338666828595, + "loss": 1.60345401763916, + "step": 54560 + }, + { + "epoch": 0.16518093623719218, + "grad_norm": 0.11900684237480164, + "learning_rate": 0.00010466959148664854, + "loss": 1.5958017349243163, + "step": 54570 + }, + { + "epoch": 0.16521120578753803, + "grad_norm": 0.12059556692838669, + "learning_rate": 0.00010466579630501116, + "loss": 1.600596046447754, + "step": 54580 + }, + { + "epoch": 0.16524147533788386, + "grad_norm": 0.13684801757335663, + "learning_rate": 0.00010466200112337377, + "loss": 1.5732650756835938, + "step": 54590 + }, + { + "epoch": 0.16527174488822968, + "grad_norm": 0.13519629836082458, + "learning_rate": 0.00010465820594173637, + "loss": 1.6108095169067382, + "step": 54600 + }, + { + "epoch": 0.1653020144385755, + "grad_norm": 0.1496470421552658, + "learning_rate": 0.00010465441076009898, + "loss": 1.5927178382873535, + "step": 54610 + }, + { + "epoch": 0.16533228398892136, + "grad_norm": 0.13115458190441132, + "learning_rate": 0.00010465061557846159, + "loss": 1.6344612121582032, + "step": 54620 + }, + { + "epoch": 0.16536255353926718, + "grad_norm": 0.13324877619743347, + "learning_rate": 0.00010464682039682419, + "loss": 1.63182373046875, + "step": 54630 + }, + { + "epoch": 0.165392823089613, + "grad_norm": 0.11751226335763931, + "learning_rate": 0.0001046430252151868, + "loss": 1.6071701049804688, + "step": 54640 + }, + { + "epoch": 0.16542309263995883, + "grad_norm": 0.14373549818992615, + "learning_rate": 0.0001046392300335494, + "loss": 1.5586334228515626, + "step": 54650 + }, + { + "epoch": 0.16545336219030465, + "grad_norm": 0.11887747794389725, + "learning_rate": 0.00010463543485191201, + "loss": 1.5752439498901367, + "step": 54660 + }, + { + "epoch": 0.1654836317406505, + "grad_norm": 0.14169123768806458, + "learning_rate": 0.00010463163967027463, + "loss": 1.5692241668701172, + "step": 54670 + }, + { + "epoch": 0.16551390129099633, + "grad_norm": 0.12537196278572083, + "learning_rate": 0.00010462784448863722, + "loss": 1.611382293701172, + "step": 54680 + }, + { + "epoch": 0.16554417084134215, + "grad_norm": 0.1126113161444664, + "learning_rate": 0.00010462404930699984, + "loss": 1.5901853561401367, + "step": 54690 + }, + { + "epoch": 0.16557444039168798, + "grad_norm": 0.12578892707824707, + "learning_rate": 0.00010462025412536243, + "loss": 1.5910047531127929, + "step": 54700 + }, + { + "epoch": 0.1656047099420338, + "grad_norm": 0.1300920695066452, + "learning_rate": 0.00010461645894372505, + "loss": 1.578591537475586, + "step": 54710 + }, + { + "epoch": 0.16563497949237965, + "grad_norm": 0.13132905960083008, + "learning_rate": 0.00010461266376208766, + "loss": 1.5772193908691405, + "step": 54720 + }, + { + "epoch": 0.16566524904272548, + "grad_norm": 0.13003423810005188, + "learning_rate": 0.00010460886858045027, + "loss": 1.5830489158630372, + "step": 54730 + }, + { + "epoch": 0.1656955185930713, + "grad_norm": 0.1267409771680832, + "learning_rate": 0.00010460507339881287, + "loss": 1.556825542449951, + "step": 54740 + }, + { + "epoch": 0.16572578814341712, + "grad_norm": 0.13336355984210968, + "learning_rate": 0.00010460127821717548, + "loss": 1.5893972396850586, + "step": 54750 + }, + { + "epoch": 0.16575605769376295, + "grad_norm": 0.12448076158761978, + "learning_rate": 0.00010459748303553808, + "loss": 1.5427205085754394, + "step": 54760 + }, + { + "epoch": 0.1657863272441088, + "grad_norm": 0.14361409842967987, + "learning_rate": 0.00010459368785390069, + "loss": 1.5933300971984863, + "step": 54770 + }, + { + "epoch": 0.16581659679445462, + "grad_norm": 0.13582073152065277, + "learning_rate": 0.0001045898926722633, + "loss": 1.5561258316040039, + "step": 54780 + }, + { + "epoch": 0.16584686634480045, + "grad_norm": 0.11483453959226608, + "learning_rate": 0.0001045860974906259, + "loss": 1.612067413330078, + "step": 54790 + }, + { + "epoch": 0.16587713589514627, + "grad_norm": 0.1380748301744461, + "learning_rate": 0.00010458230230898851, + "loss": 1.6067590713500977, + "step": 54800 + }, + { + "epoch": 0.1659074054454921, + "grad_norm": 0.12014417350292206, + "learning_rate": 0.00010457850712735111, + "loss": 1.5975746154785155, + "step": 54810 + }, + { + "epoch": 0.16593767499583795, + "grad_norm": 0.12639237940311432, + "learning_rate": 0.00010457471194571372, + "loss": 1.639265251159668, + "step": 54820 + }, + { + "epoch": 0.16596794454618377, + "grad_norm": 0.13731418550014496, + "learning_rate": 0.00010457091676407634, + "loss": 1.579951858520508, + "step": 54830 + }, + { + "epoch": 0.1659982140965296, + "grad_norm": 0.11487115919589996, + "learning_rate": 0.00010456712158243893, + "loss": 1.5888238906860352, + "step": 54840 + }, + { + "epoch": 0.16602848364687542, + "grad_norm": 0.12932170927524567, + "learning_rate": 0.00010456332640080155, + "loss": 1.599107551574707, + "step": 54850 + }, + { + "epoch": 0.16605875319722124, + "grad_norm": 0.12640048563480377, + "learning_rate": 0.00010455953121916414, + "loss": 1.5875725746154785, + "step": 54860 + }, + { + "epoch": 0.1660890227475671, + "grad_norm": 0.13860490918159485, + "learning_rate": 0.00010455573603752676, + "loss": 1.6364500045776367, + "step": 54870 + }, + { + "epoch": 0.16611929229791292, + "grad_norm": 0.13841612637043, + "learning_rate": 0.00010455194085588937, + "loss": 1.560653305053711, + "step": 54880 + }, + { + "epoch": 0.16614956184825874, + "grad_norm": 0.13985677063465118, + "learning_rate": 0.00010454814567425197, + "loss": 1.6099983215332032, + "step": 54890 + }, + { + "epoch": 0.16617983139860457, + "grad_norm": 0.12263695150613785, + "learning_rate": 0.00010454435049261458, + "loss": 1.5862369537353516, + "step": 54900 + }, + { + "epoch": 0.1662101009489504, + "grad_norm": 0.12620535492897034, + "learning_rate": 0.00010454055531097719, + "loss": 1.6069786071777343, + "step": 54910 + }, + { + "epoch": 0.16624037049929624, + "grad_norm": 0.13788571953773499, + "learning_rate": 0.00010453676012933979, + "loss": 1.631129837036133, + "step": 54920 + }, + { + "epoch": 0.16627064004964207, + "grad_norm": 0.12844732403755188, + "learning_rate": 0.0001045329649477024, + "loss": 1.6316513061523437, + "step": 54930 + }, + { + "epoch": 0.1663009095999879, + "grad_norm": 0.1430601179599762, + "learning_rate": 0.000104529169766065, + "loss": 1.5925339698791503, + "step": 54940 + }, + { + "epoch": 0.1663311791503337, + "grad_norm": 0.12757569551467896, + "learning_rate": 0.00010452537458442761, + "loss": 1.589057445526123, + "step": 54950 + }, + { + "epoch": 0.16636144870067957, + "grad_norm": 0.12646332383155823, + "learning_rate": 0.00010452157940279023, + "loss": 1.617682647705078, + "step": 54960 + }, + { + "epoch": 0.1663917182510254, + "grad_norm": 0.12798619270324707, + "learning_rate": 0.00010451778422115282, + "loss": 1.5490917205810546, + "step": 54970 + }, + { + "epoch": 0.1664219878013712, + "grad_norm": 0.12840865552425385, + "learning_rate": 0.00010451398903951544, + "loss": 1.5864351272583008, + "step": 54980 + }, + { + "epoch": 0.16645225735171704, + "grad_norm": 0.12996654212474823, + "learning_rate": 0.00010451019385787803, + "loss": 1.6355508804321288, + "step": 54990 + }, + { + "epoch": 0.16648252690206286, + "grad_norm": 0.12292426079511642, + "learning_rate": 0.00010450639867624065, + "loss": 1.5823351860046386, + "step": 55000 + }, + { + "epoch": 0.16648252690206286, + "eval_loss": 1.5948796272277832, + "eval_runtime": 28.0138, + "eval_samples_per_second": 17.848, + "eval_steps_per_second": 1.142, + "step": 55000 + }, + { + "epoch": 0.1665127964524087, + "grad_norm": 0.11409453302621841, + "learning_rate": 0.00010450260349460325, + "loss": 1.5728053092956542, + "step": 55010 + }, + { + "epoch": 0.16654306600275454, + "grad_norm": 0.13977794349193573, + "learning_rate": 0.00010449880831296587, + "loss": 1.6101509094238282, + "step": 55020 + }, + { + "epoch": 0.16657333555310036, + "grad_norm": 0.152554452419281, + "learning_rate": 0.00010449501313132846, + "loss": 1.59368953704834, + "step": 55030 + }, + { + "epoch": 0.16660360510344618, + "grad_norm": 0.13605697453022003, + "learning_rate": 0.00010449121794969108, + "loss": 1.573737335205078, + "step": 55040 + }, + { + "epoch": 0.166633874653792, + "grad_norm": 0.11876536160707474, + "learning_rate": 0.00010448742276805367, + "loss": 1.628824806213379, + "step": 55050 + }, + { + "epoch": 0.16666414420413786, + "grad_norm": 0.14240244030952454, + "learning_rate": 0.00010448362758641629, + "loss": 1.6068727493286132, + "step": 55060 + }, + { + "epoch": 0.16669441375448368, + "grad_norm": 0.12970693409442902, + "learning_rate": 0.00010447983240477891, + "loss": 1.6243494033813477, + "step": 55070 + }, + { + "epoch": 0.1667246833048295, + "grad_norm": 0.13072998821735382, + "learning_rate": 0.0001044760372231415, + "loss": 1.6253957748413086, + "step": 55080 + }, + { + "epoch": 0.16675495285517533, + "grad_norm": 0.12384463101625443, + "learning_rate": 0.00010447224204150412, + "loss": 1.6186447143554688, + "step": 55090 + }, + { + "epoch": 0.16678522240552116, + "grad_norm": 0.13430370390415192, + "learning_rate": 0.00010446844685986671, + "loss": 1.6027599334716798, + "step": 55100 + }, + { + "epoch": 0.166815491955867, + "grad_norm": 0.12158156931400299, + "learning_rate": 0.00010446465167822933, + "loss": 1.6479869842529298, + "step": 55110 + }, + { + "epoch": 0.16684576150621283, + "grad_norm": 0.13164186477661133, + "learning_rate": 0.00010446085649659192, + "loss": 1.592668342590332, + "step": 55120 + }, + { + "epoch": 0.16687603105655865, + "grad_norm": 0.12345462292432785, + "learning_rate": 0.00010445706131495454, + "loss": 1.5955102920532227, + "step": 55130 + }, + { + "epoch": 0.16690630060690448, + "grad_norm": 0.12780043482780457, + "learning_rate": 0.00010445326613331714, + "loss": 1.6409467697143554, + "step": 55140 + }, + { + "epoch": 0.1669365701572503, + "grad_norm": 0.12381391227245331, + "learning_rate": 0.00010444947095167976, + "loss": 1.599045181274414, + "step": 55150 + }, + { + "epoch": 0.16696683970759615, + "grad_norm": 0.1255822479724884, + "learning_rate": 0.00010444567577004235, + "loss": 1.5547685623168945, + "step": 55160 + }, + { + "epoch": 0.16699710925794198, + "grad_norm": 0.12006539851427078, + "learning_rate": 0.00010444188058840497, + "loss": 1.6097801208496094, + "step": 55170 + }, + { + "epoch": 0.1670273788082878, + "grad_norm": 0.13844244182109833, + "learning_rate": 0.00010443808540676756, + "loss": 1.6239532470703124, + "step": 55180 + }, + { + "epoch": 0.16705764835863363, + "grad_norm": 0.1279529184103012, + "learning_rate": 0.00010443429022513018, + "loss": 1.5548320770263673, + "step": 55190 + }, + { + "epoch": 0.16708791790897945, + "grad_norm": 0.12835508584976196, + "learning_rate": 0.00010443049504349279, + "loss": 1.5766733169555665, + "step": 55200 + }, + { + "epoch": 0.1671181874593253, + "grad_norm": 0.1316218078136444, + "learning_rate": 0.00010442669986185539, + "loss": 1.5741896629333496, + "step": 55210 + }, + { + "epoch": 0.16714845700967113, + "grad_norm": 0.12623830139636993, + "learning_rate": 0.000104422904680218, + "loss": 1.5365854263305665, + "step": 55220 + }, + { + "epoch": 0.16717872656001695, + "grad_norm": 0.127196803689003, + "learning_rate": 0.0001044191094985806, + "loss": 1.5653714179992675, + "step": 55230 + }, + { + "epoch": 0.16720899611036277, + "grad_norm": 0.13341061770915985, + "learning_rate": 0.00010441531431694321, + "loss": 1.6062034606933593, + "step": 55240 + }, + { + "epoch": 0.1672392656607086, + "grad_norm": 0.13792413473129272, + "learning_rate": 0.00010441151913530582, + "loss": 1.599332046508789, + "step": 55250 + }, + { + "epoch": 0.16726953521105445, + "grad_norm": 0.14117155969142914, + "learning_rate": 0.00010440772395366842, + "loss": 1.5707104682922364, + "step": 55260 + }, + { + "epoch": 0.16729980476140027, + "grad_norm": 0.1256158947944641, + "learning_rate": 0.00010440392877203103, + "loss": 1.5997858047485352, + "step": 55270 + }, + { + "epoch": 0.1673300743117461, + "grad_norm": 0.13319337368011475, + "learning_rate": 0.00010440013359039365, + "loss": 1.6210643768310546, + "step": 55280 + }, + { + "epoch": 0.16736034386209192, + "grad_norm": 0.12028159201145172, + "learning_rate": 0.00010439633840875624, + "loss": 1.6154369354248046, + "step": 55290 + }, + { + "epoch": 0.16739061341243777, + "grad_norm": 0.13701163232326508, + "learning_rate": 0.00010439254322711886, + "loss": 1.6046218872070312, + "step": 55300 + }, + { + "epoch": 0.1674208829627836, + "grad_norm": 0.13371878862380981, + "learning_rate": 0.00010438874804548145, + "loss": 1.5806934356689453, + "step": 55310 + }, + { + "epoch": 0.16745115251312942, + "grad_norm": 0.1259528249502182, + "learning_rate": 0.00010438495286384407, + "loss": 1.5695114135742188, + "step": 55320 + }, + { + "epoch": 0.16748142206347524, + "grad_norm": 0.12231244146823883, + "learning_rate": 0.00010438115768220668, + "loss": 1.6135692596435547, + "step": 55330 + }, + { + "epoch": 0.16751169161382107, + "grad_norm": 0.11236254125833511, + "learning_rate": 0.00010437736250056928, + "loss": 1.590012550354004, + "step": 55340 + }, + { + "epoch": 0.16754196116416692, + "grad_norm": 0.13756063580513, + "learning_rate": 0.00010437356731893189, + "loss": 1.6118654251098632, + "step": 55350 + }, + { + "epoch": 0.16757223071451274, + "grad_norm": 0.13605672121047974, + "learning_rate": 0.0001043697721372945, + "loss": 1.5682371139526368, + "step": 55360 + }, + { + "epoch": 0.16760250026485857, + "grad_norm": 0.1349773108959198, + "learning_rate": 0.0001043659769556571, + "loss": 1.6157602310180663, + "step": 55370 + }, + { + "epoch": 0.1676327698152044, + "grad_norm": 0.1332426369190216, + "learning_rate": 0.0001043621817740197, + "loss": 1.593825912475586, + "step": 55380 + }, + { + "epoch": 0.16766303936555021, + "grad_norm": 0.1296779364347458, + "learning_rate": 0.00010435838659238231, + "loss": 1.5544260025024415, + "step": 55390 + }, + { + "epoch": 0.16769330891589607, + "grad_norm": 0.1286267787218094, + "learning_rate": 0.00010435459141074492, + "loss": 1.5933834075927735, + "step": 55400 + }, + { + "epoch": 0.1677235784662419, + "grad_norm": 0.13606765866279602, + "learning_rate": 0.00010435079622910752, + "loss": 1.5883530616760253, + "step": 55410 + }, + { + "epoch": 0.16775384801658771, + "grad_norm": 0.13214001059532166, + "learning_rate": 0.00010434700104747013, + "loss": 1.6009204864501954, + "step": 55420 + }, + { + "epoch": 0.16778411756693354, + "grad_norm": 0.16471299529075623, + "learning_rate": 0.00010434320586583274, + "loss": 1.5824583053588868, + "step": 55430 + }, + { + "epoch": 0.16781438711727936, + "grad_norm": 0.12769843637943268, + "learning_rate": 0.00010433941068419534, + "loss": 1.6287063598632812, + "step": 55440 + }, + { + "epoch": 0.1678446566676252, + "grad_norm": 0.13739091157913208, + "learning_rate": 0.00010433561550255795, + "loss": 1.6110687255859375, + "step": 55450 + }, + { + "epoch": 0.16787492621797104, + "grad_norm": 0.14299388229846954, + "learning_rate": 0.00010433182032092057, + "loss": 1.5593220710754394, + "step": 55460 + }, + { + "epoch": 0.16790519576831686, + "grad_norm": 0.12581440806388855, + "learning_rate": 0.00010432802513928316, + "loss": 1.5890666961669921, + "step": 55470 + }, + { + "epoch": 0.16793546531866269, + "grad_norm": 0.1263503134250641, + "learning_rate": 0.00010432422995764578, + "loss": 1.6375225067138672, + "step": 55480 + }, + { + "epoch": 0.1679657348690085, + "grad_norm": 0.13650846481323242, + "learning_rate": 0.00010432043477600839, + "loss": 1.6014575958251953, + "step": 55490 + }, + { + "epoch": 0.16799600441935436, + "grad_norm": 0.140530064702034, + "learning_rate": 0.00010431663959437099, + "loss": 1.548732376098633, + "step": 55500 + }, + { + "epoch": 0.16799600441935436, + "eval_loss": 1.5695468187332153, + "eval_runtime": 27.8866, + "eval_samples_per_second": 17.93, + "eval_steps_per_second": 1.148, + "step": 55500 + }, + { + "epoch": 0.16802627396970019, + "grad_norm": 0.12471971660852432, + "learning_rate": 0.0001043128444127336, + "loss": 1.5809650421142578, + "step": 55510 + }, + { + "epoch": 0.168056543520046, + "grad_norm": 0.12153933942317963, + "learning_rate": 0.0001043090492310962, + "loss": 1.6011188507080079, + "step": 55520 + }, + { + "epoch": 0.16808681307039183, + "grad_norm": 0.11833354830741882, + "learning_rate": 0.00010430525404945881, + "loss": 1.6705329895019532, + "step": 55530 + }, + { + "epoch": 0.16811708262073766, + "grad_norm": 0.15378141403198242, + "learning_rate": 0.00010430145886782142, + "loss": 1.5989002227783202, + "step": 55540 + }, + { + "epoch": 0.1681473521710835, + "grad_norm": 0.14795444905757904, + "learning_rate": 0.00010429766368618402, + "loss": 1.6163856506347656, + "step": 55550 + }, + { + "epoch": 0.16817762172142933, + "grad_norm": 0.13075385987758636, + "learning_rate": 0.00010429386850454663, + "loss": 1.581635093688965, + "step": 55560 + }, + { + "epoch": 0.16820789127177516, + "grad_norm": 0.1253637820482254, + "learning_rate": 0.00010429007332290925, + "loss": 1.5935028076171875, + "step": 55570 + }, + { + "epoch": 0.16823816082212098, + "grad_norm": 0.12855856120586395, + "learning_rate": 0.00010428627814127184, + "loss": 1.6030662536621094, + "step": 55580 + }, + { + "epoch": 0.1682684303724668, + "grad_norm": 0.14472635090351105, + "learning_rate": 0.00010428248295963446, + "loss": 1.5888090133666992, + "step": 55590 + }, + { + "epoch": 0.16829869992281266, + "grad_norm": 0.13347172737121582, + "learning_rate": 0.00010427868777799705, + "loss": 1.6151927947998046, + "step": 55600 + }, + { + "epoch": 0.16832896947315848, + "grad_norm": 0.13151390850543976, + "learning_rate": 0.00010427489259635967, + "loss": 1.6236351013183594, + "step": 55610 + }, + { + "epoch": 0.1683592390235043, + "grad_norm": 0.1590348333120346, + "learning_rate": 0.00010427109741472226, + "loss": 1.5706689834594727, + "step": 55620 + }, + { + "epoch": 0.16838950857385013, + "grad_norm": 0.14718282222747803, + "learning_rate": 0.00010426730223308488, + "loss": 1.6119029998779297, + "step": 55630 + }, + { + "epoch": 0.16841977812419595, + "grad_norm": 0.12467504292726517, + "learning_rate": 0.00010426350705144747, + "loss": 1.6187828063964844, + "step": 55640 + }, + { + "epoch": 0.1684500476745418, + "grad_norm": 0.12026004493236542, + "learning_rate": 0.0001042597118698101, + "loss": 1.5633203506469726, + "step": 55650 + }, + { + "epoch": 0.16848031722488763, + "grad_norm": 0.13796786963939667, + "learning_rate": 0.00010425591668817269, + "loss": 1.5656105041503907, + "step": 55660 + }, + { + "epoch": 0.16851058677523345, + "grad_norm": 0.13596417009830475, + "learning_rate": 0.0001042521215065353, + "loss": 1.5991073608398438, + "step": 55670 + }, + { + "epoch": 0.16854085632557927, + "grad_norm": 0.12613187730312347, + "learning_rate": 0.00010424832632489791, + "loss": 1.611866569519043, + "step": 55680 + }, + { + "epoch": 0.16857112587592513, + "grad_norm": 0.12119093537330627, + "learning_rate": 0.00010424453114326052, + "loss": 1.6028076171875, + "step": 55690 + }, + { + "epoch": 0.16860139542627095, + "grad_norm": 0.13218730688095093, + "learning_rate": 0.00010424073596162314, + "loss": 1.5749336242675782, + "step": 55700 + }, + { + "epoch": 0.16863166497661677, + "grad_norm": 0.12284848839044571, + "learning_rate": 0.00010423694077998573, + "loss": 1.64129638671875, + "step": 55710 + }, + { + "epoch": 0.1686619345269626, + "grad_norm": 0.11919189244508743, + "learning_rate": 0.00010423314559834835, + "loss": 1.6270307540893554, + "step": 55720 + }, + { + "epoch": 0.16869220407730842, + "grad_norm": 0.12541306018829346, + "learning_rate": 0.00010422935041671094, + "loss": 1.6263919830322267, + "step": 55730 + }, + { + "epoch": 0.16872247362765427, + "grad_norm": 0.12520352005958557, + "learning_rate": 0.00010422555523507356, + "loss": 1.6100780487060546, + "step": 55740 + }, + { + "epoch": 0.1687527431780001, + "grad_norm": 0.13167883455753326, + "learning_rate": 0.00010422176005343615, + "loss": 1.614959716796875, + "step": 55750 + }, + { + "epoch": 0.16878301272834592, + "grad_norm": 0.13384856283664703, + "learning_rate": 0.00010421796487179877, + "loss": 1.5938318252563477, + "step": 55760 + }, + { + "epoch": 0.16881328227869175, + "grad_norm": 0.14160382747650146, + "learning_rate": 0.00010421416969016137, + "loss": 1.6288707733154297, + "step": 55770 + }, + { + "epoch": 0.16884355182903757, + "grad_norm": 0.12325489521026611, + "learning_rate": 0.00010421037450852399, + "loss": 1.5787525177001953, + "step": 55780 + }, + { + "epoch": 0.16887382137938342, + "grad_norm": 0.13398119807243347, + "learning_rate": 0.00010420657932688658, + "loss": 1.5937804222106933, + "step": 55790 + }, + { + "epoch": 0.16890409092972924, + "grad_norm": 0.15082991123199463, + "learning_rate": 0.0001042027841452492, + "loss": 1.6088882446289063, + "step": 55800 + }, + { + "epoch": 0.16893436048007507, + "grad_norm": 0.1235111802816391, + "learning_rate": 0.00010419898896361179, + "loss": 1.5957746505737305, + "step": 55810 + }, + { + "epoch": 0.1689646300304209, + "grad_norm": 0.1383194923400879, + "learning_rate": 0.00010419519378197441, + "loss": 1.5596492767333985, + "step": 55820 + }, + { + "epoch": 0.16899489958076672, + "grad_norm": 0.12906496226787567, + "learning_rate": 0.00010419139860033701, + "loss": 1.6113555908203125, + "step": 55830 + }, + { + "epoch": 0.16902516913111257, + "grad_norm": 0.11789208650588989, + "learning_rate": 0.00010418760341869962, + "loss": 1.6055057525634766, + "step": 55840 + }, + { + "epoch": 0.1690554386814584, + "grad_norm": 0.12140503525733948, + "learning_rate": 0.00010418380823706223, + "loss": 1.586721420288086, + "step": 55850 + }, + { + "epoch": 0.16908570823180422, + "grad_norm": 0.1258714497089386, + "learning_rate": 0.00010418001305542483, + "loss": 1.5862428665161132, + "step": 55860 + }, + { + "epoch": 0.16911597778215004, + "grad_norm": 0.13474902510643005, + "learning_rate": 0.00010417621787378744, + "loss": 1.5903782844543457, + "step": 55870 + }, + { + "epoch": 0.16914624733249586, + "grad_norm": 0.1347954273223877, + "learning_rate": 0.00010417242269215004, + "loss": 1.6248180389404296, + "step": 55880 + }, + { + "epoch": 0.16917651688284172, + "grad_norm": 0.12656375765800476, + "learning_rate": 0.00010416862751051266, + "loss": 1.6231714248657227, + "step": 55890 + }, + { + "epoch": 0.16920678643318754, + "grad_norm": 0.11443755030632019, + "learning_rate": 0.00010416483232887526, + "loss": 1.6030139923095703, + "step": 55900 + }, + { + "epoch": 0.16923705598353336, + "grad_norm": 0.11965405195951462, + "learning_rate": 0.00010416103714723788, + "loss": 1.6014049530029297, + "step": 55910 + }, + { + "epoch": 0.1692673255338792, + "grad_norm": 0.13061214983463287, + "learning_rate": 0.00010415724196560047, + "loss": 1.6109386444091798, + "step": 55920 + }, + { + "epoch": 0.169297595084225, + "grad_norm": 0.11797380447387695, + "learning_rate": 0.00010415344678396309, + "loss": 1.5894780158996582, + "step": 55930 + }, + { + "epoch": 0.16932786463457086, + "grad_norm": 0.13887447118759155, + "learning_rate": 0.0001041496516023257, + "loss": 1.6137317657470702, + "step": 55940 + }, + { + "epoch": 0.1693581341849167, + "grad_norm": 0.12054044753313065, + "learning_rate": 0.0001041458564206883, + "loss": 1.5956392288208008, + "step": 55950 + }, + { + "epoch": 0.1693884037352625, + "grad_norm": 0.13779528439044952, + "learning_rate": 0.0001041420612390509, + "loss": 1.5874921798706054, + "step": 55960 + }, + { + "epoch": 0.16941867328560833, + "grad_norm": 0.14091655611991882, + "learning_rate": 0.00010413826605741351, + "loss": 1.5991753578186034, + "step": 55970 + }, + { + "epoch": 0.16944894283595416, + "grad_norm": 0.12017004191875458, + "learning_rate": 0.00010413447087577612, + "loss": 1.5639484405517579, + "step": 55980 + }, + { + "epoch": 0.1694792123863, + "grad_norm": 0.1300419718027115, + "learning_rate": 0.00010413067569413872, + "loss": 1.6245647430419923, + "step": 55990 + }, + { + "epoch": 0.16950948193664583, + "grad_norm": 0.13152098655700684, + "learning_rate": 0.00010412688051250133, + "loss": 1.6121881484985352, + "step": 56000 + }, + { + "epoch": 0.16950948193664583, + "eval_loss": 1.6014679670333862, + "eval_runtime": 28.3944, + "eval_samples_per_second": 17.609, + "eval_steps_per_second": 1.127, + "step": 56000 + }, + { + "epoch": 0.16953975148699166, + "grad_norm": 0.12862175703048706, + "learning_rate": 0.00010412308533086394, + "loss": 1.5930513381958007, + "step": 56010 + }, + { + "epoch": 0.16957002103733748, + "grad_norm": 0.13639949262142181, + "learning_rate": 0.00010411929014922654, + "loss": 1.6061229705810547, + "step": 56020 + }, + { + "epoch": 0.16960029058768333, + "grad_norm": 0.12574315071105957, + "learning_rate": 0.00010411549496758915, + "loss": 1.6318841934204102, + "step": 56030 + }, + { + "epoch": 0.16963056013802916, + "grad_norm": 0.13788220286369324, + "learning_rate": 0.00010411169978595175, + "loss": 1.5997751235961915, + "step": 56040 + }, + { + "epoch": 0.16966082968837498, + "grad_norm": 0.13449986279010773, + "learning_rate": 0.00010410790460431436, + "loss": 1.6253629684448243, + "step": 56050 + }, + { + "epoch": 0.1696910992387208, + "grad_norm": 0.11975401639938354, + "learning_rate": 0.00010410410942267697, + "loss": 1.6389177322387696, + "step": 56060 + }, + { + "epoch": 0.16972136878906663, + "grad_norm": 0.11682054400444031, + "learning_rate": 0.00010410031424103958, + "loss": 1.627294921875, + "step": 56070 + }, + { + "epoch": 0.16975163833941248, + "grad_norm": 0.14156174659729004, + "learning_rate": 0.00010409651905940218, + "loss": 1.5673436164855956, + "step": 56080 + }, + { + "epoch": 0.1697819078897583, + "grad_norm": 0.14372488856315613, + "learning_rate": 0.0001040927238777648, + "loss": 1.615565872192383, + "step": 56090 + }, + { + "epoch": 0.16981217744010413, + "grad_norm": 0.12687958776950836, + "learning_rate": 0.0001040889286961274, + "loss": 1.6011367797851563, + "step": 56100 + }, + { + "epoch": 0.16984244699044995, + "grad_norm": 0.12775981426239014, + "learning_rate": 0.00010408513351449001, + "loss": 1.6148662567138672, + "step": 56110 + }, + { + "epoch": 0.16987271654079578, + "grad_norm": 0.13415883481502533, + "learning_rate": 0.00010408133833285261, + "loss": 1.620636558532715, + "step": 56120 + }, + { + "epoch": 0.16990298609114163, + "grad_norm": 0.13352152705192566, + "learning_rate": 0.00010407754315121522, + "loss": 1.628801155090332, + "step": 56130 + }, + { + "epoch": 0.16993325564148745, + "grad_norm": 0.12804661691188812, + "learning_rate": 0.00010407374796957783, + "loss": 1.620049285888672, + "step": 56140 + }, + { + "epoch": 0.16996352519183328, + "grad_norm": 0.13010193407535553, + "learning_rate": 0.00010406995278794043, + "loss": 1.5781404495239257, + "step": 56150 + }, + { + "epoch": 0.1699937947421791, + "grad_norm": 0.1551816165447235, + "learning_rate": 0.00010406615760630304, + "loss": 1.5845709800720216, + "step": 56160 + }, + { + "epoch": 0.17002406429252492, + "grad_norm": 0.13703614473342896, + "learning_rate": 0.00010406236242466564, + "loss": 1.5507701873779296, + "step": 56170 + }, + { + "epoch": 0.17005433384287078, + "grad_norm": 0.12778885662555695, + "learning_rate": 0.00010405856724302826, + "loss": 1.5891839027404786, + "step": 56180 + }, + { + "epoch": 0.1700846033932166, + "grad_norm": 0.1463734209537506, + "learning_rate": 0.00010405477206139086, + "loss": 1.6138755798339843, + "step": 56190 + }, + { + "epoch": 0.17011487294356242, + "grad_norm": 0.12617893517017365, + "learning_rate": 0.00010405097687975348, + "loss": 1.559635543823242, + "step": 56200 + }, + { + "epoch": 0.17014514249390825, + "grad_norm": 0.12785854935646057, + "learning_rate": 0.00010404718169811607, + "loss": 1.6051530838012695, + "step": 56210 + }, + { + "epoch": 0.17017541204425407, + "grad_norm": 0.12953053414821625, + "learning_rate": 0.00010404338651647869, + "loss": 1.6175006866455077, + "step": 56220 + }, + { + "epoch": 0.17020568159459992, + "grad_norm": 0.14074411988258362, + "learning_rate": 0.00010403959133484128, + "loss": 1.5958871841430664, + "step": 56230 + }, + { + "epoch": 0.17023595114494575, + "grad_norm": 0.12314888089895248, + "learning_rate": 0.0001040357961532039, + "loss": 1.6051218032836914, + "step": 56240 + }, + { + "epoch": 0.17026622069529157, + "grad_norm": 0.11267933249473572, + "learning_rate": 0.00010403200097156649, + "loss": 1.6085460662841797, + "step": 56250 + }, + { + "epoch": 0.1702964902456374, + "grad_norm": 0.1260022073984146, + "learning_rate": 0.00010402820578992911, + "loss": 1.5891046524047852, + "step": 56260 + }, + { + "epoch": 0.17032675979598322, + "grad_norm": 0.134762704372406, + "learning_rate": 0.0001040244106082917, + "loss": 1.5722817420959472, + "step": 56270 + }, + { + "epoch": 0.17035702934632907, + "grad_norm": 0.131272092461586, + "learning_rate": 0.00010402061542665432, + "loss": 1.6119239807128907, + "step": 56280 + }, + { + "epoch": 0.1703872988966749, + "grad_norm": 0.11829445511102676, + "learning_rate": 0.00010401682024501693, + "loss": 1.6229665756225586, + "step": 56290 + }, + { + "epoch": 0.17041756844702072, + "grad_norm": 0.11857372522354126, + "learning_rate": 0.00010401302506337954, + "loss": 1.6334190368652344, + "step": 56300 + }, + { + "epoch": 0.17044783799736654, + "grad_norm": 0.12403299659490585, + "learning_rate": 0.00010400922988174215, + "loss": 1.598734474182129, + "step": 56310 + }, + { + "epoch": 0.17047810754771237, + "grad_norm": 0.13378450274467468, + "learning_rate": 0.00010400543470010475, + "loss": 1.6441761016845704, + "step": 56320 + }, + { + "epoch": 0.17050837709805822, + "grad_norm": 0.15101604163646698, + "learning_rate": 0.00010400163951846737, + "loss": 1.5334861755371094, + "step": 56330 + }, + { + "epoch": 0.17053864664840404, + "grad_norm": 0.13721738755702972, + "learning_rate": 0.00010399784433682996, + "loss": 1.597071075439453, + "step": 56340 + }, + { + "epoch": 0.17056891619874986, + "grad_norm": 0.13295158743858337, + "learning_rate": 0.00010399404915519258, + "loss": 1.5725385665893554, + "step": 56350 + }, + { + "epoch": 0.1705991857490957, + "grad_norm": 0.1284887194633484, + "learning_rate": 0.00010399025397355517, + "loss": 1.6056709289550781, + "step": 56360 + }, + { + "epoch": 0.17062945529944154, + "grad_norm": 0.13070812821388245, + "learning_rate": 0.00010398645879191779, + "loss": 1.5883666038513184, + "step": 56370 + }, + { + "epoch": 0.17065972484978736, + "grad_norm": 0.13196076452732086, + "learning_rate": 0.00010398266361028038, + "loss": 1.6226789474487304, + "step": 56380 + }, + { + "epoch": 0.1706899944001332, + "grad_norm": 0.13789893686771393, + "learning_rate": 0.000103978868428643, + "loss": 1.5808143615722656, + "step": 56390 + }, + { + "epoch": 0.170720263950479, + "grad_norm": 0.1358947604894638, + "learning_rate": 0.0001039750732470056, + "loss": 1.610995101928711, + "step": 56400 + }, + { + "epoch": 0.17075053350082484, + "grad_norm": 0.14956100285053253, + "learning_rate": 0.00010397127806536821, + "loss": 1.616963005065918, + "step": 56410 + }, + { + "epoch": 0.1707808030511707, + "grad_norm": 0.13888472318649292, + "learning_rate": 0.0001039674828837308, + "loss": 1.5783121109008789, + "step": 56420 + }, + { + "epoch": 0.1708110726015165, + "grad_norm": 0.137897327542305, + "learning_rate": 0.00010396368770209343, + "loss": 1.5685276985168457, + "step": 56430 + }, + { + "epoch": 0.17084134215186234, + "grad_norm": 0.13316382467746735, + "learning_rate": 0.00010395989252045603, + "loss": 1.5568927764892577, + "step": 56440 + }, + { + "epoch": 0.17087161170220816, + "grad_norm": 0.1268918365240097, + "learning_rate": 0.00010395609733881864, + "loss": 1.605235481262207, + "step": 56450 + }, + { + "epoch": 0.17090188125255398, + "grad_norm": 0.13287746906280518, + "learning_rate": 0.00010395230215718124, + "loss": 1.5574912071228026, + "step": 56460 + }, + { + "epoch": 0.17093215080289983, + "grad_norm": 0.12671303749084473, + "learning_rate": 0.00010394850697554385, + "loss": 1.5898736000061036, + "step": 56470 + }, + { + "epoch": 0.17096242035324566, + "grad_norm": 0.12426573038101196, + "learning_rate": 0.00010394471179390646, + "loss": 1.6376262664794923, + "step": 56480 + }, + { + "epoch": 0.17099268990359148, + "grad_norm": 0.13334527611732483, + "learning_rate": 0.00010394091661226906, + "loss": 1.6122753143310546, + "step": 56490 + }, + { + "epoch": 0.1710229594539373, + "grad_norm": 0.13384588062763214, + "learning_rate": 0.00010393712143063168, + "loss": 1.6069068908691406, + "step": 56500 + }, + { + "epoch": 0.1710229594539373, + "eval_loss": 1.6222939491271973, + "eval_runtime": 28.1599, + "eval_samples_per_second": 17.756, + "eval_steps_per_second": 1.136, + "step": 56500 + }, + { + "epoch": 0.17105322900428313, + "grad_norm": 0.11793861538171768, + "learning_rate": 0.00010393332624899427, + "loss": 1.5899234771728517, + "step": 56510 + }, + { + "epoch": 0.17108349855462898, + "grad_norm": 0.13087762892246246, + "learning_rate": 0.00010392953106735689, + "loss": 1.6290668487548827, + "step": 56520 + }, + { + "epoch": 0.1711137681049748, + "grad_norm": 0.12551234662532806, + "learning_rate": 0.00010392573588571949, + "loss": 1.5532302856445312, + "step": 56530 + }, + { + "epoch": 0.17114403765532063, + "grad_norm": 0.1295258104801178, + "learning_rate": 0.0001039219407040821, + "loss": 1.6042388916015624, + "step": 56540 + }, + { + "epoch": 0.17117430720566645, + "grad_norm": 0.1408437192440033, + "learning_rate": 0.0001039181455224447, + "loss": 1.6384033203125, + "step": 56550 + }, + { + "epoch": 0.17120457675601228, + "grad_norm": 0.13569264113903046, + "learning_rate": 0.00010391435034080732, + "loss": 1.6131427764892579, + "step": 56560 + }, + { + "epoch": 0.17123484630635813, + "grad_norm": 0.13184820115566254, + "learning_rate": 0.00010391055515916992, + "loss": 1.5749851226806642, + "step": 56570 + }, + { + "epoch": 0.17126511585670395, + "grad_norm": 0.14724192023277283, + "learning_rate": 0.00010390675997753253, + "loss": 1.6302762985229493, + "step": 56580 + }, + { + "epoch": 0.17129538540704978, + "grad_norm": 0.12667545676231384, + "learning_rate": 0.00010390296479589513, + "loss": 1.5866941452026366, + "step": 56590 + }, + { + "epoch": 0.1713256549573956, + "grad_norm": 0.12760205566883087, + "learning_rate": 0.00010389916961425774, + "loss": 1.6223249435424805, + "step": 56600 + }, + { + "epoch": 0.17135592450774143, + "grad_norm": 0.13514353334903717, + "learning_rate": 0.00010389537443262035, + "loss": 1.6232580184936523, + "step": 56610 + }, + { + "epoch": 0.17138619405808728, + "grad_norm": 0.12073150277137756, + "learning_rate": 0.00010389157925098295, + "loss": 1.601041030883789, + "step": 56620 + }, + { + "epoch": 0.1714164636084331, + "grad_norm": 0.14025060832500458, + "learning_rate": 0.00010388778406934556, + "loss": 1.5699095726013184, + "step": 56630 + }, + { + "epoch": 0.17144673315877892, + "grad_norm": 0.1318758726119995, + "learning_rate": 0.00010388398888770816, + "loss": 1.6237405776977538, + "step": 56640 + }, + { + "epoch": 0.17147700270912475, + "grad_norm": 0.13770820200443268, + "learning_rate": 0.00010388019370607077, + "loss": 1.6060026168823243, + "step": 56650 + }, + { + "epoch": 0.17150727225947057, + "grad_norm": 0.13961343467235565, + "learning_rate": 0.00010387639852443338, + "loss": 1.5884026527404784, + "step": 56660 + }, + { + "epoch": 0.17153754180981642, + "grad_norm": 0.12190386652946472, + "learning_rate": 0.00010387260334279598, + "loss": 1.5939674377441406, + "step": 56670 + }, + { + "epoch": 0.17156781136016225, + "grad_norm": 0.12944434583187103, + "learning_rate": 0.0001038688081611586, + "loss": 1.6080051422119142, + "step": 56680 + }, + { + "epoch": 0.17159808091050807, + "grad_norm": 0.135753333568573, + "learning_rate": 0.00010386501297952121, + "loss": 1.59176607131958, + "step": 56690 + }, + { + "epoch": 0.1716283504608539, + "grad_norm": 0.1383083313703537, + "learning_rate": 0.00010386121779788381, + "loss": 1.6447763442993164, + "step": 56700 + }, + { + "epoch": 0.17165862001119975, + "grad_norm": 0.12272413820028305, + "learning_rate": 0.00010385742261624642, + "loss": 1.6091678619384766, + "step": 56710 + }, + { + "epoch": 0.17168888956154557, + "grad_norm": 0.1329641193151474, + "learning_rate": 0.00010385362743460903, + "loss": 1.5745719909667968, + "step": 56720 + }, + { + "epoch": 0.1717191591118914, + "grad_norm": 0.1382705718278885, + "learning_rate": 0.00010384983225297163, + "loss": 1.5701480865478517, + "step": 56730 + }, + { + "epoch": 0.17174942866223722, + "grad_norm": 0.13944296538829803, + "learning_rate": 0.00010384603707133424, + "loss": 1.5671738624572753, + "step": 56740 + }, + { + "epoch": 0.17177969821258304, + "grad_norm": 0.13523957133293152, + "learning_rate": 0.00010384224188969684, + "loss": 1.6138551712036133, + "step": 56750 + }, + { + "epoch": 0.1718099677629289, + "grad_norm": 0.11184422671794891, + "learning_rate": 0.00010383844670805945, + "loss": 1.6582233428955078, + "step": 56760 + }, + { + "epoch": 0.17184023731327472, + "grad_norm": 0.1400853842496872, + "learning_rate": 0.00010383465152642206, + "loss": 1.6304386138916016, + "step": 56770 + }, + { + "epoch": 0.17187050686362054, + "grad_norm": 0.13959819078445435, + "learning_rate": 0.00010383085634478466, + "loss": 1.622530174255371, + "step": 56780 + }, + { + "epoch": 0.17190077641396637, + "grad_norm": 0.1266668140888214, + "learning_rate": 0.00010382706116314727, + "loss": 1.5773655891418457, + "step": 56790 + }, + { + "epoch": 0.1719310459643122, + "grad_norm": 0.1430738866329193, + "learning_rate": 0.00010382326598150987, + "loss": 1.5916815757751466, + "step": 56800 + }, + { + "epoch": 0.17196131551465804, + "grad_norm": 0.12537789344787598, + "learning_rate": 0.00010381947079987249, + "loss": 1.6255228042602539, + "step": 56810 + }, + { + "epoch": 0.17199158506500387, + "grad_norm": 0.11869002133607864, + "learning_rate": 0.00010381567561823509, + "loss": 1.605811309814453, + "step": 56820 + }, + { + "epoch": 0.1720218546153497, + "grad_norm": 0.14785002171993256, + "learning_rate": 0.0001038118804365977, + "loss": 1.610408401489258, + "step": 56830 + }, + { + "epoch": 0.1720521241656955, + "grad_norm": 0.14130103588104248, + "learning_rate": 0.0001038080852549603, + "loss": 1.6238710403442382, + "step": 56840 + }, + { + "epoch": 0.17208239371604134, + "grad_norm": 0.15688276290893555, + "learning_rate": 0.00010380429007332292, + "loss": 1.5751916885375976, + "step": 56850 + }, + { + "epoch": 0.1721126632663872, + "grad_norm": 0.13059212267398834, + "learning_rate": 0.00010380049489168551, + "loss": 1.5851217269897462, + "step": 56860 + }, + { + "epoch": 0.172142932816733, + "grad_norm": 0.13094450533390045, + "learning_rate": 0.00010379669971004813, + "loss": 1.585097885131836, + "step": 56870 + }, + { + "epoch": 0.17217320236707884, + "grad_norm": 0.13156476616859436, + "learning_rate": 0.00010379290452841072, + "loss": 1.6223350524902345, + "step": 56880 + }, + { + "epoch": 0.17220347191742466, + "grad_norm": 0.12303910404443741, + "learning_rate": 0.00010378910934677334, + "loss": 1.5885080337524413, + "step": 56890 + }, + { + "epoch": 0.17223374146777048, + "grad_norm": 0.10838493704795837, + "learning_rate": 0.00010378531416513595, + "loss": 1.588094139099121, + "step": 56900 + }, + { + "epoch": 0.17226401101811634, + "grad_norm": 0.12780101597309113, + "learning_rate": 0.00010378151898349855, + "loss": 1.5734424591064453, + "step": 56910 + }, + { + "epoch": 0.17229428056846216, + "grad_norm": 0.11609210073947906, + "learning_rate": 0.00010377772380186117, + "loss": 1.6240379333496093, + "step": 56920 + }, + { + "epoch": 0.17232455011880798, + "grad_norm": 0.11977639049291611, + "learning_rate": 0.00010377392862022376, + "loss": 1.5786791801452638, + "step": 56930 + }, + { + "epoch": 0.1723548196691538, + "grad_norm": 0.1217065304517746, + "learning_rate": 0.00010377013343858638, + "loss": 1.6049745559692383, + "step": 56940 + }, + { + "epoch": 0.17238508921949963, + "grad_norm": 0.131573885679245, + "learning_rate": 0.00010376633825694898, + "loss": 1.6191402435302735, + "step": 56950 + }, + { + "epoch": 0.17241535876984548, + "grad_norm": 0.11894388496875763, + "learning_rate": 0.0001037625430753116, + "loss": 1.621007537841797, + "step": 56960 + }, + { + "epoch": 0.1724456283201913, + "grad_norm": 0.12474951148033142, + "learning_rate": 0.00010375874789367419, + "loss": 1.6408504486083983, + "step": 56970 + }, + { + "epoch": 0.17247589787053713, + "grad_norm": 0.1249697282910347, + "learning_rate": 0.00010375495271203681, + "loss": 1.5845746040344237, + "step": 56980 + }, + { + "epoch": 0.17250616742088296, + "grad_norm": 0.1352807581424713, + "learning_rate": 0.0001037511575303994, + "loss": 1.6043306350708009, + "step": 56990 + }, + { + "epoch": 0.17253643697122878, + "grad_norm": 0.12812846899032593, + "learning_rate": 0.00010374736234876202, + "loss": 1.6261260986328125, + "step": 57000 + }, + { + "epoch": 0.17253643697122878, + "eval_loss": 1.6211918592453003, + "eval_runtime": 28.2292, + "eval_samples_per_second": 17.712, + "eval_steps_per_second": 1.134, + "step": 57000 + }, + { + "epoch": 0.17256670652157463, + "grad_norm": 0.11996378004550934, + "learning_rate": 0.00010374356716712461, + "loss": 1.5813060760498048, + "step": 57010 + }, + { + "epoch": 0.17259697607192045, + "grad_norm": 0.13897854089736938, + "learning_rate": 0.00010373977198548723, + "loss": 1.600291633605957, + "step": 57020 + }, + { + "epoch": 0.17262724562226628, + "grad_norm": 0.14713835716247559, + "learning_rate": 0.00010373597680384982, + "loss": 1.6370737075805664, + "step": 57030 + }, + { + "epoch": 0.1726575151726121, + "grad_norm": 0.13366146385669708, + "learning_rate": 0.00010373218162221244, + "loss": 1.5819306373596191, + "step": 57040 + }, + { + "epoch": 0.17268778472295793, + "grad_norm": 0.13465692102909088, + "learning_rate": 0.00010372838644057505, + "loss": 1.5940314292907716, + "step": 57050 + }, + { + "epoch": 0.17271805427330378, + "grad_norm": 0.11969654262065887, + "learning_rate": 0.00010372459125893766, + "loss": 1.6414501190185546, + "step": 57060 + }, + { + "epoch": 0.1727483238236496, + "grad_norm": 0.12188141793012619, + "learning_rate": 0.00010372079607730026, + "loss": 1.5706807136535645, + "step": 57070 + }, + { + "epoch": 0.17277859337399543, + "grad_norm": 0.13731062412261963, + "learning_rate": 0.00010371700089566287, + "loss": 1.5890827178955078, + "step": 57080 + }, + { + "epoch": 0.17280886292434125, + "grad_norm": 0.13265804946422577, + "learning_rate": 0.00010371320571402547, + "loss": 1.5853071212768555, + "step": 57090 + }, + { + "epoch": 0.1728391324746871, + "grad_norm": 0.13809987902641296, + "learning_rate": 0.00010370941053238808, + "loss": 1.6193843841552735, + "step": 57100 + }, + { + "epoch": 0.17286940202503293, + "grad_norm": 0.12752817571163177, + "learning_rate": 0.0001037056153507507, + "loss": 1.613205909729004, + "step": 57110 + }, + { + "epoch": 0.17289967157537875, + "grad_norm": 0.14389663934707642, + "learning_rate": 0.00010370182016911329, + "loss": 1.601192092895508, + "step": 57120 + }, + { + "epoch": 0.17292994112572457, + "grad_norm": 0.1178768053650856, + "learning_rate": 0.00010369802498747591, + "loss": 1.6029598236083984, + "step": 57130 + }, + { + "epoch": 0.1729602106760704, + "grad_norm": 0.13836807012557983, + "learning_rate": 0.0001036942298058385, + "loss": 1.6239274978637694, + "step": 57140 + }, + { + "epoch": 0.17299048022641625, + "grad_norm": 0.12132605165243149, + "learning_rate": 0.00010369043462420112, + "loss": 1.6069608688354493, + "step": 57150 + }, + { + "epoch": 0.17302074977676207, + "grad_norm": 0.1456938236951828, + "learning_rate": 0.00010368663944256371, + "loss": 1.5712652206420898, + "step": 57160 + }, + { + "epoch": 0.1730510193271079, + "grad_norm": 0.13132889568805695, + "learning_rate": 0.00010368284426092633, + "loss": 1.582472038269043, + "step": 57170 + }, + { + "epoch": 0.17308128887745372, + "grad_norm": 0.12081290781497955, + "learning_rate": 0.00010367904907928894, + "loss": 1.6114793777465821, + "step": 57180 + }, + { + "epoch": 0.17311155842779954, + "grad_norm": 0.13612496852874756, + "learning_rate": 0.00010367525389765155, + "loss": 1.5933647155761719, + "step": 57190 + }, + { + "epoch": 0.1731418279781454, + "grad_norm": 0.1205076053738594, + "learning_rate": 0.00010367145871601415, + "loss": 1.5626192092895508, + "step": 57200 + }, + { + "epoch": 0.17317209752849122, + "grad_norm": 0.1308652013540268, + "learning_rate": 0.00010366766353437676, + "loss": 1.5693909645080566, + "step": 57210 + }, + { + "epoch": 0.17320236707883704, + "grad_norm": 0.12021349370479584, + "learning_rate": 0.00010366386835273936, + "loss": 1.6247339248657227, + "step": 57220 + }, + { + "epoch": 0.17323263662918287, + "grad_norm": 0.1364743858575821, + "learning_rate": 0.00010366007317110197, + "loss": 1.5657640457153321, + "step": 57230 + }, + { + "epoch": 0.1732629061795287, + "grad_norm": 0.1770325005054474, + "learning_rate": 0.00010365627798946458, + "loss": 1.568641185760498, + "step": 57240 + }, + { + "epoch": 0.17329317572987454, + "grad_norm": 0.12441276013851166, + "learning_rate": 0.00010365248280782718, + "loss": 1.583917236328125, + "step": 57250 + }, + { + "epoch": 0.17332344528022037, + "grad_norm": 0.13987691700458527, + "learning_rate": 0.00010364868762618979, + "loss": 1.5806072235107422, + "step": 57260 + }, + { + "epoch": 0.1733537148305662, + "grad_norm": 0.11538086086511612, + "learning_rate": 0.0001036448924445524, + "loss": 1.6567760467529298, + "step": 57270 + }, + { + "epoch": 0.17338398438091202, + "grad_norm": 0.1279652714729309, + "learning_rate": 0.000103641097262915, + "loss": 1.5788690567016601, + "step": 57280 + }, + { + "epoch": 0.17341425393125784, + "grad_norm": 0.11906813085079193, + "learning_rate": 0.00010363730208127762, + "loss": 1.6207096099853515, + "step": 57290 + }, + { + "epoch": 0.1734445234816037, + "grad_norm": 0.1145666316151619, + "learning_rate": 0.00010363350689964023, + "loss": 1.61519775390625, + "step": 57300 + }, + { + "epoch": 0.17347479303194951, + "grad_norm": 0.15118882060050964, + "learning_rate": 0.00010362971171800283, + "loss": 1.5952704429626465, + "step": 57310 + }, + { + "epoch": 0.17350506258229534, + "grad_norm": 0.11926047503948212, + "learning_rate": 0.00010362591653636544, + "loss": 1.5937625885009765, + "step": 57320 + }, + { + "epoch": 0.17353533213264116, + "grad_norm": 0.1280047595500946, + "learning_rate": 0.00010362212135472804, + "loss": 1.5938472747802734, + "step": 57330 + }, + { + "epoch": 0.173565601682987, + "grad_norm": 0.13749118149280548, + "learning_rate": 0.00010361832617309065, + "loss": 1.564648723602295, + "step": 57340 + }, + { + "epoch": 0.17359587123333284, + "grad_norm": 0.12291748076677322, + "learning_rate": 0.00010361453099145325, + "loss": 1.5718310356140137, + "step": 57350 + }, + { + "epoch": 0.17362614078367866, + "grad_norm": 0.11686339974403381, + "learning_rate": 0.00010361073580981586, + "loss": 1.572871780395508, + "step": 57360 + }, + { + "epoch": 0.17365641033402449, + "grad_norm": 0.12898766994476318, + "learning_rate": 0.00010360694062817847, + "loss": 1.578227424621582, + "step": 57370 + }, + { + "epoch": 0.1736866798843703, + "grad_norm": 0.13372306525707245, + "learning_rate": 0.00010360314544654107, + "loss": 1.586650276184082, + "step": 57380 + }, + { + "epoch": 0.17371694943471613, + "grad_norm": 0.13142237067222595, + "learning_rate": 0.00010359935026490368, + "loss": 1.599508571624756, + "step": 57390 + }, + { + "epoch": 0.17374721898506199, + "grad_norm": 0.13426335155963898, + "learning_rate": 0.00010359555508326628, + "loss": 1.6277624130249024, + "step": 57400 + }, + { + "epoch": 0.1737774885354078, + "grad_norm": 0.1332470178604126, + "learning_rate": 0.00010359175990162889, + "loss": 1.583173656463623, + "step": 57410 + }, + { + "epoch": 0.17380775808575363, + "grad_norm": 0.1375979334115982, + "learning_rate": 0.00010358796471999151, + "loss": 1.6091499328613281, + "step": 57420 + }, + { + "epoch": 0.17383802763609946, + "grad_norm": 0.12136901915073395, + "learning_rate": 0.0001035841695383541, + "loss": 1.6257232666015624, + "step": 57430 + }, + { + "epoch": 0.1738682971864453, + "grad_norm": 0.11564339697360992, + "learning_rate": 0.00010358037435671672, + "loss": 1.6255844116210938, + "step": 57440 + }, + { + "epoch": 0.17389856673679113, + "grad_norm": 0.12974727153778076, + "learning_rate": 0.00010357657917507931, + "loss": 1.6237264633178712, + "step": 57450 + }, + { + "epoch": 0.17392883628713696, + "grad_norm": 0.13675087690353394, + "learning_rate": 0.00010357278399344193, + "loss": 1.5622652053833008, + "step": 57460 + }, + { + "epoch": 0.17395910583748278, + "grad_norm": 0.13111747801303864, + "learning_rate": 0.00010356898881180453, + "loss": 1.5815714836120605, + "step": 57470 + }, + { + "epoch": 0.1739893753878286, + "grad_norm": 0.12647078931331635, + "learning_rate": 0.00010356519363016715, + "loss": 1.5664989471435546, + "step": 57480 + }, + { + "epoch": 0.17401964493817446, + "grad_norm": 0.1298321634531021, + "learning_rate": 0.00010356139844852974, + "loss": 1.6301055908203126, + "step": 57490 + }, + { + "epoch": 0.17404991448852028, + "grad_norm": 0.1312648057937622, + "learning_rate": 0.00010355760326689236, + "loss": 1.5972293853759765, + "step": 57500 + }, + { + "epoch": 0.17404991448852028, + "eval_loss": 1.6108659505844116, + "eval_runtime": 28.0661, + "eval_samples_per_second": 17.815, + "eval_steps_per_second": 1.14, + "step": 57500 + }, + { + "epoch": 0.1740801840388661, + "grad_norm": 0.12252967059612274, + "learning_rate": 0.00010355380808525496, + "loss": 1.608624267578125, + "step": 57510 + }, + { + "epoch": 0.17411045358921193, + "grad_norm": 0.12732809782028198, + "learning_rate": 0.00010355001290361757, + "loss": 1.6148162841796876, + "step": 57520 + }, + { + "epoch": 0.17414072313955775, + "grad_norm": 0.1299472451210022, + "learning_rate": 0.00010354621772198018, + "loss": 1.5853464126586914, + "step": 57530 + }, + { + "epoch": 0.1741709926899036, + "grad_norm": 0.12917868793010712, + "learning_rate": 0.00010354242254034278, + "loss": 1.6037464141845703, + "step": 57540 + }, + { + "epoch": 0.17420126224024943, + "grad_norm": 0.1393928825855255, + "learning_rate": 0.0001035386273587054, + "loss": 1.5977169036865235, + "step": 57550 + }, + { + "epoch": 0.17423153179059525, + "grad_norm": 0.1304583102464676, + "learning_rate": 0.000103534832177068, + "loss": 1.6074447631835938, + "step": 57560 + }, + { + "epoch": 0.17426180134094107, + "grad_norm": 0.1366361379623413, + "learning_rate": 0.00010353103699543061, + "loss": 1.5464362144470214, + "step": 57570 + }, + { + "epoch": 0.1742920708912869, + "grad_norm": 0.10662207752466202, + "learning_rate": 0.0001035272418137932, + "loss": 1.582288646697998, + "step": 57580 + }, + { + "epoch": 0.17432234044163275, + "grad_norm": 0.1293220967054367, + "learning_rate": 0.00010352344663215582, + "loss": 1.6039157867431642, + "step": 57590 + }, + { + "epoch": 0.17435260999197857, + "grad_norm": 0.13270115852355957, + "learning_rate": 0.00010351965145051842, + "loss": 1.6039220809936523, + "step": 57600 + }, + { + "epoch": 0.1743828795423244, + "grad_norm": 0.1392199993133545, + "learning_rate": 0.00010351585626888104, + "loss": 1.5756540298461914, + "step": 57610 + }, + { + "epoch": 0.17441314909267022, + "grad_norm": 0.1283150315284729, + "learning_rate": 0.00010351206108724363, + "loss": 1.594523811340332, + "step": 57620 + }, + { + "epoch": 0.17444341864301605, + "grad_norm": 0.13989639282226562, + "learning_rate": 0.00010350826590560625, + "loss": 1.5990358352661134, + "step": 57630 + }, + { + "epoch": 0.1744736881933619, + "grad_norm": 0.15053221583366394, + "learning_rate": 0.00010350447072396884, + "loss": 1.6248687744140624, + "step": 57640 + }, + { + "epoch": 0.17450395774370772, + "grad_norm": 0.1323138326406479, + "learning_rate": 0.00010350067554233146, + "loss": 1.6017147064208985, + "step": 57650 + }, + { + "epoch": 0.17453422729405355, + "grad_norm": 0.1256655752658844, + "learning_rate": 0.00010349688036069405, + "loss": 1.576146411895752, + "step": 57660 + }, + { + "epoch": 0.17456449684439937, + "grad_norm": 0.1429648995399475, + "learning_rate": 0.00010349308517905667, + "loss": 1.60355224609375, + "step": 57670 + }, + { + "epoch": 0.1745947663947452, + "grad_norm": 0.13020364940166473, + "learning_rate": 0.00010348928999741928, + "loss": 1.6036378860473632, + "step": 57680 + }, + { + "epoch": 0.17462503594509104, + "grad_norm": 0.13783377408981323, + "learning_rate": 0.00010348549481578188, + "loss": 1.6173337936401366, + "step": 57690 + }, + { + "epoch": 0.17465530549543687, + "grad_norm": 0.12365344166755676, + "learning_rate": 0.00010348169963414449, + "loss": 1.6120494842529296, + "step": 57700 + }, + { + "epoch": 0.1746855750457827, + "grad_norm": 0.14212742447853088, + "learning_rate": 0.0001034779044525071, + "loss": 1.602549934387207, + "step": 57710 + }, + { + "epoch": 0.17471584459612852, + "grad_norm": 0.1174728199839592, + "learning_rate": 0.00010347410927086972, + "loss": 1.6278984069824218, + "step": 57720 + }, + { + "epoch": 0.17474611414647434, + "grad_norm": 0.11813341826200485, + "learning_rate": 0.00010347031408923231, + "loss": 1.5873395919799804, + "step": 57730 + }, + { + "epoch": 0.1747763836968202, + "grad_norm": 0.14248871803283691, + "learning_rate": 0.00010346651890759493, + "loss": 1.5503219604492187, + "step": 57740 + }, + { + "epoch": 0.17480665324716602, + "grad_norm": 0.12175135314464569, + "learning_rate": 0.00010346272372595752, + "loss": 1.604032325744629, + "step": 57750 + }, + { + "epoch": 0.17483692279751184, + "grad_norm": 0.12413893640041351, + "learning_rate": 0.00010345892854432014, + "loss": 1.5901893615722655, + "step": 57760 + }, + { + "epoch": 0.17486719234785766, + "grad_norm": 0.13063986599445343, + "learning_rate": 0.00010345513336268273, + "loss": 1.6030029296875, + "step": 57770 + }, + { + "epoch": 0.17489746189820352, + "grad_norm": 0.12125398218631744, + "learning_rate": 0.00010345133818104535, + "loss": 1.5769089698791503, + "step": 57780 + }, + { + "epoch": 0.17492773144854934, + "grad_norm": 0.1530877649784088, + "learning_rate": 0.00010344754299940796, + "loss": 1.6055524826049805, + "step": 57790 + }, + { + "epoch": 0.17495800099889516, + "grad_norm": 0.1410951465368271, + "learning_rate": 0.00010344374781777056, + "loss": 1.5969106674194335, + "step": 57800 + }, + { + "epoch": 0.174988270549241, + "grad_norm": 0.12144351750612259, + "learning_rate": 0.00010343995263613317, + "loss": 1.5924560546875, + "step": 57810 + }, + { + "epoch": 0.1750185400995868, + "grad_norm": 0.12408013641834259, + "learning_rate": 0.00010343615745449578, + "loss": 1.6293472290039062, + "step": 57820 + }, + { + "epoch": 0.17504880964993266, + "grad_norm": 0.11986502259969711, + "learning_rate": 0.00010343236227285838, + "loss": 1.5842451095581054, + "step": 57830 + }, + { + "epoch": 0.1750790792002785, + "grad_norm": 0.13559561967849731, + "learning_rate": 0.00010342856709122099, + "loss": 1.576093864440918, + "step": 57840 + }, + { + "epoch": 0.1751093487506243, + "grad_norm": 0.126695916056633, + "learning_rate": 0.00010342477190958359, + "loss": 1.5875307083129884, + "step": 57850 + }, + { + "epoch": 0.17513961830097013, + "grad_norm": 0.1300773322582245, + "learning_rate": 0.0001034209767279462, + "loss": 1.6254158020019531, + "step": 57860 + }, + { + "epoch": 0.17516988785131596, + "grad_norm": 0.1275174766778946, + "learning_rate": 0.0001034171815463088, + "loss": 1.6156682968139648, + "step": 57870 + }, + { + "epoch": 0.1752001574016618, + "grad_norm": 0.13372518122196198, + "learning_rate": 0.00010341338636467141, + "loss": 1.5768110275268554, + "step": 57880 + }, + { + "epoch": 0.17523042695200763, + "grad_norm": 0.12064096331596375, + "learning_rate": 0.00010340959118303402, + "loss": 1.600698471069336, + "step": 57890 + }, + { + "epoch": 0.17526069650235346, + "grad_norm": 0.135248601436615, + "learning_rate": 0.00010340579600139662, + "loss": 1.554244041442871, + "step": 57900 + }, + { + "epoch": 0.17529096605269928, + "grad_norm": 0.13319359719753265, + "learning_rate": 0.00010340200081975924, + "loss": 1.5779190063476562, + "step": 57910 + }, + { + "epoch": 0.1753212356030451, + "grad_norm": 0.12974603474140167, + "learning_rate": 0.00010339820563812185, + "loss": 1.619650650024414, + "step": 57920 + }, + { + "epoch": 0.17535150515339096, + "grad_norm": 0.1335146129131317, + "learning_rate": 0.00010339441045648445, + "loss": 1.6050466537475585, + "step": 57930 + }, + { + "epoch": 0.17538177470373678, + "grad_norm": 0.12160754203796387, + "learning_rate": 0.00010339061527484706, + "loss": 1.583609390258789, + "step": 57940 + }, + { + "epoch": 0.1754120442540826, + "grad_norm": 0.12090082466602325, + "learning_rate": 0.00010338682009320967, + "loss": 1.6017526626586913, + "step": 57950 + }, + { + "epoch": 0.17544231380442843, + "grad_norm": 0.12349405139684677, + "learning_rate": 0.00010338302491157227, + "loss": 1.6119136810302734, + "step": 57960 + }, + { + "epoch": 0.17547258335477425, + "grad_norm": 0.11917407065629959, + "learning_rate": 0.00010337922972993488, + "loss": 1.5785130500793456, + "step": 57970 + }, + { + "epoch": 0.1755028529051201, + "grad_norm": 0.133358433842659, + "learning_rate": 0.00010337543454829748, + "loss": 1.6016353607177733, + "step": 57980 + }, + { + "epoch": 0.17553312245546593, + "grad_norm": 0.11465039104223251, + "learning_rate": 0.00010337163936666009, + "loss": 1.6262557983398438, + "step": 57990 + }, + { + "epoch": 0.17556339200581175, + "grad_norm": 0.12789642810821533, + "learning_rate": 0.0001033678441850227, + "loss": 1.6115642547607423, + "step": 58000 + }, + { + "epoch": 0.17556339200581175, + "eval_loss": 1.5886174440383911, + "eval_runtime": 28.1848, + "eval_samples_per_second": 17.74, + "eval_steps_per_second": 1.135, + "step": 58000 + }, + { + "epoch": 0.17559366155615758, + "grad_norm": 0.15166766941547394, + "learning_rate": 0.0001033640490033853, + "loss": 1.6059755325317382, + "step": 58010 + }, + { + "epoch": 0.1756239311065034, + "grad_norm": 0.13551686704158783, + "learning_rate": 0.00010336025382174791, + "loss": 1.5884007453918456, + "step": 58020 + }, + { + "epoch": 0.17565420065684925, + "grad_norm": 0.12269953638315201, + "learning_rate": 0.00010335645864011053, + "loss": 1.6190853118896484, + "step": 58030 + }, + { + "epoch": 0.17568447020719508, + "grad_norm": 0.12779831886291504, + "learning_rate": 0.00010335266345847312, + "loss": 1.607175064086914, + "step": 58040 + }, + { + "epoch": 0.1757147397575409, + "grad_norm": 0.13100755214691162, + "learning_rate": 0.00010334886827683574, + "loss": 1.6325077056884765, + "step": 58050 + }, + { + "epoch": 0.17574500930788672, + "grad_norm": 0.13852132856845856, + "learning_rate": 0.00010334507309519833, + "loss": 1.591293716430664, + "step": 58060 + }, + { + "epoch": 0.17577527885823255, + "grad_norm": 0.12479143589735031, + "learning_rate": 0.00010334127791356095, + "loss": 1.601425552368164, + "step": 58070 + }, + { + "epoch": 0.1758055484085784, + "grad_norm": 0.1321691870689392, + "learning_rate": 0.00010333748273192354, + "loss": 1.6334613800048827, + "step": 58080 + }, + { + "epoch": 0.17583581795892422, + "grad_norm": 0.13221193850040436, + "learning_rate": 0.00010333368755028616, + "loss": 1.6060295104980469, + "step": 58090 + }, + { + "epoch": 0.17586608750927005, + "grad_norm": 0.12642022967338562, + "learning_rate": 0.00010332989236864876, + "loss": 1.5886823654174804, + "step": 58100 + }, + { + "epoch": 0.17589635705961587, + "grad_norm": 0.1278769075870514, + "learning_rate": 0.00010332609718701137, + "loss": 1.6028236389160155, + "step": 58110 + }, + { + "epoch": 0.17592662660996172, + "grad_norm": 0.12790092825889587, + "learning_rate": 0.00010332230200537398, + "loss": 1.5946783065795898, + "step": 58120 + }, + { + "epoch": 0.17595689616030755, + "grad_norm": 0.13037163019180298, + "learning_rate": 0.00010331850682373659, + "loss": 1.550330352783203, + "step": 58130 + }, + { + "epoch": 0.17598716571065337, + "grad_norm": 0.13097573816776276, + "learning_rate": 0.00010331471164209919, + "loss": 1.575301170349121, + "step": 58140 + }, + { + "epoch": 0.1760174352609992, + "grad_norm": 0.11241770535707474, + "learning_rate": 0.0001033109164604618, + "loss": 1.609286117553711, + "step": 58150 + }, + { + "epoch": 0.17604770481134502, + "grad_norm": 0.12768082320690155, + "learning_rate": 0.00010330712127882442, + "loss": 1.6510139465332032, + "step": 58160 + }, + { + "epoch": 0.17607797436169087, + "grad_norm": 0.11540944874286652, + "learning_rate": 0.00010330332609718701, + "loss": 1.6119464874267577, + "step": 58170 + }, + { + "epoch": 0.1761082439120367, + "grad_norm": 0.14640982449054718, + "learning_rate": 0.00010329953091554963, + "loss": 1.6320423126220702, + "step": 58180 + }, + { + "epoch": 0.17613851346238252, + "grad_norm": 0.12169412523508072, + "learning_rate": 0.00010329573573391222, + "loss": 1.5927655220031738, + "step": 58190 + }, + { + "epoch": 0.17616878301272834, + "grad_norm": 0.11818362772464752, + "learning_rate": 0.00010329194055227484, + "loss": 1.5953227996826171, + "step": 58200 + }, + { + "epoch": 0.17619905256307417, + "grad_norm": 0.13820688426494598, + "learning_rate": 0.00010328814537063743, + "loss": 1.6241037368774414, + "step": 58210 + }, + { + "epoch": 0.17622932211342002, + "grad_norm": 0.1360817700624466, + "learning_rate": 0.00010328435018900005, + "loss": 1.5921575546264648, + "step": 58220 + }, + { + "epoch": 0.17625959166376584, + "grad_norm": 0.11833507567644119, + "learning_rate": 0.00010328055500736265, + "loss": 1.5882611274719238, + "step": 58230 + }, + { + "epoch": 0.17628986121411166, + "grad_norm": 0.11404730379581451, + "learning_rate": 0.00010327675982572527, + "loss": 1.6300281524658202, + "step": 58240 + }, + { + "epoch": 0.1763201307644575, + "grad_norm": 0.13485977053642273, + "learning_rate": 0.00010327296464408786, + "loss": 1.5902981758117676, + "step": 58250 + }, + { + "epoch": 0.1763504003148033, + "grad_norm": 0.11841461807489395, + "learning_rate": 0.00010326916946245048, + "loss": 1.5870679855346679, + "step": 58260 + }, + { + "epoch": 0.17638066986514916, + "grad_norm": 0.13516396284103394, + "learning_rate": 0.00010326537428081307, + "loss": 1.5897549629211425, + "step": 58270 + }, + { + "epoch": 0.176410939415495, + "grad_norm": 0.13978323340415955, + "learning_rate": 0.00010326157909917569, + "loss": 1.5642241477966308, + "step": 58280 + }, + { + "epoch": 0.1764412089658408, + "grad_norm": 0.1384986937046051, + "learning_rate": 0.0001032577839175383, + "loss": 1.5886048316955566, + "step": 58290 + }, + { + "epoch": 0.17647147851618664, + "grad_norm": 0.11497864872217178, + "learning_rate": 0.0001032539887359009, + "loss": 1.5992256164550782, + "step": 58300 + }, + { + "epoch": 0.17650174806653246, + "grad_norm": 0.12790676951408386, + "learning_rate": 0.00010325019355426351, + "loss": 1.6154596328735351, + "step": 58310 + }, + { + "epoch": 0.1765320176168783, + "grad_norm": 0.13058453798294067, + "learning_rate": 0.00010324639837262611, + "loss": 1.5922222137451172, + "step": 58320 + }, + { + "epoch": 0.17656228716722414, + "grad_norm": 0.13967260718345642, + "learning_rate": 0.00010324260319098873, + "loss": 1.6266424179077148, + "step": 58330 + }, + { + "epoch": 0.17659255671756996, + "grad_norm": 0.12681415677070618, + "learning_rate": 0.00010323880800935133, + "loss": 1.586129856109619, + "step": 58340 + }, + { + "epoch": 0.17662282626791578, + "grad_norm": 0.1264214962720871, + "learning_rate": 0.00010323501282771394, + "loss": 1.6029584884643555, + "step": 58350 + }, + { + "epoch": 0.1766530958182616, + "grad_norm": 0.12477656453847885, + "learning_rate": 0.00010323121764607654, + "loss": 1.6132804870605468, + "step": 58360 + }, + { + "epoch": 0.17668336536860746, + "grad_norm": 0.13122820854187012, + "learning_rate": 0.00010322742246443916, + "loss": 1.5745516777038575, + "step": 58370 + }, + { + "epoch": 0.17671363491895328, + "grad_norm": 0.12959358096122742, + "learning_rate": 0.00010322362728280175, + "loss": 1.6380613327026368, + "step": 58380 + }, + { + "epoch": 0.1767439044692991, + "grad_norm": 0.13435374200344086, + "learning_rate": 0.00010321983210116437, + "loss": 1.5669886589050293, + "step": 58390 + }, + { + "epoch": 0.17677417401964493, + "grad_norm": 0.12340220808982849, + "learning_rate": 0.00010321603691952697, + "loss": 1.6197967529296875, + "step": 58400 + }, + { + "epoch": 0.17680444356999075, + "grad_norm": 0.13248251378536224, + "learning_rate": 0.00010321224173788958, + "loss": 1.60030517578125, + "step": 58410 + }, + { + "epoch": 0.1768347131203366, + "grad_norm": 0.12814423441886902, + "learning_rate": 0.00010320844655625219, + "loss": 1.5830768585205077, + "step": 58420 + }, + { + "epoch": 0.17686498267068243, + "grad_norm": 0.11794646084308624, + "learning_rate": 0.00010320465137461479, + "loss": 1.6284751892089844, + "step": 58430 + }, + { + "epoch": 0.17689525222102825, + "grad_norm": 0.13381588459014893, + "learning_rate": 0.0001032008561929774, + "loss": 1.580583381652832, + "step": 58440 + }, + { + "epoch": 0.17692552177137408, + "grad_norm": 0.1264999657869339, + "learning_rate": 0.00010319706101134, + "loss": 1.6178728103637696, + "step": 58450 + }, + { + "epoch": 0.17695579132171993, + "grad_norm": 0.12875430285930634, + "learning_rate": 0.00010319326582970261, + "loss": 1.592259407043457, + "step": 58460 + }, + { + "epoch": 0.17698606087206575, + "grad_norm": 0.13829049468040466, + "learning_rate": 0.00010318947064806522, + "loss": 1.590066146850586, + "step": 58470 + }, + { + "epoch": 0.17701633042241158, + "grad_norm": 0.12114868313074112, + "learning_rate": 0.00010318567546642782, + "loss": 1.5894914627075196, + "step": 58480 + }, + { + "epoch": 0.1770465999727574, + "grad_norm": 0.13167470693588257, + "learning_rate": 0.00010318188028479043, + "loss": 1.609023094177246, + "step": 58490 + }, + { + "epoch": 0.17707686952310323, + "grad_norm": 0.1250486820936203, + "learning_rate": 0.00010317808510315303, + "loss": 1.5944873809814453, + "step": 58500 + }, + { + "epoch": 0.17707686952310323, + "eval_loss": 1.605364203453064, + "eval_runtime": 27.9921, + "eval_samples_per_second": 17.862, + "eval_steps_per_second": 1.143, + "step": 58500 + }, + { + "epoch": 0.17710713907344908, + "grad_norm": 0.11883457005023956, + "learning_rate": 0.00010317428992151564, + "loss": 1.5793453216552735, + "step": 58510 + }, + { + "epoch": 0.1771374086237949, + "grad_norm": 0.12533622980117798, + "learning_rate": 0.00010317049473987826, + "loss": 1.6126201629638672, + "step": 58520 + }, + { + "epoch": 0.17716767817414072, + "grad_norm": 0.12841613590717316, + "learning_rate": 0.00010316669955824087, + "loss": 1.6140712738037108, + "step": 58530 + }, + { + "epoch": 0.17719794772448655, + "grad_norm": 0.12737613916397095, + "learning_rate": 0.00010316290437660347, + "loss": 1.6179906845092773, + "step": 58540 + }, + { + "epoch": 0.17722821727483237, + "grad_norm": 0.12893593311309814, + "learning_rate": 0.00010315910919496608, + "loss": 1.5756031036376954, + "step": 58550 + }, + { + "epoch": 0.17725848682517822, + "grad_norm": 0.11690928041934967, + "learning_rate": 0.00010315531401332868, + "loss": 1.6190162658691407, + "step": 58560 + }, + { + "epoch": 0.17728875637552405, + "grad_norm": 0.13918298482894897, + "learning_rate": 0.00010315151883169129, + "loss": 1.6033027648925782, + "step": 58570 + }, + { + "epoch": 0.17731902592586987, + "grad_norm": 0.14545728266239166, + "learning_rate": 0.0001031477236500539, + "loss": 1.6105342864990235, + "step": 58580 + }, + { + "epoch": 0.1773492954762157, + "grad_norm": 0.1264743059873581, + "learning_rate": 0.0001031439284684165, + "loss": 1.633321762084961, + "step": 58590 + }, + { + "epoch": 0.17737956502656152, + "grad_norm": 0.12120231240987778, + "learning_rate": 0.00010314013328677911, + "loss": 1.5975591659545898, + "step": 58600 + }, + { + "epoch": 0.17740983457690737, + "grad_norm": 0.13938066363334656, + "learning_rate": 0.00010313633810514171, + "loss": 1.582695770263672, + "step": 58610 + }, + { + "epoch": 0.1774401041272532, + "grad_norm": 0.13075728714466095, + "learning_rate": 0.00010313254292350432, + "loss": 1.6110733032226563, + "step": 58620 + }, + { + "epoch": 0.17747037367759902, + "grad_norm": 0.12486114352941513, + "learning_rate": 0.00010312874774186692, + "loss": 1.5822453498840332, + "step": 58630 + }, + { + "epoch": 0.17750064322794484, + "grad_norm": 0.1393534541130066, + "learning_rate": 0.00010312495256022953, + "loss": 1.587986373901367, + "step": 58640 + }, + { + "epoch": 0.17753091277829067, + "grad_norm": 0.13097333908081055, + "learning_rate": 0.00010312115737859214, + "loss": 1.6184209823608398, + "step": 58650 + }, + { + "epoch": 0.17756118232863652, + "grad_norm": 0.13325971364974976, + "learning_rate": 0.00010311736219695476, + "loss": 1.6105138778686523, + "step": 58660 + }, + { + "epoch": 0.17759145187898234, + "grad_norm": 0.11459194123744965, + "learning_rate": 0.00010311356701531735, + "loss": 1.5880932807922363, + "step": 58670 + }, + { + "epoch": 0.17762172142932817, + "grad_norm": 0.12948285043239594, + "learning_rate": 0.00010310977183367997, + "loss": 1.5904326438903809, + "step": 58680 + }, + { + "epoch": 0.177651990979674, + "grad_norm": 0.11613690853118896, + "learning_rate": 0.00010310597665204256, + "loss": 1.628021240234375, + "step": 58690 + }, + { + "epoch": 0.17768226053001981, + "grad_norm": 0.13522706925868988, + "learning_rate": 0.00010310218147040518, + "loss": 1.6172595977783204, + "step": 58700 + }, + { + "epoch": 0.17771253008036567, + "grad_norm": 0.13156825304031372, + "learning_rate": 0.00010309838628876777, + "loss": 1.617521095275879, + "step": 58710 + }, + { + "epoch": 0.1777427996307115, + "grad_norm": 0.1320774406194687, + "learning_rate": 0.00010309459110713039, + "loss": 1.6154123306274415, + "step": 58720 + }, + { + "epoch": 0.1777730691810573, + "grad_norm": 0.13289903104305267, + "learning_rate": 0.000103090795925493, + "loss": 1.5801698684692382, + "step": 58730 + }, + { + "epoch": 0.17780333873140314, + "grad_norm": 0.11231467127799988, + "learning_rate": 0.0001030870007438556, + "loss": 1.6093475341796875, + "step": 58740 + }, + { + "epoch": 0.17783360828174896, + "grad_norm": 0.12367193400859833, + "learning_rate": 0.00010308320556221821, + "loss": 1.5637845039367675, + "step": 58750 + }, + { + "epoch": 0.1778638778320948, + "grad_norm": 0.13486327230930328, + "learning_rate": 0.00010307941038058082, + "loss": 1.6155481338500977, + "step": 58760 + }, + { + "epoch": 0.17789414738244064, + "grad_norm": 0.13188011944293976, + "learning_rate": 0.00010307561519894344, + "loss": 1.6032306671142578, + "step": 58770 + }, + { + "epoch": 0.17792441693278646, + "grad_norm": 0.1336311399936676, + "learning_rate": 0.00010307182001730603, + "loss": 1.5692598342895507, + "step": 58780 + }, + { + "epoch": 0.17795468648313228, + "grad_norm": 0.1321004331111908, + "learning_rate": 0.00010306802483566865, + "loss": 1.620474624633789, + "step": 58790 + }, + { + "epoch": 0.1779849560334781, + "grad_norm": 0.14481493830680847, + "learning_rate": 0.00010306422965403124, + "loss": 1.6320636749267579, + "step": 58800 + }, + { + "epoch": 0.17801522558382396, + "grad_norm": 0.13878974318504333, + "learning_rate": 0.00010306043447239386, + "loss": 1.6019176483154296, + "step": 58810 + }, + { + "epoch": 0.17804549513416978, + "grad_norm": 0.12943540513515472, + "learning_rate": 0.00010305663929075645, + "loss": 1.5634706497192383, + "step": 58820 + }, + { + "epoch": 0.1780757646845156, + "grad_norm": 0.12630225718021393, + "learning_rate": 0.00010305284410911907, + "loss": 1.5880367279052734, + "step": 58830 + }, + { + "epoch": 0.17810603423486143, + "grad_norm": 0.1303253024816513, + "learning_rate": 0.00010304904892748166, + "loss": 1.6109691619873048, + "step": 58840 + }, + { + "epoch": 0.17813630378520728, + "grad_norm": 0.14935138821601868, + "learning_rate": 0.00010304525374584428, + "loss": 1.5837814331054687, + "step": 58850 + }, + { + "epoch": 0.1781665733355531, + "grad_norm": 0.13543254137039185, + "learning_rate": 0.00010304145856420688, + "loss": 1.5972040176391602, + "step": 58860 + }, + { + "epoch": 0.17819684288589893, + "grad_norm": 0.1514882594347, + "learning_rate": 0.0001030376633825695, + "loss": 1.6111763000488282, + "step": 58870 + }, + { + "epoch": 0.17822711243624476, + "grad_norm": 0.13305175304412842, + "learning_rate": 0.00010303386820093209, + "loss": 1.5871316909790039, + "step": 58880 + }, + { + "epoch": 0.17825738198659058, + "grad_norm": 0.14587950706481934, + "learning_rate": 0.0001030300730192947, + "loss": 1.6145551681518555, + "step": 58890 + }, + { + "epoch": 0.17828765153693643, + "grad_norm": 0.1334415227174759, + "learning_rate": 0.00010302627783765731, + "loss": 1.6170486450195312, + "step": 58900 + }, + { + "epoch": 0.17831792108728226, + "grad_norm": 0.13766731321811676, + "learning_rate": 0.00010302248265601992, + "loss": 1.5972898483276368, + "step": 58910 + }, + { + "epoch": 0.17834819063762808, + "grad_norm": 0.13316576182842255, + "learning_rate": 0.00010301868747438252, + "loss": 1.5953340530395508, + "step": 58920 + }, + { + "epoch": 0.1783784601879739, + "grad_norm": 0.12562234699726105, + "learning_rate": 0.00010301489229274513, + "loss": 1.6288717269897461, + "step": 58930 + }, + { + "epoch": 0.17840872973831973, + "grad_norm": 0.12516728043556213, + "learning_rate": 0.00010301109711110775, + "loss": 1.5833467483520507, + "step": 58940 + }, + { + "epoch": 0.17843899928866558, + "grad_norm": 0.13051284849643707, + "learning_rate": 0.00010300730192947034, + "loss": 1.628838348388672, + "step": 58950 + }, + { + "epoch": 0.1784692688390114, + "grad_norm": 0.12684254348278046, + "learning_rate": 0.00010300350674783296, + "loss": 1.5840181350708007, + "step": 58960 + }, + { + "epoch": 0.17849953838935723, + "grad_norm": 0.11922363936901093, + "learning_rate": 0.00010299971156619555, + "loss": 1.6149702072143555, + "step": 58970 + }, + { + "epoch": 0.17852980793970305, + "grad_norm": 0.11946795880794525, + "learning_rate": 0.00010299591638455817, + "loss": 1.5874603271484375, + "step": 58980 + }, + { + "epoch": 0.17856007749004887, + "grad_norm": 0.14166438579559326, + "learning_rate": 0.00010299212120292077, + "loss": 1.5823249816894531, + "step": 58990 + }, + { + "epoch": 0.17859034704039473, + "grad_norm": 0.12669329345226288, + "learning_rate": 0.00010298832602128339, + "loss": 1.5890464782714844, + "step": 59000 + }, + { + "epoch": 0.17859034704039473, + "eval_loss": 1.601375699043274, + "eval_runtime": 28.0203, + "eval_samples_per_second": 17.844, + "eval_steps_per_second": 1.142, + "step": 59000 + }, + { + "epoch": 0.17862061659074055, + "grad_norm": 0.12138234078884125, + "learning_rate": 0.00010298453083964598, + "loss": 1.5921688079833984, + "step": 59010 + }, + { + "epoch": 0.17865088614108637, + "grad_norm": 0.11259637027978897, + "learning_rate": 0.0001029807356580086, + "loss": 1.6128124237060546, + "step": 59020 + }, + { + "epoch": 0.1786811556914322, + "grad_norm": 0.1347547173500061, + "learning_rate": 0.0001029769404763712, + "loss": 1.5932151794433593, + "step": 59030 + }, + { + "epoch": 0.17871142524177802, + "grad_norm": 0.12765555083751678, + "learning_rate": 0.00010297314529473381, + "loss": 1.6453565597534179, + "step": 59040 + }, + { + "epoch": 0.17874169479212387, + "grad_norm": 0.12765716016292572, + "learning_rate": 0.00010296935011309642, + "loss": 1.5501289367675781, + "step": 59050 + }, + { + "epoch": 0.1787719643424697, + "grad_norm": 0.11573264747858047, + "learning_rate": 0.00010296555493145902, + "loss": 1.5788347244262695, + "step": 59060 + }, + { + "epoch": 0.17880223389281552, + "grad_norm": 0.12407735735177994, + "learning_rate": 0.00010296175974982163, + "loss": 1.5766563415527344, + "step": 59070 + }, + { + "epoch": 0.17883250344316134, + "grad_norm": 0.139535591006279, + "learning_rate": 0.00010295796456818423, + "loss": 1.588667869567871, + "step": 59080 + }, + { + "epoch": 0.17886277299350717, + "grad_norm": 0.1281859129667282, + "learning_rate": 0.00010295416938654684, + "loss": 1.5624075889587403, + "step": 59090 + }, + { + "epoch": 0.17889304254385302, + "grad_norm": 0.1254376620054245, + "learning_rate": 0.00010295037420490945, + "loss": 1.5816864013671874, + "step": 59100 + }, + { + "epoch": 0.17892331209419884, + "grad_norm": 0.11254402250051498, + "learning_rate": 0.00010294657902327205, + "loss": 1.5754058837890625, + "step": 59110 + }, + { + "epoch": 0.17895358164454467, + "grad_norm": 0.13267751038074493, + "learning_rate": 0.00010294278384163466, + "loss": 1.5848097801208496, + "step": 59120 + }, + { + "epoch": 0.1789838511948905, + "grad_norm": 0.1275385171175003, + "learning_rate": 0.00010293898865999728, + "loss": 1.589343547821045, + "step": 59130 + }, + { + "epoch": 0.17901412074523632, + "grad_norm": 0.12853188812732697, + "learning_rate": 0.00010293519347835988, + "loss": 1.5888521194458007, + "step": 59140 + }, + { + "epoch": 0.17904439029558217, + "grad_norm": 0.13725660741329193, + "learning_rate": 0.00010293139829672249, + "loss": 1.5956703186035157, + "step": 59150 + }, + { + "epoch": 0.179074659845928, + "grad_norm": 0.13041052222251892, + "learning_rate": 0.0001029276031150851, + "loss": 1.5949695587158204, + "step": 59160 + }, + { + "epoch": 0.17910492939627382, + "grad_norm": 0.12949557602405548, + "learning_rate": 0.0001029238079334477, + "loss": 1.6038505554199218, + "step": 59170 + }, + { + "epoch": 0.17913519894661964, + "grad_norm": 0.14273841679096222, + "learning_rate": 0.0001029200127518103, + "loss": 1.5776142120361327, + "step": 59180 + }, + { + "epoch": 0.1791654684969655, + "grad_norm": 0.12141955643892288, + "learning_rate": 0.00010291621757017291, + "loss": 1.6017576217651368, + "step": 59190 + }, + { + "epoch": 0.17919573804731131, + "grad_norm": 0.11964911222457886, + "learning_rate": 0.00010291242238853552, + "loss": 1.6065216064453125, + "step": 59200 + }, + { + "epoch": 0.17922600759765714, + "grad_norm": 0.12186089903116226, + "learning_rate": 0.00010290862720689812, + "loss": 1.5876803398132324, + "step": 59210 + }, + { + "epoch": 0.17925627714800296, + "grad_norm": 0.11994278430938721, + "learning_rate": 0.00010290483202526073, + "loss": 1.6050395965576172, + "step": 59220 + }, + { + "epoch": 0.1792865466983488, + "grad_norm": 0.11638320982456207, + "learning_rate": 0.00010290103684362334, + "loss": 1.622135543823242, + "step": 59230 + }, + { + "epoch": 0.17931681624869464, + "grad_norm": 0.12766362726688385, + "learning_rate": 0.00010289724166198594, + "loss": 1.6043601989746095, + "step": 59240 + }, + { + "epoch": 0.17934708579904046, + "grad_norm": 0.11766599118709564, + "learning_rate": 0.00010289344648034855, + "loss": 1.5949932098388673, + "step": 59250 + }, + { + "epoch": 0.17937735534938629, + "grad_norm": 0.1330609917640686, + "learning_rate": 0.00010288965129871115, + "loss": 1.5696066856384276, + "step": 59260 + }, + { + "epoch": 0.1794076248997321, + "grad_norm": 0.12403680384159088, + "learning_rate": 0.00010288585611707377, + "loss": 1.608639907836914, + "step": 59270 + }, + { + "epoch": 0.17943789445007793, + "grad_norm": 0.12493506819009781, + "learning_rate": 0.00010288206093543637, + "loss": 1.631336784362793, + "step": 59280 + }, + { + "epoch": 0.17946816400042379, + "grad_norm": 0.11504264175891876, + "learning_rate": 0.00010287826575379899, + "loss": 1.601541519165039, + "step": 59290 + }, + { + "epoch": 0.1794984335507696, + "grad_norm": 0.11705191433429718, + "learning_rate": 0.00010287447057216158, + "loss": 1.581538200378418, + "step": 59300 + }, + { + "epoch": 0.17952870310111543, + "grad_norm": 0.11944347620010376, + "learning_rate": 0.0001028706753905242, + "loss": 1.5831424713134765, + "step": 59310 + }, + { + "epoch": 0.17955897265146126, + "grad_norm": 0.12182734906673431, + "learning_rate": 0.00010286688020888679, + "loss": 1.60986270904541, + "step": 59320 + }, + { + "epoch": 0.17958924220180708, + "grad_norm": 0.12712059915065765, + "learning_rate": 0.00010286308502724941, + "loss": 1.6507896423339843, + "step": 59330 + }, + { + "epoch": 0.17961951175215293, + "grad_norm": 0.1313166320323944, + "learning_rate": 0.00010285928984561202, + "loss": 1.609913444519043, + "step": 59340 + }, + { + "epoch": 0.17964978130249876, + "grad_norm": 0.11949583888053894, + "learning_rate": 0.00010285549466397462, + "loss": 1.603277587890625, + "step": 59350 + }, + { + "epoch": 0.17968005085284458, + "grad_norm": 0.1297459900379181, + "learning_rate": 0.00010285169948233723, + "loss": 1.6140476226806642, + "step": 59360 + }, + { + "epoch": 0.1797103204031904, + "grad_norm": 0.11657252162694931, + "learning_rate": 0.00010284790430069983, + "loss": 1.5806082725524901, + "step": 59370 + }, + { + "epoch": 0.17974058995353623, + "grad_norm": 0.12886179983615875, + "learning_rate": 0.00010284410911906244, + "loss": 1.578879165649414, + "step": 59380 + }, + { + "epoch": 0.17977085950388208, + "grad_norm": 0.13759668171405792, + "learning_rate": 0.00010284031393742504, + "loss": 1.5852597236633301, + "step": 59390 + }, + { + "epoch": 0.1798011290542279, + "grad_norm": 0.13031288981437683, + "learning_rate": 0.00010283651875578766, + "loss": 1.6050991058349608, + "step": 59400 + }, + { + "epoch": 0.17983139860457373, + "grad_norm": 0.12609733641147614, + "learning_rate": 0.00010283272357415026, + "loss": 1.590087890625, + "step": 59410 + }, + { + "epoch": 0.17986166815491955, + "grad_norm": 0.12773187458515167, + "learning_rate": 0.00010282892839251288, + "loss": 1.5837685585021972, + "step": 59420 + }, + { + "epoch": 0.17989193770526538, + "grad_norm": 0.11398225277662277, + "learning_rate": 0.00010282513321087547, + "loss": 1.581834602355957, + "step": 59430 + }, + { + "epoch": 0.17992220725561123, + "grad_norm": 0.12444780021905899, + "learning_rate": 0.00010282133802923809, + "loss": 1.5812483787536622, + "step": 59440 + }, + { + "epoch": 0.17995247680595705, + "grad_norm": 0.12415000051259995, + "learning_rate": 0.00010281754284760068, + "loss": 1.6289295196533202, + "step": 59450 + }, + { + "epoch": 0.17998274635630288, + "grad_norm": 0.12668950855731964, + "learning_rate": 0.0001028137476659633, + "loss": 1.6107845306396484, + "step": 59460 + }, + { + "epoch": 0.1800130159066487, + "grad_norm": 0.11724354326725006, + "learning_rate": 0.00010280995248432589, + "loss": 1.5986218452453613, + "step": 59470 + }, + { + "epoch": 0.18004328545699452, + "grad_norm": 0.133992537856102, + "learning_rate": 0.00010280615730268851, + "loss": 1.6002220153808593, + "step": 59480 + }, + { + "epoch": 0.18007355500734037, + "grad_norm": 0.12264135479927063, + "learning_rate": 0.0001028023621210511, + "loss": 1.624417495727539, + "step": 59490 + }, + { + "epoch": 0.1801038245576862, + "grad_norm": 0.12537053227424622, + "learning_rate": 0.00010279856693941372, + "loss": 1.6001949310302734, + "step": 59500 + }, + { + "epoch": 0.1801038245576862, + "eval_loss": 1.59330153465271, + "eval_runtime": 28.3889, + "eval_samples_per_second": 17.612, + "eval_steps_per_second": 1.127, + "step": 59500 + }, + { + "epoch": 0.18013409410803202, + "grad_norm": 0.13285601139068604, + "learning_rate": 0.00010279477175777633, + "loss": 1.612615966796875, + "step": 59510 + }, + { + "epoch": 0.18016436365837785, + "grad_norm": 0.11911787837743759, + "learning_rate": 0.00010279097657613894, + "loss": 1.6001262664794922, + "step": 59520 + }, + { + "epoch": 0.1801946332087237, + "grad_norm": 0.1341416984796524, + "learning_rate": 0.00010278718139450156, + "loss": 1.6194927215576171, + "step": 59530 + }, + { + "epoch": 0.18022490275906952, + "grad_norm": 0.12768305838108063, + "learning_rate": 0.00010278338621286415, + "loss": 1.6058860778808595, + "step": 59540 + }, + { + "epoch": 0.18025517230941535, + "grad_norm": 0.11917088180780411, + "learning_rate": 0.00010277959103122677, + "loss": 1.5734235763549804, + "step": 59550 + }, + { + "epoch": 0.18028544185976117, + "grad_norm": 0.1182471290230751, + "learning_rate": 0.00010277579584958936, + "loss": 1.6059144973754882, + "step": 59560 + }, + { + "epoch": 0.180315711410107, + "grad_norm": 0.11665474623441696, + "learning_rate": 0.00010277200066795198, + "loss": 1.5872355461120606, + "step": 59570 + }, + { + "epoch": 0.18034598096045285, + "grad_norm": 0.13052824139595032, + "learning_rate": 0.00010276820548631457, + "loss": 1.6258041381835937, + "step": 59580 + }, + { + "epoch": 0.18037625051079867, + "grad_norm": 0.12885288894176483, + "learning_rate": 0.00010276441030467719, + "loss": 1.575365924835205, + "step": 59590 + }, + { + "epoch": 0.1804065200611445, + "grad_norm": 0.1346263885498047, + "learning_rate": 0.00010276061512303978, + "loss": 1.6082138061523437, + "step": 59600 + }, + { + "epoch": 0.18043678961149032, + "grad_norm": 0.14869935810565948, + "learning_rate": 0.0001027568199414024, + "loss": 1.609427261352539, + "step": 59610 + }, + { + "epoch": 0.18046705916183614, + "grad_norm": 0.1404130607843399, + "learning_rate": 0.000102753024759765, + "loss": 1.5901814460754395, + "step": 59620 + }, + { + "epoch": 0.180497328712182, + "grad_norm": 0.13591179251670837, + "learning_rate": 0.00010274922957812761, + "loss": 1.5560346603393556, + "step": 59630 + }, + { + "epoch": 0.18052759826252782, + "grad_norm": 0.1242382600903511, + "learning_rate": 0.00010274543439649022, + "loss": 1.622587013244629, + "step": 59640 + }, + { + "epoch": 0.18055786781287364, + "grad_norm": 0.11634178459644318, + "learning_rate": 0.00010274163921485283, + "loss": 1.6329805374145507, + "step": 59650 + }, + { + "epoch": 0.18058813736321946, + "grad_norm": 0.1326034665107727, + "learning_rate": 0.00010273784403321543, + "loss": 1.624230194091797, + "step": 59660 + }, + { + "epoch": 0.1806184069135653, + "grad_norm": 0.1363874077796936, + "learning_rate": 0.00010273404885157804, + "loss": 1.5861443519592284, + "step": 59670 + }, + { + "epoch": 0.18064867646391114, + "grad_norm": 0.1253555417060852, + "learning_rate": 0.00010273025366994064, + "loss": 1.616640281677246, + "step": 59680 + }, + { + "epoch": 0.18067894601425696, + "grad_norm": 0.13337856531143188, + "learning_rate": 0.00010272645848830325, + "loss": 1.5835244178771972, + "step": 59690 + }, + { + "epoch": 0.1807092155646028, + "grad_norm": 0.13750985264778137, + "learning_rate": 0.00010272266330666586, + "loss": 1.5380353927612305, + "step": 59700 + }, + { + "epoch": 0.1807394851149486, + "grad_norm": 0.13036617636680603, + "learning_rate": 0.00010271886812502846, + "loss": 1.6092620849609376, + "step": 59710 + }, + { + "epoch": 0.18076975466529444, + "grad_norm": 0.13048739731311798, + "learning_rate": 0.00010271507294339107, + "loss": 1.6015617370605468, + "step": 59720 + }, + { + "epoch": 0.1808000242156403, + "grad_norm": 0.13017946481704712, + "learning_rate": 0.00010271127776175367, + "loss": 1.565401840209961, + "step": 59730 + }, + { + "epoch": 0.1808302937659861, + "grad_norm": 0.11473748832941055, + "learning_rate": 0.0001027074825801163, + "loss": 1.5968873023986816, + "step": 59740 + }, + { + "epoch": 0.18086056331633193, + "grad_norm": 0.11819946765899658, + "learning_rate": 0.00010270368739847889, + "loss": 1.6064409255981444, + "step": 59750 + }, + { + "epoch": 0.18089083286667776, + "grad_norm": 0.12002447992563248, + "learning_rate": 0.0001026998922168415, + "loss": 1.5740473747253418, + "step": 59760 + }, + { + "epoch": 0.18092110241702358, + "grad_norm": 0.1290189027786255, + "learning_rate": 0.00010269609703520411, + "loss": 1.5696331977844238, + "step": 59770 + }, + { + "epoch": 0.18095137196736943, + "grad_norm": 0.12065619230270386, + "learning_rate": 0.00010269230185356672, + "loss": 1.5999025344848632, + "step": 59780 + }, + { + "epoch": 0.18098164151771526, + "grad_norm": 0.11870818585157394, + "learning_rate": 0.00010268850667192932, + "loss": 1.572684383392334, + "step": 59790 + }, + { + "epoch": 0.18101191106806108, + "grad_norm": 0.13621164858341217, + "learning_rate": 0.00010268471149029193, + "loss": 1.5899452209472655, + "step": 59800 + }, + { + "epoch": 0.1810421806184069, + "grad_norm": 0.1323622465133667, + "learning_rate": 0.00010268091630865454, + "loss": 1.5818376541137695, + "step": 59810 + }, + { + "epoch": 0.18107245016875273, + "grad_norm": 0.1224125400185585, + "learning_rate": 0.00010267712112701714, + "loss": 1.5734749794006349, + "step": 59820 + }, + { + "epoch": 0.18110271971909858, + "grad_norm": 0.13046112656593323, + "learning_rate": 0.00010267332594537975, + "loss": 1.6033550262451173, + "step": 59830 + }, + { + "epoch": 0.1811329892694444, + "grad_norm": 0.13699941337108612, + "learning_rate": 0.00010266953076374235, + "loss": 1.6100555419921876, + "step": 59840 + }, + { + "epoch": 0.18116325881979023, + "grad_norm": 0.12921540439128876, + "learning_rate": 0.00010266573558210496, + "loss": 1.5790157318115234, + "step": 59850 + }, + { + "epoch": 0.18119352837013605, + "grad_norm": 0.13481400907039642, + "learning_rate": 0.00010266194040046757, + "loss": 1.574396800994873, + "step": 59860 + }, + { + "epoch": 0.1812237979204819, + "grad_norm": 0.14888407289981842, + "learning_rate": 0.00010265814521883017, + "loss": 1.5870623588562012, + "step": 59870 + }, + { + "epoch": 0.18125406747082773, + "grad_norm": 0.13014869391918182, + "learning_rate": 0.00010265435003719279, + "loss": 1.5620903015136718, + "step": 59880 + }, + { + "epoch": 0.18128433702117355, + "grad_norm": 0.1611405611038208, + "learning_rate": 0.00010265055485555538, + "loss": 1.6000911712646484, + "step": 59890 + }, + { + "epoch": 0.18131460657151938, + "grad_norm": 0.13639578223228455, + "learning_rate": 0.000102646759673918, + "loss": 1.6156484603881835, + "step": 59900 + }, + { + "epoch": 0.1813448761218652, + "grad_norm": 0.14096711575984955, + "learning_rate": 0.0001026429644922806, + "loss": 1.5999753952026368, + "step": 59910 + }, + { + "epoch": 0.18137514567221105, + "grad_norm": 0.13777896761894226, + "learning_rate": 0.00010263916931064321, + "loss": 1.5997299194335937, + "step": 59920 + }, + { + "epoch": 0.18140541522255688, + "grad_norm": 0.13337020576000214, + "learning_rate": 0.00010263537412900581, + "loss": 1.613670539855957, + "step": 59930 + }, + { + "epoch": 0.1814356847729027, + "grad_norm": 0.1234053522348404, + "learning_rate": 0.00010263157894736843, + "loss": 1.628488540649414, + "step": 59940 + }, + { + "epoch": 0.18146595432324852, + "grad_norm": 0.12724900245666504, + "learning_rate": 0.00010262778376573103, + "loss": 1.6371091842651366, + "step": 59950 + }, + { + "epoch": 0.18149622387359435, + "grad_norm": 0.12287959456443787, + "learning_rate": 0.00010262398858409364, + "loss": 1.6061779022216798, + "step": 59960 + }, + { + "epoch": 0.1815264934239402, + "grad_norm": 0.139169842004776, + "learning_rate": 0.00010262019340245624, + "loss": 1.5767386436462403, + "step": 59970 + }, + { + "epoch": 0.18155676297428602, + "grad_norm": 0.12186144292354584, + "learning_rate": 0.00010261639822081885, + "loss": 1.612176513671875, + "step": 59980 + }, + { + "epoch": 0.18158703252463185, + "grad_norm": 0.15738782286643982, + "learning_rate": 0.00010261260303918146, + "loss": 1.5925475120544434, + "step": 59990 + }, + { + "epoch": 0.18161730207497767, + "grad_norm": 0.1329454928636551, + "learning_rate": 0.00010260880785754406, + "loss": 1.608283233642578, + "step": 60000 + }, + { + "epoch": 0.18161730207497767, + "eval_loss": 1.5884922742843628, + "eval_runtime": 27.9237, + "eval_samples_per_second": 17.906, + "eval_steps_per_second": 1.146, + "step": 60000 + }, + { + "epoch": 0.1816475716253235, + "grad_norm": 0.1269540935754776, + "learning_rate": 0.00010260501267590668, + "loss": 1.5943991661071777, + "step": 60010 + }, + { + "epoch": 0.18167784117566935, + "grad_norm": 0.11855510622262955, + "learning_rate": 0.00010260121749426927, + "loss": 1.5980372428894043, + "step": 60020 + }, + { + "epoch": 0.18170811072601517, + "grad_norm": 0.1450532227754593, + "learning_rate": 0.0001025974223126319, + "loss": 1.5638376235961915, + "step": 60030 + }, + { + "epoch": 0.181738380276361, + "grad_norm": 0.12157466262578964, + "learning_rate": 0.00010259362713099449, + "loss": 1.5840845108032227, + "step": 60040 + }, + { + "epoch": 0.18176864982670682, + "grad_norm": 0.11240535229444504, + "learning_rate": 0.0001025898319493571, + "loss": 1.6228876113891602, + "step": 60050 + }, + { + "epoch": 0.18179891937705264, + "grad_norm": 0.12545117735862732, + "learning_rate": 0.0001025860367677197, + "loss": 1.535171890258789, + "step": 60060 + }, + { + "epoch": 0.1818291889273985, + "grad_norm": 0.12276090681552887, + "learning_rate": 0.00010258224158608232, + "loss": 1.5809423446655273, + "step": 60070 + }, + { + "epoch": 0.18185945847774432, + "grad_norm": 0.12828774750232697, + "learning_rate": 0.00010257844640444491, + "loss": 1.5949214935302733, + "step": 60080 + }, + { + "epoch": 0.18188972802809014, + "grad_norm": 0.11664494127035141, + "learning_rate": 0.00010257465122280753, + "loss": 1.5941178321838378, + "step": 60090 + }, + { + "epoch": 0.18191999757843597, + "grad_norm": 0.12606661021709442, + "learning_rate": 0.00010257085604117012, + "loss": 1.6388748168945313, + "step": 60100 + }, + { + "epoch": 0.1819502671287818, + "grad_norm": 0.1201825886964798, + "learning_rate": 0.00010256706085953274, + "loss": 1.629358673095703, + "step": 60110 + }, + { + "epoch": 0.18198053667912764, + "grad_norm": 0.11895320564508438, + "learning_rate": 0.00010256326567789533, + "loss": 1.5551618576049804, + "step": 60120 + }, + { + "epoch": 0.18201080622947347, + "grad_norm": 0.1364736407995224, + "learning_rate": 0.00010255947049625795, + "loss": 1.5629842758178711, + "step": 60130 + }, + { + "epoch": 0.1820410757798193, + "grad_norm": 0.13264113664627075, + "learning_rate": 0.00010255567531462057, + "loss": 1.6257335662841796, + "step": 60140 + }, + { + "epoch": 0.1820713453301651, + "grad_norm": 0.15613454580307007, + "learning_rate": 0.00010255188013298316, + "loss": 1.5884946823120116, + "step": 60150 + }, + { + "epoch": 0.18210161488051094, + "grad_norm": 0.12952305376529694, + "learning_rate": 0.00010254808495134578, + "loss": 1.6116432189941405, + "step": 60160 + }, + { + "epoch": 0.1821318844308568, + "grad_norm": 0.1351434737443924, + "learning_rate": 0.00010254428976970838, + "loss": 1.6338935852050782, + "step": 60170 + }, + { + "epoch": 0.1821621539812026, + "grad_norm": 0.12737563252449036, + "learning_rate": 0.000102540494588071, + "loss": 1.60406551361084, + "step": 60180 + }, + { + "epoch": 0.18219242353154844, + "grad_norm": 0.12707307934761047, + "learning_rate": 0.00010253669940643359, + "loss": 1.581225299835205, + "step": 60190 + }, + { + "epoch": 0.18222269308189426, + "grad_norm": 0.11707141995429993, + "learning_rate": 0.00010253290422479621, + "loss": 1.5891502380371094, + "step": 60200 + }, + { + "epoch": 0.1822529626322401, + "grad_norm": 0.12543141841888428, + "learning_rate": 0.0001025291090431588, + "loss": 1.5891886711120606, + "step": 60210 + }, + { + "epoch": 0.18228323218258594, + "grad_norm": 0.13923189043998718, + "learning_rate": 0.00010252531386152142, + "loss": 1.5871622085571289, + "step": 60220 + }, + { + "epoch": 0.18231350173293176, + "grad_norm": 0.1311221718788147, + "learning_rate": 0.00010252151867988401, + "loss": 1.6195411682128906, + "step": 60230 + }, + { + "epoch": 0.18234377128327758, + "grad_norm": 0.11594732105731964, + "learning_rate": 0.00010251772349824663, + "loss": 1.5949262619018554, + "step": 60240 + }, + { + "epoch": 0.1823740408336234, + "grad_norm": 0.13256077468395233, + "learning_rate": 0.00010251392831660924, + "loss": 1.57047700881958, + "step": 60250 + }, + { + "epoch": 0.18240431038396926, + "grad_norm": 0.1283193677663803, + "learning_rate": 0.00010251013313497184, + "loss": 1.6040660858154296, + "step": 60260 + }, + { + "epoch": 0.18243457993431508, + "grad_norm": 0.12175782024860382, + "learning_rate": 0.00010250633795333445, + "loss": 1.5842811584472656, + "step": 60270 + }, + { + "epoch": 0.1824648494846609, + "grad_norm": 0.12267950177192688, + "learning_rate": 0.00010250254277169706, + "loss": 1.5944799423217773, + "step": 60280 + }, + { + "epoch": 0.18249511903500673, + "grad_norm": 0.12369908392429352, + "learning_rate": 0.00010249874759005966, + "loss": 1.6089105606079102, + "step": 60290 + }, + { + "epoch": 0.18252538858535255, + "grad_norm": 0.1409832239151001, + "learning_rate": 0.00010249495240842227, + "loss": 1.5763969421386719, + "step": 60300 + }, + { + "epoch": 0.1825556581356984, + "grad_norm": 0.11936021596193314, + "learning_rate": 0.00010249115722678487, + "loss": 1.605135726928711, + "step": 60310 + }, + { + "epoch": 0.18258592768604423, + "grad_norm": 0.13449282944202423, + "learning_rate": 0.00010248736204514748, + "loss": 1.59885892868042, + "step": 60320 + }, + { + "epoch": 0.18261619723639005, + "grad_norm": 0.1439681202173233, + "learning_rate": 0.00010248356686351009, + "loss": 1.571073341369629, + "step": 60330 + }, + { + "epoch": 0.18264646678673588, + "grad_norm": 0.1414068192243576, + "learning_rate": 0.00010247977168187269, + "loss": 1.57203950881958, + "step": 60340 + }, + { + "epoch": 0.1826767363370817, + "grad_norm": 0.14524975419044495, + "learning_rate": 0.00010247597650023531, + "loss": 1.5645858764648437, + "step": 60350 + }, + { + "epoch": 0.18270700588742755, + "grad_norm": 0.13923189043998718, + "learning_rate": 0.0001024721813185979, + "loss": 1.6090150833129884, + "step": 60360 + }, + { + "epoch": 0.18273727543777338, + "grad_norm": 0.11805014312267303, + "learning_rate": 0.00010246838613696052, + "loss": 1.5945841789245605, + "step": 60370 + }, + { + "epoch": 0.1827675449881192, + "grad_norm": 0.12899260222911835, + "learning_rate": 0.00010246459095532313, + "loss": 1.563084030151367, + "step": 60380 + }, + { + "epoch": 0.18279781453846503, + "grad_norm": 0.13102875649929047, + "learning_rate": 0.00010246079577368573, + "loss": 1.6087381362915039, + "step": 60390 + }, + { + "epoch": 0.18282808408881085, + "grad_norm": 0.12669599056243896, + "learning_rate": 0.00010245700059204834, + "loss": 1.61441650390625, + "step": 60400 + }, + { + "epoch": 0.1828583536391567, + "grad_norm": 0.1254306584596634, + "learning_rate": 0.00010245320541041095, + "loss": 1.5847134590148926, + "step": 60410 + }, + { + "epoch": 0.18288862318950252, + "grad_norm": 0.12064661830663681, + "learning_rate": 0.00010244941022877355, + "loss": 1.5893924713134766, + "step": 60420 + }, + { + "epoch": 0.18291889273984835, + "grad_norm": 0.13625238835811615, + "learning_rate": 0.00010244561504713616, + "loss": 1.5978272438049317, + "step": 60430 + }, + { + "epoch": 0.18294916229019417, + "grad_norm": 0.12270952016115189, + "learning_rate": 0.00010244181986549876, + "loss": 1.6072689056396485, + "step": 60440 + }, + { + "epoch": 0.18297943184054, + "grad_norm": 0.12264902889728546, + "learning_rate": 0.00010243802468386137, + "loss": 1.5877042770385743, + "step": 60450 + }, + { + "epoch": 0.18300970139088585, + "grad_norm": 0.12131110578775406, + "learning_rate": 0.00010243422950222398, + "loss": 1.5827686309814453, + "step": 60460 + }, + { + "epoch": 0.18303997094123167, + "grad_norm": 0.12176395207643509, + "learning_rate": 0.00010243043432058658, + "loss": 1.6020172119140625, + "step": 60470 + }, + { + "epoch": 0.1830702404915775, + "grad_norm": 0.1275755614042282, + "learning_rate": 0.00010242663913894919, + "loss": 1.6240623474121094, + "step": 60480 + }, + { + "epoch": 0.18310051004192332, + "grad_norm": 0.12053845822811127, + "learning_rate": 0.0001024228439573118, + "loss": 1.5648792266845704, + "step": 60490 + }, + { + "epoch": 0.18313077959226914, + "grad_norm": 0.12405214458703995, + "learning_rate": 0.0001024190487756744, + "loss": 1.5672833442687988, + "step": 60500 + }, + { + "epoch": 0.18313077959226914, + "eval_loss": 1.6168197393417358, + "eval_runtime": 28.352, + "eval_samples_per_second": 17.635, + "eval_steps_per_second": 1.129, + "step": 60500 + }, + { + "epoch": 0.183161049142615, + "grad_norm": 0.12431562691926956, + "learning_rate": 0.00010241525359403702, + "loss": 1.5631794929504395, + "step": 60510 + }, + { + "epoch": 0.18319131869296082, + "grad_norm": 0.11942952126264572, + "learning_rate": 0.00010241145841239961, + "loss": 1.5942108154296875, + "step": 60520 + }, + { + "epoch": 0.18322158824330664, + "grad_norm": 0.11151262372732162, + "learning_rate": 0.00010240766323076223, + "loss": 1.5672908782958985, + "step": 60530 + }, + { + "epoch": 0.18325185779365247, + "grad_norm": 0.11266437917947769, + "learning_rate": 0.00010240386804912482, + "loss": 1.6003189086914062, + "step": 60540 + }, + { + "epoch": 0.1832821273439983, + "grad_norm": 0.12163291126489639, + "learning_rate": 0.00010240007286748744, + "loss": 1.6180992126464844, + "step": 60550 + }, + { + "epoch": 0.18331239689434414, + "grad_norm": 0.13239341974258423, + "learning_rate": 0.00010239627768585005, + "loss": 1.5695809364318847, + "step": 60560 + }, + { + "epoch": 0.18334266644468997, + "grad_norm": 0.12370964139699936, + "learning_rate": 0.00010239248250421266, + "loss": 1.6136299133300782, + "step": 60570 + }, + { + "epoch": 0.1833729359950358, + "grad_norm": 0.13674971461296082, + "learning_rate": 0.00010238868732257526, + "loss": 1.6164052963256836, + "step": 60580 + }, + { + "epoch": 0.18340320554538161, + "grad_norm": 0.11389664560556412, + "learning_rate": 0.00010238489214093787, + "loss": 1.562631607055664, + "step": 60590 + }, + { + "epoch": 0.18343347509572747, + "grad_norm": 0.1439622938632965, + "learning_rate": 0.00010238109695930047, + "loss": 1.6133563995361329, + "step": 60600 + }, + { + "epoch": 0.1834637446460733, + "grad_norm": 0.12584474682807922, + "learning_rate": 0.00010237730177766308, + "loss": 1.5886804580688476, + "step": 60610 + }, + { + "epoch": 0.18349401419641911, + "grad_norm": 0.13222794234752655, + "learning_rate": 0.0001023735065960257, + "loss": 1.5894380569458009, + "step": 60620 + }, + { + "epoch": 0.18352428374676494, + "grad_norm": 0.12802961468696594, + "learning_rate": 0.00010236971141438829, + "loss": 1.5511085510253906, + "step": 60630 + }, + { + "epoch": 0.18355455329711076, + "grad_norm": 0.11636152118444443, + "learning_rate": 0.00010236591623275091, + "loss": 1.5651689529418946, + "step": 60640 + }, + { + "epoch": 0.1835848228474566, + "grad_norm": 0.11611853539943695, + "learning_rate": 0.0001023621210511135, + "loss": 1.6015865325927734, + "step": 60650 + }, + { + "epoch": 0.18361509239780244, + "grad_norm": 0.11600940674543381, + "learning_rate": 0.00010235832586947612, + "loss": 1.6466808319091797, + "step": 60660 + }, + { + "epoch": 0.18364536194814826, + "grad_norm": 0.1414565145969391, + "learning_rate": 0.00010235453068783871, + "loss": 1.570368194580078, + "step": 60670 + }, + { + "epoch": 0.18367563149849409, + "grad_norm": 0.12751871347427368, + "learning_rate": 0.00010235073550620133, + "loss": 1.5759888648986817, + "step": 60680 + }, + { + "epoch": 0.1837059010488399, + "grad_norm": 0.1463731974363327, + "learning_rate": 0.00010234694032456393, + "loss": 1.5710926055908203, + "step": 60690 + }, + { + "epoch": 0.18373617059918576, + "grad_norm": 0.1155533492565155, + "learning_rate": 0.00010234314514292655, + "loss": 1.6101329803466797, + "step": 60700 + }, + { + "epoch": 0.18376644014953158, + "grad_norm": 0.1286051869392395, + "learning_rate": 0.00010233934996128914, + "loss": 1.606787109375, + "step": 60710 + }, + { + "epoch": 0.1837967096998774, + "grad_norm": 0.11992402374744415, + "learning_rate": 0.00010233555477965176, + "loss": 1.632692527770996, + "step": 60720 + }, + { + "epoch": 0.18382697925022323, + "grad_norm": 0.11531573534011841, + "learning_rate": 0.00010233175959801435, + "loss": 1.5884273529052735, + "step": 60730 + }, + { + "epoch": 0.18385724880056906, + "grad_norm": 0.11569986492395401, + "learning_rate": 0.00010232796441637697, + "loss": 1.6218828201293944, + "step": 60740 + }, + { + "epoch": 0.1838875183509149, + "grad_norm": 0.13564041256904602, + "learning_rate": 0.00010232416923473959, + "loss": 1.572570514678955, + "step": 60750 + }, + { + "epoch": 0.18391778790126073, + "grad_norm": 0.1303900182247162, + "learning_rate": 0.00010232037405310218, + "loss": 1.608574676513672, + "step": 60760 + }, + { + "epoch": 0.18394805745160656, + "grad_norm": 0.141214519739151, + "learning_rate": 0.0001023165788714648, + "loss": 1.558059310913086, + "step": 60770 + }, + { + "epoch": 0.18397832700195238, + "grad_norm": 0.12548288702964783, + "learning_rate": 0.0001023127836898274, + "loss": 1.6210056304931642, + "step": 60780 + }, + { + "epoch": 0.1840085965522982, + "grad_norm": 0.13192002475261688, + "learning_rate": 0.00010230898850819001, + "loss": 1.5842933654785156, + "step": 60790 + }, + { + "epoch": 0.18403886610264406, + "grad_norm": 0.12008152157068253, + "learning_rate": 0.0001023051933265526, + "loss": 1.6211349487304687, + "step": 60800 + }, + { + "epoch": 0.18406913565298988, + "grad_norm": 0.12019389867782593, + "learning_rate": 0.00010230139814491523, + "loss": 1.5657185554504394, + "step": 60810 + }, + { + "epoch": 0.1840994052033357, + "grad_norm": 0.12132236361503601, + "learning_rate": 0.00010229760296327782, + "loss": 1.608056640625, + "step": 60820 + }, + { + "epoch": 0.18412967475368153, + "grad_norm": 0.1266157478094101, + "learning_rate": 0.00010229380778164044, + "loss": 1.5858614921569825, + "step": 60830 + }, + { + "epoch": 0.18415994430402735, + "grad_norm": 0.12725356221199036, + "learning_rate": 0.00010229001260000303, + "loss": 1.6113950729370117, + "step": 60840 + }, + { + "epoch": 0.1841902138543732, + "grad_norm": 0.12775738537311554, + "learning_rate": 0.00010228621741836565, + "loss": 1.591816520690918, + "step": 60850 + }, + { + "epoch": 0.18422048340471903, + "grad_norm": 0.14120055735111237, + "learning_rate": 0.00010228242223672824, + "loss": 1.583637809753418, + "step": 60860 + }, + { + "epoch": 0.18425075295506485, + "grad_norm": 0.12890027463436127, + "learning_rate": 0.00010227862705509086, + "loss": 1.6058074951171875, + "step": 60870 + }, + { + "epoch": 0.18428102250541067, + "grad_norm": 0.13172705471515656, + "learning_rate": 0.00010227483187345347, + "loss": 1.6094064712524414, + "step": 60880 + }, + { + "epoch": 0.1843112920557565, + "grad_norm": 0.12596435844898224, + "learning_rate": 0.00010227103669181607, + "loss": 1.592928981781006, + "step": 60890 + }, + { + "epoch": 0.18434156160610235, + "grad_norm": 0.1258416771888733, + "learning_rate": 0.00010226724151017868, + "loss": 1.5825390815734863, + "step": 60900 + }, + { + "epoch": 0.18437183115644817, + "grad_norm": 0.1269461214542389, + "learning_rate": 0.00010226344632854128, + "loss": 1.6493961334228515, + "step": 60910 + }, + { + "epoch": 0.184402100706794, + "grad_norm": 0.13260483741760254, + "learning_rate": 0.00010225965114690389, + "loss": 1.5808635711669923, + "step": 60920 + }, + { + "epoch": 0.18443237025713982, + "grad_norm": 0.13022024929523468, + "learning_rate": 0.0001022558559652665, + "loss": 1.5588191986083983, + "step": 60930 + }, + { + "epoch": 0.18446263980748567, + "grad_norm": 0.12526747584342957, + "learning_rate": 0.0001022520607836291, + "loss": 1.5799976348876954, + "step": 60940 + }, + { + "epoch": 0.1844929093578315, + "grad_norm": 0.14880581200122833, + "learning_rate": 0.00010224826560199171, + "loss": 1.5818955421447753, + "step": 60950 + }, + { + "epoch": 0.18452317890817732, + "grad_norm": 0.1261218786239624, + "learning_rate": 0.00010224447042035433, + "loss": 1.5875871658325196, + "step": 60960 + }, + { + "epoch": 0.18455344845852314, + "grad_norm": 0.14166459441184998, + "learning_rate": 0.00010224067523871692, + "loss": 1.5851265907287597, + "step": 60970 + }, + { + "epoch": 0.18458371800886897, + "grad_norm": 0.13111518323421478, + "learning_rate": 0.00010223688005707954, + "loss": 1.574619197845459, + "step": 60980 + }, + { + "epoch": 0.18461398755921482, + "grad_norm": 0.13223682343959808, + "learning_rate": 0.00010223308487544215, + "loss": 1.563649845123291, + "step": 60990 + }, + { + "epoch": 0.18464425710956064, + "grad_norm": 0.11265280097723007, + "learning_rate": 0.00010222928969380475, + "loss": 1.6290006637573242, + "step": 61000 + }, + { + "epoch": 0.18464425710956064, + "eval_loss": 1.6170766353607178, + "eval_runtime": 28.1511, + "eval_samples_per_second": 17.761, + "eval_steps_per_second": 1.137, + "step": 61000 + }, + { + "epoch": 0.18467452665990647, + "grad_norm": 0.11621015518903732, + "learning_rate": 0.00010222549451216736, + "loss": 1.611574935913086, + "step": 61010 + }, + { + "epoch": 0.1847047962102523, + "grad_norm": 0.12916527688503265, + "learning_rate": 0.00010222169933052996, + "loss": 1.5916943550109863, + "step": 61020 + }, + { + "epoch": 0.18473506576059812, + "grad_norm": 0.12102814763784409, + "learning_rate": 0.00010221790414889257, + "loss": 1.5988029479980468, + "step": 61030 + }, + { + "epoch": 0.18476533531094397, + "grad_norm": 0.11272992193698883, + "learning_rate": 0.00010221410896725518, + "loss": 1.6268934249877929, + "step": 61040 + }, + { + "epoch": 0.1847956048612898, + "grad_norm": 0.14037775993347168, + "learning_rate": 0.00010221031378561778, + "loss": 1.6238147735595703, + "step": 61050 + }, + { + "epoch": 0.18482587441163562, + "grad_norm": 0.13501611351966858, + "learning_rate": 0.00010220651860398039, + "loss": 1.5924263000488281, + "step": 61060 + }, + { + "epoch": 0.18485614396198144, + "grad_norm": 0.13838915526866913, + "learning_rate": 0.000102202723422343, + "loss": 1.6154260635375977, + "step": 61070 + }, + { + "epoch": 0.18488641351232726, + "grad_norm": 0.12640339136123657, + "learning_rate": 0.0001021989282407056, + "loss": 1.5799028396606445, + "step": 61080 + }, + { + "epoch": 0.18491668306267312, + "grad_norm": 0.1367776095867157, + "learning_rate": 0.0001021951330590682, + "loss": 1.633321762084961, + "step": 61090 + }, + { + "epoch": 0.18494695261301894, + "grad_norm": 0.1374058723449707, + "learning_rate": 0.00010219133787743081, + "loss": 1.5875572204589843, + "step": 61100 + }, + { + "epoch": 0.18497722216336476, + "grad_norm": 0.1524210423231125, + "learning_rate": 0.00010218754269579342, + "loss": 1.5673862457275392, + "step": 61110 + }, + { + "epoch": 0.1850074917137106, + "grad_norm": 0.1274242252111435, + "learning_rate": 0.00010218374751415604, + "loss": 1.6232213973999023, + "step": 61120 + }, + { + "epoch": 0.1850377612640564, + "grad_norm": 0.12434802949428558, + "learning_rate": 0.00010217995233251863, + "loss": 1.597765064239502, + "step": 61130 + }, + { + "epoch": 0.18506803081440226, + "grad_norm": 0.12235406041145325, + "learning_rate": 0.00010217615715088125, + "loss": 1.5900507926940919, + "step": 61140 + }, + { + "epoch": 0.1850983003647481, + "grad_norm": 0.15338462591171265, + "learning_rate": 0.00010217236196924384, + "loss": 1.585474395751953, + "step": 61150 + }, + { + "epoch": 0.1851285699150939, + "grad_norm": 0.12689295411109924, + "learning_rate": 0.00010216856678760646, + "loss": 1.6307464599609376, + "step": 61160 + }, + { + "epoch": 0.18515883946543973, + "grad_norm": 0.13408567011356354, + "learning_rate": 0.00010216477160596907, + "loss": 1.6336225509643554, + "step": 61170 + }, + { + "epoch": 0.18518910901578556, + "grad_norm": 0.15800882875919342, + "learning_rate": 0.00010216097642433167, + "loss": 1.6050285339355468, + "step": 61180 + }, + { + "epoch": 0.1852193785661314, + "grad_norm": 0.1408676654100418, + "learning_rate": 0.00010215718124269428, + "loss": 1.5700029373168944, + "step": 61190 + }, + { + "epoch": 0.18524964811647723, + "grad_norm": 0.1890072375535965, + "learning_rate": 0.00010215338606105688, + "loss": 1.5899698257446289, + "step": 61200 + }, + { + "epoch": 0.18527991766682306, + "grad_norm": 0.1244792640209198, + "learning_rate": 0.00010214959087941949, + "loss": 1.5960928916931152, + "step": 61210 + }, + { + "epoch": 0.18531018721716888, + "grad_norm": 0.12290588021278381, + "learning_rate": 0.0001021457956977821, + "loss": 1.5894964218139649, + "step": 61220 + }, + { + "epoch": 0.1853404567675147, + "grad_norm": 0.14021626114845276, + "learning_rate": 0.00010214200051614472, + "loss": 1.593926239013672, + "step": 61230 + }, + { + "epoch": 0.18537072631786056, + "grad_norm": 0.13712741434574127, + "learning_rate": 0.00010213820533450731, + "loss": 1.6087804794311524, + "step": 61240 + }, + { + "epoch": 0.18540099586820638, + "grad_norm": 0.12325738370418549, + "learning_rate": 0.00010213441015286993, + "loss": 1.601632308959961, + "step": 61250 + }, + { + "epoch": 0.1854312654185522, + "grad_norm": 0.13739340007305145, + "learning_rate": 0.00010213061497123252, + "loss": 1.6011978149414063, + "step": 61260 + }, + { + "epoch": 0.18546153496889803, + "grad_norm": 0.1256880760192871, + "learning_rate": 0.00010212681978959514, + "loss": 1.635244369506836, + "step": 61270 + }, + { + "epoch": 0.18549180451924388, + "grad_norm": 0.1258905827999115, + "learning_rate": 0.00010212302460795773, + "loss": 1.605797576904297, + "step": 61280 + }, + { + "epoch": 0.1855220740695897, + "grad_norm": 0.14794616401195526, + "learning_rate": 0.00010211922942632035, + "loss": 1.6000051498413086, + "step": 61290 + }, + { + "epoch": 0.18555234361993553, + "grad_norm": 0.11884117126464844, + "learning_rate": 0.00010211543424468294, + "loss": 1.5836435317993165, + "step": 61300 + }, + { + "epoch": 0.18558261317028135, + "grad_norm": 0.12550100684165955, + "learning_rate": 0.00010211163906304556, + "loss": 1.617718505859375, + "step": 61310 + }, + { + "epoch": 0.18561288272062718, + "grad_norm": 0.1292741894721985, + "learning_rate": 0.00010210784388140816, + "loss": 1.5829330444335938, + "step": 61320 + }, + { + "epoch": 0.18564315227097303, + "grad_norm": 0.1220034658908844, + "learning_rate": 0.00010210404869977078, + "loss": 1.597503662109375, + "step": 61330 + }, + { + "epoch": 0.18567342182131885, + "grad_norm": 0.12340231984853745, + "learning_rate": 0.00010210025351813337, + "loss": 1.564626121520996, + "step": 61340 + }, + { + "epoch": 0.18570369137166468, + "grad_norm": 0.1172836422920227, + "learning_rate": 0.00010209645833649599, + "loss": 1.5964084625244142, + "step": 61350 + }, + { + "epoch": 0.1857339609220105, + "grad_norm": 0.13220201432704926, + "learning_rate": 0.00010209266315485861, + "loss": 1.5802955627441406, + "step": 61360 + }, + { + "epoch": 0.18576423047235632, + "grad_norm": 0.1123276874423027, + "learning_rate": 0.0001020888679732212, + "loss": 1.6126522064208983, + "step": 61370 + }, + { + "epoch": 0.18579450002270217, + "grad_norm": 0.1295737624168396, + "learning_rate": 0.00010208507279158382, + "loss": 1.591942596435547, + "step": 61380 + }, + { + "epoch": 0.185824769573048, + "grad_norm": 0.13592547178268433, + "learning_rate": 0.00010208127760994641, + "loss": 1.573124313354492, + "step": 61390 + }, + { + "epoch": 0.18585503912339382, + "grad_norm": 0.12482651323080063, + "learning_rate": 0.00010207748242830903, + "loss": 1.5686901092529297, + "step": 61400 + }, + { + "epoch": 0.18588530867373965, + "grad_norm": 0.13076111674308777, + "learning_rate": 0.00010207368724667162, + "loss": 1.5969205856323243, + "step": 61410 + }, + { + "epoch": 0.18591557822408547, + "grad_norm": 0.12948626279830933, + "learning_rate": 0.00010206989206503424, + "loss": 1.6164161682128906, + "step": 61420 + }, + { + "epoch": 0.18594584777443132, + "grad_norm": 0.11835328489542007, + "learning_rate": 0.00010206609688339683, + "loss": 1.582012939453125, + "step": 61430 + }, + { + "epoch": 0.18597611732477715, + "grad_norm": 0.1158580556511879, + "learning_rate": 0.00010206230170175945, + "loss": 1.6051769256591797, + "step": 61440 + }, + { + "epoch": 0.18600638687512297, + "grad_norm": 0.1348319798707962, + "learning_rate": 0.00010205850652012205, + "loss": 1.6239229202270509, + "step": 61450 + }, + { + "epoch": 0.1860366564254688, + "grad_norm": 0.11581241339445114, + "learning_rate": 0.00010205471133848467, + "loss": 1.6181093215942384, + "step": 61460 + }, + { + "epoch": 0.18606692597581462, + "grad_norm": 0.1265774965286255, + "learning_rate": 0.00010205091615684726, + "loss": 1.5696424484252929, + "step": 61470 + }, + { + "epoch": 0.18609719552616047, + "grad_norm": 0.12545107305049896, + "learning_rate": 0.00010204712097520988, + "loss": 1.5878520011901855, + "step": 61480 + }, + { + "epoch": 0.1861274650765063, + "grad_norm": 0.13314951956272125, + "learning_rate": 0.00010204332579357248, + "loss": 1.5649519920349122, + "step": 61490 + }, + { + "epoch": 0.18615773462685212, + "grad_norm": 0.12222481518983841, + "learning_rate": 0.00010203953061193509, + "loss": 1.600668716430664, + "step": 61500 + }, + { + "epoch": 0.18615773462685212, + "eval_loss": 1.5971583127975464, + "eval_runtime": 28.0976, + "eval_samples_per_second": 17.795, + "eval_steps_per_second": 1.139, + "step": 61500 + }, + { + "epoch": 0.18618800417719794, + "grad_norm": 0.16273614764213562, + "learning_rate": 0.0001020357354302977, + "loss": 1.5735546112060548, + "step": 61510 + }, + { + "epoch": 0.18621827372754376, + "grad_norm": 0.1225290521979332, + "learning_rate": 0.0001020319402486603, + "loss": 1.6171041488647462, + "step": 61520 + }, + { + "epoch": 0.18624854327788962, + "grad_norm": 0.12086047977209091, + "learning_rate": 0.00010202814506702291, + "loss": 1.5994305610656738, + "step": 61530 + }, + { + "epoch": 0.18627881282823544, + "grad_norm": 0.11414093524217606, + "learning_rate": 0.00010202434988538551, + "loss": 1.5861791610717773, + "step": 61540 + }, + { + "epoch": 0.18630908237858126, + "grad_norm": 0.12693707644939423, + "learning_rate": 0.00010202055470374812, + "loss": 1.5622827529907226, + "step": 61550 + }, + { + "epoch": 0.1863393519289271, + "grad_norm": 0.1196897104382515, + "learning_rate": 0.00010201675952211073, + "loss": 1.6157272338867188, + "step": 61560 + }, + { + "epoch": 0.1863696214792729, + "grad_norm": 0.1261812448501587, + "learning_rate": 0.00010201296434047335, + "loss": 1.6049488067626954, + "step": 61570 + }, + { + "epoch": 0.18639989102961876, + "grad_norm": 0.11785703897476196, + "learning_rate": 0.00010200916915883594, + "loss": 1.6255659103393554, + "step": 61580 + }, + { + "epoch": 0.1864301605799646, + "grad_norm": 0.12389887869358063, + "learning_rate": 0.00010200537397719856, + "loss": 1.609481430053711, + "step": 61590 + }, + { + "epoch": 0.1864604301303104, + "grad_norm": 0.1121731773018837, + "learning_rate": 0.00010200157879556115, + "loss": 1.570139503479004, + "step": 61600 + }, + { + "epoch": 0.18649069968065624, + "grad_norm": 0.12132184952497482, + "learning_rate": 0.00010199778361392377, + "loss": 1.6147584915161133, + "step": 61610 + }, + { + "epoch": 0.1865209692310021, + "grad_norm": 0.11286064237356186, + "learning_rate": 0.00010199398843228638, + "loss": 1.5923519134521484, + "step": 61620 + }, + { + "epoch": 0.1865512387813479, + "grad_norm": 0.1373278945684433, + "learning_rate": 0.00010199019325064898, + "loss": 1.6178972244262695, + "step": 61630 + }, + { + "epoch": 0.18658150833169374, + "grad_norm": 0.11362288892269135, + "learning_rate": 0.00010198639806901159, + "loss": 1.5632390022277831, + "step": 61640 + }, + { + "epoch": 0.18661177788203956, + "grad_norm": 0.12761780619621277, + "learning_rate": 0.00010198260288737419, + "loss": 1.5782549858093262, + "step": 61650 + }, + { + "epoch": 0.18664204743238538, + "grad_norm": 0.12184067815542221, + "learning_rate": 0.0001019788077057368, + "loss": 1.6181255340576173, + "step": 61660 + }, + { + "epoch": 0.18667231698273123, + "grad_norm": 0.1310606598854065, + "learning_rate": 0.0001019750125240994, + "loss": 1.5962484359741211, + "step": 61670 + }, + { + "epoch": 0.18670258653307706, + "grad_norm": 0.13947750627994537, + "learning_rate": 0.00010197121734246201, + "loss": 1.5716590881347656, + "step": 61680 + }, + { + "epoch": 0.18673285608342288, + "grad_norm": 0.12655523419380188, + "learning_rate": 0.00010196742216082462, + "loss": 1.6049228668212892, + "step": 61690 + }, + { + "epoch": 0.1867631256337687, + "grad_norm": 0.12351619452238083, + "learning_rate": 0.00010196362697918722, + "loss": 1.5828323364257812, + "step": 61700 + }, + { + "epoch": 0.18679339518411453, + "grad_norm": 0.14277178049087524, + "learning_rate": 0.00010195983179754983, + "loss": 1.55218448638916, + "step": 61710 + }, + { + "epoch": 0.18682366473446038, + "grad_norm": 0.1536843627691269, + "learning_rate": 0.00010195603661591243, + "loss": 1.5666300773620605, + "step": 61720 + }, + { + "epoch": 0.1868539342848062, + "grad_norm": 0.12287422269582748, + "learning_rate": 0.00010195224143427505, + "loss": 1.5418584823608399, + "step": 61730 + }, + { + "epoch": 0.18688420383515203, + "grad_norm": 0.12331221252679825, + "learning_rate": 0.00010194844625263765, + "loss": 1.5738405227661132, + "step": 61740 + }, + { + "epoch": 0.18691447338549785, + "grad_norm": 0.12701953947544098, + "learning_rate": 0.00010194465107100027, + "loss": 1.586904525756836, + "step": 61750 + }, + { + "epoch": 0.18694474293584368, + "grad_norm": 0.12507787346839905, + "learning_rate": 0.00010194085588936286, + "loss": 1.57528076171875, + "step": 61760 + }, + { + "epoch": 0.18697501248618953, + "grad_norm": 0.13240376114845276, + "learning_rate": 0.00010193706070772548, + "loss": 1.6129375457763673, + "step": 61770 + }, + { + "epoch": 0.18700528203653535, + "grad_norm": 0.11787473410367966, + "learning_rate": 0.00010193326552608808, + "loss": 1.598724937438965, + "step": 61780 + }, + { + "epoch": 0.18703555158688118, + "grad_norm": 0.1344074010848999, + "learning_rate": 0.00010192947034445069, + "loss": 1.5862091064453125, + "step": 61790 + }, + { + "epoch": 0.187065821137227, + "grad_norm": 0.1707175076007843, + "learning_rate": 0.0001019256751628133, + "loss": 1.6151159286499024, + "step": 61800 + }, + { + "epoch": 0.18709609068757282, + "grad_norm": 0.13296714425086975, + "learning_rate": 0.0001019218799811759, + "loss": 1.6037097930908204, + "step": 61810 + }, + { + "epoch": 0.18712636023791868, + "grad_norm": 0.12214051187038422, + "learning_rate": 0.00010191808479953851, + "loss": 1.6041481018066406, + "step": 61820 + }, + { + "epoch": 0.1871566297882645, + "grad_norm": 0.12030491977930069, + "learning_rate": 0.00010191428961790111, + "loss": 1.6043760299682617, + "step": 61830 + }, + { + "epoch": 0.18718689933861032, + "grad_norm": 0.13777974247932434, + "learning_rate": 0.00010191049443626372, + "loss": 1.6200439453125, + "step": 61840 + }, + { + "epoch": 0.18721716888895615, + "grad_norm": 0.131963312625885, + "learning_rate": 0.00010190669925462633, + "loss": 1.5288446426391602, + "step": 61850 + }, + { + "epoch": 0.18724743843930197, + "grad_norm": 0.12605778872966766, + "learning_rate": 0.00010190290407298894, + "loss": 1.5728488922119142, + "step": 61860 + }, + { + "epoch": 0.18727770798964782, + "grad_norm": 0.11011070758104324, + "learning_rate": 0.00010189910889135154, + "loss": 1.5671068191528321, + "step": 61870 + }, + { + "epoch": 0.18730797753999365, + "grad_norm": 0.13149480521678925, + "learning_rate": 0.00010189531370971416, + "loss": 1.6215091705322267, + "step": 61880 + }, + { + "epoch": 0.18733824709033947, + "grad_norm": 0.11409366130828857, + "learning_rate": 0.00010189151852807675, + "loss": 1.6058591842651366, + "step": 61890 + }, + { + "epoch": 0.1873685166406853, + "grad_norm": 0.1267775595188141, + "learning_rate": 0.00010188772334643937, + "loss": 1.5846576690673828, + "step": 61900 + }, + { + "epoch": 0.18739878619103112, + "grad_norm": 0.12365683168172836, + "learning_rate": 0.00010188392816480196, + "loss": 1.5664485931396483, + "step": 61910 + }, + { + "epoch": 0.18742905574137697, + "grad_norm": 0.1330600380897522, + "learning_rate": 0.00010188013298316458, + "loss": 1.5715509414672852, + "step": 61920 + }, + { + "epoch": 0.1874593252917228, + "grad_norm": 0.1386309415102005, + "learning_rate": 0.00010187633780152717, + "loss": 1.5803142547607423, + "step": 61930 + }, + { + "epoch": 0.18748959484206862, + "grad_norm": 0.12063786387443542, + "learning_rate": 0.00010187254261988979, + "loss": 1.596656322479248, + "step": 61940 + }, + { + "epoch": 0.18751986439241444, + "grad_norm": 0.1338697075843811, + "learning_rate": 0.00010186874743825238, + "loss": 1.6046533584594727, + "step": 61950 + }, + { + "epoch": 0.18755013394276027, + "grad_norm": 0.11783741414546967, + "learning_rate": 0.000101864952256615, + "loss": 1.5585831642150878, + "step": 61960 + }, + { + "epoch": 0.18758040349310612, + "grad_norm": 0.12581884860992432, + "learning_rate": 0.00010186115707497762, + "loss": 1.5937606811523437, + "step": 61970 + }, + { + "epoch": 0.18761067304345194, + "grad_norm": 0.1406295895576477, + "learning_rate": 0.00010185736189334022, + "loss": 1.6043140411376953, + "step": 61980 + }, + { + "epoch": 0.18764094259379777, + "grad_norm": 0.11834564059972763, + "learning_rate": 0.00010185356671170284, + "loss": 1.5979015350341796, + "step": 61990 + }, + { + "epoch": 0.1876712121441436, + "grad_norm": 0.12168581038713455, + "learning_rate": 0.00010184977153006543, + "loss": 1.5970285415649415, + "step": 62000 + }, + { + "epoch": 0.1876712121441436, + "eval_loss": 1.5800741910934448, + "eval_runtime": 27.9202, + "eval_samples_per_second": 17.908, + "eval_steps_per_second": 1.146, + "step": 62000 + }, + { + "epoch": 0.18770148169448944, + "grad_norm": 0.12255113571882248, + "learning_rate": 0.00010184597634842805, + "loss": 1.5923758506774903, + "step": 62010 + }, + { + "epoch": 0.18773175124483527, + "grad_norm": 0.12677468359470367, + "learning_rate": 0.00010184218116679064, + "loss": 1.6146482467651366, + "step": 62020 + }, + { + "epoch": 0.1877620207951811, + "grad_norm": 0.12918508052825928, + "learning_rate": 0.00010183838598515326, + "loss": 1.6136472702026368, + "step": 62030 + }, + { + "epoch": 0.1877922903455269, + "grad_norm": 0.120759978890419, + "learning_rate": 0.00010183459080351585, + "loss": 1.5654754638671875, + "step": 62040 + }, + { + "epoch": 0.18782255989587274, + "grad_norm": 0.12416929006576538, + "learning_rate": 0.00010183079562187847, + "loss": 1.6022983551025392, + "step": 62050 + }, + { + "epoch": 0.1878528294462186, + "grad_norm": 0.13181941211223602, + "learning_rate": 0.00010182700044024106, + "loss": 1.5824838638305665, + "step": 62060 + }, + { + "epoch": 0.1878830989965644, + "grad_norm": 0.1299835443496704, + "learning_rate": 0.00010182320525860368, + "loss": 1.6113079071044922, + "step": 62070 + }, + { + "epoch": 0.18791336854691024, + "grad_norm": 0.13668769598007202, + "learning_rate": 0.00010181941007696628, + "loss": 1.6159072875976563, + "step": 62080 + }, + { + "epoch": 0.18794363809725606, + "grad_norm": 0.14659364521503448, + "learning_rate": 0.0001018156148953289, + "loss": 1.5749267578125, + "step": 62090 + }, + { + "epoch": 0.18797390764760188, + "grad_norm": 0.12599562108516693, + "learning_rate": 0.0001018118197136915, + "loss": 1.5674301147460938, + "step": 62100 + }, + { + "epoch": 0.18800417719794774, + "grad_norm": 0.12384675443172455, + "learning_rate": 0.00010180802453205411, + "loss": 1.5761897087097168, + "step": 62110 + }, + { + "epoch": 0.18803444674829356, + "grad_norm": 0.12351565808057785, + "learning_rate": 0.00010180422935041671, + "loss": 1.609335708618164, + "step": 62120 + }, + { + "epoch": 0.18806471629863938, + "grad_norm": 0.12178421020507812, + "learning_rate": 0.00010180043416877932, + "loss": 1.5882888793945313, + "step": 62130 + }, + { + "epoch": 0.1880949858489852, + "grad_norm": 0.12013337761163712, + "learning_rate": 0.00010179663898714193, + "loss": 1.6030113220214843, + "step": 62140 + }, + { + "epoch": 0.18812525539933103, + "grad_norm": 0.13563592731952667, + "learning_rate": 0.00010179284380550453, + "loss": 1.5940213203430176, + "step": 62150 + }, + { + "epoch": 0.18815552494967688, + "grad_norm": 0.1243252381682396, + "learning_rate": 0.00010178904862386714, + "loss": 1.6139453887939452, + "step": 62160 + }, + { + "epoch": 0.1881857945000227, + "grad_norm": 0.11872807890176773, + "learning_rate": 0.00010178525344222974, + "loss": 1.5829904556274415, + "step": 62170 + }, + { + "epoch": 0.18821606405036853, + "grad_norm": 0.14396052062511444, + "learning_rate": 0.00010178145826059236, + "loss": 1.5927127838134765, + "step": 62180 + }, + { + "epoch": 0.18824633360071436, + "grad_norm": 0.1376146525144577, + "learning_rate": 0.00010177766307895495, + "loss": 1.5902084350585937, + "step": 62190 + }, + { + "epoch": 0.18827660315106018, + "grad_norm": 0.13277651369571686, + "learning_rate": 0.00010177386789731757, + "loss": 1.5801342964172362, + "step": 62200 + }, + { + "epoch": 0.18830687270140603, + "grad_norm": 0.11530670523643494, + "learning_rate": 0.00010177007271568017, + "loss": 1.6234041213989259, + "step": 62210 + }, + { + "epoch": 0.18833714225175185, + "grad_norm": 0.12096758931875229, + "learning_rate": 0.00010176627753404279, + "loss": 1.5922243118286132, + "step": 62220 + }, + { + "epoch": 0.18836741180209768, + "grad_norm": 0.11915457993745804, + "learning_rate": 0.00010176248235240539, + "loss": 1.5899877548217773, + "step": 62230 + }, + { + "epoch": 0.1883976813524435, + "grad_norm": 0.12275560200214386, + "learning_rate": 0.000101758687170768, + "loss": 1.5748477935791017, + "step": 62240 + }, + { + "epoch": 0.18842795090278933, + "grad_norm": 0.13833989202976227, + "learning_rate": 0.0001017548919891306, + "loss": 1.593644618988037, + "step": 62250 + }, + { + "epoch": 0.18845822045313518, + "grad_norm": 0.1261686235666275, + "learning_rate": 0.00010175109680749321, + "loss": 1.5620637893676759, + "step": 62260 + }, + { + "epoch": 0.188488490003481, + "grad_norm": 0.14161445200443268, + "learning_rate": 0.00010174730162585582, + "loss": 1.603866195678711, + "step": 62270 + }, + { + "epoch": 0.18851875955382683, + "grad_norm": 0.12803280353546143, + "learning_rate": 0.00010174350644421842, + "loss": 1.5739276885986329, + "step": 62280 + }, + { + "epoch": 0.18854902910417265, + "grad_norm": 0.12878945469856262, + "learning_rate": 0.00010173971126258103, + "loss": 1.6060237884521484, + "step": 62290 + }, + { + "epoch": 0.18857929865451847, + "grad_norm": 0.1214522272348404, + "learning_rate": 0.00010173591608094363, + "loss": 1.5964475631713868, + "step": 62300 + }, + { + "epoch": 0.18860956820486433, + "grad_norm": 0.12580837309360504, + "learning_rate": 0.00010173212089930624, + "loss": 1.577756404876709, + "step": 62310 + }, + { + "epoch": 0.18863983775521015, + "grad_norm": 0.12939701974391937, + "learning_rate": 0.00010172832571766885, + "loss": 1.5822851181030273, + "step": 62320 + }, + { + "epoch": 0.18867010730555597, + "grad_norm": 0.14257964491844177, + "learning_rate": 0.00010172453053603145, + "loss": 1.5755548477172852, + "step": 62330 + }, + { + "epoch": 0.1887003768559018, + "grad_norm": 0.1400584578514099, + "learning_rate": 0.00010172073535439407, + "loss": 1.6514251708984375, + "step": 62340 + }, + { + "epoch": 0.18873064640624765, + "grad_norm": 0.12212856858968735, + "learning_rate": 0.00010171694017275666, + "loss": 1.6102779388427735, + "step": 62350 + }, + { + "epoch": 0.18876091595659347, + "grad_norm": 0.13677342236042023, + "learning_rate": 0.00010171314499111928, + "loss": 1.5763367652893066, + "step": 62360 + }, + { + "epoch": 0.1887911855069393, + "grad_norm": 0.12134196609258652, + "learning_rate": 0.00010170934980948188, + "loss": 1.6211875915527343, + "step": 62370 + }, + { + "epoch": 0.18882145505728512, + "grad_norm": 0.13506533205509186, + "learning_rate": 0.0001017055546278445, + "loss": 1.58632230758667, + "step": 62380 + }, + { + "epoch": 0.18885172460763094, + "grad_norm": 0.12960323691368103, + "learning_rate": 0.0001017017594462071, + "loss": 1.5798428535461426, + "step": 62390 + }, + { + "epoch": 0.1888819941579768, + "grad_norm": 0.12814126908779144, + "learning_rate": 0.00010169796426456971, + "loss": 1.5906230926513671, + "step": 62400 + }, + { + "epoch": 0.18891226370832262, + "grad_norm": 0.12146014720201492, + "learning_rate": 0.00010169416908293231, + "loss": 1.5723533630371094, + "step": 62410 + }, + { + "epoch": 0.18894253325866844, + "grad_norm": 0.1116926521062851, + "learning_rate": 0.00010169037390129492, + "loss": 1.5580477714538574, + "step": 62420 + }, + { + "epoch": 0.18897280280901427, + "grad_norm": 0.12522770464420319, + "learning_rate": 0.00010168657871965752, + "loss": 1.5954776763916017, + "step": 62430 + }, + { + "epoch": 0.1890030723593601, + "grad_norm": 0.11324042081832886, + "learning_rate": 0.00010168278353802013, + "loss": 1.5318132400512696, + "step": 62440 + }, + { + "epoch": 0.18903334190970594, + "grad_norm": 0.14688082039356232, + "learning_rate": 0.00010167898835638274, + "loss": 1.578294849395752, + "step": 62450 + }, + { + "epoch": 0.18906361146005177, + "grad_norm": 0.13073128461837769, + "learning_rate": 0.00010167519317474534, + "loss": 1.6162578582763671, + "step": 62460 + }, + { + "epoch": 0.1890938810103976, + "grad_norm": 0.11727927625179291, + "learning_rate": 0.00010167139799310796, + "loss": 1.6334983825683593, + "step": 62470 + }, + { + "epoch": 0.18912415056074341, + "grad_norm": 0.12195513397455215, + "learning_rate": 0.00010166760281147055, + "loss": 1.5480178833007812, + "step": 62480 + }, + { + "epoch": 0.18915442011108924, + "grad_norm": 0.13873520493507385, + "learning_rate": 0.00010166380762983317, + "loss": 1.5842092514038086, + "step": 62490 + }, + { + "epoch": 0.1891846896614351, + "grad_norm": 0.12527745962142944, + "learning_rate": 0.00010166001244819577, + "loss": 1.6068695068359375, + "step": 62500 + }, + { + "epoch": 0.1891846896614351, + "eval_loss": 1.6150072813034058, + "eval_runtime": 28.1387, + "eval_samples_per_second": 17.769, + "eval_steps_per_second": 1.137, + "step": 62500 + }, + { + "epoch": 0.18921495921178091, + "grad_norm": 0.12534993886947632, + "learning_rate": 0.00010165621726655839, + "loss": 1.5577836990356446, + "step": 62510 + }, + { + "epoch": 0.18924522876212674, + "grad_norm": 0.14093619585037231, + "learning_rate": 0.00010165242208492098, + "loss": 1.5699193954467774, + "step": 62520 + }, + { + "epoch": 0.18927549831247256, + "grad_norm": 0.11901417374610901, + "learning_rate": 0.0001016486269032836, + "loss": 1.6060173034667968, + "step": 62530 + }, + { + "epoch": 0.18930576786281839, + "grad_norm": 0.130356565117836, + "learning_rate": 0.00010164483172164619, + "loss": 1.5401748657226562, + "step": 62540 + }, + { + "epoch": 0.18933603741316424, + "grad_norm": 0.11903531849384308, + "learning_rate": 0.00010164103654000881, + "loss": 1.5870969772338868, + "step": 62550 + }, + { + "epoch": 0.18936630696351006, + "grad_norm": 0.12510618567466736, + "learning_rate": 0.0001016372413583714, + "loss": 1.610920524597168, + "step": 62560 + }, + { + "epoch": 0.18939657651385589, + "grad_norm": 0.1330842524766922, + "learning_rate": 0.00010163344617673402, + "loss": 1.565878677368164, + "step": 62570 + }, + { + "epoch": 0.1894268460642017, + "grad_norm": 0.12298000603914261, + "learning_rate": 0.00010162965099509663, + "loss": 1.5759979248046876, + "step": 62580 + }, + { + "epoch": 0.18945711561454753, + "grad_norm": 0.119438037276268, + "learning_rate": 0.00010162585581345923, + "loss": 1.5900896072387696, + "step": 62590 + }, + { + "epoch": 0.18948738516489338, + "grad_norm": 0.12879033386707306, + "learning_rate": 0.00010162206063182185, + "loss": 1.6020376205444335, + "step": 62600 + }, + { + "epoch": 0.1895176547152392, + "grad_norm": 0.12569820880889893, + "learning_rate": 0.00010161826545018445, + "loss": 1.5952709197998047, + "step": 62610 + }, + { + "epoch": 0.18954792426558503, + "grad_norm": 0.12378911674022675, + "learning_rate": 0.00010161447026854706, + "loss": 1.5651422500610352, + "step": 62620 + }, + { + "epoch": 0.18957819381593086, + "grad_norm": 0.13199658691883087, + "learning_rate": 0.00010161067508690966, + "loss": 1.5769056320190429, + "step": 62630 + }, + { + "epoch": 0.18960846336627668, + "grad_norm": 0.1444692760705948, + "learning_rate": 0.00010160687990527228, + "loss": 1.6054418563842774, + "step": 62640 + }, + { + "epoch": 0.18963873291662253, + "grad_norm": 0.14475926756858826, + "learning_rate": 0.00010160308472363487, + "loss": 1.5405754089355468, + "step": 62650 + }, + { + "epoch": 0.18966900246696836, + "grad_norm": 0.13338787853717804, + "learning_rate": 0.00010159928954199749, + "loss": 1.6204986572265625, + "step": 62660 + }, + { + "epoch": 0.18969927201731418, + "grad_norm": 0.11681070178747177, + "learning_rate": 0.00010159549436036008, + "loss": 1.6101821899414062, + "step": 62670 + }, + { + "epoch": 0.18972954156766, + "grad_norm": 0.12510475516319275, + "learning_rate": 0.0001015916991787227, + "loss": 1.6249584197998046, + "step": 62680 + }, + { + "epoch": 0.18975981111800586, + "grad_norm": 0.15162357687950134, + "learning_rate": 0.00010158790399708529, + "loss": 1.5636120796203614, + "step": 62690 + }, + { + "epoch": 0.18979008066835168, + "grad_norm": 0.1391102820634842, + "learning_rate": 0.00010158410881544791, + "loss": 1.5673598289489745, + "step": 62700 + }, + { + "epoch": 0.1898203502186975, + "grad_norm": 0.129377543926239, + "learning_rate": 0.0001015803136338105, + "loss": 1.5583290100097655, + "step": 62710 + }, + { + "epoch": 0.18985061976904333, + "grad_norm": 0.14045163989067078, + "learning_rate": 0.00010157651845217312, + "loss": 1.5719676971435548, + "step": 62720 + }, + { + "epoch": 0.18988088931938915, + "grad_norm": 0.12387409061193466, + "learning_rate": 0.00010157272327053573, + "loss": 1.58707857131958, + "step": 62730 + }, + { + "epoch": 0.189911158869735, + "grad_norm": 0.10287116467952728, + "learning_rate": 0.00010156892808889834, + "loss": 1.5775808334350585, + "step": 62740 + }, + { + "epoch": 0.18994142842008083, + "grad_norm": 0.14146305620670319, + "learning_rate": 0.00010156513290726094, + "loss": 1.5709218978881836, + "step": 62750 + }, + { + "epoch": 0.18997169797042665, + "grad_norm": 0.12130588293075562, + "learning_rate": 0.00010156133772562355, + "loss": 1.6091928482055664, + "step": 62760 + }, + { + "epoch": 0.19000196752077247, + "grad_norm": 0.1293889433145523, + "learning_rate": 0.00010155754254398615, + "loss": 1.6415945053100587, + "step": 62770 + }, + { + "epoch": 0.1900322370711183, + "grad_norm": 0.11039210110902786, + "learning_rate": 0.00010155374736234876, + "loss": 1.6065788269042969, + "step": 62780 + }, + { + "epoch": 0.19006250662146415, + "grad_norm": 0.11264822632074356, + "learning_rate": 0.00010154995218071138, + "loss": 1.6064176559448242, + "step": 62790 + }, + { + "epoch": 0.19009277617180997, + "grad_norm": 0.1175992488861084, + "learning_rate": 0.00010154615699907397, + "loss": 1.589884376525879, + "step": 62800 + }, + { + "epoch": 0.1901230457221558, + "grad_norm": 0.13290520012378693, + "learning_rate": 0.00010154236181743659, + "loss": 1.6104019165039063, + "step": 62810 + }, + { + "epoch": 0.19015331527250162, + "grad_norm": 0.1166672483086586, + "learning_rate": 0.00010153856663579918, + "loss": 1.625905990600586, + "step": 62820 + }, + { + "epoch": 0.19018358482284745, + "grad_norm": 0.12151577323675156, + "learning_rate": 0.0001015347714541618, + "loss": 1.5739341735839845, + "step": 62830 + }, + { + "epoch": 0.1902138543731933, + "grad_norm": 0.14059843122959137, + "learning_rate": 0.00010153097627252441, + "loss": 1.6222345352172851, + "step": 62840 + }, + { + "epoch": 0.19024412392353912, + "grad_norm": 0.12922941148281097, + "learning_rate": 0.00010152718109088702, + "loss": 1.5932835578918456, + "step": 62850 + }, + { + "epoch": 0.19027439347388495, + "grad_norm": 0.12326645851135254, + "learning_rate": 0.00010152338590924962, + "loss": 1.6459428787231445, + "step": 62860 + }, + { + "epoch": 0.19030466302423077, + "grad_norm": 0.12126581370830536, + "learning_rate": 0.00010151959072761223, + "loss": 1.6377506256103516, + "step": 62870 + }, + { + "epoch": 0.1903349325745766, + "grad_norm": 0.1215573102235794, + "learning_rate": 0.00010151579554597483, + "loss": 1.6011648178100586, + "step": 62880 + }, + { + "epoch": 0.19036520212492244, + "grad_norm": 0.13375224173069, + "learning_rate": 0.00010151200036433744, + "loss": 1.589162254333496, + "step": 62890 + }, + { + "epoch": 0.19039547167526827, + "grad_norm": 0.11046929657459259, + "learning_rate": 0.00010150820518270005, + "loss": 1.5888980865478515, + "step": 62900 + }, + { + "epoch": 0.1904257412256141, + "grad_norm": 0.14280831813812256, + "learning_rate": 0.00010150441000106265, + "loss": 1.623422622680664, + "step": 62910 + }, + { + "epoch": 0.19045601077595992, + "grad_norm": 0.13096871972084045, + "learning_rate": 0.00010150061481942526, + "loss": 1.5746143341064454, + "step": 62920 + }, + { + "epoch": 0.19048628032630574, + "grad_norm": 0.13746626675128937, + "learning_rate": 0.00010149681963778786, + "loss": 1.5724714279174805, + "step": 62930 + }, + { + "epoch": 0.1905165498766516, + "grad_norm": 0.11666852235794067, + "learning_rate": 0.00010149302445615047, + "loss": 1.5987796783447266, + "step": 62940 + }, + { + "epoch": 0.19054681942699742, + "grad_norm": 0.13941481709480286, + "learning_rate": 0.00010148922927451307, + "loss": 1.5571515083312988, + "step": 62950 + }, + { + "epoch": 0.19057708897734324, + "grad_norm": 0.1362202912569046, + "learning_rate": 0.00010148543409287568, + "loss": 1.6104127883911132, + "step": 62960 + }, + { + "epoch": 0.19060735852768906, + "grad_norm": 0.1169334203004837, + "learning_rate": 0.0001014816389112383, + "loss": 1.6072601318359374, + "step": 62970 + }, + { + "epoch": 0.1906376280780349, + "grad_norm": 0.12371308356523514, + "learning_rate": 0.0001014778437296009, + "loss": 1.6043712615966796, + "step": 62980 + }, + { + "epoch": 0.19066789762838074, + "grad_norm": 0.12479482591152191, + "learning_rate": 0.00010147404854796351, + "loss": 1.6186471939086915, + "step": 62990 + }, + { + "epoch": 0.19069816717872656, + "grad_norm": 0.1290397047996521, + "learning_rate": 0.00010147025336632612, + "loss": 1.6131282806396485, + "step": 63000 + }, + { + "epoch": 0.19069816717872656, + "eval_loss": 1.5906201601028442, + "eval_runtime": 27.503, + "eval_samples_per_second": 18.18, + "eval_steps_per_second": 1.164, + "step": 63000 + }, + { + "epoch": 0.1907284367290724, + "grad_norm": 0.12317521870136261, + "learning_rate": 0.00010146645818468872, + "loss": 1.5944998741149903, + "step": 63010 + }, + { + "epoch": 0.1907587062794182, + "grad_norm": 0.12432395666837692, + "learning_rate": 0.00010146266300305133, + "loss": 1.583869457244873, + "step": 63020 + }, + { + "epoch": 0.19078897582976406, + "grad_norm": 0.11711762100458145, + "learning_rate": 0.00010145886782141394, + "loss": 1.61364803314209, + "step": 63030 + }, + { + "epoch": 0.1908192453801099, + "grad_norm": 0.1366429626941681, + "learning_rate": 0.00010145507263977654, + "loss": 1.6164356231689454, + "step": 63040 + }, + { + "epoch": 0.1908495149304557, + "grad_norm": 0.12546616792678833, + "learning_rate": 0.00010145127745813915, + "loss": 1.5673389434814453, + "step": 63050 + }, + { + "epoch": 0.19087978448080153, + "grad_norm": 0.11011143773794174, + "learning_rate": 0.00010144748227650175, + "loss": 1.5949162483215331, + "step": 63060 + }, + { + "epoch": 0.19091005403114736, + "grad_norm": 0.13544782996177673, + "learning_rate": 0.00010144368709486436, + "loss": 1.5783166885375977, + "step": 63070 + }, + { + "epoch": 0.1909403235814932, + "grad_norm": 0.12597538530826569, + "learning_rate": 0.00010143989191322698, + "loss": 1.5932086944580077, + "step": 63080 + }, + { + "epoch": 0.19097059313183903, + "grad_norm": 0.11970677226781845, + "learning_rate": 0.00010143609673158957, + "loss": 1.6224809646606446, + "step": 63090 + }, + { + "epoch": 0.19100086268218486, + "grad_norm": 0.13412685692310333, + "learning_rate": 0.00010143230154995219, + "loss": 1.5762762069702148, + "step": 63100 + }, + { + "epoch": 0.19103113223253068, + "grad_norm": 0.12969790399074554, + "learning_rate": 0.00010142850636831478, + "loss": 1.612396240234375, + "step": 63110 + }, + { + "epoch": 0.1910614017828765, + "grad_norm": 0.1206020712852478, + "learning_rate": 0.0001014247111866774, + "loss": 1.5710264205932618, + "step": 63120 + }, + { + "epoch": 0.19109167133322236, + "grad_norm": 0.1458449363708496, + "learning_rate": 0.00010142091600504, + "loss": 1.608255386352539, + "step": 63130 + }, + { + "epoch": 0.19112194088356818, + "grad_norm": 0.1236555352807045, + "learning_rate": 0.00010141712082340262, + "loss": 1.5812746047973634, + "step": 63140 + }, + { + "epoch": 0.191152210433914, + "grad_norm": 0.11812237650156021, + "learning_rate": 0.00010141332564176521, + "loss": 1.5909225463867187, + "step": 63150 + }, + { + "epoch": 0.19118247998425983, + "grad_norm": 0.1286689043045044, + "learning_rate": 0.00010140953046012783, + "loss": 1.5838171005249024, + "step": 63160 + }, + { + "epoch": 0.19121274953460565, + "grad_norm": 0.13429893553256989, + "learning_rate": 0.00010140573527849042, + "loss": 1.5868659973144532, + "step": 63170 + }, + { + "epoch": 0.1912430190849515, + "grad_norm": 0.13162468373775482, + "learning_rate": 0.00010140194009685304, + "loss": 1.5567790985107421, + "step": 63180 + }, + { + "epoch": 0.19127328863529733, + "grad_norm": 0.11009042710065842, + "learning_rate": 0.00010139814491521564, + "loss": 1.608988380432129, + "step": 63190 + }, + { + "epoch": 0.19130355818564315, + "grad_norm": 0.1335471123456955, + "learning_rate": 0.00010139434973357825, + "loss": 1.572671890258789, + "step": 63200 + }, + { + "epoch": 0.19133382773598898, + "grad_norm": 0.13187247514724731, + "learning_rate": 0.00010139055455194087, + "loss": 1.605156707763672, + "step": 63210 + }, + { + "epoch": 0.1913640972863348, + "grad_norm": 0.1422860473394394, + "learning_rate": 0.00010138675937030346, + "loss": 1.5845853805541992, + "step": 63220 + }, + { + "epoch": 0.19139436683668065, + "grad_norm": 0.12673109769821167, + "learning_rate": 0.00010138296418866608, + "loss": 1.5593912124633789, + "step": 63230 + }, + { + "epoch": 0.19142463638702648, + "grad_norm": 0.11653554439544678, + "learning_rate": 0.00010137916900702867, + "loss": 1.5870994567871093, + "step": 63240 + }, + { + "epoch": 0.1914549059373723, + "grad_norm": 0.14597487449645996, + "learning_rate": 0.0001013753738253913, + "loss": 1.603135871887207, + "step": 63250 + }, + { + "epoch": 0.19148517548771812, + "grad_norm": 0.12692908942699432, + "learning_rate": 0.00010137157864375389, + "loss": 1.5493790626525878, + "step": 63260 + }, + { + "epoch": 0.19151544503806395, + "grad_norm": 0.1295439749956131, + "learning_rate": 0.0001013677834621165, + "loss": 1.5727540969848632, + "step": 63270 + }, + { + "epoch": 0.1915457145884098, + "grad_norm": 0.1358630359172821, + "learning_rate": 0.0001013639882804791, + "loss": 1.6036153793334962, + "step": 63280 + }, + { + "epoch": 0.19157598413875562, + "grad_norm": 0.1336755007505417, + "learning_rate": 0.00010136019309884172, + "loss": 1.5645177841186524, + "step": 63290 + }, + { + "epoch": 0.19160625368910145, + "grad_norm": 0.12405920028686523, + "learning_rate": 0.00010135639791720431, + "loss": 1.5919824600219727, + "step": 63300 + }, + { + "epoch": 0.19163652323944727, + "grad_norm": 0.12549971044063568, + "learning_rate": 0.00010135260273556693, + "loss": 1.603200912475586, + "step": 63310 + }, + { + "epoch": 0.1916667927897931, + "grad_norm": 0.12924642860889435, + "learning_rate": 0.00010134880755392952, + "loss": 1.5731443405151366, + "step": 63320 + }, + { + "epoch": 0.19169706234013895, + "grad_norm": 0.1294964849948883, + "learning_rate": 0.00010134501237229214, + "loss": 1.5576329231262207, + "step": 63330 + }, + { + "epoch": 0.19172733189048477, + "grad_norm": 0.11176301538944244, + "learning_rate": 0.00010134121719065475, + "loss": 1.604163932800293, + "step": 63340 + }, + { + "epoch": 0.1917576014408306, + "grad_norm": 0.12821157276630402, + "learning_rate": 0.00010133742200901735, + "loss": 1.6467931747436524, + "step": 63350 + }, + { + "epoch": 0.19178787099117642, + "grad_norm": 0.11883433163166046, + "learning_rate": 0.00010133362682737996, + "loss": 1.5828338623046876, + "step": 63360 + }, + { + "epoch": 0.19181814054152227, + "grad_norm": 0.13046561181545258, + "learning_rate": 0.00010132983164574257, + "loss": 1.6170719146728516, + "step": 63370 + }, + { + "epoch": 0.1918484100918681, + "grad_norm": 0.11655024439096451, + "learning_rate": 0.00010132603646410517, + "loss": 1.5824058532714844, + "step": 63380 + }, + { + "epoch": 0.19187867964221392, + "grad_norm": 0.12208186089992523, + "learning_rate": 0.00010132224128246778, + "loss": 1.6362964630126953, + "step": 63390 + }, + { + "epoch": 0.19190894919255974, + "grad_norm": 0.1391797810792923, + "learning_rate": 0.0001013184461008304, + "loss": 1.562984561920166, + "step": 63400 + }, + { + "epoch": 0.19193921874290557, + "grad_norm": 0.14005358517169952, + "learning_rate": 0.00010131465091919299, + "loss": 1.5768166542053224, + "step": 63410 + }, + { + "epoch": 0.19196948829325142, + "grad_norm": 0.12091640383005142, + "learning_rate": 0.00010131085573755561, + "loss": 1.548899269104004, + "step": 63420 + }, + { + "epoch": 0.19199975784359724, + "grad_norm": 0.15053093433380127, + "learning_rate": 0.0001013070605559182, + "loss": 1.5612234115600585, + "step": 63430 + }, + { + "epoch": 0.19203002739394306, + "grad_norm": 0.12663739919662476, + "learning_rate": 0.00010130326537428082, + "loss": 1.5592578887939452, + "step": 63440 + }, + { + "epoch": 0.1920602969442889, + "grad_norm": 0.11966986209154129, + "learning_rate": 0.00010129947019264341, + "loss": 1.6195472717285155, + "step": 63450 + }, + { + "epoch": 0.1920905664946347, + "grad_norm": 0.14007852971553802, + "learning_rate": 0.00010129567501100603, + "loss": 1.5914217948913574, + "step": 63460 + }, + { + "epoch": 0.19212083604498056, + "grad_norm": 0.14360548555850983, + "learning_rate": 0.00010129187982936864, + "loss": 1.5557106018066407, + "step": 63470 + }, + { + "epoch": 0.1921511055953264, + "grad_norm": 0.12416031956672668, + "learning_rate": 0.00010128808464773124, + "loss": 1.622555923461914, + "step": 63480 + }, + { + "epoch": 0.1921813751456722, + "grad_norm": 0.12861843407154083, + "learning_rate": 0.00010128428946609385, + "loss": 1.5691006660461426, + "step": 63490 + }, + { + "epoch": 0.19221164469601804, + "grad_norm": 0.13335658609867096, + "learning_rate": 0.00010128049428445646, + "loss": 1.5675070762634278, + "step": 63500 + }, + { + "epoch": 0.19221164469601804, + "eval_loss": 1.574894666671753, + "eval_runtime": 28.3788, + "eval_samples_per_second": 17.619, + "eval_steps_per_second": 1.128, + "step": 63500 + }, + { + "epoch": 0.19224191424636386, + "grad_norm": 0.12033192068338394, + "learning_rate": 0.00010127669910281906, + "loss": 1.5535679817199708, + "step": 63510 + }, + { + "epoch": 0.1922721837967097, + "grad_norm": 0.12554170191287994, + "learning_rate": 0.00010127290392118167, + "loss": 1.620446014404297, + "step": 63520 + }, + { + "epoch": 0.19230245334705554, + "grad_norm": 0.11930878460407257, + "learning_rate": 0.00010126910873954427, + "loss": 1.5625837326049805, + "step": 63530 + }, + { + "epoch": 0.19233272289740136, + "grad_norm": 0.12848223745822906, + "learning_rate": 0.00010126531355790688, + "loss": 1.5822446823120118, + "step": 63540 + }, + { + "epoch": 0.19236299244774718, + "grad_norm": 0.126322403550148, + "learning_rate": 0.00010126151837626949, + "loss": 1.5739286422729493, + "step": 63550 + }, + { + "epoch": 0.192393261998093, + "grad_norm": 0.12348710000514984, + "learning_rate": 0.00010125772319463209, + "loss": 1.5907979965209962, + "step": 63560 + }, + { + "epoch": 0.19242353154843886, + "grad_norm": 0.12314720451831818, + "learning_rate": 0.0001012539280129947, + "loss": 1.590617561340332, + "step": 63570 + }, + { + "epoch": 0.19245380109878468, + "grad_norm": 0.15531432628631592, + "learning_rate": 0.00010125013283135732, + "loss": 1.5701751708984375, + "step": 63580 + }, + { + "epoch": 0.1924840706491305, + "grad_norm": 0.12553128600120544, + "learning_rate": 0.00010124633764971992, + "loss": 1.6104114532470704, + "step": 63590 + }, + { + "epoch": 0.19251434019947633, + "grad_norm": 0.1319384127855301, + "learning_rate": 0.00010124254246808253, + "loss": 1.6056419372558595, + "step": 63600 + }, + { + "epoch": 0.19254460974982215, + "grad_norm": 0.1293243169784546, + "learning_rate": 0.00010123874728644514, + "loss": 1.5751174926757812, + "step": 63610 + }, + { + "epoch": 0.192574879300168, + "grad_norm": 0.13614904880523682, + "learning_rate": 0.00010123495210480774, + "loss": 1.5898906707763671, + "step": 63620 + }, + { + "epoch": 0.19260514885051383, + "grad_norm": 0.14466483891010284, + "learning_rate": 0.00010123115692317035, + "loss": 1.6162517547607422, + "step": 63630 + }, + { + "epoch": 0.19263541840085965, + "grad_norm": 0.12260407209396362, + "learning_rate": 0.00010122736174153295, + "loss": 1.5940610885620117, + "step": 63640 + }, + { + "epoch": 0.19266568795120548, + "grad_norm": 0.122704416513443, + "learning_rate": 0.00010122356655989556, + "loss": 1.6200859069824218, + "step": 63650 + }, + { + "epoch": 0.1926959575015513, + "grad_norm": 0.13213510811328888, + "learning_rate": 0.00010121977137825817, + "loss": 1.5819147109985352, + "step": 63660 + }, + { + "epoch": 0.19272622705189715, + "grad_norm": 0.13146010041236877, + "learning_rate": 0.00010121597619662077, + "loss": 1.5897106170654296, + "step": 63670 + }, + { + "epoch": 0.19275649660224298, + "grad_norm": 0.12765660881996155, + "learning_rate": 0.00010121218101498338, + "loss": 1.584348487854004, + "step": 63680 + }, + { + "epoch": 0.1927867661525888, + "grad_norm": 0.12109716981649399, + "learning_rate": 0.00010120838583334598, + "loss": 1.5799205780029297, + "step": 63690 + }, + { + "epoch": 0.19281703570293462, + "grad_norm": 0.1523972600698471, + "learning_rate": 0.00010120459065170859, + "loss": 1.5508350372314452, + "step": 63700 + }, + { + "epoch": 0.19284730525328045, + "grad_norm": 0.1422669142484665, + "learning_rate": 0.00010120079547007121, + "loss": 1.6186925888061523, + "step": 63710 + }, + { + "epoch": 0.1928775748036263, + "grad_norm": 0.13928250968456268, + "learning_rate": 0.0001011970002884338, + "loss": 1.5706336975097657, + "step": 63720 + }, + { + "epoch": 0.19290784435397212, + "grad_norm": 0.1164516881108284, + "learning_rate": 0.00010119320510679642, + "loss": 1.5918999671936036, + "step": 63730 + }, + { + "epoch": 0.19293811390431795, + "grad_norm": 0.1357457935810089, + "learning_rate": 0.00010118940992515901, + "loss": 1.581502342224121, + "step": 63740 + }, + { + "epoch": 0.19296838345466377, + "grad_norm": 0.12196876853704453, + "learning_rate": 0.00010118561474352163, + "loss": 1.5798601150512694, + "step": 63750 + }, + { + "epoch": 0.19299865300500962, + "grad_norm": 0.13194435834884644, + "learning_rate": 0.00010118181956188422, + "loss": 1.6012035369873048, + "step": 63760 + }, + { + "epoch": 0.19302892255535545, + "grad_norm": 0.1182241290807724, + "learning_rate": 0.00010117802438024684, + "loss": 1.596679973602295, + "step": 63770 + }, + { + "epoch": 0.19305919210570127, + "grad_norm": 0.15403234958648682, + "learning_rate": 0.00010117422919860944, + "loss": 1.596632480621338, + "step": 63780 + }, + { + "epoch": 0.1930894616560471, + "grad_norm": 0.12248501181602478, + "learning_rate": 0.00010117043401697206, + "loss": 1.6030235290527344, + "step": 63790 + }, + { + "epoch": 0.19311973120639292, + "grad_norm": 0.11885058134794235, + "learning_rate": 0.00010116663883533466, + "loss": 1.5806215286254883, + "step": 63800 + }, + { + "epoch": 0.19315000075673877, + "grad_norm": 0.11913534998893738, + "learning_rate": 0.00010116284365369727, + "loss": 1.6348514556884766, + "step": 63810 + }, + { + "epoch": 0.1931802703070846, + "grad_norm": 0.13145188987255096, + "learning_rate": 0.00010115904847205989, + "loss": 1.6093784332275392, + "step": 63820 + }, + { + "epoch": 0.19321053985743042, + "grad_norm": 0.12833839654922485, + "learning_rate": 0.00010115525329042248, + "loss": 1.5923911094665528, + "step": 63830 + }, + { + "epoch": 0.19324080940777624, + "grad_norm": 0.13970282673835754, + "learning_rate": 0.0001011514581087851, + "loss": 1.5789602279663086, + "step": 63840 + }, + { + "epoch": 0.19327107895812207, + "grad_norm": 0.1244732066988945, + "learning_rate": 0.00010114766292714769, + "loss": 1.5990399360656737, + "step": 63850 + }, + { + "epoch": 0.19330134850846792, + "grad_norm": 0.12373293936252594, + "learning_rate": 0.00010114386774551031, + "loss": 1.5823713302612306, + "step": 63860 + }, + { + "epoch": 0.19333161805881374, + "grad_norm": 0.12119913101196289, + "learning_rate": 0.0001011400725638729, + "loss": 1.6370777130126952, + "step": 63870 + }, + { + "epoch": 0.19336188760915957, + "grad_norm": 0.12147068977355957, + "learning_rate": 0.00010113627738223552, + "loss": 1.5730876922607422, + "step": 63880 + }, + { + "epoch": 0.1933921571595054, + "grad_norm": 0.11890505254268646, + "learning_rate": 0.00010113248220059812, + "loss": 1.610617446899414, + "step": 63890 + }, + { + "epoch": 0.19342242670985121, + "grad_norm": 0.13336148858070374, + "learning_rate": 0.00010112868701896073, + "loss": 1.5554780960083008, + "step": 63900 + }, + { + "epoch": 0.19345269626019707, + "grad_norm": 0.1214086189866066, + "learning_rate": 0.00010112489183732333, + "loss": 1.5459833145141602, + "step": 63910 + }, + { + "epoch": 0.1934829658105429, + "grad_norm": 0.12325844168663025, + "learning_rate": 0.00010112109665568595, + "loss": 1.590904998779297, + "step": 63920 + }, + { + "epoch": 0.1935132353608887, + "grad_norm": 0.14665354788303375, + "learning_rate": 0.00010111730147404854, + "loss": 1.5799986839294433, + "step": 63930 + }, + { + "epoch": 0.19354350491123454, + "grad_norm": 0.12057764828205109, + "learning_rate": 0.00010111350629241116, + "loss": 1.5759525299072266, + "step": 63940 + }, + { + "epoch": 0.19357377446158036, + "grad_norm": 0.13060325384140015, + "learning_rate": 0.00010110971111077376, + "loss": 1.559480857849121, + "step": 63950 + }, + { + "epoch": 0.1936040440119262, + "grad_norm": 0.134751558303833, + "learning_rate": 0.00010110591592913637, + "loss": 1.5476368904113769, + "step": 63960 + }, + { + "epoch": 0.19363431356227204, + "grad_norm": 0.1241430938243866, + "learning_rate": 0.00010110212074749898, + "loss": 1.6269989013671875, + "step": 63970 + }, + { + "epoch": 0.19366458311261786, + "grad_norm": 0.1260584145784378, + "learning_rate": 0.00010109832556586158, + "loss": 1.6435352325439454, + "step": 63980 + }, + { + "epoch": 0.19369485266296368, + "grad_norm": 0.11582029610872269, + "learning_rate": 0.00010109453038422419, + "loss": 1.5978992462158204, + "step": 63990 + }, + { + "epoch": 0.1937251222133095, + "grad_norm": 0.1235235258936882, + "learning_rate": 0.0001010907352025868, + "loss": 1.6172183990478515, + "step": 64000 + }, + { + "epoch": 0.1937251222133095, + "eval_loss": 1.5691332817077637, + "eval_runtime": 28.101, + "eval_samples_per_second": 17.793, + "eval_steps_per_second": 1.139, + "step": 64000 + }, + { + "epoch": 0.19375539176365536, + "grad_norm": 0.1287582516670227, + "learning_rate": 0.00010108694002094941, + "loss": 1.557734489440918, + "step": 64010 + }, + { + "epoch": 0.19378566131400118, + "grad_norm": 0.11150401085615158, + "learning_rate": 0.000101083144839312, + "loss": 1.6175994873046875, + "step": 64020 + }, + { + "epoch": 0.193815930864347, + "grad_norm": 0.13148364424705505, + "learning_rate": 0.00010107934965767463, + "loss": 1.6046295166015625, + "step": 64030 + }, + { + "epoch": 0.19384620041469283, + "grad_norm": 0.11917371302843094, + "learning_rate": 0.00010107555447603722, + "loss": 1.5960807800292969, + "step": 64040 + }, + { + "epoch": 0.19387646996503866, + "grad_norm": 0.121734619140625, + "learning_rate": 0.00010107175929439984, + "loss": 1.5879119873046874, + "step": 64050 + }, + { + "epoch": 0.1939067395153845, + "grad_norm": 0.12458433955907822, + "learning_rate": 0.00010106796411276243, + "loss": 1.6055183410644531, + "step": 64060 + }, + { + "epoch": 0.19393700906573033, + "grad_norm": 0.1312047243118286, + "learning_rate": 0.00010106416893112505, + "loss": 1.5991107940673828, + "step": 64070 + }, + { + "epoch": 0.19396727861607616, + "grad_norm": 0.1489911526441574, + "learning_rate": 0.00010106037374948766, + "loss": 1.612343978881836, + "step": 64080 + }, + { + "epoch": 0.19399754816642198, + "grad_norm": 0.12582755088806152, + "learning_rate": 0.00010105657856785026, + "loss": 1.6022293090820312, + "step": 64090 + }, + { + "epoch": 0.19402781771676783, + "grad_norm": 0.13512994349002838, + "learning_rate": 0.00010105278338621287, + "loss": 1.5757110595703125, + "step": 64100 + }, + { + "epoch": 0.19405808726711365, + "grad_norm": 0.12792880833148956, + "learning_rate": 0.00010104898820457547, + "loss": 1.6237312316894532, + "step": 64110 + }, + { + "epoch": 0.19408835681745948, + "grad_norm": 0.11986920982599258, + "learning_rate": 0.00010104519302293808, + "loss": 1.5460423469543456, + "step": 64120 + }, + { + "epoch": 0.1941186263678053, + "grad_norm": 0.1095329076051712, + "learning_rate": 0.00010104139784130069, + "loss": 1.5607709884643555, + "step": 64130 + }, + { + "epoch": 0.19414889591815113, + "grad_norm": 0.14392611384391785, + "learning_rate": 0.00010103760265966329, + "loss": 1.5833703994750976, + "step": 64140 + }, + { + "epoch": 0.19417916546849698, + "grad_norm": 0.12050279974937439, + "learning_rate": 0.0001010338074780259, + "loss": 1.5788887023925782, + "step": 64150 + }, + { + "epoch": 0.1942094350188428, + "grad_norm": 0.135563462972641, + "learning_rate": 0.0001010300122963885, + "loss": 1.5884761810302734, + "step": 64160 + }, + { + "epoch": 0.19423970456918863, + "grad_norm": 0.13073837757110596, + "learning_rate": 0.00010102621711475111, + "loss": 1.6143123626708984, + "step": 64170 + }, + { + "epoch": 0.19426997411953445, + "grad_norm": 0.13417235016822815, + "learning_rate": 0.00010102242193311372, + "loss": 1.5756053924560547, + "step": 64180 + }, + { + "epoch": 0.19430024366988027, + "grad_norm": 0.13838014006614685, + "learning_rate": 0.00010101862675147633, + "loss": 1.5618807792663574, + "step": 64190 + }, + { + "epoch": 0.19433051322022613, + "grad_norm": 0.13886764645576477, + "learning_rate": 0.00010101483156983894, + "loss": 1.6176088333129883, + "step": 64200 + }, + { + "epoch": 0.19436078277057195, + "grad_norm": 0.11145610362291336, + "learning_rate": 0.00010101103638820155, + "loss": 1.603316879272461, + "step": 64210 + }, + { + "epoch": 0.19439105232091777, + "grad_norm": 0.11575166881084442, + "learning_rate": 0.00010100724120656415, + "loss": 1.5996738433837892, + "step": 64220 + }, + { + "epoch": 0.1944213218712636, + "grad_norm": 0.11123306304216385, + "learning_rate": 0.00010100344602492676, + "loss": 1.5921289443969726, + "step": 64230 + }, + { + "epoch": 0.19445159142160942, + "grad_norm": 0.11354855448007584, + "learning_rate": 0.00010099965084328936, + "loss": 1.6170173645019532, + "step": 64240 + }, + { + "epoch": 0.19448186097195527, + "grad_norm": 0.1186690405011177, + "learning_rate": 0.00010099585566165197, + "loss": 1.5949241638183593, + "step": 64250 + }, + { + "epoch": 0.1945121305223011, + "grad_norm": 0.12361996620893478, + "learning_rate": 0.00010099206048001458, + "loss": 1.629187774658203, + "step": 64260 + }, + { + "epoch": 0.19454240007264692, + "grad_norm": 0.11321704089641571, + "learning_rate": 0.00010098826529837718, + "loss": 1.5654558181762694, + "step": 64270 + }, + { + "epoch": 0.19457266962299274, + "grad_norm": 0.13158436119556427, + "learning_rate": 0.00010098447011673979, + "loss": 1.615140151977539, + "step": 64280 + }, + { + "epoch": 0.19460293917333857, + "grad_norm": 0.13541099429130554, + "learning_rate": 0.0001009806749351024, + "loss": 1.609018325805664, + "step": 64290 + }, + { + "epoch": 0.19463320872368442, + "grad_norm": 0.12837497889995575, + "learning_rate": 0.000100976879753465, + "loss": 1.5747485160827637, + "step": 64300 + }, + { + "epoch": 0.19466347827403024, + "grad_norm": 0.1200004518032074, + "learning_rate": 0.0001009730845718276, + "loss": 1.6075897216796875, + "step": 64310 + }, + { + "epoch": 0.19469374782437607, + "grad_norm": 0.12281796336174011, + "learning_rate": 0.00010096928939019023, + "loss": 1.5830245971679688, + "step": 64320 + }, + { + "epoch": 0.1947240173747219, + "grad_norm": 0.11108169704675674, + "learning_rate": 0.00010096549420855282, + "loss": 1.6263423919677735, + "step": 64330 + }, + { + "epoch": 0.19475428692506772, + "grad_norm": 0.1335403472185135, + "learning_rate": 0.00010096169902691544, + "loss": 1.5874184608459472, + "step": 64340 + }, + { + "epoch": 0.19478455647541357, + "grad_norm": 0.12371215969324112, + "learning_rate": 0.00010095790384527803, + "loss": 1.59782075881958, + "step": 64350 + }, + { + "epoch": 0.1948148260257594, + "grad_norm": 0.15228132903575897, + "learning_rate": 0.00010095410866364065, + "loss": 1.5895267486572267, + "step": 64360 + }, + { + "epoch": 0.19484509557610522, + "grad_norm": 0.12345129251480103, + "learning_rate": 0.00010095031348200324, + "loss": 1.5821621894836426, + "step": 64370 + }, + { + "epoch": 0.19487536512645104, + "grad_norm": 0.12686121463775635, + "learning_rate": 0.00010094651830036586, + "loss": 1.5813241958618165, + "step": 64380 + }, + { + "epoch": 0.19490563467679686, + "grad_norm": 0.11971999704837799, + "learning_rate": 0.00010094272311872845, + "loss": 1.5729933738708497, + "step": 64390 + }, + { + "epoch": 0.19493590422714271, + "grad_norm": 0.13590720295906067, + "learning_rate": 0.00010093892793709107, + "loss": 1.5994985580444336, + "step": 64400 + }, + { + "epoch": 0.19496617377748854, + "grad_norm": 0.12548837065696716, + "learning_rate": 0.00010093513275545368, + "loss": 1.6137147903442384, + "step": 64410 + }, + { + "epoch": 0.19499644332783436, + "grad_norm": 0.11196772009134293, + "learning_rate": 0.00010093133757381629, + "loss": 1.603418731689453, + "step": 64420 + }, + { + "epoch": 0.1950267128781802, + "grad_norm": 0.12182897329330444, + "learning_rate": 0.00010092754239217889, + "loss": 1.585927391052246, + "step": 64430 + }, + { + "epoch": 0.19505698242852604, + "grad_norm": 0.1318446844816208, + "learning_rate": 0.0001009237472105415, + "loss": 1.5991031646728515, + "step": 64440 + }, + { + "epoch": 0.19508725197887186, + "grad_norm": 0.12546151876449585, + "learning_rate": 0.00010091995202890412, + "loss": 1.6048341751098634, + "step": 64450 + }, + { + "epoch": 0.19511752152921769, + "grad_norm": 0.12100204825401306, + "learning_rate": 0.00010091615684726671, + "loss": 1.5756261825561524, + "step": 64460 + }, + { + "epoch": 0.1951477910795635, + "grad_norm": 0.14184238016605377, + "learning_rate": 0.00010091236166562933, + "loss": 1.5911514282226562, + "step": 64470 + }, + { + "epoch": 0.19517806062990933, + "grad_norm": 0.1430027037858963, + "learning_rate": 0.00010090856648399192, + "loss": 1.6081375122070312, + "step": 64480 + }, + { + "epoch": 0.19520833018025519, + "grad_norm": 0.1287970244884491, + "learning_rate": 0.00010090477130235454, + "loss": 1.5680599212646484, + "step": 64490 + }, + { + "epoch": 0.195238599730601, + "grad_norm": 0.11992888897657394, + "learning_rate": 0.00010090097612071713, + "loss": 1.6134387969970703, + "step": 64500 + }, + { + "epoch": 0.195238599730601, + "eval_loss": 1.5953083038330078, + "eval_runtime": 28.3349, + "eval_samples_per_second": 17.646, + "eval_steps_per_second": 1.129, + "step": 64500 + }, + { + "epoch": 0.19526886928094683, + "grad_norm": 0.1217583492398262, + "learning_rate": 0.00010089718093907975, + "loss": 1.5629308700561524, + "step": 64510 + }, + { + "epoch": 0.19529913883129266, + "grad_norm": 0.12674890458583832, + "learning_rate": 0.00010089338575744234, + "loss": 1.614494514465332, + "step": 64520 + }, + { + "epoch": 0.19532940838163848, + "grad_norm": 0.14164307713508606, + "learning_rate": 0.00010088959057580496, + "loss": 1.6220794677734376, + "step": 64530 + }, + { + "epoch": 0.19535967793198433, + "grad_norm": 0.14556413888931274, + "learning_rate": 0.00010088579539416756, + "loss": 1.5581993103027343, + "step": 64540 + }, + { + "epoch": 0.19538994748233016, + "grad_norm": 0.13005632162094116, + "learning_rate": 0.00010088200021253018, + "loss": 1.5903997421264648, + "step": 64550 + }, + { + "epoch": 0.19542021703267598, + "grad_norm": 0.12839539349079132, + "learning_rate": 0.00010087820503089277, + "loss": 1.5786718368530273, + "step": 64560 + }, + { + "epoch": 0.1954504865830218, + "grad_norm": 0.12333544343709946, + "learning_rate": 0.00010087440984925539, + "loss": 1.5554283142089844, + "step": 64570 + }, + { + "epoch": 0.19548075613336763, + "grad_norm": 0.14145614206790924, + "learning_rate": 0.000100870614667618, + "loss": 1.6050540924072265, + "step": 64580 + }, + { + "epoch": 0.19551102568371348, + "grad_norm": 0.13473811745643616, + "learning_rate": 0.0001008668194859806, + "loss": 1.5802005767822265, + "step": 64590 + }, + { + "epoch": 0.1955412952340593, + "grad_norm": 0.11790663003921509, + "learning_rate": 0.0001008630243043432, + "loss": 1.5889034271240234, + "step": 64600 + }, + { + "epoch": 0.19557156478440513, + "grad_norm": 0.13368402421474457, + "learning_rate": 0.00010085922912270581, + "loss": 1.6189430236816407, + "step": 64610 + }, + { + "epoch": 0.19560183433475095, + "grad_norm": 0.12259742617607117, + "learning_rate": 0.00010085543394106843, + "loss": 1.571992778778076, + "step": 64620 + }, + { + "epoch": 0.19563210388509678, + "grad_norm": 0.13493353128433228, + "learning_rate": 0.00010085163875943102, + "loss": 1.6010347366333009, + "step": 64630 + }, + { + "epoch": 0.19566237343544263, + "grad_norm": 0.10961340367794037, + "learning_rate": 0.00010084784357779364, + "loss": 1.6202728271484375, + "step": 64640 + }, + { + "epoch": 0.19569264298578845, + "grad_norm": 0.11218561977148056, + "learning_rate": 0.00010084404839615624, + "loss": 1.592470073699951, + "step": 64650 + }, + { + "epoch": 0.19572291253613427, + "grad_norm": 0.12371028959751129, + "learning_rate": 0.00010084025321451885, + "loss": 1.6275674819946289, + "step": 64660 + }, + { + "epoch": 0.1957531820864801, + "grad_norm": 0.1337384432554245, + "learning_rate": 0.00010083645803288145, + "loss": 1.5687945365905762, + "step": 64670 + }, + { + "epoch": 0.19578345163682592, + "grad_norm": 0.132412388920784, + "learning_rate": 0.00010083266285124407, + "loss": 1.5849674224853516, + "step": 64680 + }, + { + "epoch": 0.19581372118717177, + "grad_norm": 0.12225154042243958, + "learning_rate": 0.00010082886766960667, + "loss": 1.5981903076171875, + "step": 64690 + }, + { + "epoch": 0.1958439907375176, + "grad_norm": 0.12886255979537964, + "learning_rate": 0.00010082507248796928, + "loss": 1.620412826538086, + "step": 64700 + }, + { + "epoch": 0.19587426028786342, + "grad_norm": 0.1216973066329956, + "learning_rate": 0.00010082127730633188, + "loss": 1.6015159606933593, + "step": 64710 + }, + { + "epoch": 0.19590452983820925, + "grad_norm": 0.12295351922512054, + "learning_rate": 0.00010081748212469449, + "loss": 1.589749526977539, + "step": 64720 + }, + { + "epoch": 0.19593479938855507, + "grad_norm": 0.1266198307275772, + "learning_rate": 0.0001008136869430571, + "loss": 1.5823290824890137, + "step": 64730 + }, + { + "epoch": 0.19596506893890092, + "grad_norm": 0.13374193012714386, + "learning_rate": 0.0001008098917614197, + "loss": 1.6028141021728515, + "step": 64740 + }, + { + "epoch": 0.19599533848924675, + "grad_norm": 0.11573004722595215, + "learning_rate": 0.00010080609657978231, + "loss": 1.584677505493164, + "step": 64750 + }, + { + "epoch": 0.19602560803959257, + "grad_norm": 0.12638716399669647, + "learning_rate": 0.00010080230139814491, + "loss": 1.6230039596557617, + "step": 64760 + }, + { + "epoch": 0.1960558775899384, + "grad_norm": 0.13313992321491241, + "learning_rate": 0.00010079850621650752, + "loss": 1.5539112091064453, + "step": 64770 + }, + { + "epoch": 0.19608614714028424, + "grad_norm": 0.11715885251760483, + "learning_rate": 0.00010079471103487013, + "loss": 1.5897584915161134, + "step": 64780 + }, + { + "epoch": 0.19611641669063007, + "grad_norm": 0.11686296761035919, + "learning_rate": 0.00010079091585323273, + "loss": 1.5726890563964844, + "step": 64790 + }, + { + "epoch": 0.1961466862409759, + "grad_norm": 0.13606683909893036, + "learning_rate": 0.00010078712067159534, + "loss": 1.602877426147461, + "step": 64800 + }, + { + "epoch": 0.19617695579132172, + "grad_norm": 0.123662069439888, + "learning_rate": 0.00010078332548995796, + "loss": 1.594702911376953, + "step": 64810 + }, + { + "epoch": 0.19620722534166754, + "grad_norm": 0.12551192939281464, + "learning_rate": 0.00010077953030832056, + "loss": 1.5776113510131835, + "step": 64820 + }, + { + "epoch": 0.1962374948920134, + "grad_norm": 0.12226787209510803, + "learning_rate": 0.00010077573512668317, + "loss": 1.617453384399414, + "step": 64830 + }, + { + "epoch": 0.19626776444235922, + "grad_norm": 0.1199352964758873, + "learning_rate": 0.00010077193994504578, + "loss": 1.582345199584961, + "step": 64840 + }, + { + "epoch": 0.19629803399270504, + "grad_norm": 0.11724646389484406, + "learning_rate": 0.00010076814476340838, + "loss": 1.6103094100952149, + "step": 64850 + }, + { + "epoch": 0.19632830354305086, + "grad_norm": 0.12028229981660843, + "learning_rate": 0.00010076434958177099, + "loss": 1.5484798431396485, + "step": 64860 + }, + { + "epoch": 0.1963585730933967, + "grad_norm": 0.12699326872825623, + "learning_rate": 0.0001007605544001336, + "loss": 1.5742948532104493, + "step": 64870 + }, + { + "epoch": 0.19638884264374254, + "grad_norm": 0.12060325592756271, + "learning_rate": 0.0001007567592184962, + "loss": 1.5958568572998046, + "step": 64880 + }, + { + "epoch": 0.19641911219408836, + "grad_norm": 0.13090009987354279, + "learning_rate": 0.0001007529640368588, + "loss": 1.603424072265625, + "step": 64890 + }, + { + "epoch": 0.1964493817444342, + "grad_norm": 0.10767782479524612, + "learning_rate": 0.00010074916885522141, + "loss": 1.5826756477355957, + "step": 64900 + }, + { + "epoch": 0.19647965129478, + "grad_norm": 0.10856974124908447, + "learning_rate": 0.00010074537367358402, + "loss": 1.5974107742309571, + "step": 64910 + }, + { + "epoch": 0.19650992084512584, + "grad_norm": 0.12191152572631836, + "learning_rate": 0.00010074157849194662, + "loss": 1.5962931632995605, + "step": 64920 + }, + { + "epoch": 0.1965401903954717, + "grad_norm": 0.1239997148513794, + "learning_rate": 0.00010073778331030924, + "loss": 1.6158252716064454, + "step": 64930 + }, + { + "epoch": 0.1965704599458175, + "grad_norm": 0.11347094923257828, + "learning_rate": 0.00010073398812867184, + "loss": 1.6277324676513671, + "step": 64940 + }, + { + "epoch": 0.19660072949616333, + "grad_norm": 0.12884582579135895, + "learning_rate": 0.00010073019294703445, + "loss": 1.581747341156006, + "step": 64950 + }, + { + "epoch": 0.19663099904650916, + "grad_norm": 0.11771129816770554, + "learning_rate": 0.00010072639776539705, + "loss": 1.5886525154113769, + "step": 64960 + }, + { + "epoch": 0.19666126859685498, + "grad_norm": 0.12019237130880356, + "learning_rate": 0.00010072260258375967, + "loss": 1.6148868560791017, + "step": 64970 + }, + { + "epoch": 0.19669153814720083, + "grad_norm": 0.1260419338941574, + "learning_rate": 0.00010071880740212226, + "loss": 1.624826431274414, + "step": 64980 + }, + { + "epoch": 0.19672180769754666, + "grad_norm": 0.12451157718896866, + "learning_rate": 0.00010071501222048488, + "loss": 1.6142837524414062, + "step": 64990 + }, + { + "epoch": 0.19675207724789248, + "grad_norm": 0.1346367746591568, + "learning_rate": 0.00010071121703884747, + "loss": 1.5674439430236817, + "step": 65000 + }, + { + "epoch": 0.19675207724789248, + "eval_loss": 1.60201096534729, + "eval_runtime": 28.2244, + "eval_samples_per_second": 17.715, + "eval_steps_per_second": 1.134, + "step": 65000 + }, + { + "epoch": 0.1967823467982383, + "grad_norm": 0.13605454564094543, + "learning_rate": 0.00010070742185721009, + "loss": 1.5975971221923828, + "step": 65010 + }, + { + "epoch": 0.19681261634858413, + "grad_norm": 0.1183566078543663, + "learning_rate": 0.0001007036266755727, + "loss": 1.6135738372802735, + "step": 65020 + }, + { + "epoch": 0.19684288589892998, + "grad_norm": 0.1385612189769745, + "learning_rate": 0.0001006998314939353, + "loss": 1.5381877899169922, + "step": 65030 + }, + { + "epoch": 0.1968731554492758, + "grad_norm": 0.1291699856519699, + "learning_rate": 0.00010069603631229791, + "loss": 1.5837973594665526, + "step": 65040 + }, + { + "epoch": 0.19690342499962163, + "grad_norm": 0.12464241683483124, + "learning_rate": 0.00010069224113066051, + "loss": 1.5848085403442382, + "step": 65050 + }, + { + "epoch": 0.19693369454996745, + "grad_norm": 0.11817696690559387, + "learning_rate": 0.00010068844594902313, + "loss": 1.565584373474121, + "step": 65060 + }, + { + "epoch": 0.19696396410031328, + "grad_norm": 0.12902764976024628, + "learning_rate": 0.00010068465076738573, + "loss": 1.551612663269043, + "step": 65070 + }, + { + "epoch": 0.19699423365065913, + "grad_norm": 0.14132900536060333, + "learning_rate": 0.00010068085558574835, + "loss": 1.5823232650756835, + "step": 65080 + }, + { + "epoch": 0.19702450320100495, + "grad_norm": 0.11034560948610306, + "learning_rate": 0.00010067706040411094, + "loss": 1.539380168914795, + "step": 65090 + }, + { + "epoch": 0.19705477275135078, + "grad_norm": 0.13509568572044373, + "learning_rate": 0.00010067326522247356, + "loss": 1.5876935005187989, + "step": 65100 + }, + { + "epoch": 0.1970850423016966, + "grad_norm": 0.12734247744083405, + "learning_rate": 0.00010066947004083615, + "loss": 1.600154495239258, + "step": 65110 + }, + { + "epoch": 0.19711531185204242, + "grad_norm": 0.12612809240818024, + "learning_rate": 0.00010066567485919877, + "loss": 1.5900598526000977, + "step": 65120 + }, + { + "epoch": 0.19714558140238828, + "grad_norm": 0.1232321709394455, + "learning_rate": 0.00010066187967756136, + "loss": 1.598127555847168, + "step": 65130 + }, + { + "epoch": 0.1971758509527341, + "grad_norm": 0.13264857232570648, + "learning_rate": 0.00010065808449592398, + "loss": 1.5953989028930664, + "step": 65140 + }, + { + "epoch": 0.19720612050307992, + "grad_norm": 0.12936905026435852, + "learning_rate": 0.00010065428931428657, + "loss": 1.6226972579956054, + "step": 65150 + }, + { + "epoch": 0.19723639005342575, + "grad_norm": 0.14260897040367126, + "learning_rate": 0.00010065049413264919, + "loss": 1.5786664009094238, + "step": 65160 + }, + { + "epoch": 0.1972666596037716, + "grad_norm": 0.12880319356918335, + "learning_rate": 0.00010064669895101179, + "loss": 1.593519687652588, + "step": 65170 + }, + { + "epoch": 0.19729692915411742, + "grad_norm": 0.12628187239170074, + "learning_rate": 0.0001006429037693744, + "loss": 1.6237348556518554, + "step": 65180 + }, + { + "epoch": 0.19732719870446325, + "grad_norm": 0.14228758215904236, + "learning_rate": 0.00010063910858773701, + "loss": 1.5841020584106444, + "step": 65190 + }, + { + "epoch": 0.19735746825480907, + "grad_norm": 0.12789754569530487, + "learning_rate": 0.00010063531340609962, + "loss": 1.5562731742858886, + "step": 65200 + }, + { + "epoch": 0.1973877378051549, + "grad_norm": 0.13275663554668427, + "learning_rate": 0.00010063151822446222, + "loss": 1.6038116455078124, + "step": 65210 + }, + { + "epoch": 0.19741800735550075, + "grad_norm": 0.12714436650276184, + "learning_rate": 0.00010062772304282483, + "loss": 1.6004951477050782, + "step": 65220 + }, + { + "epoch": 0.19744827690584657, + "grad_norm": 0.11992139369249344, + "learning_rate": 0.00010062392786118745, + "loss": 1.562105941772461, + "step": 65230 + }, + { + "epoch": 0.1974785464561924, + "grad_norm": 0.1207437515258789, + "learning_rate": 0.00010062013267955004, + "loss": 1.6549943923950194, + "step": 65240 + }, + { + "epoch": 0.19750881600653822, + "grad_norm": 0.12908406555652618, + "learning_rate": 0.00010061633749791266, + "loss": 1.603175735473633, + "step": 65250 + }, + { + "epoch": 0.19753908555688404, + "grad_norm": 0.12646043300628662, + "learning_rate": 0.00010061254231627525, + "loss": 1.5765783309936523, + "step": 65260 + }, + { + "epoch": 0.1975693551072299, + "grad_norm": 0.1380731463432312, + "learning_rate": 0.00010060874713463787, + "loss": 1.5867977142333984, + "step": 65270 + }, + { + "epoch": 0.19759962465757572, + "grad_norm": 0.11156780272722244, + "learning_rate": 0.00010060495195300046, + "loss": 1.5825700759887695, + "step": 65280 + }, + { + "epoch": 0.19762989420792154, + "grad_norm": 0.11401984840631485, + "learning_rate": 0.00010060115677136308, + "loss": 1.5451061248779296, + "step": 65290 + }, + { + "epoch": 0.19766016375826737, + "grad_norm": 0.1409967839717865, + "learning_rate": 0.00010059736158972569, + "loss": 1.6208576202392577, + "step": 65300 + }, + { + "epoch": 0.1976904333086132, + "grad_norm": 0.12442553788423538, + "learning_rate": 0.0001005935664080883, + "loss": 1.6050895690917968, + "step": 65310 + }, + { + "epoch": 0.19772070285895904, + "grad_norm": 0.13106797635555267, + "learning_rate": 0.0001005897712264509, + "loss": 1.605657958984375, + "step": 65320 + }, + { + "epoch": 0.19775097240930486, + "grad_norm": 0.13574162125587463, + "learning_rate": 0.00010058597604481351, + "loss": 1.5586750030517578, + "step": 65330 + }, + { + "epoch": 0.1977812419596507, + "grad_norm": 0.11836069077253342, + "learning_rate": 0.00010058218086317611, + "loss": 1.603982162475586, + "step": 65340 + }, + { + "epoch": 0.1978115115099965, + "grad_norm": 0.12507040798664093, + "learning_rate": 0.00010057838568153872, + "loss": 1.5767900466918945, + "step": 65350 + }, + { + "epoch": 0.19784178106034234, + "grad_norm": 0.11984199285507202, + "learning_rate": 0.00010057459049990133, + "loss": 1.5734935760498048, + "step": 65360 + }, + { + "epoch": 0.1978720506106882, + "grad_norm": 0.12220878899097443, + "learning_rate": 0.00010057079531826393, + "loss": 1.6164884567260742, + "step": 65370 + }, + { + "epoch": 0.197902320161034, + "grad_norm": 0.1369370073080063, + "learning_rate": 0.00010056700013662654, + "loss": 1.573186492919922, + "step": 65380 + }, + { + "epoch": 0.19793258971137984, + "grad_norm": 0.1255236119031906, + "learning_rate": 0.00010056320495498914, + "loss": 1.6045286178588867, + "step": 65390 + }, + { + "epoch": 0.19796285926172566, + "grad_norm": 0.11672700196504593, + "learning_rate": 0.00010055940977335175, + "loss": 1.594796371459961, + "step": 65400 + }, + { + "epoch": 0.19799312881207148, + "grad_norm": 0.11443793028593063, + "learning_rate": 0.00010055561459171436, + "loss": 1.5767457008361816, + "step": 65410 + }, + { + "epoch": 0.19802339836241734, + "grad_norm": 0.12763157486915588, + "learning_rate": 0.00010055181941007697, + "loss": 1.5610599517822266, + "step": 65420 + }, + { + "epoch": 0.19805366791276316, + "grad_norm": 0.12299130111932755, + "learning_rate": 0.00010054802422843958, + "loss": 1.6265213012695312, + "step": 65430 + }, + { + "epoch": 0.19808393746310898, + "grad_norm": 0.12882260978221893, + "learning_rate": 0.00010054422904680219, + "loss": 1.581360149383545, + "step": 65440 + }, + { + "epoch": 0.1981142070134548, + "grad_norm": 0.12312339246273041, + "learning_rate": 0.00010054043386516479, + "loss": 1.6112455368041991, + "step": 65450 + }, + { + "epoch": 0.19814447656380063, + "grad_norm": 0.13432727754116058, + "learning_rate": 0.0001005366386835274, + "loss": 1.5850795745849608, + "step": 65460 + }, + { + "epoch": 0.19817474611414648, + "grad_norm": 0.12094990164041519, + "learning_rate": 0.00010053284350189, + "loss": 1.5955163955688476, + "step": 65470 + }, + { + "epoch": 0.1982050156644923, + "grad_norm": 0.13451579213142395, + "learning_rate": 0.00010052904832025261, + "loss": 1.5892230987548828, + "step": 65480 + }, + { + "epoch": 0.19823528521483813, + "grad_norm": 0.12673960626125336, + "learning_rate": 0.00010052525313861522, + "loss": 1.57991304397583, + "step": 65490 + }, + { + "epoch": 0.19826555476518395, + "grad_norm": 0.12605954706668854, + "learning_rate": 0.00010052145795697782, + "loss": 1.6263435363769532, + "step": 65500 + }, + { + "epoch": 0.19826555476518395, + "eval_loss": 1.5888832807540894, + "eval_runtime": 28.4722, + "eval_samples_per_second": 17.561, + "eval_steps_per_second": 1.124, + "step": 65500 + }, + { + "epoch": 0.1982958243155298, + "grad_norm": 0.13066624104976654, + "learning_rate": 0.00010051766277534043, + "loss": 1.6011045455932618, + "step": 65510 + }, + { + "epoch": 0.19832609386587563, + "grad_norm": 0.12421156466007233, + "learning_rate": 0.00010051386759370303, + "loss": 1.5773506164550781, + "step": 65520 + }, + { + "epoch": 0.19835636341622145, + "grad_norm": 0.14578381180763245, + "learning_rate": 0.00010051007241206564, + "loss": 1.5894878387451172, + "step": 65530 + }, + { + "epoch": 0.19838663296656728, + "grad_norm": 0.12902185320854187, + "learning_rate": 0.00010050627723042825, + "loss": 1.60792236328125, + "step": 65540 + }, + { + "epoch": 0.1984169025169131, + "grad_norm": 0.13165205717086792, + "learning_rate": 0.00010050248204879085, + "loss": 1.5782466888427735, + "step": 65550 + }, + { + "epoch": 0.19844717206725895, + "grad_norm": 0.12789416313171387, + "learning_rate": 0.00010049868686715347, + "loss": 1.5919075012207031, + "step": 65560 + }, + { + "epoch": 0.19847744161760478, + "grad_norm": 0.14385612308979034, + "learning_rate": 0.00010049489168551606, + "loss": 1.6137212753295898, + "step": 65570 + }, + { + "epoch": 0.1985077111679506, + "grad_norm": 0.12648600339889526, + "learning_rate": 0.00010049109650387868, + "loss": 1.5742396354675292, + "step": 65580 + }, + { + "epoch": 0.19853798071829643, + "grad_norm": 0.11709719151258469, + "learning_rate": 0.00010048730132224128, + "loss": 1.5852840423583985, + "step": 65590 + }, + { + "epoch": 0.19856825026864225, + "grad_norm": 0.1342286318540573, + "learning_rate": 0.0001004835061406039, + "loss": 1.546941375732422, + "step": 65600 + }, + { + "epoch": 0.1985985198189881, + "grad_norm": 0.11979833990335464, + "learning_rate": 0.00010047971095896649, + "loss": 1.588179111480713, + "step": 65610 + }, + { + "epoch": 0.19862878936933392, + "grad_norm": 0.1415265053510666, + "learning_rate": 0.00010047591577732911, + "loss": 1.5853827476501465, + "step": 65620 + }, + { + "epoch": 0.19865905891967975, + "grad_norm": 0.11546498537063599, + "learning_rate": 0.00010047212059569171, + "loss": 1.5904026985168458, + "step": 65630 + }, + { + "epoch": 0.19868932847002557, + "grad_norm": 0.1272253841161728, + "learning_rate": 0.00010046832541405432, + "loss": 1.5716519355773926, + "step": 65640 + }, + { + "epoch": 0.1987195980203714, + "grad_norm": 0.12700042128562927, + "learning_rate": 0.00010046453023241693, + "loss": 1.5850627899169922, + "step": 65650 + }, + { + "epoch": 0.19874986757071725, + "grad_norm": 0.1398238241672516, + "learning_rate": 0.00010046073505077953, + "loss": 1.5733386039733888, + "step": 65660 + }, + { + "epoch": 0.19878013712106307, + "grad_norm": 0.13716813921928406, + "learning_rate": 0.00010045693986914215, + "loss": 1.5874377250671388, + "step": 65670 + }, + { + "epoch": 0.1988104066714089, + "grad_norm": 0.12622977793216705, + "learning_rate": 0.00010045314468750474, + "loss": 1.5530731201171875, + "step": 65680 + }, + { + "epoch": 0.19884067622175472, + "grad_norm": 0.13578225672245026, + "learning_rate": 0.00010044934950586736, + "loss": 1.5988592147827148, + "step": 65690 + }, + { + "epoch": 0.19887094577210054, + "grad_norm": 0.13664181530475616, + "learning_rate": 0.00010044555432422996, + "loss": 1.5920125007629395, + "step": 65700 + }, + { + "epoch": 0.1989012153224464, + "grad_norm": 0.12137371301651001, + "learning_rate": 0.00010044175914259257, + "loss": 1.564378833770752, + "step": 65710 + }, + { + "epoch": 0.19893148487279222, + "grad_norm": 0.12960189580917358, + "learning_rate": 0.00010043796396095517, + "loss": 1.5885485649108886, + "step": 65720 + }, + { + "epoch": 0.19896175442313804, + "grad_norm": 0.12638074159622192, + "learning_rate": 0.00010043416877931779, + "loss": 1.5759294509887696, + "step": 65730 + }, + { + "epoch": 0.19899202397348387, + "grad_norm": 0.12201491743326187, + "learning_rate": 0.00010043037359768038, + "loss": 1.583768367767334, + "step": 65740 + }, + { + "epoch": 0.1990222935238297, + "grad_norm": 0.12050466984510422, + "learning_rate": 0.000100426578416043, + "loss": 1.5927768707275392, + "step": 65750 + }, + { + "epoch": 0.19905256307417554, + "grad_norm": 0.13014090061187744, + "learning_rate": 0.00010042278323440559, + "loss": 1.5748538017272948, + "step": 65760 + }, + { + "epoch": 0.19908283262452137, + "grad_norm": 0.12703648209571838, + "learning_rate": 0.00010041898805276821, + "loss": 1.6002773284912108, + "step": 65770 + }, + { + "epoch": 0.1991131021748672, + "grad_norm": 0.11531852930784225, + "learning_rate": 0.0001004151928711308, + "loss": 1.6333614349365235, + "step": 65780 + }, + { + "epoch": 0.19914337172521301, + "grad_norm": 0.11452748626470566, + "learning_rate": 0.00010041139768949342, + "loss": 1.5818183898925782, + "step": 65790 + }, + { + "epoch": 0.19917364127555884, + "grad_norm": 0.11945368349552155, + "learning_rate": 0.00010040760250785603, + "loss": 1.6028026580810546, + "step": 65800 + }, + { + "epoch": 0.1992039108259047, + "grad_norm": 0.1326635628938675, + "learning_rate": 0.00010040380732621863, + "loss": 1.6090879440307617, + "step": 65810 + }, + { + "epoch": 0.1992341803762505, + "grad_norm": 0.12667442858219147, + "learning_rate": 0.00010040001214458125, + "loss": 1.6054727554321289, + "step": 65820 + }, + { + "epoch": 0.19926444992659634, + "grad_norm": 0.13316656649112701, + "learning_rate": 0.00010039621696294385, + "loss": 1.5997102737426758, + "step": 65830 + }, + { + "epoch": 0.19929471947694216, + "grad_norm": 0.12110400944948196, + "learning_rate": 0.00010039242178130647, + "loss": 1.5688287734985351, + "step": 65840 + }, + { + "epoch": 0.199324989027288, + "grad_norm": 0.11071501672267914, + "learning_rate": 0.00010038862659966906, + "loss": 1.5578035354614257, + "step": 65850 + }, + { + "epoch": 0.19935525857763384, + "grad_norm": 0.12554585933685303, + "learning_rate": 0.00010038483141803168, + "loss": 1.6152074813842774, + "step": 65860 + }, + { + "epoch": 0.19938552812797966, + "grad_norm": 0.13231214880943298, + "learning_rate": 0.00010038103623639427, + "loss": 1.577993106842041, + "step": 65870 + }, + { + "epoch": 0.19941579767832548, + "grad_norm": 0.1336909830570221, + "learning_rate": 0.00010037724105475689, + "loss": 1.5996620178222656, + "step": 65880 + }, + { + "epoch": 0.1994460672286713, + "grad_norm": 0.12078417092561722, + "learning_rate": 0.00010037344587311948, + "loss": 1.587470817565918, + "step": 65890 + }, + { + "epoch": 0.19947633677901716, + "grad_norm": 0.13481909036636353, + "learning_rate": 0.0001003696506914821, + "loss": 1.5890408515930177, + "step": 65900 + }, + { + "epoch": 0.19950660632936298, + "grad_norm": 0.12060734629631042, + "learning_rate": 0.0001003658555098447, + "loss": 1.6143686294555664, + "step": 65910 + }, + { + "epoch": 0.1995368758797088, + "grad_norm": 0.11886761337518692, + "learning_rate": 0.00010036206032820731, + "loss": 1.5987573623657227, + "step": 65920 + }, + { + "epoch": 0.19956714543005463, + "grad_norm": 0.11311328411102295, + "learning_rate": 0.00010035826514656992, + "loss": 1.593307113647461, + "step": 65930 + }, + { + "epoch": 0.19959741498040046, + "grad_norm": 0.12045621126890182, + "learning_rate": 0.00010035446996493253, + "loss": 1.5684109687805177, + "step": 65940 + }, + { + "epoch": 0.1996276845307463, + "grad_norm": 0.12882746756076813, + "learning_rate": 0.00010035067478329513, + "loss": 1.5550026893615723, + "step": 65950 + }, + { + "epoch": 0.19965795408109213, + "grad_norm": 0.13381308317184448, + "learning_rate": 0.00010034687960165774, + "loss": 1.5782365798950195, + "step": 65960 + }, + { + "epoch": 0.19968822363143796, + "grad_norm": 0.13452477753162384, + "learning_rate": 0.00010034308442002034, + "loss": 1.5953907012939452, + "step": 65970 + }, + { + "epoch": 0.19971849318178378, + "grad_norm": 0.1277938038110733, + "learning_rate": 0.00010033928923838295, + "loss": 1.5842308044433593, + "step": 65980 + }, + { + "epoch": 0.1997487627321296, + "grad_norm": 0.12695102393627167, + "learning_rate": 0.00010033549405674555, + "loss": 1.5596638679504395, + "step": 65990 + }, + { + "epoch": 0.19977903228247545, + "grad_norm": 0.13663512468338013, + "learning_rate": 0.00010033169887510816, + "loss": 1.5864893913269043, + "step": 66000 + }, + { + "epoch": 0.19977903228247545, + "eval_loss": 1.5779587030410767, + "eval_runtime": 27.9291, + "eval_samples_per_second": 17.902, + "eval_steps_per_second": 1.146, + "step": 66000 + }, + { + "epoch": 0.19980930183282128, + "grad_norm": 0.1259257048368454, + "learning_rate": 0.00010032790369347077, + "loss": 1.5810007095336913, + "step": 66010 + }, + { + "epoch": 0.1998395713831671, + "grad_norm": 0.1375008076429367, + "learning_rate": 0.00010032410851183337, + "loss": 1.589399528503418, + "step": 66020 + }, + { + "epoch": 0.19986984093351293, + "grad_norm": 0.12620355188846588, + "learning_rate": 0.00010032031333019599, + "loss": 1.5980671882629394, + "step": 66030 + }, + { + "epoch": 0.19990011048385875, + "grad_norm": 0.12475083023309708, + "learning_rate": 0.0001003165181485586, + "loss": 1.6249614715576173, + "step": 66040 + }, + { + "epoch": 0.1999303800342046, + "grad_norm": 0.11128854006528854, + "learning_rate": 0.0001003127229669212, + "loss": 1.5819072723388672, + "step": 66050 + }, + { + "epoch": 0.19996064958455043, + "grad_norm": 0.12040286511182785, + "learning_rate": 0.00010030892778528381, + "loss": 1.576395320892334, + "step": 66060 + }, + { + "epoch": 0.19999091913489625, + "grad_norm": 0.12188372015953064, + "learning_rate": 0.00010030513260364642, + "loss": 1.5430931091308593, + "step": 66070 + }, + { + "epoch": 0.20002118868524207, + "grad_norm": 0.11377669125795364, + "learning_rate": 0.00010030133742200902, + "loss": 1.6034063339233398, + "step": 66080 + }, + { + "epoch": 0.2000514582355879, + "grad_norm": 0.1407872885465622, + "learning_rate": 0.00010029754224037163, + "loss": 1.56905517578125, + "step": 66090 + }, + { + "epoch": 0.20008172778593375, + "grad_norm": 0.11501864343881607, + "learning_rate": 0.00010029374705873423, + "loss": 1.5970268249511719, + "step": 66100 + }, + { + "epoch": 0.20011199733627957, + "grad_norm": 0.11444870382547379, + "learning_rate": 0.00010028995187709684, + "loss": 1.606134796142578, + "step": 66110 + }, + { + "epoch": 0.2001422668866254, + "grad_norm": 0.14419429004192352, + "learning_rate": 0.00010028615669545945, + "loss": 1.589375114440918, + "step": 66120 + }, + { + "epoch": 0.20017253643697122, + "grad_norm": 0.1131029725074768, + "learning_rate": 0.00010028236151382205, + "loss": 1.5945276260375976, + "step": 66130 + }, + { + "epoch": 0.20020280598731705, + "grad_norm": 0.12236148118972778, + "learning_rate": 0.00010027856633218466, + "loss": 1.6239057540893556, + "step": 66140 + }, + { + "epoch": 0.2002330755376629, + "grad_norm": 0.1207837387919426, + "learning_rate": 0.00010027477115054726, + "loss": 1.6012908935546875, + "step": 66150 + }, + { + "epoch": 0.20026334508800872, + "grad_norm": 0.12117024511098862, + "learning_rate": 0.00010027097596890987, + "loss": 1.5676578521728515, + "step": 66160 + }, + { + "epoch": 0.20029361463835454, + "grad_norm": 0.12381454557180405, + "learning_rate": 0.00010026718078727249, + "loss": 1.551201629638672, + "step": 66170 + }, + { + "epoch": 0.20032388418870037, + "grad_norm": 0.11932844668626785, + "learning_rate": 0.00010026338560563508, + "loss": 1.6052331924438477, + "step": 66180 + }, + { + "epoch": 0.20035415373904622, + "grad_norm": 0.12181438505649567, + "learning_rate": 0.0001002595904239977, + "loss": 1.5654664039611816, + "step": 66190 + }, + { + "epoch": 0.20038442328939204, + "grad_norm": 0.12429209053516388, + "learning_rate": 0.0001002557952423603, + "loss": 1.5857666015625, + "step": 66200 + }, + { + "epoch": 0.20041469283973787, + "grad_norm": 0.1308077871799469, + "learning_rate": 0.00010025200006072291, + "loss": 1.588098907470703, + "step": 66210 + }, + { + "epoch": 0.2004449623900837, + "grad_norm": 0.12327854335308075, + "learning_rate": 0.0001002482048790855, + "loss": 1.5865924835205079, + "step": 66220 + }, + { + "epoch": 0.20047523194042952, + "grad_norm": 0.11835359781980515, + "learning_rate": 0.00010024440969744812, + "loss": 1.5772462844848634, + "step": 66230 + }, + { + "epoch": 0.20050550149077537, + "grad_norm": 0.13717935979366302, + "learning_rate": 0.00010024061451581073, + "loss": 1.5912050247192382, + "step": 66240 + }, + { + "epoch": 0.2005357710411212, + "grad_norm": 0.1252102255821228, + "learning_rate": 0.00010023681933417334, + "loss": 1.6071147918701172, + "step": 66250 + }, + { + "epoch": 0.20056604059146702, + "grad_norm": 0.13107092678546906, + "learning_rate": 0.00010023302415253594, + "loss": 1.6249799728393555, + "step": 66260 + }, + { + "epoch": 0.20059631014181284, + "grad_norm": 0.12153491377830505, + "learning_rate": 0.00010022922897089855, + "loss": 1.5802395820617676, + "step": 66270 + }, + { + "epoch": 0.20062657969215866, + "grad_norm": 0.11769791692495346, + "learning_rate": 0.00010022543378926115, + "loss": 1.592136764526367, + "step": 66280 + }, + { + "epoch": 0.20065684924250451, + "grad_norm": 0.12256143987178802, + "learning_rate": 0.00010022163860762376, + "loss": 1.569434356689453, + "step": 66290 + }, + { + "epoch": 0.20068711879285034, + "grad_norm": 0.11483072489500046, + "learning_rate": 0.00010021784342598638, + "loss": 1.5730192184448242, + "step": 66300 + }, + { + "epoch": 0.20071738834319616, + "grad_norm": 0.12244913727045059, + "learning_rate": 0.00010021404824434897, + "loss": 1.575899600982666, + "step": 66310 + }, + { + "epoch": 0.200747657893542, + "grad_norm": 0.11314239352941513, + "learning_rate": 0.00010021025306271159, + "loss": 1.5648093223571777, + "step": 66320 + }, + { + "epoch": 0.2007779274438878, + "grad_norm": 0.13421212136745453, + "learning_rate": 0.00010020645788107418, + "loss": 1.6043415069580078, + "step": 66330 + }, + { + "epoch": 0.20080819699423366, + "grad_norm": 0.12141325324773788, + "learning_rate": 0.0001002026626994368, + "loss": 1.6056306838989258, + "step": 66340 + }, + { + "epoch": 0.20083846654457949, + "grad_norm": 0.11486005038022995, + "learning_rate": 0.0001001988675177994, + "loss": 1.6141517639160157, + "step": 66350 + }, + { + "epoch": 0.2008687360949253, + "grad_norm": 0.14604249596595764, + "learning_rate": 0.00010019507233616202, + "loss": 1.5690881729125976, + "step": 66360 + }, + { + "epoch": 0.20089900564527113, + "grad_norm": 0.11041951924562454, + "learning_rate": 0.00010019127715452461, + "loss": 1.5556264877319337, + "step": 66370 + }, + { + "epoch": 0.20092927519561696, + "grad_norm": 0.12255560606718063, + "learning_rate": 0.00010018748197288723, + "loss": 1.5890013694763183, + "step": 66380 + }, + { + "epoch": 0.2009595447459628, + "grad_norm": 0.11919096112251282, + "learning_rate": 0.00010018368679124982, + "loss": 1.5731903076171876, + "step": 66390 + }, + { + "epoch": 0.20098981429630863, + "grad_norm": 0.12417034804821014, + "learning_rate": 0.00010017989160961244, + "loss": 1.5798364639282227, + "step": 66400 + }, + { + "epoch": 0.20102008384665446, + "grad_norm": 0.12928062677383423, + "learning_rate": 0.00010017609642797505, + "loss": 1.5849977493286134, + "step": 66410 + }, + { + "epoch": 0.20105035339700028, + "grad_norm": 0.12625174224376678, + "learning_rate": 0.00010017230124633765, + "loss": 1.6227386474609375, + "step": 66420 + }, + { + "epoch": 0.2010806229473461, + "grad_norm": 0.13014110922813416, + "learning_rate": 0.00010016850606470027, + "loss": 1.5858055114746095, + "step": 66430 + }, + { + "epoch": 0.20111089249769196, + "grad_norm": 0.1436300277709961, + "learning_rate": 0.00010016471088306286, + "loss": 1.5983762741088867, + "step": 66440 + }, + { + "epoch": 0.20114116204803778, + "grad_norm": 0.1211138591170311, + "learning_rate": 0.00010016091570142548, + "loss": 1.602584457397461, + "step": 66450 + }, + { + "epoch": 0.2011714315983836, + "grad_norm": 0.10990861803293228, + "learning_rate": 0.00010015712051978808, + "loss": 1.5654834747314452, + "step": 66460 + }, + { + "epoch": 0.20120170114872943, + "grad_norm": 0.10677140951156616, + "learning_rate": 0.0001001533253381507, + "loss": 1.595895004272461, + "step": 66470 + }, + { + "epoch": 0.20123197069907525, + "grad_norm": 0.11441663652658463, + "learning_rate": 0.00010014953015651329, + "loss": 1.5800848007202148, + "step": 66480 + }, + { + "epoch": 0.2012622402494211, + "grad_norm": 0.11745967715978622, + "learning_rate": 0.0001001457349748759, + "loss": 1.6042774200439454, + "step": 66490 + }, + { + "epoch": 0.20129250979976693, + "grad_norm": 0.1334230899810791, + "learning_rate": 0.0001001419397932385, + "loss": 1.5649684906005858, + "step": 66500 + }, + { + "epoch": 0.20129250979976693, + "eval_loss": 1.6140656471252441, + "eval_runtime": 28.3203, + "eval_samples_per_second": 17.655, + "eval_steps_per_second": 1.13, + "step": 66500 + }, + { + "epoch": 0.20132277935011275, + "grad_norm": 0.13009554147720337, + "learning_rate": 0.00010013814461160112, + "loss": 1.5737183570861817, + "step": 66510 + }, + { + "epoch": 0.20135304890045858, + "grad_norm": 0.13224627077579498, + "learning_rate": 0.00010013434942996371, + "loss": 1.565621566772461, + "step": 66520 + }, + { + "epoch": 0.20138331845080443, + "grad_norm": 0.13396404683589935, + "learning_rate": 0.00010013055424832633, + "loss": 1.5766357421875, + "step": 66530 + }, + { + "epoch": 0.20141358800115025, + "grad_norm": 0.12062738090753555, + "learning_rate": 0.00010012675906668894, + "loss": 1.5836050033569335, + "step": 66540 + }, + { + "epoch": 0.20144385755149607, + "grad_norm": 0.13173072040081024, + "learning_rate": 0.00010012296388505154, + "loss": 1.5599353790283204, + "step": 66550 + }, + { + "epoch": 0.2014741271018419, + "grad_norm": 0.12455712258815765, + "learning_rate": 0.00010011916870341415, + "loss": 1.5624004364013673, + "step": 66560 + }, + { + "epoch": 0.20150439665218772, + "grad_norm": 0.13374000787734985, + "learning_rate": 0.00010011537352177675, + "loss": 1.571316623687744, + "step": 66570 + }, + { + "epoch": 0.20153466620253357, + "grad_norm": 0.12215961515903473, + "learning_rate": 0.00010011157834013936, + "loss": 1.6311790466308593, + "step": 66580 + }, + { + "epoch": 0.2015649357528794, + "grad_norm": 0.147506445646286, + "learning_rate": 0.00010010778315850197, + "loss": 1.6108577728271485, + "step": 66590 + }, + { + "epoch": 0.20159520530322522, + "grad_norm": 0.1255124807357788, + "learning_rate": 0.00010010398797686457, + "loss": 1.6151283264160157, + "step": 66600 + }, + { + "epoch": 0.20162547485357105, + "grad_norm": 0.12085091322660446, + "learning_rate": 0.00010010019279522718, + "loss": 1.5906183242797851, + "step": 66610 + }, + { + "epoch": 0.20165574440391687, + "grad_norm": 0.12032590061426163, + "learning_rate": 0.00010009639761358978, + "loss": 1.5728940963745117, + "step": 66620 + }, + { + "epoch": 0.20168601395426272, + "grad_norm": 0.12583620846271515, + "learning_rate": 0.00010009260243195239, + "loss": 1.6089603424072265, + "step": 66630 + }, + { + "epoch": 0.20171628350460855, + "grad_norm": 0.1454663872718811, + "learning_rate": 0.00010008880725031501, + "loss": 1.5660374641418457, + "step": 66640 + }, + { + "epoch": 0.20174655305495437, + "grad_norm": 0.11929245293140411, + "learning_rate": 0.0001000850120686776, + "loss": 1.5708215713500977, + "step": 66650 + }, + { + "epoch": 0.2017768226053002, + "grad_norm": 0.1420077532529831, + "learning_rate": 0.00010008121688704022, + "loss": 1.5847421646118165, + "step": 66660 + }, + { + "epoch": 0.20180709215564602, + "grad_norm": 0.11179062724113464, + "learning_rate": 0.00010007742170540283, + "loss": 1.6161516189575196, + "step": 66670 + }, + { + "epoch": 0.20183736170599187, + "grad_norm": 0.12620118260383606, + "learning_rate": 0.00010007362652376543, + "loss": 1.6271841049194335, + "step": 66680 + }, + { + "epoch": 0.2018676312563377, + "grad_norm": 0.11493422836065292, + "learning_rate": 0.00010006983134212804, + "loss": 1.5832923889160155, + "step": 66690 + }, + { + "epoch": 0.20189790080668352, + "grad_norm": 0.12323830276727676, + "learning_rate": 0.00010006603616049064, + "loss": 1.5680156707763673, + "step": 66700 + }, + { + "epoch": 0.20192817035702934, + "grad_norm": 0.146366149187088, + "learning_rate": 0.00010006224097885325, + "loss": 1.5745487213134766, + "step": 66710 + }, + { + "epoch": 0.20195843990737516, + "grad_norm": 0.13310514390468597, + "learning_rate": 0.00010005844579721586, + "loss": 1.5699284553527832, + "step": 66720 + }, + { + "epoch": 0.20198870945772102, + "grad_norm": 0.11507164686918259, + "learning_rate": 0.00010005465061557846, + "loss": 1.6052986145019532, + "step": 66730 + }, + { + "epoch": 0.20201897900806684, + "grad_norm": 0.1309480369091034, + "learning_rate": 0.00010005085543394107, + "loss": 1.5791847229003906, + "step": 66740 + }, + { + "epoch": 0.20204924855841266, + "grad_norm": 0.12106584757566452, + "learning_rate": 0.00010004706025230367, + "loss": 1.5684945106506347, + "step": 66750 + }, + { + "epoch": 0.2020795181087585, + "grad_norm": 0.11885692179203033, + "learning_rate": 0.00010004326507066628, + "loss": 1.5436569213867188, + "step": 66760 + }, + { + "epoch": 0.2021097876591043, + "grad_norm": 0.11898140609264374, + "learning_rate": 0.00010003946988902889, + "loss": 1.5856346130371093, + "step": 66770 + }, + { + "epoch": 0.20214005720945016, + "grad_norm": 0.1160205528140068, + "learning_rate": 0.0001000356747073915, + "loss": 1.6264846801757813, + "step": 66780 + }, + { + "epoch": 0.202170326759796, + "grad_norm": 0.12135482579469681, + "learning_rate": 0.0001000318795257541, + "loss": 1.6152122497558594, + "step": 66790 + }, + { + "epoch": 0.2022005963101418, + "grad_norm": 0.13419239223003387, + "learning_rate": 0.00010002808434411672, + "loss": 1.5985326766967773, + "step": 66800 + }, + { + "epoch": 0.20223086586048764, + "grad_norm": 0.13342629373073578, + "learning_rate": 0.00010002428916247931, + "loss": 1.6010730743408204, + "step": 66810 + }, + { + "epoch": 0.20226113541083346, + "grad_norm": 0.1276291161775589, + "learning_rate": 0.00010002049398084193, + "loss": 1.602388572692871, + "step": 66820 + }, + { + "epoch": 0.2022914049611793, + "grad_norm": 0.11539289355278015, + "learning_rate": 0.00010001669879920452, + "loss": 1.580224609375, + "step": 66830 + }, + { + "epoch": 0.20232167451152513, + "grad_norm": 0.12493269890546799, + "learning_rate": 0.00010001290361756714, + "loss": 1.5719114303588868, + "step": 66840 + }, + { + "epoch": 0.20235194406187096, + "grad_norm": 0.12630309164524078, + "learning_rate": 0.00010000910843592975, + "loss": 1.581464672088623, + "step": 66850 + }, + { + "epoch": 0.20238221361221678, + "grad_norm": 0.14256687462329865, + "learning_rate": 0.00010000531325429235, + "loss": 1.58192138671875, + "step": 66860 + }, + { + "epoch": 0.2024124831625626, + "grad_norm": 0.14254750311374664, + "learning_rate": 0.00010000151807265496, + "loss": 1.584852123260498, + "step": 66870 + }, + { + "epoch": 0.20244275271290846, + "grad_norm": 0.12691238522529602, + "learning_rate": 9.999772289101757e-05, + "loss": 1.5652830123901367, + "step": 66880 + }, + { + "epoch": 0.20247302226325428, + "grad_norm": 0.1279030740261078, + "learning_rate": 9.999392770938017e-05, + "loss": 1.6019886016845704, + "step": 66890 + }, + { + "epoch": 0.2025032918136001, + "grad_norm": 0.1273617148399353, + "learning_rate": 9.999013252774278e-05, + "loss": 1.5683250427246094, + "step": 66900 + }, + { + "epoch": 0.20253356136394593, + "grad_norm": 0.1360340267419815, + "learning_rate": 9.99863373461054e-05, + "loss": 1.602165985107422, + "step": 66910 + }, + { + "epoch": 0.20256383091429178, + "grad_norm": 0.12339860945940018, + "learning_rate": 9.998254216446799e-05, + "loss": 1.5386635780334472, + "step": 66920 + }, + { + "epoch": 0.2025941004646376, + "grad_norm": 0.12620876729488373, + "learning_rate": 9.997874698283061e-05, + "loss": 1.561527633666992, + "step": 66930 + }, + { + "epoch": 0.20262437001498343, + "grad_norm": 0.11348805576562881, + "learning_rate": 9.99749518011932e-05, + "loss": 1.5826828956604004, + "step": 66940 + }, + { + "epoch": 0.20265463956532925, + "grad_norm": 0.14081545174121857, + "learning_rate": 9.997115661955582e-05, + "loss": 1.5953361511230468, + "step": 66950 + }, + { + "epoch": 0.20268490911567508, + "grad_norm": 0.12977994978427887, + "learning_rate": 9.996736143791841e-05, + "loss": 1.5827648162841796, + "step": 66960 + }, + { + "epoch": 0.20271517866602093, + "grad_norm": 0.11871980875730515, + "learning_rate": 9.996356625628103e-05, + "loss": 1.5989147186279298, + "step": 66970 + }, + { + "epoch": 0.20274544821636675, + "grad_norm": 0.11837704479694366, + "learning_rate": 9.995977107464363e-05, + "loss": 1.5737348556518556, + "step": 66980 + }, + { + "epoch": 0.20277571776671258, + "grad_norm": 0.11527372896671295, + "learning_rate": 9.995597589300624e-05, + "loss": 1.5473163604736329, + "step": 66990 + }, + { + "epoch": 0.2028059873170584, + "grad_norm": 0.12030532211065292, + "learning_rate": 9.995218071136884e-05, + "loss": 1.6181941986083985, + "step": 67000 + }, + { + "epoch": 0.2028059873170584, + "eval_loss": 1.6070307493209839, + "eval_runtime": 28.3415, + "eval_samples_per_second": 17.642, + "eval_steps_per_second": 1.129, + "step": 67000 + }, + { + "epoch": 0.20283625686740422, + "grad_norm": 0.14178970456123352, + "learning_rate": 9.994838552973146e-05, + "loss": 1.603539276123047, + "step": 67010 + }, + { + "epoch": 0.20286652641775008, + "grad_norm": 0.1206318587064743, + "learning_rate": 9.994459034809405e-05, + "loss": 1.5779146194458007, + "step": 67020 + }, + { + "epoch": 0.2028967959680959, + "grad_norm": 0.11520363390445709, + "learning_rate": 9.994079516645667e-05, + "loss": 1.5707588195800781, + "step": 67030 + }, + { + "epoch": 0.20292706551844172, + "grad_norm": 0.13364191353321075, + "learning_rate": 9.993699998481929e-05, + "loss": 1.5615121841430664, + "step": 67040 + }, + { + "epoch": 0.20295733506878755, + "grad_norm": 0.11565162241458893, + "learning_rate": 9.993320480318188e-05, + "loss": 1.5974757194519043, + "step": 67050 + }, + { + "epoch": 0.20298760461913337, + "grad_norm": 0.13405993580818176, + "learning_rate": 9.99294096215445e-05, + "loss": 1.583120059967041, + "step": 67060 + }, + { + "epoch": 0.20301787416947922, + "grad_norm": 0.11688563227653503, + "learning_rate": 9.992561443990709e-05, + "loss": 1.571757698059082, + "step": 67070 + }, + { + "epoch": 0.20304814371982505, + "grad_norm": 0.12451184540987015, + "learning_rate": 9.992181925826971e-05, + "loss": 1.5923007011413575, + "step": 67080 + }, + { + "epoch": 0.20307841327017087, + "grad_norm": 0.13446012139320374, + "learning_rate": 9.99180240766323e-05, + "loss": 1.6210569381713866, + "step": 67090 + }, + { + "epoch": 0.2031086828205167, + "grad_norm": 0.12965896725654602, + "learning_rate": 9.991422889499492e-05, + "loss": 1.6370504379272461, + "step": 67100 + }, + { + "epoch": 0.20313895237086252, + "grad_norm": 0.12171418219804764, + "learning_rate": 9.991043371335752e-05, + "loss": 1.5884655952453612, + "step": 67110 + }, + { + "epoch": 0.20316922192120837, + "grad_norm": 0.12899230420589447, + "learning_rate": 9.990663853172014e-05, + "loss": 1.609379196166992, + "step": 67120 + }, + { + "epoch": 0.2031994914715542, + "grad_norm": 0.12659278512001038, + "learning_rate": 9.990284335008273e-05, + "loss": 1.5932645797729492, + "step": 67130 + }, + { + "epoch": 0.20322976102190002, + "grad_norm": 0.12573058903217316, + "learning_rate": 9.989904816844535e-05, + "loss": 1.5592208862304688, + "step": 67140 + }, + { + "epoch": 0.20326003057224584, + "grad_norm": 0.12669043242931366, + "learning_rate": 9.989525298680795e-05, + "loss": 1.6040325164794922, + "step": 67150 + }, + { + "epoch": 0.20329030012259167, + "grad_norm": 0.12192931771278381, + "learning_rate": 9.989145780517056e-05, + "loss": 1.5927166938781738, + "step": 67160 + }, + { + "epoch": 0.20332056967293752, + "grad_norm": 0.12149888277053833, + "learning_rate": 9.988766262353317e-05, + "loss": 1.575474739074707, + "step": 67170 + }, + { + "epoch": 0.20335083922328334, + "grad_norm": 0.12293405085802078, + "learning_rate": 9.988386744189577e-05, + "loss": 1.5691253662109375, + "step": 67180 + }, + { + "epoch": 0.20338110877362917, + "grad_norm": 0.14034141600131989, + "learning_rate": 9.988007226025838e-05, + "loss": 1.5952199935913085, + "step": 67190 + }, + { + "epoch": 0.203411378323975, + "grad_norm": 0.11941781640052795, + "learning_rate": 9.987627707862098e-05, + "loss": 1.6280763626098633, + "step": 67200 + }, + { + "epoch": 0.2034416478743208, + "grad_norm": 0.11698916554450989, + "learning_rate": 9.987248189698359e-05, + "loss": 1.5772832870483398, + "step": 67210 + }, + { + "epoch": 0.20347191742466667, + "grad_norm": 0.11377350240945816, + "learning_rate": 9.98686867153462e-05, + "loss": 1.6022403717041016, + "step": 67220 + }, + { + "epoch": 0.2035021869750125, + "grad_norm": 0.12456964701414108, + "learning_rate": 9.98648915337088e-05, + "loss": 1.5919818878173828, + "step": 67230 + }, + { + "epoch": 0.2035324565253583, + "grad_norm": 0.1250777542591095, + "learning_rate": 9.986109635207141e-05, + "loss": 1.5909913063049317, + "step": 67240 + }, + { + "epoch": 0.20356272607570414, + "grad_norm": 0.12294860184192657, + "learning_rate": 9.985730117043403e-05, + "loss": 1.563812828063965, + "step": 67250 + }, + { + "epoch": 0.20359299562605, + "grad_norm": 0.12415505200624466, + "learning_rate": 9.985350598879662e-05, + "loss": 1.6120662689208984, + "step": 67260 + }, + { + "epoch": 0.2036232651763958, + "grad_norm": 0.1329265683889389, + "learning_rate": 9.984971080715924e-05, + "loss": 1.572641944885254, + "step": 67270 + }, + { + "epoch": 0.20365353472674164, + "grad_norm": 0.1250217854976654, + "learning_rate": 9.984591562552184e-05, + "loss": 1.5804656982421874, + "step": 67280 + }, + { + "epoch": 0.20368380427708746, + "grad_norm": 0.11666800081729889, + "learning_rate": 9.984212044388445e-05, + "loss": 1.5688014030456543, + "step": 67290 + }, + { + "epoch": 0.20371407382743328, + "grad_norm": 0.10855584591627121, + "learning_rate": 9.983832526224706e-05, + "loss": 1.5795538902282715, + "step": 67300 + }, + { + "epoch": 0.20374434337777914, + "grad_norm": 0.13681380450725555, + "learning_rate": 9.983453008060966e-05, + "loss": 1.6106534957885743, + "step": 67310 + }, + { + "epoch": 0.20377461292812496, + "grad_norm": 0.1278146505355835, + "learning_rate": 9.983073489897227e-05, + "loss": 1.5886699676513671, + "step": 67320 + }, + { + "epoch": 0.20380488247847078, + "grad_norm": 0.10910592973232269, + "learning_rate": 9.982693971733487e-05, + "loss": 1.6001548767089844, + "step": 67330 + }, + { + "epoch": 0.2038351520288166, + "grad_norm": 0.1290665566921234, + "learning_rate": 9.982314453569748e-05, + "loss": 1.606821060180664, + "step": 67340 + }, + { + "epoch": 0.20386542157916243, + "grad_norm": 0.12578827142715454, + "learning_rate": 9.981934935406009e-05, + "loss": 1.6094579696655273, + "step": 67350 + }, + { + "epoch": 0.20389569112950828, + "grad_norm": 0.10686299204826355, + "learning_rate": 9.981555417242269e-05, + "loss": 1.6283554077148437, + "step": 67360 + }, + { + "epoch": 0.2039259606798541, + "grad_norm": 0.12364662438631058, + "learning_rate": 9.98117589907853e-05, + "loss": 1.5919575691223145, + "step": 67370 + }, + { + "epoch": 0.20395623023019993, + "grad_norm": 0.12107453495264053, + "learning_rate": 9.98079638091479e-05, + "loss": 1.6084640502929688, + "step": 67380 + }, + { + "epoch": 0.20398649978054575, + "grad_norm": 0.1128712147474289, + "learning_rate": 9.980416862751051e-05, + "loss": 1.5958008766174316, + "step": 67390 + }, + { + "epoch": 0.20401676933089158, + "grad_norm": 0.11346789449453354, + "learning_rate": 9.980037344587312e-05, + "loss": 1.6247514724731444, + "step": 67400 + }, + { + "epoch": 0.20404703888123743, + "grad_norm": 0.13196450471878052, + "learning_rate": 9.979657826423574e-05, + "loss": 1.5827783584594726, + "step": 67410 + }, + { + "epoch": 0.20407730843158325, + "grad_norm": 0.11815398186445236, + "learning_rate": 9.979278308259833e-05, + "loss": 1.5914342880249024, + "step": 67420 + }, + { + "epoch": 0.20410757798192908, + "grad_norm": 0.1181815043091774, + "learning_rate": 9.978898790096095e-05, + "loss": 1.5828726768493653, + "step": 67430 + }, + { + "epoch": 0.2041378475322749, + "grad_norm": 0.13296210765838623, + "learning_rate": 9.978519271932354e-05, + "loss": 1.5551891326904297, + "step": 67440 + }, + { + "epoch": 0.20416811708262073, + "grad_norm": 0.12417884171009064, + "learning_rate": 9.978139753768616e-05, + "loss": 1.555596160888672, + "step": 67450 + }, + { + "epoch": 0.20419838663296658, + "grad_norm": 0.11776405572891235, + "learning_rate": 9.977760235604876e-05, + "loss": 1.6012313842773438, + "step": 67460 + }, + { + "epoch": 0.2042286561833124, + "grad_norm": 0.1525101363658905, + "learning_rate": 9.977380717441137e-05, + "loss": 1.5923602104187011, + "step": 67470 + }, + { + "epoch": 0.20425892573365823, + "grad_norm": 0.1140986979007721, + "learning_rate": 9.977001199277398e-05, + "loss": 1.57938871383667, + "step": 67480 + }, + { + "epoch": 0.20428919528400405, + "grad_norm": 0.14486393332481384, + "learning_rate": 9.976621681113658e-05, + "loss": 1.563258743286133, + "step": 67490 + }, + { + "epoch": 0.20431946483434987, + "grad_norm": 0.12856829166412354, + "learning_rate": 9.976242162949919e-05, + "loss": 1.5770288467407227, + "step": 67500 + }, + { + "epoch": 0.20431946483434987, + "eval_loss": 1.5809648036956787, + "eval_runtime": 28.0623, + "eval_samples_per_second": 17.817, + "eval_steps_per_second": 1.14, + "step": 67500 + }, + { + "epoch": 0.20434973438469572, + "grad_norm": 0.1249944418668747, + "learning_rate": 9.97586264478618e-05, + "loss": 1.6173629760742188, + "step": 67510 + }, + { + "epoch": 0.20438000393504155, + "grad_norm": 0.11799319833517075, + "learning_rate": 9.975483126622441e-05, + "loss": 1.571690559387207, + "step": 67520 + }, + { + "epoch": 0.20441027348538737, + "grad_norm": 0.1267448365688324, + "learning_rate": 9.9751036084587e-05, + "loss": 1.5829858779907227, + "step": 67530 + }, + { + "epoch": 0.2044405430357332, + "grad_norm": 0.13037052750587463, + "learning_rate": 9.974724090294963e-05, + "loss": 1.583774471282959, + "step": 67540 + }, + { + "epoch": 0.20447081258607902, + "grad_norm": 0.11165033280849457, + "learning_rate": 9.974344572131222e-05, + "loss": 1.5270071029663086, + "step": 67550 + }, + { + "epoch": 0.20450108213642487, + "grad_norm": 0.1190466582775116, + "learning_rate": 9.973965053967484e-05, + "loss": 1.595273208618164, + "step": 67560 + }, + { + "epoch": 0.2045313516867707, + "grad_norm": 0.12003181129693985, + "learning_rate": 9.973585535803743e-05, + "loss": 1.5944872856140138, + "step": 67570 + }, + { + "epoch": 0.20456162123711652, + "grad_norm": 0.11105260998010635, + "learning_rate": 9.973206017640005e-05, + "loss": 1.608905029296875, + "step": 67580 + }, + { + "epoch": 0.20459189078746234, + "grad_norm": 0.12191661447286606, + "learning_rate": 9.972826499476264e-05, + "loss": 1.581214427947998, + "step": 67590 + }, + { + "epoch": 0.2046221603378082, + "grad_norm": 0.1205679178237915, + "learning_rate": 9.972446981312526e-05, + "loss": 1.5562273025512696, + "step": 67600 + }, + { + "epoch": 0.20465242988815402, + "grad_norm": 0.11588488519191742, + "learning_rate": 9.972067463148785e-05, + "loss": 1.5600281715393067, + "step": 67610 + }, + { + "epoch": 0.20468269943849984, + "grad_norm": 0.12991471588611603, + "learning_rate": 9.971687944985047e-05, + "loss": 1.5585429191589355, + "step": 67620 + }, + { + "epoch": 0.20471296898884567, + "grad_norm": 0.14346547424793243, + "learning_rate": 9.971308426821307e-05, + "loss": 1.5783327102661133, + "step": 67630 + }, + { + "epoch": 0.2047432385391915, + "grad_norm": 0.14226031303405762, + "learning_rate": 9.970928908657569e-05, + "loss": 1.5833579063415528, + "step": 67640 + }, + { + "epoch": 0.20477350808953734, + "grad_norm": 0.11026635020971298, + "learning_rate": 9.97054939049383e-05, + "loss": 1.6087379455566406, + "step": 67650 + }, + { + "epoch": 0.20480377763988317, + "grad_norm": 0.11944912374019623, + "learning_rate": 9.97016987233009e-05, + "loss": 1.6061698913574218, + "step": 67660 + }, + { + "epoch": 0.204834047190229, + "grad_norm": 0.1229916512966156, + "learning_rate": 9.969790354166352e-05, + "loss": 1.609181022644043, + "step": 67670 + }, + { + "epoch": 0.20486431674057481, + "grad_norm": 0.1318999081850052, + "learning_rate": 9.969410836002611e-05, + "loss": 1.5812320709228516, + "step": 67680 + }, + { + "epoch": 0.20489458629092064, + "grad_norm": 0.11200767755508423, + "learning_rate": 9.969031317838873e-05, + "loss": 1.5918455123901367, + "step": 67690 + }, + { + "epoch": 0.2049248558412665, + "grad_norm": 0.13475938141345978, + "learning_rate": 9.968651799675132e-05, + "loss": 1.5949127197265625, + "step": 67700 + }, + { + "epoch": 0.20495512539161231, + "grad_norm": 0.11184771358966827, + "learning_rate": 9.968272281511394e-05, + "loss": 1.589468765258789, + "step": 67710 + }, + { + "epoch": 0.20498539494195814, + "grad_norm": 0.14050571620464325, + "learning_rate": 9.967892763347653e-05, + "loss": 1.5547756195068358, + "step": 67720 + }, + { + "epoch": 0.20501566449230396, + "grad_norm": 0.13939031958580017, + "learning_rate": 9.967513245183915e-05, + "loss": 1.6046440124511718, + "step": 67730 + }, + { + "epoch": 0.20504593404264979, + "grad_norm": 0.12886017560958862, + "learning_rate": 9.967133727020175e-05, + "loss": 1.5975757598876954, + "step": 67740 + }, + { + "epoch": 0.20507620359299564, + "grad_norm": 0.11480263620615005, + "learning_rate": 9.966754208856436e-05, + "loss": 1.5844999313354493, + "step": 67750 + }, + { + "epoch": 0.20510647314334146, + "grad_norm": 0.13219691812992096, + "learning_rate": 9.966374690692696e-05, + "loss": 1.5977218627929688, + "step": 67760 + }, + { + "epoch": 0.20513674269368729, + "grad_norm": 0.14658401906490326, + "learning_rate": 9.965995172528958e-05, + "loss": 1.6137666702270508, + "step": 67770 + }, + { + "epoch": 0.2051670122440331, + "grad_norm": 0.12076209485530853, + "learning_rate": 9.965615654365218e-05, + "loss": 1.6119937896728516, + "step": 67780 + }, + { + "epoch": 0.20519728179437893, + "grad_norm": 0.132217675447464, + "learning_rate": 9.965236136201479e-05, + "loss": 1.5683769226074218, + "step": 67790 + }, + { + "epoch": 0.20522755134472478, + "grad_norm": 0.1366778463125229, + "learning_rate": 9.96485661803774e-05, + "loss": 1.6091037750244142, + "step": 67800 + }, + { + "epoch": 0.2052578208950706, + "grad_norm": 0.12729401886463165, + "learning_rate": 9.964477099874e-05, + "loss": 1.6217750549316405, + "step": 67810 + }, + { + "epoch": 0.20528809044541643, + "grad_norm": 0.1290488839149475, + "learning_rate": 9.96409758171026e-05, + "loss": 1.6083642959594726, + "step": 67820 + }, + { + "epoch": 0.20531835999576226, + "grad_norm": 0.1378091722726822, + "learning_rate": 9.963718063546521e-05, + "loss": 1.6344249725341797, + "step": 67830 + }, + { + "epoch": 0.20534862954610808, + "grad_norm": 0.12199927121400833, + "learning_rate": 9.963338545382782e-05, + "loss": 1.5964766502380372, + "step": 67840 + }, + { + "epoch": 0.20537889909645393, + "grad_norm": 0.1315186768770218, + "learning_rate": 9.962959027219042e-05, + "loss": 1.5562679290771484, + "step": 67850 + }, + { + "epoch": 0.20540916864679976, + "grad_norm": 0.11023250967264175, + "learning_rate": 9.962579509055304e-05, + "loss": 1.6258953094482422, + "step": 67860 + }, + { + "epoch": 0.20543943819714558, + "grad_norm": 0.1330016404390335, + "learning_rate": 9.962199990891564e-05, + "loss": 1.6139575958251953, + "step": 67870 + }, + { + "epoch": 0.2054697077474914, + "grad_norm": 0.13064152002334595, + "learning_rate": 9.961820472727826e-05, + "loss": 1.5608988761901856, + "step": 67880 + }, + { + "epoch": 0.20549997729783723, + "grad_norm": 0.12094807624816895, + "learning_rate": 9.961440954564086e-05, + "loss": 1.6070459365844727, + "step": 67890 + }, + { + "epoch": 0.20553024684818308, + "grad_norm": 0.14457592368125916, + "learning_rate": 9.961061436400347e-05, + "loss": 1.5702890396118163, + "step": 67900 + }, + { + "epoch": 0.2055605163985289, + "grad_norm": 0.11303800344467163, + "learning_rate": 9.960681918236607e-05, + "loss": 1.6453086853027343, + "step": 67910 + }, + { + "epoch": 0.20559078594887473, + "grad_norm": 0.12439484894275665, + "learning_rate": 9.960302400072868e-05, + "loss": 1.5772769927978516, + "step": 67920 + }, + { + "epoch": 0.20562105549922055, + "grad_norm": 0.1141216978430748, + "learning_rate": 9.959922881909129e-05, + "loss": 1.5853413581848144, + "step": 67930 + }, + { + "epoch": 0.2056513250495664, + "grad_norm": 0.13918200135231018, + "learning_rate": 9.959543363745389e-05, + "loss": 1.553173828125, + "step": 67940 + }, + { + "epoch": 0.20568159459991223, + "grad_norm": 0.12632867693901062, + "learning_rate": 9.95916384558165e-05, + "loss": 1.587938117980957, + "step": 67950 + }, + { + "epoch": 0.20571186415025805, + "grad_norm": 0.1253950446844101, + "learning_rate": 9.95878432741791e-05, + "loss": 1.5754122734069824, + "step": 67960 + }, + { + "epoch": 0.20574213370060387, + "grad_norm": 0.11887699365615845, + "learning_rate": 9.958404809254171e-05, + "loss": 1.569573974609375, + "step": 67970 + }, + { + "epoch": 0.2057724032509497, + "grad_norm": 0.1242927685379982, + "learning_rate": 9.958025291090432e-05, + "loss": 1.6001062393188477, + "step": 67980 + }, + { + "epoch": 0.20580267280129555, + "grad_norm": 0.12050468474626541, + "learning_rate": 9.957645772926692e-05, + "loss": 1.5678646087646484, + "step": 67990 + }, + { + "epoch": 0.20583294235164137, + "grad_norm": 0.12103145569562912, + "learning_rate": 9.957266254762953e-05, + "loss": 1.5833086013793944, + "step": 68000 + }, + { + "epoch": 0.20583294235164137, + "eval_loss": 1.595700740814209, + "eval_runtime": 27.8816, + "eval_samples_per_second": 17.933, + "eval_steps_per_second": 1.148, + "step": 68000 + }, + { + "epoch": 0.2058632119019872, + "grad_norm": 0.11665388196706772, + "learning_rate": 9.956886736599213e-05, + "loss": 1.5828914642333984, + "step": 68010 + }, + { + "epoch": 0.20589348145233302, + "grad_norm": 0.11790731549263, + "learning_rate": 9.956507218435475e-05, + "loss": 1.6084260940551758, + "step": 68020 + }, + { + "epoch": 0.20592375100267885, + "grad_norm": 0.12123039364814758, + "learning_rate": 9.956127700271734e-05, + "loss": 1.5929887771606446, + "step": 68030 + }, + { + "epoch": 0.2059540205530247, + "grad_norm": 0.1255231350660324, + "learning_rate": 9.955748182107996e-05, + "loss": 1.5619709968566895, + "step": 68040 + }, + { + "epoch": 0.20598429010337052, + "grad_norm": 0.13658639788627625, + "learning_rate": 9.955368663944256e-05, + "loss": 1.5824650764465331, + "step": 68050 + }, + { + "epoch": 0.20601455965371634, + "grad_norm": 0.13275566697120667, + "learning_rate": 9.954989145780518e-05, + "loss": 1.5809268951416016, + "step": 68060 + }, + { + "epoch": 0.20604482920406217, + "grad_norm": 0.13609831035137177, + "learning_rate": 9.954609627616778e-05, + "loss": 1.5837894439697267, + "step": 68070 + }, + { + "epoch": 0.206075098754408, + "grad_norm": 0.125677227973938, + "learning_rate": 9.954230109453039e-05, + "loss": 1.610833740234375, + "step": 68080 + }, + { + "epoch": 0.20610536830475384, + "grad_norm": 0.12598851323127747, + "learning_rate": 9.9538505912893e-05, + "loss": 1.5889806747436523, + "step": 68090 + }, + { + "epoch": 0.20613563785509967, + "grad_norm": 0.13464480638504028, + "learning_rate": 9.95347107312556e-05, + "loss": 1.5735466003417968, + "step": 68100 + }, + { + "epoch": 0.2061659074054455, + "grad_norm": 0.10948240756988525, + "learning_rate": 9.95309155496182e-05, + "loss": 1.5857964515686036, + "step": 68110 + }, + { + "epoch": 0.20619617695579132, + "grad_norm": 0.11449004709720612, + "learning_rate": 9.952712036798081e-05, + "loss": 1.5624800682067872, + "step": 68120 + }, + { + "epoch": 0.20622644650613714, + "grad_norm": 0.11606673896312714, + "learning_rate": 9.952332518634343e-05, + "loss": 1.613525390625, + "step": 68130 + }, + { + "epoch": 0.206256716056483, + "grad_norm": 0.1169581189751625, + "learning_rate": 9.951953000470602e-05, + "loss": 1.5299226760864257, + "step": 68140 + }, + { + "epoch": 0.20628698560682882, + "grad_norm": 0.12316492944955826, + "learning_rate": 9.951573482306864e-05, + "loss": 1.59944486618042, + "step": 68150 + }, + { + "epoch": 0.20631725515717464, + "grad_norm": 0.12992726266384125, + "learning_rate": 9.951193964143124e-05, + "loss": 1.5734661102294922, + "step": 68160 + }, + { + "epoch": 0.20634752470752046, + "grad_norm": 0.12147433310747147, + "learning_rate": 9.950814445979386e-05, + "loss": 1.6034770965576173, + "step": 68170 + }, + { + "epoch": 0.2063777942578663, + "grad_norm": 0.13428151607513428, + "learning_rate": 9.950434927815645e-05, + "loss": 1.5944530487060546, + "step": 68180 + }, + { + "epoch": 0.20640806380821214, + "grad_norm": 0.11087863147258759, + "learning_rate": 9.950055409651907e-05, + "loss": 1.6588993072509766, + "step": 68190 + }, + { + "epoch": 0.20643833335855796, + "grad_norm": 0.14908955991268158, + "learning_rate": 9.949675891488166e-05, + "loss": 1.6107248306274413, + "step": 68200 + }, + { + "epoch": 0.2064686029089038, + "grad_norm": 0.122990183532238, + "learning_rate": 9.949296373324428e-05, + "loss": 1.6007434844970703, + "step": 68210 + }, + { + "epoch": 0.2064988724592496, + "grad_norm": 0.1216944009065628, + "learning_rate": 9.948916855160687e-05, + "loss": 1.6437889099121095, + "step": 68220 + }, + { + "epoch": 0.20652914200959543, + "grad_norm": 0.11709506809711456, + "learning_rate": 9.948537336996949e-05, + "loss": 1.5705185890197755, + "step": 68230 + }, + { + "epoch": 0.20655941155994129, + "grad_norm": 0.12590037286281586, + "learning_rate": 9.948157818833208e-05, + "loss": 1.5907325744628906, + "step": 68240 + }, + { + "epoch": 0.2065896811102871, + "grad_norm": 0.12077952176332474, + "learning_rate": 9.94777830066947e-05, + "loss": 1.619333839416504, + "step": 68250 + }, + { + "epoch": 0.20661995066063293, + "grad_norm": 0.12643872201442719, + "learning_rate": 9.947398782505732e-05, + "loss": 1.5952630043029785, + "step": 68260 + }, + { + "epoch": 0.20665022021097876, + "grad_norm": 0.11737287789583206, + "learning_rate": 9.947019264341991e-05, + "loss": 1.556852436065674, + "step": 68270 + }, + { + "epoch": 0.20668048976132458, + "grad_norm": 0.12133125960826874, + "learning_rate": 9.946639746178253e-05, + "loss": 1.6050107955932618, + "step": 68280 + }, + { + "epoch": 0.20671075931167043, + "grad_norm": 0.12137721478939056, + "learning_rate": 9.946260228014513e-05, + "loss": 1.5582468032836914, + "step": 68290 + }, + { + "epoch": 0.20674102886201626, + "grad_norm": 0.12158888578414917, + "learning_rate": 9.945880709850775e-05, + "loss": 1.6310453414916992, + "step": 68300 + }, + { + "epoch": 0.20677129841236208, + "grad_norm": 0.1343720555305481, + "learning_rate": 9.945501191687034e-05, + "loss": 1.5780573844909669, + "step": 68310 + }, + { + "epoch": 0.2068015679627079, + "grad_norm": 0.12000841647386551, + "learning_rate": 9.945121673523296e-05, + "loss": 1.5810616493225098, + "step": 68320 + }, + { + "epoch": 0.20683183751305376, + "grad_norm": 0.13210313022136688, + "learning_rate": 9.944742155359555e-05, + "loss": 1.6143175125122071, + "step": 68330 + }, + { + "epoch": 0.20686210706339958, + "grad_norm": 0.12779925763607025, + "learning_rate": 9.944362637195817e-05, + "loss": 1.5725632667541505, + "step": 68340 + }, + { + "epoch": 0.2068923766137454, + "grad_norm": 0.13172782957553864, + "learning_rate": 9.943983119032076e-05, + "loss": 1.6105873107910156, + "step": 68350 + }, + { + "epoch": 0.20692264616409123, + "grad_norm": 0.12437400966882706, + "learning_rate": 9.943603600868338e-05, + "loss": 1.5844245910644532, + "step": 68360 + }, + { + "epoch": 0.20695291571443705, + "grad_norm": 0.11660853028297424, + "learning_rate": 9.943224082704597e-05, + "loss": 1.6314233779907226, + "step": 68370 + }, + { + "epoch": 0.2069831852647829, + "grad_norm": 0.12864616513252258, + "learning_rate": 9.94284456454086e-05, + "loss": 1.5930883407592773, + "step": 68380 + }, + { + "epoch": 0.20701345481512873, + "grad_norm": 0.12098215520381927, + "learning_rate": 9.94246504637712e-05, + "loss": 1.5801214218139648, + "step": 68390 + }, + { + "epoch": 0.20704372436547455, + "grad_norm": 0.11551102250814438, + "learning_rate": 9.94208552821338e-05, + "loss": 1.569257640838623, + "step": 68400 + }, + { + "epoch": 0.20707399391582038, + "grad_norm": 0.11660102754831314, + "learning_rate": 9.941706010049641e-05, + "loss": 1.6268585205078125, + "step": 68410 + }, + { + "epoch": 0.2071042634661662, + "grad_norm": 0.11199913173913956, + "learning_rate": 9.941326491885902e-05, + "loss": 1.632085418701172, + "step": 68420 + }, + { + "epoch": 0.20713453301651205, + "grad_norm": 0.12444627285003662, + "learning_rate": 9.940946973722162e-05, + "loss": 1.5681747436523437, + "step": 68430 + }, + { + "epoch": 0.20716480256685788, + "grad_norm": 0.11807026714086533, + "learning_rate": 9.940567455558423e-05, + "loss": 1.5549107551574708, + "step": 68440 + }, + { + "epoch": 0.2071950721172037, + "grad_norm": 0.12548914551734924, + "learning_rate": 9.940187937394684e-05, + "loss": 1.5464052200317382, + "step": 68450 + }, + { + "epoch": 0.20722534166754952, + "grad_norm": 0.11786607652902603, + "learning_rate": 9.939808419230944e-05, + "loss": 1.5925620079040528, + "step": 68460 + }, + { + "epoch": 0.20725561121789535, + "grad_norm": 0.11811333894729614, + "learning_rate": 9.939428901067206e-05, + "loss": 1.56895112991333, + "step": 68470 + }, + { + "epoch": 0.2072858807682412, + "grad_norm": 0.12344080209732056, + "learning_rate": 9.939049382903465e-05, + "loss": 1.584945011138916, + "step": 68480 + }, + { + "epoch": 0.20731615031858702, + "grad_norm": 0.12098688632249832, + "learning_rate": 9.938669864739727e-05, + "loss": 1.5757085800170898, + "step": 68490 + }, + { + "epoch": 0.20734641986893285, + "grad_norm": 0.12136692553758621, + "learning_rate": 9.938290346575987e-05, + "loss": 1.5430353164672852, + "step": 68500 + }, + { + "epoch": 0.20734641986893285, + "eval_loss": 1.613083004951477, + "eval_runtime": 27.8121, + "eval_samples_per_second": 17.978, + "eval_steps_per_second": 1.151, + "step": 68500 + }, + { + "epoch": 0.20737668941927867, + "grad_norm": 0.11792799830436707, + "learning_rate": 9.937910828412248e-05, + "loss": 1.6018524169921875, + "step": 68510 + }, + { + "epoch": 0.2074069589696245, + "grad_norm": 0.1267991065979004, + "learning_rate": 9.937531310248509e-05, + "loss": 1.5996603965759277, + "step": 68520 + }, + { + "epoch": 0.20743722851997035, + "grad_norm": 0.12987899780273438, + "learning_rate": 9.93715179208477e-05, + "loss": 1.6239561080932616, + "step": 68530 + }, + { + "epoch": 0.20746749807031617, + "grad_norm": 0.13510355353355408, + "learning_rate": 9.93677227392103e-05, + "loss": 1.5577189445495605, + "step": 68540 + }, + { + "epoch": 0.207497767620662, + "grad_norm": 0.12193109095096588, + "learning_rate": 9.936392755757291e-05, + "loss": 1.5699321746826171, + "step": 68550 + }, + { + "epoch": 0.20752803717100782, + "grad_norm": 0.12393435090780258, + "learning_rate": 9.936013237593551e-05, + "loss": 1.5723939895629884, + "step": 68560 + }, + { + "epoch": 0.20755830672135364, + "grad_norm": 0.11775383353233337, + "learning_rate": 9.935633719429812e-05, + "loss": 1.569468116760254, + "step": 68570 + }, + { + "epoch": 0.2075885762716995, + "grad_norm": 0.1222185343503952, + "learning_rate": 9.935254201266073e-05, + "loss": 1.5737194061279296, + "step": 68580 + }, + { + "epoch": 0.20761884582204532, + "grad_norm": 0.1305946558713913, + "learning_rate": 9.934874683102333e-05, + "loss": 1.6383037567138672, + "step": 68590 + }, + { + "epoch": 0.20764911537239114, + "grad_norm": 0.13766174018383026, + "learning_rate": 9.934495164938594e-05, + "loss": 1.5549593925476075, + "step": 68600 + }, + { + "epoch": 0.20767938492273696, + "grad_norm": 0.12657611072063446, + "learning_rate": 9.934115646774854e-05, + "loss": 1.5668181419372558, + "step": 68610 + }, + { + "epoch": 0.2077096544730828, + "grad_norm": 0.11596374958753586, + "learning_rate": 9.933736128611115e-05, + "loss": 1.5924402236938477, + "step": 68620 + }, + { + "epoch": 0.20773992402342864, + "grad_norm": 0.1305558830499649, + "learning_rate": 9.933356610447377e-05, + "loss": 1.5607992172241212, + "step": 68630 + }, + { + "epoch": 0.20777019357377446, + "grad_norm": 0.1189022809267044, + "learning_rate": 9.932977092283636e-05, + "loss": 1.5634818077087402, + "step": 68640 + }, + { + "epoch": 0.2078004631241203, + "grad_norm": 0.12697912752628326, + "learning_rate": 9.932597574119898e-05, + "loss": 1.5845913887023926, + "step": 68650 + }, + { + "epoch": 0.2078307326744661, + "grad_norm": 0.1260496973991394, + "learning_rate": 9.932218055956159e-05, + "loss": 1.5889249801635743, + "step": 68660 + }, + { + "epoch": 0.20786100222481196, + "grad_norm": 0.11610449850559235, + "learning_rate": 9.93183853779242e-05, + "loss": 1.606698989868164, + "step": 68670 + }, + { + "epoch": 0.2078912717751578, + "grad_norm": 0.1319102793931961, + "learning_rate": 9.93145901962868e-05, + "loss": 1.5945380210876465, + "step": 68680 + }, + { + "epoch": 0.2079215413255036, + "grad_norm": 0.12602968513965607, + "learning_rate": 9.93107950146494e-05, + "loss": 1.5877894401550292, + "step": 68690 + }, + { + "epoch": 0.20795181087584944, + "grad_norm": 0.11021658033132553, + "learning_rate": 9.930699983301201e-05, + "loss": 1.5699102401733398, + "step": 68700 + }, + { + "epoch": 0.20798208042619526, + "grad_norm": 0.13207903504371643, + "learning_rate": 9.930320465137462e-05, + "loss": 1.5570122718811035, + "step": 68710 + }, + { + "epoch": 0.2080123499765411, + "grad_norm": 0.13696694374084473, + "learning_rate": 9.929940946973722e-05, + "loss": 1.6084197998046874, + "step": 68720 + }, + { + "epoch": 0.20804261952688693, + "grad_norm": 0.12192633748054504, + "learning_rate": 9.929561428809983e-05, + "loss": 1.570454216003418, + "step": 68730 + }, + { + "epoch": 0.20807288907723276, + "grad_norm": 0.1221570372581482, + "learning_rate": 9.929181910646244e-05, + "loss": 1.5968727111816405, + "step": 68740 + }, + { + "epoch": 0.20810315862757858, + "grad_norm": 0.11936749517917633, + "learning_rate": 9.928802392482504e-05, + "loss": 1.6197790145874023, + "step": 68750 + }, + { + "epoch": 0.2081334281779244, + "grad_norm": 0.1307719349861145, + "learning_rate": 9.928422874318766e-05, + "loss": 1.5793448448181153, + "step": 68760 + }, + { + "epoch": 0.20816369772827026, + "grad_norm": 0.13079433143138885, + "learning_rate": 9.928043356155025e-05, + "loss": 1.616847610473633, + "step": 68770 + }, + { + "epoch": 0.20819396727861608, + "grad_norm": 0.12345418334007263, + "learning_rate": 9.927663837991287e-05, + "loss": 1.6362035751342774, + "step": 68780 + }, + { + "epoch": 0.2082242368289619, + "grad_norm": 0.12110183387994766, + "learning_rate": 9.927284319827546e-05, + "loss": 1.5452292442321778, + "step": 68790 + }, + { + "epoch": 0.20825450637930773, + "grad_norm": 0.1378924548625946, + "learning_rate": 9.926904801663808e-05, + "loss": 1.6018768310546876, + "step": 68800 + }, + { + "epoch": 0.20828477592965355, + "grad_norm": 0.12114457786083221, + "learning_rate": 9.926525283500068e-05, + "loss": 1.572123908996582, + "step": 68810 + }, + { + "epoch": 0.2083150454799994, + "grad_norm": 0.12090972065925598, + "learning_rate": 9.92614576533633e-05, + "loss": 1.5839554786682128, + "step": 68820 + }, + { + "epoch": 0.20834531503034523, + "grad_norm": 0.10991793125867844, + "learning_rate": 9.925766247172589e-05, + "loss": 1.6275459289550782, + "step": 68830 + }, + { + "epoch": 0.20837558458069105, + "grad_norm": 0.1236192062497139, + "learning_rate": 9.925386729008851e-05, + "loss": 1.5770343780517577, + "step": 68840 + }, + { + "epoch": 0.20840585413103688, + "grad_norm": 0.13319599628448486, + "learning_rate": 9.92500721084511e-05, + "loss": 1.5953210830688476, + "step": 68850 + }, + { + "epoch": 0.2084361236813827, + "grad_norm": 0.12103968858718872, + "learning_rate": 9.924627692681372e-05, + "loss": 1.593271827697754, + "step": 68860 + }, + { + "epoch": 0.20846639323172855, + "grad_norm": 0.11041425168514252, + "learning_rate": 9.924248174517634e-05, + "loss": 1.5891952514648438, + "step": 68870 + }, + { + "epoch": 0.20849666278207438, + "grad_norm": 0.1377655416727066, + "learning_rate": 9.923868656353893e-05, + "loss": 1.547957420349121, + "step": 68880 + }, + { + "epoch": 0.2085269323324202, + "grad_norm": 0.12508633732795715, + "learning_rate": 9.923489138190155e-05, + "loss": 1.5914083480834962, + "step": 68890 + }, + { + "epoch": 0.20855720188276602, + "grad_norm": 0.12461826950311661, + "learning_rate": 9.923109620026414e-05, + "loss": 1.576070213317871, + "step": 68900 + }, + { + "epoch": 0.20858747143311185, + "grad_norm": 0.10627574473619461, + "learning_rate": 9.922730101862676e-05, + "loss": 1.5846839904785157, + "step": 68910 + }, + { + "epoch": 0.2086177409834577, + "grad_norm": 0.15518760681152344, + "learning_rate": 9.922350583698936e-05, + "loss": 1.5582719802856446, + "step": 68920 + }, + { + "epoch": 0.20864801053380352, + "grad_norm": 0.12471217662096024, + "learning_rate": 9.921971065535198e-05, + "loss": 1.5692583084106446, + "step": 68930 + }, + { + "epoch": 0.20867828008414935, + "grad_norm": 0.12112856656312943, + "learning_rate": 9.921591547371457e-05, + "loss": 1.6080211639404296, + "step": 68940 + }, + { + "epoch": 0.20870854963449517, + "grad_norm": 0.129845529794693, + "learning_rate": 9.921212029207719e-05, + "loss": 1.5768609046936035, + "step": 68950 + }, + { + "epoch": 0.208738819184841, + "grad_norm": 0.1285451203584671, + "learning_rate": 9.920832511043978e-05, + "loss": 1.5650026321411132, + "step": 68960 + }, + { + "epoch": 0.20876908873518685, + "grad_norm": 0.11864767223596573, + "learning_rate": 9.92045299288024e-05, + "loss": 1.5534852981567382, + "step": 68970 + }, + { + "epoch": 0.20879935828553267, + "grad_norm": 0.11976096034049988, + "learning_rate": 9.920073474716499e-05, + "loss": 1.590377140045166, + "step": 68980 + }, + { + "epoch": 0.2088296278358785, + "grad_norm": 0.12333165854215622, + "learning_rate": 9.919693956552761e-05, + "loss": 1.5780391693115234, + "step": 68990 + }, + { + "epoch": 0.20885989738622432, + "grad_norm": 0.11601356416940689, + "learning_rate": 9.919314438389022e-05, + "loss": 1.6315420150756836, + "step": 69000 + }, + { + "epoch": 0.20885989738622432, + "eval_loss": 1.5708612203598022, + "eval_runtime": 27.9335, + "eval_samples_per_second": 17.9, + "eval_steps_per_second": 1.146, + "step": 69000 + }, + { + "epoch": 0.20889016693657017, + "grad_norm": 0.13024678826332092, + "learning_rate": 9.918934920225282e-05, + "loss": 1.5659208297729492, + "step": 69010 + }, + { + "epoch": 0.208920436486916, + "grad_norm": 0.12453287839889526, + "learning_rate": 9.918555402061543e-05, + "loss": 1.617691421508789, + "step": 69020 + }, + { + "epoch": 0.20895070603726182, + "grad_norm": 0.11838974803686142, + "learning_rate": 9.918175883897803e-05, + "loss": 1.6195075988769532, + "step": 69030 + }, + { + "epoch": 0.20898097558760764, + "grad_norm": 0.11871778219938278, + "learning_rate": 9.917796365734064e-05, + "loss": 1.5947669982910155, + "step": 69040 + }, + { + "epoch": 0.20901124513795347, + "grad_norm": 0.10973582416772842, + "learning_rate": 9.917416847570325e-05, + "loss": 1.5911596298217774, + "step": 69050 + }, + { + "epoch": 0.20904151468829932, + "grad_norm": 0.12410454452037811, + "learning_rate": 9.917037329406585e-05, + "loss": 1.5879602432250977, + "step": 69060 + }, + { + "epoch": 0.20907178423864514, + "grad_norm": 0.12705039978027344, + "learning_rate": 9.916657811242846e-05, + "loss": 1.5908800125122071, + "step": 69070 + }, + { + "epoch": 0.20910205378899097, + "grad_norm": 0.13469941914081573, + "learning_rate": 9.916278293079108e-05, + "loss": 1.5659317016601562, + "step": 69080 + }, + { + "epoch": 0.2091323233393368, + "grad_norm": 0.1341034322977066, + "learning_rate": 9.915898774915367e-05, + "loss": 1.5557973861694336, + "step": 69090 + }, + { + "epoch": 0.2091625928896826, + "grad_norm": 0.1292959451675415, + "learning_rate": 9.915519256751629e-05, + "loss": 1.5805670738220214, + "step": 69100 + }, + { + "epoch": 0.20919286244002847, + "grad_norm": 0.12909060716629028, + "learning_rate": 9.915139738587888e-05, + "loss": 1.5582468032836914, + "step": 69110 + }, + { + "epoch": 0.2092231319903743, + "grad_norm": 0.1145835891366005, + "learning_rate": 9.91476022042415e-05, + "loss": 1.5950698852539062, + "step": 69120 + }, + { + "epoch": 0.2092534015407201, + "grad_norm": 0.1207427904009819, + "learning_rate": 9.914380702260411e-05, + "loss": 1.565582847595215, + "step": 69130 + }, + { + "epoch": 0.20928367109106594, + "grad_norm": 0.1251150220632553, + "learning_rate": 9.914001184096671e-05, + "loss": 1.6010072708129883, + "step": 69140 + }, + { + "epoch": 0.20931394064141176, + "grad_norm": 0.12812760472297668, + "learning_rate": 9.913621665932932e-05, + "loss": 1.580867385864258, + "step": 69150 + }, + { + "epoch": 0.2093442101917576, + "grad_norm": 0.12567420303821564, + "learning_rate": 9.913242147769193e-05, + "loss": 1.596670913696289, + "step": 69160 + }, + { + "epoch": 0.20937447974210344, + "grad_norm": 0.11934769153594971, + "learning_rate": 9.912862629605453e-05, + "loss": 1.6111928939819335, + "step": 69170 + }, + { + "epoch": 0.20940474929244926, + "grad_norm": 0.12778973579406738, + "learning_rate": 9.912483111441714e-05, + "loss": 1.6094989776611328, + "step": 69180 + }, + { + "epoch": 0.20943501884279508, + "grad_norm": 0.11807629466056824, + "learning_rate": 9.912103593277974e-05, + "loss": 1.5845945358276368, + "step": 69190 + }, + { + "epoch": 0.2094652883931409, + "grad_norm": 0.116664819419384, + "learning_rate": 9.911724075114235e-05, + "loss": 1.5817955017089844, + "step": 69200 + }, + { + "epoch": 0.20949555794348676, + "grad_norm": 0.11186090856790543, + "learning_rate": 9.911344556950496e-05, + "loss": 1.538325881958008, + "step": 69210 + }, + { + "epoch": 0.20952582749383258, + "grad_norm": 0.12051793932914734, + "learning_rate": 9.910965038786756e-05, + "loss": 1.5918642044067384, + "step": 69220 + }, + { + "epoch": 0.2095560970441784, + "grad_norm": 0.14620129764080048, + "learning_rate": 9.910585520623017e-05, + "loss": 1.5934965133666992, + "step": 69230 + }, + { + "epoch": 0.20958636659452423, + "grad_norm": 0.1163649633526802, + "learning_rate": 9.910206002459279e-05, + "loss": 1.5882596969604492, + "step": 69240 + }, + { + "epoch": 0.20961663614487006, + "grad_norm": 0.12710464000701904, + "learning_rate": 9.909826484295538e-05, + "loss": 1.5667640686035156, + "step": 69250 + }, + { + "epoch": 0.2096469056952159, + "grad_norm": 0.12585510313510895, + "learning_rate": 9.9094469661318e-05, + "loss": 1.6123559951782227, + "step": 69260 + }, + { + "epoch": 0.20967717524556173, + "grad_norm": 0.13369697332382202, + "learning_rate": 9.90906744796806e-05, + "loss": 1.5261139869689941, + "step": 69270 + }, + { + "epoch": 0.20970744479590755, + "grad_norm": 0.11874982714653015, + "learning_rate": 9.908687929804321e-05, + "loss": 1.5680740356445313, + "step": 69280 + }, + { + "epoch": 0.20973771434625338, + "grad_norm": 0.12346671521663666, + "learning_rate": 9.908308411640582e-05, + "loss": 1.5659552574157716, + "step": 69290 + }, + { + "epoch": 0.2097679838965992, + "grad_norm": 0.1476912945508957, + "learning_rate": 9.907928893476842e-05, + "loss": 1.587864875793457, + "step": 69300 + }, + { + "epoch": 0.20979825344694505, + "grad_norm": 0.12658365070819855, + "learning_rate": 9.907549375313103e-05, + "loss": 1.5751839637756349, + "step": 69310 + }, + { + "epoch": 0.20982852299729088, + "grad_norm": 0.11240845918655396, + "learning_rate": 9.907169857149363e-05, + "loss": 1.602159309387207, + "step": 69320 + }, + { + "epoch": 0.2098587925476367, + "grad_norm": 0.12358374148607254, + "learning_rate": 9.906790338985624e-05, + "loss": 1.5743278503417968, + "step": 69330 + }, + { + "epoch": 0.20988906209798253, + "grad_norm": 0.12126483023166656, + "learning_rate": 9.906410820821885e-05, + "loss": 1.6056705474853517, + "step": 69340 + }, + { + "epoch": 0.20991933164832838, + "grad_norm": 0.11802739650011063, + "learning_rate": 9.906031302658145e-05, + "loss": 1.5747934341430665, + "step": 69350 + }, + { + "epoch": 0.2099496011986742, + "grad_norm": 0.11360149830579758, + "learning_rate": 9.905651784494406e-05, + "loss": 1.5673900604248048, + "step": 69360 + }, + { + "epoch": 0.20997987074902003, + "grad_norm": 0.12217792123556137, + "learning_rate": 9.905272266330668e-05, + "loss": 1.62901611328125, + "step": 69370 + }, + { + "epoch": 0.21001014029936585, + "grad_norm": 0.12386723607778549, + "learning_rate": 9.904892748166927e-05, + "loss": 1.5913590431213378, + "step": 69380 + }, + { + "epoch": 0.21004040984971167, + "grad_norm": 0.1262807697057724, + "learning_rate": 9.904513230003189e-05, + "loss": 1.6032808303833008, + "step": 69390 + }, + { + "epoch": 0.21007067940005753, + "grad_norm": 0.12355171144008636, + "learning_rate": 9.904133711839448e-05, + "loss": 1.5851799964904785, + "step": 69400 + }, + { + "epoch": 0.21010094895040335, + "grad_norm": 0.11738093942403793, + "learning_rate": 9.90375419367571e-05, + "loss": 1.5535146713256835, + "step": 69410 + }, + { + "epoch": 0.21013121850074917, + "grad_norm": 0.11903470754623413, + "learning_rate": 9.90337467551197e-05, + "loss": 1.5697745323181151, + "step": 69420 + }, + { + "epoch": 0.210161488051095, + "grad_norm": 0.12265679985284805, + "learning_rate": 9.902995157348231e-05, + "loss": 1.5901778221130372, + "step": 69430 + }, + { + "epoch": 0.21019175760144082, + "grad_norm": 0.13012205064296722, + "learning_rate": 9.90261563918449e-05, + "loss": 1.5970192909240724, + "step": 69440 + }, + { + "epoch": 0.21022202715178667, + "grad_norm": 0.1225133091211319, + "learning_rate": 9.902236121020753e-05, + "loss": 1.5936639785766602, + "step": 69450 + }, + { + "epoch": 0.2102522967021325, + "grad_norm": 0.14037266373634338, + "learning_rate": 9.901856602857012e-05, + "loss": 1.5758932113647461, + "step": 69460 + }, + { + "epoch": 0.21028256625247832, + "grad_norm": 0.13885560631752014, + "learning_rate": 9.901477084693274e-05, + "loss": 1.555105972290039, + "step": 69470 + }, + { + "epoch": 0.21031283580282414, + "grad_norm": 0.12197069823741913, + "learning_rate": 9.901097566529534e-05, + "loss": 1.6105018615722657, + "step": 69480 + }, + { + "epoch": 0.21034310535316997, + "grad_norm": 0.1339557021856308, + "learning_rate": 9.900718048365795e-05, + "loss": 1.55853271484375, + "step": 69490 + }, + { + "epoch": 0.21037337490351582, + "grad_norm": 0.138475701212883, + "learning_rate": 9.900338530202057e-05, + "loss": 1.583064079284668, + "step": 69500 + }, + { + "epoch": 0.21037337490351582, + "eval_loss": 1.584317684173584, + "eval_runtime": 27.8956, + "eval_samples_per_second": 17.924, + "eval_steps_per_second": 1.147, + "step": 69500 + }, + { + "epoch": 0.21040364445386164, + "grad_norm": 0.12938101589679718, + "learning_rate": 9.899959012038316e-05, + "loss": 1.5991920471191405, + "step": 69510 + }, + { + "epoch": 0.21043391400420747, + "grad_norm": 0.11602578312158585, + "learning_rate": 9.899579493874578e-05, + "loss": 1.5512686729431153, + "step": 69520 + }, + { + "epoch": 0.2104641835545533, + "grad_norm": 0.13933539390563965, + "learning_rate": 9.899199975710837e-05, + "loss": 1.5524837493896484, + "step": 69530 + }, + { + "epoch": 0.21049445310489912, + "grad_norm": 0.13071882724761963, + "learning_rate": 9.898820457547099e-05, + "loss": 1.5901506423950196, + "step": 69540 + }, + { + "epoch": 0.21052472265524497, + "grad_norm": 0.12946267426013947, + "learning_rate": 9.898440939383358e-05, + "loss": 1.5940254211425782, + "step": 69550 + }, + { + "epoch": 0.2105549922055908, + "grad_norm": 0.12482309341430664, + "learning_rate": 9.89806142121962e-05, + "loss": 1.5599542617797852, + "step": 69560 + }, + { + "epoch": 0.21058526175593661, + "grad_norm": 0.11160998791456223, + "learning_rate": 9.89768190305588e-05, + "loss": 1.5695671081542968, + "step": 69570 + }, + { + "epoch": 0.21061553130628244, + "grad_norm": 0.1380218118429184, + "learning_rate": 9.897302384892142e-05, + "loss": 1.5788004875183106, + "step": 69580 + }, + { + "epoch": 0.21064580085662826, + "grad_norm": 0.11446337401866913, + "learning_rate": 9.896922866728401e-05, + "loss": 1.5583009719848633, + "step": 69590 + }, + { + "epoch": 0.21067607040697411, + "grad_norm": 0.11396058648824692, + "learning_rate": 9.896543348564663e-05, + "loss": 1.6102581024169922, + "step": 69600 + }, + { + "epoch": 0.21070633995731994, + "grad_norm": 0.11361232399940491, + "learning_rate": 9.896163830400922e-05, + "loss": 1.5726431846618651, + "step": 69610 + }, + { + "epoch": 0.21073660950766576, + "grad_norm": 0.1173662319779396, + "learning_rate": 9.895784312237184e-05, + "loss": 1.636295700073242, + "step": 69620 + }, + { + "epoch": 0.21076687905801159, + "grad_norm": 0.1377921998500824, + "learning_rate": 9.895404794073445e-05, + "loss": 1.5969203948974608, + "step": 69630 + }, + { + "epoch": 0.2107971486083574, + "grad_norm": 0.12477636337280273, + "learning_rate": 9.895025275909705e-05, + "loss": 1.5948017120361329, + "step": 69640 + }, + { + "epoch": 0.21082741815870326, + "grad_norm": 0.13529826700687408, + "learning_rate": 9.894645757745966e-05, + "loss": 1.5823486328125, + "step": 69650 + }, + { + "epoch": 0.21085768770904909, + "grad_norm": 0.13921640813350677, + "learning_rate": 9.894266239582226e-05, + "loss": 1.599386501312256, + "step": 69660 + }, + { + "epoch": 0.2108879572593949, + "grad_norm": 0.10502346605062485, + "learning_rate": 9.893886721418487e-05, + "loss": 1.5923137664794922, + "step": 69670 + }, + { + "epoch": 0.21091822680974073, + "grad_norm": 0.11174995452165604, + "learning_rate": 9.893507203254748e-05, + "loss": 1.5947844505310058, + "step": 69680 + }, + { + "epoch": 0.21094849636008658, + "grad_norm": 0.12942981719970703, + "learning_rate": 9.89312768509101e-05, + "loss": 1.5513973236083984, + "step": 69690 + }, + { + "epoch": 0.2109787659104324, + "grad_norm": 0.11724603921175003, + "learning_rate": 9.892748166927269e-05, + "loss": 1.5934026718139649, + "step": 69700 + }, + { + "epoch": 0.21100903546077823, + "grad_norm": 0.15178799629211426, + "learning_rate": 9.892368648763531e-05, + "loss": 1.5787220001220703, + "step": 69710 + }, + { + "epoch": 0.21103930501112406, + "grad_norm": 0.12500376999378204, + "learning_rate": 9.89198913059979e-05, + "loss": 1.5929019927978516, + "step": 69720 + }, + { + "epoch": 0.21106957456146988, + "grad_norm": 0.12646901607513428, + "learning_rate": 9.891609612436052e-05, + "loss": 1.5531591415405273, + "step": 69730 + }, + { + "epoch": 0.21109984411181573, + "grad_norm": 0.12103047966957092, + "learning_rate": 9.891230094272312e-05, + "loss": 1.6113243103027344, + "step": 69740 + }, + { + "epoch": 0.21113011366216156, + "grad_norm": 0.12204483896493912, + "learning_rate": 9.890850576108573e-05, + "loss": 1.5515988349914551, + "step": 69750 + }, + { + "epoch": 0.21116038321250738, + "grad_norm": 0.11831527203321457, + "learning_rate": 9.890471057944834e-05, + "loss": 1.6137163162231445, + "step": 69760 + }, + { + "epoch": 0.2111906527628532, + "grad_norm": 0.11324962228536606, + "learning_rate": 9.890091539781094e-05, + "loss": 1.6129398345947266, + "step": 69770 + }, + { + "epoch": 0.21122092231319903, + "grad_norm": 0.14057299494743347, + "learning_rate": 9.889712021617355e-05, + "loss": 1.5993220329284668, + "step": 69780 + }, + { + "epoch": 0.21125119186354488, + "grad_norm": 0.13224518299102783, + "learning_rate": 9.889332503453615e-05, + "loss": 1.5692143440246582, + "step": 69790 + }, + { + "epoch": 0.2112814614138907, + "grad_norm": 0.12174686789512634, + "learning_rate": 9.888952985289876e-05, + "loss": 1.5926547050476074, + "step": 69800 + }, + { + "epoch": 0.21131173096423653, + "grad_norm": 0.12412257492542267, + "learning_rate": 9.888573467126137e-05, + "loss": 1.6333145141601562, + "step": 69810 + }, + { + "epoch": 0.21134200051458235, + "grad_norm": 0.13579994440078735, + "learning_rate": 9.888193948962397e-05, + "loss": 1.5885351181030274, + "step": 69820 + }, + { + "epoch": 0.21137227006492817, + "grad_norm": 0.13977129757404327, + "learning_rate": 9.887814430798658e-05, + "loss": 1.574533462524414, + "step": 69830 + }, + { + "epoch": 0.21140253961527403, + "grad_norm": 0.13301025331020355, + "learning_rate": 9.887434912634918e-05, + "loss": 1.576168155670166, + "step": 69840 + }, + { + "epoch": 0.21143280916561985, + "grad_norm": 0.12828125059604645, + "learning_rate": 9.887055394471179e-05, + "loss": 1.5827726364135741, + "step": 69850 + }, + { + "epoch": 0.21146307871596567, + "grad_norm": 0.13268005847930908, + "learning_rate": 9.88667587630744e-05, + "loss": 1.583163070678711, + "step": 69860 + }, + { + "epoch": 0.2114933482663115, + "grad_norm": 0.11898870766162872, + "learning_rate": 9.886296358143702e-05, + "loss": 1.6015026092529296, + "step": 69870 + }, + { + "epoch": 0.21152361781665732, + "grad_norm": 0.12444839626550674, + "learning_rate": 9.885916839979962e-05, + "loss": 1.5745101928710938, + "step": 69880 + }, + { + "epoch": 0.21155388736700317, + "grad_norm": 0.11931486427783966, + "learning_rate": 9.885537321816223e-05, + "loss": 1.5999105453491211, + "step": 69890 + }, + { + "epoch": 0.211584156917349, + "grad_norm": 0.10416732728481293, + "learning_rate": 9.885157803652483e-05, + "loss": 1.570997428894043, + "step": 69900 + }, + { + "epoch": 0.21161442646769482, + "grad_norm": 0.11509785056114197, + "learning_rate": 9.884778285488744e-05, + "loss": 1.5862625122070313, + "step": 69910 + }, + { + "epoch": 0.21164469601804065, + "grad_norm": 0.11286916583776474, + "learning_rate": 9.884398767325005e-05, + "loss": 1.5727184295654297, + "step": 69920 + }, + { + "epoch": 0.21167496556838647, + "grad_norm": 0.15540070831775665, + "learning_rate": 9.884019249161265e-05, + "loss": 1.581275749206543, + "step": 69930 + }, + { + "epoch": 0.21170523511873232, + "grad_norm": 0.11774124950170517, + "learning_rate": 9.883639730997526e-05, + "loss": 1.548330020904541, + "step": 69940 + }, + { + "epoch": 0.21173550466907815, + "grad_norm": 0.1318485140800476, + "learning_rate": 9.883260212833786e-05, + "loss": 1.5862424850463868, + "step": 69950 + }, + { + "epoch": 0.21176577421942397, + "grad_norm": 0.13790510594844818, + "learning_rate": 9.882880694670047e-05, + "loss": 1.5807065963745117, + "step": 69960 + }, + { + "epoch": 0.2117960437697698, + "grad_norm": 0.12468302249908447, + "learning_rate": 9.882501176506308e-05, + "loss": 1.5827688217163085, + "step": 69970 + }, + { + "epoch": 0.21182631332011562, + "grad_norm": 0.11530157178640366, + "learning_rate": 9.88212165834257e-05, + "loss": 1.5589290618896485, + "step": 69980 + }, + { + "epoch": 0.21185658287046147, + "grad_norm": 0.11532016098499298, + "learning_rate": 9.881742140178829e-05, + "loss": 1.604486846923828, + "step": 69990 + }, + { + "epoch": 0.2118868524208073, + "grad_norm": 0.10493257641792297, + "learning_rate": 9.88136262201509e-05, + "loss": 1.6023502349853516, + "step": 70000 + }, + { + "epoch": 0.2118868524208073, + "eval_loss": 1.6073895692825317, + "eval_runtime": 28.098, + "eval_samples_per_second": 17.795, + "eval_steps_per_second": 1.139, + "step": 70000 + }, + { + "epoch": 0.21191712197115312, + "grad_norm": 0.1316649317741394, + "learning_rate": 9.88098310385135e-05, + "loss": 1.559964370727539, + "step": 70010 + }, + { + "epoch": 0.21194739152149894, + "grad_norm": 0.10433951765298843, + "learning_rate": 9.880603585687612e-05, + "loss": 1.5991020202636719, + "step": 70020 + }, + { + "epoch": 0.21197766107184476, + "grad_norm": 0.11550022661685944, + "learning_rate": 9.880224067523871e-05, + "loss": 1.576225471496582, + "step": 70030 + }, + { + "epoch": 0.21200793062219062, + "grad_norm": 0.12047545611858368, + "learning_rate": 9.879844549360133e-05, + "loss": 1.5942428588867188, + "step": 70040 + }, + { + "epoch": 0.21203820017253644, + "grad_norm": 0.11736557632684708, + "learning_rate": 9.879465031196392e-05, + "loss": 1.5781081199645997, + "step": 70050 + }, + { + "epoch": 0.21206846972288226, + "grad_norm": 0.1292569935321808, + "learning_rate": 9.879085513032654e-05, + "loss": 1.5554327011108398, + "step": 70060 + }, + { + "epoch": 0.2120987392732281, + "grad_norm": 0.11689251661300659, + "learning_rate": 9.878705994868913e-05, + "loss": 1.561918067932129, + "step": 70070 + }, + { + "epoch": 0.21212900882357394, + "grad_norm": 0.13328774273395538, + "learning_rate": 9.878326476705175e-05, + "loss": 1.6228561401367188, + "step": 70080 + }, + { + "epoch": 0.21215927837391976, + "grad_norm": 0.13174501061439514, + "learning_rate": 9.877946958541436e-05, + "loss": 1.600344467163086, + "step": 70090 + }, + { + "epoch": 0.2121895479242656, + "grad_norm": 0.13384297490119934, + "learning_rate": 9.877567440377697e-05, + "loss": 1.5509677886962892, + "step": 70100 + }, + { + "epoch": 0.2122198174746114, + "grad_norm": 0.11139192432165146, + "learning_rate": 9.877187922213959e-05, + "loss": 1.5957757949829101, + "step": 70110 + }, + { + "epoch": 0.21225008702495723, + "grad_norm": 0.12223400175571442, + "learning_rate": 9.876808404050218e-05, + "loss": 1.5570077896118164, + "step": 70120 + }, + { + "epoch": 0.2122803565753031, + "grad_norm": 0.12766337394714355, + "learning_rate": 9.87642888588648e-05, + "loss": 1.6236795425415038, + "step": 70130 + }, + { + "epoch": 0.2123106261256489, + "grad_norm": 0.13865216076374054, + "learning_rate": 9.876049367722739e-05, + "loss": 1.5651856422424317, + "step": 70140 + }, + { + "epoch": 0.21234089567599473, + "grad_norm": 0.12270451337099075, + "learning_rate": 9.875669849559001e-05, + "loss": 1.5629159927368164, + "step": 70150 + }, + { + "epoch": 0.21237116522634056, + "grad_norm": 0.14613035321235657, + "learning_rate": 9.87529033139526e-05, + "loss": 1.6272418975830079, + "step": 70160 + }, + { + "epoch": 0.21240143477668638, + "grad_norm": 0.10308656841516495, + "learning_rate": 9.874910813231522e-05, + "loss": 1.6012121200561524, + "step": 70170 + }, + { + "epoch": 0.21243170432703223, + "grad_norm": 0.11220864206552505, + "learning_rate": 9.874531295067781e-05, + "loss": 1.5857107162475585, + "step": 70180 + }, + { + "epoch": 0.21246197387737806, + "grad_norm": 0.13451580703258514, + "learning_rate": 9.874151776904043e-05, + "loss": 1.5767085075378418, + "step": 70190 + }, + { + "epoch": 0.21249224342772388, + "grad_norm": 0.11601261049509048, + "learning_rate": 9.873772258740303e-05, + "loss": 1.555150604248047, + "step": 70200 + }, + { + "epoch": 0.2125225129780697, + "grad_norm": 0.11785440891981125, + "learning_rate": 9.873392740576565e-05, + "loss": 1.5887884140014648, + "step": 70210 + }, + { + "epoch": 0.21255278252841553, + "grad_norm": 0.10656803101301193, + "learning_rate": 9.873013222412824e-05, + "loss": 1.6215517044067382, + "step": 70220 + }, + { + "epoch": 0.21258305207876138, + "grad_norm": 0.12918229401111603, + "learning_rate": 9.872633704249086e-05, + "loss": 1.5812408447265625, + "step": 70230 + }, + { + "epoch": 0.2126133216291072, + "grad_norm": 0.12720397114753723, + "learning_rate": 9.872254186085346e-05, + "loss": 1.565187644958496, + "step": 70240 + }, + { + "epoch": 0.21264359117945303, + "grad_norm": 0.12709029018878937, + "learning_rate": 9.871874667921607e-05, + "loss": 1.6117910385131835, + "step": 70250 + }, + { + "epoch": 0.21267386072979885, + "grad_norm": 0.1263400763273239, + "learning_rate": 9.871495149757867e-05, + "loss": 1.5595335006713866, + "step": 70260 + }, + { + "epoch": 0.21270413028014468, + "grad_norm": 0.11317533254623413, + "learning_rate": 9.871115631594128e-05, + "loss": 1.5695323944091797, + "step": 70270 + }, + { + "epoch": 0.21273439983049053, + "grad_norm": 0.11987864226102829, + "learning_rate": 9.870736113430389e-05, + "loss": 1.623300552368164, + "step": 70280 + }, + { + "epoch": 0.21276466938083635, + "grad_norm": 0.1265798658132553, + "learning_rate": 9.870356595266649e-05, + "loss": 1.579633617401123, + "step": 70290 + }, + { + "epoch": 0.21279493893118218, + "grad_norm": 0.12064550817012787, + "learning_rate": 9.869977077102911e-05, + "loss": 1.575129508972168, + "step": 70300 + }, + { + "epoch": 0.212825208481528, + "grad_norm": 0.1179472953081131, + "learning_rate": 9.86959755893917e-05, + "loss": 1.5674810409545898, + "step": 70310 + }, + { + "epoch": 0.21285547803187382, + "grad_norm": 0.13021627068519592, + "learning_rate": 9.869218040775432e-05, + "loss": 1.5824867248535157, + "step": 70320 + }, + { + "epoch": 0.21288574758221968, + "grad_norm": 0.10972364246845245, + "learning_rate": 9.868838522611692e-05, + "loss": 1.5850038528442383, + "step": 70330 + }, + { + "epoch": 0.2129160171325655, + "grad_norm": 0.11587976664304733, + "learning_rate": 9.868459004447954e-05, + "loss": 1.5889741897583007, + "step": 70340 + }, + { + "epoch": 0.21294628668291132, + "grad_norm": 0.12830322980880737, + "learning_rate": 9.868079486284214e-05, + "loss": 1.5755449295043946, + "step": 70350 + }, + { + "epoch": 0.21297655623325715, + "grad_norm": 0.13755415380001068, + "learning_rate": 9.867699968120475e-05, + "loss": 1.5677240371704102, + "step": 70360 + }, + { + "epoch": 0.21300682578360297, + "grad_norm": 0.1256384253501892, + "learning_rate": 9.867320449956735e-05, + "loss": 1.5978886604309082, + "step": 70370 + }, + { + "epoch": 0.21303709533394882, + "grad_norm": 0.1325317919254303, + "learning_rate": 9.866940931792996e-05, + "loss": 1.5824196815490723, + "step": 70380 + }, + { + "epoch": 0.21306736488429465, + "grad_norm": 0.11637300252914429, + "learning_rate": 9.866561413629257e-05, + "loss": 1.592009925842285, + "step": 70390 + }, + { + "epoch": 0.21309763443464047, + "grad_norm": 0.1135898232460022, + "learning_rate": 9.866181895465517e-05, + "loss": 1.575477123260498, + "step": 70400 + }, + { + "epoch": 0.2131279039849863, + "grad_norm": 0.12109531462192535, + "learning_rate": 9.865802377301778e-05, + "loss": 1.6306037902832031, + "step": 70410 + }, + { + "epoch": 0.21315817353533215, + "grad_norm": 0.10978332906961441, + "learning_rate": 9.865422859138038e-05, + "loss": 1.5688149452209472, + "step": 70420 + }, + { + "epoch": 0.21318844308567797, + "grad_norm": 0.12025395035743713, + "learning_rate": 9.865043340974299e-05, + "loss": 1.5708700180053712, + "step": 70430 + }, + { + "epoch": 0.2132187126360238, + "grad_norm": 0.13707925379276276, + "learning_rate": 9.86466382281056e-05, + "loss": 1.5949421882629395, + "step": 70440 + }, + { + "epoch": 0.21324898218636962, + "grad_norm": 0.12303533405065536, + "learning_rate": 9.86428430464682e-05, + "loss": 1.5562039375305177, + "step": 70450 + }, + { + "epoch": 0.21327925173671544, + "grad_norm": 0.126114621758461, + "learning_rate": 9.863904786483081e-05, + "loss": 1.5713333129882812, + "step": 70460 + }, + { + "epoch": 0.2133095212870613, + "grad_norm": 0.13060881197452545, + "learning_rate": 9.863525268319341e-05, + "loss": 1.5706722259521484, + "step": 70470 + }, + { + "epoch": 0.21333979083740712, + "grad_norm": 0.11342187970876694, + "learning_rate": 9.863145750155603e-05, + "loss": 1.6033103942871094, + "step": 70480 + }, + { + "epoch": 0.21337006038775294, + "grad_norm": 0.1428966373205185, + "learning_rate": 9.862766231991864e-05, + "loss": 1.5618804931640624, + "step": 70490 + }, + { + "epoch": 0.21340032993809877, + "grad_norm": 0.12161360681056976, + "learning_rate": 9.862386713828124e-05, + "loss": 1.5613826751708983, + "step": 70500 + }, + { + "epoch": 0.21340032993809877, + "eval_loss": 1.56339430809021, + "eval_runtime": 28.3591, + "eval_samples_per_second": 17.631, + "eval_steps_per_second": 1.128, + "step": 70500 + }, + { + "epoch": 0.2134305994884446, + "grad_norm": 0.12291104346513748, + "learning_rate": 9.862007195664385e-05, + "loss": 1.5913726806640625, + "step": 70510 + }, + { + "epoch": 0.21346086903879044, + "grad_norm": 0.11264238506555557, + "learning_rate": 9.861627677500646e-05, + "loss": 1.5862009048461914, + "step": 70520 + }, + { + "epoch": 0.21349113858913626, + "grad_norm": 0.1218278780579567, + "learning_rate": 9.861248159336906e-05, + "loss": 1.5900737762451171, + "step": 70530 + }, + { + "epoch": 0.2135214081394821, + "grad_norm": 0.12376096099615097, + "learning_rate": 9.860868641173167e-05, + "loss": 1.5842401504516601, + "step": 70540 + }, + { + "epoch": 0.2135516776898279, + "grad_norm": 0.11501020938158035, + "learning_rate": 9.860489123009427e-05, + "loss": 1.5873438835144043, + "step": 70550 + }, + { + "epoch": 0.21358194724017374, + "grad_norm": 0.11374498158693314, + "learning_rate": 9.860109604845688e-05, + "loss": 1.573676872253418, + "step": 70560 + }, + { + "epoch": 0.2136122167905196, + "grad_norm": 0.12025674432516098, + "learning_rate": 9.859730086681949e-05, + "loss": 1.5738564491271974, + "step": 70570 + }, + { + "epoch": 0.2136424863408654, + "grad_norm": 0.1290929615497589, + "learning_rate": 9.859350568518209e-05, + "loss": 1.586214828491211, + "step": 70580 + }, + { + "epoch": 0.21367275589121124, + "grad_norm": 0.1273125261068344, + "learning_rate": 9.85897105035447e-05, + "loss": 1.5981834411621094, + "step": 70590 + }, + { + "epoch": 0.21370302544155706, + "grad_norm": 0.11341188102960587, + "learning_rate": 9.85859153219073e-05, + "loss": 1.592433547973633, + "step": 70600 + }, + { + "epoch": 0.21373329499190288, + "grad_norm": 0.12383920699357986, + "learning_rate": 9.858212014026992e-05, + "loss": 1.60291748046875, + "step": 70610 + }, + { + "epoch": 0.21376356454224874, + "grad_norm": 0.13128843903541565, + "learning_rate": 9.857832495863252e-05, + "loss": 1.5371818542480469, + "step": 70620 + }, + { + "epoch": 0.21379383409259456, + "grad_norm": 0.11266354471445084, + "learning_rate": 9.857452977699514e-05, + "loss": 1.6172496795654296, + "step": 70630 + }, + { + "epoch": 0.21382410364294038, + "grad_norm": 0.12248922139406204, + "learning_rate": 9.857073459535773e-05, + "loss": 1.573908519744873, + "step": 70640 + }, + { + "epoch": 0.2138543731932862, + "grad_norm": 0.13022130727767944, + "learning_rate": 9.856693941372035e-05, + "loss": 1.6103281021118163, + "step": 70650 + }, + { + "epoch": 0.21388464274363203, + "grad_norm": 0.12960781157016754, + "learning_rate": 9.856314423208294e-05, + "loss": 1.6272197723388673, + "step": 70660 + }, + { + "epoch": 0.21391491229397788, + "grad_norm": 0.11600720882415771, + "learning_rate": 9.855934905044556e-05, + "loss": 1.6152851104736328, + "step": 70670 + }, + { + "epoch": 0.2139451818443237, + "grad_norm": 0.11389872431755066, + "learning_rate": 9.855555386880815e-05, + "loss": 1.5767536163330078, + "step": 70680 + }, + { + "epoch": 0.21397545139466953, + "grad_norm": 0.12065191566944122, + "learning_rate": 9.855175868717077e-05, + "loss": 1.6006685256958009, + "step": 70690 + }, + { + "epoch": 0.21400572094501535, + "grad_norm": 0.11731316149234772, + "learning_rate": 9.854796350553338e-05, + "loss": 1.5935917854309083, + "step": 70700 + }, + { + "epoch": 0.21403599049536118, + "grad_norm": 0.123940609395504, + "learning_rate": 9.854416832389598e-05, + "loss": 1.587773323059082, + "step": 70710 + }, + { + "epoch": 0.21406626004570703, + "grad_norm": 0.13448560237884521, + "learning_rate": 9.85403731422586e-05, + "loss": 1.5879011154174805, + "step": 70720 + }, + { + "epoch": 0.21409652959605285, + "grad_norm": 0.12112632393836975, + "learning_rate": 9.85365779606212e-05, + "loss": 1.5969443321228027, + "step": 70730 + }, + { + "epoch": 0.21412679914639868, + "grad_norm": 0.131217360496521, + "learning_rate": 9.853278277898381e-05, + "loss": 1.61103515625, + "step": 70740 + }, + { + "epoch": 0.2141570686967445, + "grad_norm": 0.1415129154920578, + "learning_rate": 9.852898759734641e-05, + "loss": 1.5911623001098634, + "step": 70750 + }, + { + "epoch": 0.21418733824709035, + "grad_norm": 0.1359366476535797, + "learning_rate": 9.852519241570903e-05, + "loss": 1.6156681060791016, + "step": 70760 + }, + { + "epoch": 0.21421760779743618, + "grad_norm": 0.11115533113479614, + "learning_rate": 9.852139723407162e-05, + "loss": 1.606301498413086, + "step": 70770 + }, + { + "epoch": 0.214247877347782, + "grad_norm": 0.13452693819999695, + "learning_rate": 9.851760205243424e-05, + "loss": 1.5983308792114257, + "step": 70780 + }, + { + "epoch": 0.21427814689812782, + "grad_norm": 0.13180822134017944, + "learning_rate": 9.851380687079683e-05, + "loss": 1.5933853149414063, + "step": 70790 + }, + { + "epoch": 0.21430841644847365, + "grad_norm": 0.11271516233682632, + "learning_rate": 9.851001168915945e-05, + "loss": 1.5571374893188477, + "step": 70800 + }, + { + "epoch": 0.2143386859988195, + "grad_norm": 0.1374223530292511, + "learning_rate": 9.850621650752204e-05, + "loss": 1.5604961395263672, + "step": 70810 + }, + { + "epoch": 0.21436895554916532, + "grad_norm": 0.13116395473480225, + "learning_rate": 9.850242132588466e-05, + "loss": 1.5775440216064454, + "step": 70820 + }, + { + "epoch": 0.21439922509951115, + "grad_norm": 0.12318733334541321, + "learning_rate": 9.849862614424725e-05, + "loss": 1.611166763305664, + "step": 70830 + }, + { + "epoch": 0.21442949464985697, + "grad_norm": 0.11823836714029312, + "learning_rate": 9.849483096260987e-05, + "loss": 1.6158798217773438, + "step": 70840 + }, + { + "epoch": 0.2144597642002028, + "grad_norm": 0.10933985561132431, + "learning_rate": 9.849103578097248e-05, + "loss": 1.5874265670776366, + "step": 70850 + }, + { + "epoch": 0.21449003375054865, + "grad_norm": 0.12798678874969482, + "learning_rate": 9.848724059933509e-05, + "loss": 1.5880631446838378, + "step": 70860 + }, + { + "epoch": 0.21452030330089447, + "grad_norm": 0.12010365724563599, + "learning_rate": 9.848344541769769e-05, + "loss": 1.6159879684448242, + "step": 70870 + }, + { + "epoch": 0.2145505728512403, + "grad_norm": 0.11562562733888626, + "learning_rate": 9.84796502360603e-05, + "loss": 1.567243766784668, + "step": 70880 + }, + { + "epoch": 0.21458084240158612, + "grad_norm": 0.11716742068529129, + "learning_rate": 9.84758550544229e-05, + "loss": 1.5856807708740235, + "step": 70890 + }, + { + "epoch": 0.21461111195193194, + "grad_norm": 0.11726466566324234, + "learning_rate": 9.847205987278551e-05, + "loss": 1.5769886016845702, + "step": 70900 + }, + { + "epoch": 0.2146413815022778, + "grad_norm": 0.11454214155673981, + "learning_rate": 9.846826469114813e-05, + "loss": 1.5972448348999024, + "step": 70910 + }, + { + "epoch": 0.21467165105262362, + "grad_norm": 0.1223042905330658, + "learning_rate": 9.846446950951072e-05, + "loss": 1.5881974220275878, + "step": 70920 + }, + { + "epoch": 0.21470192060296944, + "grad_norm": 0.11146767437458038, + "learning_rate": 9.846067432787334e-05, + "loss": 1.583696174621582, + "step": 70930 + }, + { + "epoch": 0.21473219015331527, + "grad_norm": 0.11276643723249435, + "learning_rate": 9.845687914623593e-05, + "loss": 1.608152198791504, + "step": 70940 + }, + { + "epoch": 0.2147624597036611, + "grad_norm": 0.12518993020057678, + "learning_rate": 9.845308396459855e-05, + "loss": 1.5837958335876465, + "step": 70950 + }, + { + "epoch": 0.21479272925400694, + "grad_norm": 0.11576381325721741, + "learning_rate": 9.844928878296115e-05, + "loss": 1.6020536422729492, + "step": 70960 + }, + { + "epoch": 0.21482299880435277, + "grad_norm": 0.1388092339038849, + "learning_rate": 9.844549360132377e-05, + "loss": 1.5744531631469727, + "step": 70970 + }, + { + "epoch": 0.2148532683546986, + "grad_norm": 0.13154643774032593, + "learning_rate": 9.844169841968637e-05, + "loss": 1.5799945831298827, + "step": 70980 + }, + { + "epoch": 0.2148835379050444, + "grad_norm": 0.10961994528770447, + "learning_rate": 9.843790323804898e-05, + "loss": 1.5905430793762207, + "step": 70990 + }, + { + "epoch": 0.21491380745539024, + "grad_norm": 0.12046980112791061, + "learning_rate": 9.843410805641158e-05, + "loss": 1.5873207092285155, + "step": 71000 + }, + { + "epoch": 0.21491380745539024, + "eval_loss": 1.5722459554672241, + "eval_runtime": 28.1802, + "eval_samples_per_second": 17.743, + "eval_steps_per_second": 1.136, + "step": 71000 + }, + { + "epoch": 0.2149440770057361, + "grad_norm": 0.13583970069885254, + "learning_rate": 9.843031287477419e-05, + "loss": 1.5391191482543944, + "step": 71010 + }, + { + "epoch": 0.2149743465560819, + "grad_norm": 0.1182100921869278, + "learning_rate": 9.84265176931368e-05, + "loss": 1.5630596160888672, + "step": 71020 + }, + { + "epoch": 0.21500461610642774, + "grad_norm": 0.12468554079532623, + "learning_rate": 9.84227225114994e-05, + "loss": 1.5830717086791992, + "step": 71030 + }, + { + "epoch": 0.21503488565677356, + "grad_norm": 0.12665490806102753, + "learning_rate": 9.841892732986201e-05, + "loss": 1.5372512817382813, + "step": 71040 + }, + { + "epoch": 0.21506515520711939, + "grad_norm": 0.14348028600215912, + "learning_rate": 9.841513214822461e-05, + "loss": 1.6025562286376953, + "step": 71050 + }, + { + "epoch": 0.21509542475746524, + "grad_norm": 0.15214888751506805, + "learning_rate": 9.841133696658722e-05, + "loss": 1.5821495056152344, + "step": 71060 + }, + { + "epoch": 0.21512569430781106, + "grad_norm": 0.11987672746181488, + "learning_rate": 9.840754178494982e-05, + "loss": 1.5382142066955566, + "step": 71070 + }, + { + "epoch": 0.21515596385815688, + "grad_norm": 0.13765574991703033, + "learning_rate": 9.840374660331243e-05, + "loss": 1.5548245429992675, + "step": 71080 + }, + { + "epoch": 0.2151862334085027, + "grad_norm": 0.12168151140213013, + "learning_rate": 9.839995142167505e-05, + "loss": 1.5864015579223634, + "step": 71090 + }, + { + "epoch": 0.21521650295884856, + "grad_norm": 0.10747688263654709, + "learning_rate": 9.839615624003766e-05, + "loss": 1.5971803665161133, + "step": 71100 + }, + { + "epoch": 0.21524677250919438, + "grad_norm": 0.12694506347179413, + "learning_rate": 9.839236105840026e-05, + "loss": 1.5762206077575684, + "step": 71110 + }, + { + "epoch": 0.2152770420595402, + "grad_norm": 0.12271824479103088, + "learning_rate": 9.838856587676287e-05, + "loss": 1.5646076202392578, + "step": 71120 + }, + { + "epoch": 0.21530731160988603, + "grad_norm": 0.10933224111795425, + "learning_rate": 9.838477069512547e-05, + "loss": 1.5932657241821289, + "step": 71130 + }, + { + "epoch": 0.21533758116023186, + "grad_norm": 0.11838391423225403, + "learning_rate": 9.838097551348808e-05, + "loss": 1.5967031478881837, + "step": 71140 + }, + { + "epoch": 0.2153678507105777, + "grad_norm": 0.12191233783960342, + "learning_rate": 9.837718033185069e-05, + "loss": 1.5390491485595703, + "step": 71150 + }, + { + "epoch": 0.21539812026092353, + "grad_norm": 0.12721893191337585, + "learning_rate": 9.837338515021329e-05, + "loss": 1.605974006652832, + "step": 71160 + }, + { + "epoch": 0.21542838981126936, + "grad_norm": 0.12914037704467773, + "learning_rate": 9.83695899685759e-05, + "loss": 1.6014148712158203, + "step": 71170 + }, + { + "epoch": 0.21545865936161518, + "grad_norm": 0.130178764462471, + "learning_rate": 9.83657947869385e-05, + "loss": 1.5876917839050293, + "step": 71180 + }, + { + "epoch": 0.215488928911961, + "grad_norm": 0.12042161822319031, + "learning_rate": 9.836199960530111e-05, + "loss": 1.596926498413086, + "step": 71190 + }, + { + "epoch": 0.21551919846230685, + "grad_norm": 0.1110658049583435, + "learning_rate": 9.835820442366372e-05, + "loss": 1.5868355751037597, + "step": 71200 + }, + { + "epoch": 0.21554946801265268, + "grad_norm": 0.12427117675542831, + "learning_rate": 9.835440924202632e-05, + "loss": 1.5827877998352051, + "step": 71210 + }, + { + "epoch": 0.2155797375629985, + "grad_norm": 0.11272235214710236, + "learning_rate": 9.835061406038894e-05, + "loss": 1.6159677505493164, + "step": 71220 + }, + { + "epoch": 0.21561000711334433, + "grad_norm": 0.12544171512126923, + "learning_rate": 9.834681887875153e-05, + "loss": 1.5811992645263673, + "step": 71230 + }, + { + "epoch": 0.21564027666369015, + "grad_norm": 0.11058507114648819, + "learning_rate": 9.834302369711415e-05, + "loss": 1.6174736022949219, + "step": 71240 + }, + { + "epoch": 0.215670546214036, + "grad_norm": 0.12694795429706573, + "learning_rate": 9.833922851547675e-05, + "loss": 1.5566045761108398, + "step": 71250 + }, + { + "epoch": 0.21570081576438183, + "grad_norm": 0.1165461540222168, + "learning_rate": 9.833543333383936e-05, + "loss": 1.5869518280029298, + "step": 71260 + }, + { + "epoch": 0.21573108531472765, + "grad_norm": 0.12481482326984406, + "learning_rate": 9.833163815220196e-05, + "loss": 1.5787752151489258, + "step": 71270 + }, + { + "epoch": 0.21576135486507347, + "grad_norm": 0.13366419076919556, + "learning_rate": 9.832784297056458e-05, + "loss": 1.5807409286499023, + "step": 71280 + }, + { + "epoch": 0.2157916244154193, + "grad_norm": 0.12104000896215439, + "learning_rate": 9.832404778892717e-05, + "loss": 1.579042625427246, + "step": 71290 + }, + { + "epoch": 0.21582189396576515, + "grad_norm": 0.12619872391223907, + "learning_rate": 9.832025260728979e-05, + "loss": 1.562510871887207, + "step": 71300 + }, + { + "epoch": 0.21585216351611097, + "grad_norm": 0.13388313353061676, + "learning_rate": 9.83164574256524e-05, + "loss": 1.5570662498474122, + "step": 71310 + }, + { + "epoch": 0.2158824330664568, + "grad_norm": 0.11365873366594315, + "learning_rate": 9.8312662244015e-05, + "loss": 1.5566983222961426, + "step": 71320 + }, + { + "epoch": 0.21591270261680262, + "grad_norm": 0.11335870623588562, + "learning_rate": 9.83088670623776e-05, + "loss": 1.5664284706115723, + "step": 71330 + }, + { + "epoch": 0.21594297216714844, + "grad_norm": 0.1243094801902771, + "learning_rate": 9.830507188074021e-05, + "loss": 1.5797083854675293, + "step": 71340 + }, + { + "epoch": 0.2159732417174943, + "grad_norm": 0.13027028739452362, + "learning_rate": 9.830127669910283e-05, + "loss": 1.5731208801269532, + "step": 71350 + }, + { + "epoch": 0.21600351126784012, + "grad_norm": 0.1330798864364624, + "learning_rate": 9.829748151746542e-05, + "loss": 1.565083122253418, + "step": 71360 + }, + { + "epoch": 0.21603378081818594, + "grad_norm": 0.11134698241949081, + "learning_rate": 9.829368633582804e-05, + "loss": 1.568166160583496, + "step": 71370 + }, + { + "epoch": 0.21606405036853177, + "grad_norm": 0.14078539609909058, + "learning_rate": 9.828989115419064e-05, + "loss": 1.5912912368774415, + "step": 71380 + }, + { + "epoch": 0.2160943199188776, + "grad_norm": 0.12199946492910385, + "learning_rate": 9.828609597255326e-05, + "loss": 1.5655061721801757, + "step": 71390 + }, + { + "epoch": 0.21612458946922344, + "grad_norm": 0.12577201426029205, + "learning_rate": 9.828230079091585e-05, + "loss": 1.5812931060791016, + "step": 71400 + }, + { + "epoch": 0.21615485901956927, + "grad_norm": 0.11608994752168655, + "learning_rate": 9.827850560927847e-05, + "loss": 1.6118053436279296, + "step": 71410 + }, + { + "epoch": 0.2161851285699151, + "grad_norm": 0.12474855780601501, + "learning_rate": 9.827471042764106e-05, + "loss": 1.576332187652588, + "step": 71420 + }, + { + "epoch": 0.21621539812026092, + "grad_norm": 0.1302911788225174, + "learning_rate": 9.827091524600368e-05, + "loss": 1.5826098442077636, + "step": 71430 + }, + { + "epoch": 0.21624566767060674, + "grad_norm": 0.12890471518039703, + "learning_rate": 9.826712006436627e-05, + "loss": 1.581581211090088, + "step": 71440 + }, + { + "epoch": 0.2162759372209526, + "grad_norm": 0.1233491599559784, + "learning_rate": 9.826332488272889e-05, + "loss": 1.6048032760620117, + "step": 71450 + }, + { + "epoch": 0.21630620677129841, + "grad_norm": 0.14211392402648926, + "learning_rate": 9.82595297010915e-05, + "loss": 1.5730228424072266, + "step": 71460 + }, + { + "epoch": 0.21633647632164424, + "grad_norm": 0.1371198296546936, + "learning_rate": 9.82557345194541e-05, + "loss": 1.5983665466308594, + "step": 71470 + }, + { + "epoch": 0.21636674587199006, + "grad_norm": 0.11438121646642685, + "learning_rate": 9.825193933781671e-05, + "loss": 1.575674819946289, + "step": 71480 + }, + { + "epoch": 0.21639701542233591, + "grad_norm": 0.12598516047000885, + "learning_rate": 9.824814415617932e-05, + "loss": 1.5739269256591797, + "step": 71490 + }, + { + "epoch": 0.21642728497268174, + "grad_norm": 0.12531426548957825, + "learning_rate": 9.824434897454193e-05, + "loss": 1.5920811653137208, + "step": 71500 + }, + { + "epoch": 0.21642728497268174, + "eval_loss": 1.5975539684295654, + "eval_runtime": 28.5873, + "eval_samples_per_second": 17.49, + "eval_steps_per_second": 1.119, + "step": 71500 + }, + { + "epoch": 0.21645755452302756, + "grad_norm": 0.11484415084123611, + "learning_rate": 9.824055379290453e-05, + "loss": 1.5894962310791017, + "step": 71510 + }, + { + "epoch": 0.21648782407337339, + "grad_norm": 0.11379814147949219, + "learning_rate": 9.823675861126715e-05, + "loss": 1.565598487854004, + "step": 71520 + }, + { + "epoch": 0.2165180936237192, + "grad_norm": 0.12003849446773529, + "learning_rate": 9.823296342962974e-05, + "loss": 1.6259973526000977, + "step": 71530 + }, + { + "epoch": 0.21654836317406506, + "grad_norm": 0.1154252216219902, + "learning_rate": 9.822916824799236e-05, + "loss": 1.6086681365966797, + "step": 71540 + }, + { + "epoch": 0.21657863272441089, + "grad_norm": 0.12437662482261658, + "learning_rate": 9.822537306635495e-05, + "loss": 1.6071786880493164, + "step": 71550 + }, + { + "epoch": 0.2166089022747567, + "grad_norm": 0.13215428590774536, + "learning_rate": 9.822157788471757e-05, + "loss": 1.593726921081543, + "step": 71560 + }, + { + "epoch": 0.21663917182510253, + "grad_norm": 0.11361507326364517, + "learning_rate": 9.821778270308016e-05, + "loss": 1.5757357597351074, + "step": 71570 + }, + { + "epoch": 0.21666944137544836, + "grad_norm": 0.12427905946969986, + "learning_rate": 9.821398752144278e-05, + "loss": 1.5418234825134278, + "step": 71580 + }, + { + "epoch": 0.2166997109257942, + "grad_norm": 0.12915806472301483, + "learning_rate": 9.821019233980539e-05, + "loss": 1.5610692024230957, + "step": 71590 + }, + { + "epoch": 0.21672998047614003, + "grad_norm": 0.12479060888290405, + "learning_rate": 9.8206397158168e-05, + "loss": 1.6108695983886718, + "step": 71600 + }, + { + "epoch": 0.21676025002648586, + "grad_norm": 0.11609567701816559, + "learning_rate": 9.82026019765306e-05, + "loss": 1.5728662490844727, + "step": 71610 + }, + { + "epoch": 0.21679051957683168, + "grad_norm": 0.13123644888401031, + "learning_rate": 9.81988067948932e-05, + "loss": 1.580571746826172, + "step": 71620 + }, + { + "epoch": 0.2168207891271775, + "grad_norm": 0.1275021880865097, + "learning_rate": 9.819501161325581e-05, + "loss": 1.5757596969604493, + "step": 71630 + }, + { + "epoch": 0.21685105867752336, + "grad_norm": 0.13331228494644165, + "learning_rate": 9.819121643161842e-05, + "loss": 1.5869707107543944, + "step": 71640 + }, + { + "epoch": 0.21688132822786918, + "grad_norm": 0.12063981592655182, + "learning_rate": 9.818742124998102e-05, + "loss": 1.567482089996338, + "step": 71650 + }, + { + "epoch": 0.216911597778215, + "grad_norm": 0.1241011843085289, + "learning_rate": 9.818362606834363e-05, + "loss": 1.6236278533935546, + "step": 71660 + }, + { + "epoch": 0.21694186732856083, + "grad_norm": 0.12819866836071014, + "learning_rate": 9.817983088670624e-05, + "loss": 1.6057743072509765, + "step": 71670 + }, + { + "epoch": 0.21697213687890665, + "grad_norm": 0.1384282112121582, + "learning_rate": 9.817603570506884e-05, + "loss": 1.5691303253173827, + "step": 71680 + }, + { + "epoch": 0.2170024064292525, + "grad_norm": 0.12187031656503677, + "learning_rate": 9.817224052343145e-05, + "loss": 1.575010108947754, + "step": 71690 + }, + { + "epoch": 0.21703267597959833, + "grad_norm": 0.12436758726835251, + "learning_rate": 9.816844534179405e-05, + "loss": 1.5852556228637695, + "step": 71700 + }, + { + "epoch": 0.21706294552994415, + "grad_norm": 0.13128753006458282, + "learning_rate": 9.816465016015667e-05, + "loss": 1.5903322219848632, + "step": 71710 + }, + { + "epoch": 0.21709321508028998, + "grad_norm": 0.12852561473846436, + "learning_rate": 9.816085497851928e-05, + "loss": 1.568489170074463, + "step": 71720 + }, + { + "epoch": 0.2171234846306358, + "grad_norm": 0.13750885426998138, + "learning_rate": 9.815705979688189e-05, + "loss": 1.5801359176635743, + "step": 71730 + }, + { + "epoch": 0.21715375418098165, + "grad_norm": 0.11787863820791245, + "learning_rate": 9.815326461524449e-05, + "loss": 1.5926801681518554, + "step": 71740 + }, + { + "epoch": 0.21718402373132747, + "grad_norm": 0.12235134094953537, + "learning_rate": 9.81494694336071e-05, + "loss": 1.586883544921875, + "step": 71750 + }, + { + "epoch": 0.2172142932816733, + "grad_norm": 0.10835873335599899, + "learning_rate": 9.81456742519697e-05, + "loss": 1.5941917419433593, + "step": 71760 + }, + { + "epoch": 0.21724456283201912, + "grad_norm": 0.1192779690027237, + "learning_rate": 9.814187907033231e-05, + "loss": 1.5875646591186523, + "step": 71770 + }, + { + "epoch": 0.21727483238236495, + "grad_norm": 0.12411859631538391, + "learning_rate": 9.813808388869491e-05, + "loss": 1.5795896530151368, + "step": 71780 + }, + { + "epoch": 0.2173051019327108, + "grad_norm": 0.12060622870922089, + "learning_rate": 9.813428870705752e-05, + "loss": 1.517697525024414, + "step": 71790 + }, + { + "epoch": 0.21733537148305662, + "grad_norm": 0.12032290548086166, + "learning_rate": 9.813049352542013e-05, + "loss": 1.5956923484802246, + "step": 71800 + }, + { + "epoch": 0.21736564103340245, + "grad_norm": 0.13010747730731964, + "learning_rate": 9.812669834378273e-05, + "loss": 1.5519515991210937, + "step": 71810 + }, + { + "epoch": 0.21739591058374827, + "grad_norm": 0.1195685863494873, + "learning_rate": 9.812290316214534e-05, + "loss": 1.5884830474853515, + "step": 71820 + }, + { + "epoch": 0.21742618013409412, + "grad_norm": 0.1117779091000557, + "learning_rate": 9.811910798050796e-05, + "loss": 1.5745399475097657, + "step": 71830 + }, + { + "epoch": 0.21745644968443995, + "grad_norm": 0.12622201442718506, + "learning_rate": 9.811531279887055e-05, + "loss": 1.5836369514465332, + "step": 71840 + }, + { + "epoch": 0.21748671923478577, + "grad_norm": 0.13228687644004822, + "learning_rate": 9.811151761723317e-05, + "loss": 1.5716368675231933, + "step": 71850 + }, + { + "epoch": 0.2175169887851316, + "grad_norm": 0.11617773026227951, + "learning_rate": 9.810772243559576e-05, + "loss": 1.5558882713317872, + "step": 71860 + }, + { + "epoch": 0.21754725833547742, + "grad_norm": 0.11284242570400238, + "learning_rate": 9.810392725395838e-05, + "loss": 1.5767782211303711, + "step": 71870 + }, + { + "epoch": 0.21757752788582327, + "grad_norm": 0.12589599192142487, + "learning_rate": 9.810013207232097e-05, + "loss": 1.5647397994995118, + "step": 71880 + }, + { + "epoch": 0.2176077974361691, + "grad_norm": 0.11728060245513916, + "learning_rate": 9.80963368906836e-05, + "loss": 1.593337059020996, + "step": 71890 + }, + { + "epoch": 0.21763806698651492, + "grad_norm": 0.11522621661424637, + "learning_rate": 9.809254170904619e-05, + "loss": 1.5957937240600586, + "step": 71900 + }, + { + "epoch": 0.21766833653686074, + "grad_norm": 0.1263316571712494, + "learning_rate": 9.80887465274088e-05, + "loss": 1.5440071105957032, + "step": 71910 + }, + { + "epoch": 0.21769860608720656, + "grad_norm": 0.1329108625650406, + "learning_rate": 9.808495134577141e-05, + "loss": 1.6046146392822265, + "step": 71920 + }, + { + "epoch": 0.21772887563755242, + "grad_norm": 0.12728798389434814, + "learning_rate": 9.808115616413402e-05, + "loss": 1.5901030540466308, + "step": 71930 + }, + { + "epoch": 0.21775914518789824, + "grad_norm": 0.12160203605890274, + "learning_rate": 9.807736098249662e-05, + "loss": 1.6425605773925782, + "step": 71940 + }, + { + "epoch": 0.21778941473824406, + "grad_norm": 0.10868892818689346, + "learning_rate": 9.807356580085923e-05, + "loss": 1.6151321411132813, + "step": 71950 + }, + { + "epoch": 0.2178196842885899, + "grad_norm": 0.14201052486896515, + "learning_rate": 9.806977061922185e-05, + "loss": 1.5707222938537597, + "step": 71960 + }, + { + "epoch": 0.2178499538389357, + "grad_norm": 0.1383359432220459, + "learning_rate": 9.806597543758444e-05, + "loss": 1.5615831375122071, + "step": 71970 + }, + { + "epoch": 0.21788022338928156, + "grad_norm": 0.12108366191387177, + "learning_rate": 9.806218025594706e-05, + "loss": 1.571558666229248, + "step": 71980 + }, + { + "epoch": 0.2179104929396274, + "grad_norm": 0.12745606899261475, + "learning_rate": 9.805838507430965e-05, + "loss": 1.5844205856323241, + "step": 71990 + }, + { + "epoch": 0.2179407624899732, + "grad_norm": 0.11818758398294449, + "learning_rate": 9.805458989267227e-05, + "loss": 1.5689204216003418, + "step": 72000 + }, + { + "epoch": 0.2179407624899732, + "eval_loss": 1.5902239084243774, + "eval_runtime": 28.0579, + "eval_samples_per_second": 17.82, + "eval_steps_per_second": 1.14, + "step": 72000 + }, + { + "epoch": 0.21797103204031903, + "grad_norm": 0.1178092509508133, + "learning_rate": 9.805079471103487e-05, + "loss": 1.5489699363708496, + "step": 72010 + }, + { + "epoch": 0.21800130159066486, + "grad_norm": 0.11032934486865997, + "learning_rate": 9.804699952939748e-05, + "loss": 1.5773329734802246, + "step": 72020 + }, + { + "epoch": 0.2180315711410107, + "grad_norm": 0.12156101316213608, + "learning_rate": 9.804320434776008e-05, + "loss": 1.6128475189208984, + "step": 72030 + }, + { + "epoch": 0.21806184069135653, + "grad_norm": 0.13110841810703278, + "learning_rate": 9.80394091661227e-05, + "loss": 1.5669668197631836, + "step": 72040 + }, + { + "epoch": 0.21809211024170236, + "grad_norm": 0.12069664895534515, + "learning_rate": 9.803561398448529e-05, + "loss": 1.5800600051879883, + "step": 72050 + }, + { + "epoch": 0.21812237979204818, + "grad_norm": 0.11714132875204086, + "learning_rate": 9.803181880284791e-05, + "loss": 1.6026565551757812, + "step": 72060 + }, + { + "epoch": 0.218152649342394, + "grad_norm": 0.10696373879909515, + "learning_rate": 9.80280236212105e-05, + "loss": 1.5728021621704102, + "step": 72070 + }, + { + "epoch": 0.21818291889273986, + "grad_norm": 0.12403272092342377, + "learning_rate": 9.802422843957312e-05, + "loss": 1.6151138305664063, + "step": 72080 + }, + { + "epoch": 0.21821318844308568, + "grad_norm": 0.11323603242635727, + "learning_rate": 9.802043325793573e-05, + "loss": 1.593297004699707, + "step": 72090 + }, + { + "epoch": 0.2182434579934315, + "grad_norm": 0.11849881708621979, + "learning_rate": 9.801663807629833e-05, + "loss": 1.5512706756591796, + "step": 72100 + }, + { + "epoch": 0.21827372754377733, + "grad_norm": 0.1317855417728424, + "learning_rate": 9.801284289466095e-05, + "loss": 1.5490605354309082, + "step": 72110 + }, + { + "epoch": 0.21830399709412315, + "grad_norm": 0.11691634356975555, + "learning_rate": 9.800904771302354e-05, + "loss": 1.5809822082519531, + "step": 72120 + }, + { + "epoch": 0.218334266644469, + "grad_norm": 0.12190612405538559, + "learning_rate": 9.800525253138616e-05, + "loss": 1.6203958511352539, + "step": 72130 + }, + { + "epoch": 0.21836453619481483, + "grad_norm": 0.12334869056940079, + "learning_rate": 9.800145734974876e-05, + "loss": 1.556187629699707, + "step": 72140 + }, + { + "epoch": 0.21839480574516065, + "grad_norm": 0.141811802983284, + "learning_rate": 9.799766216811138e-05, + "loss": 1.5683368682861327, + "step": 72150 + }, + { + "epoch": 0.21842507529550648, + "grad_norm": 0.11711972951889038, + "learning_rate": 9.799386698647397e-05, + "loss": 1.5966248512268066, + "step": 72160 + }, + { + "epoch": 0.21845534484585233, + "grad_norm": 0.1156233623623848, + "learning_rate": 9.799007180483659e-05, + "loss": 1.5719536781311034, + "step": 72170 + }, + { + "epoch": 0.21848561439619815, + "grad_norm": 0.11515135318040848, + "learning_rate": 9.798627662319918e-05, + "loss": 1.5517854690551758, + "step": 72180 + }, + { + "epoch": 0.21851588394654398, + "grad_norm": 0.12523610889911652, + "learning_rate": 9.79824814415618e-05, + "loss": 1.5580389022827148, + "step": 72190 + }, + { + "epoch": 0.2185461534968898, + "grad_norm": 0.1147088035941124, + "learning_rate": 9.79786862599244e-05, + "loss": 1.6089813232421875, + "step": 72200 + }, + { + "epoch": 0.21857642304723562, + "grad_norm": 0.11473584175109863, + "learning_rate": 9.797489107828701e-05, + "loss": 1.5764245986938477, + "step": 72210 + }, + { + "epoch": 0.21860669259758148, + "grad_norm": 0.11068815737962723, + "learning_rate": 9.797109589664962e-05, + "loss": 1.6092287063598634, + "step": 72220 + }, + { + "epoch": 0.2186369621479273, + "grad_norm": 0.12286069244146347, + "learning_rate": 9.796730071501222e-05, + "loss": 1.5874763488769532, + "step": 72230 + }, + { + "epoch": 0.21866723169827312, + "grad_norm": 0.1229507252573967, + "learning_rate": 9.796350553337483e-05, + "loss": 1.5671611785888673, + "step": 72240 + }, + { + "epoch": 0.21869750124861895, + "grad_norm": 0.1273316890001297, + "learning_rate": 9.795971035173744e-05, + "loss": 1.618804359436035, + "step": 72250 + }, + { + "epoch": 0.21872777079896477, + "grad_norm": 0.13419251143932343, + "learning_rate": 9.795591517010004e-05, + "loss": 1.5999937057495117, + "step": 72260 + }, + { + "epoch": 0.21875804034931062, + "grad_norm": 0.12649308145046234, + "learning_rate": 9.795211998846265e-05, + "loss": 1.5937860488891602, + "step": 72270 + }, + { + "epoch": 0.21878830989965645, + "grad_norm": 0.12487944960594177, + "learning_rate": 9.794832480682525e-05, + "loss": 1.567759895324707, + "step": 72280 + }, + { + "epoch": 0.21881857945000227, + "grad_norm": 0.11438366025686264, + "learning_rate": 9.794452962518786e-05, + "loss": 1.5640289306640625, + "step": 72290 + }, + { + "epoch": 0.2188488490003481, + "grad_norm": 0.11612008512020111, + "learning_rate": 9.794073444355047e-05, + "loss": 1.5605951309204102, + "step": 72300 + }, + { + "epoch": 0.21887911855069392, + "grad_norm": 0.15226204693317413, + "learning_rate": 9.793693926191307e-05, + "loss": 1.5623956680297852, + "step": 72310 + }, + { + "epoch": 0.21890938810103977, + "grad_norm": 0.11538930237293243, + "learning_rate": 9.793314408027569e-05, + "loss": 1.5887928009033203, + "step": 72320 + }, + { + "epoch": 0.2189396576513856, + "grad_norm": 0.11396818608045578, + "learning_rate": 9.79293488986383e-05, + "loss": 1.614841842651367, + "step": 72330 + }, + { + "epoch": 0.21896992720173142, + "grad_norm": 0.11542115360498428, + "learning_rate": 9.79255537170009e-05, + "loss": 1.5633875846862793, + "step": 72340 + }, + { + "epoch": 0.21900019675207724, + "grad_norm": 0.1202194094657898, + "learning_rate": 9.792175853536351e-05, + "loss": 1.6266059875488281, + "step": 72350 + }, + { + "epoch": 0.21903046630242307, + "grad_norm": 0.11482777446508408, + "learning_rate": 9.791796335372611e-05, + "loss": 1.55386962890625, + "step": 72360 + }, + { + "epoch": 0.21906073585276892, + "grad_norm": 0.10835827887058258, + "learning_rate": 9.791416817208872e-05, + "loss": 1.5649221420288086, + "step": 72370 + }, + { + "epoch": 0.21909100540311474, + "grad_norm": 0.10812738537788391, + "learning_rate": 9.791037299045133e-05, + "loss": 1.6080667495727539, + "step": 72380 + }, + { + "epoch": 0.21912127495346057, + "grad_norm": 0.11858905106782913, + "learning_rate": 9.790657780881393e-05, + "loss": 1.5910673141479492, + "step": 72390 + }, + { + "epoch": 0.2191515445038064, + "grad_norm": 0.12910307943820953, + "learning_rate": 9.790278262717654e-05, + "loss": 1.600330924987793, + "step": 72400 + }, + { + "epoch": 0.2191818140541522, + "grad_norm": 0.12980987131595612, + "learning_rate": 9.789898744553914e-05, + "loss": 1.5478816986083985, + "step": 72410 + }, + { + "epoch": 0.21921208360449806, + "grad_norm": 0.11339081823825836, + "learning_rate": 9.789519226390175e-05, + "loss": 1.6018400192260742, + "step": 72420 + }, + { + "epoch": 0.2192423531548439, + "grad_norm": 0.112760029733181, + "learning_rate": 9.789139708226436e-05, + "loss": 1.6021034240722656, + "step": 72430 + }, + { + "epoch": 0.2192726227051897, + "grad_norm": 0.11559613794088364, + "learning_rate": 9.788760190062696e-05, + "loss": 1.5797004699707031, + "step": 72440 + }, + { + "epoch": 0.21930289225553554, + "grad_norm": 0.11990019679069519, + "learning_rate": 9.788380671898957e-05, + "loss": 1.5702877998352052, + "step": 72450 + }, + { + "epoch": 0.21933316180588136, + "grad_norm": 0.11896048486232758, + "learning_rate": 9.788001153735219e-05, + "loss": 1.581313705444336, + "step": 72460 + }, + { + "epoch": 0.2193634313562272, + "grad_norm": 0.11619429290294647, + "learning_rate": 9.787621635571478e-05, + "loss": 1.5492549896240235, + "step": 72470 + }, + { + "epoch": 0.21939370090657304, + "grad_norm": 0.13210207223892212, + "learning_rate": 9.78724211740774e-05, + "loss": 1.5502958297729492, + "step": 72480 + }, + { + "epoch": 0.21942397045691886, + "grad_norm": 0.13109855353832245, + "learning_rate": 9.786862599243999e-05, + "loss": 1.5798328399658204, + "step": 72490 + }, + { + "epoch": 0.21945424000726468, + "grad_norm": 0.12299815565347672, + "learning_rate": 9.786483081080261e-05, + "loss": 1.5513906478881836, + "step": 72500 + }, + { + "epoch": 0.21945424000726468, + "eval_loss": 1.5972380638122559, + "eval_runtime": 28.1244, + "eval_samples_per_second": 17.778, + "eval_steps_per_second": 1.138, + "step": 72500 + }, + { + "epoch": 0.21948450955761054, + "grad_norm": 0.1143694519996643, + "learning_rate": 9.78610356291652e-05, + "loss": 1.6083545684814453, + "step": 72510 + }, + { + "epoch": 0.21951477910795636, + "grad_norm": 0.12572070956230164, + "learning_rate": 9.785724044752782e-05, + "loss": 1.5608912467956544, + "step": 72520 + }, + { + "epoch": 0.21954504865830218, + "grad_norm": 0.11192329972982407, + "learning_rate": 9.785344526589043e-05, + "loss": 1.5510690689086915, + "step": 72530 + }, + { + "epoch": 0.219575318208648, + "grad_norm": 0.11963319778442383, + "learning_rate": 9.784965008425303e-05, + "loss": 1.6448467254638672, + "step": 72540 + }, + { + "epoch": 0.21960558775899383, + "grad_norm": 0.11642668396234512, + "learning_rate": 9.784585490261564e-05, + "loss": 1.5851774215698242, + "step": 72550 + }, + { + "epoch": 0.21963585730933968, + "grad_norm": 0.12304810434579849, + "learning_rate": 9.784205972097825e-05, + "loss": 1.623556900024414, + "step": 72560 + }, + { + "epoch": 0.2196661268596855, + "grad_norm": 0.13062617182731628, + "learning_rate": 9.783826453934087e-05, + "loss": 1.5677423477172852, + "step": 72570 + }, + { + "epoch": 0.21969639641003133, + "grad_norm": 0.11748792976140976, + "learning_rate": 9.783446935770346e-05, + "loss": 1.573018741607666, + "step": 72580 + }, + { + "epoch": 0.21972666596037715, + "grad_norm": 0.11787424236536026, + "learning_rate": 9.783067417606608e-05, + "loss": 1.5676387786865233, + "step": 72590 + }, + { + "epoch": 0.21975693551072298, + "grad_norm": 0.1355067640542984, + "learning_rate": 9.782687899442867e-05, + "loss": 1.578727912902832, + "step": 72600 + }, + { + "epoch": 0.21978720506106883, + "grad_norm": 0.11362462490797043, + "learning_rate": 9.782308381279129e-05, + "loss": 1.5728095054626465, + "step": 72610 + }, + { + "epoch": 0.21981747461141465, + "grad_norm": 0.1242331862449646, + "learning_rate": 9.781928863115388e-05, + "loss": 1.5730201721191406, + "step": 72620 + }, + { + "epoch": 0.21984774416176048, + "grad_norm": 0.12551134824752808, + "learning_rate": 9.78154934495165e-05, + "loss": 1.5695280075073241, + "step": 72630 + }, + { + "epoch": 0.2198780137121063, + "grad_norm": 0.136996790766716, + "learning_rate": 9.78116982678791e-05, + "loss": 1.5098424911499024, + "step": 72640 + }, + { + "epoch": 0.21990828326245213, + "grad_norm": 0.11697621643543243, + "learning_rate": 9.780790308624171e-05, + "loss": 1.582726001739502, + "step": 72650 + }, + { + "epoch": 0.21993855281279798, + "grad_norm": 0.12726473808288574, + "learning_rate": 9.78041079046043e-05, + "loss": 1.6250934600830078, + "step": 72660 + }, + { + "epoch": 0.2199688223631438, + "grad_norm": 0.12316515296697617, + "learning_rate": 9.780031272296693e-05, + "loss": 1.603934097290039, + "step": 72670 + }, + { + "epoch": 0.21999909191348963, + "grad_norm": 0.11392088979482651, + "learning_rate": 9.779651754132952e-05, + "loss": 1.5904592514038085, + "step": 72680 + }, + { + "epoch": 0.22002936146383545, + "grad_norm": 0.11536911875009537, + "learning_rate": 9.779272235969214e-05, + "loss": 1.5994586944580078, + "step": 72690 + }, + { + "epoch": 0.22005963101418127, + "grad_norm": 0.13875460624694824, + "learning_rate": 9.778892717805474e-05, + "loss": 1.5513677597045898, + "step": 72700 + }, + { + "epoch": 0.22008990056452712, + "grad_norm": 0.11550191789865494, + "learning_rate": 9.778513199641735e-05, + "loss": 1.564488697052002, + "step": 72710 + }, + { + "epoch": 0.22012017011487295, + "grad_norm": 0.13231804966926575, + "learning_rate": 9.778133681477997e-05, + "loss": 1.5426787376403808, + "step": 72720 + }, + { + "epoch": 0.22015043966521877, + "grad_norm": 0.1210130825638771, + "learning_rate": 9.777754163314256e-05, + "loss": 1.5583955764770507, + "step": 72730 + }, + { + "epoch": 0.2201807092155646, + "grad_norm": 0.1253616213798523, + "learning_rate": 9.777374645150518e-05, + "loss": 1.568436622619629, + "step": 72740 + }, + { + "epoch": 0.22021097876591042, + "grad_norm": 0.13143479824066162, + "learning_rate": 9.776995126986777e-05, + "loss": 1.6017236709594727, + "step": 72750 + }, + { + "epoch": 0.22024124831625627, + "grad_norm": 0.11769619584083557, + "learning_rate": 9.776615608823039e-05, + "loss": 1.6000457763671876, + "step": 72760 + }, + { + "epoch": 0.2202715178666021, + "grad_norm": 0.12024495005607605, + "learning_rate": 9.776236090659299e-05, + "loss": 1.5682225227355957, + "step": 72770 + }, + { + "epoch": 0.22030178741694792, + "grad_norm": 0.11532077938318253, + "learning_rate": 9.77585657249556e-05, + "loss": 1.5810376167297364, + "step": 72780 + }, + { + "epoch": 0.22033205696729374, + "grad_norm": 0.12505312263965607, + "learning_rate": 9.77547705433182e-05, + "loss": 1.6208671569824218, + "step": 72790 + }, + { + "epoch": 0.22036232651763957, + "grad_norm": 0.11136554181575775, + "learning_rate": 9.775097536168082e-05, + "loss": 1.5983413696289062, + "step": 72800 + }, + { + "epoch": 0.22039259606798542, + "grad_norm": 0.11722565442323685, + "learning_rate": 9.774718018004341e-05, + "loss": 1.5950445175170898, + "step": 72810 + }, + { + "epoch": 0.22042286561833124, + "grad_norm": 0.1306169033050537, + "learning_rate": 9.774338499840603e-05, + "loss": 1.627493667602539, + "step": 72820 + }, + { + "epoch": 0.22045313516867707, + "grad_norm": 0.12626613676548004, + "learning_rate": 9.773958981676863e-05, + "loss": 1.5479320526123046, + "step": 72830 + }, + { + "epoch": 0.2204834047190229, + "grad_norm": 0.11565784364938736, + "learning_rate": 9.773579463513124e-05, + "loss": 1.552103042602539, + "step": 72840 + }, + { + "epoch": 0.22051367426936874, + "grad_norm": 0.12147960066795349, + "learning_rate": 9.773199945349385e-05, + "loss": 1.5735089302062988, + "step": 72850 + }, + { + "epoch": 0.22054394381971457, + "grad_norm": 0.1142495721578598, + "learning_rate": 9.772820427185645e-05, + "loss": 1.6011602401733398, + "step": 72860 + }, + { + "epoch": 0.2205742133700604, + "grad_norm": 0.11042627692222595, + "learning_rate": 9.772440909021906e-05, + "loss": 1.5896696090698241, + "step": 72870 + }, + { + "epoch": 0.22060448292040621, + "grad_norm": 0.10351534187793732, + "learning_rate": 9.772061390858166e-05, + "loss": 1.5384946823120118, + "step": 72880 + }, + { + "epoch": 0.22063475247075204, + "grad_norm": 0.11193415522575378, + "learning_rate": 9.771681872694427e-05, + "loss": 1.5644428253173828, + "step": 72890 + }, + { + "epoch": 0.2206650220210979, + "grad_norm": 0.12795887887477875, + "learning_rate": 9.771302354530688e-05, + "loss": 1.6010141372680664, + "step": 72900 + }, + { + "epoch": 0.2206952915714437, + "grad_norm": 0.1188027411699295, + "learning_rate": 9.770922836366948e-05, + "loss": 1.590703582763672, + "step": 72910 + }, + { + "epoch": 0.22072556112178954, + "grad_norm": 0.12687252461910248, + "learning_rate": 9.770543318203209e-05, + "loss": 1.5690365791320802, + "step": 72920 + }, + { + "epoch": 0.22075583067213536, + "grad_norm": 0.1311686784029007, + "learning_rate": 9.770163800039471e-05, + "loss": 1.5925350189208984, + "step": 72930 + }, + { + "epoch": 0.22078610022248119, + "grad_norm": 0.12908878922462463, + "learning_rate": 9.769784281875731e-05, + "loss": 1.5584098815917968, + "step": 72940 + }, + { + "epoch": 0.22081636977282704, + "grad_norm": 0.12263950705528259, + "learning_rate": 9.769404763711992e-05, + "loss": 1.563467788696289, + "step": 72950 + }, + { + "epoch": 0.22084663932317286, + "grad_norm": 0.11331655830144882, + "learning_rate": 9.769025245548253e-05, + "loss": 1.6075761795043946, + "step": 72960 + }, + { + "epoch": 0.22087690887351868, + "grad_norm": 0.11084133386611938, + "learning_rate": 9.768645727384513e-05, + "loss": 1.594345474243164, + "step": 72970 + }, + { + "epoch": 0.2209071784238645, + "grad_norm": 0.11334411799907684, + "learning_rate": 9.768266209220774e-05, + "loss": 1.625617218017578, + "step": 72980 + }, + { + "epoch": 0.22093744797421033, + "grad_norm": 0.11225578188896179, + "learning_rate": 9.767886691057034e-05, + "loss": 1.6109231948852538, + "step": 72990 + }, + { + "epoch": 0.22096771752455618, + "grad_norm": 0.11978941410779953, + "learning_rate": 9.767507172893295e-05, + "loss": 1.597001838684082, + "step": 73000 + }, + { + "epoch": 0.22096771752455618, + "eval_loss": 1.5842455625534058, + "eval_runtime": 28.3491, + "eval_samples_per_second": 17.637, + "eval_steps_per_second": 1.129, + "step": 73000 + }, + { + "epoch": 0.220997987074902, + "grad_norm": 0.1150880828499794, + "learning_rate": 9.767127654729556e-05, + "loss": 1.6119047164916993, + "step": 73010 + }, + { + "epoch": 0.22102825662524783, + "grad_norm": 0.13187262415885925, + "learning_rate": 9.766748136565816e-05, + "loss": 1.5750983238220215, + "step": 73020 + }, + { + "epoch": 0.22105852617559366, + "grad_norm": 0.1172177866101265, + "learning_rate": 9.766368618402077e-05, + "loss": 1.5790852546691894, + "step": 73030 + }, + { + "epoch": 0.22108879572593948, + "grad_norm": 0.1222253367304802, + "learning_rate": 9.765989100238337e-05, + "loss": 1.5870845794677735, + "step": 73040 + }, + { + "epoch": 0.22111906527628533, + "grad_norm": 0.12738873064517975, + "learning_rate": 9.765609582074598e-05, + "loss": 1.5578813552856445, + "step": 73050 + }, + { + "epoch": 0.22114933482663116, + "grad_norm": 0.12392516434192657, + "learning_rate": 9.765230063910858e-05, + "loss": 1.5948396682739259, + "step": 73060 + }, + { + "epoch": 0.22117960437697698, + "grad_norm": 0.1140139028429985, + "learning_rate": 9.76485054574712e-05, + "loss": 1.589370059967041, + "step": 73070 + }, + { + "epoch": 0.2212098739273228, + "grad_norm": 0.12060774117708206, + "learning_rate": 9.76447102758338e-05, + "loss": 1.5807497024536132, + "step": 73080 + }, + { + "epoch": 0.22124014347766863, + "grad_norm": 0.12867113947868347, + "learning_rate": 9.764091509419642e-05, + "loss": 1.589297389984131, + "step": 73090 + }, + { + "epoch": 0.22127041302801448, + "grad_norm": 0.1132129430770874, + "learning_rate": 9.763711991255901e-05, + "loss": 1.5718801498413086, + "step": 73100 + }, + { + "epoch": 0.2213006825783603, + "grad_norm": 0.1092250645160675, + "learning_rate": 9.763332473092163e-05, + "loss": 1.594565200805664, + "step": 73110 + }, + { + "epoch": 0.22133095212870613, + "grad_norm": 0.11964044719934464, + "learning_rate": 9.762952954928422e-05, + "loss": 1.5659893035888672, + "step": 73120 + }, + { + "epoch": 0.22136122167905195, + "grad_norm": 0.11466551572084427, + "learning_rate": 9.762573436764684e-05, + "loss": 1.5964987754821778, + "step": 73130 + }, + { + "epoch": 0.22139149122939777, + "grad_norm": 0.11373946070671082, + "learning_rate": 9.762193918600945e-05, + "loss": 1.5707176208496094, + "step": 73140 + }, + { + "epoch": 0.22142176077974363, + "grad_norm": 0.11718066036701202, + "learning_rate": 9.761814400437205e-05, + "loss": 1.5853761672973632, + "step": 73150 + }, + { + "epoch": 0.22145203033008945, + "grad_norm": 0.1199759691953659, + "learning_rate": 9.761434882273466e-05, + "loss": 1.6284912109375, + "step": 73160 + }, + { + "epoch": 0.22148229988043527, + "grad_norm": 0.11932148784399033, + "learning_rate": 9.761055364109726e-05, + "loss": 1.5855308532714845, + "step": 73170 + }, + { + "epoch": 0.2215125694307811, + "grad_norm": 0.11350970715284348, + "learning_rate": 9.760675845945988e-05, + "loss": 1.5597153663635255, + "step": 73180 + }, + { + "epoch": 0.22154283898112692, + "grad_norm": 0.11595067381858826, + "learning_rate": 9.760296327782248e-05, + "loss": 1.5558876037597655, + "step": 73190 + }, + { + "epoch": 0.22157310853147277, + "grad_norm": 0.12787805497646332, + "learning_rate": 9.75991680961851e-05, + "loss": 1.6177467346191405, + "step": 73200 + }, + { + "epoch": 0.2216033780818186, + "grad_norm": 0.12316133826971054, + "learning_rate": 9.759537291454769e-05, + "loss": 1.5709389686584472, + "step": 73210 + }, + { + "epoch": 0.22163364763216442, + "grad_norm": 0.11829114705324173, + "learning_rate": 9.759157773291031e-05, + "loss": 1.565955638885498, + "step": 73220 + }, + { + "epoch": 0.22166391718251025, + "grad_norm": 0.14045393466949463, + "learning_rate": 9.75877825512729e-05, + "loss": 1.618378257751465, + "step": 73230 + }, + { + "epoch": 0.2216941867328561, + "grad_norm": 0.12593621015548706, + "learning_rate": 9.758398736963552e-05, + "loss": 1.6332401275634765, + "step": 73240 + }, + { + "epoch": 0.22172445628320192, + "grad_norm": 0.12459095567464828, + "learning_rate": 9.758019218799811e-05, + "loss": 1.5731671333312989, + "step": 73250 + }, + { + "epoch": 0.22175472583354774, + "grad_norm": 0.12250939011573792, + "learning_rate": 9.757639700636073e-05, + "loss": 1.5758411407470703, + "step": 73260 + }, + { + "epoch": 0.22178499538389357, + "grad_norm": 0.13058215379714966, + "learning_rate": 9.757260182472332e-05, + "loss": 1.5885438919067383, + "step": 73270 + }, + { + "epoch": 0.2218152649342394, + "grad_norm": 0.12143488228321075, + "learning_rate": 9.756880664308594e-05, + "loss": 1.5765230178833007, + "step": 73280 + }, + { + "epoch": 0.22184553448458524, + "grad_norm": 0.13361696898937225, + "learning_rate": 9.756501146144854e-05, + "loss": 1.598160457611084, + "step": 73290 + }, + { + "epoch": 0.22187580403493107, + "grad_norm": 0.11325685679912567, + "learning_rate": 9.756121627981115e-05, + "loss": 1.5722129821777344, + "step": 73300 + }, + { + "epoch": 0.2219060735852769, + "grad_norm": 0.11533822864294052, + "learning_rate": 9.755742109817376e-05, + "loss": 1.6126741409301757, + "step": 73310 + }, + { + "epoch": 0.22193634313562272, + "grad_norm": 0.13322526216506958, + "learning_rate": 9.755362591653637e-05, + "loss": 1.5690389633178712, + "step": 73320 + }, + { + "epoch": 0.22196661268596854, + "grad_norm": 0.12841859459877014, + "learning_rate": 9.754983073489899e-05, + "loss": 1.596592330932617, + "step": 73330 + }, + { + "epoch": 0.2219968822363144, + "grad_norm": 0.1179526299238205, + "learning_rate": 9.754603555326158e-05, + "loss": 1.6000801086425782, + "step": 73340 + }, + { + "epoch": 0.22202715178666022, + "grad_norm": 0.11638063937425613, + "learning_rate": 9.75422403716242e-05, + "loss": 1.5983366012573241, + "step": 73350 + }, + { + "epoch": 0.22205742133700604, + "grad_norm": 0.11789263039827347, + "learning_rate": 9.753844518998679e-05, + "loss": 1.592430305480957, + "step": 73360 + }, + { + "epoch": 0.22208769088735186, + "grad_norm": 0.13638627529144287, + "learning_rate": 9.753465000834941e-05, + "loss": 1.5436702728271485, + "step": 73370 + }, + { + "epoch": 0.2221179604376977, + "grad_norm": 0.12586188316345215, + "learning_rate": 9.7530854826712e-05, + "loss": 1.5658378601074219, + "step": 73380 + }, + { + "epoch": 0.22214822998804354, + "grad_norm": 0.1136980950832367, + "learning_rate": 9.752705964507462e-05, + "loss": 1.569049072265625, + "step": 73390 + }, + { + "epoch": 0.22217849953838936, + "grad_norm": 0.10781162232160568, + "learning_rate": 9.752326446343721e-05, + "loss": 1.590659999847412, + "step": 73400 + }, + { + "epoch": 0.2222087690887352, + "grad_norm": 0.12238188087940216, + "learning_rate": 9.751946928179983e-05, + "loss": 1.6108964920043944, + "step": 73410 + }, + { + "epoch": 0.222239038639081, + "grad_norm": 0.11298854649066925, + "learning_rate": 9.751567410016243e-05, + "loss": 1.6002872467041016, + "step": 73420 + }, + { + "epoch": 0.22226930818942683, + "grad_norm": 0.11453253030776978, + "learning_rate": 9.751187891852505e-05, + "loss": 1.6015451431274415, + "step": 73430 + }, + { + "epoch": 0.22229957773977269, + "grad_norm": 0.1277921050786972, + "learning_rate": 9.750808373688765e-05, + "loss": 1.574179458618164, + "step": 73440 + }, + { + "epoch": 0.2223298472901185, + "grad_norm": 0.11028866469860077, + "learning_rate": 9.750428855525026e-05, + "loss": 1.5945140838623046, + "step": 73450 + }, + { + "epoch": 0.22236011684046433, + "grad_norm": 0.10741063952445984, + "learning_rate": 9.750049337361286e-05, + "loss": 1.599173355102539, + "step": 73460 + }, + { + "epoch": 0.22239038639081016, + "grad_norm": 0.13070642948150635, + "learning_rate": 9.749669819197547e-05, + "loss": 1.5536897659301758, + "step": 73470 + }, + { + "epoch": 0.22242065594115598, + "grad_norm": 0.1562069058418274, + "learning_rate": 9.749290301033808e-05, + "loss": 1.5619173049926758, + "step": 73480 + }, + { + "epoch": 0.22245092549150183, + "grad_norm": 0.1143181174993515, + "learning_rate": 9.748910782870068e-05, + "loss": 1.5838588714599608, + "step": 73490 + }, + { + "epoch": 0.22248119504184766, + "grad_norm": 0.11266132444143295, + "learning_rate": 9.748531264706329e-05, + "loss": 1.5764068603515624, + "step": 73500 + }, + { + "epoch": 0.22248119504184766, + "eval_loss": 1.5703071355819702, + "eval_runtime": 27.8454, + "eval_samples_per_second": 17.956, + "eval_steps_per_second": 1.149, + "step": 73500 + }, + { + "epoch": 0.22251146459219348, + "grad_norm": 0.11975830793380737, + "learning_rate": 9.74815174654259e-05, + "loss": 1.5673521995544433, + "step": 73510 + }, + { + "epoch": 0.2225417341425393, + "grad_norm": 0.11940308660268784, + "learning_rate": 9.74777222837885e-05, + "loss": 1.535671329498291, + "step": 73520 + }, + { + "epoch": 0.22257200369288513, + "grad_norm": 0.11204235255718231, + "learning_rate": 9.74739271021511e-05, + "loss": 1.5703416824340821, + "step": 73530 + }, + { + "epoch": 0.22260227324323098, + "grad_norm": 0.12264649569988251, + "learning_rate": 9.747013192051372e-05, + "loss": 1.5927264213562011, + "step": 73540 + }, + { + "epoch": 0.2226325427935768, + "grad_norm": 0.1283065378665924, + "learning_rate": 9.746633673887632e-05, + "loss": 1.6066431045532226, + "step": 73550 + }, + { + "epoch": 0.22266281234392263, + "grad_norm": 0.12876419723033905, + "learning_rate": 9.746254155723894e-05, + "loss": 1.6426740646362306, + "step": 73560 + }, + { + "epoch": 0.22269308189426845, + "grad_norm": 0.11450283229351044, + "learning_rate": 9.745874637560154e-05, + "loss": 1.5610711097717285, + "step": 73570 + }, + { + "epoch": 0.2227233514446143, + "grad_norm": 0.12529069185256958, + "learning_rate": 9.745495119396415e-05, + "loss": 1.5881600379943848, + "step": 73580 + }, + { + "epoch": 0.22275362099496013, + "grad_norm": 0.11168764531612396, + "learning_rate": 9.745115601232675e-05, + "loss": 1.5868000984191895, + "step": 73590 + }, + { + "epoch": 0.22278389054530595, + "grad_norm": 0.11388815939426422, + "learning_rate": 9.744736083068936e-05, + "loss": 1.542926025390625, + "step": 73600 + }, + { + "epoch": 0.22281416009565178, + "grad_norm": 0.11477338522672653, + "learning_rate": 9.744356564905197e-05, + "loss": 1.572722816467285, + "step": 73610 + }, + { + "epoch": 0.2228444296459976, + "grad_norm": 0.12759214639663696, + "learning_rate": 9.743977046741457e-05, + "loss": 1.548060989379883, + "step": 73620 + }, + { + "epoch": 0.22287469919634345, + "grad_norm": 0.12928742170333862, + "learning_rate": 9.743597528577718e-05, + "loss": 1.5829744338989258, + "step": 73630 + }, + { + "epoch": 0.22290496874668927, + "grad_norm": 0.13101358711719513, + "learning_rate": 9.743218010413978e-05, + "loss": 1.6071727752685547, + "step": 73640 + }, + { + "epoch": 0.2229352382970351, + "grad_norm": 0.1361764371395111, + "learning_rate": 9.742838492250239e-05, + "loss": 1.5536511421203614, + "step": 73650 + }, + { + "epoch": 0.22296550784738092, + "grad_norm": 0.12489829212427139, + "learning_rate": 9.7424589740865e-05, + "loss": 1.5575233459472657, + "step": 73660 + }, + { + "epoch": 0.22299577739772675, + "grad_norm": 0.10578300058841705, + "learning_rate": 9.74207945592276e-05, + "loss": 1.609621810913086, + "step": 73670 + }, + { + "epoch": 0.2230260469480726, + "grad_norm": 0.13707885146141052, + "learning_rate": 9.741699937759022e-05, + "loss": 1.6047796249389648, + "step": 73680 + }, + { + "epoch": 0.22305631649841842, + "grad_norm": 0.12146884948015213, + "learning_rate": 9.741320419595281e-05, + "loss": 1.5978436470031738, + "step": 73690 + }, + { + "epoch": 0.22308658604876425, + "grad_norm": 0.12420941889286041, + "learning_rate": 9.740940901431543e-05, + "loss": 1.5966706275939941, + "step": 73700 + }, + { + "epoch": 0.22311685559911007, + "grad_norm": 0.12374590337276459, + "learning_rate": 9.740561383267803e-05, + "loss": 1.5627561569213868, + "step": 73710 + }, + { + "epoch": 0.2231471251494559, + "grad_norm": 0.12077014148235321, + "learning_rate": 9.740181865104065e-05, + "loss": 1.5756013870239258, + "step": 73720 + }, + { + "epoch": 0.22317739469980175, + "grad_norm": 0.11451257765293121, + "learning_rate": 9.739802346940324e-05, + "loss": 1.5975093841552734, + "step": 73730 + }, + { + "epoch": 0.22320766425014757, + "grad_norm": 0.11909541487693787, + "learning_rate": 9.739422828776586e-05, + "loss": 1.595475196838379, + "step": 73740 + }, + { + "epoch": 0.2232379338004934, + "grad_norm": 0.11166075617074966, + "learning_rate": 9.739043310612846e-05, + "loss": 1.5555238723754883, + "step": 73750 + }, + { + "epoch": 0.22326820335083922, + "grad_norm": 0.13144537806510925, + "learning_rate": 9.738663792449107e-05, + "loss": 1.5763513565063476, + "step": 73760 + }, + { + "epoch": 0.22329847290118504, + "grad_norm": 0.11964008212089539, + "learning_rate": 9.738284274285368e-05, + "loss": 1.6183788299560546, + "step": 73770 + }, + { + "epoch": 0.2233287424515309, + "grad_norm": 0.12304165959358215, + "learning_rate": 9.737904756121628e-05, + "loss": 1.5646636962890625, + "step": 73780 + }, + { + "epoch": 0.22335901200187672, + "grad_norm": 0.10423441976308823, + "learning_rate": 9.737525237957889e-05, + "loss": 1.5807668685913085, + "step": 73790 + }, + { + "epoch": 0.22338928155222254, + "grad_norm": 0.1371510624885559, + "learning_rate": 9.737145719794149e-05, + "loss": 1.57017240524292, + "step": 73800 + }, + { + "epoch": 0.22341955110256836, + "grad_norm": 0.11897469311952591, + "learning_rate": 9.736766201630411e-05, + "loss": 1.5625985145568848, + "step": 73810 + }, + { + "epoch": 0.2234498206529142, + "grad_norm": 0.12644188106060028, + "learning_rate": 9.73638668346667e-05, + "loss": 1.5632575035095215, + "step": 73820 + }, + { + "epoch": 0.22348009020326004, + "grad_norm": 0.12019509822130203, + "learning_rate": 9.736007165302932e-05, + "loss": 1.584758186340332, + "step": 73830 + }, + { + "epoch": 0.22351035975360586, + "grad_norm": 0.13447736203670502, + "learning_rate": 9.735627647139192e-05, + "loss": 1.5753360748291017, + "step": 73840 + }, + { + "epoch": 0.2235406293039517, + "grad_norm": 0.12762244045734406, + "learning_rate": 9.735248128975454e-05, + "loss": 1.5667967796325684, + "step": 73850 + }, + { + "epoch": 0.2235708988542975, + "grad_norm": 0.11621326953172684, + "learning_rate": 9.734868610811713e-05, + "loss": 1.5910221099853517, + "step": 73860 + }, + { + "epoch": 0.22360116840464334, + "grad_norm": 0.112790547311306, + "learning_rate": 9.734489092647975e-05, + "loss": 1.579335880279541, + "step": 73870 + }, + { + "epoch": 0.2236314379549892, + "grad_norm": 0.1316162496805191, + "learning_rate": 9.734109574484234e-05, + "loss": 1.5898582458496093, + "step": 73880 + }, + { + "epoch": 0.223661707505335, + "grad_norm": 0.13337449729442596, + "learning_rate": 9.733730056320496e-05, + "loss": 1.5785452842712402, + "step": 73890 + }, + { + "epoch": 0.22369197705568084, + "grad_norm": 0.12300211191177368, + "learning_rate": 9.733350538156755e-05, + "loss": 1.586898422241211, + "step": 73900 + }, + { + "epoch": 0.22372224660602666, + "grad_norm": 0.11358293145895004, + "learning_rate": 9.732971019993017e-05, + "loss": 1.6170095443725585, + "step": 73910 + }, + { + "epoch": 0.2237525161563725, + "grad_norm": 0.1318395733833313, + "learning_rate": 9.732591501829276e-05, + "loss": 1.5788334846496581, + "step": 73920 + }, + { + "epoch": 0.22378278570671833, + "grad_norm": 0.11448151618242264, + "learning_rate": 9.732211983665538e-05, + "loss": 1.5838134765625, + "step": 73930 + }, + { + "epoch": 0.22381305525706416, + "grad_norm": 0.1199999451637268, + "learning_rate": 9.7318324655018e-05, + "loss": 1.562666130065918, + "step": 73940 + }, + { + "epoch": 0.22384332480740998, + "grad_norm": 0.11420523375272751, + "learning_rate": 9.73145294733806e-05, + "loss": 1.5913501739501954, + "step": 73950 + }, + { + "epoch": 0.2238735943577558, + "grad_norm": 0.11645881086587906, + "learning_rate": 9.731073429174322e-05, + "loss": 1.6142475128173828, + "step": 73960 + }, + { + "epoch": 0.22390386390810166, + "grad_norm": 0.10956278443336487, + "learning_rate": 9.730693911010581e-05, + "loss": 1.5665715217590332, + "step": 73970 + }, + { + "epoch": 0.22393413345844748, + "grad_norm": 0.10764949768781662, + "learning_rate": 9.730314392846843e-05, + "loss": 1.578923225402832, + "step": 73980 + }, + { + "epoch": 0.2239644030087933, + "grad_norm": 0.11435193568468094, + "learning_rate": 9.729934874683102e-05, + "loss": 1.5795324325561524, + "step": 73990 + }, + { + "epoch": 0.22399467255913913, + "grad_norm": 0.11042030900716782, + "learning_rate": 9.729555356519364e-05, + "loss": 1.6270679473876952, + "step": 74000 + }, + { + "epoch": 0.22399467255913913, + "eval_loss": 1.5763064622879028, + "eval_runtime": 27.8439, + "eval_samples_per_second": 17.957, + "eval_steps_per_second": 1.149, + "step": 74000 + }, + { + "epoch": 0.22402494210948495, + "grad_norm": 0.1125350147485733, + "learning_rate": 9.729175838355623e-05, + "loss": 1.5612768173217773, + "step": 74010 + }, + { + "epoch": 0.2240552116598308, + "grad_norm": 0.13297659158706665, + "learning_rate": 9.728796320191885e-05, + "loss": 1.5919167518615722, + "step": 74020 + }, + { + "epoch": 0.22408548121017663, + "grad_norm": 0.1148640438914299, + "learning_rate": 9.728416802028144e-05, + "loss": 1.5997968673706056, + "step": 74030 + }, + { + "epoch": 0.22411575076052245, + "grad_norm": 0.13824965059757233, + "learning_rate": 9.728037283864406e-05, + "loss": 1.581020450592041, + "step": 74040 + }, + { + "epoch": 0.22414602031086828, + "grad_norm": 0.10644073039293289, + "learning_rate": 9.727657765700667e-05, + "loss": 1.615036392211914, + "step": 74050 + }, + { + "epoch": 0.2241762898612141, + "grad_norm": 0.12056088447570801, + "learning_rate": 9.727278247536927e-05, + "loss": 1.5823629379272461, + "step": 74060 + }, + { + "epoch": 0.22420655941155995, + "grad_norm": 0.115880087018013, + "learning_rate": 9.726898729373188e-05, + "loss": 1.537816047668457, + "step": 74070 + }, + { + "epoch": 0.22423682896190578, + "grad_norm": 0.12102419883012772, + "learning_rate": 9.726519211209449e-05, + "loss": 1.6009346008300782, + "step": 74080 + }, + { + "epoch": 0.2242670985122516, + "grad_norm": 0.11389823257923126, + "learning_rate": 9.726139693045709e-05, + "loss": 1.5450757980346679, + "step": 74090 + }, + { + "epoch": 0.22429736806259742, + "grad_norm": 0.11824094504117966, + "learning_rate": 9.72576017488197e-05, + "loss": 1.5697271347045898, + "step": 74100 + }, + { + "epoch": 0.22432763761294325, + "grad_norm": 0.1286924183368683, + "learning_rate": 9.72538065671823e-05, + "loss": 1.565702247619629, + "step": 74110 + }, + { + "epoch": 0.2243579071632891, + "grad_norm": 0.1306334286928177, + "learning_rate": 9.725001138554491e-05, + "loss": 1.577571201324463, + "step": 74120 + }, + { + "epoch": 0.22438817671363492, + "grad_norm": 0.11872350424528122, + "learning_rate": 9.724621620390752e-05, + "loss": 1.5885260581970215, + "step": 74130 + }, + { + "epoch": 0.22441844626398075, + "grad_norm": 0.1167951375246048, + "learning_rate": 9.724242102227012e-05, + "loss": 1.5951627731323241, + "step": 74140 + }, + { + "epoch": 0.22444871581432657, + "grad_norm": 0.11203242838382721, + "learning_rate": 9.723862584063274e-05, + "loss": 1.5961735725402832, + "step": 74150 + }, + { + "epoch": 0.2244789853646724, + "grad_norm": 0.1264142245054245, + "learning_rate": 9.723483065899533e-05, + "loss": 1.5626419067382813, + "step": 74160 + }, + { + "epoch": 0.22450925491501825, + "grad_norm": 0.12141009420156479, + "learning_rate": 9.723103547735795e-05, + "loss": 1.6129453659057618, + "step": 74170 + }, + { + "epoch": 0.22453952446536407, + "grad_norm": 0.12641437351703644, + "learning_rate": 9.722724029572056e-05, + "loss": 1.5688403129577637, + "step": 74180 + }, + { + "epoch": 0.2245697940157099, + "grad_norm": 0.12109711766242981, + "learning_rate": 9.722344511408317e-05, + "loss": 1.5723910331726074, + "step": 74190 + }, + { + "epoch": 0.22460006356605572, + "grad_norm": 0.11350325495004654, + "learning_rate": 9.721964993244577e-05, + "loss": 1.6172855377197266, + "step": 74200 + }, + { + "epoch": 0.22463033311640154, + "grad_norm": 0.12184875458478928, + "learning_rate": 9.721585475080838e-05, + "loss": 1.5884993553161622, + "step": 74210 + }, + { + "epoch": 0.2246606026667474, + "grad_norm": 0.1250385344028473, + "learning_rate": 9.721205956917098e-05, + "loss": 1.62734375, + "step": 74220 + }, + { + "epoch": 0.22469087221709322, + "grad_norm": 0.13196195662021637, + "learning_rate": 9.720826438753359e-05, + "loss": 1.5632716178894044, + "step": 74230 + }, + { + "epoch": 0.22472114176743904, + "grad_norm": 0.12030012160539627, + "learning_rate": 9.72044692058962e-05, + "loss": 1.5569396018981934, + "step": 74240 + }, + { + "epoch": 0.22475141131778487, + "grad_norm": 0.12207387387752533, + "learning_rate": 9.72006740242588e-05, + "loss": 1.5880322456359863, + "step": 74250 + }, + { + "epoch": 0.22478168086813072, + "grad_norm": 0.11993362754583359, + "learning_rate": 9.719687884262141e-05, + "loss": 1.580158805847168, + "step": 74260 + }, + { + "epoch": 0.22481195041847654, + "grad_norm": 0.11029846221208572, + "learning_rate": 9.719308366098401e-05, + "loss": 1.5958917617797852, + "step": 74270 + }, + { + "epoch": 0.22484221996882237, + "grad_norm": 0.11912599205970764, + "learning_rate": 9.718928847934662e-05, + "loss": 1.6118841171264648, + "step": 74280 + }, + { + "epoch": 0.2248724895191682, + "grad_norm": 0.11840249598026276, + "learning_rate": 9.718549329770924e-05, + "loss": 1.5815019607543945, + "step": 74290 + }, + { + "epoch": 0.224902759069514, + "grad_norm": 0.11625726521015167, + "learning_rate": 9.718169811607183e-05, + "loss": 1.567215633392334, + "step": 74300 + }, + { + "epoch": 0.22493302861985986, + "grad_norm": 0.11521734297275543, + "learning_rate": 9.717790293443445e-05, + "loss": 1.5724327087402343, + "step": 74310 + }, + { + "epoch": 0.2249632981702057, + "grad_norm": 0.11280813813209534, + "learning_rate": 9.717410775279704e-05, + "loss": 1.54974308013916, + "step": 74320 + }, + { + "epoch": 0.2249935677205515, + "grad_norm": 0.11671315133571625, + "learning_rate": 9.717031257115966e-05, + "loss": 1.5988039016723632, + "step": 74330 + }, + { + "epoch": 0.22502383727089734, + "grad_norm": 0.13650324940681458, + "learning_rate": 9.716651738952227e-05, + "loss": 1.5603796005249024, + "step": 74340 + }, + { + "epoch": 0.22505410682124316, + "grad_norm": 0.11186600476503372, + "learning_rate": 9.716272220788487e-05, + "loss": 1.5867332458496093, + "step": 74350 + }, + { + "epoch": 0.225084376371589, + "grad_norm": 0.1219429224729538, + "learning_rate": 9.715892702624748e-05, + "loss": 1.5919261932373048, + "step": 74360 + }, + { + "epoch": 0.22511464592193484, + "grad_norm": 0.13615432381629944, + "learning_rate": 9.715513184461009e-05, + "loss": 1.5705856323242187, + "step": 74370 + }, + { + "epoch": 0.22514491547228066, + "grad_norm": 0.11493461579084396, + "learning_rate": 9.715133666297269e-05, + "loss": 1.5872814178466796, + "step": 74380 + }, + { + "epoch": 0.22517518502262648, + "grad_norm": 0.12890368700027466, + "learning_rate": 9.71475414813353e-05, + "loss": 1.5585241317749023, + "step": 74390 + }, + { + "epoch": 0.2252054545729723, + "grad_norm": 0.11856318265199661, + "learning_rate": 9.71437462996979e-05, + "loss": 1.559505081176758, + "step": 74400 + }, + { + "epoch": 0.22523572412331816, + "grad_norm": 0.1268400102853775, + "learning_rate": 9.713995111806051e-05, + "loss": 1.577706527709961, + "step": 74410 + }, + { + "epoch": 0.22526599367366398, + "grad_norm": 0.11893853545188904, + "learning_rate": 9.713615593642313e-05, + "loss": 1.5363006591796875, + "step": 74420 + }, + { + "epoch": 0.2252962632240098, + "grad_norm": 0.1170063465833664, + "learning_rate": 9.713236075478572e-05, + "loss": 1.5396501541137695, + "step": 74430 + }, + { + "epoch": 0.22532653277435563, + "grad_norm": 0.10659483820199966, + "learning_rate": 9.712856557314834e-05, + "loss": 1.5571356773376466, + "step": 74440 + }, + { + "epoch": 0.22535680232470146, + "grad_norm": 0.12768924236297607, + "learning_rate": 9.712477039151093e-05, + "loss": 1.5773794174194335, + "step": 74450 + }, + { + "epoch": 0.2253870718750473, + "grad_norm": 0.1294025033712387, + "learning_rate": 9.712097520987355e-05, + "loss": 1.5841797828674316, + "step": 74460 + }, + { + "epoch": 0.22541734142539313, + "grad_norm": 0.11251477152109146, + "learning_rate": 9.711718002823615e-05, + "loss": 1.566883659362793, + "step": 74470 + }, + { + "epoch": 0.22544761097573895, + "grad_norm": 0.11550378054380417, + "learning_rate": 9.711338484659877e-05, + "loss": 1.6042579650878905, + "step": 74480 + }, + { + "epoch": 0.22547788052608478, + "grad_norm": 0.10548366606235504, + "learning_rate": 9.710958966496136e-05, + "loss": 1.5952893257141114, + "step": 74490 + }, + { + "epoch": 0.2255081500764306, + "grad_norm": 0.12420608848333359, + "learning_rate": 9.710579448332398e-05, + "loss": 1.6066703796386719, + "step": 74500 + }, + { + "epoch": 0.2255081500764306, + "eval_loss": 1.595487117767334, + "eval_runtime": 28.3974, + "eval_samples_per_second": 17.607, + "eval_steps_per_second": 1.127, + "step": 74500 + }, + { + "epoch": 0.22553841962677645, + "grad_norm": 0.12911109626293182, + "learning_rate": 9.710199930168657e-05, + "loss": 1.6069690704345703, + "step": 74510 + }, + { + "epoch": 0.22556868917712228, + "grad_norm": 0.1369742602109909, + "learning_rate": 9.709820412004919e-05, + "loss": 1.5500128746032715, + "step": 74520 + }, + { + "epoch": 0.2255989587274681, + "grad_norm": 0.12635181844234467, + "learning_rate": 9.709440893841178e-05, + "loss": 1.5641799926757813, + "step": 74530 + }, + { + "epoch": 0.22562922827781393, + "grad_norm": 0.14235202968120575, + "learning_rate": 9.70906137567744e-05, + "loss": 1.5571115493774415, + "step": 74540 + }, + { + "epoch": 0.22565949782815975, + "grad_norm": 0.14785347878932953, + "learning_rate": 9.708681857513702e-05, + "loss": 1.5633233070373536, + "step": 74550 + }, + { + "epoch": 0.2256897673785056, + "grad_norm": 0.12218338251113892, + "learning_rate": 9.708302339349961e-05, + "loss": 1.557797622680664, + "step": 74560 + }, + { + "epoch": 0.22572003692885143, + "grad_norm": 0.11889857053756714, + "learning_rate": 9.707922821186223e-05, + "loss": 1.563436794281006, + "step": 74570 + }, + { + "epoch": 0.22575030647919725, + "grad_norm": 0.13008585572242737, + "learning_rate": 9.707543303022482e-05, + "loss": 1.5376947402954102, + "step": 74580 + }, + { + "epoch": 0.22578057602954307, + "grad_norm": 0.13068874180316925, + "learning_rate": 9.707163784858744e-05, + "loss": 1.5571675300598145, + "step": 74590 + }, + { + "epoch": 0.2258108455798889, + "grad_norm": 0.12300097197294235, + "learning_rate": 9.706784266695004e-05, + "loss": 1.5775150299072265, + "step": 74600 + }, + { + "epoch": 0.22584111513023475, + "grad_norm": 0.11897400766611099, + "learning_rate": 9.706404748531266e-05, + "loss": 1.581508731842041, + "step": 74610 + }, + { + "epoch": 0.22587138468058057, + "grad_norm": 0.11278212815523148, + "learning_rate": 9.706025230367525e-05, + "loss": 1.5750367164611816, + "step": 74620 + }, + { + "epoch": 0.2259016542309264, + "grad_norm": 0.11465836316347122, + "learning_rate": 9.705645712203787e-05, + "loss": 1.554468536376953, + "step": 74630 + }, + { + "epoch": 0.22593192378127222, + "grad_norm": 0.11494167894124985, + "learning_rate": 9.705266194040046e-05, + "loss": 1.6001176834106445, + "step": 74640 + }, + { + "epoch": 0.22596219333161807, + "grad_norm": 0.1248006597161293, + "learning_rate": 9.704886675876308e-05, + "loss": 1.5625906944274903, + "step": 74650 + }, + { + "epoch": 0.2259924628819639, + "grad_norm": 0.1205168068408966, + "learning_rate": 9.704507157712567e-05, + "loss": 1.5866575241088867, + "step": 74660 + }, + { + "epoch": 0.22602273243230972, + "grad_norm": 0.13376066088676453, + "learning_rate": 9.704127639548829e-05, + "loss": 1.6173671722412108, + "step": 74670 + }, + { + "epoch": 0.22605300198265554, + "grad_norm": 0.11420513689517975, + "learning_rate": 9.70374812138509e-05, + "loss": 1.5619546890258789, + "step": 74680 + }, + { + "epoch": 0.22608327153300137, + "grad_norm": 0.12327158451080322, + "learning_rate": 9.70336860322135e-05, + "loss": 1.5813309669494628, + "step": 74690 + }, + { + "epoch": 0.22611354108334722, + "grad_norm": 0.12108492851257324, + "learning_rate": 9.702989085057611e-05, + "loss": 1.614450454711914, + "step": 74700 + }, + { + "epoch": 0.22614381063369304, + "grad_norm": 0.12693771719932556, + "learning_rate": 9.702609566893872e-05, + "loss": 1.5950716018676758, + "step": 74710 + }, + { + "epoch": 0.22617408018403887, + "grad_norm": 0.12503880262374878, + "learning_rate": 9.702230048730132e-05, + "loss": 1.618310546875, + "step": 74720 + }, + { + "epoch": 0.2262043497343847, + "grad_norm": 0.11647021025419235, + "learning_rate": 9.701850530566393e-05, + "loss": 1.636782455444336, + "step": 74730 + }, + { + "epoch": 0.22623461928473051, + "grad_norm": 0.1163068413734436, + "learning_rate": 9.701471012402653e-05, + "loss": 1.5936418533325196, + "step": 74740 + }, + { + "epoch": 0.22626488883507637, + "grad_norm": 0.12839831411838531, + "learning_rate": 9.701091494238914e-05, + "loss": 1.5781572341918946, + "step": 74750 + }, + { + "epoch": 0.2262951583854222, + "grad_norm": 0.12189017981290817, + "learning_rate": 9.700711976075176e-05, + "loss": 1.5730547904968262, + "step": 74760 + }, + { + "epoch": 0.22632542793576801, + "grad_norm": 0.1361534595489502, + "learning_rate": 9.700332457911435e-05, + "loss": 1.5905332565307617, + "step": 74770 + }, + { + "epoch": 0.22635569748611384, + "grad_norm": 0.1173146516084671, + "learning_rate": 9.699952939747697e-05, + "loss": 1.5404370307922364, + "step": 74780 + }, + { + "epoch": 0.22638596703645966, + "grad_norm": 0.12180811166763306, + "learning_rate": 9.699573421583958e-05, + "loss": 1.6060791015625, + "step": 74790 + }, + { + "epoch": 0.2264162365868055, + "grad_norm": 0.12488718330860138, + "learning_rate": 9.699193903420218e-05, + "loss": 1.5637081146240235, + "step": 74800 + }, + { + "epoch": 0.22644650613715134, + "grad_norm": 0.11656468361616135, + "learning_rate": 9.698814385256479e-05, + "loss": 1.5943570137023926, + "step": 74810 + }, + { + "epoch": 0.22647677568749716, + "grad_norm": 0.1176661029458046, + "learning_rate": 9.69843486709274e-05, + "loss": 1.5488041877746581, + "step": 74820 + }, + { + "epoch": 0.22650704523784299, + "grad_norm": 0.1263640969991684, + "learning_rate": 9.698055348929e-05, + "loss": 1.559999656677246, + "step": 74830 + }, + { + "epoch": 0.2265373147881888, + "grad_norm": 0.1215357854962349, + "learning_rate": 9.69767583076526e-05, + "loss": 1.5511013984680175, + "step": 74840 + }, + { + "epoch": 0.22656758433853466, + "grad_norm": 0.10958737134933472, + "learning_rate": 9.697296312601521e-05, + "loss": 1.6582942962646485, + "step": 74850 + }, + { + "epoch": 0.22659785388888048, + "grad_norm": 0.11622839421033859, + "learning_rate": 9.696916794437782e-05, + "loss": 1.5585192680358886, + "step": 74860 + }, + { + "epoch": 0.2266281234392263, + "grad_norm": 0.13183458149433136, + "learning_rate": 9.696537276274042e-05, + "loss": 1.5670469284057618, + "step": 74870 + }, + { + "epoch": 0.22665839298957213, + "grad_norm": 0.09916001558303833, + "learning_rate": 9.696157758110303e-05, + "loss": 1.597175693511963, + "step": 74880 + }, + { + "epoch": 0.22668866253991796, + "grad_norm": 0.11246364563703537, + "learning_rate": 9.695778239946564e-05, + "loss": 1.563688373565674, + "step": 74890 + }, + { + "epoch": 0.2267189320902638, + "grad_norm": 0.12681254744529724, + "learning_rate": 9.695398721782824e-05, + "loss": 1.5441168785095214, + "step": 74900 + }, + { + "epoch": 0.22674920164060963, + "grad_norm": 0.12052275985479355, + "learning_rate": 9.695019203619085e-05, + "loss": 1.5842945098876953, + "step": 74910 + }, + { + "epoch": 0.22677947119095546, + "grad_norm": 0.12413047254085541, + "learning_rate": 9.694639685455347e-05, + "loss": 1.570373249053955, + "step": 74920 + }, + { + "epoch": 0.22680974074130128, + "grad_norm": 0.12826427817344666, + "learning_rate": 9.694260167291606e-05, + "loss": 1.604973602294922, + "step": 74930 + }, + { + "epoch": 0.2268400102916471, + "grad_norm": 0.12289790064096451, + "learning_rate": 9.693880649127868e-05, + "loss": 1.5924310684204102, + "step": 74940 + }, + { + "epoch": 0.22687027984199296, + "grad_norm": 0.11352238804101944, + "learning_rate": 9.693501130964129e-05, + "loss": 1.5888450622558594, + "step": 74950 + }, + { + "epoch": 0.22690054939233878, + "grad_norm": 0.1205134391784668, + "learning_rate": 9.693121612800389e-05, + "loss": 1.5872478485107422, + "step": 74960 + }, + { + "epoch": 0.2269308189426846, + "grad_norm": 0.11297819763422012, + "learning_rate": 9.69274209463665e-05, + "loss": 1.591837215423584, + "step": 74970 + }, + { + "epoch": 0.22696108849303043, + "grad_norm": 0.13670796155929565, + "learning_rate": 9.69236257647291e-05, + "loss": 1.5936182975769042, + "step": 74980 + }, + { + "epoch": 0.22699135804337628, + "grad_norm": 0.16447462141513824, + "learning_rate": 9.691983058309171e-05, + "loss": 1.6160394668579101, + "step": 74990 + }, + { + "epoch": 0.2270216275937221, + "grad_norm": 0.112755686044693, + "learning_rate": 9.691603540145432e-05, + "loss": 1.6036712646484375, + "step": 75000 + }, + { + "epoch": 0.2270216275937221, + "eval_loss": 1.602182149887085, + "eval_runtime": 27.8969, + "eval_samples_per_second": 17.923, + "eval_steps_per_second": 1.147, + "step": 75000 + }, + { + "epoch": 0.22705189714406793, + "grad_norm": 0.12147495150566101, + "learning_rate": 9.691224021981692e-05, + "loss": 1.5883769989013672, + "step": 75010 + }, + { + "epoch": 0.22708216669441375, + "grad_norm": 0.11831977963447571, + "learning_rate": 9.690844503817953e-05, + "loss": 1.5401390075683594, + "step": 75020 + }, + { + "epoch": 0.22711243624475957, + "grad_norm": 0.12718817591667175, + "learning_rate": 9.690464985654215e-05, + "loss": 1.5765352249145508, + "step": 75030 + }, + { + "epoch": 0.22714270579510543, + "grad_norm": 0.12446767091751099, + "learning_rate": 9.690085467490474e-05, + "loss": 1.5708078384399413, + "step": 75040 + }, + { + "epoch": 0.22717297534545125, + "grad_norm": 0.11313923448324203, + "learning_rate": 9.689705949326736e-05, + "loss": 1.5932841300964355, + "step": 75050 + }, + { + "epoch": 0.22720324489579707, + "grad_norm": 0.12809935212135315, + "learning_rate": 9.689326431162995e-05, + "loss": 1.589106559753418, + "step": 75060 + }, + { + "epoch": 0.2272335144461429, + "grad_norm": 0.13176460564136505, + "learning_rate": 9.688946912999257e-05, + "loss": 1.594968605041504, + "step": 75070 + }, + { + "epoch": 0.22726378399648872, + "grad_norm": 0.1243848130106926, + "learning_rate": 9.688567394835516e-05, + "loss": 1.5760751724243165, + "step": 75080 + }, + { + "epoch": 0.22729405354683457, + "grad_norm": 0.13021893799304962, + "learning_rate": 9.688187876671778e-05, + "loss": 1.5836851119995117, + "step": 75090 + }, + { + "epoch": 0.2273243230971804, + "grad_norm": 0.1147068589925766, + "learning_rate": 9.687808358508038e-05, + "loss": 1.567172145843506, + "step": 75100 + }, + { + "epoch": 0.22735459264752622, + "grad_norm": 0.1263834536075592, + "learning_rate": 9.6874288403443e-05, + "loss": 1.574370765686035, + "step": 75110 + }, + { + "epoch": 0.22738486219787205, + "grad_norm": 0.12704619765281677, + "learning_rate": 9.687049322180559e-05, + "loss": 1.5718505859375, + "step": 75120 + }, + { + "epoch": 0.22741513174821787, + "grad_norm": 0.10872866958379745, + "learning_rate": 9.68666980401682e-05, + "loss": 1.578198528289795, + "step": 75130 + }, + { + "epoch": 0.22744540129856372, + "grad_norm": 0.13881322741508484, + "learning_rate": 9.68629028585308e-05, + "loss": 1.6278499603271483, + "step": 75140 + }, + { + "epoch": 0.22747567084890954, + "grad_norm": 0.11983159184455872, + "learning_rate": 9.685910767689342e-05, + "loss": 1.5710865020751954, + "step": 75150 + }, + { + "epoch": 0.22750594039925537, + "grad_norm": 0.11776041239500046, + "learning_rate": 9.685531249525604e-05, + "loss": 1.614431381225586, + "step": 75160 + }, + { + "epoch": 0.2275362099496012, + "grad_norm": 0.12075543403625488, + "learning_rate": 9.685151731361863e-05, + "loss": 1.5765849113464356, + "step": 75170 + }, + { + "epoch": 0.22756647949994702, + "grad_norm": 0.12332672625780106, + "learning_rate": 9.684772213198125e-05, + "loss": 1.5969682693481446, + "step": 75180 + }, + { + "epoch": 0.22759674905029287, + "grad_norm": 0.10977785289287567, + "learning_rate": 9.684392695034384e-05, + "loss": 1.5583223342895507, + "step": 75190 + }, + { + "epoch": 0.2276270186006387, + "grad_norm": 0.11609804630279541, + "learning_rate": 9.684013176870646e-05, + "loss": 1.5984342575073243, + "step": 75200 + }, + { + "epoch": 0.22765728815098452, + "grad_norm": 0.11462356895208359, + "learning_rate": 9.683633658706905e-05, + "loss": 1.5454853057861329, + "step": 75210 + }, + { + "epoch": 0.22768755770133034, + "grad_norm": 0.12452369183301926, + "learning_rate": 9.683254140543167e-05, + "loss": 1.557731533050537, + "step": 75220 + }, + { + "epoch": 0.22771782725167616, + "grad_norm": 0.1160297691822052, + "learning_rate": 9.682874622379427e-05, + "loss": 1.5654947280883789, + "step": 75230 + }, + { + "epoch": 0.22774809680202202, + "grad_norm": 0.12375196814537048, + "learning_rate": 9.682495104215689e-05, + "loss": 1.6106365203857422, + "step": 75240 + }, + { + "epoch": 0.22777836635236784, + "grad_norm": 0.11950086057186127, + "learning_rate": 9.682115586051948e-05, + "loss": 1.5750429153442382, + "step": 75250 + }, + { + "epoch": 0.22780863590271366, + "grad_norm": 0.13787558674812317, + "learning_rate": 9.68173606788821e-05, + "loss": 1.5959253311157227, + "step": 75260 + }, + { + "epoch": 0.2278389054530595, + "grad_norm": 0.11384071409702301, + "learning_rate": 9.681356549724469e-05, + "loss": 1.5494142532348634, + "step": 75270 + }, + { + "epoch": 0.2278691750034053, + "grad_norm": 0.12504643201828003, + "learning_rate": 9.680977031560731e-05, + "loss": 1.6047000885009766, + "step": 75280 + }, + { + "epoch": 0.22789944455375116, + "grad_norm": 0.10396113991737366, + "learning_rate": 9.680597513396992e-05, + "loss": 1.558885097503662, + "step": 75290 + }, + { + "epoch": 0.227929714104097, + "grad_norm": 0.1122540757060051, + "learning_rate": 9.680217995233252e-05, + "loss": 1.592005729675293, + "step": 75300 + }, + { + "epoch": 0.2279599836544428, + "grad_norm": 0.11120831966400146, + "learning_rate": 9.679838477069513e-05, + "loss": 1.585443115234375, + "step": 75310 + }, + { + "epoch": 0.22799025320478863, + "grad_norm": 0.13013911247253418, + "learning_rate": 9.679458958905773e-05, + "loss": 1.5759772300720214, + "step": 75320 + }, + { + "epoch": 0.22802052275513449, + "grad_norm": 0.1255125254392624, + "learning_rate": 9.679079440742034e-05, + "loss": 1.5257015228271484, + "step": 75330 + }, + { + "epoch": 0.2280507923054803, + "grad_norm": 0.12331200391054153, + "learning_rate": 9.678699922578294e-05, + "loss": 1.5602053642272948, + "step": 75340 + }, + { + "epoch": 0.22808106185582613, + "grad_norm": 0.13033747673034668, + "learning_rate": 9.678320404414555e-05, + "loss": 1.581947135925293, + "step": 75350 + }, + { + "epoch": 0.22811133140617196, + "grad_norm": 0.11279221624135971, + "learning_rate": 9.677940886250816e-05, + "loss": 1.620230484008789, + "step": 75360 + }, + { + "epoch": 0.22814160095651778, + "grad_norm": 0.12851110100746155, + "learning_rate": 9.677561368087078e-05, + "loss": 1.6009487152099608, + "step": 75370 + }, + { + "epoch": 0.22817187050686363, + "grad_norm": 0.11423024535179138, + "learning_rate": 9.677181849923337e-05, + "loss": 1.5808245658874511, + "step": 75380 + }, + { + "epoch": 0.22820214005720946, + "grad_norm": 0.12894414365291595, + "learning_rate": 9.676802331759599e-05, + "loss": 1.6015708923339844, + "step": 75390 + }, + { + "epoch": 0.22823240960755528, + "grad_norm": 0.1351497918367386, + "learning_rate": 9.67642281359586e-05, + "loss": 1.577257251739502, + "step": 75400 + }, + { + "epoch": 0.2282626791579011, + "grad_norm": 0.11777164787054062, + "learning_rate": 9.67604329543212e-05, + "loss": 1.5670177459716796, + "step": 75410 + }, + { + "epoch": 0.22829294870824693, + "grad_norm": 0.13447366654872894, + "learning_rate": 9.67566377726838e-05, + "loss": 1.5270514488220215, + "step": 75420 + }, + { + "epoch": 0.22832321825859278, + "grad_norm": 0.12820690870285034, + "learning_rate": 9.675284259104641e-05, + "loss": 1.5480984687805175, + "step": 75430 + }, + { + "epoch": 0.2283534878089386, + "grad_norm": 0.10887233167886734, + "learning_rate": 9.674904740940902e-05, + "loss": 1.613353157043457, + "step": 75440 + }, + { + "epoch": 0.22838375735928443, + "grad_norm": 0.11879491806030273, + "learning_rate": 9.674525222777162e-05, + "loss": 1.5888729095458984, + "step": 75450 + }, + { + "epoch": 0.22841402690963025, + "grad_norm": 0.10719640552997589, + "learning_rate": 9.674145704613423e-05, + "loss": 1.5418514251708983, + "step": 75460 + }, + { + "epoch": 0.22844429645997608, + "grad_norm": 0.1281754970550537, + "learning_rate": 9.673766186449684e-05, + "loss": 1.520408821105957, + "step": 75470 + }, + { + "epoch": 0.22847456601032193, + "grad_norm": 0.12500761449337006, + "learning_rate": 9.673386668285944e-05, + "loss": 1.5710529327392577, + "step": 75480 + }, + { + "epoch": 0.22850483556066775, + "grad_norm": 0.11112914234399796, + "learning_rate": 9.673007150122205e-05, + "loss": 1.5450596809387207, + "step": 75490 + }, + { + "epoch": 0.22853510511101358, + "grad_norm": 0.12097073346376419, + "learning_rate": 9.672627631958465e-05, + "loss": 1.5557779312133788, + "step": 75500 + }, + { + "epoch": 0.22853510511101358, + "eval_loss": 1.5688917636871338, + "eval_runtime": 28.2499, + "eval_samples_per_second": 17.699, + "eval_steps_per_second": 1.133, + "step": 75500 + }, + { + "epoch": 0.2285653746613594, + "grad_norm": 0.11480742692947388, + "learning_rate": 9.672248113794726e-05, + "loss": 1.5608836174011231, + "step": 75510 + }, + { + "epoch": 0.22859564421170522, + "grad_norm": 0.12854918837547302, + "learning_rate": 9.671868595630987e-05, + "loss": 1.5678319931030273, + "step": 75520 + }, + { + "epoch": 0.22862591376205108, + "grad_norm": 0.1252436339855194, + "learning_rate": 9.671489077467249e-05, + "loss": 1.5656166076660156, + "step": 75530 + }, + { + "epoch": 0.2286561833123969, + "grad_norm": 0.12976433336734772, + "learning_rate": 9.671109559303508e-05, + "loss": 1.5845264434814452, + "step": 75540 + }, + { + "epoch": 0.22868645286274272, + "grad_norm": 0.11768090724945068, + "learning_rate": 9.67073004113977e-05, + "loss": 1.5750739097595214, + "step": 75550 + }, + { + "epoch": 0.22871672241308855, + "grad_norm": 0.12863585352897644, + "learning_rate": 9.67035052297603e-05, + "loss": 1.554819393157959, + "step": 75560 + }, + { + "epoch": 0.22874699196343437, + "grad_norm": 0.129537433385849, + "learning_rate": 9.669971004812291e-05, + "loss": 1.5676586151123046, + "step": 75570 + }, + { + "epoch": 0.22877726151378022, + "grad_norm": 0.11640427261590958, + "learning_rate": 9.669591486648551e-05, + "loss": 1.5563002586364747, + "step": 75580 + }, + { + "epoch": 0.22880753106412605, + "grad_norm": 0.12390224635601044, + "learning_rate": 9.669211968484812e-05, + "loss": 1.6098625183105468, + "step": 75590 + }, + { + "epoch": 0.22883780061447187, + "grad_norm": 0.1204579621553421, + "learning_rate": 9.668832450321073e-05, + "loss": 1.6133880615234375, + "step": 75600 + }, + { + "epoch": 0.2288680701648177, + "grad_norm": 0.12392484396696091, + "learning_rate": 9.668452932157333e-05, + "loss": 1.5964378356933593, + "step": 75610 + }, + { + "epoch": 0.22889833971516352, + "grad_norm": 0.12223705649375916, + "learning_rate": 9.668073413993594e-05, + "loss": 1.5794709205627442, + "step": 75620 + }, + { + "epoch": 0.22892860926550937, + "grad_norm": 0.11854306608438492, + "learning_rate": 9.667693895829854e-05, + "loss": 1.582606887817383, + "step": 75630 + }, + { + "epoch": 0.2289588788158552, + "grad_norm": 0.1168246865272522, + "learning_rate": 9.667314377666115e-05, + "loss": 1.5791311264038086, + "step": 75640 + }, + { + "epoch": 0.22898914836620102, + "grad_norm": 0.12226526439189911, + "learning_rate": 9.666934859502376e-05, + "loss": 1.5728935241699218, + "step": 75650 + }, + { + "epoch": 0.22901941791654684, + "grad_norm": 0.10376656800508499, + "learning_rate": 9.666555341338638e-05, + "loss": 1.5872289657592773, + "step": 75660 + }, + { + "epoch": 0.2290496874668927, + "grad_norm": 0.12065304815769196, + "learning_rate": 9.666175823174897e-05, + "loss": 1.5304628372192384, + "step": 75670 + }, + { + "epoch": 0.22907995701723852, + "grad_norm": 0.11495330929756165, + "learning_rate": 9.665796305011159e-05, + "loss": 1.5647405624389648, + "step": 75680 + }, + { + "epoch": 0.22911022656758434, + "grad_norm": 0.1261598765850067, + "learning_rate": 9.665416786847418e-05, + "loss": 1.6089500427246093, + "step": 75690 + }, + { + "epoch": 0.22914049611793016, + "grad_norm": 0.11820971965789795, + "learning_rate": 9.66503726868368e-05, + "loss": 1.5366806030273437, + "step": 75700 + }, + { + "epoch": 0.229170765668276, + "grad_norm": 0.12257788330316544, + "learning_rate": 9.664657750519939e-05, + "loss": 1.5602566719055175, + "step": 75710 + }, + { + "epoch": 0.22920103521862184, + "grad_norm": 0.12720537185668945, + "learning_rate": 9.664278232356201e-05, + "loss": 1.5483808517456055, + "step": 75720 + }, + { + "epoch": 0.22923130476896766, + "grad_norm": 0.11692911386489868, + "learning_rate": 9.66389871419246e-05, + "loss": 1.5780531883239746, + "step": 75730 + }, + { + "epoch": 0.2292615743193135, + "grad_norm": 0.13279195129871368, + "learning_rate": 9.663519196028722e-05, + "loss": 1.584081268310547, + "step": 75740 + }, + { + "epoch": 0.2292918438696593, + "grad_norm": 0.11921446025371552, + "learning_rate": 9.663139677864982e-05, + "loss": 1.5784124374389648, + "step": 75750 + }, + { + "epoch": 0.22932211342000514, + "grad_norm": 0.13121384382247925, + "learning_rate": 9.662760159701244e-05, + "loss": 1.5903234481811523, + "step": 75760 + }, + { + "epoch": 0.229352382970351, + "grad_norm": 0.12283974140882492, + "learning_rate": 9.662380641537506e-05, + "loss": 1.5666047096252442, + "step": 75770 + }, + { + "epoch": 0.2293826525206968, + "grad_norm": 0.13076643645763397, + "learning_rate": 9.662001123373765e-05, + "loss": 1.5925443649291993, + "step": 75780 + }, + { + "epoch": 0.22941292207104264, + "grad_norm": 0.11475292593240738, + "learning_rate": 9.661621605210027e-05, + "loss": 1.5831607818603515, + "step": 75790 + }, + { + "epoch": 0.22944319162138846, + "grad_norm": 0.13153581321239471, + "learning_rate": 9.661242087046286e-05, + "loss": 1.5259992599487304, + "step": 75800 + }, + { + "epoch": 0.22947346117173428, + "grad_norm": 0.12840165197849274, + "learning_rate": 9.660862568882548e-05, + "loss": 1.5878026962280274, + "step": 75810 + }, + { + "epoch": 0.22950373072208013, + "grad_norm": 0.11558302491903305, + "learning_rate": 9.660483050718807e-05, + "loss": 1.5789868354797363, + "step": 75820 + }, + { + "epoch": 0.22953400027242596, + "grad_norm": 0.12085948884487152, + "learning_rate": 9.660103532555069e-05, + "loss": 1.570092010498047, + "step": 75830 + }, + { + "epoch": 0.22956426982277178, + "grad_norm": 0.12807923555374146, + "learning_rate": 9.659724014391328e-05, + "loss": 1.5807123184204102, + "step": 75840 + }, + { + "epoch": 0.2295945393731176, + "grad_norm": 0.11785180121660233, + "learning_rate": 9.65934449622759e-05, + "loss": 1.6020195007324218, + "step": 75850 + }, + { + "epoch": 0.22962480892346343, + "grad_norm": 0.13894183933734894, + "learning_rate": 9.65896497806385e-05, + "loss": 1.5601341247558593, + "step": 75860 + }, + { + "epoch": 0.22965507847380928, + "grad_norm": 0.12402599304914474, + "learning_rate": 9.658585459900111e-05, + "loss": 1.5623372077941895, + "step": 75870 + }, + { + "epoch": 0.2296853480241551, + "grad_norm": 0.10819848626852036, + "learning_rate": 9.658205941736371e-05, + "loss": 1.5971287727355956, + "step": 75880 + }, + { + "epoch": 0.22971561757450093, + "grad_norm": 0.11979110538959503, + "learning_rate": 9.657826423572633e-05, + "loss": 1.6123680114746093, + "step": 75890 + }, + { + "epoch": 0.22974588712484675, + "grad_norm": 0.12002062052488327, + "learning_rate": 9.657446905408893e-05, + "loss": 1.5571167945861817, + "step": 75900 + }, + { + "epoch": 0.22977615667519258, + "grad_norm": 0.12076717615127563, + "learning_rate": 9.657067387245154e-05, + "loss": 1.6182464599609374, + "step": 75910 + }, + { + "epoch": 0.22980642622553843, + "grad_norm": 0.11844111979007721, + "learning_rate": 9.656687869081414e-05, + "loss": 1.5697105407714844, + "step": 75920 + }, + { + "epoch": 0.22983669577588425, + "grad_norm": 0.12360633909702301, + "learning_rate": 9.656308350917675e-05, + "loss": 1.5995660781860352, + "step": 75930 + }, + { + "epoch": 0.22986696532623008, + "grad_norm": 0.11136158555746078, + "learning_rate": 9.655928832753936e-05, + "loss": 1.572170352935791, + "step": 75940 + }, + { + "epoch": 0.2298972348765759, + "grad_norm": 0.12802138924598694, + "learning_rate": 9.655549314590196e-05, + "loss": 1.599266815185547, + "step": 75950 + }, + { + "epoch": 0.22992750442692172, + "grad_norm": 0.12343189120292664, + "learning_rate": 9.655169796426457e-05, + "loss": 1.5906105041503906, + "step": 75960 + }, + { + "epoch": 0.22995777397726758, + "grad_norm": 0.1115606427192688, + "learning_rate": 9.654790278262717e-05, + "loss": 1.6328258514404297, + "step": 75970 + }, + { + "epoch": 0.2299880435276134, + "grad_norm": 0.1277553290128708, + "learning_rate": 9.65441076009898e-05, + "loss": 1.5574795722961425, + "step": 75980 + }, + { + "epoch": 0.23001831307795922, + "grad_norm": 0.1359729766845703, + "learning_rate": 9.654031241935239e-05, + "loss": 1.6034337997436523, + "step": 75990 + }, + { + "epoch": 0.23004858262830505, + "grad_norm": 0.11759869754314423, + "learning_rate": 9.6536517237715e-05, + "loss": 1.5957371711730957, + "step": 76000 + }, + { + "epoch": 0.23004858262830505, + "eval_loss": 1.5653029680252075, + "eval_runtime": 27.9189, + "eval_samples_per_second": 17.909, + "eval_steps_per_second": 1.146, + "step": 76000 + }, + { + "epoch": 0.2300788521786509, + "grad_norm": 0.11134830117225647, + "learning_rate": 9.65327220560776e-05, + "loss": 1.5765039443969726, + "step": 76010 + }, + { + "epoch": 0.23010912172899672, + "grad_norm": 0.11802484840154648, + "learning_rate": 9.652892687444022e-05, + "loss": 1.5873540878295898, + "step": 76020 + }, + { + "epoch": 0.23013939127934255, + "grad_norm": 0.11183806508779526, + "learning_rate": 9.652513169280282e-05, + "loss": 1.5709800720214844, + "step": 76030 + }, + { + "epoch": 0.23016966082968837, + "grad_norm": 0.12501053512096405, + "learning_rate": 9.652133651116543e-05, + "loss": 1.6109485626220703, + "step": 76040 + }, + { + "epoch": 0.2301999303800342, + "grad_norm": 0.1307171881198883, + "learning_rate": 9.651754132952804e-05, + "loss": 1.5594093322753906, + "step": 76050 + }, + { + "epoch": 0.23023019993038005, + "grad_norm": 0.1299317479133606, + "learning_rate": 9.651374614789064e-05, + "loss": 1.577742862701416, + "step": 76060 + }, + { + "epoch": 0.23026046948072587, + "grad_norm": 0.12043095380067825, + "learning_rate": 9.650995096625325e-05, + "loss": 1.5406078338623046, + "step": 76070 + }, + { + "epoch": 0.2302907390310717, + "grad_norm": 0.11542845517396927, + "learning_rate": 9.650615578461585e-05, + "loss": 1.5589275360107422, + "step": 76080 + }, + { + "epoch": 0.23032100858141752, + "grad_norm": 0.13154469430446625, + "learning_rate": 9.650236060297846e-05, + "loss": 1.5765190124511719, + "step": 76090 + }, + { + "epoch": 0.23035127813176334, + "grad_norm": 0.13776564598083496, + "learning_rate": 9.649856542134106e-05, + "loss": 1.5638414382934571, + "step": 76100 + }, + { + "epoch": 0.2303815476821092, + "grad_norm": 0.13349886238574982, + "learning_rate": 9.649477023970367e-05, + "loss": 1.602229690551758, + "step": 76110 + }, + { + "epoch": 0.23041181723245502, + "grad_norm": 0.11420261859893799, + "learning_rate": 9.649097505806628e-05, + "loss": 1.5685226440429687, + "step": 76120 + }, + { + "epoch": 0.23044208678280084, + "grad_norm": 0.12554363906383514, + "learning_rate": 9.648717987642888e-05, + "loss": 1.5797471046447753, + "step": 76130 + }, + { + "epoch": 0.23047235633314667, + "grad_norm": 0.12942683696746826, + "learning_rate": 9.64833846947915e-05, + "loss": 1.5995399475097656, + "step": 76140 + }, + { + "epoch": 0.2305026258834925, + "grad_norm": 0.11602850258350372, + "learning_rate": 9.64795895131541e-05, + "loss": 1.6076881408691406, + "step": 76150 + }, + { + "epoch": 0.23053289543383834, + "grad_norm": 0.12724147737026215, + "learning_rate": 9.647579433151671e-05, + "loss": 1.5648435592651366, + "step": 76160 + }, + { + "epoch": 0.23056316498418417, + "grad_norm": 0.13687893748283386, + "learning_rate": 9.647199914987932e-05, + "loss": 1.5710405349731444, + "step": 76170 + }, + { + "epoch": 0.23059343453453, + "grad_norm": 0.1177411824464798, + "learning_rate": 9.646820396824193e-05, + "loss": 1.590279769897461, + "step": 76180 + }, + { + "epoch": 0.2306237040848758, + "grad_norm": 0.11263470351696014, + "learning_rate": 9.646440878660453e-05, + "loss": 1.6081981658935547, + "step": 76190 + }, + { + "epoch": 0.23065397363522164, + "grad_norm": 0.13285385072231293, + "learning_rate": 9.646061360496714e-05, + "loss": 1.5655887603759766, + "step": 76200 + }, + { + "epoch": 0.2306842431855675, + "grad_norm": 0.10562961548566818, + "learning_rate": 9.645681842332974e-05, + "loss": 1.5971649169921875, + "step": 76210 + }, + { + "epoch": 0.2307145127359133, + "grad_norm": 0.12698890268802643, + "learning_rate": 9.645302324169235e-05, + "loss": 1.5702003479003905, + "step": 76220 + }, + { + "epoch": 0.23074478228625914, + "grad_norm": 0.1250429004430771, + "learning_rate": 9.644922806005496e-05, + "loss": 1.5549023628234864, + "step": 76230 + }, + { + "epoch": 0.23077505183660496, + "grad_norm": 0.14304892718791962, + "learning_rate": 9.644543287841756e-05, + "loss": 1.5494070053100586, + "step": 76240 + }, + { + "epoch": 0.23080532138695078, + "grad_norm": 0.11427866667509079, + "learning_rate": 9.644163769678017e-05, + "loss": 1.6058383941650392, + "step": 76250 + }, + { + "epoch": 0.23083559093729664, + "grad_norm": 0.11929933726787567, + "learning_rate": 9.643784251514277e-05, + "loss": 1.5841894149780273, + "step": 76260 + }, + { + "epoch": 0.23086586048764246, + "grad_norm": 0.11808031797409058, + "learning_rate": 9.643404733350539e-05, + "loss": 1.6038156509399415, + "step": 76270 + }, + { + "epoch": 0.23089613003798828, + "grad_norm": 0.1304202675819397, + "learning_rate": 9.643025215186799e-05, + "loss": 1.6158987045288087, + "step": 76280 + }, + { + "epoch": 0.2309263995883341, + "grad_norm": 0.11770957708358765, + "learning_rate": 9.64264569702306e-05, + "loss": 1.566788101196289, + "step": 76290 + }, + { + "epoch": 0.23095666913867993, + "grad_norm": 0.12651173770427704, + "learning_rate": 9.64226617885932e-05, + "loss": 1.5684860229492188, + "step": 76300 + }, + { + "epoch": 0.23098693868902578, + "grad_norm": 0.14231757819652557, + "learning_rate": 9.641886660695582e-05, + "loss": 1.5907429695129394, + "step": 76310 + }, + { + "epoch": 0.2310172082393716, + "grad_norm": 0.11654739826917648, + "learning_rate": 9.641507142531841e-05, + "loss": 1.5883282661437987, + "step": 76320 + }, + { + "epoch": 0.23104747778971743, + "grad_norm": 0.13713018596172333, + "learning_rate": 9.641127624368103e-05, + "loss": 1.6061168670654298, + "step": 76330 + }, + { + "epoch": 0.23107774734006326, + "grad_norm": 0.11617536097764969, + "learning_rate": 9.640748106204362e-05, + "loss": 1.582835578918457, + "step": 76340 + }, + { + "epoch": 0.23110801689040908, + "grad_norm": 0.11653213948011398, + "learning_rate": 9.640368588040624e-05, + "loss": 1.5847553253173827, + "step": 76350 + }, + { + "epoch": 0.23113828644075493, + "grad_norm": 0.11409452557563782, + "learning_rate": 9.639989069876883e-05, + "loss": 1.5462360382080078, + "step": 76360 + }, + { + "epoch": 0.23116855599110075, + "grad_norm": 0.12903527915477753, + "learning_rate": 9.639609551713145e-05, + "loss": 1.5776236534118653, + "step": 76370 + }, + { + "epoch": 0.23119882554144658, + "grad_norm": 0.12607631087303162, + "learning_rate": 9.639230033549406e-05, + "loss": 1.5352464675903321, + "step": 76380 + }, + { + "epoch": 0.2312290950917924, + "grad_norm": 0.11513262242078781, + "learning_rate": 9.638850515385666e-05, + "loss": 1.611330795288086, + "step": 76390 + }, + { + "epoch": 0.23125936464213825, + "grad_norm": 0.12510709464550018, + "learning_rate": 9.638470997221928e-05, + "loss": 1.5569023132324218, + "step": 76400 + }, + { + "epoch": 0.23128963419248408, + "grad_norm": 0.11184360086917877, + "learning_rate": 9.638091479058188e-05, + "loss": 1.5865388870239259, + "step": 76410 + }, + { + "epoch": 0.2313199037428299, + "grad_norm": 0.10519010573625565, + "learning_rate": 9.63771196089445e-05, + "loss": 1.5958011627197266, + "step": 76420 + }, + { + "epoch": 0.23135017329317573, + "grad_norm": 0.10898474603891373, + "learning_rate": 9.637332442730709e-05, + "loss": 1.6081357955932618, + "step": 76430 + }, + { + "epoch": 0.23138044284352155, + "grad_norm": 0.12509001791477203, + "learning_rate": 9.636952924566971e-05, + "loss": 1.6151342391967773, + "step": 76440 + }, + { + "epoch": 0.2314107123938674, + "grad_norm": 0.12133437395095825, + "learning_rate": 9.63657340640323e-05, + "loss": 1.556235122680664, + "step": 76450 + }, + { + "epoch": 0.23144098194421323, + "grad_norm": 0.12649941444396973, + "learning_rate": 9.636193888239492e-05, + "loss": 1.5844610214233399, + "step": 76460 + }, + { + "epoch": 0.23147125149455905, + "grad_norm": 0.11197403818368912, + "learning_rate": 9.635814370075751e-05, + "loss": 1.565000343322754, + "step": 76470 + }, + { + "epoch": 0.23150152104490487, + "grad_norm": 0.09589146822690964, + "learning_rate": 9.635434851912013e-05, + "loss": 1.5986014366149903, + "step": 76480 + }, + { + "epoch": 0.2315317905952507, + "grad_norm": 0.12774530053138733, + "learning_rate": 9.635055333748272e-05, + "loss": 1.54892578125, + "step": 76490 + }, + { + "epoch": 0.23156206014559655, + "grad_norm": 0.11111695319414139, + "learning_rate": 9.634675815584534e-05, + "loss": 1.5663530349731445, + "step": 76500 + }, + { + "epoch": 0.23156206014559655, + "eval_loss": 1.5973706245422363, + "eval_runtime": 28.1321, + "eval_samples_per_second": 17.773, + "eval_steps_per_second": 1.137, + "step": 76500 + }, + { + "epoch": 0.23159232969594237, + "grad_norm": 0.12956732511520386, + "learning_rate": 9.634296297420794e-05, + "loss": 1.5657268524169923, + "step": 76510 + }, + { + "epoch": 0.2316225992462882, + "grad_norm": 0.12253692001104355, + "learning_rate": 9.633916779257056e-05, + "loss": 1.5548322677612305, + "step": 76520 + }, + { + "epoch": 0.23165286879663402, + "grad_norm": 0.12801939249038696, + "learning_rate": 9.633537261093316e-05, + "loss": 1.5790658950805665, + "step": 76530 + }, + { + "epoch": 0.23168313834697984, + "grad_norm": 0.13023655116558075, + "learning_rate": 9.633157742929577e-05, + "loss": 1.5845087051391602, + "step": 76540 + }, + { + "epoch": 0.2317134078973257, + "grad_norm": 0.11845855414867401, + "learning_rate": 9.632778224765837e-05, + "loss": 1.54342041015625, + "step": 76550 + }, + { + "epoch": 0.23174367744767152, + "grad_norm": 0.1278640627861023, + "learning_rate": 9.632398706602098e-05, + "loss": 1.5563719749450684, + "step": 76560 + }, + { + "epoch": 0.23177394699801734, + "grad_norm": 0.1192706972360611, + "learning_rate": 9.632019188438359e-05, + "loss": 1.5670278549194336, + "step": 76570 + }, + { + "epoch": 0.23180421654836317, + "grad_norm": 0.11999581754207611, + "learning_rate": 9.631639670274619e-05, + "loss": 1.5523187637329101, + "step": 76580 + }, + { + "epoch": 0.231834486098709, + "grad_norm": 0.11588151007890701, + "learning_rate": 9.631260152110881e-05, + "loss": 1.5667994499206543, + "step": 76590 + }, + { + "epoch": 0.23186475564905484, + "grad_norm": 0.11267935484647751, + "learning_rate": 9.63088063394714e-05, + "loss": 1.6221504211425781, + "step": 76600 + }, + { + "epoch": 0.23189502519940067, + "grad_norm": 0.11152441054582596, + "learning_rate": 9.630501115783402e-05, + "loss": 1.6041160583496095, + "step": 76610 + }, + { + "epoch": 0.2319252947497465, + "grad_norm": 0.11654901504516602, + "learning_rate": 9.630121597619661e-05, + "loss": 1.5684024810791015, + "step": 76620 + }, + { + "epoch": 0.23195556430009232, + "grad_norm": 0.12014549970626831, + "learning_rate": 9.629742079455923e-05, + "loss": 1.59901762008667, + "step": 76630 + }, + { + "epoch": 0.23198583385043814, + "grad_norm": 0.11130398511886597, + "learning_rate": 9.629362561292184e-05, + "loss": 1.5620774269104003, + "step": 76640 + }, + { + "epoch": 0.232016103400784, + "grad_norm": 0.12691143155097961, + "learning_rate": 9.628983043128445e-05, + "loss": 1.5537134170532227, + "step": 76650 + }, + { + "epoch": 0.23204637295112981, + "grad_norm": 0.1233915314078331, + "learning_rate": 9.628603524964705e-05, + "loss": 1.6270669937133788, + "step": 76660 + }, + { + "epoch": 0.23207664250147564, + "grad_norm": 0.1267181932926178, + "learning_rate": 9.628224006800966e-05, + "loss": 1.5388619422912597, + "step": 76670 + }, + { + "epoch": 0.23210691205182146, + "grad_norm": 0.1332184076309204, + "learning_rate": 9.627844488637226e-05, + "loss": 1.5262630462646485, + "step": 76680 + }, + { + "epoch": 0.2321371816021673, + "grad_norm": 0.10828657448291779, + "learning_rate": 9.627464970473487e-05, + "loss": 1.6066633224487306, + "step": 76690 + }, + { + "epoch": 0.23216745115251314, + "grad_norm": 0.1190217062830925, + "learning_rate": 9.627085452309748e-05, + "loss": 1.5551139831542968, + "step": 76700 + }, + { + "epoch": 0.23219772070285896, + "grad_norm": 0.11951225996017456, + "learning_rate": 9.626705934146008e-05, + "loss": 1.5689263343811035, + "step": 76710 + }, + { + "epoch": 0.23222799025320479, + "grad_norm": 0.12068745493888855, + "learning_rate": 9.626326415982269e-05, + "loss": 1.6146255493164063, + "step": 76720 + }, + { + "epoch": 0.2322582598035506, + "grad_norm": 0.14365604519844055, + "learning_rate": 9.62594689781853e-05, + "loss": 1.5923797607421875, + "step": 76730 + }, + { + "epoch": 0.23228852935389646, + "grad_norm": 0.10901971906423569, + "learning_rate": 9.62556737965479e-05, + "loss": 1.5645550727844237, + "step": 76740 + }, + { + "epoch": 0.23231879890424229, + "grad_norm": 0.1264725774526596, + "learning_rate": 9.62518786149105e-05, + "loss": 1.619944953918457, + "step": 76750 + }, + { + "epoch": 0.2323490684545881, + "grad_norm": 0.11373291909694672, + "learning_rate": 9.624808343327311e-05, + "loss": 1.578386116027832, + "step": 76760 + }, + { + "epoch": 0.23237933800493393, + "grad_norm": 0.109378382563591, + "learning_rate": 9.624428825163573e-05, + "loss": 1.6277521133422852, + "step": 76770 + }, + { + "epoch": 0.23240960755527976, + "grad_norm": 0.11612702161073685, + "learning_rate": 9.624049306999834e-05, + "loss": 1.5976410865783692, + "step": 76780 + }, + { + "epoch": 0.2324398771056256, + "grad_norm": 0.11322078853845596, + "learning_rate": 9.623669788836094e-05, + "loss": 1.5709136962890624, + "step": 76790 + }, + { + "epoch": 0.23247014665597143, + "grad_norm": 0.11136094480752945, + "learning_rate": 9.623290270672355e-05, + "loss": 1.6119195938110351, + "step": 76800 + }, + { + "epoch": 0.23250041620631726, + "grad_norm": 0.1060461476445198, + "learning_rate": 9.622910752508616e-05, + "loss": 1.6042060852050781, + "step": 76810 + }, + { + "epoch": 0.23253068575666308, + "grad_norm": 0.12210551649332047, + "learning_rate": 9.622531234344876e-05, + "loss": 1.5845586776733398, + "step": 76820 + }, + { + "epoch": 0.2325609553070089, + "grad_norm": 0.11832258105278015, + "learning_rate": 9.622151716181137e-05, + "loss": 1.595197868347168, + "step": 76830 + }, + { + "epoch": 0.23259122485735476, + "grad_norm": 0.1198997050523758, + "learning_rate": 9.621772198017397e-05, + "loss": 1.6064123153686523, + "step": 76840 + }, + { + "epoch": 0.23262149440770058, + "grad_norm": 0.12098624557256699, + "learning_rate": 9.621392679853658e-05, + "loss": 1.5987191200256348, + "step": 76850 + }, + { + "epoch": 0.2326517639580464, + "grad_norm": 0.11200353503227234, + "learning_rate": 9.621013161689918e-05, + "loss": 1.5785354614257812, + "step": 76860 + }, + { + "epoch": 0.23268203350839223, + "grad_norm": 0.11739974468946457, + "learning_rate": 9.620633643526179e-05, + "loss": 1.5665201187133788, + "step": 76870 + }, + { + "epoch": 0.23271230305873805, + "grad_norm": 0.11290857195854187, + "learning_rate": 9.620254125362441e-05, + "loss": 1.576283836364746, + "step": 76880 + }, + { + "epoch": 0.2327425726090839, + "grad_norm": 0.12401093542575836, + "learning_rate": 9.6198746071987e-05, + "loss": 1.5612998008728027, + "step": 76890 + }, + { + "epoch": 0.23277284215942973, + "grad_norm": 0.13128866255283356, + "learning_rate": 9.619495089034962e-05, + "loss": 1.556320571899414, + "step": 76900 + }, + { + "epoch": 0.23280311170977555, + "grad_norm": 0.1409604549407959, + "learning_rate": 9.619115570871221e-05, + "loss": 1.5962905883789062, + "step": 76910 + }, + { + "epoch": 0.23283338126012137, + "grad_norm": 0.12983018159866333, + "learning_rate": 9.618736052707483e-05, + "loss": 1.5515653610229492, + "step": 76920 + }, + { + "epoch": 0.2328636508104672, + "grad_norm": 0.13041584193706512, + "learning_rate": 9.618356534543743e-05, + "loss": 1.5811960220336914, + "step": 76930 + }, + { + "epoch": 0.23289392036081305, + "grad_norm": 0.11188437789678574, + "learning_rate": 9.617977016380005e-05, + "loss": 1.5848315238952637, + "step": 76940 + }, + { + "epoch": 0.23292418991115887, + "grad_norm": 0.11212726682424545, + "learning_rate": 9.617597498216264e-05, + "loss": 1.5679123878479004, + "step": 76950 + }, + { + "epoch": 0.2329544594615047, + "grad_norm": 0.10946749150753021, + "learning_rate": 9.617217980052526e-05, + "loss": 1.601031494140625, + "step": 76960 + }, + { + "epoch": 0.23298472901185052, + "grad_norm": 0.12499304860830307, + "learning_rate": 9.616838461888785e-05, + "loss": 1.5491325378417968, + "step": 76970 + }, + { + "epoch": 0.23301499856219635, + "grad_norm": 0.14425617456436157, + "learning_rate": 9.616458943725047e-05, + "loss": 1.5786486625671388, + "step": 76980 + }, + { + "epoch": 0.2330452681125422, + "grad_norm": 0.1120036393404007, + "learning_rate": 9.616079425561308e-05, + "loss": 1.5946664810180664, + "step": 76990 + }, + { + "epoch": 0.23307553766288802, + "grad_norm": 0.14021992683410645, + "learning_rate": 9.615699907397568e-05, + "loss": 1.6123855590820313, + "step": 77000 + }, + { + "epoch": 0.23307553766288802, + "eval_loss": 1.5768206119537354, + "eval_runtime": 27.9177, + "eval_samples_per_second": 17.91, + "eval_steps_per_second": 1.146, + "step": 77000 + }, + { + "epoch": 0.23310580721323385, + "grad_norm": 0.10775838047266006, + "learning_rate": 9.61532038923383e-05, + "loss": 1.5574686050415039, + "step": 77010 + }, + { + "epoch": 0.23313607676357967, + "grad_norm": 0.12367487698793411, + "learning_rate": 9.61494087107009e-05, + "loss": 1.5739361763000488, + "step": 77020 + }, + { + "epoch": 0.2331663463139255, + "grad_norm": 0.11961870640516281, + "learning_rate": 9.614561352906351e-05, + "loss": 1.5587377548217773, + "step": 77030 + }, + { + "epoch": 0.23319661586427134, + "grad_norm": 0.11266467720270157, + "learning_rate": 9.61418183474261e-05, + "loss": 1.5671609878540038, + "step": 77040 + }, + { + "epoch": 0.23322688541461717, + "grad_norm": 0.12899446487426758, + "learning_rate": 9.613802316578873e-05, + "loss": 1.5616287231445312, + "step": 77050 + }, + { + "epoch": 0.233257154964963, + "grad_norm": 0.11487337201833725, + "learning_rate": 9.613422798415132e-05, + "loss": 1.5682239532470703, + "step": 77060 + }, + { + "epoch": 0.23328742451530882, + "grad_norm": 0.11740978062152863, + "learning_rate": 9.613043280251394e-05, + "loss": 1.579474925994873, + "step": 77070 + }, + { + "epoch": 0.23331769406565467, + "grad_norm": 0.11267530918121338, + "learning_rate": 9.612663762087653e-05, + "loss": 1.5777055740356445, + "step": 77080 + }, + { + "epoch": 0.2333479636160005, + "grad_norm": 0.1234963908791542, + "learning_rate": 9.612284243923915e-05, + "loss": 1.5501605033874513, + "step": 77090 + }, + { + "epoch": 0.23337823316634632, + "grad_norm": 0.12505286931991577, + "learning_rate": 9.611904725760174e-05, + "loss": 1.5831377029418945, + "step": 77100 + }, + { + "epoch": 0.23340850271669214, + "grad_norm": 0.11795613914728165, + "learning_rate": 9.611525207596436e-05, + "loss": 1.5724030494689942, + "step": 77110 + }, + { + "epoch": 0.23343877226703796, + "grad_norm": 0.10704804956912994, + "learning_rate": 9.611145689432695e-05, + "loss": 1.586526870727539, + "step": 77120 + }, + { + "epoch": 0.23346904181738382, + "grad_norm": 0.13347123563289642, + "learning_rate": 9.610766171268957e-05, + "loss": 1.5656852722167969, + "step": 77130 + }, + { + "epoch": 0.23349931136772964, + "grad_norm": 0.12212245166301727, + "learning_rate": 9.610386653105218e-05, + "loss": 1.601259422302246, + "step": 77140 + }, + { + "epoch": 0.23352958091807546, + "grad_norm": 0.10563451051712036, + "learning_rate": 9.610007134941478e-05, + "loss": 1.5714941024780273, + "step": 77150 + }, + { + "epoch": 0.2335598504684213, + "grad_norm": 0.12056947499513626, + "learning_rate": 9.609627616777739e-05, + "loss": 1.5638646125793456, + "step": 77160 + }, + { + "epoch": 0.2335901200187671, + "grad_norm": 0.12063120305538177, + "learning_rate": 9.609248098614e-05, + "loss": 1.6062976837158203, + "step": 77170 + }, + { + "epoch": 0.23362038956911296, + "grad_norm": 0.11372439563274384, + "learning_rate": 9.608868580450262e-05, + "loss": 1.5619080543518067, + "step": 77180 + }, + { + "epoch": 0.2336506591194588, + "grad_norm": 0.13537387549877167, + "learning_rate": 9.608489062286521e-05, + "loss": 1.5581414222717285, + "step": 77190 + }, + { + "epoch": 0.2336809286698046, + "grad_norm": 0.1294410228729248, + "learning_rate": 9.608109544122783e-05, + "loss": 1.6101848602294921, + "step": 77200 + }, + { + "epoch": 0.23371119822015043, + "grad_norm": 0.12508055567741394, + "learning_rate": 9.607730025959042e-05, + "loss": 1.5601605415344237, + "step": 77210 + }, + { + "epoch": 0.23374146777049626, + "grad_norm": 0.1229480728507042, + "learning_rate": 9.607350507795304e-05, + "loss": 1.5742597579956055, + "step": 77220 + }, + { + "epoch": 0.2337717373208421, + "grad_norm": 0.10748277604579926, + "learning_rate": 9.606970989631563e-05, + "loss": 1.5623374938964845, + "step": 77230 + }, + { + "epoch": 0.23380200687118793, + "grad_norm": 0.12168453633785248, + "learning_rate": 9.606591471467825e-05, + "loss": 1.5767980575561524, + "step": 77240 + }, + { + "epoch": 0.23383227642153376, + "grad_norm": 0.11177928745746613, + "learning_rate": 9.606211953304086e-05, + "loss": 1.5957452774047851, + "step": 77250 + }, + { + "epoch": 0.23386254597187958, + "grad_norm": 0.1122114285826683, + "learning_rate": 9.605832435140346e-05, + "loss": 1.5643844604492188, + "step": 77260 + }, + { + "epoch": 0.2338928155222254, + "grad_norm": 0.1235673800110817, + "learning_rate": 9.605452916976607e-05, + "loss": 1.576267910003662, + "step": 77270 + }, + { + "epoch": 0.23392308507257126, + "grad_norm": 0.11568786203861237, + "learning_rate": 9.605073398812868e-05, + "loss": 1.570576000213623, + "step": 77280 + }, + { + "epoch": 0.23395335462291708, + "grad_norm": 0.1110411137342453, + "learning_rate": 9.604693880649128e-05, + "loss": 1.5731693267822267, + "step": 77290 + }, + { + "epoch": 0.2339836241732629, + "grad_norm": 0.11271978914737701, + "learning_rate": 9.604314362485389e-05, + "loss": 1.558930015563965, + "step": 77300 + }, + { + "epoch": 0.23401389372360873, + "grad_norm": 0.1211569532752037, + "learning_rate": 9.60393484432165e-05, + "loss": 1.5627471923828125, + "step": 77310 + }, + { + "epoch": 0.23404416327395455, + "grad_norm": 0.11767219752073288, + "learning_rate": 9.60355532615791e-05, + "loss": 1.574735450744629, + "step": 77320 + }, + { + "epoch": 0.2340744328243004, + "grad_norm": 0.11974775791168213, + "learning_rate": 9.60317580799417e-05, + "loss": 1.5787981033325196, + "step": 77330 + }, + { + "epoch": 0.23410470237464623, + "grad_norm": 0.10833388566970825, + "learning_rate": 9.602796289830431e-05, + "loss": 1.5831286430358886, + "step": 77340 + }, + { + "epoch": 0.23413497192499205, + "grad_norm": 0.1258988082408905, + "learning_rate": 9.602416771666692e-05, + "loss": 1.5914200782775878, + "step": 77350 + }, + { + "epoch": 0.23416524147533788, + "grad_norm": 0.11502459645271301, + "learning_rate": 9.602037253502952e-05, + "loss": 1.553419303894043, + "step": 77360 + }, + { + "epoch": 0.2341955110256837, + "grad_norm": 0.11792388558387756, + "learning_rate": 9.601657735339213e-05, + "loss": 1.601409912109375, + "step": 77370 + }, + { + "epoch": 0.23422578057602955, + "grad_norm": 0.12036187201738358, + "learning_rate": 9.601278217175475e-05, + "loss": 1.5399730682373047, + "step": 77380 + }, + { + "epoch": 0.23425605012637538, + "grad_norm": 0.11195837706327438, + "learning_rate": 9.600898699011735e-05, + "loss": 1.5835704803466797, + "step": 77390 + }, + { + "epoch": 0.2342863196767212, + "grad_norm": 0.11091061681509018, + "learning_rate": 9.600519180847996e-05, + "loss": 1.5528093338012696, + "step": 77400 + }, + { + "epoch": 0.23431658922706702, + "grad_norm": 0.1253374218940735, + "learning_rate": 9.600139662684257e-05, + "loss": 1.571143627166748, + "step": 77410 + }, + { + "epoch": 0.23434685877741288, + "grad_norm": 0.10968419909477234, + "learning_rate": 9.599760144520517e-05, + "loss": 1.6009490966796875, + "step": 77420 + }, + { + "epoch": 0.2343771283277587, + "grad_norm": 0.11772993206977844, + "learning_rate": 9.599380626356778e-05, + "loss": 1.5804157257080078, + "step": 77430 + }, + { + "epoch": 0.23440739787810452, + "grad_norm": 0.11173591762781143, + "learning_rate": 9.599001108193038e-05, + "loss": 1.5695375442504882, + "step": 77440 + }, + { + "epoch": 0.23443766742845035, + "grad_norm": 0.11545593291521072, + "learning_rate": 9.598621590029299e-05, + "loss": 1.60766544342041, + "step": 77450 + }, + { + "epoch": 0.23446793697879617, + "grad_norm": 0.13377298414707184, + "learning_rate": 9.59824207186556e-05, + "loss": 1.5546418190002442, + "step": 77460 + }, + { + "epoch": 0.23449820652914202, + "grad_norm": 0.11700522154569626, + "learning_rate": 9.59786255370182e-05, + "loss": 1.5728034973144531, + "step": 77470 + }, + { + "epoch": 0.23452847607948785, + "grad_norm": 0.12045948207378387, + "learning_rate": 9.597483035538081e-05, + "loss": 1.5772054672241211, + "step": 77480 + }, + { + "epoch": 0.23455874562983367, + "grad_norm": 0.11495320498943329, + "learning_rate": 9.597103517374341e-05, + "loss": 1.5750049591064452, + "step": 77490 + }, + { + "epoch": 0.2345890151801795, + "grad_norm": 0.12095622718334198, + "learning_rate": 9.596723999210602e-05, + "loss": 1.590508270263672, + "step": 77500 + }, + { + "epoch": 0.2345890151801795, + "eval_loss": 1.5732940435409546, + "eval_runtime": 27.9198, + "eval_samples_per_second": 17.908, + "eval_steps_per_second": 1.146, + "step": 77500 + }, + { + "epoch": 0.23461928473052532, + "grad_norm": 0.12296765297651291, + "learning_rate": 9.596344481046864e-05, + "loss": 1.5741342544555663, + "step": 77510 + }, + { + "epoch": 0.23464955428087117, + "grad_norm": 0.12218397855758667, + "learning_rate": 9.595964962883123e-05, + "loss": 1.5770303726196289, + "step": 77520 + }, + { + "epoch": 0.234679823831217, + "grad_norm": 0.11472438275814056, + "learning_rate": 9.595585444719385e-05, + "loss": 1.5792427062988281, + "step": 77530 + }, + { + "epoch": 0.23471009338156282, + "grad_norm": 0.11486206203699112, + "learning_rate": 9.595205926555644e-05, + "loss": 1.6108030319213866, + "step": 77540 + }, + { + "epoch": 0.23474036293190864, + "grad_norm": 0.12108262628316879, + "learning_rate": 9.594826408391906e-05, + "loss": 1.5733623504638672, + "step": 77550 + }, + { + "epoch": 0.23477063248225447, + "grad_norm": 0.11541721969842911, + "learning_rate": 9.594446890228166e-05, + "loss": 1.5737030029296875, + "step": 77560 + }, + { + "epoch": 0.23480090203260032, + "grad_norm": 0.12184541672468185, + "learning_rate": 9.594067372064428e-05, + "loss": 1.6053665161132813, + "step": 77570 + }, + { + "epoch": 0.23483117158294614, + "grad_norm": 0.12959395349025726, + "learning_rate": 9.593687853900687e-05, + "loss": 1.564961814880371, + "step": 77580 + }, + { + "epoch": 0.23486144113329196, + "grad_norm": 0.11100392043590546, + "learning_rate": 9.593308335736949e-05, + "loss": 1.5667308807373046, + "step": 77590 + }, + { + "epoch": 0.2348917106836378, + "grad_norm": 0.1288643479347229, + "learning_rate": 9.592928817573209e-05, + "loss": 1.5787322998046875, + "step": 77600 + }, + { + "epoch": 0.2349219802339836, + "grad_norm": 0.13095298409461975, + "learning_rate": 9.59254929940947e-05, + "loss": 1.5411853790283203, + "step": 77610 + }, + { + "epoch": 0.23495224978432946, + "grad_norm": 0.1299731582403183, + "learning_rate": 9.592169781245732e-05, + "loss": 1.58314208984375, + "step": 77620 + }, + { + "epoch": 0.2349825193346753, + "grad_norm": 0.13763628900051117, + "learning_rate": 9.591790263081991e-05, + "loss": 1.567281723022461, + "step": 77630 + }, + { + "epoch": 0.2350127888850211, + "grad_norm": 0.1230602115392685, + "learning_rate": 9.591410744918253e-05, + "loss": 1.585404396057129, + "step": 77640 + }, + { + "epoch": 0.23504305843536694, + "grad_norm": 0.12117121368646622, + "learning_rate": 9.591031226754512e-05, + "loss": 1.5399591445922851, + "step": 77650 + }, + { + "epoch": 0.23507332798571276, + "grad_norm": 0.12221431732177734, + "learning_rate": 9.590651708590774e-05, + "loss": 1.5525460243225098, + "step": 77660 + }, + { + "epoch": 0.2351035975360586, + "grad_norm": 0.10939148813486099, + "learning_rate": 9.590272190427033e-05, + "loss": 1.6145212173461914, + "step": 77670 + }, + { + "epoch": 0.23513386708640444, + "grad_norm": 0.10827586054801941, + "learning_rate": 9.589892672263295e-05, + "loss": 1.5776481628417969, + "step": 77680 + }, + { + "epoch": 0.23516413663675026, + "grad_norm": 0.11702575534582138, + "learning_rate": 9.589513154099555e-05, + "loss": 1.606978988647461, + "step": 77690 + }, + { + "epoch": 0.23519440618709608, + "grad_norm": 0.1163405105471611, + "learning_rate": 9.589133635935817e-05, + "loss": 1.5533703804016112, + "step": 77700 + }, + { + "epoch": 0.2352246757374419, + "grad_norm": 0.11694569140672684, + "learning_rate": 9.588754117772076e-05, + "loss": 1.6096935272216797, + "step": 77710 + }, + { + "epoch": 0.23525494528778776, + "grad_norm": 0.12808449566364288, + "learning_rate": 9.588374599608338e-05, + "loss": 1.5335132598876953, + "step": 77720 + }, + { + "epoch": 0.23528521483813358, + "grad_norm": 0.11793965846300125, + "learning_rate": 9.587995081444597e-05, + "loss": 1.6246383666992188, + "step": 77730 + }, + { + "epoch": 0.2353154843884794, + "grad_norm": 0.10796216130256653, + "learning_rate": 9.587615563280859e-05, + "loss": 1.5897663116455079, + "step": 77740 + }, + { + "epoch": 0.23534575393882523, + "grad_norm": 0.12414771318435669, + "learning_rate": 9.58723604511712e-05, + "loss": 1.606919288635254, + "step": 77750 + }, + { + "epoch": 0.23537602348917105, + "grad_norm": 0.11244763433933258, + "learning_rate": 9.58685652695338e-05, + "loss": 1.5914359092712402, + "step": 77760 + }, + { + "epoch": 0.2354062930395169, + "grad_norm": 0.12699934840202332, + "learning_rate": 9.586477008789641e-05, + "loss": 1.5713676452636718, + "step": 77770 + }, + { + "epoch": 0.23543656258986273, + "grad_norm": 0.12688970565795898, + "learning_rate": 9.586097490625901e-05, + "loss": 1.584256935119629, + "step": 77780 + }, + { + "epoch": 0.23546683214020855, + "grad_norm": 0.11116316169500351, + "learning_rate": 9.585717972462163e-05, + "loss": 1.5513830184936523, + "step": 77790 + }, + { + "epoch": 0.23549710169055438, + "grad_norm": 0.12173007428646088, + "learning_rate": 9.585338454298423e-05, + "loss": 1.591415023803711, + "step": 77800 + }, + { + "epoch": 0.23552737124090023, + "grad_norm": 0.11672127991914749, + "learning_rate": 9.584958936134685e-05, + "loss": 1.5410083770751952, + "step": 77810 + }, + { + "epoch": 0.23555764079124605, + "grad_norm": 0.1278429627418518, + "learning_rate": 9.584579417970944e-05, + "loss": 1.538347339630127, + "step": 77820 + }, + { + "epoch": 0.23558791034159188, + "grad_norm": 0.11517953127622604, + "learning_rate": 9.584199899807206e-05, + "loss": 1.5741514205932616, + "step": 77830 + }, + { + "epoch": 0.2356181798919377, + "grad_norm": 0.11349247395992279, + "learning_rate": 9.583820381643465e-05, + "loss": 1.5766467094421386, + "step": 77840 + }, + { + "epoch": 0.23564844944228353, + "grad_norm": 0.12045425176620483, + "learning_rate": 9.583440863479727e-05, + "loss": 1.5689804077148437, + "step": 77850 + }, + { + "epoch": 0.23567871899262938, + "grad_norm": 0.12769117951393127, + "learning_rate": 9.583061345315986e-05, + "loss": 1.5701894760131836, + "step": 77860 + }, + { + "epoch": 0.2357089885429752, + "grad_norm": 0.1270710527896881, + "learning_rate": 9.582681827152248e-05, + "loss": 1.5477724075317383, + "step": 77870 + }, + { + "epoch": 0.23573925809332102, + "grad_norm": 0.11868435889482498, + "learning_rate": 9.582302308988509e-05, + "loss": 1.5787298202514648, + "step": 77880 + }, + { + "epoch": 0.23576952764366685, + "grad_norm": 0.11354061961174011, + "learning_rate": 9.581922790824769e-05, + "loss": 1.5922443389892578, + "step": 77890 + }, + { + "epoch": 0.23579979719401267, + "grad_norm": 0.1177283450961113, + "learning_rate": 9.58154327266103e-05, + "loss": 1.5437214851379395, + "step": 77900 + }, + { + "epoch": 0.23583006674435852, + "grad_norm": 0.11390390992164612, + "learning_rate": 9.58116375449729e-05, + "loss": 1.5926784515380858, + "step": 77910 + }, + { + "epoch": 0.23586033629470435, + "grad_norm": 0.11902832239866257, + "learning_rate": 9.580784236333551e-05, + "loss": 1.5744964599609375, + "step": 77920 + }, + { + "epoch": 0.23589060584505017, + "grad_norm": 0.11268069595098495, + "learning_rate": 9.580404718169812e-05, + "loss": 1.5908281326293945, + "step": 77930 + }, + { + "epoch": 0.235920875395396, + "grad_norm": 0.12227508425712585, + "learning_rate": 9.580025200006072e-05, + "loss": 1.544339942932129, + "step": 77940 + }, + { + "epoch": 0.23595114494574182, + "grad_norm": 0.15223102271556854, + "learning_rate": 9.579645681842333e-05, + "loss": 1.5662938117980958, + "step": 77950 + }, + { + "epoch": 0.23598141449608767, + "grad_norm": 0.12405610084533691, + "learning_rate": 9.579266163678593e-05, + "loss": 1.5919944763183593, + "step": 77960 + }, + { + "epoch": 0.2360116840464335, + "grad_norm": 0.12299440801143646, + "learning_rate": 9.578886645514854e-05, + "loss": 1.5916236877441405, + "step": 77970 + }, + { + "epoch": 0.23604195359677932, + "grad_norm": 0.12061800807714462, + "learning_rate": 9.578507127351115e-05, + "loss": 1.6162174224853516, + "step": 77980 + }, + { + "epoch": 0.23607222314712514, + "grad_norm": 0.11053833365440369, + "learning_rate": 9.578127609187377e-05, + "loss": 1.5676916122436524, + "step": 77990 + }, + { + "epoch": 0.23610249269747097, + "grad_norm": 0.11315268278121948, + "learning_rate": 9.577748091023637e-05, + "loss": 1.591860294342041, + "step": 78000 + }, + { + "epoch": 0.23610249269747097, + "eval_loss": 1.576899528503418, + "eval_runtime": 28.1795, + "eval_samples_per_second": 17.743, + "eval_steps_per_second": 1.136, + "step": 78000 + }, + { + "epoch": 0.23613276224781682, + "grad_norm": 0.10654843598604202, + "learning_rate": 9.577368572859898e-05, + "loss": 1.5484416961669922, + "step": 78010 + }, + { + "epoch": 0.23616303179816264, + "grad_norm": 0.13700668513774872, + "learning_rate": 9.576989054696158e-05, + "loss": 1.611137580871582, + "step": 78020 + }, + { + "epoch": 0.23619330134850847, + "grad_norm": 0.10869066417217255, + "learning_rate": 9.576609536532419e-05, + "loss": 1.574571704864502, + "step": 78030 + }, + { + "epoch": 0.2362235708988543, + "grad_norm": 0.12489178776741028, + "learning_rate": 9.57623001836868e-05, + "loss": 1.5685785293579102, + "step": 78040 + }, + { + "epoch": 0.23625384044920011, + "grad_norm": 0.112385094165802, + "learning_rate": 9.57585050020494e-05, + "loss": 1.5817359924316405, + "step": 78050 + }, + { + "epoch": 0.23628410999954597, + "grad_norm": 0.10475874692201614, + "learning_rate": 9.575470982041201e-05, + "loss": 1.5929293632507324, + "step": 78060 + }, + { + "epoch": 0.2363143795498918, + "grad_norm": 0.11654163151979446, + "learning_rate": 9.575091463877461e-05, + "loss": 1.5449591636657716, + "step": 78070 + }, + { + "epoch": 0.2363446491002376, + "grad_norm": 0.11552827060222626, + "learning_rate": 9.574711945713722e-05, + "loss": 1.604707908630371, + "step": 78080 + }, + { + "epoch": 0.23637491865058344, + "grad_norm": 0.11358727514743805, + "learning_rate": 9.574332427549983e-05, + "loss": 1.5652105331420898, + "step": 78090 + }, + { + "epoch": 0.23640518820092926, + "grad_norm": 0.1119304671883583, + "learning_rate": 9.573952909386243e-05, + "loss": 1.5819101333618164, + "step": 78100 + }, + { + "epoch": 0.2364354577512751, + "grad_norm": 0.13158738613128662, + "learning_rate": 9.573573391222504e-05, + "loss": 1.5974077224731444, + "step": 78110 + }, + { + "epoch": 0.23646572730162094, + "grad_norm": 0.1146809384226799, + "learning_rate": 9.573193873058766e-05, + "loss": 1.5608792304992676, + "step": 78120 + }, + { + "epoch": 0.23649599685196676, + "grad_norm": 0.1286223977804184, + "learning_rate": 9.572814354895025e-05, + "loss": 1.613058090209961, + "step": 78130 + }, + { + "epoch": 0.23652626640231258, + "grad_norm": 0.1225406602025032, + "learning_rate": 9.572434836731287e-05, + "loss": 1.6139091491699218, + "step": 78140 + }, + { + "epoch": 0.23655653595265844, + "grad_norm": 0.15609364211559296, + "learning_rate": 9.572055318567546e-05, + "loss": 1.5680597305297852, + "step": 78150 + }, + { + "epoch": 0.23658680550300426, + "grad_norm": 0.13118834793567657, + "learning_rate": 9.571675800403808e-05, + "loss": 1.5800031661987304, + "step": 78160 + }, + { + "epoch": 0.23661707505335008, + "grad_norm": 0.13734781742095947, + "learning_rate": 9.571296282240067e-05, + "loss": 1.535720443725586, + "step": 78170 + }, + { + "epoch": 0.2366473446036959, + "grad_norm": 0.11793477088212967, + "learning_rate": 9.570916764076329e-05, + "loss": 1.5820141792297364, + "step": 78180 + }, + { + "epoch": 0.23667761415404173, + "grad_norm": 0.1217808797955513, + "learning_rate": 9.570537245912588e-05, + "loss": 1.5500639915466308, + "step": 78190 + }, + { + "epoch": 0.23670788370438758, + "grad_norm": 0.12240548431873322, + "learning_rate": 9.57015772774885e-05, + "loss": 1.5936712265014648, + "step": 78200 + }, + { + "epoch": 0.2367381532547334, + "grad_norm": 0.10989509522914886, + "learning_rate": 9.569778209585111e-05, + "loss": 1.5306758880615234, + "step": 78210 + }, + { + "epoch": 0.23676842280507923, + "grad_norm": 0.10878944396972656, + "learning_rate": 9.569398691421372e-05, + "loss": 1.5653995513916015, + "step": 78220 + }, + { + "epoch": 0.23679869235542506, + "grad_norm": 0.12517723441123962, + "learning_rate": 9.569019173257632e-05, + "loss": 1.5450182914733888, + "step": 78230 + }, + { + "epoch": 0.23682896190577088, + "grad_norm": 0.11474476009607315, + "learning_rate": 9.568639655093893e-05, + "loss": 1.6149616241455078, + "step": 78240 + }, + { + "epoch": 0.23685923145611673, + "grad_norm": 0.13399772346019745, + "learning_rate": 9.568260136930155e-05, + "loss": 1.5783980369567872, + "step": 78250 + }, + { + "epoch": 0.23688950100646256, + "grad_norm": 0.11989491432905197, + "learning_rate": 9.567880618766414e-05, + "loss": 1.5696123123168946, + "step": 78260 + }, + { + "epoch": 0.23691977055680838, + "grad_norm": 0.13194289803504944, + "learning_rate": 9.567501100602676e-05, + "loss": 1.550387191772461, + "step": 78270 + }, + { + "epoch": 0.2369500401071542, + "grad_norm": 0.12373937666416168, + "learning_rate": 9.567121582438935e-05, + "loss": 1.6066022872924806, + "step": 78280 + }, + { + "epoch": 0.23698030965750003, + "grad_norm": 0.12352997809648514, + "learning_rate": 9.566742064275197e-05, + "loss": 1.5988855361938477, + "step": 78290 + }, + { + "epoch": 0.23701057920784588, + "grad_norm": 0.11697845906019211, + "learning_rate": 9.566362546111456e-05, + "loss": 1.6004940032958985, + "step": 78300 + }, + { + "epoch": 0.2370408487581917, + "grad_norm": 0.11486383527517319, + "learning_rate": 9.565983027947718e-05, + "loss": 1.5515721321105957, + "step": 78310 + }, + { + "epoch": 0.23707111830853753, + "grad_norm": 0.1210886761546135, + "learning_rate": 9.565603509783978e-05, + "loss": 1.5961658477783203, + "step": 78320 + }, + { + "epoch": 0.23710138785888335, + "grad_norm": 0.12143119424581528, + "learning_rate": 9.56522399162024e-05, + "loss": 1.571647548675537, + "step": 78330 + }, + { + "epoch": 0.23713165740922917, + "grad_norm": 0.13466854393482208, + "learning_rate": 9.564844473456499e-05, + "loss": 1.57489070892334, + "step": 78340 + }, + { + "epoch": 0.23716192695957503, + "grad_norm": 0.1110013946890831, + "learning_rate": 9.564464955292761e-05, + "loss": 1.564451313018799, + "step": 78350 + }, + { + "epoch": 0.23719219650992085, + "grad_norm": 0.1115555688738823, + "learning_rate": 9.564085437129021e-05, + "loss": 1.5890133857727051, + "step": 78360 + }, + { + "epoch": 0.23722246606026667, + "grad_norm": 0.12496370822191238, + "learning_rate": 9.563705918965282e-05, + "loss": 1.550673770904541, + "step": 78370 + }, + { + "epoch": 0.2372527356106125, + "grad_norm": 0.12248049676418304, + "learning_rate": 9.563326400801542e-05, + "loss": 1.5570024490356444, + "step": 78380 + }, + { + "epoch": 0.23728300516095832, + "grad_norm": 0.13660593330860138, + "learning_rate": 9.562946882637803e-05, + "loss": 1.5684085845947267, + "step": 78390 + }, + { + "epoch": 0.23731327471130417, + "grad_norm": 0.12191981822252274, + "learning_rate": 9.562567364474065e-05, + "loss": 1.5670705795288087, + "step": 78400 + }, + { + "epoch": 0.23734354426165, + "grad_norm": 0.1209687739610672, + "learning_rate": 9.562187846310324e-05, + "loss": 1.5971229553222657, + "step": 78410 + }, + { + "epoch": 0.23737381381199582, + "grad_norm": 0.13204286992549896, + "learning_rate": 9.561808328146586e-05, + "loss": 1.5905738830566407, + "step": 78420 + }, + { + "epoch": 0.23740408336234164, + "grad_norm": 0.1452386975288391, + "learning_rate": 9.561428809982845e-05, + "loss": 1.5147397994995118, + "step": 78430 + }, + { + "epoch": 0.23743435291268747, + "grad_norm": 0.1285473108291626, + "learning_rate": 9.561049291819107e-05, + "loss": 1.5666046142578125, + "step": 78440 + }, + { + "epoch": 0.23746462246303332, + "grad_norm": 0.12319166213274002, + "learning_rate": 9.560669773655367e-05, + "loss": 1.575251579284668, + "step": 78450 + }, + { + "epoch": 0.23749489201337914, + "grad_norm": 0.10471302270889282, + "learning_rate": 9.560290255491629e-05, + "loss": 1.557682418823242, + "step": 78460 + }, + { + "epoch": 0.23752516156372497, + "grad_norm": 0.1025175005197525, + "learning_rate": 9.559910737327888e-05, + "loss": 1.5890055656433106, + "step": 78470 + }, + { + "epoch": 0.2375554311140708, + "grad_norm": 0.12126698344945908, + "learning_rate": 9.55953121916415e-05, + "loss": 1.5897972106933593, + "step": 78480 + }, + { + "epoch": 0.23758570066441664, + "grad_norm": 0.11008977890014648, + "learning_rate": 9.55915170100041e-05, + "loss": 1.5779632568359374, + "step": 78490 + }, + { + "epoch": 0.23761597021476247, + "grad_norm": 0.12705831229686737, + "learning_rate": 9.558772182836671e-05, + "loss": 1.5824440002441407, + "step": 78500 + }, + { + "epoch": 0.23761597021476247, + "eval_loss": 1.572576642036438, + "eval_runtime": 28.1232, + "eval_samples_per_second": 17.779, + "eval_steps_per_second": 1.138, + "step": 78500 + }, + { + "epoch": 0.2376462397651083, + "grad_norm": 0.1368810534477234, + "learning_rate": 9.558392664672932e-05, + "loss": 1.598008632659912, + "step": 78510 + }, + { + "epoch": 0.23767650931545412, + "grad_norm": 0.11716919392347336, + "learning_rate": 9.558013146509192e-05, + "loss": 1.5474202156066894, + "step": 78520 + }, + { + "epoch": 0.23770677886579994, + "grad_norm": 0.13089202344417572, + "learning_rate": 9.557633628345453e-05, + "loss": 1.603256607055664, + "step": 78530 + }, + { + "epoch": 0.2377370484161458, + "grad_norm": 0.13371551036834717, + "learning_rate": 9.557254110181713e-05, + "loss": 1.5956323623657227, + "step": 78540 + }, + { + "epoch": 0.23776731796649161, + "grad_norm": 0.1262315809726715, + "learning_rate": 9.556874592017974e-05, + "loss": 1.5657161712646483, + "step": 78550 + }, + { + "epoch": 0.23779758751683744, + "grad_norm": 0.11081387847661972, + "learning_rate": 9.556495073854235e-05, + "loss": 1.5600892066955567, + "step": 78560 + }, + { + "epoch": 0.23782785706718326, + "grad_norm": 0.12904496490955353, + "learning_rate": 9.556115555690495e-05, + "loss": 1.5787872314453124, + "step": 78570 + }, + { + "epoch": 0.2378581266175291, + "grad_norm": 0.12440882623195648, + "learning_rate": 9.555736037526756e-05, + "loss": 1.576686191558838, + "step": 78580 + }, + { + "epoch": 0.23788839616787494, + "grad_norm": 0.1158323809504509, + "learning_rate": 9.555356519363016e-05, + "loss": 1.6009250640869142, + "step": 78590 + }, + { + "epoch": 0.23791866571822076, + "grad_norm": 0.13093456625938416, + "learning_rate": 9.554977001199277e-05, + "loss": 1.5467446327209473, + "step": 78600 + }, + { + "epoch": 0.23794893526856659, + "grad_norm": 0.11412888020277023, + "learning_rate": 9.554597483035539e-05, + "loss": 1.5822109222412108, + "step": 78610 + }, + { + "epoch": 0.2379792048189124, + "grad_norm": 0.09989147633314133, + "learning_rate": 9.5542179648718e-05, + "loss": 1.622032928466797, + "step": 78620 + }, + { + "epoch": 0.23800947436925823, + "grad_norm": 0.12587042152881622, + "learning_rate": 9.55383844670806e-05, + "loss": 1.5718160629272462, + "step": 78630 + }, + { + "epoch": 0.23803974391960409, + "grad_norm": 0.1296146959066391, + "learning_rate": 9.55345892854432e-05, + "loss": 1.5535943984985352, + "step": 78640 + }, + { + "epoch": 0.2380700134699499, + "grad_norm": 0.11052094399929047, + "learning_rate": 9.553079410380581e-05, + "loss": 1.5554500579833985, + "step": 78650 + }, + { + "epoch": 0.23810028302029573, + "grad_norm": 0.10341693460941315, + "learning_rate": 9.552699892216842e-05, + "loss": 1.5839155197143555, + "step": 78660 + }, + { + "epoch": 0.23813055257064156, + "grad_norm": 0.11205566674470901, + "learning_rate": 9.552320374053102e-05, + "loss": 1.5520241737365723, + "step": 78670 + }, + { + "epoch": 0.23816082212098738, + "grad_norm": 0.13562935590744019, + "learning_rate": 9.551940855889363e-05, + "loss": 1.5923816680908203, + "step": 78680 + }, + { + "epoch": 0.23819109167133323, + "grad_norm": 0.1246960386633873, + "learning_rate": 9.551561337725624e-05, + "loss": 1.5774433135986328, + "step": 78690 + }, + { + "epoch": 0.23822136122167906, + "grad_norm": 0.12034789472818375, + "learning_rate": 9.551181819561884e-05, + "loss": 1.5865530967712402, + "step": 78700 + }, + { + "epoch": 0.23825163077202488, + "grad_norm": 0.10749820619821548, + "learning_rate": 9.550802301398145e-05, + "loss": 1.5831074714660645, + "step": 78710 + }, + { + "epoch": 0.2382819003223707, + "grad_norm": 0.12014526128768921, + "learning_rate": 9.550422783234405e-05, + "loss": 1.5897459030151366, + "step": 78720 + }, + { + "epoch": 0.23831216987271653, + "grad_norm": 0.11118009686470032, + "learning_rate": 9.550043265070667e-05, + "loss": 1.569768524169922, + "step": 78730 + }, + { + "epoch": 0.23834243942306238, + "grad_norm": 0.10450412333011627, + "learning_rate": 9.549663746906927e-05, + "loss": 1.5780069351196289, + "step": 78740 + }, + { + "epoch": 0.2383727089734082, + "grad_norm": 0.11539561301469803, + "learning_rate": 9.549284228743189e-05, + "loss": 1.565562629699707, + "step": 78750 + }, + { + "epoch": 0.23840297852375403, + "grad_norm": 0.12882377207279205, + "learning_rate": 9.548904710579448e-05, + "loss": 1.6075374603271484, + "step": 78760 + }, + { + "epoch": 0.23843324807409985, + "grad_norm": 0.1262766718864441, + "learning_rate": 9.54852519241571e-05, + "loss": 1.6168428421020509, + "step": 78770 + }, + { + "epoch": 0.23846351762444568, + "grad_norm": 0.11188483983278275, + "learning_rate": 9.548145674251969e-05, + "loss": 1.561549186706543, + "step": 78780 + }, + { + "epoch": 0.23849378717479153, + "grad_norm": 0.11672372370958328, + "learning_rate": 9.547766156088231e-05, + "loss": 1.594670867919922, + "step": 78790 + }, + { + "epoch": 0.23852405672513735, + "grad_norm": 0.1173982098698616, + "learning_rate": 9.54738663792449e-05, + "loss": 1.579173469543457, + "step": 78800 + }, + { + "epoch": 0.23855432627548318, + "grad_norm": 0.11033705621957779, + "learning_rate": 9.547007119760752e-05, + "loss": 1.5684199333190918, + "step": 78810 + }, + { + "epoch": 0.238584595825829, + "grad_norm": 0.1368497610092163, + "learning_rate": 9.546627601597013e-05, + "loss": 1.5704359054565429, + "step": 78820 + }, + { + "epoch": 0.23861486537617485, + "grad_norm": 0.12573999166488647, + "learning_rate": 9.546248083433273e-05, + "loss": 1.5935800552368165, + "step": 78830 + }, + { + "epoch": 0.23864513492652067, + "grad_norm": 0.11391035467386246, + "learning_rate": 9.545868565269534e-05, + "loss": 1.6050439834594727, + "step": 78840 + }, + { + "epoch": 0.2386754044768665, + "grad_norm": 0.11714745312929153, + "learning_rate": 9.545489047105795e-05, + "loss": 1.5584600448608399, + "step": 78850 + }, + { + "epoch": 0.23870567402721232, + "grad_norm": 0.130111962556839, + "learning_rate": 9.545109528942056e-05, + "loss": 1.594172477722168, + "step": 78860 + }, + { + "epoch": 0.23873594357755815, + "grad_norm": 0.11133508384227753, + "learning_rate": 9.544730010778316e-05, + "loss": 1.5708202362060546, + "step": 78870 + }, + { + "epoch": 0.238766213127904, + "grad_norm": 0.11004241555929184, + "learning_rate": 9.544350492614578e-05, + "loss": 1.5690217971801759, + "step": 78880 + }, + { + "epoch": 0.23879648267824982, + "grad_norm": 0.12705223262310028, + "learning_rate": 9.543970974450837e-05, + "loss": 1.601801300048828, + "step": 78890 + }, + { + "epoch": 0.23882675222859565, + "grad_norm": 0.11266763508319855, + "learning_rate": 9.543591456287099e-05, + "loss": 1.5623792648315429, + "step": 78900 + }, + { + "epoch": 0.23885702177894147, + "grad_norm": 0.10715968161821365, + "learning_rate": 9.543211938123358e-05, + "loss": 1.6304519653320313, + "step": 78910 + }, + { + "epoch": 0.2388872913292873, + "grad_norm": 0.13871000707149506, + "learning_rate": 9.54283241995962e-05, + "loss": 1.543803596496582, + "step": 78920 + }, + { + "epoch": 0.23891756087963315, + "grad_norm": 0.11013418436050415, + "learning_rate": 9.542452901795879e-05, + "loss": 1.5527979850769043, + "step": 78930 + }, + { + "epoch": 0.23894783042997897, + "grad_norm": 0.13465893268585205, + "learning_rate": 9.542073383632141e-05, + "loss": 1.5367595672607421, + "step": 78940 + }, + { + "epoch": 0.2389780999803248, + "grad_norm": 0.13445594906806946, + "learning_rate": 9.5416938654684e-05, + "loss": 1.5563015937805176, + "step": 78950 + }, + { + "epoch": 0.23900836953067062, + "grad_norm": 0.12034839391708374, + "learning_rate": 9.541314347304662e-05, + "loss": 1.5718503952026368, + "step": 78960 + }, + { + "epoch": 0.23903863908101644, + "grad_norm": 0.12707744538784027, + "learning_rate": 9.540934829140922e-05, + "loss": 1.5711669921875, + "step": 78970 + }, + { + "epoch": 0.2390689086313623, + "grad_norm": 0.11698254942893982, + "learning_rate": 9.540555310977184e-05, + "loss": 1.6033348083496093, + "step": 78980 + }, + { + "epoch": 0.23909917818170812, + "grad_norm": 0.11491412669420242, + "learning_rate": 9.540175792813444e-05, + "loss": 1.574801254272461, + "step": 78990 + }, + { + "epoch": 0.23912944773205394, + "grad_norm": 0.11582347005605698, + "learning_rate": 9.539796274649705e-05, + "loss": 1.5798349380493164, + "step": 79000 + }, + { + "epoch": 0.23912944773205394, + "eval_loss": 1.5624853372573853, + "eval_runtime": 28.0779, + "eval_samples_per_second": 17.808, + "eval_steps_per_second": 1.14, + "step": 79000 + }, + { + "epoch": 0.23915971728239976, + "grad_norm": 0.11011452227830887, + "learning_rate": 9.539416756485967e-05, + "loss": 1.5633821487426758, + "step": 79010 + }, + { + "epoch": 0.2391899868327456, + "grad_norm": 0.13855838775634766, + "learning_rate": 9.539037238322226e-05, + "loss": 1.5708768844604493, + "step": 79020 + }, + { + "epoch": 0.23922025638309144, + "grad_norm": 0.1266789436340332, + "learning_rate": 9.538657720158488e-05, + "loss": 1.5806150436401367, + "step": 79030 + }, + { + "epoch": 0.23925052593343726, + "grad_norm": 0.12139695882797241, + "learning_rate": 9.538278201994747e-05, + "loss": 1.5771455764770508, + "step": 79040 + }, + { + "epoch": 0.2392807954837831, + "grad_norm": 0.11457069218158722, + "learning_rate": 9.537898683831009e-05, + "loss": 1.590883731842041, + "step": 79050 + }, + { + "epoch": 0.2393110650341289, + "grad_norm": 0.11721065640449524, + "learning_rate": 9.537519165667268e-05, + "loss": 1.5467237472534179, + "step": 79060 + }, + { + "epoch": 0.23934133458447474, + "grad_norm": 0.11657813936471939, + "learning_rate": 9.53713964750353e-05, + "loss": 1.5518864631652831, + "step": 79070 + }, + { + "epoch": 0.2393716041348206, + "grad_norm": 0.12586486339569092, + "learning_rate": 9.53676012933979e-05, + "loss": 1.6067550659179688, + "step": 79080 + }, + { + "epoch": 0.2394018736851664, + "grad_norm": 0.12893126904964447, + "learning_rate": 9.536380611176052e-05, + "loss": 1.5749702453613281, + "step": 79090 + }, + { + "epoch": 0.23943214323551223, + "grad_norm": 0.12595321238040924, + "learning_rate": 9.536001093012312e-05, + "loss": 1.5719992637634277, + "step": 79100 + }, + { + "epoch": 0.23946241278585806, + "grad_norm": 0.11744904518127441, + "learning_rate": 9.535621574848573e-05, + "loss": 1.561309242248535, + "step": 79110 + }, + { + "epoch": 0.23949268233620388, + "grad_norm": 0.11822064965963364, + "learning_rate": 9.535242056684833e-05, + "loss": 1.592799186706543, + "step": 79120 + }, + { + "epoch": 0.23952295188654973, + "grad_norm": 0.12041425704956055, + "learning_rate": 9.534862538521094e-05, + "loss": 1.549039077758789, + "step": 79130 + }, + { + "epoch": 0.23955322143689556, + "grad_norm": 0.12389813363552094, + "learning_rate": 9.534483020357354e-05, + "loss": 1.5860145568847657, + "step": 79140 + }, + { + "epoch": 0.23958349098724138, + "grad_norm": 0.10907993465662003, + "learning_rate": 9.534103502193615e-05, + "loss": 1.5847439765930176, + "step": 79150 + }, + { + "epoch": 0.2396137605375872, + "grad_norm": 0.12613987922668457, + "learning_rate": 9.533723984029876e-05, + "loss": 1.564386749267578, + "step": 79160 + }, + { + "epoch": 0.23964403008793306, + "grad_norm": 0.11415159702301025, + "learning_rate": 9.533344465866136e-05, + "loss": 1.5708978652954102, + "step": 79170 + }, + { + "epoch": 0.23967429963827888, + "grad_norm": 0.11678450554609299, + "learning_rate": 9.532964947702397e-05, + "loss": 1.595317554473877, + "step": 79180 + }, + { + "epoch": 0.2397045691886247, + "grad_norm": 0.13691239058971405, + "learning_rate": 9.532585429538657e-05, + "loss": 1.5447989463806153, + "step": 79190 + }, + { + "epoch": 0.23973483873897053, + "grad_norm": 0.14205174148082733, + "learning_rate": 9.532205911374918e-05, + "loss": 1.5917091369628906, + "step": 79200 + }, + { + "epoch": 0.23976510828931635, + "grad_norm": 0.11402848362922668, + "learning_rate": 9.531826393211179e-05, + "loss": 1.5992308616638184, + "step": 79210 + }, + { + "epoch": 0.2397953778396622, + "grad_norm": 0.11766736954450607, + "learning_rate": 9.53144687504744e-05, + "loss": 1.5865293502807618, + "step": 79220 + }, + { + "epoch": 0.23982564739000803, + "grad_norm": 0.11667199432849884, + "learning_rate": 9.531067356883701e-05, + "loss": 1.5682680130004882, + "step": 79230 + }, + { + "epoch": 0.23985591694035385, + "grad_norm": 0.12742270529270172, + "learning_rate": 9.530687838719962e-05, + "loss": 1.5588560104370117, + "step": 79240 + }, + { + "epoch": 0.23988618649069968, + "grad_norm": 0.11629489809274673, + "learning_rate": 9.530308320556222e-05, + "loss": 1.600082778930664, + "step": 79250 + }, + { + "epoch": 0.2399164560410455, + "grad_norm": 0.12602728605270386, + "learning_rate": 9.529928802392483e-05, + "loss": 1.5750767707824707, + "step": 79260 + }, + { + "epoch": 0.23994672559139135, + "grad_norm": 0.10693638026714325, + "learning_rate": 9.529549284228744e-05, + "loss": 1.5974933624267578, + "step": 79270 + }, + { + "epoch": 0.23997699514173718, + "grad_norm": 0.12545238435268402, + "learning_rate": 9.529169766065004e-05, + "loss": 1.5797677993774415, + "step": 79280 + }, + { + "epoch": 0.240007264692083, + "grad_norm": 0.11571867018938065, + "learning_rate": 9.528790247901265e-05, + "loss": 1.5942320823669434, + "step": 79290 + }, + { + "epoch": 0.24003753424242882, + "grad_norm": 0.11468596756458282, + "learning_rate": 9.528410729737525e-05, + "loss": 1.5779560089111329, + "step": 79300 + }, + { + "epoch": 0.24006780379277465, + "grad_norm": 0.11091195046901703, + "learning_rate": 9.528031211573786e-05, + "loss": 1.611079788208008, + "step": 79310 + }, + { + "epoch": 0.2400980733431205, + "grad_norm": 0.11474325507879257, + "learning_rate": 9.527651693410047e-05, + "loss": 1.5513241767883301, + "step": 79320 + }, + { + "epoch": 0.24012834289346632, + "grad_norm": 0.09889322519302368, + "learning_rate": 9.527272175246307e-05, + "loss": 1.5630047798156739, + "step": 79330 + }, + { + "epoch": 0.24015861244381215, + "grad_norm": 0.1464618444442749, + "learning_rate": 9.526892657082568e-05, + "loss": 1.5965532302856444, + "step": 79340 + }, + { + "epoch": 0.24018888199415797, + "grad_norm": 0.13121046125888824, + "learning_rate": 9.526513138918828e-05, + "loss": 1.5774995803833007, + "step": 79350 + }, + { + "epoch": 0.2402191515445038, + "grad_norm": 0.11763063818216324, + "learning_rate": 9.52613362075509e-05, + "loss": 1.5493932723999024, + "step": 79360 + }, + { + "epoch": 0.24024942109484965, + "grad_norm": 0.10538198053836823, + "learning_rate": 9.52575410259135e-05, + "loss": 1.5868190765380858, + "step": 79370 + }, + { + "epoch": 0.24027969064519547, + "grad_norm": 0.11655829101800919, + "learning_rate": 9.525374584427611e-05, + "loss": 1.574612522125244, + "step": 79380 + }, + { + "epoch": 0.2403099601955413, + "grad_norm": 0.11446014791727066, + "learning_rate": 9.524995066263871e-05, + "loss": 1.560911273956299, + "step": 79390 + }, + { + "epoch": 0.24034022974588712, + "grad_norm": 0.12679201364517212, + "learning_rate": 9.524615548100133e-05, + "loss": 1.5313687324523926, + "step": 79400 + }, + { + "epoch": 0.24037049929623294, + "grad_norm": 0.11680425703525543, + "learning_rate": 9.524236029936392e-05, + "loss": 1.5547600746154786, + "step": 79410 + }, + { + "epoch": 0.2404007688465788, + "grad_norm": 0.10677006095647812, + "learning_rate": 9.523856511772654e-05, + "loss": 1.5712760925292968, + "step": 79420 + }, + { + "epoch": 0.24043103839692462, + "grad_norm": 0.12203372269868851, + "learning_rate": 9.523476993608914e-05, + "loss": 1.6078006744384765, + "step": 79430 + }, + { + "epoch": 0.24046130794727044, + "grad_norm": 0.11916756629943848, + "learning_rate": 9.523097475445175e-05, + "loss": 1.5873661994934083, + "step": 79440 + }, + { + "epoch": 0.24049157749761627, + "grad_norm": 0.11618990451097488, + "learning_rate": 9.522717957281436e-05, + "loss": 1.5871647834777831, + "step": 79450 + }, + { + "epoch": 0.2405218470479621, + "grad_norm": 0.1197088435292244, + "learning_rate": 9.522338439117696e-05, + "loss": 1.524481201171875, + "step": 79460 + }, + { + "epoch": 0.24055211659830794, + "grad_norm": 0.10962489992380142, + "learning_rate": 9.521958920953958e-05, + "loss": 1.6036056518554687, + "step": 79470 + }, + { + "epoch": 0.24058238614865377, + "grad_norm": 0.11234718561172485, + "learning_rate": 9.521579402790217e-05, + "loss": 1.5872072219848632, + "step": 79480 + }, + { + "epoch": 0.2406126556989996, + "grad_norm": 0.11994580924510956, + "learning_rate": 9.52119988462648e-05, + "loss": 1.6128606796264648, + "step": 79490 + }, + { + "epoch": 0.2406429252493454, + "grad_norm": 0.11784191429615021, + "learning_rate": 9.520820366462739e-05, + "loss": 1.5591779708862306, + "step": 79500 + }, + { + "epoch": 0.2406429252493454, + "eval_loss": 1.591293454170227, + "eval_runtime": 28.5613, + "eval_samples_per_second": 17.506, + "eval_steps_per_second": 1.12, + "step": 79500 + }, + { + "epoch": 0.24067319479969124, + "grad_norm": 0.12869271636009216, + "learning_rate": 9.520440848299e-05, + "loss": 1.5688678741455078, + "step": 79510 + }, + { + "epoch": 0.2407034643500371, + "grad_norm": 0.12054256349802017, + "learning_rate": 9.52006133013526e-05, + "loss": 1.5613737106323242, + "step": 79520 + }, + { + "epoch": 0.2407337339003829, + "grad_norm": 0.11138840019702911, + "learning_rate": 9.519681811971522e-05, + "loss": 1.5830819129943847, + "step": 79530 + }, + { + "epoch": 0.24076400345072874, + "grad_norm": 0.12490271031856537, + "learning_rate": 9.519302293807781e-05, + "loss": 1.602225112915039, + "step": 79540 + }, + { + "epoch": 0.24079427300107456, + "grad_norm": 0.10739609599113464, + "learning_rate": 9.518922775644043e-05, + "loss": 1.5430567741394043, + "step": 79550 + }, + { + "epoch": 0.2408245425514204, + "grad_norm": 0.13552550971508026, + "learning_rate": 9.518543257480302e-05, + "loss": 1.5675930023193358, + "step": 79560 + }, + { + "epoch": 0.24085481210176624, + "grad_norm": 0.11612845957279205, + "learning_rate": 9.518163739316564e-05, + "loss": 1.5609855651855469, + "step": 79570 + }, + { + "epoch": 0.24088508165211206, + "grad_norm": 0.12248668074607849, + "learning_rate": 9.517784221152823e-05, + "loss": 1.5429931640625, + "step": 79580 + }, + { + "epoch": 0.24091535120245788, + "grad_norm": 0.12330559641122818, + "learning_rate": 9.517404702989085e-05, + "loss": 1.5939670562744142, + "step": 79590 + }, + { + "epoch": 0.2409456207528037, + "grad_norm": 0.11876138299703598, + "learning_rate": 9.517025184825346e-05, + "loss": 1.6064390182495116, + "step": 79600 + }, + { + "epoch": 0.24097589030314956, + "grad_norm": 0.1322205811738968, + "learning_rate": 9.516645666661607e-05, + "loss": 1.5467180252075194, + "step": 79610 + }, + { + "epoch": 0.24100615985349538, + "grad_norm": 0.11820200830698013, + "learning_rate": 9.516266148497868e-05, + "loss": 1.5626714706420899, + "step": 79620 + }, + { + "epoch": 0.2410364294038412, + "grad_norm": 0.11006120592355728, + "learning_rate": 9.515886630334128e-05, + "loss": 1.5934993743896484, + "step": 79630 + }, + { + "epoch": 0.24106669895418703, + "grad_norm": 0.11872406303882599, + "learning_rate": 9.51550711217039e-05, + "loss": 1.5645973205566406, + "step": 79640 + }, + { + "epoch": 0.24109696850453285, + "grad_norm": 0.11562202870845795, + "learning_rate": 9.515127594006649e-05, + "loss": 1.544898796081543, + "step": 79650 + }, + { + "epoch": 0.2411272380548787, + "grad_norm": 0.13164305686950684, + "learning_rate": 9.514748075842911e-05, + "loss": 1.5688920974731446, + "step": 79660 + }, + { + "epoch": 0.24115750760522453, + "grad_norm": 0.13226331770420074, + "learning_rate": 9.51436855767917e-05, + "loss": 1.5404233932495117, + "step": 79670 + }, + { + "epoch": 0.24118777715557035, + "grad_norm": 0.1292196661233902, + "learning_rate": 9.513989039515432e-05, + "loss": 1.6003433227539063, + "step": 79680 + }, + { + "epoch": 0.24121804670591618, + "grad_norm": 0.13862261176109314, + "learning_rate": 9.513609521351691e-05, + "loss": 1.5407320022583009, + "step": 79690 + }, + { + "epoch": 0.241248316256262, + "grad_norm": 0.11830244958400726, + "learning_rate": 9.513230003187953e-05, + "loss": 1.5545242309570313, + "step": 79700 + }, + { + "epoch": 0.24127858580660785, + "grad_norm": 0.12032139301300049, + "learning_rate": 9.512850485024212e-05, + "loss": 1.6004024505615235, + "step": 79710 + }, + { + "epoch": 0.24130885535695368, + "grad_norm": 0.11636419594287872, + "learning_rate": 9.512470966860474e-05, + "loss": 1.5593463897705078, + "step": 79720 + }, + { + "epoch": 0.2413391249072995, + "grad_norm": 0.1092875674366951, + "learning_rate": 9.512091448696735e-05, + "loss": 1.5896053314208984, + "step": 79730 + }, + { + "epoch": 0.24136939445764533, + "grad_norm": 0.11015848815441132, + "learning_rate": 9.511711930532996e-05, + "loss": 1.5680566787719727, + "step": 79740 + }, + { + "epoch": 0.24139966400799115, + "grad_norm": 0.1059202253818512, + "learning_rate": 9.511332412369256e-05, + "loss": 1.5813632965087892, + "step": 79750 + }, + { + "epoch": 0.241429933558337, + "grad_norm": 0.1329554170370102, + "learning_rate": 9.510952894205517e-05, + "loss": 1.567253017425537, + "step": 79760 + }, + { + "epoch": 0.24146020310868282, + "grad_norm": 0.1343255639076233, + "learning_rate": 9.510573376041777e-05, + "loss": 1.6016050338745118, + "step": 79770 + }, + { + "epoch": 0.24149047265902865, + "grad_norm": 0.11306943744421005, + "learning_rate": 9.510193857878038e-05, + "loss": 1.5893989562988282, + "step": 79780 + }, + { + "epoch": 0.24152074220937447, + "grad_norm": 0.11182846128940582, + "learning_rate": 9.509814339714299e-05, + "loss": 1.5749462127685547, + "step": 79790 + }, + { + "epoch": 0.2415510117597203, + "grad_norm": 0.11335831880569458, + "learning_rate": 9.509434821550559e-05, + "loss": 1.5867716789245605, + "step": 79800 + }, + { + "epoch": 0.24158128131006615, + "grad_norm": 0.1289110779762268, + "learning_rate": 9.50905530338682e-05, + "loss": 1.5771847724914552, + "step": 79810 + }, + { + "epoch": 0.24161155086041197, + "grad_norm": 0.12567238509655, + "learning_rate": 9.50867578522308e-05, + "loss": 1.5465017318725587, + "step": 79820 + }, + { + "epoch": 0.2416418204107578, + "grad_norm": 0.13774093985557556, + "learning_rate": 9.508296267059342e-05, + "loss": 1.5607205390930177, + "step": 79830 + }, + { + "epoch": 0.24167208996110362, + "grad_norm": 0.1160736083984375, + "learning_rate": 9.507916748895603e-05, + "loss": 1.5818760871887207, + "step": 79840 + }, + { + "epoch": 0.24170235951144944, + "grad_norm": 0.10961804538965225, + "learning_rate": 9.507537230731864e-05, + "loss": 1.584244728088379, + "step": 79850 + }, + { + "epoch": 0.2417326290617953, + "grad_norm": 0.11485861241817474, + "learning_rate": 9.507157712568124e-05, + "loss": 1.575150203704834, + "step": 79860 + }, + { + "epoch": 0.24176289861214112, + "grad_norm": 0.11243429780006409, + "learning_rate": 9.506778194404385e-05, + "loss": 1.5972171783447267, + "step": 79870 + }, + { + "epoch": 0.24179316816248694, + "grad_norm": 0.1272726207971573, + "learning_rate": 9.506398676240645e-05, + "loss": 1.5661725044250487, + "step": 79880 + }, + { + "epoch": 0.24182343771283277, + "grad_norm": 0.11396926641464233, + "learning_rate": 9.506019158076906e-05, + "loss": 1.570916748046875, + "step": 79890 + }, + { + "epoch": 0.24185370726317862, + "grad_norm": 0.1198001354932785, + "learning_rate": 9.505639639913166e-05, + "loss": 1.5752452850341796, + "step": 79900 + }, + { + "epoch": 0.24188397681352444, + "grad_norm": 0.12098637223243713, + "learning_rate": 9.505260121749427e-05, + "loss": 1.581617259979248, + "step": 79910 + }, + { + "epoch": 0.24191424636387027, + "grad_norm": 0.12044202536344528, + "learning_rate": 9.504880603585688e-05, + "loss": 1.5750983238220215, + "step": 79920 + }, + { + "epoch": 0.2419445159142161, + "grad_norm": 0.10808050632476807, + "learning_rate": 9.504501085421948e-05, + "loss": 1.57620849609375, + "step": 79930 + }, + { + "epoch": 0.24197478546456191, + "grad_norm": 0.1310497373342514, + "learning_rate": 9.504121567258209e-05, + "loss": 1.551864242553711, + "step": 79940 + }, + { + "epoch": 0.24200505501490777, + "grad_norm": 0.11993198096752167, + "learning_rate": 9.50374204909447e-05, + "loss": 1.5682548522949218, + "step": 79950 + }, + { + "epoch": 0.2420353245652536, + "grad_norm": 0.12367460876703262, + "learning_rate": 9.50336253093073e-05, + "loss": 1.6035343170166017, + "step": 79960 + }, + { + "epoch": 0.24206559411559941, + "grad_norm": 0.10878776013851166, + "learning_rate": 9.502983012766992e-05, + "loss": 1.544327163696289, + "step": 79970 + }, + { + "epoch": 0.24209586366594524, + "grad_norm": 0.13071422278881073, + "learning_rate": 9.502603494603251e-05, + "loss": 1.5626999855041503, + "step": 79980 + }, + { + "epoch": 0.24212613321629106, + "grad_norm": 0.1282905638217926, + "learning_rate": 9.502223976439513e-05, + "loss": 1.5856031417846679, + "step": 79990 + }, + { + "epoch": 0.2421564027666369, + "grad_norm": 0.1300024390220642, + "learning_rate": 9.501844458275772e-05, + "loss": 1.5512987136840821, + "step": 80000 + }, + { + "epoch": 0.2421564027666369, + "eval_loss": 1.5664873123168945, + "eval_runtime": 28.2107, + "eval_samples_per_second": 17.724, + "eval_steps_per_second": 1.134, + "step": 80000 + }, + { + "epoch": 0.24218667231698274, + "grad_norm": 0.1386670172214508, + "learning_rate": 9.501464940112034e-05, + "loss": 1.6008796691894531, + "step": 80010 + }, + { + "epoch": 0.24221694186732856, + "grad_norm": 0.11579251289367676, + "learning_rate": 9.501085421948294e-05, + "loss": 1.5694612503051757, + "step": 80020 + }, + { + "epoch": 0.24224721141767439, + "grad_norm": 0.1211436316370964, + "learning_rate": 9.500705903784556e-05, + "loss": 1.5494084358215332, + "step": 80030 + }, + { + "epoch": 0.2422774809680202, + "grad_norm": 0.12344479560852051, + "learning_rate": 9.500326385620816e-05, + "loss": 1.5555581092834472, + "step": 80040 + }, + { + "epoch": 0.24230775051836606, + "grad_norm": 0.10943315178155899, + "learning_rate": 9.499946867457077e-05, + "loss": 1.5648902893066405, + "step": 80050 + }, + { + "epoch": 0.24233802006871188, + "grad_norm": 0.11220841854810715, + "learning_rate": 9.499567349293337e-05, + "loss": 1.5812777519226073, + "step": 80060 + }, + { + "epoch": 0.2423682896190577, + "grad_norm": 0.1114111915230751, + "learning_rate": 9.499187831129598e-05, + "loss": 1.6161582946777344, + "step": 80070 + }, + { + "epoch": 0.24239855916940353, + "grad_norm": 0.11086810380220413, + "learning_rate": 9.49880831296586e-05, + "loss": 1.5798176765441894, + "step": 80080 + }, + { + "epoch": 0.24242882871974936, + "grad_norm": 0.13042254745960236, + "learning_rate": 9.498428794802119e-05, + "loss": 1.5718521118164062, + "step": 80090 + }, + { + "epoch": 0.2424590982700952, + "grad_norm": 0.1169336661696434, + "learning_rate": 9.498049276638381e-05, + "loss": 1.5662335395812987, + "step": 80100 + }, + { + "epoch": 0.24248936782044103, + "grad_norm": 0.12312524020671844, + "learning_rate": 9.49766975847464e-05, + "loss": 1.5229358673095703, + "step": 80110 + }, + { + "epoch": 0.24251963737078686, + "grad_norm": 0.12363571673631668, + "learning_rate": 9.497290240310902e-05, + "loss": 1.5761369705200194, + "step": 80120 + }, + { + "epoch": 0.24254990692113268, + "grad_norm": 0.11871389299631119, + "learning_rate": 9.496910722147162e-05, + "loss": 1.5702911376953126, + "step": 80130 + }, + { + "epoch": 0.2425801764714785, + "grad_norm": 0.11825990676879883, + "learning_rate": 9.496531203983423e-05, + "loss": 1.5757762908935546, + "step": 80140 + }, + { + "epoch": 0.24261044602182436, + "grad_norm": 0.1225072517991066, + "learning_rate": 9.496151685819683e-05, + "loss": 1.5900089263916015, + "step": 80150 + }, + { + "epoch": 0.24264071557217018, + "grad_norm": 0.12220140546560287, + "learning_rate": 9.495772167655945e-05, + "loss": 1.5764917373657226, + "step": 80160 + }, + { + "epoch": 0.242670985122516, + "grad_norm": 0.12164728343486786, + "learning_rate": 9.495392649492204e-05, + "loss": 1.5571704864501954, + "step": 80170 + }, + { + "epoch": 0.24270125467286183, + "grad_norm": 0.11277732253074646, + "learning_rate": 9.495013131328466e-05, + "loss": 1.583829116821289, + "step": 80180 + }, + { + "epoch": 0.24273152422320765, + "grad_norm": 0.12299073487520218, + "learning_rate": 9.494633613164725e-05, + "loss": 1.5887031555175781, + "step": 80190 + }, + { + "epoch": 0.2427617937735535, + "grad_norm": 0.11610588431358337, + "learning_rate": 9.494254095000987e-05, + "loss": 1.5580897331237793, + "step": 80200 + }, + { + "epoch": 0.24279206332389933, + "grad_norm": 0.12023304402828217, + "learning_rate": 9.493874576837248e-05, + "loss": 1.534048080444336, + "step": 80210 + }, + { + "epoch": 0.24282233287424515, + "grad_norm": 0.10885855555534363, + "learning_rate": 9.493495058673508e-05, + "loss": 1.5659982681274414, + "step": 80220 + }, + { + "epoch": 0.24285260242459097, + "grad_norm": 0.137442946434021, + "learning_rate": 9.49311554050977e-05, + "loss": 1.6162706375122071, + "step": 80230 + }, + { + "epoch": 0.24288287197493683, + "grad_norm": 0.11647435277700424, + "learning_rate": 9.49273602234603e-05, + "loss": 1.5677515983581543, + "step": 80240 + }, + { + "epoch": 0.24291314152528265, + "grad_norm": 0.11693696677684784, + "learning_rate": 9.492356504182291e-05, + "loss": 1.5846426963806153, + "step": 80250 + }, + { + "epoch": 0.24294341107562847, + "grad_norm": 0.11973585933446884, + "learning_rate": 9.49197698601855e-05, + "loss": 1.5633960723876954, + "step": 80260 + }, + { + "epoch": 0.2429736806259743, + "grad_norm": 0.11269336938858032, + "learning_rate": 9.491597467854813e-05, + "loss": 1.5841888427734374, + "step": 80270 + }, + { + "epoch": 0.24300395017632012, + "grad_norm": 0.11930994689464569, + "learning_rate": 9.491217949691072e-05, + "loss": 1.5847774505615235, + "step": 80280 + }, + { + "epoch": 0.24303421972666597, + "grad_norm": 0.12026160210371017, + "learning_rate": 9.490838431527334e-05, + "loss": 1.5523246765136718, + "step": 80290 + }, + { + "epoch": 0.2430644892770118, + "grad_norm": 0.11233339458703995, + "learning_rate": 9.490458913363593e-05, + "loss": 1.5682140350341798, + "step": 80300 + }, + { + "epoch": 0.24309475882735762, + "grad_norm": 0.12181048095226288, + "learning_rate": 9.490079395199855e-05, + "loss": 1.5843149185180665, + "step": 80310 + }, + { + "epoch": 0.24312502837770344, + "grad_norm": 0.13704940676689148, + "learning_rate": 9.489699877036114e-05, + "loss": 1.5598993301391602, + "step": 80320 + }, + { + "epoch": 0.24315529792804927, + "grad_norm": 0.10370349884033203, + "learning_rate": 9.489320358872376e-05, + "loss": 1.5942051887512207, + "step": 80330 + }, + { + "epoch": 0.24318556747839512, + "grad_norm": 0.12322583049535751, + "learning_rate": 9.488940840708637e-05, + "loss": 1.5802495956420899, + "step": 80340 + }, + { + "epoch": 0.24321583702874094, + "grad_norm": 0.1312245875597, + "learning_rate": 9.488561322544897e-05, + "loss": 1.5749807357788086, + "step": 80350 + }, + { + "epoch": 0.24324610657908677, + "grad_norm": 0.12320656329393387, + "learning_rate": 9.488181804381158e-05, + "loss": 1.5763094902038575, + "step": 80360 + }, + { + "epoch": 0.2432763761294326, + "grad_norm": 0.10737841576337814, + "learning_rate": 9.487802286217419e-05, + "loss": 1.6103935241699219, + "step": 80370 + }, + { + "epoch": 0.24330664567977842, + "grad_norm": 0.11957648396492004, + "learning_rate": 9.487422768053679e-05, + "loss": 1.5437143325805665, + "step": 80380 + }, + { + "epoch": 0.24333691523012427, + "grad_norm": 0.1185232549905777, + "learning_rate": 9.48704324988994e-05, + "loss": 1.5313239097595215, + "step": 80390 + }, + { + "epoch": 0.2433671847804701, + "grad_norm": 0.1143304854631424, + "learning_rate": 9.4866637317262e-05, + "loss": 1.6215875625610352, + "step": 80400 + }, + { + "epoch": 0.24339745433081592, + "grad_norm": 0.11641199141740799, + "learning_rate": 9.486284213562461e-05, + "loss": 1.588587188720703, + "step": 80410 + }, + { + "epoch": 0.24342772388116174, + "grad_norm": 0.11530521512031555, + "learning_rate": 9.485904695398721e-05, + "loss": 1.54247989654541, + "step": 80420 + }, + { + "epoch": 0.24345799343150756, + "grad_norm": 0.1296122819185257, + "learning_rate": 9.485525177234982e-05, + "loss": 1.590423583984375, + "step": 80430 + }, + { + "epoch": 0.24348826298185341, + "grad_norm": 0.114944688975811, + "learning_rate": 9.485145659071244e-05, + "loss": 1.5911380767822265, + "step": 80440 + }, + { + "epoch": 0.24351853253219924, + "grad_norm": 0.11389563977718353, + "learning_rate": 9.484766140907503e-05, + "loss": 1.5763952255249023, + "step": 80450 + }, + { + "epoch": 0.24354880208254506, + "grad_norm": 0.11146233975887299, + "learning_rate": 9.484386622743765e-05, + "loss": 1.5747502326965332, + "step": 80460 + }, + { + "epoch": 0.2435790716328909, + "grad_norm": 0.11737295985221863, + "learning_rate": 9.484007104580026e-05, + "loss": 1.5695709228515624, + "step": 80470 + }, + { + "epoch": 0.2436093411832367, + "grad_norm": 0.11299905180931091, + "learning_rate": 9.483627586416286e-05, + "loss": 1.5679915428161622, + "step": 80480 + }, + { + "epoch": 0.24363961073358256, + "grad_norm": 0.13389550149440765, + "learning_rate": 9.483248068252547e-05, + "loss": 1.5862811088562012, + "step": 80490 + }, + { + "epoch": 0.2436698802839284, + "grad_norm": 0.12013792246580124, + "learning_rate": 9.482868550088808e-05, + "loss": 1.5831863403320312, + "step": 80500 + }, + { + "epoch": 0.2436698802839284, + "eval_loss": 1.5978612899780273, + "eval_runtime": 28.3288, + "eval_samples_per_second": 17.65, + "eval_steps_per_second": 1.13, + "step": 80500 + }, + { + "epoch": 0.2437001498342742, + "grad_norm": 0.12186238914728165, + "learning_rate": 9.482489031925068e-05, + "loss": 1.531287956237793, + "step": 80510 + }, + { + "epoch": 0.24373041938462003, + "grad_norm": 0.11658976227045059, + "learning_rate": 9.482109513761329e-05, + "loss": 1.598421287536621, + "step": 80520 + }, + { + "epoch": 0.24376068893496586, + "grad_norm": 0.12767091393470764, + "learning_rate": 9.48172999559759e-05, + "loss": 1.5647465705871582, + "step": 80530 + }, + { + "epoch": 0.2437909584853117, + "grad_norm": 0.12363332509994507, + "learning_rate": 9.48135047743385e-05, + "loss": 1.5745680809020997, + "step": 80540 + }, + { + "epoch": 0.24382122803565753, + "grad_norm": 0.1157059594988823, + "learning_rate": 9.48097095927011e-05, + "loss": 1.5725350379943848, + "step": 80550 + }, + { + "epoch": 0.24385149758600336, + "grad_norm": 0.11629138886928558, + "learning_rate": 9.480591441106371e-05, + "loss": 1.610542106628418, + "step": 80560 + }, + { + "epoch": 0.24388176713634918, + "grad_norm": 0.12323258072137833, + "learning_rate": 9.480211922942632e-05, + "loss": 1.559149169921875, + "step": 80570 + }, + { + "epoch": 0.24391203668669503, + "grad_norm": 0.13251638412475586, + "learning_rate": 9.479832404778894e-05, + "loss": 1.5628278732299805, + "step": 80580 + }, + { + "epoch": 0.24394230623704086, + "grad_norm": 0.1250227838754654, + "learning_rate": 9.479452886615153e-05, + "loss": 1.562411117553711, + "step": 80590 + }, + { + "epoch": 0.24397257578738668, + "grad_norm": 0.12786497175693512, + "learning_rate": 9.479073368451415e-05, + "loss": 1.5611671447753905, + "step": 80600 + }, + { + "epoch": 0.2440028453377325, + "grad_norm": 0.10671330243349075, + "learning_rate": 9.478693850287674e-05, + "loss": 1.5608282089233398, + "step": 80610 + }, + { + "epoch": 0.24403311488807833, + "grad_norm": 0.11810103803873062, + "learning_rate": 9.478314332123936e-05, + "loss": 1.548230266571045, + "step": 80620 + }, + { + "epoch": 0.24406338443842418, + "grad_norm": 0.11606258898973465, + "learning_rate": 9.477934813960197e-05, + "loss": 1.5560523986816406, + "step": 80630 + }, + { + "epoch": 0.24409365398877, + "grad_norm": 0.12264913320541382, + "learning_rate": 9.477555295796457e-05, + "loss": 1.5720576286315917, + "step": 80640 + }, + { + "epoch": 0.24412392353911583, + "grad_norm": 0.12755610048770905, + "learning_rate": 9.477175777632718e-05, + "loss": 1.6216140747070313, + "step": 80650 + }, + { + "epoch": 0.24415419308946165, + "grad_norm": 0.11714348196983337, + "learning_rate": 9.476796259468978e-05, + "loss": 1.6373598098754882, + "step": 80660 + }, + { + "epoch": 0.24418446263980748, + "grad_norm": 0.11361085623502731, + "learning_rate": 9.476416741305239e-05, + "loss": 1.5895703315734864, + "step": 80670 + }, + { + "epoch": 0.24421473219015333, + "grad_norm": 0.11242949217557907, + "learning_rate": 9.4760372231415e-05, + "loss": 1.555491065979004, + "step": 80680 + }, + { + "epoch": 0.24424500174049915, + "grad_norm": 0.11327043175697327, + "learning_rate": 9.47565770497776e-05, + "loss": 1.5494099617004395, + "step": 80690 + }, + { + "epoch": 0.24427527129084498, + "grad_norm": 0.11614689230918884, + "learning_rate": 9.475278186814021e-05, + "loss": 1.5217756271362304, + "step": 80700 + }, + { + "epoch": 0.2443055408411908, + "grad_norm": 0.13555125892162323, + "learning_rate": 9.474898668650283e-05, + "loss": 1.6068502426147462, + "step": 80710 + }, + { + "epoch": 0.24433581039153662, + "grad_norm": 0.11865662783384323, + "learning_rate": 9.474519150486542e-05, + "loss": 1.5601521492004395, + "step": 80720 + }, + { + "epoch": 0.24436607994188247, + "grad_norm": 0.10665993392467499, + "learning_rate": 9.474139632322804e-05, + "loss": 1.5560648918151856, + "step": 80730 + }, + { + "epoch": 0.2443963494922283, + "grad_norm": 0.11126497387886047, + "learning_rate": 9.473760114159063e-05, + "loss": 1.5338117599487304, + "step": 80740 + }, + { + "epoch": 0.24442661904257412, + "grad_norm": 0.1252506673336029, + "learning_rate": 9.473380595995325e-05, + "loss": 1.5441219329833984, + "step": 80750 + }, + { + "epoch": 0.24445688859291995, + "grad_norm": 0.1180272251367569, + "learning_rate": 9.473001077831584e-05, + "loss": 1.5675762176513672, + "step": 80760 + }, + { + "epoch": 0.24448715814326577, + "grad_norm": 0.11765774339437485, + "learning_rate": 9.472621559667846e-05, + "loss": 1.5517107009887696, + "step": 80770 + }, + { + "epoch": 0.24451742769361162, + "grad_norm": 0.1112835705280304, + "learning_rate": 9.472242041504106e-05, + "loss": 1.5735570907592773, + "step": 80780 + }, + { + "epoch": 0.24454769724395745, + "grad_norm": 0.12054045498371124, + "learning_rate": 9.471862523340368e-05, + "loss": 1.597407913208008, + "step": 80790 + }, + { + "epoch": 0.24457796679430327, + "grad_norm": 0.11415783315896988, + "learning_rate": 9.471483005176627e-05, + "loss": 1.5820968627929688, + "step": 80800 + }, + { + "epoch": 0.2446082363446491, + "grad_norm": 0.1281430721282959, + "learning_rate": 9.471103487012889e-05, + "loss": 1.5923341751098632, + "step": 80810 + }, + { + "epoch": 0.24463850589499492, + "grad_norm": 0.13465525209903717, + "learning_rate": 9.470723968849148e-05, + "loss": 1.5767759323120116, + "step": 80820 + }, + { + "epoch": 0.24466877544534077, + "grad_norm": 0.12430072575807571, + "learning_rate": 9.47034445068541e-05, + "loss": 1.5463043212890626, + "step": 80830 + }, + { + "epoch": 0.2446990449956866, + "grad_norm": 0.13377420604228973, + "learning_rate": 9.469964932521672e-05, + "loss": 1.5627534866333008, + "step": 80840 + }, + { + "epoch": 0.24472931454603242, + "grad_norm": 0.13138145208358765, + "learning_rate": 9.469585414357931e-05, + "loss": 1.5635242462158203, + "step": 80850 + }, + { + "epoch": 0.24475958409637824, + "grad_norm": 0.14158214628696442, + "learning_rate": 9.469205896194193e-05, + "loss": 1.5986732482910155, + "step": 80860 + }, + { + "epoch": 0.24478985364672406, + "grad_norm": 0.13195903599262238, + "learning_rate": 9.468826378030452e-05, + "loss": 1.5905163764953614, + "step": 80870 + }, + { + "epoch": 0.24482012319706992, + "grad_norm": 0.12370489537715912, + "learning_rate": 9.468446859866714e-05, + "loss": 1.5978450775146484, + "step": 80880 + }, + { + "epoch": 0.24485039274741574, + "grad_norm": 0.10334467887878418, + "learning_rate": 9.468067341702974e-05, + "loss": 1.5603717803955077, + "step": 80890 + }, + { + "epoch": 0.24488066229776156, + "grad_norm": 0.12616263329982758, + "learning_rate": 9.467687823539235e-05, + "loss": 1.583019733428955, + "step": 80900 + }, + { + "epoch": 0.2449109318481074, + "grad_norm": 0.11107971519231796, + "learning_rate": 9.467308305375495e-05, + "loss": 1.5926796913146972, + "step": 80910 + }, + { + "epoch": 0.2449412013984532, + "grad_norm": 0.10670112073421478, + "learning_rate": 9.466928787211757e-05, + "loss": 1.5536306381225586, + "step": 80920 + }, + { + "epoch": 0.24497147094879906, + "grad_norm": 0.11483664810657501, + "learning_rate": 9.466549269048016e-05, + "loss": 1.5590669631958007, + "step": 80930 + }, + { + "epoch": 0.2450017404991449, + "grad_norm": 0.10960526764392853, + "learning_rate": 9.466169750884278e-05, + "loss": 1.571731948852539, + "step": 80940 + }, + { + "epoch": 0.2450320100494907, + "grad_norm": 0.1073136180639267, + "learning_rate": 9.465790232720538e-05, + "loss": 1.5876304626464843, + "step": 80950 + }, + { + "epoch": 0.24506227959983654, + "grad_norm": 0.12123094499111176, + "learning_rate": 9.465410714556799e-05, + "loss": 1.6217859268188477, + "step": 80960 + }, + { + "epoch": 0.2450925491501824, + "grad_norm": 0.11622725427150726, + "learning_rate": 9.46503119639306e-05, + "loss": 1.5385875701904297, + "step": 80970 + }, + { + "epoch": 0.2451228187005282, + "grad_norm": 0.11644265800714493, + "learning_rate": 9.46465167822932e-05, + "loss": 1.5691349983215332, + "step": 80980 + }, + { + "epoch": 0.24515308825087403, + "grad_norm": 0.11498551815748215, + "learning_rate": 9.464272160065581e-05, + "loss": 1.579232120513916, + "step": 80990 + }, + { + "epoch": 0.24518335780121986, + "grad_norm": 0.10362085700035095, + "learning_rate": 9.463892641901841e-05, + "loss": 1.5822096824645997, + "step": 81000 + }, + { + "epoch": 0.24518335780121986, + "eval_loss": 1.5666110515594482, + "eval_runtime": 28.0735, + "eval_samples_per_second": 17.81, + "eval_steps_per_second": 1.14, + "step": 81000 + }, + { + "epoch": 0.24521362735156568, + "grad_norm": 0.11051207035779953, + "learning_rate": 9.463513123738102e-05, + "loss": 1.5570884704589845, + "step": 81010 + }, + { + "epoch": 0.24524389690191153, + "grad_norm": 0.13332757353782654, + "learning_rate": 9.463133605574363e-05, + "loss": 1.5622550010681153, + "step": 81020 + }, + { + "epoch": 0.24527416645225736, + "grad_norm": 0.126485213637352, + "learning_rate": 9.462754087410623e-05, + "loss": 1.5649001121520996, + "step": 81030 + }, + { + "epoch": 0.24530443600260318, + "grad_norm": 0.1318131983280182, + "learning_rate": 9.462374569246884e-05, + "loss": 1.5191258430480956, + "step": 81040 + }, + { + "epoch": 0.245334705552949, + "grad_norm": 0.10528193414211273, + "learning_rate": 9.461995051083146e-05, + "loss": 1.592789077758789, + "step": 81050 + }, + { + "epoch": 0.24536497510329483, + "grad_norm": 0.12106654793024063, + "learning_rate": 9.461615532919405e-05, + "loss": 1.6066856384277344, + "step": 81060 + }, + { + "epoch": 0.24539524465364068, + "grad_norm": 0.11411327868700027, + "learning_rate": 9.461236014755667e-05, + "loss": 1.571620559692383, + "step": 81070 + }, + { + "epoch": 0.2454255142039865, + "grad_norm": 0.12115097045898438, + "learning_rate": 9.460856496591928e-05, + "loss": 1.570558452606201, + "step": 81080 + }, + { + "epoch": 0.24545578375433233, + "grad_norm": 0.13281457126140594, + "learning_rate": 9.460476978428188e-05, + "loss": 1.5733098030090331, + "step": 81090 + }, + { + "epoch": 0.24548605330467815, + "grad_norm": 0.10983392596244812, + "learning_rate": 9.460097460264449e-05, + "loss": 1.564253044128418, + "step": 81100 + }, + { + "epoch": 0.24551632285502398, + "grad_norm": 0.13770101964473724, + "learning_rate": 9.459717942100709e-05, + "loss": 1.5814422607421874, + "step": 81110 + }, + { + "epoch": 0.24554659240536983, + "grad_norm": 0.11329000443220139, + "learning_rate": 9.45933842393697e-05, + "loss": 1.5421415328979493, + "step": 81120 + }, + { + "epoch": 0.24557686195571565, + "grad_norm": 0.11826098710298538, + "learning_rate": 9.45895890577323e-05, + "loss": 1.5791908264160157, + "step": 81130 + }, + { + "epoch": 0.24560713150606148, + "grad_norm": 0.11365455389022827, + "learning_rate": 9.458579387609491e-05, + "loss": 1.597590446472168, + "step": 81140 + }, + { + "epoch": 0.2456374010564073, + "grad_norm": 0.12576036155223846, + "learning_rate": 9.458199869445752e-05, + "loss": 1.6265167236328124, + "step": 81150 + }, + { + "epoch": 0.24566767060675312, + "grad_norm": 0.12955492734909058, + "learning_rate": 9.457820351282012e-05, + "loss": 1.5681982040405273, + "step": 81160 + }, + { + "epoch": 0.24569794015709898, + "grad_norm": 0.13656209409236908, + "learning_rate": 9.457440833118273e-05, + "loss": 1.588450813293457, + "step": 81170 + }, + { + "epoch": 0.2457282097074448, + "grad_norm": 0.11966199427843094, + "learning_rate": 9.457061314954533e-05, + "loss": 1.565772247314453, + "step": 81180 + }, + { + "epoch": 0.24575847925779062, + "grad_norm": 0.11618493497371674, + "learning_rate": 9.456681796790795e-05, + "loss": 1.5932559967041016, + "step": 81190 + }, + { + "epoch": 0.24578874880813645, + "grad_norm": 0.11795849353075027, + "learning_rate": 9.456302278627055e-05, + "loss": 1.5902298927307128, + "step": 81200 + }, + { + "epoch": 0.24581901835848227, + "grad_norm": 0.1276828497648239, + "learning_rate": 9.455922760463317e-05, + "loss": 1.5764375686645509, + "step": 81210 + }, + { + "epoch": 0.24584928790882812, + "grad_norm": 0.1155276820063591, + "learning_rate": 9.455543242299576e-05, + "loss": 1.578103733062744, + "step": 81220 + }, + { + "epoch": 0.24587955745917395, + "grad_norm": 0.11863353848457336, + "learning_rate": 9.455163724135838e-05, + "loss": 1.5790618896484374, + "step": 81230 + }, + { + "epoch": 0.24590982700951977, + "grad_norm": 0.12418896704912186, + "learning_rate": 9.454784205972098e-05, + "loss": 1.5793086051940919, + "step": 81240 + }, + { + "epoch": 0.2459400965598656, + "grad_norm": 0.13495664298534393, + "learning_rate": 9.454404687808359e-05, + "loss": 1.6154708862304688, + "step": 81250 + }, + { + "epoch": 0.24597036611021142, + "grad_norm": 0.11782442778348923, + "learning_rate": 9.45402516964462e-05, + "loss": 1.5928529739379882, + "step": 81260 + }, + { + "epoch": 0.24600063566055727, + "grad_norm": 0.11954569071531296, + "learning_rate": 9.45364565148088e-05, + "loss": 1.582555389404297, + "step": 81270 + }, + { + "epoch": 0.2460309052109031, + "grad_norm": 0.11944688856601715, + "learning_rate": 9.453266133317141e-05, + "loss": 1.5908671379089356, + "step": 81280 + }, + { + "epoch": 0.24606117476124892, + "grad_norm": 0.11981285363435745, + "learning_rate": 9.452886615153401e-05, + "loss": 1.5401859283447266, + "step": 81290 + }, + { + "epoch": 0.24609144431159474, + "grad_norm": 0.12302833050489426, + "learning_rate": 9.452507096989662e-05, + "loss": 1.554154872894287, + "step": 81300 + }, + { + "epoch": 0.2461217138619406, + "grad_norm": 0.11844673752784729, + "learning_rate": 9.452127578825923e-05, + "loss": 1.5448906898498536, + "step": 81310 + }, + { + "epoch": 0.24615198341228642, + "grad_norm": 0.1297735571861267, + "learning_rate": 9.451748060662185e-05, + "loss": 1.5995595932006836, + "step": 81320 + }, + { + "epoch": 0.24618225296263224, + "grad_norm": 0.1109696477651596, + "learning_rate": 9.451368542498444e-05, + "loss": 1.5977351188659668, + "step": 81330 + }, + { + "epoch": 0.24621252251297807, + "grad_norm": 0.11565618216991425, + "learning_rate": 9.450989024334706e-05, + "loss": 1.58227596282959, + "step": 81340 + }, + { + "epoch": 0.2462427920633239, + "grad_norm": 0.11799336969852448, + "learning_rate": 9.450609506170965e-05, + "loss": 1.6028844833374023, + "step": 81350 + }, + { + "epoch": 0.24627306161366974, + "grad_norm": 0.1316344290971756, + "learning_rate": 9.450229988007227e-05, + "loss": 1.5112923622131347, + "step": 81360 + }, + { + "epoch": 0.24630333116401557, + "grad_norm": 0.12486566603183746, + "learning_rate": 9.449850469843486e-05, + "loss": 1.5670955657958985, + "step": 81370 + }, + { + "epoch": 0.2463336007143614, + "grad_norm": 0.1197214424610138, + "learning_rate": 9.449470951679748e-05, + "loss": 1.5500106811523438, + "step": 81380 + }, + { + "epoch": 0.2463638702647072, + "grad_norm": 0.1163727417588234, + "learning_rate": 9.449091433516007e-05, + "loss": 1.5805582046508788, + "step": 81390 + }, + { + "epoch": 0.24639413981505304, + "grad_norm": 0.1130540743470192, + "learning_rate": 9.448711915352269e-05, + "loss": 1.550090503692627, + "step": 81400 + }, + { + "epoch": 0.2464244093653989, + "grad_norm": 0.13772732019424438, + "learning_rate": 9.448332397188529e-05, + "loss": 1.5449764251708984, + "step": 81410 + }, + { + "epoch": 0.2464546789157447, + "grad_norm": 0.13586899638175964, + "learning_rate": 9.44795287902479e-05, + "loss": 1.5965815544128419, + "step": 81420 + }, + { + "epoch": 0.24648494846609054, + "grad_norm": 0.11958891153335571, + "learning_rate": 9.44757336086105e-05, + "loss": 1.5856329917907714, + "step": 81430 + }, + { + "epoch": 0.24651521801643636, + "grad_norm": 0.12143343687057495, + "learning_rate": 9.447193842697312e-05, + "loss": 1.5612719535827637, + "step": 81440 + }, + { + "epoch": 0.24654548756678218, + "grad_norm": 0.11481048166751862, + "learning_rate": 9.446814324533574e-05, + "loss": 1.5636154174804688, + "step": 81450 + }, + { + "epoch": 0.24657575711712804, + "grad_norm": 0.12559659779071808, + "learning_rate": 9.446434806369833e-05, + "loss": 1.536231231689453, + "step": 81460 + }, + { + "epoch": 0.24660602666747386, + "grad_norm": 0.11764121800661087, + "learning_rate": 9.446055288206095e-05, + "loss": 1.577072525024414, + "step": 81470 + }, + { + "epoch": 0.24663629621781968, + "grad_norm": 0.11192237585783005, + "learning_rate": 9.445675770042354e-05, + "loss": 1.5725682258605957, + "step": 81480 + }, + { + "epoch": 0.2466665657681655, + "grad_norm": 0.11510703712701797, + "learning_rate": 9.445296251878616e-05, + "loss": 1.5875200271606444, + "step": 81490 + }, + { + "epoch": 0.24669683531851133, + "grad_norm": 0.10758009552955627, + "learning_rate": 9.444916733714875e-05, + "loss": 1.591505241394043, + "step": 81500 + }, + { + "epoch": 0.24669683531851133, + "eval_loss": 1.5855962038040161, + "eval_runtime": 28.5683, + "eval_samples_per_second": 17.502, + "eval_steps_per_second": 1.12, + "step": 81500 + }, + { + "epoch": 0.24672710486885718, + "grad_norm": 0.14112524688243866, + "learning_rate": 9.444537215551137e-05, + "loss": 1.5476411819458007, + "step": 81510 + }, + { + "epoch": 0.246757374419203, + "grad_norm": 0.11112253367900848, + "learning_rate": 9.444157697387396e-05, + "loss": 1.5724592208862305, + "step": 81520 + }, + { + "epoch": 0.24678764396954883, + "grad_norm": 0.11450455337762833, + "learning_rate": 9.443778179223658e-05, + "loss": 1.545371913909912, + "step": 81530 + }, + { + "epoch": 0.24681791351989465, + "grad_norm": 0.10819746553897858, + "learning_rate": 9.443398661059918e-05, + "loss": 1.5800270080566405, + "step": 81540 + }, + { + "epoch": 0.24684818307024048, + "grad_norm": 0.1334155946969986, + "learning_rate": 9.44301914289618e-05, + "loss": 1.555245590209961, + "step": 81550 + }, + { + "epoch": 0.24687845262058633, + "grad_norm": 0.11528020352125168, + "learning_rate": 9.442639624732439e-05, + "loss": 1.5634832382202148, + "step": 81560 + }, + { + "epoch": 0.24690872217093215, + "grad_norm": 0.1262226700782776, + "learning_rate": 9.442260106568701e-05, + "loss": 1.5779050827026366, + "step": 81570 + }, + { + "epoch": 0.24693899172127798, + "grad_norm": 0.11079977452754974, + "learning_rate": 9.441880588404961e-05, + "loss": 1.592779541015625, + "step": 81580 + }, + { + "epoch": 0.2469692612716238, + "grad_norm": 0.11394518613815308, + "learning_rate": 9.441501070241222e-05, + "loss": 1.605503463745117, + "step": 81590 + }, + { + "epoch": 0.24699953082196963, + "grad_norm": 0.1069389209151268, + "learning_rate": 9.441121552077483e-05, + "loss": 1.552263069152832, + "step": 81600 + }, + { + "epoch": 0.24702980037231548, + "grad_norm": 0.11312328279018402, + "learning_rate": 9.440742033913743e-05, + "loss": 1.565835952758789, + "step": 81610 + }, + { + "epoch": 0.2470600699226613, + "grad_norm": 0.11385420709848404, + "learning_rate": 9.440362515750004e-05, + "loss": 1.5730655670166016, + "step": 81620 + }, + { + "epoch": 0.24709033947300713, + "grad_norm": 0.11277095973491669, + "learning_rate": 9.439982997586264e-05, + "loss": 1.5391321182250977, + "step": 81630 + }, + { + "epoch": 0.24712060902335295, + "grad_norm": 0.1139988824725151, + "learning_rate": 9.439603479422525e-05, + "loss": 1.5825777053833008, + "step": 81640 + }, + { + "epoch": 0.2471508785736988, + "grad_norm": 0.11526840180158615, + "learning_rate": 9.439223961258786e-05, + "loss": 1.5381929397583007, + "step": 81650 + }, + { + "epoch": 0.24718114812404463, + "grad_norm": 0.12547871470451355, + "learning_rate": 9.438844443095047e-05, + "loss": 1.5879097938537599, + "step": 81660 + }, + { + "epoch": 0.24721141767439045, + "grad_norm": 0.13178087770938873, + "learning_rate": 9.438464924931307e-05, + "loss": 1.5676470756530763, + "step": 81670 + }, + { + "epoch": 0.24724168722473627, + "grad_norm": 0.11794871091842651, + "learning_rate": 9.438085406767569e-05, + "loss": 1.5691705703735352, + "step": 81680 + }, + { + "epoch": 0.2472719567750821, + "grad_norm": 0.11720377206802368, + "learning_rate": 9.437705888603829e-05, + "loss": 1.586436653137207, + "step": 81690 + }, + { + "epoch": 0.24730222632542795, + "grad_norm": 0.1245630607008934, + "learning_rate": 9.43732637044009e-05, + "loss": 1.5759535789489747, + "step": 81700 + }, + { + "epoch": 0.24733249587577377, + "grad_norm": 0.10678810626268387, + "learning_rate": 9.43694685227635e-05, + "loss": 1.566305160522461, + "step": 81710 + }, + { + "epoch": 0.2473627654261196, + "grad_norm": 0.12277030944824219, + "learning_rate": 9.436567334112611e-05, + "loss": 1.56707706451416, + "step": 81720 + }, + { + "epoch": 0.24739303497646542, + "grad_norm": 0.11994136869907379, + "learning_rate": 9.436187815948872e-05, + "loss": 1.5652006149291993, + "step": 81730 + }, + { + "epoch": 0.24742330452681124, + "grad_norm": 0.10273487865924835, + "learning_rate": 9.435808297785132e-05, + "loss": 1.572946548461914, + "step": 81740 + }, + { + "epoch": 0.2474535740771571, + "grad_norm": 0.12177558243274689, + "learning_rate": 9.435428779621393e-05, + "loss": 1.5712010383605957, + "step": 81750 + }, + { + "epoch": 0.24748384362750292, + "grad_norm": 0.12573394179344177, + "learning_rate": 9.435049261457653e-05, + "loss": 1.5368699073791503, + "step": 81760 + }, + { + "epoch": 0.24751411317784874, + "grad_norm": 0.11572984606027603, + "learning_rate": 9.434669743293914e-05, + "loss": 1.568537139892578, + "step": 81770 + }, + { + "epoch": 0.24754438272819457, + "grad_norm": 0.1315382868051529, + "learning_rate": 9.434290225130175e-05, + "loss": 1.5451558113098145, + "step": 81780 + }, + { + "epoch": 0.2475746522785404, + "grad_norm": 0.13362155854701996, + "learning_rate": 9.433910706966435e-05, + "loss": 1.5759828567504883, + "step": 81790 + }, + { + "epoch": 0.24760492182888624, + "grad_norm": 0.12462229281663895, + "learning_rate": 9.433531188802696e-05, + "loss": 1.5922677993774415, + "step": 81800 + }, + { + "epoch": 0.24763519137923207, + "grad_norm": 0.12214498221874237, + "learning_rate": 9.433151670638956e-05, + "loss": 1.57659912109375, + "step": 81810 + }, + { + "epoch": 0.2476654609295779, + "grad_norm": 0.12335055321455002, + "learning_rate": 9.432772152475218e-05, + "loss": 1.5519088745117187, + "step": 81820 + }, + { + "epoch": 0.24769573047992371, + "grad_norm": 0.12254133075475693, + "learning_rate": 9.432392634311478e-05, + "loss": 1.5526444435119628, + "step": 81830 + }, + { + "epoch": 0.24772600003026954, + "grad_norm": 0.13045848906040192, + "learning_rate": 9.43201311614774e-05, + "loss": 1.5679334640502929, + "step": 81840 + }, + { + "epoch": 0.2477562695806154, + "grad_norm": 0.13178633153438568, + "learning_rate": 9.431633597984e-05, + "loss": 1.5524221420288087, + "step": 81850 + }, + { + "epoch": 0.24778653913096121, + "grad_norm": 0.1320505291223526, + "learning_rate": 9.431254079820261e-05, + "loss": 1.528432846069336, + "step": 81860 + }, + { + "epoch": 0.24781680868130704, + "grad_norm": 0.12510235607624054, + "learning_rate": 9.430874561656521e-05, + "loss": 1.5463584899902343, + "step": 81870 + }, + { + "epoch": 0.24784707823165286, + "grad_norm": 0.11436045169830322, + "learning_rate": 9.430495043492782e-05, + "loss": 1.5536029815673829, + "step": 81880 + }, + { + "epoch": 0.24787734778199869, + "grad_norm": 0.12484356760978699, + "learning_rate": 9.430115525329043e-05, + "loss": 1.556326198577881, + "step": 81890 + }, + { + "epoch": 0.24790761733234454, + "grad_norm": 0.11702501773834229, + "learning_rate": 9.429736007165303e-05, + "loss": 1.592570686340332, + "step": 81900 + }, + { + "epoch": 0.24793788688269036, + "grad_norm": 0.10914624482393265, + "learning_rate": 9.429356489001564e-05, + "loss": 1.5820722579956055, + "step": 81910 + }, + { + "epoch": 0.24796815643303619, + "grad_norm": 0.11787162721157074, + "learning_rate": 9.428976970837824e-05, + "loss": 1.5678730964660645, + "step": 81920 + }, + { + "epoch": 0.247998425983382, + "grad_norm": 0.10986723005771637, + "learning_rate": 9.428597452674086e-05, + "loss": 1.5976116180419921, + "step": 81930 + }, + { + "epoch": 0.24802869553372783, + "grad_norm": 0.10254167020320892, + "learning_rate": 9.428217934510345e-05, + "loss": 1.547915267944336, + "step": 81940 + }, + { + "epoch": 0.24805896508407368, + "grad_norm": 0.10842840373516083, + "learning_rate": 9.427838416346607e-05, + "loss": 1.5867149353027343, + "step": 81950 + }, + { + "epoch": 0.2480892346344195, + "grad_norm": 0.12051333487033844, + "learning_rate": 9.427458898182867e-05, + "loss": 1.5638861656188965, + "step": 81960 + }, + { + "epoch": 0.24811950418476533, + "grad_norm": 0.12543393671512604, + "learning_rate": 9.427079380019129e-05, + "loss": 1.5558717727661133, + "step": 81970 + }, + { + "epoch": 0.24814977373511116, + "grad_norm": 0.11781011521816254, + "learning_rate": 9.426699861855388e-05, + "loss": 1.6206853866577149, + "step": 81980 + }, + { + "epoch": 0.248180043285457, + "grad_norm": 0.12595902383327484, + "learning_rate": 9.42632034369165e-05, + "loss": 1.5895276069641113, + "step": 81990 + }, + { + "epoch": 0.24821031283580283, + "grad_norm": 0.12970294058322906, + "learning_rate": 9.425940825527909e-05, + "loss": 1.5903578758239747, + "step": 82000 + }, + { + "epoch": 0.24821031283580283, + "eval_loss": 1.5771288871765137, + "eval_runtime": 27.3006, + "eval_samples_per_second": 18.315, + "eval_steps_per_second": 1.172, + "step": 82000 + }, + { + "epoch": 0.24824058238614866, + "grad_norm": 0.11290772259235382, + "learning_rate": 9.425561307364171e-05, + "loss": 1.5255401611328125, + "step": 82010 + }, + { + "epoch": 0.24827085193649448, + "grad_norm": 0.1122119203209877, + "learning_rate": 9.42518178920043e-05, + "loss": 1.5715499877929688, + "step": 82020 + }, + { + "epoch": 0.2483011214868403, + "grad_norm": 0.11790797859430313, + "learning_rate": 9.424802271036692e-05, + "loss": 1.5876646041870117, + "step": 82030 + }, + { + "epoch": 0.24833139103718616, + "grad_norm": 0.13631770014762878, + "learning_rate": 9.424422752872951e-05, + "loss": 1.5214564323425293, + "step": 82040 + }, + { + "epoch": 0.24836166058753198, + "grad_norm": 0.12446720153093338, + "learning_rate": 9.424043234709213e-05, + "loss": 1.6114921569824219, + "step": 82050 + }, + { + "epoch": 0.2483919301378778, + "grad_norm": 0.11177624762058258, + "learning_rate": 9.423663716545475e-05, + "loss": 1.5760208129882813, + "step": 82060 + }, + { + "epoch": 0.24842219968822363, + "grad_norm": 0.12468337267637253, + "learning_rate": 9.423284198381735e-05, + "loss": 1.553677463531494, + "step": 82070 + }, + { + "epoch": 0.24845246923856945, + "grad_norm": 0.13629479706287384, + "learning_rate": 9.422904680217997e-05, + "loss": 1.5949710845947265, + "step": 82080 + }, + { + "epoch": 0.2484827387889153, + "grad_norm": 0.10815016180276871, + "learning_rate": 9.422525162054256e-05, + "loss": 1.6044322967529296, + "step": 82090 + }, + { + "epoch": 0.24851300833926113, + "grad_norm": 0.11824129521846771, + "learning_rate": 9.422145643890518e-05, + "loss": 1.5818208694458007, + "step": 82100 + }, + { + "epoch": 0.24854327788960695, + "grad_norm": 0.11870016157627106, + "learning_rate": 9.421766125726777e-05, + "loss": 1.5611760139465332, + "step": 82110 + }, + { + "epoch": 0.24857354743995277, + "grad_norm": 0.11810977011919022, + "learning_rate": 9.421386607563039e-05, + "loss": 1.5936317443847656, + "step": 82120 + }, + { + "epoch": 0.2486038169902986, + "grad_norm": 0.1093871146440506, + "learning_rate": 9.421007089399298e-05, + "loss": 1.5694135665893554, + "step": 82130 + }, + { + "epoch": 0.24863408654064445, + "grad_norm": 0.10954669117927551, + "learning_rate": 9.42062757123556e-05, + "loss": 1.5737911224365235, + "step": 82140 + }, + { + "epoch": 0.24866435609099027, + "grad_norm": 0.10114935785531998, + "learning_rate": 9.42024805307182e-05, + "loss": 1.5811117172241211, + "step": 82150 + }, + { + "epoch": 0.2486946256413361, + "grad_norm": 0.1109304428100586, + "learning_rate": 9.419868534908081e-05, + "loss": 1.5998100280761718, + "step": 82160 + }, + { + "epoch": 0.24872489519168192, + "grad_norm": 0.12157043069601059, + "learning_rate": 9.41948901674434e-05, + "loss": 1.560896110534668, + "step": 82170 + }, + { + "epoch": 0.24875516474202775, + "grad_norm": 0.11966147273778915, + "learning_rate": 9.419109498580602e-05, + "loss": 1.556704616546631, + "step": 82180 + }, + { + "epoch": 0.2487854342923736, + "grad_norm": 0.11741655319929123, + "learning_rate": 9.418729980416863e-05, + "loss": 1.5811979293823242, + "step": 82190 + }, + { + "epoch": 0.24881570384271942, + "grad_norm": 0.12487056106328964, + "learning_rate": 9.418350462253124e-05, + "loss": 1.5890671730041503, + "step": 82200 + }, + { + "epoch": 0.24884597339306525, + "grad_norm": 0.11877283453941345, + "learning_rate": 9.417970944089384e-05, + "loss": 1.5488779067993164, + "step": 82210 + }, + { + "epoch": 0.24887624294341107, + "grad_norm": 0.12706932425498962, + "learning_rate": 9.417591425925645e-05, + "loss": 1.5592130661010741, + "step": 82220 + }, + { + "epoch": 0.2489065124937569, + "grad_norm": 0.11712432652711868, + "learning_rate": 9.417211907761905e-05, + "loss": 1.5457771301269532, + "step": 82230 + }, + { + "epoch": 0.24893678204410274, + "grad_norm": 0.11418253183364868, + "learning_rate": 9.416832389598166e-05, + "loss": 1.5429311752319337, + "step": 82240 + }, + { + "epoch": 0.24896705159444857, + "grad_norm": 0.11508897691965103, + "learning_rate": 9.416452871434427e-05, + "loss": 1.5495475769042968, + "step": 82250 + }, + { + "epoch": 0.2489973211447944, + "grad_norm": 0.11413668096065521, + "learning_rate": 9.416073353270687e-05, + "loss": 1.5646778106689454, + "step": 82260 + }, + { + "epoch": 0.24902759069514022, + "grad_norm": 0.12633995711803436, + "learning_rate": 9.415693835106949e-05, + "loss": 1.5446452140808105, + "step": 82270 + }, + { + "epoch": 0.24905786024548604, + "grad_norm": 0.11014872789382935, + "learning_rate": 9.415314316943208e-05, + "loss": 1.5779056549072266, + "step": 82280 + }, + { + "epoch": 0.2490881297958319, + "grad_norm": 0.11954592913389206, + "learning_rate": 9.41493479877947e-05, + "loss": 1.5687732696533203, + "step": 82290 + }, + { + "epoch": 0.24911839934617772, + "grad_norm": 0.11916156858205795, + "learning_rate": 9.414555280615731e-05, + "loss": 1.6014137268066406, + "step": 82300 + }, + { + "epoch": 0.24914866889652354, + "grad_norm": 0.1264656037092209, + "learning_rate": 9.414175762451992e-05, + "loss": 1.550777053833008, + "step": 82310 + }, + { + "epoch": 0.24917893844686936, + "grad_norm": 0.12536436319351196, + "learning_rate": 9.413796244288252e-05, + "loss": 1.6089387893676759, + "step": 82320 + }, + { + "epoch": 0.24920920799721522, + "grad_norm": 0.11654745042324066, + "learning_rate": 9.413416726124513e-05, + "loss": 1.5877079010009765, + "step": 82330 + }, + { + "epoch": 0.24923947754756104, + "grad_norm": 0.11593074351549149, + "learning_rate": 9.413037207960773e-05, + "loss": 1.5645744323730468, + "step": 82340 + }, + { + "epoch": 0.24926974709790686, + "grad_norm": 0.12697939574718475, + "learning_rate": 9.412657689797034e-05, + "loss": 1.5431496620178222, + "step": 82350 + }, + { + "epoch": 0.2493000166482527, + "grad_norm": 0.11432324349880219, + "learning_rate": 9.412278171633295e-05, + "loss": 1.5560031890869142, + "step": 82360 + }, + { + "epoch": 0.2493302861985985, + "grad_norm": 0.11344140768051147, + "learning_rate": 9.411898653469555e-05, + "loss": 1.5756637573242187, + "step": 82370 + }, + { + "epoch": 0.24936055574894436, + "grad_norm": 0.11839371174573898, + "learning_rate": 9.411519135305816e-05, + "loss": 1.5345190048217774, + "step": 82380 + }, + { + "epoch": 0.2493908252992902, + "grad_norm": 0.11324068158864975, + "learning_rate": 9.411139617142076e-05, + "loss": 1.5723033905029298, + "step": 82390 + }, + { + "epoch": 0.249421094849636, + "grad_norm": 0.11997973173856735, + "learning_rate": 9.410760098978337e-05, + "loss": 1.5204670906066895, + "step": 82400 + }, + { + "epoch": 0.24945136439998183, + "grad_norm": 0.11949123442173004, + "learning_rate": 9.410380580814598e-05, + "loss": 1.595263957977295, + "step": 82410 + }, + { + "epoch": 0.24948163395032766, + "grad_norm": 0.11467595398426056, + "learning_rate": 9.410001062650858e-05, + "loss": 1.528137493133545, + "step": 82420 + }, + { + "epoch": 0.2495119035006735, + "grad_norm": 0.10440559685230255, + "learning_rate": 9.40962154448712e-05, + "loss": 1.5605960845947267, + "step": 82430 + }, + { + "epoch": 0.24954217305101933, + "grad_norm": 0.11368744820356369, + "learning_rate": 9.409242026323379e-05, + "loss": 1.6026838302612305, + "step": 82440 + }, + { + "epoch": 0.24957244260136516, + "grad_norm": 0.11010005325078964, + "learning_rate": 9.408862508159641e-05, + "loss": 1.5621129989624023, + "step": 82450 + }, + { + "epoch": 0.24960271215171098, + "grad_norm": 0.11953739821910858, + "learning_rate": 9.408482989995902e-05, + "loss": 1.562499713897705, + "step": 82460 + }, + { + "epoch": 0.2496329817020568, + "grad_norm": 0.12326007336378098, + "learning_rate": 9.408103471832162e-05, + "loss": 1.5779827117919922, + "step": 82470 + }, + { + "epoch": 0.24966325125240266, + "grad_norm": 0.11416245251893997, + "learning_rate": 9.407723953668423e-05, + "loss": 1.6128055572509765, + "step": 82480 + }, + { + "epoch": 0.24969352080274848, + "grad_norm": 0.12269899249076843, + "learning_rate": 9.407344435504684e-05, + "loss": 1.5596317291259765, + "step": 82490 + }, + { + "epoch": 0.2497237903530943, + "grad_norm": 0.14594188332557678, + "learning_rate": 9.406964917340944e-05, + "loss": 1.5945680618286133, + "step": 82500 + }, + { + "epoch": 0.2497237903530943, + "eval_loss": 1.5805667638778687, + "eval_runtime": 28.1893, + "eval_samples_per_second": 17.737, + "eval_steps_per_second": 1.135, + "step": 82500 + }, + { + "epoch": 0.24975405990344013, + "grad_norm": 0.11266544461250305, + "learning_rate": 9.406585399177205e-05, + "loss": 1.6144948959350587, + "step": 82510 + }, + { + "epoch": 0.24978432945378595, + "grad_norm": 0.11099118739366531, + "learning_rate": 9.406205881013465e-05, + "loss": 1.5496164321899415, + "step": 82520 + }, + { + "epoch": 0.2498145990041318, + "grad_norm": 0.12508392333984375, + "learning_rate": 9.405826362849726e-05, + "loss": 1.5877683639526368, + "step": 82530 + }, + { + "epoch": 0.24984486855447763, + "grad_norm": 0.12724712491035461, + "learning_rate": 9.405446844685987e-05, + "loss": 1.5999064445495605, + "step": 82540 + }, + { + "epoch": 0.24987513810482345, + "grad_norm": 0.12245563417673111, + "learning_rate": 9.405067326522247e-05, + "loss": 1.5461466789245606, + "step": 82550 + }, + { + "epoch": 0.24990540765516928, + "grad_norm": 0.10741952806711197, + "learning_rate": 9.404687808358509e-05, + "loss": 1.5628973960876464, + "step": 82560 + }, + { + "epoch": 0.2499356772055151, + "grad_norm": 0.11109619587659836, + "learning_rate": 9.404308290194768e-05, + "loss": 1.5912169456481933, + "step": 82570 + }, + { + "epoch": 0.24996594675586095, + "grad_norm": 0.12275426834821701, + "learning_rate": 9.40392877203103e-05, + "loss": 1.542238998413086, + "step": 82580 + }, + { + "epoch": 0.24999621630620678, + "grad_norm": 0.13951250910758972, + "learning_rate": 9.40354925386729e-05, + "loss": 1.5473370552062988, + "step": 82590 + }, + { + "epoch": 0.2500264858565526, + "grad_norm": 0.11921439319849014, + "learning_rate": 9.403169735703552e-05, + "loss": 1.5875802993774415, + "step": 82600 + }, + { + "epoch": 0.2500567554068984, + "grad_norm": 0.13826557993888855, + "learning_rate": 9.402790217539811e-05, + "loss": 1.630019187927246, + "step": 82610 + }, + { + "epoch": 0.2500870249572443, + "grad_norm": 0.11995960772037506, + "learning_rate": 9.402410699376073e-05, + "loss": 1.550745391845703, + "step": 82620 + }, + { + "epoch": 0.25011729450759007, + "grad_norm": 0.10745029151439667, + "learning_rate": 9.402031181212332e-05, + "loss": 1.571980094909668, + "step": 82630 + }, + { + "epoch": 0.2501475640579359, + "grad_norm": 0.12490306794643402, + "learning_rate": 9.401651663048594e-05, + "loss": 1.5476943969726562, + "step": 82640 + }, + { + "epoch": 0.2501778336082818, + "grad_norm": 0.120815210044384, + "learning_rate": 9.401272144884853e-05, + "loss": 1.5836686134338378, + "step": 82650 + }, + { + "epoch": 0.25020810315862757, + "grad_norm": 0.11692170053720474, + "learning_rate": 9.400892626721115e-05, + "loss": 1.5755928039550782, + "step": 82660 + }, + { + "epoch": 0.2502383727089734, + "grad_norm": 0.1245240792632103, + "learning_rate": 9.400513108557377e-05, + "loss": 1.5801681518554687, + "step": 82670 + }, + { + "epoch": 0.2502686422593192, + "grad_norm": 0.12964479625225067, + "learning_rate": 9.400133590393636e-05, + "loss": 1.5576718330383301, + "step": 82680 + }, + { + "epoch": 0.25029891180966507, + "grad_norm": 0.11295616626739502, + "learning_rate": 9.399754072229898e-05, + "loss": 1.552357006072998, + "step": 82690 + }, + { + "epoch": 0.2503291813600109, + "grad_norm": 0.12739519774913788, + "learning_rate": 9.399374554066157e-05, + "loss": 1.5547054290771485, + "step": 82700 + }, + { + "epoch": 0.2503594509103567, + "grad_norm": 0.11813486367464066, + "learning_rate": 9.39899503590242e-05, + "loss": 1.5793683052062988, + "step": 82710 + }, + { + "epoch": 0.25038972046070257, + "grad_norm": 0.10137002915143967, + "learning_rate": 9.398615517738679e-05, + "loss": 1.5832868576049806, + "step": 82720 + }, + { + "epoch": 0.25041999001104837, + "grad_norm": 0.10569494217634201, + "learning_rate": 9.39823599957494e-05, + "loss": 1.5827982902526856, + "step": 82730 + }, + { + "epoch": 0.2504502595613942, + "grad_norm": 0.14209862053394318, + "learning_rate": 9.3978564814112e-05, + "loss": 1.5799777030944824, + "step": 82740 + }, + { + "epoch": 0.25048052911174007, + "grad_norm": 0.11796174198389053, + "learning_rate": 9.397476963247462e-05, + "loss": 1.545491600036621, + "step": 82750 + }, + { + "epoch": 0.25051079866208587, + "grad_norm": 0.12445246428251266, + "learning_rate": 9.397097445083721e-05, + "loss": 1.553436851501465, + "step": 82760 + }, + { + "epoch": 0.2505410682124317, + "grad_norm": 0.11441120505332947, + "learning_rate": 9.396717926919983e-05, + "loss": 1.5992615699768067, + "step": 82770 + }, + { + "epoch": 0.2505713377627775, + "grad_norm": 0.11352455615997314, + "learning_rate": 9.396338408756242e-05, + "loss": 1.5574045181274414, + "step": 82780 + }, + { + "epoch": 0.25060160731312336, + "grad_norm": 0.1432936042547226, + "learning_rate": 9.395958890592504e-05, + "loss": 1.61375732421875, + "step": 82790 + }, + { + "epoch": 0.2506318768634692, + "grad_norm": 0.11869531869888306, + "learning_rate": 9.395579372428765e-05, + "loss": 1.546403980255127, + "step": 82800 + }, + { + "epoch": 0.250662146413815, + "grad_norm": 0.11190614104270935, + "learning_rate": 9.395199854265025e-05, + "loss": 1.5663652420043945, + "step": 82810 + }, + { + "epoch": 0.25069241596416086, + "grad_norm": 0.1111435517668724, + "learning_rate": 9.394820336101286e-05, + "loss": 1.5482519149780274, + "step": 82820 + }, + { + "epoch": 0.25072268551450666, + "grad_norm": 0.1361456960439682, + "learning_rate": 9.394440817937547e-05, + "loss": 1.5545198440551757, + "step": 82830 + }, + { + "epoch": 0.2507529550648525, + "grad_norm": 0.11557848751544952, + "learning_rate": 9.394061299773807e-05, + "loss": 1.5730225563049316, + "step": 82840 + }, + { + "epoch": 0.25078322461519836, + "grad_norm": 0.10772501677274704, + "learning_rate": 9.393681781610068e-05, + "loss": 1.601715087890625, + "step": 82850 + }, + { + "epoch": 0.25081349416554416, + "grad_norm": 0.1372593194246292, + "learning_rate": 9.393302263446328e-05, + "loss": 1.5511783599853515, + "step": 82860 + }, + { + "epoch": 0.25084376371589, + "grad_norm": 0.11236710101366043, + "learning_rate": 9.392922745282589e-05, + "loss": 1.5370561599731445, + "step": 82870 + }, + { + "epoch": 0.2508740332662358, + "grad_norm": 0.11744386702775955, + "learning_rate": 9.392543227118851e-05, + "loss": 1.5721545219421387, + "step": 82880 + }, + { + "epoch": 0.25090430281658166, + "grad_norm": 0.12163328379392624, + "learning_rate": 9.39216370895511e-05, + "loss": 1.5874576568603516, + "step": 82890 + }, + { + "epoch": 0.2509345723669275, + "grad_norm": 0.13290518522262573, + "learning_rate": 9.391784190791372e-05, + "loss": 1.5914077758789062, + "step": 82900 + }, + { + "epoch": 0.2509648419172733, + "grad_norm": 0.12135430425405502, + "learning_rate": 9.391404672627631e-05, + "loss": 1.6084749221801757, + "step": 82910 + }, + { + "epoch": 0.25099511146761916, + "grad_norm": 0.12524616718292236, + "learning_rate": 9.391025154463893e-05, + "loss": 1.5744636535644532, + "step": 82920 + }, + { + "epoch": 0.25102538101796495, + "grad_norm": 0.1186794564127922, + "learning_rate": 9.390645636300154e-05, + "loss": 1.556348991394043, + "step": 82930 + }, + { + "epoch": 0.2510556505683108, + "grad_norm": 0.11348115652799606, + "learning_rate": 9.390266118136414e-05, + "loss": 1.585677146911621, + "step": 82940 + }, + { + "epoch": 0.25108592011865666, + "grad_norm": 0.11294157058000565, + "learning_rate": 9.389886599972675e-05, + "loss": 1.5822458267211914, + "step": 82950 + }, + { + "epoch": 0.25111618966900245, + "grad_norm": 0.11092421412467957, + "learning_rate": 9.389507081808936e-05, + "loss": 1.5833097457885743, + "step": 82960 + }, + { + "epoch": 0.2511464592193483, + "grad_norm": 0.13216446340084076, + "learning_rate": 9.389127563645196e-05, + "loss": 1.5699077606201173, + "step": 82970 + }, + { + "epoch": 0.2511767287696941, + "grad_norm": 0.11876118183135986, + "learning_rate": 9.388748045481457e-05, + "loss": 1.534297275543213, + "step": 82980 + }, + { + "epoch": 0.25120699832003995, + "grad_norm": 0.11409081518650055, + "learning_rate": 9.388368527317717e-05, + "loss": 1.5784565925598144, + "step": 82990 + }, + { + "epoch": 0.2512372678703858, + "grad_norm": 0.12235663831233978, + "learning_rate": 9.387989009153978e-05, + "loss": 1.5739727020263672, + "step": 83000 + }, + { + "epoch": 0.2512372678703858, + "eval_loss": 1.5605627298355103, + "eval_runtime": 28.1358, + "eval_samples_per_second": 17.771, + "eval_steps_per_second": 1.137, + "step": 83000 + }, + { + "epoch": 0.2512675374207316, + "grad_norm": 0.11284679919481277, + "learning_rate": 9.387609490990239e-05, + "loss": 1.6056146621704102, + "step": 83010 + }, + { + "epoch": 0.25129780697107745, + "grad_norm": 0.11720363795757294, + "learning_rate": 9.387229972826499e-05, + "loss": 1.5554410934448242, + "step": 83020 + }, + { + "epoch": 0.25132807652142325, + "grad_norm": 0.12241392582654953, + "learning_rate": 9.38685045466276e-05, + "loss": 1.5593006134033203, + "step": 83030 + }, + { + "epoch": 0.2513583460717691, + "grad_norm": 0.12977756559848785, + "learning_rate": 9.386470936499022e-05, + "loss": 1.5410821914672852, + "step": 83040 + }, + { + "epoch": 0.25138861562211495, + "grad_norm": 0.1134873628616333, + "learning_rate": 9.386091418335281e-05, + "loss": 1.5588129043579102, + "step": 83050 + }, + { + "epoch": 0.25141888517246075, + "grad_norm": 0.11595926433801651, + "learning_rate": 9.385711900171543e-05, + "loss": 1.511590576171875, + "step": 83060 + }, + { + "epoch": 0.2514491547228066, + "grad_norm": 0.11761318147182465, + "learning_rate": 9.385332382007804e-05, + "loss": 1.5937030792236329, + "step": 83070 + }, + { + "epoch": 0.2514794242731524, + "grad_norm": 0.10506366193294525, + "learning_rate": 9.384952863844064e-05, + "loss": 1.5515230178833008, + "step": 83080 + }, + { + "epoch": 0.25150969382349825, + "grad_norm": 0.1062421202659607, + "learning_rate": 9.384573345680325e-05, + "loss": 1.5912467956542968, + "step": 83090 + }, + { + "epoch": 0.2515399633738441, + "grad_norm": 0.10581709444522858, + "learning_rate": 9.384193827516585e-05, + "loss": 1.53828125, + "step": 83100 + }, + { + "epoch": 0.2515702329241899, + "grad_norm": 0.11713499575853348, + "learning_rate": 9.383814309352846e-05, + "loss": 1.5385578155517579, + "step": 83110 + }, + { + "epoch": 0.25160050247453575, + "grad_norm": 0.12942181527614594, + "learning_rate": 9.383434791189107e-05, + "loss": 1.548037338256836, + "step": 83120 + }, + { + "epoch": 0.25163077202488154, + "grad_norm": 0.12401198595762253, + "learning_rate": 9.383055273025367e-05, + "loss": 1.5887476921081543, + "step": 83130 + }, + { + "epoch": 0.2516610415752274, + "grad_norm": 0.11416209489107132, + "learning_rate": 9.382675754861628e-05, + "loss": 1.5596159934997558, + "step": 83140 + }, + { + "epoch": 0.25169131112557325, + "grad_norm": 0.12985844910144806, + "learning_rate": 9.382296236697888e-05, + "loss": 1.5947371482849122, + "step": 83150 + }, + { + "epoch": 0.25172158067591904, + "grad_norm": 0.1255258172750473, + "learning_rate": 9.381916718534149e-05, + "loss": 1.5842535972595215, + "step": 83160 + }, + { + "epoch": 0.2517518502262649, + "grad_norm": 0.10932961106300354, + "learning_rate": 9.381537200370411e-05, + "loss": 1.5880038261413574, + "step": 83170 + }, + { + "epoch": 0.2517821197766107, + "grad_norm": 0.1017574891448021, + "learning_rate": 9.38115768220667e-05, + "loss": 1.5791709899902344, + "step": 83180 + }, + { + "epoch": 0.25181238932695654, + "grad_norm": 0.11894583702087402, + "learning_rate": 9.380778164042932e-05, + "loss": 1.5649420738220214, + "step": 83190 + }, + { + "epoch": 0.2518426588773024, + "grad_norm": 0.1188054233789444, + "learning_rate": 9.380398645879191e-05, + "loss": 1.5734735488891602, + "step": 83200 + }, + { + "epoch": 0.2518729284276482, + "grad_norm": 0.11414100974798203, + "learning_rate": 9.380019127715453e-05, + "loss": 1.5707969665527344, + "step": 83210 + }, + { + "epoch": 0.25190319797799404, + "grad_norm": 0.12574318051338196, + "learning_rate": 9.379639609551712e-05, + "loss": 1.5480842590332031, + "step": 83220 + }, + { + "epoch": 0.2519334675283399, + "grad_norm": 0.11253193020820618, + "learning_rate": 9.379260091387974e-05, + "loss": 1.6407342910766602, + "step": 83230 + }, + { + "epoch": 0.2519637370786857, + "grad_norm": 0.09978735446929932, + "learning_rate": 9.378880573224234e-05, + "loss": 1.5937190055847168, + "step": 83240 + }, + { + "epoch": 0.25199400662903154, + "grad_norm": 0.11427462846040726, + "learning_rate": 9.378501055060496e-05, + "loss": 1.5845001220703125, + "step": 83250 + }, + { + "epoch": 0.25202427617937734, + "grad_norm": 0.0999375507235527, + "learning_rate": 9.378121536896755e-05, + "loss": 1.5469074249267578, + "step": 83260 + }, + { + "epoch": 0.2520545457297232, + "grad_norm": 0.13141494989395142, + "learning_rate": 9.377742018733017e-05, + "loss": 1.5651278495788574, + "step": 83270 + }, + { + "epoch": 0.25208481528006904, + "grad_norm": 0.12501218914985657, + "learning_rate": 9.377362500569277e-05, + "loss": 1.5600454330444335, + "step": 83280 + }, + { + "epoch": 0.25211508483041484, + "grad_norm": 0.1104034036397934, + "learning_rate": 9.376982982405538e-05, + "loss": 1.5861302375793458, + "step": 83290 + }, + { + "epoch": 0.2521453543807607, + "grad_norm": 0.11654309928417206, + "learning_rate": 9.3766034642418e-05, + "loss": 1.6207828521728516, + "step": 83300 + }, + { + "epoch": 0.2521756239311065, + "grad_norm": 0.13550180196762085, + "learning_rate": 9.376223946078059e-05, + "loss": 1.553340530395508, + "step": 83310 + }, + { + "epoch": 0.25220589348145234, + "grad_norm": 0.11350897699594498, + "learning_rate": 9.375844427914321e-05, + "loss": 1.5703413009643554, + "step": 83320 + }, + { + "epoch": 0.2522361630317982, + "grad_norm": 0.11302478611469269, + "learning_rate": 9.37546490975058e-05, + "loss": 1.5823614120483398, + "step": 83330 + }, + { + "epoch": 0.252266432582144, + "grad_norm": 0.11643217504024506, + "learning_rate": 9.375085391586842e-05, + "loss": 1.5472543716430665, + "step": 83340 + }, + { + "epoch": 0.25229670213248984, + "grad_norm": 0.13300687074661255, + "learning_rate": 9.374705873423102e-05, + "loss": 1.5714823722839355, + "step": 83350 + }, + { + "epoch": 0.25232697168283563, + "grad_norm": 0.13368327915668488, + "learning_rate": 9.374326355259364e-05, + "loss": 1.5861874580383302, + "step": 83360 + }, + { + "epoch": 0.2523572412331815, + "grad_norm": 0.10999852418899536, + "learning_rate": 9.373946837095623e-05, + "loss": 1.5808198928833008, + "step": 83370 + }, + { + "epoch": 0.25238751078352734, + "grad_norm": 0.11316905915737152, + "learning_rate": 9.373567318931885e-05, + "loss": 1.559709358215332, + "step": 83380 + }, + { + "epoch": 0.25241778033387313, + "grad_norm": 0.13041433691978455, + "learning_rate": 9.373187800768144e-05, + "loss": 1.5721708297729493, + "step": 83390 + }, + { + "epoch": 0.252448049884219, + "grad_norm": 0.11584065109491348, + "learning_rate": 9.372808282604406e-05, + "loss": 1.57164306640625, + "step": 83400 + }, + { + "epoch": 0.2524783194345648, + "grad_norm": 0.12196357548236847, + "learning_rate": 9.372428764440667e-05, + "loss": 1.55112943649292, + "step": 83410 + }, + { + "epoch": 0.25250858898491063, + "grad_norm": 0.11896687746047974, + "learning_rate": 9.372049246276927e-05, + "loss": 1.5451446533203126, + "step": 83420 + }, + { + "epoch": 0.2525388585352565, + "grad_norm": 0.11104856431484222, + "learning_rate": 9.371669728113188e-05, + "loss": 1.5421341896057128, + "step": 83430 + }, + { + "epoch": 0.2525691280856023, + "grad_norm": 0.11765383929014206, + "learning_rate": 9.371290209949448e-05, + "loss": 1.5604095458984375, + "step": 83440 + }, + { + "epoch": 0.25259939763594813, + "grad_norm": 0.10479294508695602, + "learning_rate": 9.370910691785709e-05, + "loss": 1.572705078125, + "step": 83450 + }, + { + "epoch": 0.2526296671862939, + "grad_norm": 0.11995608359575272, + "learning_rate": 9.37053117362197e-05, + "loss": 1.60070743560791, + "step": 83460 + }, + { + "epoch": 0.2526599367366398, + "grad_norm": 0.11791282892227173, + "learning_rate": 9.370151655458231e-05, + "loss": 1.6293561935424805, + "step": 83470 + }, + { + "epoch": 0.25269020628698563, + "grad_norm": 0.12883338332176208, + "learning_rate": 9.36977213729449e-05, + "loss": 1.5498493194580079, + "step": 83480 + }, + { + "epoch": 0.2527204758373314, + "grad_norm": 0.11342132091522217, + "learning_rate": 9.369392619130753e-05, + "loss": 1.6143131256103516, + "step": 83490 + }, + { + "epoch": 0.2527507453876773, + "grad_norm": 0.12823860347270966, + "learning_rate": 9.369013100967012e-05, + "loss": 1.570014762878418, + "step": 83500 + }, + { + "epoch": 0.2527507453876773, + "eval_loss": 1.5738959312438965, + "eval_runtime": 27.9614, + "eval_samples_per_second": 17.882, + "eval_steps_per_second": 1.144, + "step": 83500 + }, + { + "epoch": 0.2527810149380231, + "grad_norm": 0.11035449057817459, + "learning_rate": 9.368633582803274e-05, + "loss": 1.6100589752197265, + "step": 83510 + }, + { + "epoch": 0.2528112844883689, + "grad_norm": 0.11637389659881592, + "learning_rate": 9.368254064639533e-05, + "loss": 1.5644493103027344, + "step": 83520 + }, + { + "epoch": 0.2528415540387148, + "grad_norm": 0.11203605681657791, + "learning_rate": 9.367874546475795e-05, + "loss": 1.5729223251342774, + "step": 83530 + }, + { + "epoch": 0.2528718235890606, + "grad_norm": 0.11409413069486618, + "learning_rate": 9.367495028312056e-05, + "loss": 1.5923389434814452, + "step": 83540 + }, + { + "epoch": 0.2529020931394064, + "grad_norm": 0.11249076575040817, + "learning_rate": 9.367115510148316e-05, + "loss": 1.6005186080932616, + "step": 83550 + }, + { + "epoch": 0.2529323626897522, + "grad_norm": 0.11461137980222702, + "learning_rate": 9.366735991984577e-05, + "loss": 1.5499307632446289, + "step": 83560 + }, + { + "epoch": 0.2529626322400981, + "grad_norm": 0.11876227706670761, + "learning_rate": 9.366356473820837e-05, + "loss": 1.5501611709594727, + "step": 83570 + }, + { + "epoch": 0.2529929017904439, + "grad_norm": 0.11395012587308884, + "learning_rate": 9.365976955657098e-05, + "loss": 1.5600648880004884, + "step": 83580 + }, + { + "epoch": 0.2530231713407897, + "grad_norm": 0.12474647164344788, + "learning_rate": 9.365597437493359e-05, + "loss": 1.5827579498291016, + "step": 83590 + }, + { + "epoch": 0.2530534408911356, + "grad_norm": 0.13263949751853943, + "learning_rate": 9.365217919329619e-05, + "loss": 1.5584188461303712, + "step": 83600 + }, + { + "epoch": 0.25308371044148137, + "grad_norm": 0.1198820024728775, + "learning_rate": 9.36483840116588e-05, + "loss": 1.555152702331543, + "step": 83610 + }, + { + "epoch": 0.2531139799918272, + "grad_norm": 0.11583464592695236, + "learning_rate": 9.36445888300214e-05, + "loss": 1.59997615814209, + "step": 83620 + }, + { + "epoch": 0.25314424954217307, + "grad_norm": 0.12927551567554474, + "learning_rate": 9.364079364838401e-05, + "loss": 1.5661210060119628, + "step": 83630 + }, + { + "epoch": 0.25317451909251887, + "grad_norm": 0.12717300653457642, + "learning_rate": 9.363699846674662e-05, + "loss": 1.595963478088379, + "step": 83640 + }, + { + "epoch": 0.2532047886428647, + "grad_norm": 0.1377556174993515, + "learning_rate": 9.363320328510922e-05, + "loss": 1.5400599479675292, + "step": 83650 + }, + { + "epoch": 0.2532350581932105, + "grad_norm": 0.12237110733985901, + "learning_rate": 9.362940810347183e-05, + "loss": 1.5355920791625977, + "step": 83660 + }, + { + "epoch": 0.25326532774355637, + "grad_norm": 0.11026497930288315, + "learning_rate": 9.362561292183445e-05, + "loss": 1.551908588409424, + "step": 83670 + }, + { + "epoch": 0.2532955972939022, + "grad_norm": 0.10714057832956314, + "learning_rate": 9.362181774019705e-05, + "loss": 1.5932632446289063, + "step": 83680 + }, + { + "epoch": 0.253325866844248, + "grad_norm": 0.11820174753665924, + "learning_rate": 9.361802255855966e-05, + "loss": 1.5773784637451171, + "step": 83690 + }, + { + "epoch": 0.25335613639459387, + "grad_norm": 0.12085350602865219, + "learning_rate": 9.361422737692226e-05, + "loss": 1.5612686157226563, + "step": 83700 + }, + { + "epoch": 0.25338640594493966, + "grad_norm": 0.11667492985725403, + "learning_rate": 9.361043219528487e-05, + "loss": 1.5925448417663575, + "step": 83710 + }, + { + "epoch": 0.2534166754952855, + "grad_norm": 0.11702676117420197, + "learning_rate": 9.360663701364748e-05, + "loss": 1.545024585723877, + "step": 83720 + }, + { + "epoch": 0.25344694504563137, + "grad_norm": 0.122815802693367, + "learning_rate": 9.360284183201008e-05, + "loss": 1.561077880859375, + "step": 83730 + }, + { + "epoch": 0.25347721459597716, + "grad_norm": 0.11529561877250671, + "learning_rate": 9.359904665037269e-05, + "loss": 1.5302183151245117, + "step": 83740 + }, + { + "epoch": 0.253507484146323, + "grad_norm": 0.12644854187965393, + "learning_rate": 9.35952514687353e-05, + "loss": 1.5639856338500977, + "step": 83750 + }, + { + "epoch": 0.2535377536966688, + "grad_norm": 0.1072029396891594, + "learning_rate": 9.35914562870979e-05, + "loss": 1.5939296722412108, + "step": 83760 + }, + { + "epoch": 0.25356802324701466, + "grad_norm": 0.13014233112335205, + "learning_rate": 9.35876611054605e-05, + "loss": 1.5613807678222655, + "step": 83770 + }, + { + "epoch": 0.2535982927973605, + "grad_norm": 0.12011898308992386, + "learning_rate": 9.358386592382313e-05, + "loss": 1.5415534973144531, + "step": 83780 + }, + { + "epoch": 0.2536285623477063, + "grad_norm": 0.11849434673786163, + "learning_rate": 9.358007074218572e-05, + "loss": 1.5580087661743165, + "step": 83790 + }, + { + "epoch": 0.25365883189805216, + "grad_norm": 0.11941812932491302, + "learning_rate": 9.357627556054834e-05, + "loss": 1.5589677810668945, + "step": 83800 + }, + { + "epoch": 0.25368910144839796, + "grad_norm": 0.10569500178098679, + "learning_rate": 9.357248037891093e-05, + "loss": 1.5748629570007324, + "step": 83810 + }, + { + "epoch": 0.2537193709987438, + "grad_norm": 0.11700423061847687, + "learning_rate": 9.356868519727355e-05, + "loss": 1.5822152137756347, + "step": 83820 + }, + { + "epoch": 0.25374964054908966, + "grad_norm": 0.11995216459035873, + "learning_rate": 9.356489001563614e-05, + "loss": 1.5329524040222169, + "step": 83830 + }, + { + "epoch": 0.25377991009943546, + "grad_norm": 0.11545180529356003, + "learning_rate": 9.356109483399876e-05, + "loss": 1.6040538787841796, + "step": 83840 + }, + { + "epoch": 0.2538101796497813, + "grad_norm": 0.1207619160413742, + "learning_rate": 9.355729965236135e-05, + "loss": 1.5601648330688476, + "step": 83850 + }, + { + "epoch": 0.2538404492001271, + "grad_norm": 0.11934348195791245, + "learning_rate": 9.355350447072397e-05, + "loss": 1.579537296295166, + "step": 83860 + }, + { + "epoch": 0.25387071875047296, + "grad_norm": 0.12509430944919586, + "learning_rate": 9.354970928908657e-05, + "loss": 1.5930461883544922, + "step": 83870 + }, + { + "epoch": 0.2539009883008188, + "grad_norm": 0.13696037232875824, + "learning_rate": 9.354591410744919e-05, + "loss": 1.5524288177490235, + "step": 83880 + }, + { + "epoch": 0.2539312578511646, + "grad_norm": 0.10574410855770111, + "learning_rate": 9.354211892581179e-05, + "loss": 1.586740779876709, + "step": 83890 + }, + { + "epoch": 0.25396152740151046, + "grad_norm": 0.11593347787857056, + "learning_rate": 9.35383237441744e-05, + "loss": 1.5394770622253418, + "step": 83900 + }, + { + "epoch": 0.2539917969518563, + "grad_norm": 0.13781966269016266, + "learning_rate": 9.353452856253702e-05, + "loss": 1.5635057449340821, + "step": 83910 + }, + { + "epoch": 0.2540220665022021, + "grad_norm": 0.130576029419899, + "learning_rate": 9.353073338089961e-05, + "loss": 1.6012619018554688, + "step": 83920 + }, + { + "epoch": 0.25405233605254796, + "grad_norm": 0.1103973314166069, + "learning_rate": 9.352693819926223e-05, + "loss": 1.5580979347229005, + "step": 83930 + }, + { + "epoch": 0.25408260560289375, + "grad_norm": 0.10577215254306793, + "learning_rate": 9.352314301762482e-05, + "loss": 1.5394359588623048, + "step": 83940 + }, + { + "epoch": 0.2541128751532396, + "grad_norm": 0.12204470485448837, + "learning_rate": 9.351934783598744e-05, + "loss": 1.5669647216796876, + "step": 83950 + }, + { + "epoch": 0.25414314470358546, + "grad_norm": 0.12817198038101196, + "learning_rate": 9.351555265435003e-05, + "loss": 1.6071903228759765, + "step": 83960 + }, + { + "epoch": 0.25417341425393125, + "grad_norm": 0.10482898354530334, + "learning_rate": 9.351175747271265e-05, + "loss": 1.599241352081299, + "step": 83970 + }, + { + "epoch": 0.2542036838042771, + "grad_norm": 0.12326013296842575, + "learning_rate": 9.350796229107524e-05, + "loss": 1.588218879699707, + "step": 83980 + }, + { + "epoch": 0.2542339533546229, + "grad_norm": 0.11718589812517166, + "learning_rate": 9.350416710943786e-05, + "loss": 1.5437514305114746, + "step": 83990 + }, + { + "epoch": 0.25426422290496875, + "grad_norm": 0.12119022756814957, + "learning_rate": 9.350037192780046e-05, + "loss": 1.6096611022949219, + "step": 84000 + }, + { + "epoch": 0.25426422290496875, + "eval_loss": 1.5636337995529175, + "eval_runtime": 28.0083, + "eval_samples_per_second": 17.852, + "eval_steps_per_second": 1.143, + "step": 84000 + }, + { + "epoch": 0.2542944924553146, + "grad_norm": 0.12188898026943207, + "learning_rate": 9.349657674616308e-05, + "loss": 1.5562941551208496, + "step": 84010 + }, + { + "epoch": 0.2543247620056604, + "grad_norm": 0.10850316286087036, + "learning_rate": 9.349278156452567e-05, + "loss": 1.5464807510375977, + "step": 84020 + }, + { + "epoch": 0.25435503155600625, + "grad_norm": 0.11454358696937561, + "learning_rate": 9.348898638288829e-05, + "loss": 1.5837288856506349, + "step": 84030 + }, + { + "epoch": 0.25438530110635205, + "grad_norm": 0.1142067164182663, + "learning_rate": 9.34851912012509e-05, + "loss": 1.5630770683288575, + "step": 84040 + }, + { + "epoch": 0.2544155706566979, + "grad_norm": 0.13122844696044922, + "learning_rate": 9.34813960196135e-05, + "loss": 1.5564249992370605, + "step": 84050 + }, + { + "epoch": 0.25444584020704375, + "grad_norm": 0.11162245273590088, + "learning_rate": 9.34776008379761e-05, + "loss": 1.5733586311340333, + "step": 84060 + }, + { + "epoch": 0.25447610975738955, + "grad_norm": 0.11669693142175674, + "learning_rate": 9.347380565633871e-05, + "loss": 1.536578369140625, + "step": 84070 + }, + { + "epoch": 0.2545063793077354, + "grad_norm": 0.11738249659538269, + "learning_rate": 9.347001047470133e-05, + "loss": 1.5713436126708984, + "step": 84080 + }, + { + "epoch": 0.2545366488580812, + "grad_norm": 0.12406932562589645, + "learning_rate": 9.346621529306392e-05, + "loss": 1.5825839042663574, + "step": 84090 + }, + { + "epoch": 0.25456691840842705, + "grad_norm": 0.13445286452770233, + "learning_rate": 9.346242011142654e-05, + "loss": 1.5386601448059083, + "step": 84100 + }, + { + "epoch": 0.2545971879587729, + "grad_norm": 0.11668741703033447, + "learning_rate": 9.345862492978914e-05, + "loss": 1.5575684547424316, + "step": 84110 + }, + { + "epoch": 0.2546274575091187, + "grad_norm": 0.125377357006073, + "learning_rate": 9.345482974815176e-05, + "loss": 1.5930562973022462, + "step": 84120 + }, + { + "epoch": 0.25465772705946454, + "grad_norm": 0.13152626156806946, + "learning_rate": 9.345103456651435e-05, + "loss": 1.5594869613647462, + "step": 84130 + }, + { + "epoch": 0.25468799660981034, + "grad_norm": 0.1304105818271637, + "learning_rate": 9.344723938487697e-05, + "loss": 1.578316879272461, + "step": 84140 + }, + { + "epoch": 0.2547182661601562, + "grad_norm": 0.1081632599234581, + "learning_rate": 9.344344420323957e-05, + "loss": 1.6094169616699219, + "step": 84150 + }, + { + "epoch": 0.25474853571050204, + "grad_norm": 0.1295030117034912, + "learning_rate": 9.343964902160218e-05, + "loss": 1.5401494979858399, + "step": 84160 + }, + { + "epoch": 0.25477880526084784, + "grad_norm": 0.11459890007972717, + "learning_rate": 9.343585383996479e-05, + "loss": 1.561417007446289, + "step": 84170 + }, + { + "epoch": 0.2548090748111937, + "grad_norm": 0.11545808613300323, + "learning_rate": 9.343205865832739e-05, + "loss": 1.579824447631836, + "step": 84180 + }, + { + "epoch": 0.2548393443615395, + "grad_norm": 0.10221225768327713, + "learning_rate": 9.342826347669e-05, + "loss": 1.5511920928955079, + "step": 84190 + }, + { + "epoch": 0.25486961391188534, + "grad_norm": 0.12407203763723373, + "learning_rate": 9.34244682950526e-05, + "loss": 1.5716269493103028, + "step": 84200 + }, + { + "epoch": 0.2548998834622312, + "grad_norm": 0.1146177127957344, + "learning_rate": 9.342067311341521e-05, + "loss": 1.5792106628417968, + "step": 84210 + }, + { + "epoch": 0.254930153012577, + "grad_norm": 0.13064755499362946, + "learning_rate": 9.341687793177781e-05, + "loss": 1.5767114639282227, + "step": 84220 + }, + { + "epoch": 0.25496042256292284, + "grad_norm": 0.10403630137443542, + "learning_rate": 9.341308275014042e-05, + "loss": 1.5794086456298828, + "step": 84230 + }, + { + "epoch": 0.25499069211326864, + "grad_norm": 0.11257929354906082, + "learning_rate": 9.340928756850303e-05, + "loss": 1.5401123046875, + "step": 84240 + }, + { + "epoch": 0.2550209616636145, + "grad_norm": 0.11103415489196777, + "learning_rate": 9.340549238686563e-05, + "loss": 1.5769609451293944, + "step": 84250 + }, + { + "epoch": 0.25505123121396034, + "grad_norm": 0.11154523491859436, + "learning_rate": 9.340169720522824e-05, + "loss": 1.5893139839172363, + "step": 84260 + }, + { + "epoch": 0.25508150076430613, + "grad_norm": 0.11156847327947617, + "learning_rate": 9.339790202359084e-05, + "loss": 1.5864608764648438, + "step": 84270 + }, + { + "epoch": 0.255111770314652, + "grad_norm": 0.1192198097705841, + "learning_rate": 9.339410684195346e-05, + "loss": 1.612905502319336, + "step": 84280 + }, + { + "epoch": 0.2551420398649978, + "grad_norm": 0.1275477409362793, + "learning_rate": 9.339031166031607e-05, + "loss": 1.5199512481689452, + "step": 84290 + }, + { + "epoch": 0.25517230941534363, + "grad_norm": 0.1094459667801857, + "learning_rate": 9.338651647867868e-05, + "loss": 1.568350601196289, + "step": 84300 + }, + { + "epoch": 0.2552025789656895, + "grad_norm": 0.11889929324388504, + "learning_rate": 9.338272129704128e-05, + "loss": 1.5759393692016601, + "step": 84310 + }, + { + "epoch": 0.2552328485160353, + "grad_norm": 0.11145336180925369, + "learning_rate": 9.337892611540389e-05, + "loss": 1.5914506912231445, + "step": 84320 + }, + { + "epoch": 0.25526311806638113, + "grad_norm": 0.11569883674383163, + "learning_rate": 9.33751309337665e-05, + "loss": 1.5630127906799316, + "step": 84330 + }, + { + "epoch": 0.25529338761672693, + "grad_norm": 0.12634411454200745, + "learning_rate": 9.33713357521291e-05, + "loss": 1.5678466796875, + "step": 84340 + }, + { + "epoch": 0.2553236571670728, + "grad_norm": 0.11549276113510132, + "learning_rate": 9.33675405704917e-05, + "loss": 1.5460586547851562, + "step": 84350 + }, + { + "epoch": 0.25535392671741863, + "grad_norm": 0.108894944190979, + "learning_rate": 9.336374538885431e-05, + "loss": 1.5581535339355468, + "step": 84360 + }, + { + "epoch": 0.25538419626776443, + "grad_norm": 0.1422160416841507, + "learning_rate": 9.335995020721692e-05, + "loss": 1.5631338119506837, + "step": 84370 + }, + { + "epoch": 0.2554144658181103, + "grad_norm": 0.11956323683261871, + "learning_rate": 9.335615502557952e-05, + "loss": 1.617306900024414, + "step": 84380 + }, + { + "epoch": 0.2554447353684561, + "grad_norm": 0.11948923766613007, + "learning_rate": 9.335235984394213e-05, + "loss": 1.5824328422546388, + "step": 84390 + }, + { + "epoch": 0.25547500491880193, + "grad_norm": 0.11565553396940231, + "learning_rate": 9.334856466230474e-05, + "loss": 1.6033706665039062, + "step": 84400 + }, + { + "epoch": 0.2555052744691478, + "grad_norm": 0.12091617286205292, + "learning_rate": 9.334476948066735e-05, + "loss": 1.5547025680541993, + "step": 84410 + }, + { + "epoch": 0.2555355440194936, + "grad_norm": 0.11801750957965851, + "learning_rate": 9.334097429902995e-05, + "loss": 1.5673402786254882, + "step": 84420 + }, + { + "epoch": 0.25556581356983943, + "grad_norm": 0.1038905531167984, + "learning_rate": 9.333717911739257e-05, + "loss": 1.5718752861022949, + "step": 84430 + }, + { + "epoch": 0.2555960831201852, + "grad_norm": 0.11098361015319824, + "learning_rate": 9.333338393575516e-05, + "loss": 1.5599118232727052, + "step": 84440 + }, + { + "epoch": 0.2556263526705311, + "grad_norm": 0.1234486773610115, + "learning_rate": 9.332958875411778e-05, + "loss": 1.5655092239379882, + "step": 84450 + }, + { + "epoch": 0.25565662222087693, + "grad_norm": 0.11699392646551132, + "learning_rate": 9.332579357248037e-05, + "loss": 1.5885679244995117, + "step": 84460 + }, + { + "epoch": 0.2556868917712227, + "grad_norm": 0.12412359565496445, + "learning_rate": 9.332199839084299e-05, + "loss": 1.5569023132324218, + "step": 84470 + }, + { + "epoch": 0.2557171613215686, + "grad_norm": 0.11358339339494705, + "learning_rate": 9.331820320920558e-05, + "loss": 1.5678133010864257, + "step": 84480 + }, + { + "epoch": 0.25574743087191437, + "grad_norm": 0.12758931517601013, + "learning_rate": 9.33144080275682e-05, + "loss": 1.5265462875366211, + "step": 84490 + }, + { + "epoch": 0.2557777004222602, + "grad_norm": 0.14495672285556793, + "learning_rate": 9.331061284593081e-05, + "loss": 1.5356689453125, + "step": 84500 + }, + { + "epoch": 0.2557777004222602, + "eval_loss": 1.5722476243972778, + "eval_runtime": 28.188, + "eval_samples_per_second": 17.738, + "eval_steps_per_second": 1.135, + "step": 84500 + }, + { + "epoch": 0.2558079699726061, + "grad_norm": 0.12196695804595947, + "learning_rate": 9.330681766429341e-05, + "loss": 1.5844405174255372, + "step": 84510 + }, + { + "epoch": 0.25583823952295187, + "grad_norm": 0.11859823018312454, + "learning_rate": 9.330302248265603e-05, + "loss": 1.5359905242919922, + "step": 84520 + }, + { + "epoch": 0.2558685090732977, + "grad_norm": 0.10902465134859085, + "learning_rate": 9.329922730101863e-05, + "loss": 1.5738292694091798, + "step": 84530 + }, + { + "epoch": 0.2558987786236435, + "grad_norm": 0.12228580564260483, + "learning_rate": 9.329543211938125e-05, + "loss": 1.614243507385254, + "step": 84540 + }, + { + "epoch": 0.25592904817398937, + "grad_norm": 0.11356386542320251, + "learning_rate": 9.329163693774384e-05, + "loss": 1.5895983695983886, + "step": 84550 + }, + { + "epoch": 0.2559593177243352, + "grad_norm": 0.12044434994459152, + "learning_rate": 9.328784175610646e-05, + "loss": 1.5551471710205078, + "step": 84560 + }, + { + "epoch": 0.255989587274681, + "grad_norm": 0.11685251444578171, + "learning_rate": 9.328404657446905e-05, + "loss": 1.5537712097167968, + "step": 84570 + }, + { + "epoch": 0.25601985682502687, + "grad_norm": 0.10908888280391693, + "learning_rate": 9.328025139283167e-05, + "loss": 1.567917823791504, + "step": 84580 + }, + { + "epoch": 0.25605012637537267, + "grad_norm": 0.11995358765125275, + "learning_rate": 9.327645621119426e-05, + "loss": 1.541079330444336, + "step": 84590 + }, + { + "epoch": 0.2560803959257185, + "grad_norm": 0.11317172646522522, + "learning_rate": 9.327266102955688e-05, + "loss": 1.5920563697814942, + "step": 84600 + }, + { + "epoch": 0.25611066547606437, + "grad_norm": 0.11975333839654922, + "learning_rate": 9.326886584791947e-05, + "loss": 1.5824941635131835, + "step": 84610 + }, + { + "epoch": 0.25614093502641017, + "grad_norm": 0.1282220333814621, + "learning_rate": 9.32650706662821e-05, + "loss": 1.5790056228637694, + "step": 84620 + }, + { + "epoch": 0.256171204576756, + "grad_norm": 0.11305999010801315, + "learning_rate": 9.326127548464469e-05, + "loss": 1.5967493057250977, + "step": 84630 + }, + { + "epoch": 0.25620147412710187, + "grad_norm": 0.12995947897434235, + "learning_rate": 9.32574803030073e-05, + "loss": 1.5710444450378418, + "step": 84640 + }, + { + "epoch": 0.25623174367744767, + "grad_norm": 0.1244136169552803, + "learning_rate": 9.325368512136991e-05, + "loss": 1.5586427688598632, + "step": 84650 + }, + { + "epoch": 0.2562620132277935, + "grad_norm": 0.11023499071598053, + "learning_rate": 9.324988993973252e-05, + "loss": 1.574270248413086, + "step": 84660 + }, + { + "epoch": 0.2562922827781393, + "grad_norm": 0.13562670350074768, + "learning_rate": 9.324609475809512e-05, + "loss": 1.5576122283935547, + "step": 84670 + }, + { + "epoch": 0.25632255232848516, + "grad_norm": 0.11669154465198517, + "learning_rate": 9.324229957645773e-05, + "loss": 1.575755500793457, + "step": 84680 + }, + { + "epoch": 0.256352821878831, + "grad_norm": 0.10811840742826462, + "learning_rate": 9.323850439482035e-05, + "loss": 1.593923568725586, + "step": 84690 + }, + { + "epoch": 0.2563830914291768, + "grad_norm": 0.13103584945201874, + "learning_rate": 9.323470921318294e-05, + "loss": 1.5577651977539062, + "step": 84700 + }, + { + "epoch": 0.25641336097952266, + "grad_norm": 0.11147695034742355, + "learning_rate": 9.323091403154556e-05, + "loss": 1.549829864501953, + "step": 84710 + }, + { + "epoch": 0.25644363052986846, + "grad_norm": 0.10975322872400284, + "learning_rate": 9.322711884990815e-05, + "loss": 1.6010265350341797, + "step": 84720 + }, + { + "epoch": 0.2564739000802143, + "grad_norm": 0.12368320673704147, + "learning_rate": 9.322332366827077e-05, + "loss": 1.564546489715576, + "step": 84730 + }, + { + "epoch": 0.25650416963056016, + "grad_norm": 0.11496024578809738, + "learning_rate": 9.321952848663336e-05, + "loss": 1.5906922340393066, + "step": 84740 + }, + { + "epoch": 0.25653443918090596, + "grad_norm": 0.11881797760725021, + "learning_rate": 9.321573330499598e-05, + "loss": 1.5616605758666993, + "step": 84750 + }, + { + "epoch": 0.2565647087312518, + "grad_norm": 0.11550948023796082, + "learning_rate": 9.321193812335858e-05, + "loss": 1.6020204544067382, + "step": 84760 + }, + { + "epoch": 0.2565949782815976, + "grad_norm": 0.1094415932893753, + "learning_rate": 9.32081429417212e-05, + "loss": 1.5571358680725098, + "step": 84770 + }, + { + "epoch": 0.25662524783194346, + "grad_norm": 0.10905427485704422, + "learning_rate": 9.32043477600838e-05, + "loss": 1.6018299102783202, + "step": 84780 + }, + { + "epoch": 0.2566555173822893, + "grad_norm": 0.13746553659439087, + "learning_rate": 9.320055257844641e-05, + "loss": 1.5674989700317383, + "step": 84790 + }, + { + "epoch": 0.2566857869326351, + "grad_norm": 0.11362048983573914, + "learning_rate": 9.319675739680901e-05, + "loss": 1.5605317115783692, + "step": 84800 + }, + { + "epoch": 0.25671605648298096, + "grad_norm": 0.12149073928594589, + "learning_rate": 9.319296221517162e-05, + "loss": 1.5817611694335938, + "step": 84810 + }, + { + "epoch": 0.25674632603332675, + "grad_norm": 0.11022274941205978, + "learning_rate": 9.318916703353423e-05, + "loss": 1.5659481048583985, + "step": 84820 + }, + { + "epoch": 0.2567765955836726, + "grad_norm": 0.11349903792142868, + "learning_rate": 9.318537185189683e-05, + "loss": 1.5722951889038086, + "step": 84830 + }, + { + "epoch": 0.25680686513401846, + "grad_norm": 0.11140155047178268, + "learning_rate": 9.318157667025944e-05, + "loss": 1.5614029884338378, + "step": 84840 + }, + { + "epoch": 0.25683713468436425, + "grad_norm": 0.11162441223859787, + "learning_rate": 9.317778148862204e-05, + "loss": 1.595974063873291, + "step": 84850 + }, + { + "epoch": 0.2568674042347101, + "grad_norm": 0.11918336898088455, + "learning_rate": 9.317398630698465e-05, + "loss": 1.5254283905029298, + "step": 84860 + }, + { + "epoch": 0.2568976737850559, + "grad_norm": 0.12229431420564651, + "learning_rate": 9.317019112534726e-05, + "loss": 1.5468582153320312, + "step": 84870 + }, + { + "epoch": 0.25692794333540175, + "grad_norm": 0.12074455618858337, + "learning_rate": 9.316639594370986e-05, + "loss": 1.5689000129699706, + "step": 84880 + }, + { + "epoch": 0.2569582128857476, + "grad_norm": 0.11201819032430649, + "learning_rate": 9.316260076207248e-05, + "loss": 1.5409265518188477, + "step": 84890 + }, + { + "epoch": 0.2569884824360934, + "grad_norm": 0.11319691687822342, + "learning_rate": 9.315880558043509e-05, + "loss": 1.5578710556030273, + "step": 84900 + }, + { + "epoch": 0.25701875198643925, + "grad_norm": 0.11842662841081619, + "learning_rate": 9.315501039879769e-05, + "loss": 1.5600520133972169, + "step": 84910 + }, + { + "epoch": 0.25704902153678505, + "grad_norm": 0.12207592278718948, + "learning_rate": 9.31512152171603e-05, + "loss": 1.5643531799316406, + "step": 84920 + }, + { + "epoch": 0.2570792910871309, + "grad_norm": 0.11941684037446976, + "learning_rate": 9.31474200355229e-05, + "loss": 1.5912954330444335, + "step": 84930 + }, + { + "epoch": 0.25710956063747675, + "grad_norm": 0.12811671197414398, + "learning_rate": 9.314362485388551e-05, + "loss": 1.5478272438049316, + "step": 84940 + }, + { + "epoch": 0.25713983018782255, + "grad_norm": 0.10320945084095001, + "learning_rate": 9.313982967224812e-05, + "loss": 1.5831205368041992, + "step": 84950 + }, + { + "epoch": 0.2571700997381684, + "grad_norm": 0.11537372320890427, + "learning_rate": 9.313603449061072e-05, + "loss": 1.6065906524658202, + "step": 84960 + }, + { + "epoch": 0.2572003692885142, + "grad_norm": 0.11039592325687408, + "learning_rate": 9.313223930897333e-05, + "loss": 1.574587059020996, + "step": 84970 + }, + { + "epoch": 0.25723063883886005, + "grad_norm": 0.11383630335330963, + "learning_rate": 9.312844412733593e-05, + "loss": 1.5669122695922852, + "step": 84980 + }, + { + "epoch": 0.2572609083892059, + "grad_norm": 0.11237668246030807, + "learning_rate": 9.312464894569854e-05, + "loss": 1.5503395080566407, + "step": 84990 + }, + { + "epoch": 0.2572911779395517, + "grad_norm": 0.12459275871515274, + "learning_rate": 9.312085376406115e-05, + "loss": 1.5640536308288575, + "step": 85000 + }, + { + "epoch": 0.2572911779395517, + "eval_loss": 1.5845297574996948, + "eval_runtime": 27.6396, + "eval_samples_per_second": 18.09, + "eval_steps_per_second": 1.158, + "step": 85000 + }, + { + "epoch": 0.25732144748989755, + "grad_norm": 0.11455885320901871, + "learning_rate": 9.311705858242375e-05, + "loss": 1.5697881698608398, + "step": 85010 + }, + { + "epoch": 0.25735171704024334, + "grad_norm": 0.10056594759225845, + "learning_rate": 9.311326340078637e-05, + "loss": 1.5386737823486327, + "step": 85020 + }, + { + "epoch": 0.2573819865905892, + "grad_norm": 0.11448575556278229, + "learning_rate": 9.310946821914896e-05, + "loss": 1.575531005859375, + "step": 85030 + }, + { + "epoch": 0.25741225614093505, + "grad_norm": 0.11405046284198761, + "learning_rate": 9.310567303751158e-05, + "loss": 1.541957950592041, + "step": 85040 + }, + { + "epoch": 0.25744252569128084, + "grad_norm": 0.12015824764966965, + "learning_rate": 9.310187785587418e-05, + "loss": 1.5765695571899414, + "step": 85050 + }, + { + "epoch": 0.2574727952416267, + "grad_norm": 0.13159216940402985, + "learning_rate": 9.30980826742368e-05, + "loss": 1.606462287902832, + "step": 85060 + }, + { + "epoch": 0.2575030647919725, + "grad_norm": 0.11346719413995743, + "learning_rate": 9.309428749259939e-05, + "loss": 1.5676610946655274, + "step": 85070 + }, + { + "epoch": 0.25753333434231834, + "grad_norm": 0.1110466718673706, + "learning_rate": 9.309049231096201e-05, + "loss": 1.5488883972167968, + "step": 85080 + }, + { + "epoch": 0.2575636038926642, + "grad_norm": 0.11696851253509521, + "learning_rate": 9.30866971293246e-05, + "loss": 1.6037208557128906, + "step": 85090 + }, + { + "epoch": 0.25759387344301, + "grad_norm": 0.10554622858762741, + "learning_rate": 9.308290194768722e-05, + "loss": 1.5859253883361817, + "step": 85100 + }, + { + "epoch": 0.25762414299335584, + "grad_norm": 0.11785160005092621, + "learning_rate": 9.307910676604983e-05, + "loss": 1.5696466445922852, + "step": 85110 + }, + { + "epoch": 0.25765441254370164, + "grad_norm": 0.12745952606201172, + "learning_rate": 9.307531158441243e-05, + "loss": 1.6089237213134766, + "step": 85120 + }, + { + "epoch": 0.2576846820940475, + "grad_norm": 0.1244540587067604, + "learning_rate": 9.307151640277505e-05, + "loss": 1.5246759414672852, + "step": 85130 + }, + { + "epoch": 0.25771495164439334, + "grad_norm": 0.10421522706747055, + "learning_rate": 9.306772122113764e-05, + "loss": 1.6044017791748046, + "step": 85140 + }, + { + "epoch": 0.25774522119473914, + "grad_norm": 0.11907696723937988, + "learning_rate": 9.306392603950026e-05, + "loss": 1.5802823066711427, + "step": 85150 + }, + { + "epoch": 0.257775490745085, + "grad_norm": 0.16136762499809265, + "learning_rate": 9.306013085786286e-05, + "loss": 1.5333992958068847, + "step": 85160 + }, + { + "epoch": 0.2578057602954308, + "grad_norm": 0.11613138020038605, + "learning_rate": 9.305633567622547e-05, + "loss": 1.59226655960083, + "step": 85170 + }, + { + "epoch": 0.25783602984577664, + "grad_norm": 0.11545933783054352, + "learning_rate": 9.305254049458807e-05, + "loss": 1.544610595703125, + "step": 85180 + }, + { + "epoch": 0.2578662993961225, + "grad_norm": 0.10671111941337585, + "learning_rate": 9.304874531295069e-05, + "loss": 1.5705280303955078, + "step": 85190 + }, + { + "epoch": 0.2578965689464683, + "grad_norm": 0.11047039926052094, + "learning_rate": 9.304495013131328e-05, + "loss": 1.582765769958496, + "step": 85200 + }, + { + "epoch": 0.25792683849681414, + "grad_norm": 0.11951063573360443, + "learning_rate": 9.30411549496759e-05, + "loss": 1.5871036529541016, + "step": 85210 + }, + { + "epoch": 0.25795710804715993, + "grad_norm": 0.12170222401618958, + "learning_rate": 9.303735976803849e-05, + "loss": 1.553635025024414, + "step": 85220 + }, + { + "epoch": 0.2579873775975058, + "grad_norm": 0.14017625153064728, + "learning_rate": 9.303356458640111e-05, + "loss": 1.5613500595092773, + "step": 85230 + }, + { + "epoch": 0.25801764714785164, + "grad_norm": 0.12667493522167206, + "learning_rate": 9.30297694047637e-05, + "loss": 1.5636945724487306, + "step": 85240 + }, + { + "epoch": 0.25804791669819743, + "grad_norm": 0.1058012843132019, + "learning_rate": 9.302597422312632e-05, + "loss": 1.5743663787841797, + "step": 85250 + }, + { + "epoch": 0.2580781862485433, + "grad_norm": 0.11592474579811096, + "learning_rate": 9.302217904148893e-05, + "loss": 1.5684152603149415, + "step": 85260 + }, + { + "epoch": 0.2581084557988891, + "grad_norm": 0.11840420961380005, + "learning_rate": 9.301838385985153e-05, + "loss": 1.5869494438171388, + "step": 85270 + }, + { + "epoch": 0.25813872534923493, + "grad_norm": 0.10752755403518677, + "learning_rate": 9.301458867821414e-05, + "loss": 1.6070768356323242, + "step": 85280 + }, + { + "epoch": 0.2581689948995808, + "grad_norm": 0.13185851275920868, + "learning_rate": 9.301079349657675e-05, + "loss": 1.5764079093933105, + "step": 85290 + }, + { + "epoch": 0.2581992644499266, + "grad_norm": 0.12229876220226288, + "learning_rate": 9.300699831493937e-05, + "loss": 1.5293212890625, + "step": 85300 + }, + { + "epoch": 0.25822953400027243, + "grad_norm": 0.1080591082572937, + "learning_rate": 9.300320313330196e-05, + "loss": 1.5796480178833008, + "step": 85310 + }, + { + "epoch": 0.2582598035506183, + "grad_norm": 0.11317242681980133, + "learning_rate": 9.299940795166458e-05, + "loss": 1.558726692199707, + "step": 85320 + }, + { + "epoch": 0.2582900731009641, + "grad_norm": 0.1192002147436142, + "learning_rate": 9.299561277002717e-05, + "loss": 1.5971759796142577, + "step": 85330 + }, + { + "epoch": 0.25832034265130993, + "grad_norm": 0.11855309456586838, + "learning_rate": 9.299181758838979e-05, + "loss": 1.5737186431884767, + "step": 85340 + }, + { + "epoch": 0.2583506122016557, + "grad_norm": 0.11138457804918289, + "learning_rate": 9.298802240675238e-05, + "loss": 1.5719975471496581, + "step": 85350 + }, + { + "epoch": 0.2583808817520016, + "grad_norm": 0.12949426472187042, + "learning_rate": 9.2984227225115e-05, + "loss": 1.5319852828979492, + "step": 85360 + }, + { + "epoch": 0.25841115130234743, + "grad_norm": 0.11955436319112778, + "learning_rate": 9.29804320434776e-05, + "loss": 1.565652084350586, + "step": 85370 + }, + { + "epoch": 0.2584414208526932, + "grad_norm": 0.10396544635295868, + "learning_rate": 9.297663686184021e-05, + "loss": 1.6195634841918944, + "step": 85380 + }, + { + "epoch": 0.2584716904030391, + "grad_norm": 0.11124918609857559, + "learning_rate": 9.297284168020282e-05, + "loss": 1.5601837158203125, + "step": 85390 + }, + { + "epoch": 0.2585019599533849, + "grad_norm": 0.11409725993871689, + "learning_rate": 9.296904649856543e-05, + "loss": 1.6081718444824218, + "step": 85400 + }, + { + "epoch": 0.2585322295037307, + "grad_norm": 0.10976593941450119, + "learning_rate": 9.296525131692803e-05, + "loss": 1.5987327575683594, + "step": 85410 + }, + { + "epoch": 0.2585624990540766, + "grad_norm": 0.1291085034608841, + "learning_rate": 9.296145613529064e-05, + "loss": 1.5581982612609864, + "step": 85420 + }, + { + "epoch": 0.2585927686044224, + "grad_norm": 0.12273510545492172, + "learning_rate": 9.295766095365324e-05, + "loss": 1.5689374923706054, + "step": 85430 + }, + { + "epoch": 0.2586230381547682, + "grad_norm": 0.12647739052772522, + "learning_rate": 9.295386577201585e-05, + "loss": 1.5478274345397949, + "step": 85440 + }, + { + "epoch": 0.258653307705114, + "grad_norm": 0.10801399499177933, + "learning_rate": 9.295007059037846e-05, + "loss": 1.5576900482177733, + "step": 85450 + }, + { + "epoch": 0.2586835772554599, + "grad_norm": 0.12291300296783447, + "learning_rate": 9.294627540874106e-05, + "loss": 1.5889023780822753, + "step": 85460 + }, + { + "epoch": 0.2587138468058057, + "grad_norm": 0.1352209448814392, + "learning_rate": 9.294248022710367e-05, + "loss": 1.5647701263427733, + "step": 85470 + }, + { + "epoch": 0.2587441163561515, + "grad_norm": 0.12302964180707932, + "learning_rate": 9.293868504546627e-05, + "loss": 1.541719627380371, + "step": 85480 + }, + { + "epoch": 0.2587743859064974, + "grad_norm": 0.12363150715827942, + "learning_rate": 9.293488986382888e-05, + "loss": 1.558982753753662, + "step": 85490 + }, + { + "epoch": 0.25880465545684317, + "grad_norm": 0.11912693083286285, + "learning_rate": 9.293109468219148e-05, + "loss": 1.6268756866455079, + "step": 85500 + }, + { + "epoch": 0.25880465545684317, + "eval_loss": 1.602758765220642, + "eval_runtime": 28.3549, + "eval_samples_per_second": 17.634, + "eval_steps_per_second": 1.129, + "step": 85500 + }, + { + "epoch": 0.258834925007189, + "grad_norm": 0.11180908232927322, + "learning_rate": 9.29272995005541e-05, + "loss": 1.5854204177856446, + "step": 85510 + }, + { + "epoch": 0.2588651945575349, + "grad_norm": 0.11378423869609833, + "learning_rate": 9.292350431891671e-05, + "loss": 1.552363109588623, + "step": 85520 + }, + { + "epoch": 0.25889546410788067, + "grad_norm": 0.12243273854255676, + "learning_rate": 9.291970913727932e-05, + "loss": 1.5587583541870118, + "step": 85530 + }, + { + "epoch": 0.2589257336582265, + "grad_norm": 0.12446729093790054, + "learning_rate": 9.291591395564192e-05, + "loss": 1.5544721603393554, + "step": 85540 + }, + { + "epoch": 0.2589560032085723, + "grad_norm": 0.10100720077753067, + "learning_rate": 9.291211877400453e-05, + "loss": 1.5934619903564453, + "step": 85550 + }, + { + "epoch": 0.25898627275891817, + "grad_norm": 0.11630471050739288, + "learning_rate": 9.290832359236713e-05, + "loss": 1.5738611221313477, + "step": 85560 + }, + { + "epoch": 0.259016542309264, + "grad_norm": 0.12390338629484177, + "learning_rate": 9.290452841072974e-05, + "loss": 1.5492824554443358, + "step": 85570 + }, + { + "epoch": 0.2590468118596098, + "grad_norm": 0.10489139705896378, + "learning_rate": 9.290073322909235e-05, + "loss": 1.5489853858947753, + "step": 85580 + }, + { + "epoch": 0.25907708140995567, + "grad_norm": 0.1259627640247345, + "learning_rate": 9.289693804745495e-05, + "loss": 1.5453877449035645, + "step": 85590 + }, + { + "epoch": 0.25910735096030146, + "grad_norm": 0.12736305594444275, + "learning_rate": 9.289314286581756e-05, + "loss": 1.5937528610229492, + "step": 85600 + }, + { + "epoch": 0.2591376205106473, + "grad_norm": 0.1253833919763565, + "learning_rate": 9.288934768418016e-05, + "loss": 1.603611373901367, + "step": 85610 + }, + { + "epoch": 0.25916789006099317, + "grad_norm": 0.12100133299827576, + "learning_rate": 9.288555250254277e-05, + "loss": 1.565131378173828, + "step": 85620 + }, + { + "epoch": 0.25919815961133896, + "grad_norm": 0.12522177398204803, + "learning_rate": 9.288175732090539e-05, + "loss": 1.5798683166503906, + "step": 85630 + }, + { + "epoch": 0.2592284291616848, + "grad_norm": 0.11560066789388657, + "learning_rate": 9.287796213926798e-05, + "loss": 1.5474983215332032, + "step": 85640 + }, + { + "epoch": 0.2592586987120306, + "grad_norm": 0.11457042396068573, + "learning_rate": 9.28741669576306e-05, + "loss": 1.5138053894042969, + "step": 85650 + }, + { + "epoch": 0.25928896826237646, + "grad_norm": 0.10161883383989334, + "learning_rate": 9.28703717759932e-05, + "loss": 1.5887880325317383, + "step": 85660 + }, + { + "epoch": 0.2593192378127223, + "grad_norm": 0.12180492281913757, + "learning_rate": 9.286657659435581e-05, + "loss": 1.533336067199707, + "step": 85670 + }, + { + "epoch": 0.2593495073630681, + "grad_norm": 0.10435304790735245, + "learning_rate": 9.28627814127184e-05, + "loss": 1.578019618988037, + "step": 85680 + }, + { + "epoch": 0.25937977691341396, + "grad_norm": 0.12779845297336578, + "learning_rate": 9.285898623108102e-05, + "loss": 1.5634305953979493, + "step": 85690 + }, + { + "epoch": 0.25941004646375976, + "grad_norm": 0.1273418664932251, + "learning_rate": 9.285519104944362e-05, + "loss": 1.5490216255187987, + "step": 85700 + }, + { + "epoch": 0.2594403160141056, + "grad_norm": 0.13068465888500214, + "learning_rate": 9.285139586780624e-05, + "loss": 1.581632709503174, + "step": 85710 + }, + { + "epoch": 0.25947058556445146, + "grad_norm": 0.1108025535941124, + "learning_rate": 9.284760068616884e-05, + "loss": 1.5439825057983398, + "step": 85720 + }, + { + "epoch": 0.25950085511479726, + "grad_norm": 0.11310236155986786, + "learning_rate": 9.284380550453145e-05, + "loss": 1.5931619644165038, + "step": 85730 + }, + { + "epoch": 0.2595311246651431, + "grad_norm": 0.12777099013328552, + "learning_rate": 9.284001032289405e-05, + "loss": 1.548634147644043, + "step": 85740 + }, + { + "epoch": 0.2595613942154889, + "grad_norm": 0.13260048627853394, + "learning_rate": 9.283621514125666e-05, + "loss": 1.5723504066467284, + "step": 85750 + }, + { + "epoch": 0.25959166376583476, + "grad_norm": 0.1249653548002243, + "learning_rate": 9.283241995961928e-05, + "loss": 1.5945356369018555, + "step": 85760 + }, + { + "epoch": 0.2596219333161806, + "grad_norm": 0.12953589856624603, + "learning_rate": 9.282862477798187e-05, + "loss": 1.5861501693725586, + "step": 85770 + }, + { + "epoch": 0.2596522028665264, + "grad_norm": 0.12808233499526978, + "learning_rate": 9.282482959634449e-05, + "loss": 1.5914603233337403, + "step": 85780 + }, + { + "epoch": 0.25968247241687226, + "grad_norm": 0.12882038950920105, + "learning_rate": 9.282103441470708e-05, + "loss": 1.5383350372314453, + "step": 85790 + }, + { + "epoch": 0.25971274196721805, + "grad_norm": 0.13552600145339966, + "learning_rate": 9.28172392330697e-05, + "loss": 1.5519771575927734, + "step": 85800 + }, + { + "epoch": 0.2597430115175639, + "grad_norm": 0.15060736238956451, + "learning_rate": 9.28134440514323e-05, + "loss": 1.5476927757263184, + "step": 85810 + }, + { + "epoch": 0.25977328106790976, + "grad_norm": 0.11868299543857574, + "learning_rate": 9.280964886979492e-05, + "loss": 1.54774169921875, + "step": 85820 + }, + { + "epoch": 0.25980355061825555, + "grad_norm": 0.10989733040332794, + "learning_rate": 9.280585368815751e-05, + "loss": 1.5778821945190429, + "step": 85830 + }, + { + "epoch": 0.2598338201686014, + "grad_norm": 0.1088346615433693, + "learning_rate": 9.280205850652013e-05, + "loss": 1.566791820526123, + "step": 85840 + }, + { + "epoch": 0.2598640897189472, + "grad_norm": 0.1171790361404419, + "learning_rate": 9.279826332488272e-05, + "loss": 1.5582183837890624, + "step": 85850 + }, + { + "epoch": 0.25989435926929305, + "grad_norm": 0.11381667852401733, + "learning_rate": 9.279446814324534e-05, + "loss": 1.534520721435547, + "step": 85860 + }, + { + "epoch": 0.2599246288196389, + "grad_norm": 0.12136867642402649, + "learning_rate": 9.279067296160793e-05, + "loss": 1.5510507583618165, + "step": 85870 + }, + { + "epoch": 0.2599548983699847, + "grad_norm": 0.11921241134405136, + "learning_rate": 9.278687777997055e-05, + "loss": 1.561399269104004, + "step": 85880 + }, + { + "epoch": 0.25998516792033055, + "grad_norm": 0.10425890982151031, + "learning_rate": 9.278308259833316e-05, + "loss": 1.5646434783935548, + "step": 85890 + }, + { + "epoch": 0.26001543747067635, + "grad_norm": 0.12473299354314804, + "learning_rate": 9.277928741669576e-05, + "loss": 1.5730189323425292, + "step": 85900 + }, + { + "epoch": 0.2600457070210222, + "grad_norm": 0.11944570392370224, + "learning_rate": 9.277549223505838e-05, + "loss": 1.5612884521484376, + "step": 85910 + }, + { + "epoch": 0.26007597657136805, + "grad_norm": 0.1068592518568039, + "learning_rate": 9.277169705342098e-05, + "loss": 1.5872068405151367, + "step": 85920 + }, + { + "epoch": 0.26010624612171385, + "grad_norm": 0.11997656524181366, + "learning_rate": 9.27679018717836e-05, + "loss": 1.5738816261291504, + "step": 85930 + }, + { + "epoch": 0.2601365156720597, + "grad_norm": 0.11443600058555603, + "learning_rate": 9.276410669014619e-05, + "loss": 1.5869271278381347, + "step": 85940 + }, + { + "epoch": 0.2601667852224055, + "grad_norm": 0.10736817121505737, + "learning_rate": 9.276031150850881e-05, + "loss": 1.5907801628112792, + "step": 85950 + }, + { + "epoch": 0.26019705477275135, + "grad_norm": 0.11311545968055725, + "learning_rate": 9.27565163268714e-05, + "loss": 1.6254444122314453, + "step": 85960 + }, + { + "epoch": 0.2602273243230972, + "grad_norm": 0.11400067061185837, + "learning_rate": 9.275272114523402e-05, + "loss": 1.581629467010498, + "step": 85970 + }, + { + "epoch": 0.260257593873443, + "grad_norm": 0.11787669360637665, + "learning_rate": 9.274892596359661e-05, + "loss": 1.570150375366211, + "step": 85980 + }, + { + "epoch": 0.26028786342378885, + "grad_norm": 0.11342856287956238, + "learning_rate": 9.274513078195923e-05, + "loss": 1.5320903778076171, + "step": 85990 + }, + { + "epoch": 0.26031813297413464, + "grad_norm": 0.13927823305130005, + "learning_rate": 9.274133560032184e-05, + "loss": 1.5902164459228516, + "step": 86000 + }, + { + "epoch": 0.26031813297413464, + "eval_loss": 1.5659797191619873, + "eval_runtime": 27.4889, + "eval_samples_per_second": 18.189, + "eval_steps_per_second": 1.164, + "step": 86000 + }, + { + "epoch": 0.2603484025244805, + "grad_norm": 0.12359993159770966, + "learning_rate": 9.273754041868444e-05, + "loss": 1.5614561080932616, + "step": 86010 + }, + { + "epoch": 0.26037867207482635, + "grad_norm": 0.1093023419380188, + "learning_rate": 9.273374523704705e-05, + "loss": 1.5677736282348633, + "step": 86020 + }, + { + "epoch": 0.26040894162517214, + "grad_norm": 0.12348175793886185, + "learning_rate": 9.272995005540965e-05, + "loss": 1.5549388885498048, + "step": 86030 + }, + { + "epoch": 0.260439211175518, + "grad_norm": 0.12557196617126465, + "learning_rate": 9.272615487377226e-05, + "loss": 1.575187873840332, + "step": 86040 + }, + { + "epoch": 0.26046948072586384, + "grad_norm": 0.10919691622257233, + "learning_rate": 9.272235969213487e-05, + "loss": 1.5438010215759277, + "step": 86050 + }, + { + "epoch": 0.26049975027620964, + "grad_norm": 0.11798351258039474, + "learning_rate": 9.271856451049747e-05, + "loss": 1.5958804130554198, + "step": 86060 + }, + { + "epoch": 0.2605300198265555, + "grad_norm": 0.1226716861128807, + "learning_rate": 9.271476932886008e-05, + "loss": 1.583235263824463, + "step": 86070 + }, + { + "epoch": 0.2605602893769013, + "grad_norm": 0.11884836107492447, + "learning_rate": 9.271097414722268e-05, + "loss": 1.542823600769043, + "step": 86080 + }, + { + "epoch": 0.26059055892724714, + "grad_norm": 0.13017936050891876, + "learning_rate": 9.270717896558529e-05, + "loss": 1.5919988632202149, + "step": 86090 + }, + { + "epoch": 0.260620828477593, + "grad_norm": 0.12774881720542908, + "learning_rate": 9.27033837839479e-05, + "loss": 1.5751918792724608, + "step": 86100 + }, + { + "epoch": 0.2606510980279388, + "grad_norm": 0.1300143450498581, + "learning_rate": 9.26995886023105e-05, + "loss": 1.5506019592285156, + "step": 86110 + }, + { + "epoch": 0.26068136757828464, + "grad_norm": 0.10811365395784378, + "learning_rate": 9.269579342067312e-05, + "loss": 1.5518047332763671, + "step": 86120 + }, + { + "epoch": 0.26071163712863044, + "grad_norm": 0.10946198552846909, + "learning_rate": 9.269199823903573e-05, + "loss": 1.590884017944336, + "step": 86130 + }, + { + "epoch": 0.2607419066789763, + "grad_norm": 0.11626028269529343, + "learning_rate": 9.268820305739833e-05, + "loss": 1.5635278701782227, + "step": 86140 + }, + { + "epoch": 0.26077217622932214, + "grad_norm": 0.12364434450864792, + "learning_rate": 9.268440787576094e-05, + "loss": 1.5634821891784667, + "step": 86150 + }, + { + "epoch": 0.26080244577966794, + "grad_norm": 0.11515822261571884, + "learning_rate": 9.268061269412355e-05, + "loss": 1.570667552947998, + "step": 86160 + }, + { + "epoch": 0.2608327153300138, + "grad_norm": 0.11763748526573181, + "learning_rate": 9.267681751248615e-05, + "loss": 1.5629220962524415, + "step": 86170 + }, + { + "epoch": 0.2608629848803596, + "grad_norm": 0.12598608434200287, + "learning_rate": 9.267302233084876e-05, + "loss": 1.5372002601623536, + "step": 86180 + }, + { + "epoch": 0.26089325443070543, + "grad_norm": 0.13103345036506653, + "learning_rate": 9.266922714921136e-05, + "loss": 1.5593886375427246, + "step": 86190 + }, + { + "epoch": 0.2609235239810513, + "grad_norm": 0.12334954738616943, + "learning_rate": 9.266543196757397e-05, + "loss": 1.5287237167358398, + "step": 86200 + }, + { + "epoch": 0.2609537935313971, + "grad_norm": 0.10641638934612274, + "learning_rate": 9.266163678593658e-05, + "loss": 1.575831413269043, + "step": 86210 + }, + { + "epoch": 0.26098406308174293, + "grad_norm": 0.10652721673250198, + "learning_rate": 9.265784160429918e-05, + "loss": 1.5355138778686523, + "step": 86220 + }, + { + "epoch": 0.26101433263208873, + "grad_norm": 0.12298435717821121, + "learning_rate": 9.265404642266179e-05, + "loss": 1.5527142524719237, + "step": 86230 + }, + { + "epoch": 0.2610446021824346, + "grad_norm": 0.11072888225317001, + "learning_rate": 9.26502512410244e-05, + "loss": 1.5841575622558595, + "step": 86240 + }, + { + "epoch": 0.26107487173278043, + "grad_norm": 0.09845184534788132, + "learning_rate": 9.2646456059387e-05, + "loss": 1.577357864379883, + "step": 86250 + }, + { + "epoch": 0.26110514128312623, + "grad_norm": 0.12004780769348145, + "learning_rate": 9.264266087774962e-05, + "loss": 1.5568904876708984, + "step": 86260 + }, + { + "epoch": 0.2611354108334721, + "grad_norm": 0.12725336849689484, + "learning_rate": 9.263886569611221e-05, + "loss": 1.562091064453125, + "step": 86270 + }, + { + "epoch": 0.2611656803838179, + "grad_norm": 0.12093330919742584, + "learning_rate": 9.263507051447483e-05, + "loss": 1.6088706970214843, + "step": 86280 + }, + { + "epoch": 0.26119594993416373, + "grad_norm": 0.10951447486877441, + "learning_rate": 9.263127533283742e-05, + "loss": 1.545902156829834, + "step": 86290 + }, + { + "epoch": 0.2612262194845096, + "grad_norm": 0.12207239121198654, + "learning_rate": 9.262748015120004e-05, + "loss": 1.5956317901611328, + "step": 86300 + }, + { + "epoch": 0.2612564890348554, + "grad_norm": 0.11642385274171829, + "learning_rate": 9.262368496956265e-05, + "loss": 1.5668085098266602, + "step": 86310 + }, + { + "epoch": 0.26128675858520123, + "grad_norm": 0.10899201780557632, + "learning_rate": 9.261988978792525e-05, + "loss": 1.5745203018188476, + "step": 86320 + }, + { + "epoch": 0.261317028135547, + "grad_norm": 0.1348293572664261, + "learning_rate": 9.261609460628786e-05, + "loss": 1.5541378021240235, + "step": 86330 + }, + { + "epoch": 0.2613472976858929, + "grad_norm": 0.11695996671915054, + "learning_rate": 9.261229942465047e-05, + "loss": 1.5525922775268555, + "step": 86340 + }, + { + "epoch": 0.26137756723623873, + "grad_norm": 0.12517096102237701, + "learning_rate": 9.260850424301307e-05, + "loss": 1.5605117797851562, + "step": 86350 + }, + { + "epoch": 0.2614078367865845, + "grad_norm": 0.12642329931259155, + "learning_rate": 9.260470906137568e-05, + "loss": 1.555492115020752, + "step": 86360 + }, + { + "epoch": 0.2614381063369304, + "grad_norm": 0.11054127663373947, + "learning_rate": 9.26009138797383e-05, + "loss": 1.5608480453491211, + "step": 86370 + }, + { + "epoch": 0.26146837588727617, + "grad_norm": 0.12242846935987473, + "learning_rate": 9.259711869810089e-05, + "loss": 1.5838496208190918, + "step": 86380 + }, + { + "epoch": 0.261498645437622, + "grad_norm": 0.11940596997737885, + "learning_rate": 9.259332351646351e-05, + "loss": 1.570816707611084, + "step": 86390 + }, + { + "epoch": 0.2615289149879679, + "grad_norm": 0.1061655730009079, + "learning_rate": 9.25895283348261e-05, + "loss": 1.5796374320983886, + "step": 86400 + }, + { + "epoch": 0.26155918453831367, + "grad_norm": 0.10925783216953278, + "learning_rate": 9.258573315318872e-05, + "loss": 1.5791770935058593, + "step": 86410 + }, + { + "epoch": 0.2615894540886595, + "grad_norm": 0.10691174864768982, + "learning_rate": 9.258193797155131e-05, + "loss": 1.5296794891357421, + "step": 86420 + }, + { + "epoch": 0.2616197236390053, + "grad_norm": 0.11650405824184418, + "learning_rate": 9.257814278991393e-05, + "loss": 1.5530553817749024, + "step": 86430 + }, + { + "epoch": 0.26164999318935117, + "grad_norm": 0.10607907176017761, + "learning_rate": 9.257434760827653e-05, + "loss": 1.5425552368164062, + "step": 86440 + }, + { + "epoch": 0.261680262739697, + "grad_norm": 0.09823966771364212, + "learning_rate": 9.257055242663914e-05, + "loss": 1.5748636245727539, + "step": 86450 + }, + { + "epoch": 0.2617105322900428, + "grad_norm": 0.11619141697883606, + "learning_rate": 9.256675724500174e-05, + "loss": 1.5976531028747558, + "step": 86460 + }, + { + "epoch": 0.26174080184038867, + "grad_norm": 0.11273849755525589, + "learning_rate": 9.256296206336436e-05, + "loss": 1.544126605987549, + "step": 86470 + }, + { + "epoch": 0.26177107139073447, + "grad_norm": 0.10714130103588104, + "learning_rate": 9.255916688172695e-05, + "loss": 1.5647150039672852, + "step": 86480 + }, + { + "epoch": 0.2618013409410803, + "grad_norm": 0.1163850948214531, + "learning_rate": 9.255537170008957e-05, + "loss": 1.591550636291504, + "step": 86490 + }, + { + "epoch": 0.26183161049142617, + "grad_norm": 0.11345299333333969, + "learning_rate": 9.255157651845217e-05, + "loss": 1.5770124435424804, + "step": 86500 + }, + { + "epoch": 0.26183161049142617, + "eval_loss": 1.5831489562988281, + "eval_runtime": 28.1724, + "eval_samples_per_second": 17.748, + "eval_steps_per_second": 1.136, + "step": 86500 + }, + { + "epoch": 0.26186188004177197, + "grad_norm": 0.1096355989575386, + "learning_rate": 9.254778133681478e-05, + "loss": 1.5987958908081055, + "step": 86510 + }, + { + "epoch": 0.2618921495921178, + "grad_norm": 0.11400094628334045, + "learning_rate": 9.25439861551774e-05, + "loss": 1.6025850296020507, + "step": 86520 + }, + { + "epoch": 0.2619224191424636, + "grad_norm": 0.1162155419588089, + "learning_rate": 9.254019097353999e-05, + "loss": 1.5888282775878906, + "step": 86530 + }, + { + "epoch": 0.26195268869280947, + "grad_norm": 0.13434219360351562, + "learning_rate": 9.253639579190261e-05, + "loss": 1.5702162742614747, + "step": 86540 + }, + { + "epoch": 0.2619829582431553, + "grad_norm": 0.1227535605430603, + "learning_rate": 9.25326006102652e-05, + "loss": 1.584947109222412, + "step": 86550 + }, + { + "epoch": 0.2620132277935011, + "grad_norm": 0.11766911298036575, + "learning_rate": 9.252880542862782e-05, + "loss": 1.5585456848144532, + "step": 86560 + }, + { + "epoch": 0.26204349734384697, + "grad_norm": 0.12846092879772186, + "learning_rate": 9.252501024699042e-05, + "loss": 1.5756449699401855, + "step": 86570 + }, + { + "epoch": 0.26207376689419276, + "grad_norm": 0.12040700018405914, + "learning_rate": 9.252121506535304e-05, + "loss": 1.5587638854980468, + "step": 86580 + }, + { + "epoch": 0.2621040364445386, + "grad_norm": 0.1059710904955864, + "learning_rate": 9.251741988371563e-05, + "loss": 1.6337020874023438, + "step": 86590 + }, + { + "epoch": 0.26213430599488446, + "grad_norm": 0.11513780057430267, + "learning_rate": 9.251362470207825e-05, + "loss": 1.5665437698364257, + "step": 86600 + }, + { + "epoch": 0.26216457554523026, + "grad_norm": 0.1306016594171524, + "learning_rate": 9.250982952044084e-05, + "loss": 1.5488537788391112, + "step": 86610 + }, + { + "epoch": 0.2621948450955761, + "grad_norm": 0.11096565425395966, + "learning_rate": 9.250603433880346e-05, + "loss": 1.5879889488220216, + "step": 86620 + }, + { + "epoch": 0.2622251146459219, + "grad_norm": 0.12157668173313141, + "learning_rate": 9.250223915716607e-05, + "loss": 1.5971467971801758, + "step": 86630 + }, + { + "epoch": 0.26225538419626776, + "grad_norm": 0.12325945496559143, + "learning_rate": 9.249844397552867e-05, + "loss": 1.5760126113891602, + "step": 86640 + }, + { + "epoch": 0.2622856537466136, + "grad_norm": 0.11387620121240616, + "learning_rate": 9.249464879389128e-05, + "loss": 1.5729545593261718, + "step": 86650 + }, + { + "epoch": 0.2623159232969594, + "grad_norm": 0.11335563659667969, + "learning_rate": 9.249085361225388e-05, + "loss": 1.544163131713867, + "step": 86660 + }, + { + "epoch": 0.26234619284730526, + "grad_norm": 0.1209539994597435, + "learning_rate": 9.248705843061649e-05, + "loss": 1.6042301177978515, + "step": 86670 + }, + { + "epoch": 0.26237646239765106, + "grad_norm": 0.13872675597667694, + "learning_rate": 9.24832632489791e-05, + "loss": 1.5550271987915039, + "step": 86680 + }, + { + "epoch": 0.2624067319479969, + "grad_norm": 0.1377583146095276, + "learning_rate": 9.24794680673417e-05, + "loss": 1.5784942626953125, + "step": 86690 + }, + { + "epoch": 0.26243700149834276, + "grad_norm": 0.11715728789567947, + "learning_rate": 9.247567288570431e-05, + "loss": 1.5888883590698242, + "step": 86700 + }, + { + "epoch": 0.26246727104868856, + "grad_norm": 0.12751193344593048, + "learning_rate": 9.247187770406691e-05, + "loss": 1.537419033050537, + "step": 86710 + }, + { + "epoch": 0.2624975405990344, + "grad_norm": 0.13179536163806915, + "learning_rate": 9.246808252242952e-05, + "loss": 1.5695252418518066, + "step": 86720 + }, + { + "epoch": 0.26252781014938026, + "grad_norm": 0.12317967414855957, + "learning_rate": 9.246428734079214e-05, + "loss": 1.5629874229431153, + "step": 86730 + }, + { + "epoch": 0.26255807969972605, + "grad_norm": 0.12107859551906586, + "learning_rate": 9.246049215915474e-05, + "loss": 1.579374122619629, + "step": 86740 + }, + { + "epoch": 0.2625883492500719, + "grad_norm": 0.11725957691669464, + "learning_rate": 9.245669697751735e-05, + "loss": 1.5410371780395509, + "step": 86750 + }, + { + "epoch": 0.2626186188004177, + "grad_norm": 0.12468153983354568, + "learning_rate": 9.245290179587996e-05, + "loss": 1.581824493408203, + "step": 86760 + }, + { + "epoch": 0.26264888835076355, + "grad_norm": 0.10626672208309174, + "learning_rate": 9.244910661424256e-05, + "loss": 1.5353561401367188, + "step": 86770 + }, + { + "epoch": 0.2626791579011094, + "grad_norm": 0.1138785108923912, + "learning_rate": 9.244531143260517e-05, + "loss": 1.5649406433105468, + "step": 86780 + }, + { + "epoch": 0.2627094274514552, + "grad_norm": 0.12151342630386353, + "learning_rate": 9.244151625096777e-05, + "loss": 1.5919480323791504, + "step": 86790 + }, + { + "epoch": 0.26273969700180105, + "grad_norm": 0.11219728738069534, + "learning_rate": 9.243772106933038e-05, + "loss": 1.5624372482299804, + "step": 86800 + }, + { + "epoch": 0.26276996655214685, + "grad_norm": 0.1272212266921997, + "learning_rate": 9.243392588769299e-05, + "loss": 1.535346221923828, + "step": 86810 + }, + { + "epoch": 0.2628002361024927, + "grad_norm": 0.12883222103118896, + "learning_rate": 9.243013070605559e-05, + "loss": 1.5099213600158692, + "step": 86820 + }, + { + "epoch": 0.26283050565283855, + "grad_norm": 0.12251745909452438, + "learning_rate": 9.24263355244182e-05, + "loss": 1.5575554847717286, + "step": 86830 + }, + { + "epoch": 0.26286077520318435, + "grad_norm": 0.11681552231311798, + "learning_rate": 9.24225403427808e-05, + "loss": 1.5629964828491212, + "step": 86840 + }, + { + "epoch": 0.2628910447535302, + "grad_norm": 0.10849130898714066, + "learning_rate": 9.241874516114341e-05, + "loss": 1.5320011138916017, + "step": 86850 + }, + { + "epoch": 0.262921314303876, + "grad_norm": 0.10839388519525528, + "learning_rate": 9.241494997950602e-05, + "loss": 1.5562705993652344, + "step": 86860 + }, + { + "epoch": 0.26295158385422185, + "grad_norm": 0.1265270859003067, + "learning_rate": 9.241115479786864e-05, + "loss": 1.5998874664306642, + "step": 86870 + }, + { + "epoch": 0.2629818534045677, + "grad_norm": 0.11395295709371567, + "learning_rate": 9.240735961623123e-05, + "loss": 1.5714691162109375, + "step": 86880 + }, + { + "epoch": 0.2630121229549135, + "grad_norm": 0.11760744452476501, + "learning_rate": 9.240356443459385e-05, + "loss": 1.5797588348388671, + "step": 86890 + }, + { + "epoch": 0.26304239250525935, + "grad_norm": 0.12286917865276337, + "learning_rate": 9.239976925295644e-05, + "loss": 1.5791412353515626, + "step": 86900 + }, + { + "epoch": 0.26307266205560514, + "grad_norm": 0.14925086498260498, + "learning_rate": 9.239597407131906e-05, + "loss": 1.5085933685302735, + "step": 86910 + }, + { + "epoch": 0.263102931605951, + "grad_norm": 0.12135029584169388, + "learning_rate": 9.239217888968167e-05, + "loss": 1.5597829818725586, + "step": 86920 + }, + { + "epoch": 0.26313320115629685, + "grad_norm": 0.11610967665910721, + "learning_rate": 9.238838370804427e-05, + "loss": 1.5933168411254883, + "step": 86930 + }, + { + "epoch": 0.26316347070664264, + "grad_norm": 0.12187214940786362, + "learning_rate": 9.238458852640688e-05, + "loss": 1.5629581451416015, + "step": 86940 + }, + { + "epoch": 0.2631937402569885, + "grad_norm": 0.1278505176305771, + "learning_rate": 9.238079334476948e-05, + "loss": 1.5403013229370117, + "step": 86950 + }, + { + "epoch": 0.2632240098073343, + "grad_norm": 0.10820463299751282, + "learning_rate": 9.237699816313209e-05, + "loss": 1.5876430511474608, + "step": 86960 + }, + { + "epoch": 0.26325427935768014, + "grad_norm": 0.11067942529916763, + "learning_rate": 9.23732029814947e-05, + "loss": 1.568441677093506, + "step": 86970 + }, + { + "epoch": 0.263284548908026, + "grad_norm": 0.10728491842746735, + "learning_rate": 9.236940779985731e-05, + "loss": 1.5769356727600097, + "step": 86980 + }, + { + "epoch": 0.2633148184583718, + "grad_norm": 0.10862749069929123, + "learning_rate": 9.236561261821991e-05, + "loss": 1.5621434211730958, + "step": 86990 + }, + { + "epoch": 0.26334508800871764, + "grad_norm": 0.11850107461214066, + "learning_rate": 9.236181743658253e-05, + "loss": 1.5805461883544922, + "step": 87000 + }, + { + "epoch": 0.26334508800871764, + "eval_loss": 1.5709763765335083, + "eval_runtime": 28.185, + "eval_samples_per_second": 17.74, + "eval_steps_per_second": 1.135, + "step": 87000 + }, + { + "epoch": 0.26337535755906344, + "grad_norm": 0.10412946343421936, + "learning_rate": 9.235802225494512e-05, + "loss": 1.5707258224487304, + "step": 87010 + }, + { + "epoch": 0.2634056271094093, + "grad_norm": 0.12822335958480835, + "learning_rate": 9.235422707330774e-05, + "loss": 1.517673873901367, + "step": 87020 + }, + { + "epoch": 0.26343589665975514, + "grad_norm": 0.12078247964382172, + "learning_rate": 9.235043189167033e-05, + "loss": 1.571387481689453, + "step": 87030 + }, + { + "epoch": 0.26346616621010094, + "grad_norm": 0.12154323607683182, + "learning_rate": 9.234663671003295e-05, + "loss": 1.5254634857177733, + "step": 87040 + }, + { + "epoch": 0.2634964357604468, + "grad_norm": 0.11397645622491837, + "learning_rate": 9.234284152839554e-05, + "loss": 1.565029239654541, + "step": 87050 + }, + { + "epoch": 0.2635267053107926, + "grad_norm": 0.13489986956119537, + "learning_rate": 9.233904634675816e-05, + "loss": 1.591054344177246, + "step": 87060 + }, + { + "epoch": 0.26355697486113844, + "grad_norm": 0.11671080440282822, + "learning_rate": 9.233525116512075e-05, + "loss": 1.5918840408325194, + "step": 87070 + }, + { + "epoch": 0.2635872444114843, + "grad_norm": 0.11489992588758469, + "learning_rate": 9.233145598348337e-05, + "loss": 1.5783052444458008, + "step": 87080 + }, + { + "epoch": 0.2636175139618301, + "grad_norm": 0.11024067550897598, + "learning_rate": 9.232766080184597e-05, + "loss": 1.615652084350586, + "step": 87090 + }, + { + "epoch": 0.26364778351217594, + "grad_norm": 0.13180068135261536, + "learning_rate": 9.232386562020859e-05, + "loss": 1.5242461204528808, + "step": 87100 + }, + { + "epoch": 0.26367805306252173, + "grad_norm": 0.10882612317800522, + "learning_rate": 9.232007043857119e-05, + "loss": 1.58428897857666, + "step": 87110 + }, + { + "epoch": 0.2637083226128676, + "grad_norm": 0.11650330573320389, + "learning_rate": 9.23162752569338e-05, + "loss": 1.5871779441833496, + "step": 87120 + }, + { + "epoch": 0.26373859216321344, + "grad_norm": 0.12195251137018204, + "learning_rate": 9.231248007529642e-05, + "loss": 1.583752155303955, + "step": 87130 + }, + { + "epoch": 0.26376886171355923, + "grad_norm": 0.11739973723888397, + "learning_rate": 9.230868489365901e-05, + "loss": 1.5937461853027344, + "step": 87140 + }, + { + "epoch": 0.2637991312639051, + "grad_norm": 0.13410508632659912, + "learning_rate": 9.230488971202163e-05, + "loss": 1.584289264678955, + "step": 87150 + }, + { + "epoch": 0.2638294008142509, + "grad_norm": 0.12266259640455246, + "learning_rate": 9.230109453038422e-05, + "loss": 1.6022581100463866, + "step": 87160 + }, + { + "epoch": 0.26385967036459673, + "grad_norm": 0.13020969927310944, + "learning_rate": 9.229729934874684e-05, + "loss": 1.557521438598633, + "step": 87170 + }, + { + "epoch": 0.2638899399149426, + "grad_norm": 0.1054413765668869, + "learning_rate": 9.229350416710943e-05, + "loss": 1.5924314498901366, + "step": 87180 + }, + { + "epoch": 0.2639202094652884, + "grad_norm": 0.11905231326818466, + "learning_rate": 9.228970898547205e-05, + "loss": 1.5502938270568847, + "step": 87190 + }, + { + "epoch": 0.26395047901563423, + "grad_norm": 0.12820641696453094, + "learning_rate": 9.228591380383465e-05, + "loss": 1.5655282974243163, + "step": 87200 + }, + { + "epoch": 0.26398074856598003, + "grad_norm": 0.11652592569589615, + "learning_rate": 9.228211862219726e-05, + "loss": 1.5699443817138672, + "step": 87210 + }, + { + "epoch": 0.2640110181163259, + "grad_norm": 0.10346333682537079, + "learning_rate": 9.227832344055986e-05, + "loss": 1.5833324432373046, + "step": 87220 + }, + { + "epoch": 0.26404128766667173, + "grad_norm": 0.10506317019462585, + "learning_rate": 9.227452825892248e-05, + "loss": 1.573216438293457, + "step": 87230 + }, + { + "epoch": 0.2640715572170175, + "grad_norm": 0.1377134621143341, + "learning_rate": 9.227073307728508e-05, + "loss": 1.5918045043945312, + "step": 87240 + }, + { + "epoch": 0.2641018267673634, + "grad_norm": 0.11818906664848328, + "learning_rate": 9.226693789564769e-05, + "loss": 1.5273080825805665, + "step": 87250 + }, + { + "epoch": 0.2641320963177092, + "grad_norm": 0.13354377448558807, + "learning_rate": 9.22631427140103e-05, + "loss": 1.578075885772705, + "step": 87260 + }, + { + "epoch": 0.264162365868055, + "grad_norm": 0.11491100490093231, + "learning_rate": 9.22593475323729e-05, + "loss": 1.5458200454711915, + "step": 87270 + }, + { + "epoch": 0.2641926354184009, + "grad_norm": 0.11169911175966263, + "learning_rate": 9.22555523507355e-05, + "loss": 1.5598023414611817, + "step": 87280 + }, + { + "epoch": 0.2642229049687467, + "grad_norm": 0.12638625502586365, + "learning_rate": 9.225175716909811e-05, + "loss": 1.5630093574523927, + "step": 87290 + }, + { + "epoch": 0.2642531745190925, + "grad_norm": 0.11950374394655228, + "learning_rate": 9.224796198746072e-05, + "loss": 1.5521150588989259, + "step": 87300 + }, + { + "epoch": 0.2642834440694383, + "grad_norm": 0.12179964780807495, + "learning_rate": 9.224416680582332e-05, + "loss": 1.5976040840148926, + "step": 87310 + }, + { + "epoch": 0.2643137136197842, + "grad_norm": 0.10175438970327377, + "learning_rate": 9.224037162418593e-05, + "loss": 1.5524621963500977, + "step": 87320 + }, + { + "epoch": 0.26434398317013, + "grad_norm": 0.11314312368631363, + "learning_rate": 9.223657644254854e-05, + "loss": 1.5535788536071777, + "step": 87330 + }, + { + "epoch": 0.2643742527204758, + "grad_norm": 0.11347583681344986, + "learning_rate": 9.223278126091116e-05, + "loss": 1.5654869079589844, + "step": 87340 + }, + { + "epoch": 0.2644045222708217, + "grad_norm": 0.12913678586483002, + "learning_rate": 9.222898607927376e-05, + "loss": 1.547043514251709, + "step": 87350 + }, + { + "epoch": 0.26443479182116747, + "grad_norm": 0.11365979164838791, + "learning_rate": 9.222519089763637e-05, + "loss": 1.575723648071289, + "step": 87360 + }, + { + "epoch": 0.2644650613715133, + "grad_norm": 0.11251571774482727, + "learning_rate": 9.222139571599897e-05, + "loss": 1.5454627990722656, + "step": 87370 + }, + { + "epoch": 0.2644953309218592, + "grad_norm": 0.11157721281051636, + "learning_rate": 9.221760053436158e-05, + "loss": 1.596306037902832, + "step": 87380 + }, + { + "epoch": 0.26452560047220497, + "grad_norm": 0.12790538370609283, + "learning_rate": 9.221380535272419e-05, + "loss": 1.592976188659668, + "step": 87390 + }, + { + "epoch": 0.2645558700225508, + "grad_norm": 0.1103680431842804, + "learning_rate": 9.221001017108679e-05, + "loss": 1.5881498336791993, + "step": 87400 + }, + { + "epoch": 0.2645861395728966, + "grad_norm": 0.11447449773550034, + "learning_rate": 9.22062149894494e-05, + "loss": 1.5479793548583984, + "step": 87410 + }, + { + "epoch": 0.26461640912324247, + "grad_norm": 0.1216205433011055, + "learning_rate": 9.2202419807812e-05, + "loss": 1.5522824287414552, + "step": 87420 + }, + { + "epoch": 0.2646466786735883, + "grad_norm": 0.10583176463842392, + "learning_rate": 9.219862462617461e-05, + "loss": 1.5212954521179198, + "step": 87430 + }, + { + "epoch": 0.2646769482239341, + "grad_norm": 0.12973445653915405, + "learning_rate": 9.219482944453722e-05, + "loss": 1.5493582725524901, + "step": 87440 + }, + { + "epoch": 0.26470721777427997, + "grad_norm": 0.11074661463499069, + "learning_rate": 9.219103426289982e-05, + "loss": 1.5472330093383788, + "step": 87450 + }, + { + "epoch": 0.2647374873246258, + "grad_norm": 0.11590706557035446, + "learning_rate": 9.218723908126243e-05, + "loss": 1.5550692558288575, + "step": 87460 + }, + { + "epoch": 0.2647677568749716, + "grad_norm": 0.11852362751960754, + "learning_rate": 9.218344389962503e-05, + "loss": 1.545050048828125, + "step": 87470 + }, + { + "epoch": 0.26479802642531747, + "grad_norm": 0.12868431210517883, + "learning_rate": 9.217964871798765e-05, + "loss": 1.5880928993225099, + "step": 87480 + }, + { + "epoch": 0.26482829597566326, + "grad_norm": 0.10996157675981522, + "learning_rate": 9.217585353635025e-05, + "loss": 1.5681514739990234, + "step": 87490 + }, + { + "epoch": 0.2648585655260091, + "grad_norm": 0.12098821252584457, + "learning_rate": 9.217205835471286e-05, + "loss": 1.5834404945373535, + "step": 87500 + }, + { + "epoch": 0.2648585655260091, + "eval_loss": 1.5710222721099854, + "eval_runtime": 27.9513, + "eval_samples_per_second": 17.888, + "eval_steps_per_second": 1.145, + "step": 87500 + }, + { + "epoch": 0.26488883507635497, + "grad_norm": 0.11424253135919571, + "learning_rate": 9.216826317307546e-05, + "loss": 1.5686234474182128, + "step": 87510 + }, + { + "epoch": 0.26491910462670076, + "grad_norm": 0.11545538902282715, + "learning_rate": 9.216446799143808e-05, + "loss": 1.5924854278564453, + "step": 87520 + }, + { + "epoch": 0.2649493741770466, + "grad_norm": 0.12460685521364212, + "learning_rate": 9.216067280980068e-05, + "loss": 1.5508742332458496, + "step": 87530 + }, + { + "epoch": 0.2649796437273924, + "grad_norm": 0.12550145387649536, + "learning_rate": 9.215687762816329e-05, + "loss": 1.5648401260375977, + "step": 87540 + }, + { + "epoch": 0.26500991327773826, + "grad_norm": 0.13162761926651, + "learning_rate": 9.21530824465259e-05, + "loss": 1.5521745681762695, + "step": 87550 + }, + { + "epoch": 0.2650401828280841, + "grad_norm": 0.13638898730278015, + "learning_rate": 9.21492872648885e-05, + "loss": 1.512587547302246, + "step": 87560 + }, + { + "epoch": 0.2650704523784299, + "grad_norm": 0.1314782351255417, + "learning_rate": 9.21454920832511e-05, + "loss": 1.5561214447021485, + "step": 87570 + }, + { + "epoch": 0.26510072192877576, + "grad_norm": 0.1287411004304886, + "learning_rate": 9.214169690161371e-05, + "loss": 1.56365966796875, + "step": 87580 + }, + { + "epoch": 0.26513099147912156, + "grad_norm": 0.10495882481336594, + "learning_rate": 9.213790171997632e-05, + "loss": 1.5619087219238281, + "step": 87590 + }, + { + "epoch": 0.2651612610294674, + "grad_norm": 0.11534478515386581, + "learning_rate": 9.213410653833892e-05, + "loss": 1.602077102661133, + "step": 87600 + }, + { + "epoch": 0.26519153057981326, + "grad_norm": 0.10475718975067139, + "learning_rate": 9.213031135670154e-05, + "loss": 1.5857336044311523, + "step": 87610 + }, + { + "epoch": 0.26522180013015906, + "grad_norm": 0.1271466165781021, + "learning_rate": 9.212651617506414e-05, + "loss": 1.5305547714233398, + "step": 87620 + }, + { + "epoch": 0.2652520696805049, + "grad_norm": 0.11552493274211884, + "learning_rate": 9.212272099342676e-05, + "loss": 1.588945198059082, + "step": 87630 + }, + { + "epoch": 0.2652823392308507, + "grad_norm": 0.12254247814416885, + "learning_rate": 9.211892581178935e-05, + "loss": 1.5467842102050782, + "step": 87640 + }, + { + "epoch": 0.26531260878119656, + "grad_norm": 0.12127445638179779, + "learning_rate": 9.211513063015197e-05, + "loss": 1.5949548721313476, + "step": 87650 + }, + { + "epoch": 0.2653428783315424, + "grad_norm": 0.11392436176538467, + "learning_rate": 9.211133544851456e-05, + "loss": 1.586068344116211, + "step": 87660 + }, + { + "epoch": 0.2653731478818882, + "grad_norm": 0.1074124425649643, + "learning_rate": 9.210754026687718e-05, + "loss": 1.566580581665039, + "step": 87670 + }, + { + "epoch": 0.26540341743223406, + "grad_norm": 0.11946435272693634, + "learning_rate": 9.210374508523977e-05, + "loss": 1.5383697509765626, + "step": 87680 + }, + { + "epoch": 0.26543368698257985, + "grad_norm": 0.12720200419425964, + "learning_rate": 9.209994990360239e-05, + "loss": 1.5640861511230468, + "step": 87690 + }, + { + "epoch": 0.2654639565329257, + "grad_norm": 0.12601041793823242, + "learning_rate": 9.209615472196498e-05, + "loss": 1.5995195388793946, + "step": 87700 + }, + { + "epoch": 0.26549422608327156, + "grad_norm": 0.10809321701526642, + "learning_rate": 9.20923595403276e-05, + "loss": 1.6036544799804688, + "step": 87710 + }, + { + "epoch": 0.26552449563361735, + "grad_norm": 0.10539259016513824, + "learning_rate": 9.20885643586902e-05, + "loss": 1.5436515808105469, + "step": 87720 + }, + { + "epoch": 0.2655547651839632, + "grad_norm": 0.11837424337863922, + "learning_rate": 9.208476917705282e-05, + "loss": 1.5601126670837402, + "step": 87730 + }, + { + "epoch": 0.265585034734309, + "grad_norm": 0.10983218252658844, + "learning_rate": 9.208097399541543e-05, + "loss": 1.568967342376709, + "step": 87740 + }, + { + "epoch": 0.26561530428465485, + "grad_norm": 0.11739758402109146, + "learning_rate": 9.207717881377803e-05, + "loss": 1.5476719856262207, + "step": 87750 + }, + { + "epoch": 0.2656455738350007, + "grad_norm": 0.10473648458719254, + "learning_rate": 9.207338363214065e-05, + "loss": 1.5534351348876954, + "step": 87760 + }, + { + "epoch": 0.2656758433853465, + "grad_norm": 0.11488912254571915, + "learning_rate": 9.206958845050324e-05, + "loss": 1.58253812789917, + "step": 87770 + }, + { + "epoch": 0.26570611293569235, + "grad_norm": 0.12416135519742966, + "learning_rate": 9.206579326886586e-05, + "loss": 1.6059087753295898, + "step": 87780 + }, + { + "epoch": 0.26573638248603815, + "grad_norm": 0.11566942185163498, + "learning_rate": 9.206199808722845e-05, + "loss": 1.5549842834472656, + "step": 87790 + }, + { + "epoch": 0.265766652036384, + "grad_norm": 0.10444217175245285, + "learning_rate": 9.205820290559107e-05, + "loss": 1.5691888809204102, + "step": 87800 + }, + { + "epoch": 0.26579692158672985, + "grad_norm": 0.1361362487077713, + "learning_rate": 9.205440772395366e-05, + "loss": 1.546086311340332, + "step": 87810 + }, + { + "epoch": 0.26582719113707565, + "grad_norm": 0.10912477970123291, + "learning_rate": 9.205061254231628e-05, + "loss": 1.5903377532958984, + "step": 87820 + }, + { + "epoch": 0.2658574606874215, + "grad_norm": 0.1115112379193306, + "learning_rate": 9.204681736067887e-05, + "loss": 1.5660703659057618, + "step": 87830 + }, + { + "epoch": 0.2658877302377673, + "grad_norm": 0.11602853238582611, + "learning_rate": 9.20430221790415e-05, + "loss": 1.5647823333740234, + "step": 87840 + }, + { + "epoch": 0.26591799978811315, + "grad_norm": 0.10814087092876434, + "learning_rate": 9.20392269974041e-05, + "loss": 1.5366618156433105, + "step": 87850 + }, + { + "epoch": 0.265948269338459, + "grad_norm": 0.10912678390741348, + "learning_rate": 9.20354318157667e-05, + "loss": 1.5439414978027344, + "step": 87860 + }, + { + "epoch": 0.2659785388888048, + "grad_norm": 0.10874517261981964, + "learning_rate": 9.203163663412931e-05, + "loss": 1.5843172073364258, + "step": 87870 + }, + { + "epoch": 0.26600880843915065, + "grad_norm": 0.1224084123969078, + "learning_rate": 9.202784145249192e-05, + "loss": 1.581318473815918, + "step": 87880 + }, + { + "epoch": 0.26603907798949644, + "grad_norm": 0.1385050266981125, + "learning_rate": 9.202404627085452e-05, + "loss": 1.5699216842651367, + "step": 87890 + }, + { + "epoch": 0.2660693475398423, + "grad_norm": 0.11057513952255249, + "learning_rate": 9.202025108921713e-05, + "loss": 1.5375479698181151, + "step": 87900 + }, + { + "epoch": 0.26609961709018815, + "grad_norm": 0.11756613105535507, + "learning_rate": 9.201645590757974e-05, + "loss": 1.5814852714538574, + "step": 87910 + }, + { + "epoch": 0.26612988664053394, + "grad_norm": 0.11307122558355331, + "learning_rate": 9.201266072594234e-05, + "loss": 1.5661592483520508, + "step": 87920 + }, + { + "epoch": 0.2661601561908798, + "grad_norm": 0.11085028201341629, + "learning_rate": 9.200886554430495e-05, + "loss": 1.5370565414428712, + "step": 87930 + }, + { + "epoch": 0.2661904257412256, + "grad_norm": 0.1234356239438057, + "learning_rate": 9.200507036266755e-05, + "loss": 1.5679651260375977, + "step": 87940 + }, + { + "epoch": 0.26622069529157144, + "grad_norm": 0.12472974509000778, + "learning_rate": 9.200127518103017e-05, + "loss": 1.5615246772766114, + "step": 87950 + }, + { + "epoch": 0.2662509648419173, + "grad_norm": 0.11680983752012253, + "learning_rate": 9.199747999939277e-05, + "loss": 1.539204978942871, + "step": 87960 + }, + { + "epoch": 0.2662812343922631, + "grad_norm": 0.12802118062973022, + "learning_rate": 9.199368481775538e-05, + "loss": 1.566763210296631, + "step": 87970 + }, + { + "epoch": 0.26631150394260894, + "grad_norm": 0.12639029324054718, + "learning_rate": 9.198988963611799e-05, + "loss": 1.5343488693237304, + "step": 87980 + }, + { + "epoch": 0.26634177349295474, + "grad_norm": 0.109780453145504, + "learning_rate": 9.19860944544806e-05, + "loss": 1.5423978805541991, + "step": 87990 + }, + { + "epoch": 0.2663720430433006, + "grad_norm": 0.10723430663347244, + "learning_rate": 9.19822992728432e-05, + "loss": 1.555605697631836, + "step": 88000 + }, + { + "epoch": 0.2663720430433006, + "eval_loss": 1.57932448387146, + "eval_runtime": 28.2035, + "eval_samples_per_second": 17.728, + "eval_steps_per_second": 1.135, + "step": 88000 + }, + { + "epoch": 0.26640231259364644, + "grad_norm": 0.11274126172065735, + "learning_rate": 9.197850409120581e-05, + "loss": 1.5782670974731445, + "step": 88010 + }, + { + "epoch": 0.26643258214399224, + "grad_norm": 0.11299756914377213, + "learning_rate": 9.197470890956841e-05, + "loss": 1.5553257942199707, + "step": 88020 + }, + { + "epoch": 0.2664628516943381, + "grad_norm": 0.11557462811470032, + "learning_rate": 9.197091372793102e-05, + "loss": 1.5556961059570313, + "step": 88030 + }, + { + "epoch": 0.2664931212446839, + "grad_norm": 0.11738453805446625, + "learning_rate": 9.196711854629363e-05, + "loss": 1.5816096305847167, + "step": 88040 + }, + { + "epoch": 0.26652339079502974, + "grad_norm": 0.12081228941679001, + "learning_rate": 9.196332336465623e-05, + "loss": 1.5984813690185546, + "step": 88050 + }, + { + "epoch": 0.2665536603453756, + "grad_norm": 0.11974987387657166, + "learning_rate": 9.195952818301884e-05, + "loss": 1.5617117881774902, + "step": 88060 + }, + { + "epoch": 0.2665839298957214, + "grad_norm": 0.12011837214231491, + "learning_rate": 9.195573300138144e-05, + "loss": 1.5882225036621094, + "step": 88070 + }, + { + "epoch": 0.26661419944606723, + "grad_norm": 0.11490528285503387, + "learning_rate": 9.195193781974405e-05, + "loss": 1.5884933471679688, + "step": 88080 + }, + { + "epoch": 0.26664446899641303, + "grad_norm": 0.1200370118021965, + "learning_rate": 9.194814263810667e-05, + "loss": 1.5701271057128907, + "step": 88090 + }, + { + "epoch": 0.2666747385467589, + "grad_norm": 0.11306923627853394, + "learning_rate": 9.194434745646926e-05, + "loss": 1.5782031059265136, + "step": 88100 + }, + { + "epoch": 0.26670500809710473, + "grad_norm": 0.12462828308343887, + "learning_rate": 9.194055227483188e-05, + "loss": 1.589673900604248, + "step": 88110 + }, + { + "epoch": 0.26673527764745053, + "grad_norm": 0.11708258092403412, + "learning_rate": 9.193675709319447e-05, + "loss": 1.5415781021118165, + "step": 88120 + }, + { + "epoch": 0.2667655471977964, + "grad_norm": 0.11323042213916779, + "learning_rate": 9.19329619115571e-05, + "loss": 1.563516616821289, + "step": 88130 + }, + { + "epoch": 0.26679581674814223, + "grad_norm": 0.11307311803102493, + "learning_rate": 9.19291667299197e-05, + "loss": 1.5758953094482422, + "step": 88140 + }, + { + "epoch": 0.26682608629848803, + "grad_norm": 0.11481577157974243, + "learning_rate": 9.19253715482823e-05, + "loss": 1.5662720680236817, + "step": 88150 + }, + { + "epoch": 0.2668563558488339, + "grad_norm": 0.11293984949588776, + "learning_rate": 9.192157636664491e-05, + "loss": 1.5661305427551269, + "step": 88160 + }, + { + "epoch": 0.2668866253991797, + "grad_norm": 0.1214861124753952, + "learning_rate": 9.191778118500752e-05, + "loss": 1.558380699157715, + "step": 88170 + }, + { + "epoch": 0.26691689494952553, + "grad_norm": 0.11688974499702454, + "learning_rate": 9.191398600337012e-05, + "loss": 1.5518731117248534, + "step": 88180 + }, + { + "epoch": 0.2669471644998714, + "grad_norm": 0.11875011026859283, + "learning_rate": 9.191019082173273e-05, + "loss": 1.537392807006836, + "step": 88190 + }, + { + "epoch": 0.2669774340502172, + "grad_norm": 0.11331133544445038, + "learning_rate": 9.190639564009534e-05, + "loss": 1.5271086692810059, + "step": 88200 + }, + { + "epoch": 0.26700770360056303, + "grad_norm": 0.11496835947036743, + "learning_rate": 9.190260045845794e-05, + "loss": 1.5776315689086915, + "step": 88210 + }, + { + "epoch": 0.2670379731509088, + "grad_norm": 0.129655122756958, + "learning_rate": 9.189880527682056e-05, + "loss": 1.5631644248962402, + "step": 88220 + }, + { + "epoch": 0.2670682427012547, + "grad_norm": 0.11320656538009644, + "learning_rate": 9.189501009518315e-05, + "loss": 1.576401138305664, + "step": 88230 + }, + { + "epoch": 0.26709851225160053, + "grad_norm": 0.11870507895946503, + "learning_rate": 9.189121491354577e-05, + "loss": 1.5573089599609375, + "step": 88240 + }, + { + "epoch": 0.2671287818019463, + "grad_norm": 0.11767043173313141, + "learning_rate": 9.188741973190837e-05, + "loss": 1.5660850524902343, + "step": 88250 + }, + { + "epoch": 0.2671590513522922, + "grad_norm": 0.12410508096218109, + "learning_rate": 9.188362455027098e-05, + "loss": 1.5473697662353516, + "step": 88260 + }, + { + "epoch": 0.267189320902638, + "grad_norm": 0.12320426851511002, + "learning_rate": 9.187982936863358e-05, + "loss": 1.5722315788269043, + "step": 88270 + }, + { + "epoch": 0.2672195904529838, + "grad_norm": 0.1353151649236679, + "learning_rate": 9.18760341869962e-05, + "loss": 1.5283390045166017, + "step": 88280 + }, + { + "epoch": 0.2672498600033297, + "grad_norm": 0.11982454359531403, + "learning_rate": 9.187223900535879e-05, + "loss": 1.5769073486328125, + "step": 88290 + }, + { + "epoch": 0.26728012955367547, + "grad_norm": 0.11355628818273544, + "learning_rate": 9.186844382372141e-05, + "loss": 1.571267795562744, + "step": 88300 + }, + { + "epoch": 0.2673103991040213, + "grad_norm": 0.12458541244268417, + "learning_rate": 9.1864648642084e-05, + "loss": 1.5623676300048828, + "step": 88310 + }, + { + "epoch": 0.2673406686543671, + "grad_norm": 0.11636095494031906, + "learning_rate": 9.186085346044662e-05, + "loss": 1.5422640800476075, + "step": 88320 + }, + { + "epoch": 0.26737093820471297, + "grad_norm": 0.121637724339962, + "learning_rate": 9.185705827880921e-05, + "loss": 1.5664276123046874, + "step": 88330 + }, + { + "epoch": 0.2674012077550588, + "grad_norm": 0.1242007464170456, + "learning_rate": 9.185326309717183e-05, + "loss": 1.557763671875, + "step": 88340 + }, + { + "epoch": 0.2674314773054046, + "grad_norm": 0.1098259836435318, + "learning_rate": 9.184946791553445e-05, + "loss": 1.593307876586914, + "step": 88350 + }, + { + "epoch": 0.26746174685575047, + "grad_norm": 0.11818641424179077, + "learning_rate": 9.184567273389704e-05, + "loss": 1.559173583984375, + "step": 88360 + }, + { + "epoch": 0.26749201640609627, + "grad_norm": 0.11982708424329758, + "learning_rate": 9.184187755225966e-05, + "loss": 1.5639177322387696, + "step": 88370 + }, + { + "epoch": 0.2675222859564421, + "grad_norm": 0.1156018078327179, + "learning_rate": 9.183808237062226e-05, + "loss": 1.5567522048950195, + "step": 88380 + }, + { + "epoch": 0.26755255550678797, + "grad_norm": 0.12561701238155365, + "learning_rate": 9.183428718898488e-05, + "loss": 1.5830575942993164, + "step": 88390 + }, + { + "epoch": 0.26758282505713377, + "grad_norm": 0.11630186438560486, + "learning_rate": 9.183049200734747e-05, + "loss": 1.5338233947753905, + "step": 88400 + }, + { + "epoch": 0.2676130946074796, + "grad_norm": 0.11819331347942352, + "learning_rate": 9.182669682571009e-05, + "loss": 1.5796781539916993, + "step": 88410 + }, + { + "epoch": 0.2676433641578254, + "grad_norm": 0.10961687564849854, + "learning_rate": 9.182290164407268e-05, + "loss": 1.5526269912719726, + "step": 88420 + }, + { + "epoch": 0.26767363370817127, + "grad_norm": 0.11416859924793243, + "learning_rate": 9.18191064624353e-05, + "loss": 1.5518096923828124, + "step": 88430 + }, + { + "epoch": 0.2677039032585171, + "grad_norm": 0.11155979335308075, + "learning_rate": 9.181531128079789e-05, + "loss": 1.5500752449035644, + "step": 88440 + }, + { + "epoch": 0.2677341728088629, + "grad_norm": 0.12006991356611252, + "learning_rate": 9.181151609916051e-05, + "loss": 1.537694549560547, + "step": 88450 + }, + { + "epoch": 0.26776444235920877, + "grad_norm": 0.122177354991436, + "learning_rate": 9.180772091752312e-05, + "loss": 1.5928624153137207, + "step": 88460 + }, + { + "epoch": 0.26779471190955456, + "grad_norm": 0.11622954159975052, + "learning_rate": 9.180392573588572e-05, + "loss": 1.577568531036377, + "step": 88470 + }, + { + "epoch": 0.2678249814599004, + "grad_norm": 0.12182467430830002, + "learning_rate": 9.180013055424833e-05, + "loss": 1.5657193183898925, + "step": 88480 + }, + { + "epoch": 0.26785525101024626, + "grad_norm": 0.11799009144306183, + "learning_rate": 9.179633537261094e-05, + "loss": 1.6253387451171875, + "step": 88490 + }, + { + "epoch": 0.26788552056059206, + "grad_norm": 0.1122625321149826, + "learning_rate": 9.179254019097354e-05, + "loss": 1.5675735473632812, + "step": 88500 + }, + { + "epoch": 0.26788552056059206, + "eval_loss": 1.546314001083374, + "eval_runtime": 27.4713, + "eval_samples_per_second": 18.201, + "eval_steps_per_second": 1.165, + "step": 88500 + }, + { + "epoch": 0.2679157901109379, + "grad_norm": 0.10334320366382599, + "learning_rate": 9.178874500933615e-05, + "loss": 1.542313289642334, + "step": 88510 + }, + { + "epoch": 0.2679460596612837, + "grad_norm": 0.13711045682430267, + "learning_rate": 9.178494982769875e-05, + "loss": 1.5820444107055665, + "step": 88520 + }, + { + "epoch": 0.26797632921162956, + "grad_norm": 0.11960319429636002, + "learning_rate": 9.178115464606136e-05, + "loss": 1.611851119995117, + "step": 88530 + }, + { + "epoch": 0.2680065987619754, + "grad_norm": 0.10935547947883606, + "learning_rate": 9.177735946442396e-05, + "loss": 1.5644163131713866, + "step": 88540 + }, + { + "epoch": 0.2680368683123212, + "grad_norm": 0.12110854685306549, + "learning_rate": 9.177356428278657e-05, + "loss": 1.5682846069335938, + "step": 88550 + }, + { + "epoch": 0.26806713786266706, + "grad_norm": 0.11745051294565201, + "learning_rate": 9.176976910114919e-05, + "loss": 1.5681614875793457, + "step": 88560 + }, + { + "epoch": 0.26809740741301286, + "grad_norm": 0.1128518134355545, + "learning_rate": 9.176597391951178e-05, + "loss": 1.592885398864746, + "step": 88570 + }, + { + "epoch": 0.2681276769633587, + "grad_norm": 0.11203780025243759, + "learning_rate": 9.17621787378744e-05, + "loss": 1.572762680053711, + "step": 88580 + }, + { + "epoch": 0.26815794651370456, + "grad_norm": 0.10641616582870483, + "learning_rate": 9.175838355623701e-05, + "loss": 1.563238525390625, + "step": 88590 + }, + { + "epoch": 0.26818821606405036, + "grad_norm": 0.12405457347631454, + "learning_rate": 9.175458837459961e-05, + "loss": 1.520414161682129, + "step": 88600 + }, + { + "epoch": 0.2682184856143962, + "grad_norm": 0.13197694718837738, + "learning_rate": 9.175079319296222e-05, + "loss": 1.5535350799560548, + "step": 88610 + }, + { + "epoch": 0.268248755164742, + "grad_norm": 0.11688979715108871, + "learning_rate": 9.174699801132483e-05, + "loss": 1.5533912658691407, + "step": 88620 + }, + { + "epoch": 0.26827902471508785, + "grad_norm": 0.12535293400287628, + "learning_rate": 9.174320282968743e-05, + "loss": 1.5640600204467774, + "step": 88630 + }, + { + "epoch": 0.2683092942654337, + "grad_norm": 0.11652181297540665, + "learning_rate": 9.173940764805004e-05, + "loss": 1.5743633270263673, + "step": 88640 + }, + { + "epoch": 0.2683395638157795, + "grad_norm": 0.11560842394828796, + "learning_rate": 9.173561246641264e-05, + "loss": 1.5428595542907715, + "step": 88650 + }, + { + "epoch": 0.26836983336612535, + "grad_norm": 0.12218748778104782, + "learning_rate": 9.173181728477525e-05, + "loss": 1.5891731262207032, + "step": 88660 + }, + { + "epoch": 0.26840010291647115, + "grad_norm": 0.12811754643917084, + "learning_rate": 9.172802210313786e-05, + "loss": 1.5732684135437012, + "step": 88670 + }, + { + "epoch": 0.268430372466817, + "grad_norm": 0.11765484511852264, + "learning_rate": 9.172422692150046e-05, + "loss": 1.595149326324463, + "step": 88680 + }, + { + "epoch": 0.26846064201716285, + "grad_norm": 0.10225380957126617, + "learning_rate": 9.172043173986307e-05, + "loss": 1.5524659156799316, + "step": 88690 + }, + { + "epoch": 0.26849091156750865, + "grad_norm": 0.10911119729280472, + "learning_rate": 9.171663655822567e-05, + "loss": 1.5402058601379394, + "step": 88700 + }, + { + "epoch": 0.2685211811178545, + "grad_norm": 0.11738324910402298, + "learning_rate": 9.171284137658828e-05, + "loss": 1.586421585083008, + "step": 88710 + }, + { + "epoch": 0.2685514506682003, + "grad_norm": 0.11927248537540436, + "learning_rate": 9.17090461949509e-05, + "loss": 1.5912250518798827, + "step": 88720 + }, + { + "epoch": 0.26858172021854615, + "grad_norm": 0.10168246179819107, + "learning_rate": 9.170525101331349e-05, + "loss": 1.540728759765625, + "step": 88730 + }, + { + "epoch": 0.268611989768892, + "grad_norm": 0.12414173781871796, + "learning_rate": 9.170145583167611e-05, + "loss": 1.5649879455566407, + "step": 88740 + }, + { + "epoch": 0.2686422593192378, + "grad_norm": 0.11206856369972229, + "learning_rate": 9.169766065003872e-05, + "loss": 1.5396657943725587, + "step": 88750 + }, + { + "epoch": 0.26867252886958365, + "grad_norm": 0.10667650401592255, + "learning_rate": 9.169386546840132e-05, + "loss": 1.5319639205932618, + "step": 88760 + }, + { + "epoch": 0.26870279841992945, + "grad_norm": 0.12840619683265686, + "learning_rate": 9.169007028676393e-05, + "loss": 1.5632655143737793, + "step": 88770 + }, + { + "epoch": 0.2687330679702753, + "grad_norm": 0.11792780458927155, + "learning_rate": 9.168627510512653e-05, + "loss": 1.5707571029663085, + "step": 88780 + }, + { + "epoch": 0.26876333752062115, + "grad_norm": 0.11237344890832901, + "learning_rate": 9.168247992348914e-05, + "loss": 1.5872489929199218, + "step": 88790 + }, + { + "epoch": 0.26879360707096694, + "grad_norm": 0.11849520355463028, + "learning_rate": 9.167868474185175e-05, + "loss": 1.5418442726135253, + "step": 88800 + }, + { + "epoch": 0.2688238766213128, + "grad_norm": 0.10627685487270355, + "learning_rate": 9.167488956021435e-05, + "loss": 1.5768397331237793, + "step": 88810 + }, + { + "epoch": 0.2688541461716586, + "grad_norm": 0.10925943404436111, + "learning_rate": 9.167109437857696e-05, + "loss": 1.5981268882751465, + "step": 88820 + }, + { + "epoch": 0.26888441572200444, + "grad_norm": 0.10953774303197861, + "learning_rate": 9.166729919693958e-05, + "loss": 1.5515271186828614, + "step": 88830 + }, + { + "epoch": 0.2689146852723503, + "grad_norm": 0.1348176896572113, + "learning_rate": 9.166350401530217e-05, + "loss": 1.5317193984985351, + "step": 88840 + }, + { + "epoch": 0.2689449548226961, + "grad_norm": 0.11643002927303314, + "learning_rate": 9.165970883366479e-05, + "loss": 1.5443765640258789, + "step": 88850 + }, + { + "epoch": 0.26897522437304194, + "grad_norm": 0.11840981990098953, + "learning_rate": 9.165591365202738e-05, + "loss": 1.5731745719909669, + "step": 88860 + }, + { + "epoch": 0.2690054939233878, + "grad_norm": 0.11220475286245346, + "learning_rate": 9.165211847039e-05, + "loss": 1.5685342788696288, + "step": 88870 + }, + { + "epoch": 0.2690357634737336, + "grad_norm": 0.1298142671585083, + "learning_rate": 9.16483232887526e-05, + "loss": 1.5672203063964845, + "step": 88880 + }, + { + "epoch": 0.26906603302407944, + "grad_norm": 0.11732489615678787, + "learning_rate": 9.164452810711521e-05, + "loss": 1.55514497756958, + "step": 88890 + }, + { + "epoch": 0.26909630257442524, + "grad_norm": 0.11237141489982605, + "learning_rate": 9.16407329254778e-05, + "loss": 1.5746597290039062, + "step": 88900 + }, + { + "epoch": 0.2691265721247711, + "grad_norm": 0.11085250228643417, + "learning_rate": 9.163693774384043e-05, + "loss": 1.5824302673339843, + "step": 88910 + }, + { + "epoch": 0.26915684167511694, + "grad_norm": 0.122092604637146, + "learning_rate": 9.163314256220302e-05, + "loss": 1.524022102355957, + "step": 88920 + }, + { + "epoch": 0.26918711122546274, + "grad_norm": 0.10853967815637589, + "learning_rate": 9.162934738056564e-05, + "loss": 1.5140606880187988, + "step": 88930 + }, + { + "epoch": 0.2692173807758086, + "grad_norm": 0.10924658179283142, + "learning_rate": 9.162555219892823e-05, + "loss": 1.5617639541625976, + "step": 88940 + }, + { + "epoch": 0.2692476503261544, + "grad_norm": 0.13080689311027527, + "learning_rate": 9.162175701729085e-05, + "loss": 1.5418350219726562, + "step": 88950 + }, + { + "epoch": 0.26927791987650024, + "grad_norm": 0.11504308134317398, + "learning_rate": 9.161796183565347e-05, + "loss": 1.5726296424865722, + "step": 88960 + }, + { + "epoch": 0.2693081894268461, + "grad_norm": 0.1270572990179062, + "learning_rate": 9.161416665401606e-05, + "loss": 1.54220609664917, + "step": 88970 + }, + { + "epoch": 0.2693384589771919, + "grad_norm": 0.1091061607003212, + "learning_rate": 9.161037147237868e-05, + "loss": 1.5463956832885741, + "step": 88980 + }, + { + "epoch": 0.26936872852753774, + "grad_norm": 0.11384771764278412, + "learning_rate": 9.160657629074127e-05, + "loss": 1.597226905822754, + "step": 88990 + }, + { + "epoch": 0.26939899807788353, + "grad_norm": 0.11520591378211975, + "learning_rate": 9.160278110910389e-05, + "loss": 1.5560331344604492, + "step": 89000 + }, + { + "epoch": 0.26939899807788353, + "eval_loss": 1.57558274269104, + "eval_runtime": 28.2876, + "eval_samples_per_second": 17.676, + "eval_steps_per_second": 1.131, + "step": 89000 + }, + { + "epoch": 0.2694292676282294, + "grad_norm": 0.12631483376026154, + "learning_rate": 9.159898592746649e-05, + "loss": 1.5571019172668457, + "step": 89010 + }, + { + "epoch": 0.26945953717857524, + "grad_norm": 0.11238466948270798, + "learning_rate": 9.15951907458291e-05, + "loss": 1.593623161315918, + "step": 89020 + }, + { + "epoch": 0.26948980672892103, + "grad_norm": 0.12262498587369919, + "learning_rate": 9.15913955641917e-05, + "loss": 1.549989891052246, + "step": 89030 + }, + { + "epoch": 0.2695200762792669, + "grad_norm": 0.10022415220737457, + "learning_rate": 9.158760038255432e-05, + "loss": 1.5654738426208497, + "step": 89040 + }, + { + "epoch": 0.2695503458296127, + "grad_norm": 0.1128619834780693, + "learning_rate": 9.158380520091691e-05, + "loss": 1.5093510627746582, + "step": 89050 + }, + { + "epoch": 0.26958061537995853, + "grad_norm": 0.11764172464609146, + "learning_rate": 9.158001001927953e-05, + "loss": 1.5529424667358398, + "step": 89060 + }, + { + "epoch": 0.2696108849303044, + "grad_norm": 0.12005499750375748, + "learning_rate": 9.157621483764212e-05, + "loss": 1.5668745040893555, + "step": 89070 + }, + { + "epoch": 0.2696411544806502, + "grad_norm": 0.12400714308023453, + "learning_rate": 9.157241965600474e-05, + "loss": 1.560787296295166, + "step": 89080 + }, + { + "epoch": 0.26967142403099603, + "grad_norm": 0.12632063031196594, + "learning_rate": 9.156862447436735e-05, + "loss": 1.5819705963134765, + "step": 89090 + }, + { + "epoch": 0.26970169358134183, + "grad_norm": 0.11003688722848892, + "learning_rate": 9.156482929272995e-05, + "loss": 1.5502781867980957, + "step": 89100 + }, + { + "epoch": 0.2697319631316877, + "grad_norm": 0.11744635552167892, + "learning_rate": 9.156103411109256e-05, + "loss": 1.5431484222412108, + "step": 89110 + }, + { + "epoch": 0.26976223268203353, + "grad_norm": 0.1082896739244461, + "learning_rate": 9.155723892945516e-05, + "loss": 1.5642736434936524, + "step": 89120 + }, + { + "epoch": 0.2697925022323793, + "grad_norm": 0.11831878125667572, + "learning_rate": 9.155344374781777e-05, + "loss": 1.5622350692749023, + "step": 89130 + }, + { + "epoch": 0.2698227717827252, + "grad_norm": 0.11643270403146744, + "learning_rate": 9.154964856618038e-05, + "loss": 1.548143196105957, + "step": 89140 + }, + { + "epoch": 0.269853041333071, + "grad_norm": 0.10840179026126862, + "learning_rate": 9.1545853384543e-05, + "loss": 1.5415657043457032, + "step": 89150 + }, + { + "epoch": 0.2698833108834168, + "grad_norm": 0.12606674432754517, + "learning_rate": 9.154205820290559e-05, + "loss": 1.5820375442504884, + "step": 89160 + }, + { + "epoch": 0.2699135804337627, + "grad_norm": 0.10562044382095337, + "learning_rate": 9.153826302126821e-05, + "loss": 1.5423173904418945, + "step": 89170 + }, + { + "epoch": 0.2699438499841085, + "grad_norm": 0.12325003743171692, + "learning_rate": 9.15344678396308e-05, + "loss": 1.5847305297851562, + "step": 89180 + }, + { + "epoch": 0.2699741195344543, + "grad_norm": 0.11077068746089935, + "learning_rate": 9.153067265799342e-05, + "loss": 1.573099136352539, + "step": 89190 + }, + { + "epoch": 0.2700043890848001, + "grad_norm": 0.10969087481498718, + "learning_rate": 9.152687747635603e-05, + "loss": 1.5777740478515625, + "step": 89200 + }, + { + "epoch": 0.270034658635146, + "grad_norm": 0.1206601932644844, + "learning_rate": 9.152308229471863e-05, + "loss": 1.5601160049438476, + "step": 89210 + }, + { + "epoch": 0.2700649281854918, + "grad_norm": 0.1107545718550682, + "learning_rate": 9.151928711308124e-05, + "loss": 1.5511960983276367, + "step": 89220 + }, + { + "epoch": 0.2700951977358376, + "grad_norm": 0.12426581978797913, + "learning_rate": 9.151549193144384e-05, + "loss": 1.559015655517578, + "step": 89230 + }, + { + "epoch": 0.2701254672861835, + "grad_norm": 0.1048559620976448, + "learning_rate": 9.151169674980645e-05, + "loss": 1.5253814697265624, + "step": 89240 + }, + { + "epoch": 0.27015573683652927, + "grad_norm": 0.11785886436700821, + "learning_rate": 9.150790156816905e-05, + "loss": 1.540824031829834, + "step": 89250 + }, + { + "epoch": 0.2701860063868751, + "grad_norm": 0.11246800422668457, + "learning_rate": 9.150410638653166e-05, + "loss": 1.563809871673584, + "step": 89260 + }, + { + "epoch": 0.270216275937221, + "grad_norm": 0.1137789785861969, + "learning_rate": 9.150031120489427e-05, + "loss": 1.5081686973571777, + "step": 89270 + }, + { + "epoch": 0.27024654548756677, + "grad_norm": 0.1238531693816185, + "learning_rate": 9.149651602325687e-05, + "loss": 1.5161523818969727, + "step": 89280 + }, + { + "epoch": 0.2702768150379126, + "grad_norm": 0.10760823637247086, + "learning_rate": 9.149272084161948e-05, + "loss": 1.5983426094055175, + "step": 89290 + }, + { + "epoch": 0.2703070845882584, + "grad_norm": 0.10796301066875458, + "learning_rate": 9.148892565998208e-05, + "loss": 1.5437753677368165, + "step": 89300 + }, + { + "epoch": 0.27033735413860427, + "grad_norm": 0.10700767487287521, + "learning_rate": 9.148513047834469e-05, + "loss": 1.536837100982666, + "step": 89310 + }, + { + "epoch": 0.2703676236889501, + "grad_norm": 0.11465786397457123, + "learning_rate": 9.14813352967073e-05, + "loss": 1.5654640197753906, + "step": 89320 + }, + { + "epoch": 0.2703978932392959, + "grad_norm": 0.1263340711593628, + "learning_rate": 9.147754011506992e-05, + "loss": 1.6016826629638672, + "step": 89330 + }, + { + "epoch": 0.27042816278964177, + "grad_norm": 0.1198907196521759, + "learning_rate": 9.147374493343251e-05, + "loss": 1.5402187347412108, + "step": 89340 + }, + { + "epoch": 0.27045843233998756, + "grad_norm": 0.11580294370651245, + "learning_rate": 9.146994975179513e-05, + "loss": 1.565338134765625, + "step": 89350 + }, + { + "epoch": 0.2704887018903334, + "grad_norm": 0.10759197920560837, + "learning_rate": 9.146615457015773e-05, + "loss": 1.5762847900390624, + "step": 89360 + }, + { + "epoch": 0.27051897144067927, + "grad_norm": 0.11336836218833923, + "learning_rate": 9.146235938852034e-05, + "loss": 1.5507330894470215, + "step": 89370 + }, + { + "epoch": 0.27054924099102506, + "grad_norm": 0.11497315019369125, + "learning_rate": 9.145856420688295e-05, + "loss": 1.5664993286132813, + "step": 89380 + }, + { + "epoch": 0.2705795105413709, + "grad_norm": 0.12299793213605881, + "learning_rate": 9.145476902524555e-05, + "loss": 1.6068702697753907, + "step": 89390 + }, + { + "epoch": 0.2706097800917167, + "grad_norm": 0.12173738330602646, + "learning_rate": 9.145097384360816e-05, + "loss": 1.5649529457092286, + "step": 89400 + }, + { + "epoch": 0.27064004964206256, + "grad_norm": 0.11087767034769058, + "learning_rate": 9.144717866197076e-05, + "loss": 1.5223225593566894, + "step": 89410 + }, + { + "epoch": 0.2706703191924084, + "grad_norm": 0.11031627655029297, + "learning_rate": 9.144338348033337e-05, + "loss": 1.5748277664184571, + "step": 89420 + }, + { + "epoch": 0.2707005887427542, + "grad_norm": 0.12832584977149963, + "learning_rate": 9.143958829869598e-05, + "loss": 1.5781057357788086, + "step": 89430 + }, + { + "epoch": 0.27073085829310006, + "grad_norm": 0.12712959945201874, + "learning_rate": 9.143579311705858e-05, + "loss": 1.559331226348877, + "step": 89440 + }, + { + "epoch": 0.27076112784344586, + "grad_norm": 0.12894649803638458, + "learning_rate": 9.143199793542119e-05, + "loss": 1.5926494598388672, + "step": 89450 + }, + { + "epoch": 0.2707913973937917, + "grad_norm": 0.12203389406204224, + "learning_rate": 9.142820275378381e-05, + "loss": 1.5277804374694823, + "step": 89460 + }, + { + "epoch": 0.27082166694413756, + "grad_norm": 0.11631659418344498, + "learning_rate": 9.14244075721464e-05, + "loss": 1.5726016998291015, + "step": 89470 + }, + { + "epoch": 0.27085193649448336, + "grad_norm": 0.12344670295715332, + "learning_rate": 9.142061239050902e-05, + "loss": 1.554189109802246, + "step": 89480 + }, + { + "epoch": 0.2708822060448292, + "grad_norm": 0.11751291900873184, + "learning_rate": 9.141681720887161e-05, + "loss": 1.5784242630004883, + "step": 89490 + }, + { + "epoch": 0.270912475595175, + "grad_norm": 0.12012296169996262, + "learning_rate": 9.141302202723423e-05, + "loss": 1.562077522277832, + "step": 89500 + }, + { + "epoch": 0.270912475595175, + "eval_loss": 1.5752215385437012, + "eval_runtime": 28.2166, + "eval_samples_per_second": 17.72, + "eval_steps_per_second": 1.134, + "step": 89500 + }, + { + "epoch": 0.27094274514552086, + "grad_norm": 0.11223658174276352, + "learning_rate": 9.140922684559682e-05, + "loss": 1.5598932266235352, + "step": 89510 + }, + { + "epoch": 0.2709730146958667, + "grad_norm": 0.11283688992261887, + "learning_rate": 9.140543166395944e-05, + "loss": 1.5905906677246093, + "step": 89520 + }, + { + "epoch": 0.2710032842462125, + "grad_norm": 0.12555982172489166, + "learning_rate": 9.140163648232204e-05, + "loss": 1.5608022689819336, + "step": 89530 + }, + { + "epoch": 0.27103355379655836, + "grad_norm": 0.11470445245504379, + "learning_rate": 9.139784130068465e-05, + "loss": 1.6047355651855468, + "step": 89540 + }, + { + "epoch": 0.2710638233469042, + "grad_norm": 0.12141841650009155, + "learning_rate": 9.139404611904725e-05, + "loss": 1.5649816513061523, + "step": 89550 + }, + { + "epoch": 0.27109409289725, + "grad_norm": 0.10657202452421188, + "learning_rate": 9.139025093740987e-05, + "loss": 1.5760648727416993, + "step": 89560 + }, + { + "epoch": 0.27112436244759586, + "grad_norm": 0.12179569154977798, + "learning_rate": 9.138645575577249e-05, + "loss": 1.588473129272461, + "step": 89570 + }, + { + "epoch": 0.27115463199794165, + "grad_norm": 0.10851775109767914, + "learning_rate": 9.138266057413508e-05, + "loss": 1.5692558288574219, + "step": 89580 + }, + { + "epoch": 0.2711849015482875, + "grad_norm": 0.11621476709842682, + "learning_rate": 9.13788653924977e-05, + "loss": 1.5559113502502442, + "step": 89590 + }, + { + "epoch": 0.27121517109863336, + "grad_norm": 0.11620312184095383, + "learning_rate": 9.137507021086029e-05, + "loss": 1.5828896522521974, + "step": 89600 + }, + { + "epoch": 0.27124544064897915, + "grad_norm": 0.12007173150777817, + "learning_rate": 9.137127502922291e-05, + "loss": 1.541710090637207, + "step": 89610 + }, + { + "epoch": 0.271275710199325, + "grad_norm": 0.12812615931034088, + "learning_rate": 9.13674798475855e-05, + "loss": 1.5652867317199708, + "step": 89620 + }, + { + "epoch": 0.2713059797496708, + "grad_norm": 0.11803650110960007, + "learning_rate": 9.136368466594812e-05, + "loss": 1.5239432334899903, + "step": 89630 + }, + { + "epoch": 0.27133624930001665, + "grad_norm": 0.11244877427816391, + "learning_rate": 9.135988948431071e-05, + "loss": 1.5775447845458985, + "step": 89640 + }, + { + "epoch": 0.2713665188503625, + "grad_norm": 0.1161750927567482, + "learning_rate": 9.135609430267333e-05, + "loss": 1.591012954711914, + "step": 89650 + }, + { + "epoch": 0.2713967884007083, + "grad_norm": 0.1106572300195694, + "learning_rate": 9.135229912103593e-05, + "loss": 1.5434497833251952, + "step": 89660 + }, + { + "epoch": 0.27142705795105415, + "grad_norm": 0.12225264310836792, + "learning_rate": 9.134850393939855e-05, + "loss": 1.5537638664245605, + "step": 89670 + }, + { + "epoch": 0.27145732750139995, + "grad_norm": 0.13298529386520386, + "learning_rate": 9.134470875776114e-05, + "loss": 1.5779340744018555, + "step": 89680 + }, + { + "epoch": 0.2714875970517458, + "grad_norm": 0.12202991545200348, + "learning_rate": 9.134091357612376e-05, + "loss": 1.5294986724853517, + "step": 89690 + }, + { + "epoch": 0.27151786660209165, + "grad_norm": 0.12877458333969116, + "learning_rate": 9.133711839448636e-05, + "loss": 1.5662894248962402, + "step": 89700 + }, + { + "epoch": 0.27154813615243745, + "grad_norm": 0.11308691650629044, + "learning_rate": 9.133332321284897e-05, + "loss": 1.5825716972351074, + "step": 89710 + }, + { + "epoch": 0.2715784057027833, + "grad_norm": 0.11131920665502548, + "learning_rate": 9.132952803121158e-05, + "loss": 1.587181282043457, + "step": 89720 + }, + { + "epoch": 0.2716086752531291, + "grad_norm": 0.12041161209344864, + "learning_rate": 9.132573284957418e-05, + "loss": 1.574162483215332, + "step": 89730 + }, + { + "epoch": 0.27163894480347495, + "grad_norm": 0.14123164117336273, + "learning_rate": 9.132193766793679e-05, + "loss": 1.5602937698364259, + "step": 89740 + }, + { + "epoch": 0.2716692143538208, + "grad_norm": 0.11335520446300507, + "learning_rate": 9.131814248629939e-05, + "loss": 1.5456576347351074, + "step": 89750 + }, + { + "epoch": 0.2716994839041666, + "grad_norm": 0.11420676857233047, + "learning_rate": 9.131434730466201e-05, + "loss": 1.5717716217041016, + "step": 89760 + }, + { + "epoch": 0.27172975345451245, + "grad_norm": 0.11413571238517761, + "learning_rate": 9.13105521230246e-05, + "loss": 1.589807891845703, + "step": 89770 + }, + { + "epoch": 0.27176002300485824, + "grad_norm": 0.12333178520202637, + "learning_rate": 9.130675694138722e-05, + "loss": 1.5462658882141114, + "step": 89780 + }, + { + "epoch": 0.2717902925552041, + "grad_norm": 0.12809301912784576, + "learning_rate": 9.130296175974982e-05, + "loss": 1.5670344352722168, + "step": 89790 + }, + { + "epoch": 0.27182056210554995, + "grad_norm": 0.1103411465883255, + "learning_rate": 9.129916657811244e-05, + "loss": 1.6112916946411133, + "step": 89800 + }, + { + "epoch": 0.27185083165589574, + "grad_norm": 0.10908594727516174, + "learning_rate": 9.129537139647503e-05, + "loss": 1.5500388145446777, + "step": 89810 + }, + { + "epoch": 0.2718811012062416, + "grad_norm": 0.13088922202587128, + "learning_rate": 9.129157621483765e-05, + "loss": 1.591609573364258, + "step": 89820 + }, + { + "epoch": 0.2719113707565874, + "grad_norm": 0.12004324048757553, + "learning_rate": 9.128778103320025e-05, + "loss": 1.552328109741211, + "step": 89830 + }, + { + "epoch": 0.27194164030693324, + "grad_norm": 0.12308602035045624, + "learning_rate": 9.128398585156286e-05, + "loss": 1.5553101539611816, + "step": 89840 + }, + { + "epoch": 0.2719719098572791, + "grad_norm": 0.11234956979751587, + "learning_rate": 9.128019066992547e-05, + "loss": 1.5670767784118653, + "step": 89850 + }, + { + "epoch": 0.2720021794076249, + "grad_norm": 0.12189526110887527, + "learning_rate": 9.127639548828807e-05, + "loss": 1.597097110748291, + "step": 89860 + }, + { + "epoch": 0.27203244895797074, + "grad_norm": 0.11212532222270966, + "learning_rate": 9.127260030665068e-05, + "loss": 1.5559358596801758, + "step": 89870 + }, + { + "epoch": 0.27206271850831654, + "grad_norm": 0.11749012023210526, + "learning_rate": 9.126880512501328e-05, + "loss": 1.5788442611694335, + "step": 89880 + }, + { + "epoch": 0.2720929880586624, + "grad_norm": 0.12719418108463287, + "learning_rate": 9.126500994337589e-05, + "loss": 1.560490608215332, + "step": 89890 + }, + { + "epoch": 0.27212325760900824, + "grad_norm": 0.11300636827945709, + "learning_rate": 9.12612147617385e-05, + "loss": 1.5771971702575684, + "step": 89900 + }, + { + "epoch": 0.27215352715935404, + "grad_norm": 0.11345294862985611, + "learning_rate": 9.12574195801011e-05, + "loss": 1.5497756958007813, + "step": 89910 + }, + { + "epoch": 0.2721837967096999, + "grad_norm": 0.10913224518299103, + "learning_rate": 9.125362439846371e-05, + "loss": 1.5826591491699218, + "step": 89920 + }, + { + "epoch": 0.2722140662600457, + "grad_norm": 0.10024125874042511, + "learning_rate": 9.124982921682631e-05, + "loss": 1.5804594039916993, + "step": 89930 + }, + { + "epoch": 0.27224433581039154, + "grad_norm": 0.11524154245853424, + "learning_rate": 9.124603403518893e-05, + "loss": 1.546432876586914, + "step": 89940 + }, + { + "epoch": 0.2722746053607374, + "grad_norm": 0.11451155692338943, + "learning_rate": 9.124223885355153e-05, + "loss": 1.5661322593688964, + "step": 89950 + }, + { + "epoch": 0.2723048749110832, + "grad_norm": 0.11441566795110703, + "learning_rate": 9.123844367191415e-05, + "loss": 1.5503805160522461, + "step": 89960 + }, + { + "epoch": 0.27233514446142904, + "grad_norm": 0.103434257209301, + "learning_rate": 9.123464849027675e-05, + "loss": 1.590345859527588, + "step": 89970 + }, + { + "epoch": 0.27236541401177483, + "grad_norm": 0.11863724887371063, + "learning_rate": 9.123085330863936e-05, + "loss": 1.575749969482422, + "step": 89980 + }, + { + "epoch": 0.2723956835621207, + "grad_norm": 0.11284638941287994, + "learning_rate": 9.122705812700196e-05, + "loss": 1.540176010131836, + "step": 89990 + }, + { + "epoch": 0.27242595311246653, + "grad_norm": 0.10858235508203506, + "learning_rate": 9.122326294536457e-05, + "loss": 1.5683453559875489, + "step": 90000 + }, + { + "epoch": 0.27242595311246653, + "eval_loss": 1.5580254793167114, + "eval_runtime": 28.1211, + "eval_samples_per_second": 17.78, + "eval_steps_per_second": 1.138, + "step": 90000 + }, + { + "epoch": 0.27245622266281233, + "grad_norm": 0.11096826195716858, + "learning_rate": 9.121946776372717e-05, + "loss": 1.592483901977539, + "step": 90010 + }, + { + "epoch": 0.2724864922131582, + "grad_norm": 0.11360183358192444, + "learning_rate": 9.121567258208978e-05, + "loss": 1.5806102752685547, + "step": 90020 + }, + { + "epoch": 0.272516761763504, + "grad_norm": 0.10660221427679062, + "learning_rate": 9.121187740045239e-05, + "loss": 1.5211130142211915, + "step": 90030 + }, + { + "epoch": 0.27254703131384983, + "grad_norm": 0.12351547181606293, + "learning_rate": 9.120808221881499e-05, + "loss": 1.5551770210266114, + "step": 90040 + }, + { + "epoch": 0.2725773008641957, + "grad_norm": 0.10221586376428604, + "learning_rate": 9.12042870371776e-05, + "loss": 1.581441879272461, + "step": 90050 + }, + { + "epoch": 0.2726075704145415, + "grad_norm": 0.11349542438983917, + "learning_rate": 9.12004918555402e-05, + "loss": 1.5664788246154786, + "step": 90060 + }, + { + "epoch": 0.27263783996488733, + "grad_norm": 0.10842281579971313, + "learning_rate": 9.119669667390282e-05, + "loss": 1.556788444519043, + "step": 90070 + }, + { + "epoch": 0.2726681095152331, + "grad_norm": 0.11789824068546295, + "learning_rate": 9.119290149226542e-05, + "loss": 1.5587334632873535, + "step": 90080 + }, + { + "epoch": 0.272698379065579, + "grad_norm": 0.10828489065170288, + "learning_rate": 9.118910631062804e-05, + "loss": 1.5743921279907227, + "step": 90090 + }, + { + "epoch": 0.27272864861592483, + "grad_norm": 0.1072641983628273, + "learning_rate": 9.118531112899063e-05, + "loss": 1.5678722381591796, + "step": 90100 + }, + { + "epoch": 0.2727589181662706, + "grad_norm": 0.11395180225372314, + "learning_rate": 9.118151594735325e-05, + "loss": 1.553251075744629, + "step": 90110 + }, + { + "epoch": 0.2727891877166165, + "grad_norm": 0.12317243218421936, + "learning_rate": 9.117772076571584e-05, + "loss": 1.5578421592712401, + "step": 90120 + }, + { + "epoch": 0.2728194572669623, + "grad_norm": 0.12167427688837051, + "learning_rate": 9.117392558407846e-05, + "loss": 1.5221607208251953, + "step": 90130 + }, + { + "epoch": 0.2728497268173081, + "grad_norm": 0.12172571569681168, + "learning_rate": 9.117013040244105e-05, + "loss": 1.5842538833618165, + "step": 90140 + }, + { + "epoch": 0.272879996367654, + "grad_norm": 0.1188463345170021, + "learning_rate": 9.116633522080367e-05, + "loss": 1.5482683181762695, + "step": 90150 + }, + { + "epoch": 0.2729102659179998, + "grad_norm": 0.11998995393514633, + "learning_rate": 9.116254003916626e-05, + "loss": 1.5798027038574218, + "step": 90160 + }, + { + "epoch": 0.2729405354683456, + "grad_norm": 0.114966481924057, + "learning_rate": 9.115874485752888e-05, + "loss": 1.554872989654541, + "step": 90170 + }, + { + "epoch": 0.2729708050186914, + "grad_norm": 0.12549589574337006, + "learning_rate": 9.11549496758915e-05, + "loss": 1.5929462432861328, + "step": 90180 + }, + { + "epoch": 0.27300107456903727, + "grad_norm": 0.10798484832048416, + "learning_rate": 9.11511544942541e-05, + "loss": 1.5689573287963867, + "step": 90190 + }, + { + "epoch": 0.2730313441193831, + "grad_norm": 0.10022260993719101, + "learning_rate": 9.114735931261672e-05, + "loss": 1.535384464263916, + "step": 90200 + }, + { + "epoch": 0.2730616136697289, + "grad_norm": 0.11235295236110687, + "learning_rate": 9.114356413097931e-05, + "loss": 1.5809404373168945, + "step": 90210 + }, + { + "epoch": 0.27309188322007477, + "grad_norm": 0.10567133873701096, + "learning_rate": 9.113976894934193e-05, + "loss": 1.5686355590820313, + "step": 90220 + }, + { + "epoch": 0.2731221527704206, + "grad_norm": 0.11239752173423767, + "learning_rate": 9.113597376770452e-05, + "loss": 1.5503412246704102, + "step": 90230 + }, + { + "epoch": 0.2731524223207664, + "grad_norm": 0.12256769835948944, + "learning_rate": 9.113217858606714e-05, + "loss": 1.567904281616211, + "step": 90240 + }, + { + "epoch": 0.27318269187111227, + "grad_norm": 0.12520147860050201, + "learning_rate": 9.112838340442973e-05, + "loss": 1.591052532196045, + "step": 90250 + }, + { + "epoch": 0.27321296142145807, + "grad_norm": 0.10982415825128555, + "learning_rate": 9.112458822279235e-05, + "loss": 1.560897159576416, + "step": 90260 + }, + { + "epoch": 0.2732432309718039, + "grad_norm": 0.12231066823005676, + "learning_rate": 9.112079304115494e-05, + "loss": 1.6013908386230469, + "step": 90270 + }, + { + "epoch": 0.27327350052214977, + "grad_norm": 0.11535049974918365, + "learning_rate": 9.111699785951756e-05, + "loss": 1.5623706817626952, + "step": 90280 + }, + { + "epoch": 0.27330377007249557, + "grad_norm": 0.13822075724601746, + "learning_rate": 9.111320267788016e-05, + "loss": 1.5425867080688476, + "step": 90290 + }, + { + "epoch": 0.2733340396228414, + "grad_norm": 0.11495111137628555, + "learning_rate": 9.110940749624277e-05, + "loss": 1.5558485984802246, + "step": 90300 + }, + { + "epoch": 0.2733643091731872, + "grad_norm": 0.11481396108865738, + "learning_rate": 9.110561231460538e-05, + "loss": 1.544989013671875, + "step": 90310 + }, + { + "epoch": 0.27339457872353307, + "grad_norm": 0.125390887260437, + "learning_rate": 9.110181713296799e-05, + "loss": 1.5856006622314454, + "step": 90320 + }, + { + "epoch": 0.2734248482738789, + "grad_norm": 0.1212209090590477, + "learning_rate": 9.109802195133059e-05, + "loss": 1.5793473243713378, + "step": 90330 + }, + { + "epoch": 0.2734551178242247, + "grad_norm": 0.1097758561372757, + "learning_rate": 9.10942267696932e-05, + "loss": 1.5650684356689453, + "step": 90340 + }, + { + "epoch": 0.27348538737457057, + "grad_norm": 0.11472705751657486, + "learning_rate": 9.10904315880558e-05, + "loss": 1.5665544509887694, + "step": 90350 + }, + { + "epoch": 0.27351565692491636, + "grad_norm": 0.125965878367424, + "learning_rate": 9.108663640641841e-05, + "loss": 1.540603256225586, + "step": 90360 + }, + { + "epoch": 0.2735459264752622, + "grad_norm": 0.11375916004180908, + "learning_rate": 9.108284122478103e-05, + "loss": 1.5617856979370117, + "step": 90370 + }, + { + "epoch": 0.27357619602560806, + "grad_norm": 0.11510824412107468, + "learning_rate": 9.107904604314362e-05, + "loss": 1.551516342163086, + "step": 90380 + }, + { + "epoch": 0.27360646557595386, + "grad_norm": 0.12635943293571472, + "learning_rate": 9.107525086150624e-05, + "loss": 1.5470453262329102, + "step": 90390 + }, + { + "epoch": 0.2736367351262997, + "grad_norm": 0.11349909007549286, + "learning_rate": 9.107145567986883e-05, + "loss": 1.5492664337158204, + "step": 90400 + }, + { + "epoch": 0.2736670046766455, + "grad_norm": 0.10158911347389221, + "learning_rate": 9.106766049823145e-05, + "loss": 1.581334114074707, + "step": 90410 + }, + { + "epoch": 0.27369727422699136, + "grad_norm": 0.117283895611763, + "learning_rate": 9.106386531659405e-05, + "loss": 1.569573974609375, + "step": 90420 + }, + { + "epoch": 0.2737275437773372, + "grad_norm": 0.11261020600795746, + "learning_rate": 9.106007013495667e-05, + "loss": 1.5761729240417481, + "step": 90430 + }, + { + "epoch": 0.273757813327683, + "grad_norm": 0.12376124411821365, + "learning_rate": 9.105627495331927e-05, + "loss": 1.5652192115783692, + "step": 90440 + }, + { + "epoch": 0.27378808287802886, + "grad_norm": 0.11861404031515121, + "learning_rate": 9.105247977168188e-05, + "loss": 1.5746366500854492, + "step": 90450 + }, + { + "epoch": 0.27381835242837466, + "grad_norm": 0.12213215231895447, + "learning_rate": 9.104868459004448e-05, + "loss": 1.5754770278930663, + "step": 90460 + }, + { + "epoch": 0.2738486219787205, + "grad_norm": 0.12179671972990036, + "learning_rate": 9.104488940840709e-05, + "loss": 1.5846445083618164, + "step": 90470 + }, + { + "epoch": 0.27387889152906636, + "grad_norm": 0.1304326206445694, + "learning_rate": 9.10410942267697e-05, + "loss": 1.5676584243774414, + "step": 90480 + }, + { + "epoch": 0.27390916107941216, + "grad_norm": 0.12651847302913666, + "learning_rate": 9.10372990451323e-05, + "loss": 1.5861963272094726, + "step": 90490 + }, + { + "epoch": 0.273939430629758, + "grad_norm": 0.12136288732290268, + "learning_rate": 9.103350386349491e-05, + "loss": 1.5672385215759277, + "step": 90500 + }, + { + "epoch": 0.273939430629758, + "eval_loss": 1.5677893161773682, + "eval_runtime": 28.187, + "eval_samples_per_second": 17.739, + "eval_steps_per_second": 1.135, + "step": 90500 + }, + { + "epoch": 0.2739697001801038, + "grad_norm": 0.10439752787351608, + "learning_rate": 9.102970868185751e-05, + "loss": 1.576871681213379, + "step": 90510 + }, + { + "epoch": 0.27399996973044966, + "grad_norm": 0.12627694010734558, + "learning_rate": 9.102591350022012e-05, + "loss": 1.5800885200500487, + "step": 90520 + }, + { + "epoch": 0.2740302392807955, + "grad_norm": 0.12160736322402954, + "learning_rate": 9.102211831858273e-05, + "loss": 1.5580883026123047, + "step": 90530 + }, + { + "epoch": 0.2740605088311413, + "grad_norm": 0.11467396467924118, + "learning_rate": 9.101832313694533e-05, + "loss": 1.543203639984131, + "step": 90540 + }, + { + "epoch": 0.27409077838148715, + "grad_norm": 0.13158726692199707, + "learning_rate": 9.101452795530794e-05, + "loss": 1.57069730758667, + "step": 90550 + }, + { + "epoch": 0.27412104793183295, + "grad_norm": 0.11042142659425735, + "learning_rate": 9.101073277367054e-05, + "loss": 1.619961929321289, + "step": 90560 + }, + { + "epoch": 0.2741513174821788, + "grad_norm": 0.11952652782201767, + "learning_rate": 9.100693759203316e-05, + "loss": 1.5654067993164062, + "step": 90570 + }, + { + "epoch": 0.27418158703252465, + "grad_norm": 0.11354757845401764, + "learning_rate": 9.100314241039577e-05, + "loss": 1.5552720069885253, + "step": 90580 + }, + { + "epoch": 0.27421185658287045, + "grad_norm": 0.1129012480378151, + "learning_rate": 9.099934722875837e-05, + "loss": 1.5716693878173829, + "step": 90590 + }, + { + "epoch": 0.2742421261332163, + "grad_norm": 0.10285383462905884, + "learning_rate": 9.099555204712098e-05, + "loss": 1.5637946128845215, + "step": 90600 + }, + { + "epoch": 0.2742723956835621, + "grad_norm": 0.12275930494070053, + "learning_rate": 9.099175686548359e-05, + "loss": 1.572026824951172, + "step": 90610 + }, + { + "epoch": 0.27430266523390795, + "grad_norm": 0.11530806124210358, + "learning_rate": 9.098796168384619e-05, + "loss": 1.599496555328369, + "step": 90620 + }, + { + "epoch": 0.2743329347842538, + "grad_norm": 0.1132160872220993, + "learning_rate": 9.09841665022088e-05, + "loss": 1.5546457290649414, + "step": 90630 + }, + { + "epoch": 0.2743632043345996, + "grad_norm": 0.12332917004823685, + "learning_rate": 9.09803713205714e-05, + "loss": 1.5207347869873047, + "step": 90640 + }, + { + "epoch": 0.27439347388494545, + "grad_norm": 0.12467624992132187, + "learning_rate": 9.097657613893401e-05, + "loss": 1.5953221321105957, + "step": 90650 + }, + { + "epoch": 0.27442374343529125, + "grad_norm": 0.10384870320558548, + "learning_rate": 9.097278095729662e-05, + "loss": 1.5445745468139649, + "step": 90660 + }, + { + "epoch": 0.2744540129856371, + "grad_norm": 0.11754357069730759, + "learning_rate": 9.096898577565922e-05, + "loss": 1.632075309753418, + "step": 90670 + }, + { + "epoch": 0.27448428253598295, + "grad_norm": 0.1229703351855278, + "learning_rate": 9.096519059402184e-05, + "loss": 1.5511414527893066, + "step": 90680 + }, + { + "epoch": 0.27451455208632874, + "grad_norm": 0.112674281001091, + "learning_rate": 9.096139541238443e-05, + "loss": 1.558572769165039, + "step": 90690 + }, + { + "epoch": 0.2745448216366746, + "grad_norm": 0.11986600607633591, + "learning_rate": 9.095760023074705e-05, + "loss": 1.5561944961547851, + "step": 90700 + }, + { + "epoch": 0.2745750911870204, + "grad_norm": 0.12737591564655304, + "learning_rate": 9.095380504910965e-05, + "loss": 1.5758401870727539, + "step": 90710 + }, + { + "epoch": 0.27460536073736624, + "grad_norm": 0.1059548556804657, + "learning_rate": 9.095000986747227e-05, + "loss": 1.598928451538086, + "step": 90720 + }, + { + "epoch": 0.2746356302877121, + "grad_norm": 0.11058369278907776, + "learning_rate": 9.094621468583486e-05, + "loss": 1.5620124816894532, + "step": 90730 + }, + { + "epoch": 0.2746658998380579, + "grad_norm": 0.12116988003253937, + "learning_rate": 9.094241950419748e-05, + "loss": 1.5803701400756835, + "step": 90740 + }, + { + "epoch": 0.27469616938840374, + "grad_norm": 0.1279696226119995, + "learning_rate": 9.093862432256007e-05, + "loss": 1.573841667175293, + "step": 90750 + }, + { + "epoch": 0.27472643893874954, + "grad_norm": 0.1109309121966362, + "learning_rate": 9.093482914092269e-05, + "loss": 1.5646953582763672, + "step": 90760 + }, + { + "epoch": 0.2747567084890954, + "grad_norm": 0.11700807511806488, + "learning_rate": 9.093103395928528e-05, + "loss": 1.5787593841552734, + "step": 90770 + }, + { + "epoch": 0.27478697803944124, + "grad_norm": 0.10940288007259369, + "learning_rate": 9.09272387776479e-05, + "loss": 1.5508813858032227, + "step": 90780 + }, + { + "epoch": 0.27481724758978704, + "grad_norm": 0.12505637109279633, + "learning_rate": 9.092344359601051e-05, + "loss": 1.5060505867004395, + "step": 90790 + }, + { + "epoch": 0.2748475171401329, + "grad_norm": 0.09389064460992813, + "learning_rate": 9.091964841437311e-05, + "loss": 1.5697928428649903, + "step": 90800 + }, + { + "epoch": 0.2748777866904787, + "grad_norm": 0.11268475651741028, + "learning_rate": 9.091585323273573e-05, + "loss": 1.5761097908020019, + "step": 90810 + }, + { + "epoch": 0.27490805624082454, + "grad_norm": 0.12870994210243225, + "learning_rate": 9.091205805109832e-05, + "loss": 1.5530256271362304, + "step": 90820 + }, + { + "epoch": 0.2749383257911704, + "grad_norm": 0.11203250288963318, + "learning_rate": 9.090826286946094e-05, + "loss": 1.562298583984375, + "step": 90830 + }, + { + "epoch": 0.2749685953415162, + "grad_norm": 0.12588630616664886, + "learning_rate": 9.090446768782354e-05, + "loss": 1.5308981895446778, + "step": 90840 + }, + { + "epoch": 0.27499886489186204, + "grad_norm": 0.10581358522176743, + "learning_rate": 9.090067250618616e-05, + "loss": 1.5785225868225097, + "step": 90850 + }, + { + "epoch": 0.27502913444220783, + "grad_norm": 0.12246003746986389, + "learning_rate": 9.089687732454875e-05, + "loss": 1.5594483375549317, + "step": 90860 + }, + { + "epoch": 0.2750594039925537, + "grad_norm": 0.11575008928775787, + "learning_rate": 9.089308214291137e-05, + "loss": 1.5271173477172852, + "step": 90870 + }, + { + "epoch": 0.27508967354289954, + "grad_norm": 0.11989818513393402, + "learning_rate": 9.088928696127396e-05, + "loss": 1.5744831085205078, + "step": 90880 + }, + { + "epoch": 0.27511994309324533, + "grad_norm": 0.12626205384731293, + "learning_rate": 9.088549177963658e-05, + "loss": 1.5247433662414551, + "step": 90890 + }, + { + "epoch": 0.2751502126435912, + "grad_norm": 0.1166435182094574, + "learning_rate": 9.088169659799917e-05, + "loss": 1.5561450004577637, + "step": 90900 + }, + { + "epoch": 0.275180482193937, + "grad_norm": 0.12521252036094666, + "learning_rate": 9.087790141636179e-05, + "loss": 1.56536808013916, + "step": 90910 + }, + { + "epoch": 0.27521075174428283, + "grad_norm": 0.11016285419464111, + "learning_rate": 9.087410623472438e-05, + "loss": 1.5933469772338866, + "step": 90920 + }, + { + "epoch": 0.2752410212946287, + "grad_norm": 0.12599462270736694, + "learning_rate": 9.0870311053087e-05, + "loss": 1.581633949279785, + "step": 90930 + }, + { + "epoch": 0.2752712908449745, + "grad_norm": 0.11432195454835892, + "learning_rate": 9.086651587144961e-05, + "loss": 1.5643491744995117, + "step": 90940 + }, + { + "epoch": 0.27530156039532033, + "grad_norm": 0.11771851778030396, + "learning_rate": 9.086272068981222e-05, + "loss": 1.5698136329650878, + "step": 90950 + }, + { + "epoch": 0.2753318299456662, + "grad_norm": 0.12325716018676758, + "learning_rate": 9.085892550817482e-05, + "loss": 1.5586935997009277, + "step": 90960 + }, + { + "epoch": 0.275362099496012, + "grad_norm": 0.10489717870950699, + "learning_rate": 9.085513032653743e-05, + "loss": 1.5642059326171875, + "step": 90970 + }, + { + "epoch": 0.27539236904635783, + "grad_norm": 0.12512315809726715, + "learning_rate": 9.085133514490005e-05, + "loss": 1.5661237716674805, + "step": 90980 + }, + { + "epoch": 0.27542263859670363, + "grad_norm": 0.11346804350614548, + "learning_rate": 9.084753996326264e-05, + "loss": 1.576469898223877, + "step": 90990 + }, + { + "epoch": 0.2754529081470495, + "grad_norm": 0.10979481786489487, + "learning_rate": 9.084374478162526e-05, + "loss": 1.5649063110351562, + "step": 91000 + }, + { + "epoch": 0.2754529081470495, + "eval_loss": 1.5570873022079468, + "eval_runtime": 28.3511, + "eval_samples_per_second": 17.636, + "eval_steps_per_second": 1.129, + "step": 91000 + }, + { + "epoch": 0.27548317769739533, + "grad_norm": 0.12297943234443665, + "learning_rate": 9.083994959998785e-05, + "loss": 1.5742538452148438, + "step": 91010 + }, + { + "epoch": 0.27551344724774113, + "grad_norm": 0.10770982503890991, + "learning_rate": 9.083615441835047e-05, + "loss": 1.6078102111816406, + "step": 91020 + }, + { + "epoch": 0.275543716798087, + "grad_norm": 0.1034313440322876, + "learning_rate": 9.083235923671306e-05, + "loss": 1.5873162269592285, + "step": 91030 + }, + { + "epoch": 0.2755739863484328, + "grad_norm": 0.118490070104599, + "learning_rate": 9.082856405507568e-05, + "loss": 1.5461341857910156, + "step": 91040 + }, + { + "epoch": 0.2756042558987786, + "grad_norm": 0.1256200075149536, + "learning_rate": 9.082476887343829e-05, + "loss": 1.5370828628540039, + "step": 91050 + }, + { + "epoch": 0.2756345254491245, + "grad_norm": 0.1252361685037613, + "learning_rate": 9.08209736918009e-05, + "loss": 1.5584442138671875, + "step": 91060 + }, + { + "epoch": 0.2756647949994703, + "grad_norm": 0.11608058959245682, + "learning_rate": 9.08171785101635e-05, + "loss": 1.5858020782470703, + "step": 91070 + }, + { + "epoch": 0.2756950645498161, + "grad_norm": 0.10550785809755325, + "learning_rate": 9.08133833285261e-05, + "loss": 1.553754997253418, + "step": 91080 + }, + { + "epoch": 0.2757253341001619, + "grad_norm": 0.10914664715528488, + "learning_rate": 9.080958814688871e-05, + "loss": 1.5611379623413086, + "step": 91090 + }, + { + "epoch": 0.2757556036505078, + "grad_norm": 0.10877911001443863, + "learning_rate": 9.080579296525132e-05, + "loss": 1.5803701400756835, + "step": 91100 + }, + { + "epoch": 0.2757858732008536, + "grad_norm": 0.1082410216331482, + "learning_rate": 9.080199778361392e-05, + "loss": 1.5840996742248534, + "step": 91110 + }, + { + "epoch": 0.2758161427511994, + "grad_norm": 0.10022152960300446, + "learning_rate": 9.079820260197653e-05, + "loss": 1.5380794525146484, + "step": 91120 + }, + { + "epoch": 0.2758464123015453, + "grad_norm": 0.10782385617494583, + "learning_rate": 9.079440742033914e-05, + "loss": 1.551468276977539, + "step": 91130 + }, + { + "epoch": 0.27587668185189107, + "grad_norm": 0.11607532948255539, + "learning_rate": 9.079061223870174e-05, + "loss": 1.5634166717529296, + "step": 91140 + }, + { + "epoch": 0.2759069514022369, + "grad_norm": 0.1219060868024826, + "learning_rate": 9.078681705706435e-05, + "loss": 1.5440215110778808, + "step": 91150 + }, + { + "epoch": 0.2759372209525828, + "grad_norm": 0.11634517461061478, + "learning_rate": 9.078302187542695e-05, + "loss": 1.5861095428466796, + "step": 91160 + }, + { + "epoch": 0.27596749050292857, + "grad_norm": 0.11485007405281067, + "learning_rate": 9.077922669378956e-05, + "loss": 1.5758806228637696, + "step": 91170 + }, + { + "epoch": 0.2759977600532744, + "grad_norm": 0.10606228560209274, + "learning_rate": 9.077543151215218e-05, + "loss": 1.5314804077148438, + "step": 91180 + }, + { + "epoch": 0.2760280296036202, + "grad_norm": 0.11417876929044724, + "learning_rate": 9.077163633051479e-05, + "loss": 1.5778624534606933, + "step": 91190 + }, + { + "epoch": 0.27605829915396607, + "grad_norm": 0.10501769185066223, + "learning_rate": 9.076784114887739e-05, + "loss": 1.5738154411315919, + "step": 91200 + }, + { + "epoch": 0.2760885687043119, + "grad_norm": 0.11943663656711578, + "learning_rate": 9.076404596724e-05, + "loss": 1.5298261642456055, + "step": 91210 + }, + { + "epoch": 0.2761188382546577, + "grad_norm": 0.11632504314184189, + "learning_rate": 9.07602507856026e-05, + "loss": 1.5649581909179688, + "step": 91220 + }, + { + "epoch": 0.27614910780500357, + "grad_norm": 0.12730953097343445, + "learning_rate": 9.075645560396521e-05, + "loss": 1.5634893417358398, + "step": 91230 + }, + { + "epoch": 0.27617937735534936, + "grad_norm": 0.11106252670288086, + "learning_rate": 9.075266042232782e-05, + "loss": 1.546077346801758, + "step": 91240 + }, + { + "epoch": 0.2762096469056952, + "grad_norm": 0.11545120179653168, + "learning_rate": 9.074886524069042e-05, + "loss": 1.5559723854064942, + "step": 91250 + }, + { + "epoch": 0.27623991645604107, + "grad_norm": 0.12307590246200562, + "learning_rate": 9.074507005905303e-05, + "loss": 1.5431302070617676, + "step": 91260 + }, + { + "epoch": 0.27627018600638686, + "grad_norm": 0.10681372135877609, + "learning_rate": 9.074127487741563e-05, + "loss": 1.565531063079834, + "step": 91270 + }, + { + "epoch": 0.2763004555567327, + "grad_norm": 0.11854330450296402, + "learning_rate": 9.073747969577824e-05, + "loss": 1.5853768348693849, + "step": 91280 + }, + { + "epoch": 0.2763307251070785, + "grad_norm": 0.11080741882324219, + "learning_rate": 9.073368451414085e-05, + "loss": 1.552032470703125, + "step": 91290 + }, + { + "epoch": 0.27636099465742436, + "grad_norm": 0.11527406424283981, + "learning_rate": 9.072988933250345e-05, + "loss": 1.5586281776428224, + "step": 91300 + }, + { + "epoch": 0.2763912642077702, + "grad_norm": 0.11506696790456772, + "learning_rate": 9.072609415086607e-05, + "loss": 1.5506907463073731, + "step": 91310 + }, + { + "epoch": 0.276421533758116, + "grad_norm": 0.11095710098743439, + "learning_rate": 9.072229896922866e-05, + "loss": 1.5845317840576172, + "step": 91320 + }, + { + "epoch": 0.27645180330846186, + "grad_norm": 0.1207103356719017, + "learning_rate": 9.071850378759128e-05, + "loss": 1.5788344383239745, + "step": 91330 + }, + { + "epoch": 0.27648207285880766, + "grad_norm": 0.12025460600852966, + "learning_rate": 9.071470860595387e-05, + "loss": 1.5458084106445313, + "step": 91340 + }, + { + "epoch": 0.2765123424091535, + "grad_norm": 0.11627772450447083, + "learning_rate": 9.07109134243165e-05, + "loss": 1.5703792572021484, + "step": 91350 + }, + { + "epoch": 0.27654261195949936, + "grad_norm": 0.13171637058258057, + "learning_rate": 9.070711824267909e-05, + "loss": 1.541575050354004, + "step": 91360 + }, + { + "epoch": 0.27657288150984516, + "grad_norm": 0.11026521027088165, + "learning_rate": 9.07033230610417e-05, + "loss": 1.5699377059936523, + "step": 91370 + }, + { + "epoch": 0.276603151060191, + "grad_norm": 0.11408422887325287, + "learning_rate": 9.06995278794043e-05, + "loss": 1.5817878723144532, + "step": 91380 + }, + { + "epoch": 0.2766334206105368, + "grad_norm": 0.12171852588653564, + "learning_rate": 9.069573269776692e-05, + "loss": 1.5202104568481445, + "step": 91390 + }, + { + "epoch": 0.27666369016088266, + "grad_norm": 0.11127177625894547, + "learning_rate": 9.069193751612952e-05, + "loss": 1.5851224899291991, + "step": 91400 + }, + { + "epoch": 0.2766939597112285, + "grad_norm": 0.11585696786642075, + "learning_rate": 9.068814233449213e-05, + "loss": 1.5353925704956055, + "step": 91410 + }, + { + "epoch": 0.2767242292615743, + "grad_norm": 0.11059228330850601, + "learning_rate": 9.068434715285475e-05, + "loss": 1.5485474586486816, + "step": 91420 + }, + { + "epoch": 0.27675449881192016, + "grad_norm": 0.11933843791484833, + "learning_rate": 9.068055197121734e-05, + "loss": 1.5504862785339355, + "step": 91430 + }, + { + "epoch": 0.27678476836226595, + "grad_norm": 0.12165217101573944, + "learning_rate": 9.067675678957996e-05, + "loss": 1.5776058197021485, + "step": 91440 + }, + { + "epoch": 0.2768150379126118, + "grad_norm": 0.12931270897388458, + "learning_rate": 9.067296160794255e-05, + "loss": 1.563525390625, + "step": 91450 + }, + { + "epoch": 0.27684530746295766, + "grad_norm": 0.10655446350574493, + "learning_rate": 9.066916642630517e-05, + "loss": 1.5461957931518555, + "step": 91460 + }, + { + "epoch": 0.27687557701330345, + "grad_norm": 0.11660005897283554, + "learning_rate": 9.066537124466777e-05, + "loss": 1.5530786514282227, + "step": 91470 + }, + { + "epoch": 0.2769058465636493, + "grad_norm": 0.11068116873502731, + "learning_rate": 9.066157606303039e-05, + "loss": 1.5508003234863281, + "step": 91480 + }, + { + "epoch": 0.2769361161139951, + "grad_norm": 0.11747833341360092, + "learning_rate": 9.065778088139298e-05, + "loss": 1.5359860420227052, + "step": 91490 + }, + { + "epoch": 0.27696638566434095, + "grad_norm": 0.10685940086841583, + "learning_rate": 9.06539856997556e-05, + "loss": 1.5894489288330078, + "step": 91500 + }, + { + "epoch": 0.27696638566434095, + "eval_loss": 1.568952202796936, + "eval_runtime": 28.0563, + "eval_samples_per_second": 17.821, + "eval_steps_per_second": 1.141, + "step": 91500 + }, + { + "epoch": 0.2769966552146868, + "grad_norm": 0.1062425747513771, + "learning_rate": 9.065019051811819e-05, + "loss": 1.5664637565612793, + "step": 91510 + }, + { + "epoch": 0.2770269247650326, + "grad_norm": 0.1126924604177475, + "learning_rate": 9.064639533648081e-05, + "loss": 1.593984031677246, + "step": 91520 + }, + { + "epoch": 0.27705719431537845, + "grad_norm": 0.11536899954080582, + "learning_rate": 9.06426001548434e-05, + "loss": 1.5458131790161134, + "step": 91530 + }, + { + "epoch": 0.27708746386572425, + "grad_norm": 0.13079716265201569, + "learning_rate": 9.063880497320602e-05, + "loss": 1.5617136001586913, + "step": 91540 + }, + { + "epoch": 0.2771177334160701, + "grad_norm": 0.12166614085435867, + "learning_rate": 9.063500979156863e-05, + "loss": 1.5524967193603516, + "step": 91550 + }, + { + "epoch": 0.27714800296641595, + "grad_norm": 0.11243431270122528, + "learning_rate": 9.063121460993123e-05, + "loss": 1.5223127365112306, + "step": 91560 + }, + { + "epoch": 0.27717827251676175, + "grad_norm": 0.10906737297773361, + "learning_rate": 9.062741942829384e-05, + "loss": 1.5831854820251465, + "step": 91570 + }, + { + "epoch": 0.2772085420671076, + "grad_norm": 0.11317479610443115, + "learning_rate": 9.062362424665644e-05, + "loss": 1.568679428100586, + "step": 91580 + }, + { + "epoch": 0.2772388116174534, + "grad_norm": 0.11388832330703735, + "learning_rate": 9.061982906501906e-05, + "loss": 1.5920973777770997, + "step": 91590 + }, + { + "epoch": 0.27726908116779925, + "grad_norm": 0.11003641039133072, + "learning_rate": 9.061603388338166e-05, + "loss": 1.5809951782226563, + "step": 91600 + }, + { + "epoch": 0.2772993507181451, + "grad_norm": 0.115690678358078, + "learning_rate": 9.061223870174428e-05, + "loss": 1.570246696472168, + "step": 91610 + }, + { + "epoch": 0.2773296202684909, + "grad_norm": 0.1380600482225418, + "learning_rate": 9.060844352010687e-05, + "loss": 1.5855722427368164, + "step": 91620 + }, + { + "epoch": 0.27735988981883675, + "grad_norm": 0.12068215012550354, + "learning_rate": 9.060464833846949e-05, + "loss": 1.5630668640136718, + "step": 91630 + }, + { + "epoch": 0.2773901593691826, + "grad_norm": 0.1274101734161377, + "learning_rate": 9.060085315683208e-05, + "loss": 1.4769489288330078, + "step": 91640 + }, + { + "epoch": 0.2774204289195284, + "grad_norm": 0.11980687081813812, + "learning_rate": 9.05970579751947e-05, + "loss": 1.5757299423217774, + "step": 91650 + }, + { + "epoch": 0.27745069846987425, + "grad_norm": 0.12162788212299347, + "learning_rate": 9.059326279355729e-05, + "loss": 1.5641208648681642, + "step": 91660 + }, + { + "epoch": 0.27748096802022004, + "grad_norm": 0.1214907169342041, + "learning_rate": 9.058946761191991e-05, + "loss": 1.5613039970397948, + "step": 91670 + }, + { + "epoch": 0.2775112375705659, + "grad_norm": 0.1148134395480156, + "learning_rate": 9.058567243028252e-05, + "loss": 1.5337690353393554, + "step": 91680 + }, + { + "epoch": 0.27754150712091175, + "grad_norm": 0.11205611377954483, + "learning_rate": 9.058187724864512e-05, + "loss": 1.587385368347168, + "step": 91690 + }, + { + "epoch": 0.27757177667125754, + "grad_norm": 0.10533574968576431, + "learning_rate": 9.057808206700773e-05, + "loss": 1.5613564491271972, + "step": 91700 + }, + { + "epoch": 0.2776020462216034, + "grad_norm": 0.11334989964962006, + "learning_rate": 9.057428688537034e-05, + "loss": 1.5571898460388183, + "step": 91710 + }, + { + "epoch": 0.2776323157719492, + "grad_norm": 0.12191444635391235, + "learning_rate": 9.057049170373294e-05, + "loss": 1.6065603256225587, + "step": 91720 + }, + { + "epoch": 0.27766258532229504, + "grad_norm": 0.10802702605724335, + "learning_rate": 9.056669652209555e-05, + "loss": 1.5749031066894532, + "step": 91730 + }, + { + "epoch": 0.2776928548726409, + "grad_norm": 0.12014930695295334, + "learning_rate": 9.056290134045815e-05, + "loss": 1.5952497482299806, + "step": 91740 + }, + { + "epoch": 0.2777231244229867, + "grad_norm": 0.11205415427684784, + "learning_rate": 9.055910615882076e-05, + "loss": 1.5482370376586914, + "step": 91750 + }, + { + "epoch": 0.27775339397333254, + "grad_norm": 0.11125604808330536, + "learning_rate": 9.055531097718337e-05, + "loss": 1.5512680053710937, + "step": 91760 + }, + { + "epoch": 0.27778366352367834, + "grad_norm": 0.16013255715370178, + "learning_rate": 9.055151579554597e-05, + "loss": 1.4971229553222656, + "step": 91770 + }, + { + "epoch": 0.2778139330740242, + "grad_norm": 0.11387497931718826, + "learning_rate": 9.054772061390858e-05, + "loss": 1.5854221343994142, + "step": 91780 + }, + { + "epoch": 0.27784420262437004, + "grad_norm": 0.11719285696744919, + "learning_rate": 9.05439254322712e-05, + "loss": 1.568058681488037, + "step": 91790 + }, + { + "epoch": 0.27787447217471584, + "grad_norm": 0.10916857421398163, + "learning_rate": 9.05401302506338e-05, + "loss": 1.5311697006225586, + "step": 91800 + }, + { + "epoch": 0.2779047417250617, + "grad_norm": 0.12705639004707336, + "learning_rate": 9.053633506899641e-05, + "loss": 1.567860221862793, + "step": 91810 + }, + { + "epoch": 0.2779350112754075, + "grad_norm": 0.10818195343017578, + "learning_rate": 9.053253988735901e-05, + "loss": 1.6081531524658204, + "step": 91820 + }, + { + "epoch": 0.27796528082575334, + "grad_norm": 0.10715688019990921, + "learning_rate": 9.052874470572162e-05, + "loss": 1.568503189086914, + "step": 91830 + }, + { + "epoch": 0.2779955503760992, + "grad_norm": 0.10898037999868393, + "learning_rate": 9.052494952408423e-05, + "loss": 1.5914722442626954, + "step": 91840 + }, + { + "epoch": 0.278025819926445, + "grad_norm": 0.11260590702295303, + "learning_rate": 9.052115434244683e-05, + "loss": 1.5603765487670898, + "step": 91850 + }, + { + "epoch": 0.27805608947679084, + "grad_norm": 0.11645667999982834, + "learning_rate": 9.051735916080944e-05, + "loss": 1.533779239654541, + "step": 91860 + }, + { + "epoch": 0.27808635902713663, + "grad_norm": 0.12052568048238754, + "learning_rate": 9.051356397917204e-05, + "loss": 1.5775030136108399, + "step": 91870 + }, + { + "epoch": 0.2781166285774825, + "grad_norm": 0.1226455420255661, + "learning_rate": 9.050976879753465e-05, + "loss": 1.5537105560302735, + "step": 91880 + }, + { + "epoch": 0.27814689812782833, + "grad_norm": 0.11284186691045761, + "learning_rate": 9.050597361589726e-05, + "loss": 1.5473592758178711, + "step": 91890 + }, + { + "epoch": 0.27817716767817413, + "grad_norm": 0.12076862901449203, + "learning_rate": 9.050217843425986e-05, + "loss": 1.5548622131347656, + "step": 91900 + }, + { + "epoch": 0.27820743722852, + "grad_norm": 0.11666687577962875, + "learning_rate": 9.049838325262247e-05, + "loss": 1.548919105529785, + "step": 91910 + }, + { + "epoch": 0.2782377067788658, + "grad_norm": 0.1171005368232727, + "learning_rate": 9.049458807098509e-05, + "loss": 1.5374969482421874, + "step": 91920 + }, + { + "epoch": 0.27826797632921163, + "grad_norm": 0.10727597028017044, + "learning_rate": 9.049079288934768e-05, + "loss": 1.5332775115966797, + "step": 91930 + }, + { + "epoch": 0.2782982458795575, + "grad_norm": 0.11337687075138092, + "learning_rate": 9.04869977077103e-05, + "loss": 1.5850689888000489, + "step": 91940 + }, + { + "epoch": 0.2783285154299033, + "grad_norm": 0.1083337813615799, + "learning_rate": 9.048320252607289e-05, + "loss": 1.5852258682250977, + "step": 91950 + }, + { + "epoch": 0.27835878498024913, + "grad_norm": 0.12913350760936737, + "learning_rate": 9.047940734443551e-05, + "loss": 1.5709123611450195, + "step": 91960 + }, + { + "epoch": 0.2783890545305949, + "grad_norm": 0.1357630491256714, + "learning_rate": 9.04756121627981e-05, + "loss": 1.549871826171875, + "step": 91970 + }, + { + "epoch": 0.2784193240809408, + "grad_norm": 0.11563484370708466, + "learning_rate": 9.047181698116072e-05, + "loss": 1.5539981842041015, + "step": 91980 + }, + { + "epoch": 0.27844959363128663, + "grad_norm": 0.13410457968711853, + "learning_rate": 9.046802179952333e-05, + "loss": 1.5405695915222168, + "step": 91990 + }, + { + "epoch": 0.2784798631816324, + "grad_norm": 0.11737526208162308, + "learning_rate": 9.046422661788594e-05, + "loss": 1.5584548950195312, + "step": 92000 + }, + { + "epoch": 0.2784798631816324, + "eval_loss": 1.559443473815918, + "eval_runtime": 28.2076, + "eval_samples_per_second": 17.726, + "eval_steps_per_second": 1.134, + "step": 92000 + }, + { + "epoch": 0.2785101327319783, + "grad_norm": 0.11099015176296234, + "learning_rate": 9.046043143624854e-05, + "loss": 1.6008459091186524, + "step": 92010 + }, + { + "epoch": 0.2785404022823241, + "grad_norm": 0.1216721311211586, + "learning_rate": 9.045663625461115e-05, + "loss": 1.545602798461914, + "step": 92020 + }, + { + "epoch": 0.2785706718326699, + "grad_norm": 0.11155397444963455, + "learning_rate": 9.045284107297377e-05, + "loss": 1.549964427947998, + "step": 92030 + }, + { + "epoch": 0.2786009413830158, + "grad_norm": 0.1234540268778801, + "learning_rate": 9.044904589133636e-05, + "loss": 1.5508578300476075, + "step": 92040 + }, + { + "epoch": 0.2786312109333616, + "grad_norm": 0.1286812275648117, + "learning_rate": 9.044525070969898e-05, + "loss": 1.5456201553344726, + "step": 92050 + }, + { + "epoch": 0.2786614804837074, + "grad_norm": 0.12467534840106964, + "learning_rate": 9.044145552806157e-05, + "loss": 1.5733848571777345, + "step": 92060 + }, + { + "epoch": 0.2786917500340532, + "grad_norm": 0.1208312138915062, + "learning_rate": 9.043766034642419e-05, + "loss": 1.5718759536743163, + "step": 92070 + }, + { + "epoch": 0.27872201958439907, + "grad_norm": 0.108265221118927, + "learning_rate": 9.043386516478678e-05, + "loss": 1.558885097503662, + "step": 92080 + }, + { + "epoch": 0.2787522891347449, + "grad_norm": 0.10546872019767761, + "learning_rate": 9.04300699831494e-05, + "loss": 1.5886817932128907, + "step": 92090 + }, + { + "epoch": 0.2787825586850907, + "grad_norm": 0.11626046895980835, + "learning_rate": 9.0426274801512e-05, + "loss": 1.5566972732543944, + "step": 92100 + }, + { + "epoch": 0.27881282823543657, + "grad_norm": 0.12320650368928909, + "learning_rate": 9.042247961987461e-05, + "loss": 1.568409538269043, + "step": 92110 + }, + { + "epoch": 0.27884309778578237, + "grad_norm": 0.10999110341072083, + "learning_rate": 9.04186844382372e-05, + "loss": 1.5700878143310546, + "step": 92120 + }, + { + "epoch": 0.2788733673361282, + "grad_norm": 0.12660972774028778, + "learning_rate": 9.041488925659983e-05, + "loss": 1.574723243713379, + "step": 92130 + }, + { + "epoch": 0.27890363688647407, + "grad_norm": 0.12588003277778625, + "learning_rate": 9.041109407496242e-05, + "loss": 1.5309256553649901, + "step": 92140 + }, + { + "epoch": 0.27893390643681987, + "grad_norm": 0.11875010281801224, + "learning_rate": 9.040729889332504e-05, + "loss": 1.5921974182128906, + "step": 92150 + }, + { + "epoch": 0.2789641759871657, + "grad_norm": 0.11145200580358505, + "learning_rate": 9.040350371168764e-05, + "loss": 1.5658635139465331, + "step": 92160 + }, + { + "epoch": 0.2789944455375115, + "grad_norm": 0.12698377668857574, + "learning_rate": 9.039970853005025e-05, + "loss": 1.5768394470214844, + "step": 92170 + }, + { + "epoch": 0.27902471508785737, + "grad_norm": 0.11994685232639313, + "learning_rate": 9.039591334841286e-05, + "loss": 1.5543004989624023, + "step": 92180 + }, + { + "epoch": 0.2790549846382032, + "grad_norm": 0.10349985957145691, + "learning_rate": 9.039211816677546e-05, + "loss": 1.5335856437683106, + "step": 92190 + }, + { + "epoch": 0.279085254188549, + "grad_norm": 0.11152709275484085, + "learning_rate": 9.038832298513808e-05, + "loss": 1.548881721496582, + "step": 92200 + }, + { + "epoch": 0.27911552373889487, + "grad_norm": 0.10416345298290253, + "learning_rate": 9.038452780350067e-05, + "loss": 1.579062271118164, + "step": 92210 + }, + { + "epoch": 0.27914579328924066, + "grad_norm": 0.11980754882097244, + "learning_rate": 9.03807326218633e-05, + "loss": 1.5849371910095216, + "step": 92220 + }, + { + "epoch": 0.2791760628395865, + "grad_norm": 0.09708055853843689, + "learning_rate": 9.037693744022589e-05, + "loss": 1.5551651954650878, + "step": 92230 + }, + { + "epoch": 0.27920633238993237, + "grad_norm": 0.10994083434343338, + "learning_rate": 9.03731422585885e-05, + "loss": 1.6013521194458007, + "step": 92240 + }, + { + "epoch": 0.27923660194027816, + "grad_norm": 0.1190245971083641, + "learning_rate": 9.03693470769511e-05, + "loss": 1.5381874084472655, + "step": 92250 + }, + { + "epoch": 0.279266871490624, + "grad_norm": 0.10939020663499832, + "learning_rate": 9.036555189531372e-05, + "loss": 1.5364258766174317, + "step": 92260 + }, + { + "epoch": 0.2792971410409698, + "grad_norm": 0.112730011343956, + "learning_rate": 9.036175671367631e-05, + "loss": 1.5306032180786133, + "step": 92270 + }, + { + "epoch": 0.27932741059131566, + "grad_norm": 0.11416391283273697, + "learning_rate": 9.035796153203893e-05, + "loss": 1.5413703918457031, + "step": 92280 + }, + { + "epoch": 0.2793576801416615, + "grad_norm": 0.11685411632061005, + "learning_rate": 9.035416635040153e-05, + "loss": 1.5840445518493653, + "step": 92290 + }, + { + "epoch": 0.2793879496920073, + "grad_norm": 0.10966683179140091, + "learning_rate": 9.035037116876414e-05, + "loss": 1.574226188659668, + "step": 92300 + }, + { + "epoch": 0.27941821924235316, + "grad_norm": 0.11331713199615479, + "learning_rate": 9.034657598712675e-05, + "loss": 1.5847793579101563, + "step": 92310 + }, + { + "epoch": 0.27944848879269896, + "grad_norm": 0.11415168642997742, + "learning_rate": 9.034278080548935e-05, + "loss": 1.5493080139160156, + "step": 92320 + }, + { + "epoch": 0.2794787583430448, + "grad_norm": 0.11803882569074631, + "learning_rate": 9.033898562385196e-05, + "loss": 1.5429010391235352, + "step": 92330 + }, + { + "epoch": 0.27950902789339066, + "grad_norm": 0.10772064328193665, + "learning_rate": 9.033519044221456e-05, + "loss": 1.5635210037231446, + "step": 92340 + }, + { + "epoch": 0.27953929744373646, + "grad_norm": 0.11631079763174057, + "learning_rate": 9.033139526057717e-05, + "loss": 1.548775863647461, + "step": 92350 + }, + { + "epoch": 0.2795695669940823, + "grad_norm": 0.13035129010677338, + "learning_rate": 9.032760007893978e-05, + "loss": 1.5559148788452148, + "step": 92360 + }, + { + "epoch": 0.27959983654442816, + "grad_norm": 0.11166168004274368, + "learning_rate": 9.032380489730238e-05, + "loss": 1.538093376159668, + "step": 92370 + }, + { + "epoch": 0.27963010609477396, + "grad_norm": 0.11234923452138901, + "learning_rate": 9.032000971566499e-05, + "loss": 1.6013874053955077, + "step": 92380 + }, + { + "epoch": 0.2796603756451198, + "grad_norm": 0.13406801223754883, + "learning_rate": 9.03162145340276e-05, + "loss": 1.5946159362792969, + "step": 92390 + }, + { + "epoch": 0.2796906451954656, + "grad_norm": 0.11916912347078323, + "learning_rate": 9.03124193523902e-05, + "loss": 1.5546328544616699, + "step": 92400 + }, + { + "epoch": 0.27972091474581146, + "grad_norm": 0.11021662503480911, + "learning_rate": 9.030862417075282e-05, + "loss": 1.5286701202392579, + "step": 92410 + }, + { + "epoch": 0.2797511842961573, + "grad_norm": 0.11407319456338882, + "learning_rate": 9.030482898911543e-05, + "loss": 1.5576455116271972, + "step": 92420 + }, + { + "epoch": 0.2797814538465031, + "grad_norm": 0.10857848078012466, + "learning_rate": 9.030103380747803e-05, + "loss": 1.5744367599487306, + "step": 92430 + }, + { + "epoch": 0.27981172339684895, + "grad_norm": 0.12749157845973969, + "learning_rate": 9.029723862584064e-05, + "loss": 1.5434982299804687, + "step": 92440 + }, + { + "epoch": 0.27984199294719475, + "grad_norm": 0.12064921855926514, + "learning_rate": 9.029344344420324e-05, + "loss": 1.5733131408691405, + "step": 92450 + }, + { + "epoch": 0.2798722624975406, + "grad_norm": 0.10997142642736435, + "learning_rate": 9.028964826256585e-05, + "loss": 1.574667263031006, + "step": 92460 + }, + { + "epoch": 0.27990253204788645, + "grad_norm": 0.11195340752601624, + "learning_rate": 9.028585308092846e-05, + "loss": 1.5468069076538087, + "step": 92470 + }, + { + "epoch": 0.27993280159823225, + "grad_norm": 0.12949581444263458, + "learning_rate": 9.028205789929106e-05, + "loss": 1.5618705749511719, + "step": 92480 + }, + { + "epoch": 0.2799630711485781, + "grad_norm": 0.11414575576782227, + "learning_rate": 9.027826271765367e-05, + "loss": 1.572978401184082, + "step": 92490 + }, + { + "epoch": 0.2799933406989239, + "grad_norm": 0.11524246633052826, + "learning_rate": 9.027446753601627e-05, + "loss": 1.5546030044555663, + "step": 92500 + }, + { + "epoch": 0.2799933406989239, + "eval_loss": 1.563720703125, + "eval_runtime": 28.2115, + "eval_samples_per_second": 17.723, + "eval_steps_per_second": 1.134, + "step": 92500 + }, + { + "epoch": 0.28002361024926975, + "grad_norm": 0.11386016011238098, + "learning_rate": 9.027067235437888e-05, + "loss": 1.5856770515441894, + "step": 92510 + }, + { + "epoch": 0.2800538797996156, + "grad_norm": 0.11295995861291885, + "learning_rate": 9.026687717274149e-05, + "loss": 1.5765702247619628, + "step": 92520 + }, + { + "epoch": 0.2800841493499614, + "grad_norm": 0.12836188077926636, + "learning_rate": 9.02630819911041e-05, + "loss": 1.5391496658325194, + "step": 92530 + }, + { + "epoch": 0.28011441890030725, + "grad_norm": 0.12825573980808258, + "learning_rate": 9.02592868094667e-05, + "loss": 1.5602885246276856, + "step": 92540 + }, + { + "epoch": 0.28014468845065305, + "grad_norm": 0.11314728856086731, + "learning_rate": 9.025549162782932e-05, + "loss": 1.5571741104125976, + "step": 92550 + }, + { + "epoch": 0.2801749580009989, + "grad_norm": 0.11676222831010818, + "learning_rate": 9.025169644619191e-05, + "loss": 1.5381489753723145, + "step": 92560 + }, + { + "epoch": 0.28020522755134475, + "grad_norm": 0.10062149912118912, + "learning_rate": 9.024790126455453e-05, + "loss": 1.557838249206543, + "step": 92570 + }, + { + "epoch": 0.28023549710169054, + "grad_norm": 0.11525988578796387, + "learning_rate": 9.024410608291712e-05, + "loss": 1.5600916862487793, + "step": 92580 + }, + { + "epoch": 0.2802657666520364, + "grad_norm": 0.11090093106031418, + "learning_rate": 9.024031090127974e-05, + "loss": 1.5660482406616212, + "step": 92590 + }, + { + "epoch": 0.2802960362023822, + "grad_norm": 0.10279791802167892, + "learning_rate": 9.023651571964235e-05, + "loss": 1.573683738708496, + "step": 92600 + }, + { + "epoch": 0.28032630575272804, + "grad_norm": 0.1168646439909935, + "learning_rate": 9.023272053800495e-05, + "loss": 1.6003715515136718, + "step": 92610 + }, + { + "epoch": 0.2803565753030739, + "grad_norm": 0.11155708879232407, + "learning_rate": 9.022892535636756e-05, + "loss": 1.5708436965942383, + "step": 92620 + }, + { + "epoch": 0.2803868448534197, + "grad_norm": 0.11299148201942444, + "learning_rate": 9.022513017473016e-05, + "loss": 1.5991580009460449, + "step": 92630 + }, + { + "epoch": 0.28041711440376554, + "grad_norm": 0.10116470605134964, + "learning_rate": 9.022133499309277e-05, + "loss": 1.591615867614746, + "step": 92640 + }, + { + "epoch": 0.28044738395411134, + "grad_norm": 0.11194074153900146, + "learning_rate": 9.021753981145538e-05, + "loss": 1.5562400817871094, + "step": 92650 + }, + { + "epoch": 0.2804776535044572, + "grad_norm": 0.11654510349035263, + "learning_rate": 9.0213744629818e-05, + "loss": 1.5990306854248046, + "step": 92660 + }, + { + "epoch": 0.28050792305480304, + "grad_norm": 0.12300316244363785, + "learning_rate": 9.020994944818059e-05, + "loss": 1.5315656661987305, + "step": 92670 + }, + { + "epoch": 0.28053819260514884, + "grad_norm": 0.11181043088436127, + "learning_rate": 9.020615426654321e-05, + "loss": 1.5189018249511719, + "step": 92680 + }, + { + "epoch": 0.2805684621554947, + "grad_norm": 0.11050254851579666, + "learning_rate": 9.02023590849058e-05, + "loss": 1.554217529296875, + "step": 92690 + }, + { + "epoch": 0.2805987317058405, + "grad_norm": 0.11154475808143616, + "learning_rate": 9.019856390326842e-05, + "loss": 1.5748605728149414, + "step": 92700 + }, + { + "epoch": 0.28062900125618634, + "grad_norm": 0.11359675228595734, + "learning_rate": 9.019476872163101e-05, + "loss": 1.6229774475097656, + "step": 92710 + }, + { + "epoch": 0.2806592708065322, + "grad_norm": 0.11797523498535156, + "learning_rate": 9.019097353999363e-05, + "loss": 1.602100944519043, + "step": 92720 + }, + { + "epoch": 0.280689540356878, + "grad_norm": 0.10379286855459213, + "learning_rate": 9.018717835835622e-05, + "loss": 1.5540621757507325, + "step": 92730 + }, + { + "epoch": 0.28071980990722384, + "grad_norm": 0.11328928172588348, + "learning_rate": 9.018338317671884e-05, + "loss": 1.5582967758178712, + "step": 92740 + }, + { + "epoch": 0.28075007945756963, + "grad_norm": 0.10094331204891205, + "learning_rate": 9.017958799508144e-05, + "loss": 1.568570899963379, + "step": 92750 + }, + { + "epoch": 0.2807803490079155, + "grad_norm": 0.11652538180351257, + "learning_rate": 9.017579281344406e-05, + "loss": 1.5672744750976562, + "step": 92760 + }, + { + "epoch": 0.28081061855826134, + "grad_norm": 0.11408058553934097, + "learning_rate": 9.017199763180665e-05, + "loss": 1.5857388496398925, + "step": 92770 + }, + { + "epoch": 0.28084088810860713, + "grad_norm": 0.12105907499790192, + "learning_rate": 9.016820245016927e-05, + "loss": 1.5349738121032714, + "step": 92780 + }, + { + "epoch": 0.280871157658953, + "grad_norm": 0.12083330750465393, + "learning_rate": 9.016440726853187e-05, + "loss": 1.570594882965088, + "step": 92790 + }, + { + "epoch": 0.2809014272092988, + "grad_norm": 0.12400919944047928, + "learning_rate": 9.016061208689448e-05, + "loss": 1.5749298095703126, + "step": 92800 + }, + { + "epoch": 0.28093169675964463, + "grad_norm": 0.10602204501628876, + "learning_rate": 9.01568169052571e-05, + "loss": 1.5814221382141114, + "step": 92810 + }, + { + "epoch": 0.2809619663099905, + "grad_norm": 0.11825542151927948, + "learning_rate": 9.015302172361969e-05, + "loss": 1.5397710800170898, + "step": 92820 + }, + { + "epoch": 0.2809922358603363, + "grad_norm": 0.12129812687635422, + "learning_rate": 9.014922654198231e-05, + "loss": 1.5813959121704102, + "step": 92830 + }, + { + "epoch": 0.28102250541068213, + "grad_norm": 0.13231144845485687, + "learning_rate": 9.01454313603449e-05, + "loss": 1.5291902542114257, + "step": 92840 + }, + { + "epoch": 0.28105277496102793, + "grad_norm": 0.1179145947098732, + "learning_rate": 9.014163617870752e-05, + "loss": 1.5530993461608886, + "step": 92850 + }, + { + "epoch": 0.2810830445113738, + "grad_norm": 0.11389631778001785, + "learning_rate": 9.013784099707011e-05, + "loss": 1.5677623748779297, + "step": 92860 + }, + { + "epoch": 0.28111331406171963, + "grad_norm": 0.10928075760602951, + "learning_rate": 9.013404581543273e-05, + "loss": 1.5395700454711914, + "step": 92870 + }, + { + "epoch": 0.28114358361206543, + "grad_norm": 0.12498226016759872, + "learning_rate": 9.013025063379533e-05, + "loss": 1.5513517379760742, + "step": 92880 + }, + { + "epoch": 0.2811738531624113, + "grad_norm": 0.12987761199474335, + "learning_rate": 9.012645545215795e-05, + "loss": 1.5464141845703125, + "step": 92890 + }, + { + "epoch": 0.2812041227127571, + "grad_norm": 0.11602319777011871, + "learning_rate": 9.012266027052055e-05, + "loss": 1.571260643005371, + "step": 92900 + }, + { + "epoch": 0.28123439226310293, + "grad_norm": 0.11268503963947296, + "learning_rate": 9.011886508888316e-05, + "loss": 1.6017396926879883, + "step": 92910 + }, + { + "epoch": 0.2812646618134488, + "grad_norm": 0.12887023389339447, + "learning_rate": 9.011506990724576e-05, + "loss": 1.5687295913696289, + "step": 92920 + }, + { + "epoch": 0.2812949313637946, + "grad_norm": 0.11786792427301407, + "learning_rate": 9.011127472560837e-05, + "loss": 1.576445960998535, + "step": 92930 + }, + { + "epoch": 0.2813252009141404, + "grad_norm": 0.10376506298780441, + "learning_rate": 9.010747954397098e-05, + "loss": 1.5271877288818358, + "step": 92940 + }, + { + "epoch": 0.2813554704644862, + "grad_norm": 0.11025058478116989, + "learning_rate": 9.010368436233358e-05, + "loss": 1.5306246757507325, + "step": 92950 + }, + { + "epoch": 0.2813857400148321, + "grad_norm": 0.10769058018922806, + "learning_rate": 9.009988918069619e-05, + "loss": 1.5784686088562012, + "step": 92960 + }, + { + "epoch": 0.2814160095651779, + "grad_norm": 0.11640569567680359, + "learning_rate": 9.00960939990588e-05, + "loss": 1.5915958404541015, + "step": 92970 + }, + { + "epoch": 0.2814462791155237, + "grad_norm": 0.11785225570201874, + "learning_rate": 9.00922988174214e-05, + "loss": 1.5292706489562988, + "step": 92980 + }, + { + "epoch": 0.2814765486658696, + "grad_norm": 0.13155260682106018, + "learning_rate": 9.0088503635784e-05, + "loss": 1.5771157264709472, + "step": 92990 + }, + { + "epoch": 0.28150681821621537, + "grad_norm": 0.1216631531715393, + "learning_rate": 9.008470845414661e-05, + "loss": 1.5534555435180664, + "step": 93000 + }, + { + "epoch": 0.28150681821621537, + "eval_loss": 1.565105676651001, + "eval_runtime": 28.161, + "eval_samples_per_second": 17.755, + "eval_steps_per_second": 1.136, + "step": 93000 + }, + { + "epoch": 0.2815370877665612, + "grad_norm": 0.11454209685325623, + "learning_rate": 9.008091327250922e-05, + "loss": 1.5770230293273926, + "step": 93010 + }, + { + "epoch": 0.2815673573169071, + "grad_norm": 0.10994367301464081, + "learning_rate": 9.007711809087184e-05, + "loss": 1.5758262634277345, + "step": 93020 + }, + { + "epoch": 0.28159762686725287, + "grad_norm": 0.1056985929608345, + "learning_rate": 9.007332290923444e-05, + "loss": 1.598362350463867, + "step": 93030 + }, + { + "epoch": 0.2816278964175987, + "grad_norm": 0.11984127014875412, + "learning_rate": 9.006952772759705e-05, + "loss": 1.6026674270629884, + "step": 93040 + }, + { + "epoch": 0.2816581659679446, + "grad_norm": 0.11367825418710709, + "learning_rate": 9.006573254595965e-05, + "loss": 1.5548176765441895, + "step": 93050 + }, + { + "epoch": 0.28168843551829037, + "grad_norm": 0.11407230794429779, + "learning_rate": 9.006193736432226e-05, + "loss": 1.5542654037475585, + "step": 93060 + }, + { + "epoch": 0.2817187050686362, + "grad_norm": 0.1343812793493271, + "learning_rate": 9.005814218268487e-05, + "loss": 1.5627161979675293, + "step": 93070 + }, + { + "epoch": 0.281748974618982, + "grad_norm": 0.1309886872768402, + "learning_rate": 9.005434700104747e-05, + "loss": 1.5655447959899902, + "step": 93080 + }, + { + "epoch": 0.28177924416932787, + "grad_norm": 0.11872150003910065, + "learning_rate": 9.005055181941008e-05, + "loss": 1.5716493606567383, + "step": 93090 + }, + { + "epoch": 0.2818095137196737, + "grad_norm": 0.10601238161325455, + "learning_rate": 9.004675663777268e-05, + "loss": 1.5840225219726562, + "step": 93100 + }, + { + "epoch": 0.2818397832700195, + "grad_norm": 0.12344782054424286, + "learning_rate": 9.004296145613529e-05, + "loss": 1.5775633811950684, + "step": 93110 + }, + { + "epoch": 0.28187005282036537, + "grad_norm": 0.1251930296421051, + "learning_rate": 9.00391662744979e-05, + "loss": 1.5619080543518067, + "step": 93120 + }, + { + "epoch": 0.28190032237071116, + "grad_norm": 0.12597383558750153, + "learning_rate": 9.00353710928605e-05, + "loss": 1.5794825553894043, + "step": 93130 + }, + { + "epoch": 0.281930591921057, + "grad_norm": 0.1137106642127037, + "learning_rate": 9.003157591122312e-05, + "loss": 1.5432847023010254, + "step": 93140 + }, + { + "epoch": 0.28196086147140287, + "grad_norm": 0.12255299836397171, + "learning_rate": 9.002778072958571e-05, + "loss": 1.5406518936157227, + "step": 93150 + }, + { + "epoch": 0.28199113102174866, + "grad_norm": 0.10198904573917389, + "learning_rate": 9.002398554794833e-05, + "loss": 1.5310952186584472, + "step": 93160 + }, + { + "epoch": 0.2820214005720945, + "grad_norm": 0.132193461060524, + "learning_rate": 9.002019036631093e-05, + "loss": 1.5904870986938477, + "step": 93170 + }, + { + "epoch": 0.2820516701224403, + "grad_norm": 0.11934597045183182, + "learning_rate": 9.001639518467355e-05, + "loss": 1.5507091522216796, + "step": 93180 + }, + { + "epoch": 0.28208193967278616, + "grad_norm": 0.11732073128223419, + "learning_rate": 9.001260000303614e-05, + "loss": 1.5961296081542968, + "step": 93190 + }, + { + "epoch": 0.282112209223132, + "grad_norm": 0.11673116683959961, + "learning_rate": 9.000880482139876e-05, + "loss": 1.5345448493957519, + "step": 93200 + }, + { + "epoch": 0.2821424787734778, + "grad_norm": 0.12178222835063934, + "learning_rate": 9.000500963976136e-05, + "loss": 1.571316909790039, + "step": 93210 + }, + { + "epoch": 0.28217274832382366, + "grad_norm": 0.11997239291667938, + "learning_rate": 9.000121445812397e-05, + "loss": 1.5322596549987793, + "step": 93220 + }, + { + "epoch": 0.28220301787416946, + "grad_norm": 0.10856577754020691, + "learning_rate": 8.999741927648658e-05, + "loss": 1.548598575592041, + "step": 93230 + }, + { + "epoch": 0.2822332874245153, + "grad_norm": 0.11751777678728104, + "learning_rate": 8.999362409484918e-05, + "loss": 1.578267765045166, + "step": 93240 + }, + { + "epoch": 0.28226355697486116, + "grad_norm": 0.13840819895267487, + "learning_rate": 8.998982891321179e-05, + "loss": 1.5185956001281737, + "step": 93250 + }, + { + "epoch": 0.28229382652520696, + "grad_norm": 0.11428353935480118, + "learning_rate": 8.99860337315744e-05, + "loss": 1.5761228561401368, + "step": 93260 + }, + { + "epoch": 0.2823240960755528, + "grad_norm": 0.11474990099668503, + "learning_rate": 8.998223854993701e-05, + "loss": 1.5531546592712402, + "step": 93270 + }, + { + "epoch": 0.2823543656258986, + "grad_norm": 0.11126451939344406, + "learning_rate": 8.99784433682996e-05, + "loss": 1.58009033203125, + "step": 93280 + }, + { + "epoch": 0.28238463517624446, + "grad_norm": 0.11427327990531921, + "learning_rate": 8.997464818666222e-05, + "loss": 1.5600614547729492, + "step": 93290 + }, + { + "epoch": 0.2824149047265903, + "grad_norm": 0.12039261311292648, + "learning_rate": 8.997085300502482e-05, + "loss": 1.586849308013916, + "step": 93300 + }, + { + "epoch": 0.2824451742769361, + "grad_norm": 0.1268557906150818, + "learning_rate": 8.996705782338744e-05, + "loss": 1.5491134643554687, + "step": 93310 + }, + { + "epoch": 0.28247544382728196, + "grad_norm": 0.1055518314242363, + "learning_rate": 8.996326264175003e-05, + "loss": 1.5722820281982421, + "step": 93320 + }, + { + "epoch": 0.28250571337762775, + "grad_norm": 0.10904969274997711, + "learning_rate": 8.995946746011265e-05, + "loss": 1.5362722396850585, + "step": 93330 + }, + { + "epoch": 0.2825359829279736, + "grad_norm": 0.11193792521953583, + "learning_rate": 8.995567227847524e-05, + "loss": 1.54158935546875, + "step": 93340 + }, + { + "epoch": 0.28256625247831946, + "grad_norm": 0.1053880974650383, + "learning_rate": 8.995187709683786e-05, + "loss": 1.5546157836914063, + "step": 93350 + }, + { + "epoch": 0.28259652202866525, + "grad_norm": 0.10204582661390305, + "learning_rate": 8.994808191520045e-05, + "loss": 1.5567389488220216, + "step": 93360 + }, + { + "epoch": 0.2826267915790111, + "grad_norm": 0.10937540978193283, + "learning_rate": 8.994428673356307e-05, + "loss": 1.5707283020019531, + "step": 93370 + }, + { + "epoch": 0.2826570611293569, + "grad_norm": 0.11189447343349457, + "learning_rate": 8.994049155192566e-05, + "loss": 1.585586166381836, + "step": 93380 + }, + { + "epoch": 0.28268733067970275, + "grad_norm": 0.1113288626074791, + "learning_rate": 8.993669637028828e-05, + "loss": 1.5361336708068847, + "step": 93390 + }, + { + "epoch": 0.2827176002300486, + "grad_norm": 0.11333336681127548, + "learning_rate": 8.993290118865089e-05, + "loss": 1.5605119705200194, + "step": 93400 + }, + { + "epoch": 0.2827478697803944, + "grad_norm": 0.10823003947734833, + "learning_rate": 8.99291060070135e-05, + "loss": 1.5850183486938476, + "step": 93410 + }, + { + "epoch": 0.28277813933074025, + "grad_norm": 0.11347084492444992, + "learning_rate": 8.992531082537612e-05, + "loss": 1.5309110641479493, + "step": 93420 + }, + { + "epoch": 0.28280840888108605, + "grad_norm": 0.11880100518465042, + "learning_rate": 8.992151564373871e-05, + "loss": 1.5669525146484375, + "step": 93430 + }, + { + "epoch": 0.2828386784314319, + "grad_norm": 0.11173702776432037, + "learning_rate": 8.991772046210133e-05, + "loss": 1.6118249893188477, + "step": 93440 + }, + { + "epoch": 0.28286894798177775, + "grad_norm": 0.11915473639965057, + "learning_rate": 8.991392528046392e-05, + "loss": 1.5756110191345214, + "step": 93450 + }, + { + "epoch": 0.28289921753212355, + "grad_norm": 0.10930074006319046, + "learning_rate": 8.991013009882654e-05, + "loss": 1.5676427841186524, + "step": 93460 + }, + { + "epoch": 0.2829294870824694, + "grad_norm": 0.1078595295548439, + "learning_rate": 8.990633491718913e-05, + "loss": 1.5781411170959472, + "step": 93470 + }, + { + "epoch": 0.2829597566328152, + "grad_norm": 0.11079905927181244, + "learning_rate": 8.990253973555175e-05, + "loss": 1.5974774360656738, + "step": 93480 + }, + { + "epoch": 0.28299002618316105, + "grad_norm": 0.11747555434703827, + "learning_rate": 8.989874455391434e-05, + "loss": 1.5874284744262694, + "step": 93490 + }, + { + "epoch": 0.2830202957335069, + "grad_norm": 0.1178327426314354, + "learning_rate": 8.989494937227696e-05, + "loss": 1.5357979774475097, + "step": 93500 + }, + { + "epoch": 0.2830202957335069, + "eval_loss": 1.5588630437850952, + "eval_runtime": 28.1316, + "eval_samples_per_second": 17.774, + "eval_steps_per_second": 1.138, + "step": 93500 + }, + { + "epoch": 0.2830505652838527, + "grad_norm": 0.1168503537774086, + "learning_rate": 8.989115419063956e-05, + "loss": 1.5921213150024414, + "step": 93510 + }, + { + "epoch": 0.28308083483419855, + "grad_norm": 0.11906565725803375, + "learning_rate": 8.988735900900218e-05, + "loss": 1.5378494262695312, + "step": 93520 + }, + { + "epoch": 0.28311110438454434, + "grad_norm": 0.11441738903522491, + "learning_rate": 8.988356382736478e-05, + "loss": 1.5536672592163085, + "step": 93530 + }, + { + "epoch": 0.2831413739348902, + "grad_norm": 0.10706277936697006, + "learning_rate": 8.987976864572739e-05, + "loss": 1.577772045135498, + "step": 93540 + }, + { + "epoch": 0.28317164348523605, + "grad_norm": 0.10567720234394073, + "learning_rate": 8.987597346408999e-05, + "loss": 1.5588411331176757, + "step": 93550 + }, + { + "epoch": 0.28320191303558184, + "grad_norm": 0.11882497370243073, + "learning_rate": 8.98721782824526e-05, + "loss": 1.562917423248291, + "step": 93560 + }, + { + "epoch": 0.2832321825859277, + "grad_norm": 0.10848355293273926, + "learning_rate": 8.98683831008152e-05, + "loss": 1.5636114120483398, + "step": 93570 + }, + { + "epoch": 0.2832624521362735, + "grad_norm": 0.1235756203532219, + "learning_rate": 8.986458791917781e-05, + "loss": 1.5813070297241212, + "step": 93580 + }, + { + "epoch": 0.28329272168661934, + "grad_norm": 0.11717162281274796, + "learning_rate": 8.986079273754042e-05, + "loss": 1.5584613800048828, + "step": 93590 + }, + { + "epoch": 0.2833229912369652, + "grad_norm": 0.1321980357170105, + "learning_rate": 8.985699755590302e-05, + "loss": 1.5362730026245117, + "step": 93600 + }, + { + "epoch": 0.283353260787311, + "grad_norm": 0.11106990277767181, + "learning_rate": 8.985320237426563e-05, + "loss": 1.6215946197509765, + "step": 93610 + }, + { + "epoch": 0.28338353033765684, + "grad_norm": 0.10769882798194885, + "learning_rate": 8.984940719262823e-05, + "loss": 1.5573296546936035, + "step": 93620 + }, + { + "epoch": 0.28341379988800264, + "grad_norm": 0.110355906188488, + "learning_rate": 8.984561201099085e-05, + "loss": 1.5846745491027832, + "step": 93630 + }, + { + "epoch": 0.2834440694383485, + "grad_norm": 0.11999969184398651, + "learning_rate": 8.984181682935346e-05, + "loss": 1.5582033157348634, + "step": 93640 + }, + { + "epoch": 0.28347433898869434, + "grad_norm": 0.10669851303100586, + "learning_rate": 8.983802164771607e-05, + "loss": 1.5695460319519043, + "step": 93650 + }, + { + "epoch": 0.28350460853904014, + "grad_norm": 0.09936852753162384, + "learning_rate": 8.983422646607867e-05, + "loss": 1.5468182563781738, + "step": 93660 + }, + { + "epoch": 0.283534878089386, + "grad_norm": 0.101594477891922, + "learning_rate": 8.983043128444128e-05, + "loss": 1.566805648803711, + "step": 93670 + }, + { + "epoch": 0.2835651476397318, + "grad_norm": 0.11173735558986664, + "learning_rate": 8.982663610280388e-05, + "loss": 1.5672746658325196, + "step": 93680 + }, + { + "epoch": 0.28359541719007764, + "grad_norm": 0.10218065977096558, + "learning_rate": 8.982284092116649e-05, + "loss": 1.5766939163208007, + "step": 93690 + }, + { + "epoch": 0.2836256867404235, + "grad_norm": 0.1116001158952713, + "learning_rate": 8.98190457395291e-05, + "loss": 1.5706631660461425, + "step": 93700 + }, + { + "epoch": 0.2836559562907693, + "grad_norm": 0.13051612675189972, + "learning_rate": 8.98152505578917e-05, + "loss": 1.5799163818359374, + "step": 93710 + }, + { + "epoch": 0.28368622584111514, + "grad_norm": 0.10274475067853928, + "learning_rate": 8.981145537625431e-05, + "loss": 1.5697503089904785, + "step": 93720 + }, + { + "epoch": 0.28371649539146093, + "grad_norm": 0.11772788316011429, + "learning_rate": 8.980766019461691e-05, + "loss": 1.5718363761901855, + "step": 93730 + }, + { + "epoch": 0.2837467649418068, + "grad_norm": 0.10524193197488785, + "learning_rate": 8.980386501297952e-05, + "loss": 1.548619556427002, + "step": 93740 + }, + { + "epoch": 0.28377703449215264, + "grad_norm": 0.11597980558872223, + "learning_rate": 8.980006983134213e-05, + "loss": 1.5393962860107422, + "step": 93750 + }, + { + "epoch": 0.28380730404249843, + "grad_norm": 0.11860247701406479, + "learning_rate": 8.979627464970473e-05, + "loss": 1.5669568061828614, + "step": 93760 + }, + { + "epoch": 0.2838375735928443, + "grad_norm": 0.11685862392187119, + "learning_rate": 8.979247946806735e-05, + "loss": 1.5617433547973634, + "step": 93770 + }, + { + "epoch": 0.28386784314319014, + "grad_norm": 0.13199679553508759, + "learning_rate": 8.978868428642994e-05, + "loss": 1.5195127487182618, + "step": 93780 + }, + { + "epoch": 0.28389811269353593, + "grad_norm": 0.1186547502875328, + "learning_rate": 8.978488910479256e-05, + "loss": 1.5426856994628906, + "step": 93790 + }, + { + "epoch": 0.2839283822438818, + "grad_norm": 0.10652098804712296, + "learning_rate": 8.978109392315516e-05, + "loss": 1.5482995986938477, + "step": 93800 + }, + { + "epoch": 0.2839586517942276, + "grad_norm": 0.12047217041254044, + "learning_rate": 8.977729874151777e-05, + "loss": 1.5357385635375977, + "step": 93810 + }, + { + "epoch": 0.28398892134457343, + "grad_norm": 0.11336503177881241, + "learning_rate": 8.977350355988038e-05, + "loss": 1.5664459228515626, + "step": 93820 + }, + { + "epoch": 0.2840191908949193, + "grad_norm": 0.10939889401197433, + "learning_rate": 8.976970837824299e-05, + "loss": 1.5343971252441406, + "step": 93830 + }, + { + "epoch": 0.2840494604452651, + "grad_norm": 0.11527267098426819, + "learning_rate": 8.976591319660559e-05, + "loss": 1.561817741394043, + "step": 93840 + }, + { + "epoch": 0.28407972999561093, + "grad_norm": 0.11151076853275299, + "learning_rate": 8.97621180149682e-05, + "loss": 1.5922748565673828, + "step": 93850 + }, + { + "epoch": 0.2841099995459567, + "grad_norm": 0.11636205017566681, + "learning_rate": 8.97583228333308e-05, + "loss": 1.5501407623291015, + "step": 93860 + }, + { + "epoch": 0.2841402690963026, + "grad_norm": 0.11839153617620468, + "learning_rate": 8.975452765169341e-05, + "loss": 1.5676919937133789, + "step": 93870 + }, + { + "epoch": 0.28417053864664843, + "grad_norm": 0.11421389877796173, + "learning_rate": 8.975073247005603e-05, + "loss": 1.5305928230285644, + "step": 93880 + }, + { + "epoch": 0.2842008081969942, + "grad_norm": 0.10311013460159302, + "learning_rate": 8.974693728841862e-05, + "loss": 1.5355436325073242, + "step": 93890 + }, + { + "epoch": 0.2842310777473401, + "grad_norm": 0.1136518195271492, + "learning_rate": 8.974314210678124e-05, + "loss": 1.565479850769043, + "step": 93900 + }, + { + "epoch": 0.2842613472976859, + "grad_norm": 0.11935198307037354, + "learning_rate": 8.973934692514383e-05, + "loss": 1.552731704711914, + "step": 93910 + }, + { + "epoch": 0.2842916168480317, + "grad_norm": 0.12710943818092346, + "learning_rate": 8.973555174350645e-05, + "loss": 1.549265480041504, + "step": 93920 + }, + { + "epoch": 0.2843218863983776, + "grad_norm": 0.10736346244812012, + "learning_rate": 8.973175656186905e-05, + "loss": 1.564982795715332, + "step": 93930 + }, + { + "epoch": 0.2843521559487234, + "grad_norm": 0.11896375566720963, + "learning_rate": 8.972796138023167e-05, + "loss": 1.5363836288452148, + "step": 93940 + }, + { + "epoch": 0.2843824254990692, + "grad_norm": 0.11592672765254974, + "learning_rate": 8.972416619859426e-05, + "loss": 1.5439592361450196, + "step": 93950 + }, + { + "epoch": 0.284412695049415, + "grad_norm": 0.11852671951055527, + "learning_rate": 8.972037101695688e-05, + "loss": 1.5561536788940429, + "step": 93960 + }, + { + "epoch": 0.2844429645997609, + "grad_norm": 0.11666420847177505, + "learning_rate": 8.971657583531947e-05, + "loss": 1.588839340209961, + "step": 93970 + }, + { + "epoch": 0.2844732341501067, + "grad_norm": 0.1107279434800148, + "learning_rate": 8.971278065368209e-05, + "loss": 1.5722196578979493, + "step": 93980 + }, + { + "epoch": 0.2845035037004525, + "grad_norm": 0.12204374372959137, + "learning_rate": 8.970898547204468e-05, + "loss": 1.5909084320068358, + "step": 93990 + }, + { + "epoch": 0.28453377325079837, + "grad_norm": 0.11414012312889099, + "learning_rate": 8.97051902904073e-05, + "loss": 1.5960206031799316, + "step": 94000 + }, + { + "epoch": 0.28453377325079837, + "eval_loss": 1.5617133378982544, + "eval_runtime": 27.8309, + "eval_samples_per_second": 17.966, + "eval_steps_per_second": 1.15, + "step": 94000 + }, + { + "epoch": 0.28456404280114417, + "grad_norm": 0.1362532526254654, + "learning_rate": 8.970139510876991e-05, + "loss": 1.5222972869873046, + "step": 94010 + }, + { + "epoch": 0.28459431235149, + "grad_norm": 0.11773613095283508, + "learning_rate": 8.969759992713251e-05, + "loss": 1.5461684226989747, + "step": 94020 + }, + { + "epoch": 0.28462458190183587, + "grad_norm": 0.1193842962384224, + "learning_rate": 8.969380474549513e-05, + "loss": 1.5928747177124023, + "step": 94030 + }, + { + "epoch": 0.28465485145218167, + "grad_norm": 0.10995017737150192, + "learning_rate": 8.969000956385773e-05, + "loss": 1.571516990661621, + "step": 94040 + }, + { + "epoch": 0.2846851210025275, + "grad_norm": 0.10286790877580643, + "learning_rate": 8.968621438222034e-05, + "loss": 1.5655232429504395, + "step": 94050 + }, + { + "epoch": 0.2847153905528733, + "grad_norm": 0.11061020195484161, + "learning_rate": 8.968241920058294e-05, + "loss": 1.5670169830322265, + "step": 94060 + }, + { + "epoch": 0.28474566010321917, + "grad_norm": 0.13134805858135223, + "learning_rate": 8.967862401894556e-05, + "loss": 1.5771973609924317, + "step": 94070 + }, + { + "epoch": 0.284775929653565, + "grad_norm": 0.10895597189664841, + "learning_rate": 8.967482883730815e-05, + "loss": 1.5129175186157227, + "step": 94080 + }, + { + "epoch": 0.2848061992039108, + "grad_norm": 0.10435543954372406, + "learning_rate": 8.967103365567077e-05, + "loss": 1.5665060043334962, + "step": 94090 + }, + { + "epoch": 0.28483646875425667, + "grad_norm": 0.11097758263349533, + "learning_rate": 8.966723847403336e-05, + "loss": 1.5258119583129883, + "step": 94100 + }, + { + "epoch": 0.28486673830460246, + "grad_norm": 0.11760997027158737, + "learning_rate": 8.966344329239598e-05, + "loss": 1.524520206451416, + "step": 94110 + }, + { + "epoch": 0.2848970078549483, + "grad_norm": 0.131185844540596, + "learning_rate": 8.965964811075857e-05, + "loss": 1.6005844116210937, + "step": 94120 + }, + { + "epoch": 0.28492727740529417, + "grad_norm": 0.12128555774688721, + "learning_rate": 8.965585292912119e-05, + "loss": 1.556879997253418, + "step": 94130 + }, + { + "epoch": 0.28495754695563996, + "grad_norm": 0.11048834770917892, + "learning_rate": 8.96520577474838e-05, + "loss": 1.5684099197387695, + "step": 94140 + }, + { + "epoch": 0.2849878165059858, + "grad_norm": 0.11355982720851898, + "learning_rate": 8.96482625658464e-05, + "loss": 1.5410152435302735, + "step": 94150 + }, + { + "epoch": 0.2850180860563316, + "grad_norm": 0.12471097707748413, + "learning_rate": 8.964446738420901e-05, + "loss": 1.555255126953125, + "step": 94160 + }, + { + "epoch": 0.28504835560667746, + "grad_norm": 0.11182550340890884, + "learning_rate": 8.964067220257162e-05, + "loss": 1.5621150970458983, + "step": 94170 + }, + { + "epoch": 0.2850786251570233, + "grad_norm": 0.131051704287529, + "learning_rate": 8.963687702093422e-05, + "loss": 1.5810062408447265, + "step": 94180 + }, + { + "epoch": 0.2851088947073691, + "grad_norm": 0.11928396672010422, + "learning_rate": 8.963308183929683e-05, + "loss": 1.562995147705078, + "step": 94190 + }, + { + "epoch": 0.28513916425771496, + "grad_norm": 0.10439854860305786, + "learning_rate": 8.962928665765943e-05, + "loss": 1.544807529449463, + "step": 94200 + }, + { + "epoch": 0.28516943380806076, + "grad_norm": 0.11316760629415512, + "learning_rate": 8.962549147602204e-05, + "loss": 1.5294246673583984, + "step": 94210 + }, + { + "epoch": 0.2851997033584066, + "grad_norm": 0.11423327773809433, + "learning_rate": 8.962169629438465e-05, + "loss": 1.5567399978637695, + "step": 94220 + }, + { + "epoch": 0.28522997290875246, + "grad_norm": 0.11649588495492935, + "learning_rate": 8.961790111274725e-05, + "loss": 1.6041709899902343, + "step": 94230 + }, + { + "epoch": 0.28526024245909826, + "grad_norm": 0.11256513744592667, + "learning_rate": 8.961410593110987e-05, + "loss": 1.6117403030395507, + "step": 94240 + }, + { + "epoch": 0.2852905120094441, + "grad_norm": 0.12012933194637299, + "learning_rate": 8.961031074947248e-05, + "loss": 1.5744043350219727, + "step": 94250 + }, + { + "epoch": 0.2853207815597899, + "grad_norm": 0.12739287316799164, + "learning_rate": 8.960651556783508e-05, + "loss": 1.5528646469116212, + "step": 94260 + }, + { + "epoch": 0.28535105111013576, + "grad_norm": 0.11101444810628891, + "learning_rate": 8.960272038619769e-05, + "loss": 1.5959320068359375, + "step": 94270 + }, + { + "epoch": 0.2853813206604816, + "grad_norm": 0.12443210929632187, + "learning_rate": 8.95989252045603e-05, + "loss": 1.5685388565063476, + "step": 94280 + }, + { + "epoch": 0.2854115902108274, + "grad_norm": 0.106805719435215, + "learning_rate": 8.95951300229229e-05, + "loss": 1.5497793197631835, + "step": 94290 + }, + { + "epoch": 0.28544185976117326, + "grad_norm": 0.12425822764635086, + "learning_rate": 8.959133484128551e-05, + "loss": 1.5976664543151855, + "step": 94300 + }, + { + "epoch": 0.28547212931151905, + "grad_norm": 0.11585181951522827, + "learning_rate": 8.958753965964811e-05, + "loss": 1.5681568145751954, + "step": 94310 + }, + { + "epoch": 0.2855023988618649, + "grad_norm": 0.126677006483078, + "learning_rate": 8.958374447801072e-05, + "loss": 1.5798291206359862, + "step": 94320 + }, + { + "epoch": 0.28553266841221076, + "grad_norm": 0.12335353344678879, + "learning_rate": 8.957994929637332e-05, + "loss": 1.5473325729370118, + "step": 94330 + }, + { + "epoch": 0.28556293796255655, + "grad_norm": 0.1359671652317047, + "learning_rate": 8.957615411473593e-05, + "loss": 1.5500289916992187, + "step": 94340 + }, + { + "epoch": 0.2855932075129024, + "grad_norm": 0.11754146218299866, + "learning_rate": 8.957235893309854e-05, + "loss": 1.5402856826782227, + "step": 94350 + }, + { + "epoch": 0.2856234770632482, + "grad_norm": 0.11901118606328964, + "learning_rate": 8.956856375146114e-05, + "loss": 1.5667194366455077, + "step": 94360 + }, + { + "epoch": 0.28565374661359405, + "grad_norm": 0.1156257688999176, + "learning_rate": 8.956476856982375e-05, + "loss": 1.6098373413085938, + "step": 94370 + }, + { + "epoch": 0.2856840161639399, + "grad_norm": 0.1051952913403511, + "learning_rate": 8.956097338818637e-05, + "loss": 1.5764200210571289, + "step": 94380 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.11497578769922256, + "learning_rate": 8.955717820654896e-05, + "loss": 1.5410259246826172, + "step": 94390 + }, + { + "epoch": 0.28574455526463155, + "grad_norm": 0.12093551456928253, + "learning_rate": 8.955338302491158e-05, + "loss": 1.562614917755127, + "step": 94400 + }, + { + "epoch": 0.28577482481497735, + "grad_norm": 0.10559051483869553, + "learning_rate": 8.954958784327417e-05, + "loss": 1.5563241958618164, + "step": 94410 + }, + { + "epoch": 0.2858050943653232, + "grad_norm": 0.09061299264431, + "learning_rate": 8.954579266163679e-05, + "loss": 1.5687257766723632, + "step": 94420 + }, + { + "epoch": 0.28583536391566905, + "grad_norm": 0.1339888572692871, + "learning_rate": 8.95419974799994e-05, + "loss": 1.5488287925720214, + "step": 94430 + }, + { + "epoch": 0.28586563346601485, + "grad_norm": 0.11757197976112366, + "learning_rate": 8.9538202298362e-05, + "loss": 1.5801767349243163, + "step": 94440 + }, + { + "epoch": 0.2858959030163607, + "grad_norm": 0.12743930518627167, + "learning_rate": 8.953440711672461e-05, + "loss": 1.5544654846191406, + "step": 94450 + }, + { + "epoch": 0.28592617256670655, + "grad_norm": 0.1305999606847763, + "learning_rate": 8.953061193508722e-05, + "loss": 1.5307615280151368, + "step": 94460 + }, + { + "epoch": 0.28595644211705235, + "grad_norm": 0.1214061975479126, + "learning_rate": 8.952681675344982e-05, + "loss": 1.5379926681518554, + "step": 94470 + }, + { + "epoch": 0.2859867116673982, + "grad_norm": 0.11935689300298691, + "learning_rate": 8.952302157181243e-05, + "loss": 1.579193115234375, + "step": 94480 + }, + { + "epoch": 0.286016981217744, + "grad_norm": 0.12015414237976074, + "learning_rate": 8.951922639017503e-05, + "loss": 1.5340894699096679, + "step": 94490 + }, + { + "epoch": 0.28604725076808984, + "grad_norm": 0.11809594929218292, + "learning_rate": 8.951543120853764e-05, + "loss": 1.5490334510803223, + "step": 94500 + }, + { + "epoch": 0.28604725076808984, + "eval_loss": 1.5671182870864868, + "eval_runtime": 27.7287, + "eval_samples_per_second": 18.032, + "eval_steps_per_second": 1.154, + "step": 94500 + }, + { + "epoch": 0.2860775203184357, + "grad_norm": 0.12554123997688293, + "learning_rate": 8.951163602690026e-05, + "loss": 1.5575527191162108, + "step": 94510 + }, + { + "epoch": 0.2861077898687815, + "grad_norm": 0.12936796247959137, + "learning_rate": 8.950784084526285e-05, + "loss": 1.5744526863098145, + "step": 94520 + }, + { + "epoch": 0.28613805941912734, + "grad_norm": 0.11603395640850067, + "learning_rate": 8.950404566362547e-05, + "loss": 1.5712762832641602, + "step": 94530 + }, + { + "epoch": 0.28616832896947314, + "grad_norm": 0.12251412868499756, + "learning_rate": 8.950025048198806e-05, + "loss": 1.5283027648925782, + "step": 94540 + }, + { + "epoch": 0.286198598519819, + "grad_norm": 0.11530783027410507, + "learning_rate": 8.949645530035068e-05, + "loss": 1.5665281295776368, + "step": 94550 + }, + { + "epoch": 0.28622886807016484, + "grad_norm": 0.1291225105524063, + "learning_rate": 8.949266011871328e-05, + "loss": 1.5325125694274901, + "step": 94560 + }, + { + "epoch": 0.28625913762051064, + "grad_norm": 0.11704544723033905, + "learning_rate": 8.94888649370759e-05, + "loss": 1.5650819778442382, + "step": 94570 + }, + { + "epoch": 0.2862894071708565, + "grad_norm": 0.12887725234031677, + "learning_rate": 8.948506975543849e-05, + "loss": 1.5809175491333007, + "step": 94580 + }, + { + "epoch": 0.2863196767212023, + "grad_norm": 0.11672578006982803, + "learning_rate": 8.94812745738011e-05, + "loss": 1.5754295349121095, + "step": 94590 + }, + { + "epoch": 0.28634994627154814, + "grad_norm": 0.10448294878005981, + "learning_rate": 8.94774793921637e-05, + "loss": 1.5589162826538085, + "step": 94600 + }, + { + "epoch": 0.286380215821894, + "grad_norm": 0.11177925020456314, + "learning_rate": 8.947368421052632e-05, + "loss": 1.581231689453125, + "step": 94610 + }, + { + "epoch": 0.2864104853722398, + "grad_norm": 0.11963360756635666, + "learning_rate": 8.946988902888891e-05, + "loss": 1.529843807220459, + "step": 94620 + }, + { + "epoch": 0.28644075492258564, + "grad_norm": 0.11024115234613419, + "learning_rate": 8.946609384725153e-05, + "loss": 1.5327155113220214, + "step": 94630 + }, + { + "epoch": 0.28647102447293143, + "grad_norm": 0.11874277889728546, + "learning_rate": 8.946229866561415e-05, + "loss": 1.5841718673706056, + "step": 94640 + }, + { + "epoch": 0.2865012940232773, + "grad_norm": 0.10726679116487503, + "learning_rate": 8.945850348397674e-05, + "loss": 1.5541255950927735, + "step": 94650 + }, + { + "epoch": 0.28653156357362314, + "grad_norm": 0.11744225025177002, + "learning_rate": 8.945470830233936e-05, + "loss": 1.5589750289916993, + "step": 94660 + }, + { + "epoch": 0.28656183312396893, + "grad_norm": 0.1277303844690323, + "learning_rate": 8.945091312070195e-05, + "loss": 1.565091609954834, + "step": 94670 + }, + { + "epoch": 0.2865921026743148, + "grad_norm": 0.12094121426343918, + "learning_rate": 8.944711793906457e-05, + "loss": 1.576408863067627, + "step": 94680 + }, + { + "epoch": 0.2866223722246606, + "grad_norm": 0.12277104705572128, + "learning_rate": 8.944332275742717e-05, + "loss": 1.5482930183410644, + "step": 94690 + }, + { + "epoch": 0.28665264177500643, + "grad_norm": 0.12587516009807587, + "learning_rate": 8.943952757578979e-05, + "loss": 1.5989016532897948, + "step": 94700 + }, + { + "epoch": 0.2866829113253523, + "grad_norm": 0.10726810246706009, + "learning_rate": 8.943573239415238e-05, + "loss": 1.6180437088012696, + "step": 94710 + }, + { + "epoch": 0.2867131808756981, + "grad_norm": 0.10145585238933563, + "learning_rate": 8.9431937212515e-05, + "loss": 1.5576606750488282, + "step": 94720 + }, + { + "epoch": 0.28674345042604393, + "grad_norm": 0.11106817424297333, + "learning_rate": 8.942814203087759e-05, + "loss": 1.5606884002685546, + "step": 94730 + }, + { + "epoch": 0.28677371997638973, + "grad_norm": 0.11378039419651031, + "learning_rate": 8.942434684924021e-05, + "loss": 1.5500640869140625, + "step": 94740 + }, + { + "epoch": 0.2868039895267356, + "grad_norm": 0.10688526928424835, + "learning_rate": 8.942055166760282e-05, + "loss": 1.5479262351989747, + "step": 94750 + }, + { + "epoch": 0.28683425907708143, + "grad_norm": 0.10445530712604523, + "learning_rate": 8.941675648596542e-05, + "loss": 1.5480867385864259, + "step": 94760 + }, + { + "epoch": 0.28686452862742723, + "grad_norm": 0.10624255239963531, + "learning_rate": 8.941296130432803e-05, + "loss": 1.5283685684204102, + "step": 94770 + }, + { + "epoch": 0.2868947981777731, + "grad_norm": 0.12999257445335388, + "learning_rate": 8.940916612269063e-05, + "loss": 1.5737759590148925, + "step": 94780 + }, + { + "epoch": 0.2869250677281189, + "grad_norm": 0.10859416425228119, + "learning_rate": 8.940537094105324e-05, + "loss": 1.579010009765625, + "step": 94790 + }, + { + "epoch": 0.28695533727846473, + "grad_norm": 0.127994105219841, + "learning_rate": 8.940157575941585e-05, + "loss": 1.566169261932373, + "step": 94800 + }, + { + "epoch": 0.2869856068288106, + "grad_norm": 0.12789267301559448, + "learning_rate": 8.939778057777845e-05, + "loss": 1.5648733139038087, + "step": 94810 + }, + { + "epoch": 0.2870158763791564, + "grad_norm": 0.12956273555755615, + "learning_rate": 8.939398539614106e-05, + "loss": 1.5815213203430176, + "step": 94820 + }, + { + "epoch": 0.2870461459295022, + "grad_norm": 0.10857902467250824, + "learning_rate": 8.939019021450368e-05, + "loss": 1.565057373046875, + "step": 94830 + }, + { + "epoch": 0.287076415479848, + "grad_norm": 0.11035675555467606, + "learning_rate": 8.938639503286627e-05, + "loss": 1.5371042251586915, + "step": 94840 + }, + { + "epoch": 0.2871066850301939, + "grad_norm": 0.11437110602855682, + "learning_rate": 8.938259985122889e-05, + "loss": 1.5353572845458985, + "step": 94850 + }, + { + "epoch": 0.2871369545805397, + "grad_norm": 0.1241220012307167, + "learning_rate": 8.937880466959148e-05, + "loss": 1.5532109260559082, + "step": 94860 + }, + { + "epoch": 0.2871672241308855, + "grad_norm": 0.11337069422006607, + "learning_rate": 8.93750094879541e-05, + "loss": 1.5590285301208495, + "step": 94870 + }, + { + "epoch": 0.2871974936812314, + "grad_norm": 0.11751698702573776, + "learning_rate": 8.93712143063167e-05, + "loss": 1.5513689041137695, + "step": 94880 + }, + { + "epoch": 0.28722776323157717, + "grad_norm": 0.1250167042016983, + "learning_rate": 8.936741912467931e-05, + "loss": 1.5406081199645996, + "step": 94890 + }, + { + "epoch": 0.287258032781923, + "grad_norm": 0.12115654349327087, + "learning_rate": 8.936362394304192e-05, + "loss": 1.567002010345459, + "step": 94900 + }, + { + "epoch": 0.2872883023322689, + "grad_norm": 0.13032348453998566, + "learning_rate": 8.935982876140452e-05, + "loss": 1.5388715744018555, + "step": 94910 + }, + { + "epoch": 0.28731857188261467, + "grad_norm": 0.1145230308175087, + "learning_rate": 8.935603357976713e-05, + "loss": 1.5763893127441406, + "step": 94920 + }, + { + "epoch": 0.2873488414329605, + "grad_norm": 0.11779360473155975, + "learning_rate": 8.935223839812974e-05, + "loss": 1.5700326919555665, + "step": 94930 + }, + { + "epoch": 0.2873791109833063, + "grad_norm": 0.11610196530818939, + "learning_rate": 8.934844321649234e-05, + "loss": 1.5819510459899901, + "step": 94940 + }, + { + "epoch": 0.28740938053365217, + "grad_norm": 0.1193372905254364, + "learning_rate": 8.934464803485495e-05, + "loss": 1.5676036834716798, + "step": 94950 + }, + { + "epoch": 0.287439650083998, + "grad_norm": 0.12928345799446106, + "learning_rate": 8.934085285321755e-05, + "loss": 1.5030109405517578, + "step": 94960 + }, + { + "epoch": 0.2874699196343438, + "grad_norm": 0.11475250124931335, + "learning_rate": 8.933705767158016e-05, + "loss": 1.5585237503051759, + "step": 94970 + }, + { + "epoch": 0.28750018918468967, + "grad_norm": 0.11037223041057587, + "learning_rate": 8.933326248994277e-05, + "loss": 1.5769010543823243, + "step": 94980 + }, + { + "epoch": 0.28753045873503547, + "grad_norm": 0.11413609236478806, + "learning_rate": 8.932946730830539e-05, + "loss": 1.5371175765991212, + "step": 94990 + }, + { + "epoch": 0.2875607282853813, + "grad_norm": 0.11171326786279678, + "learning_rate": 8.932567212666798e-05, + "loss": 1.584661865234375, + "step": 95000 + }, + { + "epoch": 0.2875607282853813, + "eval_loss": 1.5440150499343872, + "eval_runtime": 28.2597, + "eval_samples_per_second": 17.693, + "eval_steps_per_second": 1.132, + "step": 95000 + }, + { + "epoch": 0.28759099783572717, + "grad_norm": 0.11214743554592133, + "learning_rate": 8.93218769450306e-05, + "loss": 1.560874080657959, + "step": 95010 + }, + { + "epoch": 0.28762126738607297, + "grad_norm": 0.11959239840507507, + "learning_rate": 8.931808176339319e-05, + "loss": 1.6236186981201173, + "step": 95020 + }, + { + "epoch": 0.2876515369364188, + "grad_norm": 0.11172633618116379, + "learning_rate": 8.931428658175581e-05, + "loss": 1.5439220428466798, + "step": 95030 + }, + { + "epoch": 0.2876818064867646, + "grad_norm": 0.12914711236953735, + "learning_rate": 8.931049140011842e-05, + "loss": 1.5678583145141602, + "step": 95040 + }, + { + "epoch": 0.28771207603711046, + "grad_norm": 0.11839235574007034, + "learning_rate": 8.930669621848102e-05, + "loss": 1.5534231185913085, + "step": 95050 + }, + { + "epoch": 0.2877423455874563, + "grad_norm": 0.11679337173700333, + "learning_rate": 8.930290103684363e-05, + "loss": 1.5600141525268554, + "step": 95060 + }, + { + "epoch": 0.2877726151378021, + "grad_norm": 0.11177125573158264, + "learning_rate": 8.929910585520623e-05, + "loss": 1.5695199966430664, + "step": 95070 + }, + { + "epoch": 0.28780288468814796, + "grad_norm": 0.11651910096406937, + "learning_rate": 8.929531067356884e-05, + "loss": 1.5627662658691406, + "step": 95080 + }, + { + "epoch": 0.28783315423849376, + "grad_norm": 0.126990407705307, + "learning_rate": 8.929151549193144e-05, + "loss": 1.548466682434082, + "step": 95090 + }, + { + "epoch": 0.2878634237888396, + "grad_norm": 0.10986367613077164, + "learning_rate": 8.928772031029405e-05, + "loss": 1.54626522064209, + "step": 95100 + }, + { + "epoch": 0.28789369333918546, + "grad_norm": 0.12057078629732132, + "learning_rate": 8.928392512865666e-05, + "loss": 1.5695917129516601, + "step": 95110 + }, + { + "epoch": 0.28792396288953126, + "grad_norm": 0.12053154408931732, + "learning_rate": 8.928012994701928e-05, + "loss": 1.5790861129760743, + "step": 95120 + }, + { + "epoch": 0.2879542324398771, + "grad_norm": 0.11016379296779633, + "learning_rate": 8.927633476538187e-05, + "loss": 1.5230471611022949, + "step": 95130 + }, + { + "epoch": 0.2879845019902229, + "grad_norm": 0.11153306066989899, + "learning_rate": 8.927253958374449e-05, + "loss": 1.5420608520507812, + "step": 95140 + }, + { + "epoch": 0.28801477154056876, + "grad_norm": 0.11934176087379456, + "learning_rate": 8.926874440210708e-05, + "loss": 1.597188377380371, + "step": 95150 + }, + { + "epoch": 0.2880450410909146, + "grad_norm": 0.11774180829524994, + "learning_rate": 8.92649492204697e-05, + "loss": 1.5631675720214844, + "step": 95160 + }, + { + "epoch": 0.2880753106412604, + "grad_norm": 0.11716342717409134, + "learning_rate": 8.926115403883229e-05, + "loss": 1.54725341796875, + "step": 95170 + }, + { + "epoch": 0.28810558019160626, + "grad_norm": 0.12220901995897293, + "learning_rate": 8.925735885719491e-05, + "loss": 1.5590792655944825, + "step": 95180 + }, + { + "epoch": 0.2881358497419521, + "grad_norm": 0.11753271520137787, + "learning_rate": 8.92535636755575e-05, + "loss": 1.5271930694580078, + "step": 95190 + }, + { + "epoch": 0.2881661192922979, + "grad_norm": 0.12261504679918289, + "learning_rate": 8.924976849392012e-05, + "loss": 1.556070613861084, + "step": 95200 + }, + { + "epoch": 0.28819638884264376, + "grad_norm": 0.12684617936611176, + "learning_rate": 8.924597331228272e-05, + "loss": 1.5888557434082031, + "step": 95210 + }, + { + "epoch": 0.28822665839298955, + "grad_norm": 0.12840206921100616, + "learning_rate": 8.924217813064534e-05, + "loss": 1.5770806312561034, + "step": 95220 + }, + { + "epoch": 0.2882569279433354, + "grad_norm": 0.13074961304664612, + "learning_rate": 8.923838294900793e-05, + "loss": 1.5859264373779296, + "step": 95230 + }, + { + "epoch": 0.28828719749368126, + "grad_norm": 0.11360140144824982, + "learning_rate": 8.923458776737055e-05, + "loss": 1.55786190032959, + "step": 95240 + }, + { + "epoch": 0.28831746704402705, + "grad_norm": 0.11908027529716492, + "learning_rate": 8.923079258573317e-05, + "loss": 1.5747963905334472, + "step": 95250 + }, + { + "epoch": 0.2883477365943729, + "grad_norm": 0.10301583260297775, + "learning_rate": 8.922699740409576e-05, + "loss": 1.5970401763916016, + "step": 95260 + }, + { + "epoch": 0.2883780061447187, + "grad_norm": 0.12294480204582214, + "learning_rate": 8.922320222245838e-05, + "loss": 1.519191074371338, + "step": 95270 + }, + { + "epoch": 0.28840827569506455, + "grad_norm": 0.10629472136497498, + "learning_rate": 8.921940704082097e-05, + "loss": 1.5828152656555177, + "step": 95280 + }, + { + "epoch": 0.2884385452454104, + "grad_norm": 0.12827467918395996, + "learning_rate": 8.921561185918359e-05, + "loss": 1.541558074951172, + "step": 95290 + }, + { + "epoch": 0.2884688147957562, + "grad_norm": 0.1152605190873146, + "learning_rate": 8.921181667754618e-05, + "loss": 1.527216911315918, + "step": 95300 + }, + { + "epoch": 0.28849908434610205, + "grad_norm": 0.10819448530673981, + "learning_rate": 8.92080214959088e-05, + "loss": 1.5624302864074706, + "step": 95310 + }, + { + "epoch": 0.28852935389644785, + "grad_norm": 0.10914681851863861, + "learning_rate": 8.92042263142714e-05, + "loss": 1.563567543029785, + "step": 95320 + }, + { + "epoch": 0.2885596234467937, + "grad_norm": 0.11668543517589569, + "learning_rate": 8.920043113263401e-05, + "loss": 1.5650444030761719, + "step": 95330 + }, + { + "epoch": 0.28858989299713955, + "grad_norm": 0.11782027035951614, + "learning_rate": 8.919663595099661e-05, + "loss": 1.506418800354004, + "step": 95340 + }, + { + "epoch": 0.28862016254748535, + "grad_norm": 0.12076211720705032, + "learning_rate": 8.919284076935923e-05, + "loss": 1.5473934173583985, + "step": 95350 + }, + { + "epoch": 0.2886504320978312, + "grad_norm": 0.11247052252292633, + "learning_rate": 8.918904558772183e-05, + "loss": 1.5380638122558594, + "step": 95360 + }, + { + "epoch": 0.288680701648177, + "grad_norm": 0.11423768103122711, + "learning_rate": 8.918525040608444e-05, + "loss": 1.5669619560241699, + "step": 95370 + }, + { + "epoch": 0.28871097119852285, + "grad_norm": 0.108397476375103, + "learning_rate": 8.918145522444704e-05, + "loss": 1.5410391807556152, + "step": 95380 + }, + { + "epoch": 0.2887412407488687, + "grad_norm": 0.1191398948431015, + "learning_rate": 8.917766004280965e-05, + "loss": 1.6062126159667969, + "step": 95390 + }, + { + "epoch": 0.2887715102992145, + "grad_norm": 0.10599149018526077, + "learning_rate": 8.917386486117226e-05, + "loss": 1.57546968460083, + "step": 95400 + }, + { + "epoch": 0.28880177984956035, + "grad_norm": 0.1278969943523407, + "learning_rate": 8.917006967953486e-05, + "loss": 1.572166633605957, + "step": 95410 + }, + { + "epoch": 0.28883204939990614, + "grad_norm": 0.12468770891427994, + "learning_rate": 8.916627449789747e-05, + "loss": 1.5715354919433593, + "step": 95420 + }, + { + "epoch": 0.288862318950252, + "grad_norm": 0.1169586181640625, + "learning_rate": 8.916247931626007e-05, + "loss": 1.5169662475585937, + "step": 95430 + }, + { + "epoch": 0.28889258850059785, + "grad_norm": 0.11298501491546631, + "learning_rate": 8.91586841346227e-05, + "loss": 1.5375507354736329, + "step": 95440 + }, + { + "epoch": 0.28892285805094364, + "grad_norm": 0.10077667981386185, + "learning_rate": 8.915488895298529e-05, + "loss": 1.5678228378295898, + "step": 95450 + }, + { + "epoch": 0.2889531276012895, + "grad_norm": 0.10529881715774536, + "learning_rate": 8.91510937713479e-05, + "loss": 1.5730034828186035, + "step": 95460 + }, + { + "epoch": 0.2889833971516353, + "grad_norm": 0.1108318492770195, + "learning_rate": 8.91472985897105e-05, + "loss": 1.5669132232666017, + "step": 95470 + }, + { + "epoch": 0.28901366670198114, + "grad_norm": 0.1304338574409485, + "learning_rate": 8.914350340807312e-05, + "loss": 1.5423059463500977, + "step": 95480 + }, + { + "epoch": 0.289043936252327, + "grad_norm": 0.1191999763250351, + "learning_rate": 8.913970822643572e-05, + "loss": 1.5803953170776368, + "step": 95490 + }, + { + "epoch": 0.2890742058026728, + "grad_norm": 0.12085583060979843, + "learning_rate": 8.913591304479833e-05, + "loss": 1.5253908157348632, + "step": 95500 + }, + { + "epoch": 0.2890742058026728, + "eval_loss": 1.5470609664916992, + "eval_runtime": 27.7001, + "eval_samples_per_second": 18.051, + "eval_steps_per_second": 1.155, + "step": 95500 + }, + { + "epoch": 0.28910447535301864, + "grad_norm": 0.10242284089326859, + "learning_rate": 8.913211786316094e-05, + "loss": 1.5460086822509767, + "step": 95510 + }, + { + "epoch": 0.28913474490336444, + "grad_norm": 0.11085529625415802, + "learning_rate": 8.912832268152354e-05, + "loss": 1.5898216247558594, + "step": 95520 + }, + { + "epoch": 0.2891650144537103, + "grad_norm": 0.12128622084856033, + "learning_rate": 8.912452749988615e-05, + "loss": 1.5875482559204102, + "step": 95530 + }, + { + "epoch": 0.28919528400405614, + "grad_norm": 0.1143810972571373, + "learning_rate": 8.912073231824875e-05, + "loss": 1.5827335357666015, + "step": 95540 + }, + { + "epoch": 0.28922555355440194, + "grad_norm": 0.11839105188846588, + "learning_rate": 8.911693713661136e-05, + "loss": 1.5408305168151855, + "step": 95550 + }, + { + "epoch": 0.2892558231047478, + "grad_norm": 0.11726652830839157, + "learning_rate": 8.911314195497397e-05, + "loss": 1.557034969329834, + "step": 95560 + }, + { + "epoch": 0.2892860926550936, + "grad_norm": 0.11542271077632904, + "learning_rate": 8.910934677333657e-05, + "loss": 1.5636941909790039, + "step": 95570 + }, + { + "epoch": 0.28931636220543944, + "grad_norm": 0.11745303869247437, + "learning_rate": 8.910555159169918e-05, + "loss": 1.5549857139587402, + "step": 95580 + }, + { + "epoch": 0.2893466317557853, + "grad_norm": 0.11890950053930283, + "learning_rate": 8.910175641006178e-05, + "loss": 1.5713584899902344, + "step": 95590 + }, + { + "epoch": 0.2893769013061311, + "grad_norm": 0.10885488986968994, + "learning_rate": 8.909796122842439e-05, + "loss": 1.532928466796875, + "step": 95600 + }, + { + "epoch": 0.28940717085647694, + "grad_norm": 0.10984671860933304, + "learning_rate": 8.9094166046787e-05, + "loss": 1.5837246894836425, + "step": 95610 + }, + { + "epoch": 0.28943744040682273, + "grad_norm": 0.11612536758184433, + "learning_rate": 8.909037086514961e-05, + "loss": 1.5904096603393554, + "step": 95620 + }, + { + "epoch": 0.2894677099571686, + "grad_norm": 0.11684867739677429, + "learning_rate": 8.908657568351221e-05, + "loss": 1.5732596397399903, + "step": 95630 + }, + { + "epoch": 0.28949797950751444, + "grad_norm": 0.1120644137263298, + "learning_rate": 8.908278050187483e-05, + "loss": 1.554398250579834, + "step": 95640 + }, + { + "epoch": 0.28952824905786023, + "grad_norm": 0.11798585206270218, + "learning_rate": 8.907898532023743e-05, + "loss": 1.580520248413086, + "step": 95650 + }, + { + "epoch": 0.2895585186082061, + "grad_norm": 0.11882147192955017, + "learning_rate": 8.907519013860004e-05, + "loss": 1.5943733215332032, + "step": 95660 + }, + { + "epoch": 0.2895887881585519, + "grad_norm": 0.10931503772735596, + "learning_rate": 8.907139495696264e-05, + "loss": 1.5642335891723633, + "step": 95670 + }, + { + "epoch": 0.28961905770889773, + "grad_norm": 0.12023207545280457, + "learning_rate": 8.906759977532525e-05, + "loss": 1.5684636116027832, + "step": 95680 + }, + { + "epoch": 0.2896493272592436, + "grad_norm": 0.11460092663764954, + "learning_rate": 8.906380459368786e-05, + "loss": 1.5356966018676759, + "step": 95690 + }, + { + "epoch": 0.2896795968095894, + "grad_norm": 0.12127789109945297, + "learning_rate": 8.906000941205046e-05, + "loss": 1.5652923583984375, + "step": 95700 + }, + { + "epoch": 0.28970986635993523, + "grad_norm": 0.12101597338914871, + "learning_rate": 8.905621423041307e-05, + "loss": 1.5603742599487305, + "step": 95710 + }, + { + "epoch": 0.289740135910281, + "grad_norm": 0.13205121457576752, + "learning_rate": 8.905241904877567e-05, + "loss": 1.4786386489868164, + "step": 95720 + }, + { + "epoch": 0.2897704054606269, + "grad_norm": 0.11458142846822739, + "learning_rate": 8.90486238671383e-05, + "loss": 1.5730995178222655, + "step": 95730 + }, + { + "epoch": 0.28980067501097273, + "grad_norm": 0.12431563436985016, + "learning_rate": 8.904482868550089e-05, + "loss": 1.5647050857543945, + "step": 95740 + }, + { + "epoch": 0.2898309445613185, + "grad_norm": 0.13345441222190857, + "learning_rate": 8.90410335038635e-05, + "loss": 1.5590217590332032, + "step": 95750 + }, + { + "epoch": 0.2898612141116644, + "grad_norm": 0.10941213369369507, + "learning_rate": 8.90372383222261e-05, + "loss": 1.5524431228637696, + "step": 95760 + }, + { + "epoch": 0.2898914836620102, + "grad_norm": 0.10217206180095673, + "learning_rate": 8.903344314058872e-05, + "loss": 1.568308448791504, + "step": 95770 + }, + { + "epoch": 0.289921753212356, + "grad_norm": 0.11142687499523163, + "learning_rate": 8.902964795895131e-05, + "loss": 1.5732130050659179, + "step": 95780 + }, + { + "epoch": 0.2899520227627019, + "grad_norm": 0.12387353181838989, + "learning_rate": 8.902585277731393e-05, + "loss": 1.5976853370666504, + "step": 95790 + }, + { + "epoch": 0.2899822923130477, + "grad_norm": 0.11605790257453918, + "learning_rate": 8.902205759567652e-05, + "loss": 1.5185586929321289, + "step": 95800 + }, + { + "epoch": 0.2900125618633935, + "grad_norm": 0.12456687539815903, + "learning_rate": 8.901826241403914e-05, + "loss": 1.5666728973388673, + "step": 95810 + }, + { + "epoch": 0.2900428314137393, + "grad_norm": 0.1273283213376999, + "learning_rate": 8.901446723240173e-05, + "loss": 1.59061861038208, + "step": 95820 + }, + { + "epoch": 0.2900731009640852, + "grad_norm": 0.11713346838951111, + "learning_rate": 8.901067205076435e-05, + "loss": 1.579365062713623, + "step": 95830 + }, + { + "epoch": 0.290103370514431, + "grad_norm": 0.10356077551841736, + "learning_rate": 8.900687686912695e-05, + "loss": 1.6010005950927735, + "step": 95840 + }, + { + "epoch": 0.2901336400647768, + "grad_norm": 0.11392903327941895, + "learning_rate": 8.900308168748956e-05, + "loss": 1.5515995025634766, + "step": 95850 + }, + { + "epoch": 0.2901639096151227, + "grad_norm": 0.096946120262146, + "learning_rate": 8.899928650585218e-05, + "loss": 1.555659580230713, + "step": 95860 + }, + { + "epoch": 0.2901941791654685, + "grad_norm": 0.10897223651409149, + "learning_rate": 8.899549132421478e-05, + "loss": 1.5871569633483886, + "step": 95870 + }, + { + "epoch": 0.2902244487158143, + "grad_norm": 0.09815740585327148, + "learning_rate": 8.89916961425774e-05, + "loss": 1.5811177253723145, + "step": 95880 + }, + { + "epoch": 0.29025471826616017, + "grad_norm": 0.137852281332016, + "learning_rate": 8.898790096093999e-05, + "loss": 1.5509568214416505, + "step": 95890 + }, + { + "epoch": 0.29028498781650597, + "grad_norm": 0.12421470880508423, + "learning_rate": 8.898410577930261e-05, + "loss": 1.5953213691711425, + "step": 95900 + }, + { + "epoch": 0.2903152573668518, + "grad_norm": 0.10783623158931732, + "learning_rate": 8.89803105976652e-05, + "loss": 1.5687393188476562, + "step": 95910 + }, + { + "epoch": 0.29034552691719767, + "grad_norm": 0.11318065971136093, + "learning_rate": 8.897651541602782e-05, + "loss": 1.582443904876709, + "step": 95920 + }, + { + "epoch": 0.29037579646754347, + "grad_norm": 0.10187330096960068, + "learning_rate": 8.897272023439041e-05, + "loss": 1.6029544830322267, + "step": 95930 + }, + { + "epoch": 0.2904060660178893, + "grad_norm": 0.11298831552267075, + "learning_rate": 8.896892505275303e-05, + "loss": 1.5441030502319335, + "step": 95940 + }, + { + "epoch": 0.2904363355682351, + "grad_norm": 0.12198423594236374, + "learning_rate": 8.896512987111562e-05, + "loss": 1.5469493865966797, + "step": 95950 + }, + { + "epoch": 0.29046660511858097, + "grad_norm": 0.1312713772058487, + "learning_rate": 8.896133468947824e-05, + "loss": 1.5403743743896485, + "step": 95960 + }, + { + "epoch": 0.2904968746689268, + "grad_norm": 0.11184842884540558, + "learning_rate": 8.895753950784084e-05, + "loss": 1.580000114440918, + "step": 95970 + }, + { + "epoch": 0.2905271442192726, + "grad_norm": 0.11394474655389786, + "learning_rate": 8.895374432620346e-05, + "loss": 1.546464729309082, + "step": 95980 + }, + { + "epoch": 0.29055741376961847, + "grad_norm": 0.10946314036846161, + "learning_rate": 8.894994914456606e-05, + "loss": 1.5853256225585937, + "step": 95990 + }, + { + "epoch": 0.29058768331996426, + "grad_norm": 0.11712086945772171, + "learning_rate": 8.894615396292867e-05, + "loss": 1.5282188415527345, + "step": 96000 + }, + { + "epoch": 0.29058768331996426, + "eval_loss": 1.57463538646698, + "eval_runtime": 28.5153, + "eval_samples_per_second": 17.534, + "eval_steps_per_second": 1.122, + "step": 96000 + }, + { + "epoch": 0.2906179528703101, + "grad_norm": 0.11508490890264511, + "learning_rate": 8.894235878129127e-05, + "loss": 1.5621049880981446, + "step": 96010 + }, + { + "epoch": 0.29064822242065597, + "grad_norm": 0.12333814799785614, + "learning_rate": 8.893856359965388e-05, + "loss": 1.5898822784423827, + "step": 96020 + }, + { + "epoch": 0.29067849197100176, + "grad_norm": 0.11295115202665329, + "learning_rate": 8.893476841801649e-05, + "loss": 1.611212921142578, + "step": 96030 + }, + { + "epoch": 0.2907087615213476, + "grad_norm": 0.11387622356414795, + "learning_rate": 8.893097323637909e-05, + "loss": 1.528703498840332, + "step": 96040 + }, + { + "epoch": 0.2907390310716934, + "grad_norm": 0.10819453746080399, + "learning_rate": 8.892717805474171e-05, + "loss": 1.5327033996582031, + "step": 96050 + }, + { + "epoch": 0.29076930062203926, + "grad_norm": 0.1060190424323082, + "learning_rate": 8.89233828731043e-05, + "loss": 1.545465850830078, + "step": 96060 + }, + { + "epoch": 0.2907995701723851, + "grad_norm": 0.11874756217002869, + "learning_rate": 8.891958769146692e-05, + "loss": 1.573591423034668, + "step": 96070 + }, + { + "epoch": 0.2908298397227309, + "grad_norm": 0.11051110923290253, + "learning_rate": 8.891579250982952e-05, + "loss": 1.5332358360290528, + "step": 96080 + }, + { + "epoch": 0.29086010927307676, + "grad_norm": 0.11629100888967514, + "learning_rate": 8.891199732819213e-05, + "loss": 1.5181618690490724, + "step": 96090 + }, + { + "epoch": 0.29089037882342256, + "grad_norm": 0.1100994274020195, + "learning_rate": 8.890820214655474e-05, + "loss": 1.5648503303527832, + "step": 96100 + }, + { + "epoch": 0.2909206483737684, + "grad_norm": 0.13810792565345764, + "learning_rate": 8.890440696491735e-05, + "loss": 1.58840970993042, + "step": 96110 + }, + { + "epoch": 0.29095091792411426, + "grad_norm": 0.12892724573612213, + "learning_rate": 8.890061178327995e-05, + "loss": 1.5436335563659669, + "step": 96120 + }, + { + "epoch": 0.29098118747446006, + "grad_norm": 0.12921495735645294, + "learning_rate": 8.889681660164256e-05, + "loss": 1.5573341369628906, + "step": 96130 + }, + { + "epoch": 0.2910114570248059, + "grad_norm": 0.12024933099746704, + "learning_rate": 8.889302142000516e-05, + "loss": 1.5432574272155761, + "step": 96140 + }, + { + "epoch": 0.2910417265751517, + "grad_norm": 0.1196684017777443, + "learning_rate": 8.888922623836777e-05, + "loss": 1.5656624794006349, + "step": 96150 + }, + { + "epoch": 0.29107199612549756, + "grad_norm": 0.11829398572444916, + "learning_rate": 8.888543105673038e-05, + "loss": 1.533640193939209, + "step": 96160 + }, + { + "epoch": 0.2911022656758434, + "grad_norm": 0.11300772428512573, + "learning_rate": 8.888163587509298e-05, + "loss": 1.5498204231262207, + "step": 96170 + }, + { + "epoch": 0.2911325352261892, + "grad_norm": 0.11316046863794327, + "learning_rate": 8.887784069345559e-05, + "loss": 1.5145755767822267, + "step": 96180 + }, + { + "epoch": 0.29116280477653506, + "grad_norm": 0.1170358881354332, + "learning_rate": 8.88740455118182e-05, + "loss": 1.568644905090332, + "step": 96190 + }, + { + "epoch": 0.29119307432688085, + "grad_norm": 0.11115995794534683, + "learning_rate": 8.88702503301808e-05, + "loss": 1.56695556640625, + "step": 96200 + }, + { + "epoch": 0.2912233438772267, + "grad_norm": 0.10138732194900513, + "learning_rate": 8.88664551485434e-05, + "loss": 1.5867188453674317, + "step": 96210 + }, + { + "epoch": 0.29125361342757256, + "grad_norm": 0.12569890916347504, + "learning_rate": 8.886265996690601e-05, + "loss": 1.5737024307250977, + "step": 96220 + }, + { + "epoch": 0.29128388297791835, + "grad_norm": 0.11022252589464188, + "learning_rate": 8.885886478526863e-05, + "loss": 1.5529851913452148, + "step": 96230 + }, + { + "epoch": 0.2913141525282642, + "grad_norm": 0.1291370987892151, + "learning_rate": 8.885506960363122e-05, + "loss": 1.5539830207824707, + "step": 96240 + }, + { + "epoch": 0.29134442207861, + "grad_norm": 0.11018882691860199, + "learning_rate": 8.885127442199384e-05, + "loss": 1.550356101989746, + "step": 96250 + }, + { + "epoch": 0.29137469162895585, + "grad_norm": 0.1073523461818695, + "learning_rate": 8.884747924035645e-05, + "loss": 1.5432258605957032, + "step": 96260 + }, + { + "epoch": 0.2914049611793017, + "grad_norm": 0.12435737997293472, + "learning_rate": 8.884368405871906e-05, + "loss": 1.552669620513916, + "step": 96270 + }, + { + "epoch": 0.2914352307296475, + "grad_norm": 0.12424638867378235, + "learning_rate": 8.883988887708166e-05, + "loss": 1.5956727981567382, + "step": 96280 + }, + { + "epoch": 0.29146550027999335, + "grad_norm": 0.11058098077774048, + "learning_rate": 8.883609369544427e-05, + "loss": 1.5640081405639648, + "step": 96290 + }, + { + "epoch": 0.29149576983033915, + "grad_norm": 0.12218059599399567, + "learning_rate": 8.883229851380687e-05, + "loss": 1.5602850914001465, + "step": 96300 + }, + { + "epoch": 0.291526039380685, + "grad_norm": 0.10952356457710266, + "learning_rate": 8.882850333216948e-05, + "loss": 1.5138323783874512, + "step": 96310 + }, + { + "epoch": 0.29155630893103085, + "grad_norm": 0.10551533848047256, + "learning_rate": 8.882470815053209e-05, + "loss": 1.5397956848144532, + "step": 96320 + }, + { + "epoch": 0.29158657848137665, + "grad_norm": 0.11477280408143997, + "learning_rate": 8.882091296889469e-05, + "loss": 1.5259931564331055, + "step": 96330 + }, + { + "epoch": 0.2916168480317225, + "grad_norm": 0.12275133281946182, + "learning_rate": 8.88171177872573e-05, + "loss": 1.5612951278686524, + "step": 96340 + }, + { + "epoch": 0.2916471175820683, + "grad_norm": 0.11057250946760178, + "learning_rate": 8.88133226056199e-05, + "loss": 1.566610050201416, + "step": 96350 + }, + { + "epoch": 0.29167738713241415, + "grad_norm": 0.10350210964679718, + "learning_rate": 8.880952742398252e-05, + "loss": 1.5563981056213378, + "step": 96360 + }, + { + "epoch": 0.29170765668276, + "grad_norm": 0.11309251934289932, + "learning_rate": 8.880573224234511e-05, + "loss": 1.5994001388549806, + "step": 96370 + }, + { + "epoch": 0.2917379262331058, + "grad_norm": 0.11907411366701126, + "learning_rate": 8.880193706070773e-05, + "loss": 1.555600929260254, + "step": 96380 + }, + { + "epoch": 0.29176819578345164, + "grad_norm": 0.11726853251457214, + "learning_rate": 8.879814187907033e-05, + "loss": 1.6119306564331055, + "step": 96390 + }, + { + "epoch": 0.29179846533379744, + "grad_norm": 0.11657507717609406, + "learning_rate": 8.879434669743295e-05, + "loss": 1.5696510314941405, + "step": 96400 + }, + { + "epoch": 0.2918287348841433, + "grad_norm": 0.11691548675298691, + "learning_rate": 8.879055151579554e-05, + "loss": 1.536719512939453, + "step": 96410 + }, + { + "epoch": 0.29185900443448914, + "grad_norm": 0.10997115820646286, + "learning_rate": 8.878675633415816e-05, + "loss": 1.5531299591064454, + "step": 96420 + }, + { + "epoch": 0.29188927398483494, + "grad_norm": 0.12826110422611237, + "learning_rate": 8.878296115252075e-05, + "loss": 1.5496981620788575, + "step": 96430 + }, + { + "epoch": 0.2919195435351808, + "grad_norm": 0.1129937469959259, + "learning_rate": 8.877916597088337e-05, + "loss": 1.5836239814758302, + "step": 96440 + }, + { + "epoch": 0.2919498130855266, + "grad_norm": 0.11194389313459396, + "learning_rate": 8.877537078924596e-05, + "loss": 1.512193489074707, + "step": 96450 + }, + { + "epoch": 0.29198008263587244, + "grad_norm": 0.11481539905071259, + "learning_rate": 8.877157560760858e-05, + "loss": 1.5570795059204101, + "step": 96460 + }, + { + "epoch": 0.2920103521862183, + "grad_norm": 0.11791541427373886, + "learning_rate": 8.87677804259712e-05, + "loss": 1.5554738998413087, + "step": 96470 + }, + { + "epoch": 0.2920406217365641, + "grad_norm": 0.1247183233499527, + "learning_rate": 8.87639852443338e-05, + "loss": 1.5931415557861328, + "step": 96480 + }, + { + "epoch": 0.29207089128690994, + "grad_norm": 0.11576979607343674, + "learning_rate": 8.876019006269641e-05, + "loss": 1.5620304107666017, + "step": 96490 + }, + { + "epoch": 0.29210116083725574, + "grad_norm": 0.10854984819889069, + "learning_rate": 8.8756394881059e-05, + "loss": 1.5638641357421874, + "step": 96500 + }, + { + "epoch": 0.29210116083725574, + "eval_loss": 1.5518120527267456, + "eval_runtime": 27.7984, + "eval_samples_per_second": 17.987, + "eval_steps_per_second": 1.151, + "step": 96500 + }, + { + "epoch": 0.2921314303876016, + "grad_norm": 0.11688362807035446, + "learning_rate": 8.875259969942163e-05, + "loss": 1.573348331451416, + "step": 96510 + }, + { + "epoch": 0.29216169993794744, + "grad_norm": 0.11285575479269028, + "learning_rate": 8.874880451778422e-05, + "loss": 1.55220308303833, + "step": 96520 + }, + { + "epoch": 0.29219196948829324, + "grad_norm": 0.11986060440540314, + "learning_rate": 8.874500933614684e-05, + "loss": 1.5588891983032227, + "step": 96530 + }, + { + "epoch": 0.2922222390386391, + "grad_norm": 0.1194942519068718, + "learning_rate": 8.874121415450943e-05, + "loss": 1.5805150032043458, + "step": 96540 + }, + { + "epoch": 0.29225250858898494, + "grad_norm": 0.10772066563367844, + "learning_rate": 8.873741897287205e-05, + "loss": 1.5778319358825683, + "step": 96550 + }, + { + "epoch": 0.29228277813933073, + "grad_norm": 0.12273628264665604, + "learning_rate": 8.873362379123464e-05, + "loss": 1.4739811897277832, + "step": 96560 + }, + { + "epoch": 0.2923130476896766, + "grad_norm": 0.10875151306390762, + "learning_rate": 8.872982860959726e-05, + "loss": 1.5784435272216797, + "step": 96570 + }, + { + "epoch": 0.2923433172400224, + "grad_norm": 0.12345638126134872, + "learning_rate": 8.872603342795985e-05, + "loss": 1.5457494735717774, + "step": 96580 + }, + { + "epoch": 0.29237358679036823, + "grad_norm": 0.09802503883838654, + "learning_rate": 8.872223824632247e-05, + "loss": 1.5570305824279784, + "step": 96590 + }, + { + "epoch": 0.2924038563407141, + "grad_norm": 0.12575848400592804, + "learning_rate": 8.871844306468508e-05, + "loss": 1.5332436561584473, + "step": 96600 + }, + { + "epoch": 0.2924341258910599, + "grad_norm": 0.09494832158088684, + "learning_rate": 8.871464788304768e-05, + "loss": 1.5937241554260253, + "step": 96610 + }, + { + "epoch": 0.29246439544140573, + "grad_norm": 0.10480664670467377, + "learning_rate": 8.871085270141029e-05, + "loss": 1.5262303352355957, + "step": 96620 + }, + { + "epoch": 0.29249466499175153, + "grad_norm": 0.11313743889331818, + "learning_rate": 8.87070575197729e-05, + "loss": 1.5929277420043946, + "step": 96630 + }, + { + "epoch": 0.2925249345420974, + "grad_norm": 0.10988655686378479, + "learning_rate": 8.87032623381355e-05, + "loss": 1.5360729217529296, + "step": 96640 + }, + { + "epoch": 0.29255520409244323, + "grad_norm": 0.12929409742355347, + "learning_rate": 8.869946715649811e-05, + "loss": 1.5421920776367188, + "step": 96650 + }, + { + "epoch": 0.29258547364278903, + "grad_norm": 0.10804034769535065, + "learning_rate": 8.869567197486073e-05, + "loss": 1.5492337226867676, + "step": 96660 + }, + { + "epoch": 0.2926157431931349, + "grad_norm": 0.11286210268735886, + "learning_rate": 8.869187679322332e-05, + "loss": 1.5432916641235352, + "step": 96670 + }, + { + "epoch": 0.2926460127434807, + "grad_norm": 0.1188083216547966, + "learning_rate": 8.868808161158594e-05, + "loss": 1.5319276809692384, + "step": 96680 + }, + { + "epoch": 0.29267628229382653, + "grad_norm": 0.116559237241745, + "learning_rate": 8.868428642994853e-05, + "loss": 1.571624755859375, + "step": 96690 + }, + { + "epoch": 0.2927065518441724, + "grad_norm": 0.10359562188386917, + "learning_rate": 8.868049124831115e-05, + "loss": 1.5804734230041504, + "step": 96700 + }, + { + "epoch": 0.2927368213945182, + "grad_norm": 0.11138569563627243, + "learning_rate": 8.867669606667374e-05, + "loss": 1.5481194496154784, + "step": 96710 + }, + { + "epoch": 0.29276709094486403, + "grad_norm": 0.11851239949464798, + "learning_rate": 8.867290088503636e-05, + "loss": 1.5506575584411622, + "step": 96720 + }, + { + "epoch": 0.2927973604952098, + "grad_norm": 0.10700183361768723, + "learning_rate": 8.866910570339897e-05, + "loss": 1.5384891510009766, + "step": 96730 + }, + { + "epoch": 0.2928276300455557, + "grad_norm": 0.11347460001707077, + "learning_rate": 8.866531052176158e-05, + "loss": 1.5611947059631348, + "step": 96740 + }, + { + "epoch": 0.2928578995959015, + "grad_norm": 0.12017222493886948, + "learning_rate": 8.866151534012418e-05, + "loss": 1.5606155395507812, + "step": 96750 + }, + { + "epoch": 0.2928881691462473, + "grad_norm": 0.10668715089559555, + "learning_rate": 8.865772015848679e-05, + "loss": 1.5334648132324218, + "step": 96760 + }, + { + "epoch": 0.2929184386965932, + "grad_norm": 0.11953724920749664, + "learning_rate": 8.86539249768494e-05, + "loss": 1.5757920265197753, + "step": 96770 + }, + { + "epoch": 0.29294870824693897, + "grad_norm": 0.10593316704034805, + "learning_rate": 8.8650129795212e-05, + "loss": 1.5506741523742675, + "step": 96780 + }, + { + "epoch": 0.2929789777972848, + "grad_norm": 0.1196618527173996, + "learning_rate": 8.86463346135746e-05, + "loss": 1.5635566711425781, + "step": 96790 + }, + { + "epoch": 0.2930092473476307, + "grad_norm": 0.10277976840734482, + "learning_rate": 8.864253943193721e-05, + "loss": 1.5678190231323241, + "step": 96800 + }, + { + "epoch": 0.29303951689797647, + "grad_norm": 0.12010172009468079, + "learning_rate": 8.863874425029982e-05, + "loss": 1.5532651901245118, + "step": 96810 + }, + { + "epoch": 0.2930697864483223, + "grad_norm": 0.10226353257894516, + "learning_rate": 8.863494906866242e-05, + "loss": 1.5578363418579102, + "step": 96820 + }, + { + "epoch": 0.2931000559986681, + "grad_norm": 0.10773339867591858, + "learning_rate": 8.863115388702503e-05, + "loss": 1.5803295135498048, + "step": 96830 + }, + { + "epoch": 0.29313032554901397, + "grad_norm": 0.11778149008750916, + "learning_rate": 8.862735870538765e-05, + "loss": 1.5821724891662599, + "step": 96840 + }, + { + "epoch": 0.2931605950993598, + "grad_norm": 0.11566417664289474, + "learning_rate": 8.862356352375024e-05, + "loss": 1.5503574371337892, + "step": 96850 + }, + { + "epoch": 0.2931908646497056, + "grad_norm": 0.13554176688194275, + "learning_rate": 8.861976834211286e-05, + "loss": 1.5499509811401366, + "step": 96860 + }, + { + "epoch": 0.29322113420005147, + "grad_norm": 0.11397626996040344, + "learning_rate": 8.861597316047547e-05, + "loss": 1.5289968490600585, + "step": 96870 + }, + { + "epoch": 0.29325140375039727, + "grad_norm": 0.11752253025770187, + "learning_rate": 8.861217797883807e-05, + "loss": 1.564809799194336, + "step": 96880 + }, + { + "epoch": 0.2932816733007431, + "grad_norm": 0.12416937202215195, + "learning_rate": 8.860838279720068e-05, + "loss": 1.5066296577453613, + "step": 96890 + }, + { + "epoch": 0.29331194285108897, + "grad_norm": 0.11120537668466568, + "learning_rate": 8.860458761556328e-05, + "loss": 1.54202241897583, + "step": 96900 + }, + { + "epoch": 0.29334221240143477, + "grad_norm": 0.11827216297388077, + "learning_rate": 8.860079243392589e-05, + "loss": 1.5304578781127929, + "step": 96910 + }, + { + "epoch": 0.2933724819517806, + "grad_norm": 0.113768070936203, + "learning_rate": 8.85969972522885e-05, + "loss": 1.5244224548339844, + "step": 96920 + }, + { + "epoch": 0.2934027515021264, + "grad_norm": 0.10678211599588394, + "learning_rate": 8.85932020706511e-05, + "loss": 1.5516111373901367, + "step": 96930 + }, + { + "epoch": 0.29343302105247226, + "grad_norm": 0.11159873753786087, + "learning_rate": 8.858940688901371e-05, + "loss": 1.5757485389709474, + "step": 96940 + }, + { + "epoch": 0.2934632906028181, + "grad_norm": 0.11408783495426178, + "learning_rate": 8.858561170737631e-05, + "loss": 1.5397706985473634, + "step": 96950 + }, + { + "epoch": 0.2934935601531639, + "grad_norm": 0.11799893528223038, + "learning_rate": 8.858181652573892e-05, + "loss": 1.5469356536865235, + "step": 96960 + }, + { + "epoch": 0.29352382970350976, + "grad_norm": 0.10537403076887131, + "learning_rate": 8.857802134410154e-05, + "loss": 1.5812082290649414, + "step": 96970 + }, + { + "epoch": 0.29355409925385556, + "grad_norm": 0.10928601771593094, + "learning_rate": 8.857422616246413e-05, + "loss": 1.5615854263305664, + "step": 96980 + }, + { + "epoch": 0.2935843688042014, + "grad_norm": 0.11509109288454056, + "learning_rate": 8.857043098082675e-05, + "loss": 1.5574823379516602, + "step": 96990 + }, + { + "epoch": 0.29361463835454726, + "grad_norm": 0.12454349547624588, + "learning_rate": 8.856663579918934e-05, + "loss": 1.5462968826293946, + "step": 97000 + }, + { + "epoch": 0.29361463835454726, + "eval_loss": 1.5430142879486084, + "eval_runtime": 27.7209, + "eval_samples_per_second": 18.037, + "eval_steps_per_second": 1.154, + "step": 97000 + }, + { + "epoch": 0.29364490790489306, + "grad_norm": 0.10480430722236633, + "learning_rate": 8.856284061755196e-05, + "loss": 1.5614465713500976, + "step": 97010 + }, + { + "epoch": 0.2936751774552389, + "grad_norm": 0.12333271652460098, + "learning_rate": 8.855904543591456e-05, + "loss": 1.5843017578125, + "step": 97020 + }, + { + "epoch": 0.2937054470055847, + "grad_norm": 0.12499546259641647, + "learning_rate": 8.855525025427718e-05, + "loss": 1.5514116287231445, + "step": 97030 + }, + { + "epoch": 0.29373571655593056, + "grad_norm": 0.11001550406217575, + "learning_rate": 8.855145507263977e-05, + "loss": 1.5646456718444823, + "step": 97040 + }, + { + "epoch": 0.2937659861062764, + "grad_norm": 0.11458222568035126, + "learning_rate": 8.854765989100239e-05, + "loss": 1.528355884552002, + "step": 97050 + }, + { + "epoch": 0.2937962556566222, + "grad_norm": 0.10805228352546692, + "learning_rate": 8.854386470936498e-05, + "loss": 1.5855424880981446, + "step": 97060 + }, + { + "epoch": 0.29382652520696806, + "grad_norm": 0.10391438007354736, + "learning_rate": 8.85400695277276e-05, + "loss": 1.5990682601928712, + "step": 97070 + }, + { + "epoch": 0.29385679475731386, + "grad_norm": 0.11801748722791672, + "learning_rate": 8.853627434609022e-05, + "loss": 1.5695174217224122, + "step": 97080 + }, + { + "epoch": 0.2938870643076597, + "grad_norm": 0.1259482502937317, + "learning_rate": 8.853247916445281e-05, + "loss": 1.5431177139282226, + "step": 97090 + }, + { + "epoch": 0.29391733385800556, + "grad_norm": 0.11711201071739197, + "learning_rate": 8.852868398281543e-05, + "loss": 1.5769715309143066, + "step": 97100 + }, + { + "epoch": 0.29394760340835135, + "grad_norm": 0.10426773130893707, + "learning_rate": 8.852488880117802e-05, + "loss": 1.58245792388916, + "step": 97110 + }, + { + "epoch": 0.2939778729586972, + "grad_norm": 0.12032251805067062, + "learning_rate": 8.852109361954064e-05, + "loss": 1.5394311904907227, + "step": 97120 + }, + { + "epoch": 0.294008142509043, + "grad_norm": 0.10617969185113907, + "learning_rate": 8.851729843790323e-05, + "loss": 1.5681669235229492, + "step": 97130 + }, + { + "epoch": 0.29403841205938885, + "grad_norm": 0.11511784046888351, + "learning_rate": 8.851350325626585e-05, + "loss": 1.5443078994750976, + "step": 97140 + }, + { + "epoch": 0.2940686816097347, + "grad_norm": 0.13223421573638916, + "learning_rate": 8.850970807462845e-05, + "loss": 1.5690771102905274, + "step": 97150 + }, + { + "epoch": 0.2940989511600805, + "grad_norm": 0.11110720038414001, + "learning_rate": 8.850591289299107e-05, + "loss": 1.5252252578735352, + "step": 97160 + }, + { + "epoch": 0.29412922071042635, + "grad_norm": 0.10905064642429352, + "learning_rate": 8.850211771135366e-05, + "loss": 1.5741205215454102, + "step": 97170 + }, + { + "epoch": 0.29415949026077215, + "grad_norm": 0.10262744128704071, + "learning_rate": 8.849832252971628e-05, + "loss": 1.5755861282348633, + "step": 97180 + }, + { + "epoch": 0.294189759811118, + "grad_norm": 0.10986208915710449, + "learning_rate": 8.849452734807887e-05, + "loss": 1.5373710632324218, + "step": 97190 + }, + { + "epoch": 0.29422002936146385, + "grad_norm": 0.10512129962444305, + "learning_rate": 8.849073216644149e-05, + "loss": 1.5625973701477052, + "step": 97200 + }, + { + "epoch": 0.29425029891180965, + "grad_norm": 0.11085483431816101, + "learning_rate": 8.84869369848041e-05, + "loss": 1.5555009841918945, + "step": 97210 + }, + { + "epoch": 0.2942805684621555, + "grad_norm": 0.11559825390577316, + "learning_rate": 8.84831418031667e-05, + "loss": 1.562076950073242, + "step": 97220 + }, + { + "epoch": 0.2943108380125013, + "grad_norm": 0.12135186791419983, + "learning_rate": 8.847934662152931e-05, + "loss": 1.5517430305480957, + "step": 97230 + }, + { + "epoch": 0.29434110756284715, + "grad_norm": 0.12631744146347046, + "learning_rate": 8.847555143989191e-05, + "loss": 1.5598642349243164, + "step": 97240 + }, + { + "epoch": 0.294371377113193, + "grad_norm": 0.1096145510673523, + "learning_rate": 8.847175625825452e-05, + "loss": 1.5422590255737305, + "step": 97250 + }, + { + "epoch": 0.2944016466635388, + "grad_norm": 0.1260817050933838, + "learning_rate": 8.846796107661713e-05, + "loss": 1.560234260559082, + "step": 97260 + }, + { + "epoch": 0.29443191621388465, + "grad_norm": 0.09950975328683853, + "learning_rate": 8.846416589497975e-05, + "loss": 1.5537388801574707, + "step": 97270 + }, + { + "epoch": 0.2944621857642305, + "grad_norm": 0.11077172309160233, + "learning_rate": 8.846037071334234e-05, + "loss": 1.5365148544311524, + "step": 97280 + }, + { + "epoch": 0.2944924553145763, + "grad_norm": 0.11635537445545197, + "learning_rate": 8.845657553170496e-05, + "loss": 1.5686141967773437, + "step": 97290 + }, + { + "epoch": 0.29452272486492215, + "grad_norm": 0.09619398415088654, + "learning_rate": 8.845278035006755e-05, + "loss": 1.5982027053833008, + "step": 97300 + }, + { + "epoch": 0.29455299441526794, + "grad_norm": 0.12554720044136047, + "learning_rate": 8.844898516843017e-05, + "loss": 1.5388153076171875, + "step": 97310 + }, + { + "epoch": 0.2945832639656138, + "grad_norm": 0.10960359871387482, + "learning_rate": 8.844518998679276e-05, + "loss": 1.523221206665039, + "step": 97320 + }, + { + "epoch": 0.29461353351595965, + "grad_norm": 0.1122603490948677, + "learning_rate": 8.844139480515538e-05, + "loss": 1.548674964904785, + "step": 97330 + }, + { + "epoch": 0.29464380306630544, + "grad_norm": 0.11559951305389404, + "learning_rate": 8.843759962351799e-05, + "loss": 1.544433307647705, + "step": 97340 + }, + { + "epoch": 0.2946740726166513, + "grad_norm": 0.1103350892663002, + "learning_rate": 8.843380444188059e-05, + "loss": 1.5675105094909667, + "step": 97350 + }, + { + "epoch": 0.2947043421669971, + "grad_norm": 0.10850774496793747, + "learning_rate": 8.84300092602432e-05, + "loss": 1.5526643753051759, + "step": 97360 + }, + { + "epoch": 0.29473461171734294, + "grad_norm": 0.09763287007808685, + "learning_rate": 8.84262140786058e-05, + "loss": 1.5727052688598633, + "step": 97370 + }, + { + "epoch": 0.2947648812676888, + "grad_norm": 0.116800956428051, + "learning_rate": 8.842241889696841e-05, + "loss": 1.517163372039795, + "step": 97380 + }, + { + "epoch": 0.2947951508180346, + "grad_norm": 0.10773136466741562, + "learning_rate": 8.841862371533102e-05, + "loss": 1.5456063270568847, + "step": 97390 + }, + { + "epoch": 0.29482542036838044, + "grad_norm": 0.10902876406908035, + "learning_rate": 8.841482853369362e-05, + "loss": 1.5075072288513183, + "step": 97400 + }, + { + "epoch": 0.29485568991872624, + "grad_norm": 0.12162523716688156, + "learning_rate": 8.841103335205623e-05, + "loss": 1.5483784675598145, + "step": 97410 + }, + { + "epoch": 0.2948859594690721, + "grad_norm": 0.11124012619256973, + "learning_rate": 8.840723817041883e-05, + "loss": 1.579863929748535, + "step": 97420 + }, + { + "epoch": 0.29491622901941794, + "grad_norm": 0.12062089145183563, + "learning_rate": 8.840344298878144e-05, + "loss": 1.5308426856994628, + "step": 97430 + }, + { + "epoch": 0.29494649856976374, + "grad_norm": 0.1104327067732811, + "learning_rate": 8.839964780714405e-05, + "loss": 1.5562878608703614, + "step": 97440 + }, + { + "epoch": 0.2949767681201096, + "grad_norm": 0.12123962491750717, + "learning_rate": 8.839585262550665e-05, + "loss": 1.5578554153442383, + "step": 97450 + }, + { + "epoch": 0.2950070376704554, + "grad_norm": 0.13202807307243347, + "learning_rate": 8.839205744386926e-05, + "loss": 1.559685516357422, + "step": 97460 + }, + { + "epoch": 0.29503730722080124, + "grad_norm": 0.1217544823884964, + "learning_rate": 8.838826226223188e-05, + "loss": 1.5824968338012695, + "step": 97470 + }, + { + "epoch": 0.2950675767711471, + "grad_norm": 0.10625678300857544, + "learning_rate": 8.838446708059448e-05, + "loss": 1.5394673347473145, + "step": 97480 + }, + { + "epoch": 0.2950978463214929, + "grad_norm": 0.10901499539613724, + "learning_rate": 8.838067189895709e-05, + "loss": 1.5120176315307616, + "step": 97490 + }, + { + "epoch": 0.29512811587183874, + "grad_norm": 0.12444040179252625, + "learning_rate": 8.83768767173197e-05, + "loss": 1.5472951889038087, + "step": 97500 + }, + { + "epoch": 0.29512811587183874, + "eval_loss": 1.5705475807189941, + "eval_runtime": 28.1856, + "eval_samples_per_second": 17.74, + "eval_steps_per_second": 1.135, + "step": 97500 + }, + { + "epoch": 0.29515838542218453, + "grad_norm": 0.1128600537776947, + "learning_rate": 8.83730815356823e-05, + "loss": 1.5726961135864257, + "step": 97510 + }, + { + "epoch": 0.2951886549725304, + "grad_norm": 0.11678781360387802, + "learning_rate": 8.836928635404491e-05, + "loss": 1.5307796478271485, + "step": 97520 + }, + { + "epoch": 0.29521892452287624, + "grad_norm": 0.11682049930095673, + "learning_rate": 8.836549117240751e-05, + "loss": 1.5482444763183594, + "step": 97530 + }, + { + "epoch": 0.29524919407322203, + "grad_norm": 0.1197277158498764, + "learning_rate": 8.836169599077012e-05, + "loss": 1.560507869720459, + "step": 97540 + }, + { + "epoch": 0.2952794636235679, + "grad_norm": 0.11284167319536209, + "learning_rate": 8.835790080913273e-05, + "loss": 1.5911805152893066, + "step": 97550 + }, + { + "epoch": 0.2953097331739137, + "grad_norm": 0.12326706945896149, + "learning_rate": 8.835410562749533e-05, + "loss": 1.5686161041259765, + "step": 97560 + }, + { + "epoch": 0.29534000272425953, + "grad_norm": 0.12255217880010605, + "learning_rate": 8.835031044585794e-05, + "loss": 1.5315845489501954, + "step": 97570 + }, + { + "epoch": 0.2953702722746054, + "grad_norm": 0.1373419612646103, + "learning_rate": 8.834651526422056e-05, + "loss": 1.5664033889770508, + "step": 97580 + }, + { + "epoch": 0.2954005418249512, + "grad_norm": 0.1221960186958313, + "learning_rate": 8.834272008258315e-05, + "loss": 1.522714614868164, + "step": 97590 + }, + { + "epoch": 0.29543081137529703, + "grad_norm": 0.11105217784643173, + "learning_rate": 8.833892490094577e-05, + "loss": 1.571159267425537, + "step": 97600 + }, + { + "epoch": 0.2954610809256428, + "grad_norm": 0.11324623227119446, + "learning_rate": 8.833512971930836e-05, + "loss": 1.593027114868164, + "step": 97610 + }, + { + "epoch": 0.2954913504759887, + "grad_norm": 0.11268675327301025, + "learning_rate": 8.833133453767098e-05, + "loss": 1.5667147636413574, + "step": 97620 + }, + { + "epoch": 0.29552162002633453, + "grad_norm": 0.10684410482645035, + "learning_rate": 8.832753935603357e-05, + "loss": 1.578474235534668, + "step": 97630 + }, + { + "epoch": 0.2955518895766803, + "grad_norm": 0.12118943780660629, + "learning_rate": 8.832374417439619e-05, + "loss": 1.5458185195922851, + "step": 97640 + }, + { + "epoch": 0.2955821591270262, + "grad_norm": 0.10524217784404755, + "learning_rate": 8.831994899275879e-05, + "loss": 1.5651735305786132, + "step": 97650 + }, + { + "epoch": 0.295612428677372, + "grad_norm": 0.1047036200761795, + "learning_rate": 8.83161538111214e-05, + "loss": 1.5672489166259767, + "step": 97660 + }, + { + "epoch": 0.2956426982277178, + "grad_norm": 0.12012917548418045, + "learning_rate": 8.8312358629484e-05, + "loss": 1.5799005508422852, + "step": 97670 + }, + { + "epoch": 0.2956729677780637, + "grad_norm": 0.12077079713344574, + "learning_rate": 8.830856344784662e-05, + "loss": 1.5269421577453612, + "step": 97680 + }, + { + "epoch": 0.2957032373284095, + "grad_norm": 0.1045793890953064, + "learning_rate": 8.830476826620922e-05, + "loss": 1.5374284744262696, + "step": 97690 + }, + { + "epoch": 0.2957335068787553, + "grad_norm": 0.10891677439212799, + "learning_rate": 8.830097308457183e-05, + "loss": 1.5481313705444335, + "step": 97700 + }, + { + "epoch": 0.2957637764291011, + "grad_norm": 0.11880633980035782, + "learning_rate": 8.829717790293445e-05, + "loss": 1.5433966636657714, + "step": 97710 + }, + { + "epoch": 0.295794045979447, + "grad_norm": 0.10958731174468994, + "learning_rate": 8.829338272129704e-05, + "loss": 1.554306697845459, + "step": 97720 + }, + { + "epoch": 0.2958243155297928, + "grad_norm": 0.10973076522350311, + "learning_rate": 8.828958753965966e-05, + "loss": 1.559360885620117, + "step": 97730 + }, + { + "epoch": 0.2958545850801386, + "grad_norm": 0.11645542830228806, + "learning_rate": 8.828579235802225e-05, + "loss": 1.5544378280639648, + "step": 97740 + }, + { + "epoch": 0.2958848546304845, + "grad_norm": 0.1449713557958603, + "learning_rate": 8.828199717638487e-05, + "loss": 1.5138795852661133, + "step": 97750 + }, + { + "epoch": 0.29591512418083027, + "grad_norm": 0.10778585821390152, + "learning_rate": 8.827820199474746e-05, + "loss": 1.574270248413086, + "step": 97760 + }, + { + "epoch": 0.2959453937311761, + "grad_norm": 0.1141478642821312, + "learning_rate": 8.827440681311008e-05, + "loss": 1.550429344177246, + "step": 97770 + }, + { + "epoch": 0.295975663281522, + "grad_norm": 0.11280089616775513, + "learning_rate": 8.827061163147268e-05, + "loss": 1.5490342140197755, + "step": 97780 + }, + { + "epoch": 0.29600593283186777, + "grad_norm": 0.12408007681369781, + "learning_rate": 8.82668164498353e-05, + "loss": 1.5957933425903321, + "step": 97790 + }, + { + "epoch": 0.2960362023822136, + "grad_norm": 0.11271169036626816, + "learning_rate": 8.826302126819789e-05, + "loss": 1.5467389106750489, + "step": 97800 + }, + { + "epoch": 0.2960664719325594, + "grad_norm": 0.10885244607925415, + "learning_rate": 8.825922608656051e-05, + "loss": 1.5628971099853515, + "step": 97810 + }, + { + "epoch": 0.29609674148290527, + "grad_norm": 0.11792900413274765, + "learning_rate": 8.82554309049231e-05, + "loss": 1.5274923324584961, + "step": 97820 + }, + { + "epoch": 0.2961270110332511, + "grad_norm": 0.11234567314386368, + "learning_rate": 8.825163572328572e-05, + "loss": 1.5941129684448243, + "step": 97830 + }, + { + "epoch": 0.2961572805835969, + "grad_norm": 0.11155137419700623, + "learning_rate": 8.824784054164833e-05, + "loss": 1.5421510696411134, + "step": 97840 + }, + { + "epoch": 0.29618755013394277, + "grad_norm": 0.11398469656705856, + "learning_rate": 8.824404536001093e-05, + "loss": 1.546208381652832, + "step": 97850 + }, + { + "epoch": 0.29621781968428856, + "grad_norm": 0.12839648127555847, + "learning_rate": 8.824025017837354e-05, + "loss": 1.524072265625, + "step": 97860 + }, + { + "epoch": 0.2962480892346344, + "grad_norm": 0.11292160302400589, + "learning_rate": 8.823645499673614e-05, + "loss": 1.5521705627441407, + "step": 97870 + }, + { + "epoch": 0.29627835878498027, + "grad_norm": 0.12211305648088455, + "learning_rate": 8.823265981509876e-05, + "loss": 1.5931697845458985, + "step": 97880 + }, + { + "epoch": 0.29630862833532606, + "grad_norm": 0.11916360259056091, + "learning_rate": 8.822886463346135e-05, + "loss": 1.5729625701904297, + "step": 97890 + }, + { + "epoch": 0.2963388978856719, + "grad_norm": 0.11947927623987198, + "learning_rate": 8.822506945182397e-05, + "loss": 1.5985238075256347, + "step": 97900 + }, + { + "epoch": 0.2963691674360177, + "grad_norm": 0.12321795523166656, + "learning_rate": 8.822127427018657e-05, + "loss": 1.5725520133972168, + "step": 97910 + }, + { + "epoch": 0.29639943698636356, + "grad_norm": 0.12722043693065643, + "learning_rate": 8.821747908854919e-05, + "loss": 1.5407949447631837, + "step": 97920 + }, + { + "epoch": 0.2964297065367094, + "grad_norm": 0.10251367092132568, + "learning_rate": 8.821368390691178e-05, + "loss": 1.5649585723876953, + "step": 97930 + }, + { + "epoch": 0.2964599760870552, + "grad_norm": 0.12733519077301025, + "learning_rate": 8.82098887252744e-05, + "loss": 1.509023666381836, + "step": 97940 + }, + { + "epoch": 0.29649024563740106, + "grad_norm": 0.11296479403972626, + "learning_rate": 8.8206093543637e-05, + "loss": 1.5603198051452636, + "step": 97950 + }, + { + "epoch": 0.2965205151877469, + "grad_norm": 0.10525888204574585, + "learning_rate": 8.820229836199961e-05, + "loss": 1.5616468429565429, + "step": 97960 + }, + { + "epoch": 0.2965507847380927, + "grad_norm": 0.09727541357278824, + "learning_rate": 8.819850318036222e-05, + "loss": 1.5424308776855469, + "step": 97970 + }, + { + "epoch": 0.29658105428843856, + "grad_norm": 0.10773497074842453, + "learning_rate": 8.819470799872482e-05, + "loss": 1.5586997985839843, + "step": 97980 + }, + { + "epoch": 0.29661132383878436, + "grad_norm": 0.11351752281188965, + "learning_rate": 8.819091281708743e-05, + "loss": 1.5591312408447267, + "step": 97990 + }, + { + "epoch": 0.2966415933891302, + "grad_norm": 0.11060302704572678, + "learning_rate": 8.818711763545003e-05, + "loss": 1.574100399017334, + "step": 98000 + }, + { + "epoch": 0.2966415933891302, + "eval_loss": 1.5679579973220825, + "eval_runtime": 28.2035, + "eval_samples_per_second": 17.728, + "eval_steps_per_second": 1.135, + "step": 98000 + }, + { + "epoch": 0.29667186293947606, + "grad_norm": 0.11647903919219971, + "learning_rate": 8.818332245381264e-05, + "loss": 1.5763797760009766, + "step": 98010 + }, + { + "epoch": 0.29670213248982186, + "grad_norm": 0.11099828034639359, + "learning_rate": 8.817952727217525e-05, + "loss": 1.5817581176757813, + "step": 98020 + }, + { + "epoch": 0.2967324020401677, + "grad_norm": 0.1066618487238884, + "learning_rate": 8.817573209053785e-05, + "loss": 1.5541491508483887, + "step": 98030 + }, + { + "epoch": 0.2967626715905135, + "grad_norm": 0.10982425510883331, + "learning_rate": 8.817193690890046e-05, + "loss": 1.5583815574645996, + "step": 98040 + }, + { + "epoch": 0.29679294114085936, + "grad_norm": 0.12568703293800354, + "learning_rate": 8.816814172726306e-05, + "loss": 1.5240507125854492, + "step": 98050 + }, + { + "epoch": 0.2968232106912052, + "grad_norm": 0.1394980102777481, + "learning_rate": 8.816434654562567e-05, + "loss": 1.581784439086914, + "step": 98060 + }, + { + "epoch": 0.296853480241551, + "grad_norm": 0.13324877619743347, + "learning_rate": 8.816055136398828e-05, + "loss": 1.5724088668823242, + "step": 98070 + }, + { + "epoch": 0.29688374979189686, + "grad_norm": 0.11941833794116974, + "learning_rate": 8.81567561823509e-05, + "loss": 1.549359130859375, + "step": 98080 + }, + { + "epoch": 0.29691401934224265, + "grad_norm": 0.10571421682834625, + "learning_rate": 8.81529610007135e-05, + "loss": 1.5507869720458984, + "step": 98090 + }, + { + "epoch": 0.2969442888925885, + "grad_norm": 0.10487403720617294, + "learning_rate": 8.814916581907611e-05, + "loss": 1.5585289001464844, + "step": 98100 + }, + { + "epoch": 0.29697455844293436, + "grad_norm": 0.10946498811244965, + "learning_rate": 8.814537063743871e-05, + "loss": 1.5939754486083983, + "step": 98110 + }, + { + "epoch": 0.29700482799328015, + "grad_norm": 0.12344454228878021, + "learning_rate": 8.814157545580132e-05, + "loss": 1.5613507270812987, + "step": 98120 + }, + { + "epoch": 0.297035097543626, + "grad_norm": 0.11808159947395325, + "learning_rate": 8.813778027416392e-05, + "loss": 1.5529601097106933, + "step": 98130 + }, + { + "epoch": 0.2970653670939718, + "grad_norm": 0.10615343600511551, + "learning_rate": 8.813398509252653e-05, + "loss": 1.5722219467163085, + "step": 98140 + }, + { + "epoch": 0.29709563664431765, + "grad_norm": 0.10788285732269287, + "learning_rate": 8.813018991088914e-05, + "loss": 1.5705958366394044, + "step": 98150 + }, + { + "epoch": 0.2971259061946635, + "grad_norm": 0.09829074144363403, + "learning_rate": 8.812639472925174e-05, + "loss": 1.57525577545166, + "step": 98160 + }, + { + "epoch": 0.2971561757450093, + "grad_norm": 0.12438356131315231, + "learning_rate": 8.812259954761435e-05, + "loss": 1.5999292373657226, + "step": 98170 + }, + { + "epoch": 0.29718644529535515, + "grad_norm": 0.12853720784187317, + "learning_rate": 8.811880436597695e-05, + "loss": 1.5644950866699219, + "step": 98180 + }, + { + "epoch": 0.29721671484570095, + "grad_norm": 0.10892144590616226, + "learning_rate": 8.811500918433957e-05, + "loss": 1.5521211624145508, + "step": 98190 + }, + { + "epoch": 0.2972469843960468, + "grad_norm": 0.11527415364980698, + "learning_rate": 8.811121400270217e-05, + "loss": 1.6216176986694335, + "step": 98200 + }, + { + "epoch": 0.29727725394639265, + "grad_norm": 0.10378170758485794, + "learning_rate": 8.810741882106479e-05, + "loss": 1.5806654930114745, + "step": 98210 + }, + { + "epoch": 0.29730752349673845, + "grad_norm": 0.10967572778463364, + "learning_rate": 8.810362363942738e-05, + "loss": 1.5386009216308594, + "step": 98220 + }, + { + "epoch": 0.2973377930470843, + "grad_norm": 0.12519480288028717, + "learning_rate": 8.809982845779e-05, + "loss": 1.5387529373168944, + "step": 98230 + }, + { + "epoch": 0.2973680625974301, + "grad_norm": 0.12853893637657166, + "learning_rate": 8.809603327615259e-05, + "loss": 1.5377367973327636, + "step": 98240 + }, + { + "epoch": 0.29739833214777595, + "grad_norm": 0.11024665832519531, + "learning_rate": 8.809223809451521e-05, + "loss": 1.5330297470092773, + "step": 98250 + }, + { + "epoch": 0.2974286016981218, + "grad_norm": 0.10853249579668045, + "learning_rate": 8.80884429128778e-05, + "loss": 1.5262317657470703, + "step": 98260 + }, + { + "epoch": 0.2974588712484676, + "grad_norm": 0.11328563839197159, + "learning_rate": 8.808464773124042e-05, + "loss": 1.5451620101928711, + "step": 98270 + }, + { + "epoch": 0.29748914079881345, + "grad_norm": 0.12120594084262848, + "learning_rate": 8.808085254960303e-05, + "loss": 1.5379944801330567, + "step": 98280 + }, + { + "epoch": 0.29751941034915924, + "grad_norm": 0.12403404712677002, + "learning_rate": 8.807705736796563e-05, + "loss": 1.517038631439209, + "step": 98290 + }, + { + "epoch": 0.2975496798995051, + "grad_norm": 0.09749158471822739, + "learning_rate": 8.807326218632824e-05, + "loss": 1.5408587455749512, + "step": 98300 + }, + { + "epoch": 0.29757994944985094, + "grad_norm": 0.10270000994205475, + "learning_rate": 8.806946700469085e-05, + "loss": 1.533907699584961, + "step": 98310 + }, + { + "epoch": 0.29761021900019674, + "grad_norm": 0.09592536091804504, + "learning_rate": 8.806567182305346e-05, + "loss": 1.562352180480957, + "step": 98320 + }, + { + "epoch": 0.2976404885505426, + "grad_norm": 0.11247708648443222, + "learning_rate": 8.806187664141606e-05, + "loss": 1.5756534576416015, + "step": 98330 + }, + { + "epoch": 0.2976707581008884, + "grad_norm": 0.12388087809085846, + "learning_rate": 8.805808145977868e-05, + "loss": 1.5407906532287599, + "step": 98340 + }, + { + "epoch": 0.29770102765123424, + "grad_norm": 0.12407258152961731, + "learning_rate": 8.805428627814127e-05, + "loss": 1.532951545715332, + "step": 98350 + }, + { + "epoch": 0.2977312972015801, + "grad_norm": 0.1113186776638031, + "learning_rate": 8.805049109650389e-05, + "loss": 1.5506946563720703, + "step": 98360 + }, + { + "epoch": 0.2977615667519259, + "grad_norm": 0.10991489142179489, + "learning_rate": 8.804669591486648e-05, + "loss": 1.5548603057861328, + "step": 98370 + }, + { + "epoch": 0.29779183630227174, + "grad_norm": 0.10318776220083237, + "learning_rate": 8.80429007332291e-05, + "loss": 1.5706605911254883, + "step": 98380 + }, + { + "epoch": 0.29782210585261754, + "grad_norm": 0.1234096959233284, + "learning_rate": 8.803910555159169e-05, + "loss": 1.527400588989258, + "step": 98390 + }, + { + "epoch": 0.2978523754029634, + "grad_norm": 0.11729662865400314, + "learning_rate": 8.803531036995431e-05, + "loss": 1.5351225852966308, + "step": 98400 + }, + { + "epoch": 0.29788264495330924, + "grad_norm": 0.10819701105356216, + "learning_rate": 8.80315151883169e-05, + "loss": 1.5675315856933594, + "step": 98410 + }, + { + "epoch": 0.29791291450365504, + "grad_norm": 0.12948469817638397, + "learning_rate": 8.802772000667952e-05, + "loss": 1.548628044128418, + "step": 98420 + }, + { + "epoch": 0.2979431840540009, + "grad_norm": 0.1165645495057106, + "learning_rate": 8.802392482504212e-05, + "loss": 1.5305551528930663, + "step": 98430 + }, + { + "epoch": 0.2979734536043467, + "grad_norm": 0.1181783601641655, + "learning_rate": 8.802012964340474e-05, + "loss": 1.5568586349487306, + "step": 98440 + }, + { + "epoch": 0.29800372315469253, + "grad_norm": 0.11090689152479172, + "learning_rate": 8.801633446176734e-05, + "loss": 1.5842644691467285, + "step": 98450 + }, + { + "epoch": 0.2980339927050384, + "grad_norm": 0.12177225947380066, + "learning_rate": 8.801253928012995e-05, + "loss": 1.5519237518310547, + "step": 98460 + }, + { + "epoch": 0.2980642622553842, + "grad_norm": 0.10950271785259247, + "learning_rate": 8.800874409849255e-05, + "loss": 1.5586138725280763, + "step": 98470 + }, + { + "epoch": 0.29809453180573003, + "grad_norm": 0.11746305972337723, + "learning_rate": 8.800494891685516e-05, + "loss": 1.5326536178588868, + "step": 98480 + }, + { + "epoch": 0.29812480135607583, + "grad_norm": 0.12153278291225433, + "learning_rate": 8.800115373521778e-05, + "loss": 1.5521635055541991, + "step": 98490 + }, + { + "epoch": 0.2981550709064217, + "grad_norm": 0.1277008354663849, + "learning_rate": 8.799735855358037e-05, + "loss": 1.5873908042907714, + "step": 98500 + }, + { + "epoch": 0.2981550709064217, + "eval_loss": 1.574212908744812, + "eval_runtime": 27.9669, + "eval_samples_per_second": 17.878, + "eval_steps_per_second": 1.144, + "step": 98500 + }, + { + "epoch": 0.29818534045676753, + "grad_norm": 0.11414088308811188, + "learning_rate": 8.799356337194299e-05, + "loss": 1.5653846740722657, + "step": 98510 + }, + { + "epoch": 0.29821561000711333, + "grad_norm": 0.10754022747278214, + "learning_rate": 8.798976819030558e-05, + "loss": 1.5578460693359375, + "step": 98520 + }, + { + "epoch": 0.2982458795574592, + "grad_norm": 0.1074303686618805, + "learning_rate": 8.79859730086682e-05, + "loss": 1.5244539260864258, + "step": 98530 + }, + { + "epoch": 0.298276149107805, + "grad_norm": 0.09399174153804779, + "learning_rate": 8.79821778270308e-05, + "loss": 1.5316570281982422, + "step": 98540 + }, + { + "epoch": 0.29830641865815083, + "grad_norm": 0.10791251808404922, + "learning_rate": 8.797838264539342e-05, + "loss": 1.5537005424499513, + "step": 98550 + }, + { + "epoch": 0.2983366882084967, + "grad_norm": 0.1117335632443428, + "learning_rate": 8.797458746375601e-05, + "loss": 1.5622425079345703, + "step": 98560 + }, + { + "epoch": 0.2983669577588425, + "grad_norm": 0.11528055369853973, + "learning_rate": 8.797079228211863e-05, + "loss": 1.580003547668457, + "step": 98570 + }, + { + "epoch": 0.29839722730918833, + "grad_norm": 0.10991325229406357, + "learning_rate": 8.796699710048123e-05, + "loss": 1.5454378128051758, + "step": 98580 + }, + { + "epoch": 0.2984274968595341, + "grad_norm": 0.10600793361663818, + "learning_rate": 8.796320191884384e-05, + "loss": 1.5565576553344727, + "step": 98590 + }, + { + "epoch": 0.29845776640988, + "grad_norm": 0.11219082772731781, + "learning_rate": 8.795940673720645e-05, + "loss": 1.5786705017089844, + "step": 98600 + }, + { + "epoch": 0.29848803596022583, + "grad_norm": 0.11884235590696335, + "learning_rate": 8.795561155556905e-05, + "loss": 1.566614532470703, + "step": 98610 + }, + { + "epoch": 0.2985183055105716, + "grad_norm": 0.10225526243448257, + "learning_rate": 8.795181637393166e-05, + "loss": 1.5563992500305175, + "step": 98620 + }, + { + "epoch": 0.2985485750609175, + "grad_norm": 0.11715850234031677, + "learning_rate": 8.794802119229426e-05, + "loss": 1.5635498046875, + "step": 98630 + }, + { + "epoch": 0.29857884461126327, + "grad_norm": 0.11217979341745377, + "learning_rate": 8.794422601065687e-05, + "loss": 1.5991918563842773, + "step": 98640 + }, + { + "epoch": 0.2986091141616091, + "grad_norm": 0.11340216547250748, + "learning_rate": 8.794043082901947e-05, + "loss": 1.5766645431518556, + "step": 98650 + }, + { + "epoch": 0.298639383711955, + "grad_norm": 0.09672331809997559, + "learning_rate": 8.793663564738208e-05, + "loss": 1.5857526779174804, + "step": 98660 + }, + { + "epoch": 0.29866965326230077, + "grad_norm": 0.10295002907514572, + "learning_rate": 8.793284046574469e-05, + "loss": 1.5453672409057617, + "step": 98670 + }, + { + "epoch": 0.2986999228126466, + "grad_norm": 0.1251716911792755, + "learning_rate": 8.792904528410729e-05, + "loss": 1.5721879959106446, + "step": 98680 + }, + { + "epoch": 0.2987301923629925, + "grad_norm": 0.13665883243083954, + "learning_rate": 8.792525010246991e-05, + "loss": 1.563288116455078, + "step": 98690 + }, + { + "epoch": 0.29876046191333827, + "grad_norm": 0.11758735775947571, + "learning_rate": 8.792145492083252e-05, + "loss": 1.5650620460510254, + "step": 98700 + }, + { + "epoch": 0.2987907314636841, + "grad_norm": 0.13021889328956604, + "learning_rate": 8.791765973919512e-05, + "loss": 1.589609432220459, + "step": 98710 + }, + { + "epoch": 0.2988210010140299, + "grad_norm": 0.11143431812524796, + "learning_rate": 8.791386455755773e-05, + "loss": 1.5203316688537598, + "step": 98720 + }, + { + "epoch": 0.29885127056437577, + "grad_norm": 0.12067681550979614, + "learning_rate": 8.791006937592034e-05, + "loss": 1.5468994140625, + "step": 98730 + }, + { + "epoch": 0.2988815401147216, + "grad_norm": 0.10429255664348602, + "learning_rate": 8.790627419428294e-05, + "loss": 1.5829765319824218, + "step": 98740 + }, + { + "epoch": 0.2989118096650674, + "grad_norm": 0.11005522310733795, + "learning_rate": 8.790247901264555e-05, + "loss": 1.5443607330322267, + "step": 98750 + }, + { + "epoch": 0.29894207921541327, + "grad_norm": 0.11582280695438385, + "learning_rate": 8.789868383100815e-05, + "loss": 1.5388648986816407, + "step": 98760 + }, + { + "epoch": 0.29897234876575907, + "grad_norm": 0.10275814682245255, + "learning_rate": 8.789488864937076e-05, + "loss": 1.548255443572998, + "step": 98770 + }, + { + "epoch": 0.2990026183161049, + "grad_norm": 0.13810941576957703, + "learning_rate": 8.789109346773337e-05, + "loss": 1.5598278045654297, + "step": 98780 + }, + { + "epoch": 0.29903288786645077, + "grad_norm": 0.11450391262769699, + "learning_rate": 8.788729828609597e-05, + "loss": 1.5539839744567872, + "step": 98790 + }, + { + "epoch": 0.29906315741679657, + "grad_norm": 0.1242535337805748, + "learning_rate": 8.788350310445858e-05, + "loss": 1.5548693656921386, + "step": 98800 + }, + { + "epoch": 0.2990934269671424, + "grad_norm": 0.10916899889707565, + "learning_rate": 8.787970792282118e-05, + "loss": 1.549062442779541, + "step": 98810 + }, + { + "epoch": 0.2991236965174882, + "grad_norm": 0.1051235944032669, + "learning_rate": 8.78759127411838e-05, + "loss": 1.5553143501281739, + "step": 98820 + }, + { + "epoch": 0.29915396606783407, + "grad_norm": 0.11761485785245895, + "learning_rate": 8.78721175595464e-05, + "loss": 1.5530231475830079, + "step": 98830 + }, + { + "epoch": 0.2991842356181799, + "grad_norm": 0.11390475183725357, + "learning_rate": 8.786832237790902e-05, + "loss": 1.5727481842041016, + "step": 98840 + }, + { + "epoch": 0.2992145051685257, + "grad_norm": 0.12833300232887268, + "learning_rate": 8.786452719627161e-05, + "loss": 1.538375473022461, + "step": 98850 + }, + { + "epoch": 0.29924477471887156, + "grad_norm": 0.11114112287759781, + "learning_rate": 8.786073201463423e-05, + "loss": 1.5786683082580566, + "step": 98860 + }, + { + "epoch": 0.29927504426921736, + "grad_norm": 0.12415607273578644, + "learning_rate": 8.785693683299682e-05, + "loss": 1.5666400909423828, + "step": 98870 + }, + { + "epoch": 0.2993053138195632, + "grad_norm": 0.1055600494146347, + "learning_rate": 8.785314165135944e-05, + "loss": 1.5637741088867188, + "step": 98880 + }, + { + "epoch": 0.29933558336990906, + "grad_norm": 0.11234482377767563, + "learning_rate": 8.784934646972204e-05, + "loss": 1.5679275512695312, + "step": 98890 + }, + { + "epoch": 0.29936585292025486, + "grad_norm": 0.11795703321695328, + "learning_rate": 8.784555128808465e-05, + "loss": 1.5412616729736328, + "step": 98900 + }, + { + "epoch": 0.2993961224706007, + "grad_norm": 0.1045927181839943, + "learning_rate": 8.784175610644726e-05, + "loss": 1.5571975708007812, + "step": 98910 + }, + { + "epoch": 0.2994263920209465, + "grad_norm": 0.10235337167978287, + "learning_rate": 8.783796092480986e-05, + "loss": 1.5408208847045899, + "step": 98920 + }, + { + "epoch": 0.29945666157129236, + "grad_norm": 0.11267884075641632, + "learning_rate": 8.783416574317248e-05, + "loss": 1.545064926147461, + "step": 98930 + }, + { + "epoch": 0.2994869311216382, + "grad_norm": 0.105677030980587, + "learning_rate": 8.783037056153507e-05, + "loss": 1.5703636169433595, + "step": 98940 + }, + { + "epoch": 0.299517200671984, + "grad_norm": 0.11454562842845917, + "learning_rate": 8.78265753798977e-05, + "loss": 1.586740493774414, + "step": 98950 + }, + { + "epoch": 0.29954747022232986, + "grad_norm": 0.10636892914772034, + "learning_rate": 8.782278019826029e-05, + "loss": 1.5272903442382812, + "step": 98960 + }, + { + "epoch": 0.29957773977267566, + "grad_norm": 0.11741111427545547, + "learning_rate": 8.78189850166229e-05, + "loss": 1.5479507446289062, + "step": 98970 + }, + { + "epoch": 0.2996080093230215, + "grad_norm": 0.11190953105688095, + "learning_rate": 8.78151898349855e-05, + "loss": 1.5548345565795898, + "step": 98980 + }, + { + "epoch": 0.29963827887336736, + "grad_norm": 0.1144450232386589, + "learning_rate": 8.781139465334812e-05, + "loss": 1.5673068046569825, + "step": 98990 + }, + { + "epoch": 0.29966854842371315, + "grad_norm": 0.10859275609254837, + "learning_rate": 8.780759947171071e-05, + "loss": 1.5421968460083009, + "step": 99000 + }, + { + "epoch": 0.29966854842371315, + "eval_loss": 1.5679314136505127, + "eval_runtime": 27.9507, + "eval_samples_per_second": 17.889, + "eval_steps_per_second": 1.145, + "step": 99000 + }, + { + "epoch": 0.299698817974059, + "grad_norm": 0.11047634482383728, + "learning_rate": 8.780380429007333e-05, + "loss": 1.545353889465332, + "step": 99010 + }, + { + "epoch": 0.2997290875244048, + "grad_norm": 0.10800673812627792, + "learning_rate": 8.780000910843592e-05, + "loss": 1.5797341346740723, + "step": 99020 + }, + { + "epoch": 0.29975935707475065, + "grad_norm": 0.12413868308067322, + "learning_rate": 8.779621392679854e-05, + "loss": 1.5529490470886231, + "step": 99030 + }, + { + "epoch": 0.2997896266250965, + "grad_norm": 0.10939791053533554, + "learning_rate": 8.779241874516113e-05, + "loss": 1.5623417854309083, + "step": 99040 + }, + { + "epoch": 0.2998198961754423, + "grad_norm": 0.11923877149820328, + "learning_rate": 8.778862356352375e-05, + "loss": 1.571925449371338, + "step": 99050 + }, + { + "epoch": 0.29985016572578815, + "grad_norm": 0.11429622024297714, + "learning_rate": 8.778482838188636e-05, + "loss": 1.5955093383789063, + "step": 99060 + }, + { + "epoch": 0.29988043527613395, + "grad_norm": 0.10407210886478424, + "learning_rate": 8.778103320024897e-05, + "loss": 1.5920455932617188, + "step": 99070 + }, + { + "epoch": 0.2999107048264798, + "grad_norm": 0.103024922311306, + "learning_rate": 8.777723801861157e-05, + "loss": 1.5781502723693848, + "step": 99080 + }, + { + "epoch": 0.29994097437682565, + "grad_norm": 0.11995577067136765, + "learning_rate": 8.777344283697418e-05, + "loss": 1.5496137619018555, + "step": 99090 + }, + { + "epoch": 0.29997124392717145, + "grad_norm": 0.10436010360717773, + "learning_rate": 8.77696476553368e-05, + "loss": 1.5539878845214843, + "step": 99100 + }, + { + "epoch": 0.3000015134775173, + "grad_norm": 0.09688788652420044, + "learning_rate": 8.776585247369939e-05, + "loss": 1.5932340621948242, + "step": 99110 + }, + { + "epoch": 0.3000317830278631, + "grad_norm": 0.10225267708301544, + "learning_rate": 8.776205729206201e-05, + "loss": 1.5806497573852538, + "step": 99120 + }, + { + "epoch": 0.30006205257820895, + "grad_norm": 0.10900043696165085, + "learning_rate": 8.77582621104246e-05, + "loss": 1.5897210121154786, + "step": 99130 + }, + { + "epoch": 0.3000923221285548, + "grad_norm": 0.11497728526592255, + "learning_rate": 8.775446692878722e-05, + "loss": 1.5504424095153808, + "step": 99140 + }, + { + "epoch": 0.3001225916789006, + "grad_norm": 0.10356469452381134, + "learning_rate": 8.775067174714981e-05, + "loss": 1.5799772262573242, + "step": 99150 + }, + { + "epoch": 0.30015286122924645, + "grad_norm": 0.10258816182613373, + "learning_rate": 8.774687656551243e-05, + "loss": 1.5763651847839355, + "step": 99160 + }, + { + "epoch": 0.30018313077959224, + "grad_norm": 0.10150710493326187, + "learning_rate": 8.774308138387502e-05, + "loss": 1.5964487075805665, + "step": 99170 + }, + { + "epoch": 0.3002134003299381, + "grad_norm": 0.12238956242799759, + "learning_rate": 8.773928620223764e-05, + "loss": 1.5725467681884766, + "step": 99180 + }, + { + "epoch": 0.30024366988028395, + "grad_norm": 0.10175609588623047, + "learning_rate": 8.773549102060025e-05, + "loss": 1.5681255340576172, + "step": 99190 + }, + { + "epoch": 0.30027393943062974, + "grad_norm": 0.12348952889442444, + "learning_rate": 8.773169583896286e-05, + "loss": 1.5467195510864258, + "step": 99200 + }, + { + "epoch": 0.3003042089809756, + "grad_norm": 0.1119011715054512, + "learning_rate": 8.772790065732546e-05, + "loss": 1.5513545989990234, + "step": 99210 + }, + { + "epoch": 0.3003344785313214, + "grad_norm": 0.12925337255001068, + "learning_rate": 8.772410547568807e-05, + "loss": 1.560175895690918, + "step": 99220 + }, + { + "epoch": 0.30036474808166724, + "grad_norm": 0.11519201099872589, + "learning_rate": 8.772031029405067e-05, + "loss": 1.5698898315429688, + "step": 99230 + }, + { + "epoch": 0.3003950176320131, + "grad_norm": 0.1241462305188179, + "learning_rate": 8.771651511241328e-05, + "loss": 1.5596937179565429, + "step": 99240 + }, + { + "epoch": 0.3004252871823589, + "grad_norm": 0.1073874905705452, + "learning_rate": 8.771271993077589e-05, + "loss": 1.5582588195800782, + "step": 99250 + }, + { + "epoch": 0.30045555673270474, + "grad_norm": 0.10334751754999161, + "learning_rate": 8.770892474913849e-05, + "loss": 1.5815276145935058, + "step": 99260 + }, + { + "epoch": 0.30048582628305054, + "grad_norm": 0.12224996834993362, + "learning_rate": 8.77051295675011e-05, + "loss": 1.5411958694458008, + "step": 99270 + }, + { + "epoch": 0.3005160958333964, + "grad_norm": 0.10969630628824234, + "learning_rate": 8.77013343858637e-05, + "loss": 1.5311199188232423, + "step": 99280 + }, + { + "epoch": 0.30054636538374224, + "grad_norm": 0.12172406911849976, + "learning_rate": 8.769753920422631e-05, + "loss": 1.580335521697998, + "step": 99290 + }, + { + "epoch": 0.30057663493408804, + "grad_norm": 0.10813978314399719, + "learning_rate": 8.769374402258893e-05, + "loss": 1.5874354362487793, + "step": 99300 + }, + { + "epoch": 0.3006069044844339, + "grad_norm": 0.10962007194757462, + "learning_rate": 8.768994884095154e-05, + "loss": 1.5938434600830078, + "step": 99310 + }, + { + "epoch": 0.3006371740347797, + "grad_norm": 0.10988468676805496, + "learning_rate": 8.768615365931414e-05, + "loss": 1.5550512313842773, + "step": 99320 + }, + { + "epoch": 0.30066744358512554, + "grad_norm": 0.12270302325487137, + "learning_rate": 8.768235847767675e-05, + "loss": 1.5505859375, + "step": 99330 + }, + { + "epoch": 0.3006977131354714, + "grad_norm": 0.10393288731575012, + "learning_rate": 8.767856329603935e-05, + "loss": 1.5531761169433593, + "step": 99340 + }, + { + "epoch": 0.3007279826858172, + "grad_norm": 0.10376426577568054, + "learning_rate": 8.767476811440196e-05, + "loss": 1.5434823036193848, + "step": 99350 + }, + { + "epoch": 0.30075825223616304, + "grad_norm": 0.12599849700927734, + "learning_rate": 8.767097293276457e-05, + "loss": 1.5879201889038086, + "step": 99360 + }, + { + "epoch": 0.3007885217865089, + "grad_norm": 0.10406573116779327, + "learning_rate": 8.766717775112717e-05, + "loss": 1.5792640686035155, + "step": 99370 + }, + { + "epoch": 0.3008187913368547, + "grad_norm": 0.1155945360660553, + "learning_rate": 8.766338256948978e-05, + "loss": 1.5651868820190429, + "step": 99380 + }, + { + "epoch": 0.30084906088720054, + "grad_norm": 0.1138944998383522, + "learning_rate": 8.765958738785238e-05, + "loss": 1.556753158569336, + "step": 99390 + }, + { + "epoch": 0.30087933043754633, + "grad_norm": 0.09850239753723145, + "learning_rate": 8.765579220621499e-05, + "loss": 1.563835334777832, + "step": 99400 + }, + { + "epoch": 0.3009095999878922, + "grad_norm": 0.12359034270048141, + "learning_rate": 8.76519970245776e-05, + "loss": 1.5667266845703125, + "step": 99410 + }, + { + "epoch": 0.30093986953823804, + "grad_norm": 0.1250597983598709, + "learning_rate": 8.76482018429402e-05, + "loss": 1.5664451599121094, + "step": 99420 + }, + { + "epoch": 0.30097013908858383, + "grad_norm": 0.12391723692417145, + "learning_rate": 8.764440666130282e-05, + "loss": 1.5406196594238282, + "step": 99430 + }, + { + "epoch": 0.3010004086389297, + "grad_norm": 0.11110011488199234, + "learning_rate": 8.764061147966541e-05, + "loss": 1.5404170989990233, + "step": 99440 + }, + { + "epoch": 0.3010306781892755, + "grad_norm": 0.1118297353386879, + "learning_rate": 8.763681629802803e-05, + "loss": 1.5722149848937987, + "step": 99450 + }, + { + "epoch": 0.30106094773962133, + "grad_norm": 0.11157845705747604, + "learning_rate": 8.763302111639062e-05, + "loss": 1.5488513946533202, + "step": 99460 + }, + { + "epoch": 0.3010912172899672, + "grad_norm": 0.12075722217559814, + "learning_rate": 8.762922593475324e-05, + "loss": 1.5979230880737305, + "step": 99470 + }, + { + "epoch": 0.301121486840313, + "grad_norm": 0.0955452173948288, + "learning_rate": 8.762543075311584e-05, + "loss": 1.571306037902832, + "step": 99480 + }, + { + "epoch": 0.30115175639065883, + "grad_norm": 0.11429940909147263, + "learning_rate": 8.762163557147846e-05, + "loss": 1.5230780601501466, + "step": 99490 + }, + { + "epoch": 0.3011820259410046, + "grad_norm": 0.10963478684425354, + "learning_rate": 8.761784038984106e-05, + "loss": 1.5369251251220704, + "step": 99500 + }, + { + "epoch": 0.3011820259410046, + "eval_loss": 1.559411644935608, + "eval_runtime": 27.8419, + "eval_samples_per_second": 17.959, + "eval_steps_per_second": 1.149, + "step": 99500 + }, + { + "epoch": 0.3012122954913505, + "grad_norm": 0.10239926725625992, + "learning_rate": 8.761404520820367e-05, + "loss": 1.626571273803711, + "step": 99510 + }, + { + "epoch": 0.30124256504169633, + "grad_norm": 0.10686001181602478, + "learning_rate": 8.761025002656627e-05, + "loss": 1.5783626556396484, + "step": 99520 + }, + { + "epoch": 0.3012728345920421, + "grad_norm": 0.10483458638191223, + "learning_rate": 8.760645484492888e-05, + "loss": 1.5588739395141602, + "step": 99530 + }, + { + "epoch": 0.301303104142388, + "grad_norm": 0.11255712807178497, + "learning_rate": 8.760265966329149e-05, + "loss": 1.582863998413086, + "step": 99540 + }, + { + "epoch": 0.3013333736927338, + "grad_norm": 0.11117447167634964, + "learning_rate": 8.759886448165409e-05, + "loss": 1.5388019561767579, + "step": 99550 + }, + { + "epoch": 0.3013636432430796, + "grad_norm": 0.11933456361293793, + "learning_rate": 8.759506930001671e-05, + "loss": 1.5654354095458984, + "step": 99560 + }, + { + "epoch": 0.3013939127934255, + "grad_norm": 0.10565993189811707, + "learning_rate": 8.75912741183793e-05, + "loss": 1.5583844184875488, + "step": 99570 + }, + { + "epoch": 0.3014241823437713, + "grad_norm": 0.1111116036772728, + "learning_rate": 8.758747893674192e-05, + "loss": 1.5501493453979491, + "step": 99580 + }, + { + "epoch": 0.3014544518941171, + "grad_norm": 0.12320330739021301, + "learning_rate": 8.758368375510452e-05, + "loss": 1.5422768592834473, + "step": 99590 + }, + { + "epoch": 0.3014847214444629, + "grad_norm": 0.11467713117599487, + "learning_rate": 8.757988857346714e-05, + "loss": 1.5268505096435547, + "step": 99600 + }, + { + "epoch": 0.3015149909948088, + "grad_norm": 0.11927984654903412, + "learning_rate": 8.757609339182973e-05, + "loss": 1.5511253356933594, + "step": 99610 + }, + { + "epoch": 0.3015452605451546, + "grad_norm": 0.12918274104595184, + "learning_rate": 8.757229821019235e-05, + "loss": 1.5619749069213866, + "step": 99620 + }, + { + "epoch": 0.3015755300955004, + "grad_norm": 0.1163366511464119, + "learning_rate": 8.756850302855494e-05, + "loss": 1.5847929954528808, + "step": 99630 + }, + { + "epoch": 0.3016057996458463, + "grad_norm": 0.11090753227472305, + "learning_rate": 8.756470784691756e-05, + "loss": 1.5485994338989257, + "step": 99640 + }, + { + "epoch": 0.30163606919619207, + "grad_norm": 0.14168614149093628, + "learning_rate": 8.756091266528015e-05, + "loss": 1.5390647888183593, + "step": 99650 + }, + { + "epoch": 0.3016663387465379, + "grad_norm": 0.11548653244972229, + "learning_rate": 8.755711748364277e-05, + "loss": 1.5682785034179687, + "step": 99660 + }, + { + "epoch": 0.3016966082968838, + "grad_norm": 0.12940596044063568, + "learning_rate": 8.755332230200536e-05, + "loss": 1.5783832550048829, + "step": 99670 + }, + { + "epoch": 0.30172687784722957, + "grad_norm": 0.11955516040325165, + "learning_rate": 8.754952712036798e-05, + "loss": 1.5906885147094727, + "step": 99680 + }, + { + "epoch": 0.3017571473975754, + "grad_norm": 0.11834285408258438, + "learning_rate": 8.754573193873059e-05, + "loss": 1.540839958190918, + "step": 99690 + }, + { + "epoch": 0.3017874169479212, + "grad_norm": 0.1193053126335144, + "learning_rate": 8.75419367570932e-05, + "loss": 1.585893440246582, + "step": 99700 + }, + { + "epoch": 0.30181768649826707, + "grad_norm": 0.10837394744157791, + "learning_rate": 8.753814157545581e-05, + "loss": 1.5509749412536622, + "step": 99710 + }, + { + "epoch": 0.3018479560486129, + "grad_norm": 0.10640381276607513, + "learning_rate": 8.75343463938184e-05, + "loss": 1.5861684799194335, + "step": 99720 + }, + { + "epoch": 0.3018782255989587, + "grad_norm": 0.10686460137367249, + "learning_rate": 8.753055121218103e-05, + "loss": 1.567219066619873, + "step": 99730 + }, + { + "epoch": 0.30190849514930457, + "grad_norm": 0.12209659814834595, + "learning_rate": 8.752675603054362e-05, + "loss": 1.5741491317749023, + "step": 99740 + }, + { + "epoch": 0.30193876469965036, + "grad_norm": 0.11518239974975586, + "learning_rate": 8.752296084890624e-05, + "loss": 1.5597039222717286, + "step": 99750 + }, + { + "epoch": 0.3019690342499962, + "grad_norm": 0.11547533422708511, + "learning_rate": 8.751916566726883e-05, + "loss": 1.537708854675293, + "step": 99760 + }, + { + "epoch": 0.30199930380034207, + "grad_norm": 0.11690931022167206, + "learning_rate": 8.751537048563145e-05, + "loss": 1.567864227294922, + "step": 99770 + }, + { + "epoch": 0.30202957335068786, + "grad_norm": 0.1100507527589798, + "learning_rate": 8.751157530399404e-05, + "loss": 1.547332763671875, + "step": 99780 + }, + { + "epoch": 0.3020598429010337, + "grad_norm": 0.11572840809822083, + "learning_rate": 8.750778012235666e-05, + "loss": 1.5565072059631349, + "step": 99790 + }, + { + "epoch": 0.3020901124513795, + "grad_norm": 0.11289612948894501, + "learning_rate": 8.750398494071927e-05, + "loss": 1.551613998413086, + "step": 99800 + }, + { + "epoch": 0.30212038200172536, + "grad_norm": 0.09955138713121414, + "learning_rate": 8.750018975908187e-05, + "loss": 1.5479066848754883, + "step": 99810 + }, + { + "epoch": 0.3021506515520712, + "grad_norm": 0.12472500652074814, + "learning_rate": 8.749639457744448e-05, + "loss": 1.5715435028076172, + "step": 99820 + }, + { + "epoch": 0.302180921102417, + "grad_norm": 0.11968538165092468, + "learning_rate": 8.749259939580709e-05, + "loss": 1.5507919311523437, + "step": 99830 + }, + { + "epoch": 0.30221119065276286, + "grad_norm": 0.10465749353170395, + "learning_rate": 8.748880421416969e-05, + "loss": 1.546958065032959, + "step": 99840 + }, + { + "epoch": 0.30224146020310866, + "grad_norm": 0.10999509692192078, + "learning_rate": 8.74850090325323e-05, + "loss": 1.5415168762207032, + "step": 99850 + }, + { + "epoch": 0.3022717297534545, + "grad_norm": 0.11659547686576843, + "learning_rate": 8.74812138508949e-05, + "loss": 1.565956974029541, + "step": 99860 + }, + { + "epoch": 0.30230199930380036, + "grad_norm": 0.11804646253585815, + "learning_rate": 8.747741866925751e-05, + "loss": 1.582132911682129, + "step": 99870 + }, + { + "epoch": 0.30233226885414616, + "grad_norm": 0.1072676032781601, + "learning_rate": 8.747362348762012e-05, + "loss": 1.5410971641540527, + "step": 99880 + }, + { + "epoch": 0.302362538404492, + "grad_norm": 0.11263362318277359, + "learning_rate": 8.746982830598272e-05, + "loss": 1.5411123275756835, + "step": 99890 + }, + { + "epoch": 0.3023928079548378, + "grad_norm": 0.11853748559951782, + "learning_rate": 8.746603312434533e-05, + "loss": 1.5549087524414062, + "step": 99900 + }, + { + "epoch": 0.30242307750518366, + "grad_norm": 0.10408563166856766, + "learning_rate": 8.746223794270793e-05, + "loss": 1.5465764045715331, + "step": 99910 + }, + { + "epoch": 0.3024533470555295, + "grad_norm": 0.11324165761470795, + "learning_rate": 8.745844276107055e-05, + "loss": 1.627296257019043, + "step": 99920 + }, + { + "epoch": 0.3024836166058753, + "grad_norm": 0.10891944915056229, + "learning_rate": 8.745464757943316e-05, + "loss": 1.5364253997802735, + "step": 99930 + }, + { + "epoch": 0.30251388615622116, + "grad_norm": 0.11883910745382309, + "learning_rate": 8.745085239779576e-05, + "loss": 1.5688566207885741, + "step": 99940 + }, + { + "epoch": 0.30254415570656695, + "grad_norm": 0.12234491854906082, + "learning_rate": 8.744705721615837e-05, + "loss": 1.5700769424438477, + "step": 99950 + }, + { + "epoch": 0.3025744252569128, + "grad_norm": 0.11769945919513702, + "learning_rate": 8.744326203452098e-05, + "loss": 1.5717280387878418, + "step": 99960 + }, + { + "epoch": 0.30260469480725866, + "grad_norm": 0.10509888082742691, + "learning_rate": 8.743946685288358e-05, + "loss": 1.5684127807617188, + "step": 99970 + }, + { + "epoch": 0.30263496435760445, + "grad_norm": 0.11524584889411926, + "learning_rate": 8.743567167124619e-05, + "loss": 1.5625635147094727, + "step": 99980 + }, + { + "epoch": 0.3026652339079503, + "grad_norm": 0.10983853042125702, + "learning_rate": 8.74318764896088e-05, + "loss": 1.5382303237915038, + "step": 99990 + }, + { + "epoch": 0.3026955034582961, + "grad_norm": 0.12150998413562775, + "learning_rate": 8.74280813079714e-05, + "loss": 1.5358810424804688, + "step": 100000 + }, + { + "epoch": 0.3026955034582961, + "eval_loss": 1.5618598461151123, + "eval_runtime": 28.2339, + "eval_samples_per_second": 17.709, + "eval_steps_per_second": 1.133, + "step": 100000 + }, + { + "epoch": 0.30272577300864195, + "grad_norm": 0.1312667578458786, + "learning_rate": 8.7424286126334e-05, + "loss": 1.5203598976135253, + "step": 100010 + }, + { + "epoch": 0.3027560425589878, + "grad_norm": 0.11169231683015823, + "learning_rate": 8.742049094469661e-05, + "loss": 1.5413299560546876, + "step": 100020 + }, + { + "epoch": 0.3027863121093336, + "grad_norm": 0.10423494130373001, + "learning_rate": 8.741669576305922e-05, + "loss": 1.563699722290039, + "step": 100030 + }, + { + "epoch": 0.30281658165967945, + "grad_norm": 0.11882658302783966, + "learning_rate": 8.741290058142184e-05, + "loss": 1.5578561782836915, + "step": 100040 + }, + { + "epoch": 0.30284685121002525, + "grad_norm": 0.11313895136117935, + "learning_rate": 8.740910539978443e-05, + "loss": 1.5916736602783204, + "step": 100050 + }, + { + "epoch": 0.3028771207603711, + "grad_norm": 0.10237900167703629, + "learning_rate": 8.740531021814705e-05, + "loss": 1.5369300842285156, + "step": 100060 + }, + { + "epoch": 0.30290739031071695, + "grad_norm": 0.11734233796596527, + "learning_rate": 8.740151503650964e-05, + "loss": 1.5938423156738282, + "step": 100070 + }, + { + "epoch": 0.30293765986106275, + "grad_norm": 0.11994223296642303, + "learning_rate": 8.739771985487226e-05, + "loss": 1.5761635780334473, + "step": 100080 + }, + { + "epoch": 0.3029679294114086, + "grad_norm": 0.1115526556968689, + "learning_rate": 8.739392467323485e-05, + "loss": 1.5670886993408204, + "step": 100090 + }, + { + "epoch": 0.30299819896175445, + "grad_norm": 0.13997486233711243, + "learning_rate": 8.739012949159747e-05, + "loss": 1.533871841430664, + "step": 100100 + }, + { + "epoch": 0.30302846851210025, + "grad_norm": 0.12230177968740463, + "learning_rate": 8.738633430996008e-05, + "loss": 1.5566866874694825, + "step": 100110 + }, + { + "epoch": 0.3030587380624461, + "grad_norm": 0.11208876967430115, + "learning_rate": 8.738253912832269e-05, + "loss": 1.5617710113525392, + "step": 100120 + }, + { + "epoch": 0.3030890076127919, + "grad_norm": 0.11174919456243515, + "learning_rate": 8.737874394668529e-05, + "loss": 1.522153663635254, + "step": 100130 + }, + { + "epoch": 0.30311927716313775, + "grad_norm": 0.14419353008270264, + "learning_rate": 8.73749487650479e-05, + "loss": 1.5548593521118164, + "step": 100140 + }, + { + "epoch": 0.3031495467134836, + "grad_norm": 0.10882646590471268, + "learning_rate": 8.73711535834105e-05, + "loss": 1.5594568252563477, + "step": 100150 + }, + { + "epoch": 0.3031798162638294, + "grad_norm": 0.12380830943584442, + "learning_rate": 8.736735840177311e-05, + "loss": 1.5701926231384278, + "step": 100160 + }, + { + "epoch": 0.30321008581417525, + "grad_norm": 0.10756119340658188, + "learning_rate": 8.736356322013573e-05, + "loss": 1.582888412475586, + "step": 100170 + }, + { + "epoch": 0.30324035536452104, + "grad_norm": 0.1294078230857849, + "learning_rate": 8.735976803849832e-05, + "loss": 1.5616849899291991, + "step": 100180 + }, + { + "epoch": 0.3032706249148669, + "grad_norm": 0.12136650830507278, + "learning_rate": 8.735597285686094e-05, + "loss": 1.5596652984619142, + "step": 100190 + }, + { + "epoch": 0.30330089446521274, + "grad_norm": 0.11007307469844818, + "learning_rate": 8.735217767522353e-05, + "loss": 1.5432707786560058, + "step": 100200 + }, + { + "epoch": 0.30333116401555854, + "grad_norm": 0.11009884625673294, + "learning_rate": 8.734838249358615e-05, + "loss": 1.545583438873291, + "step": 100210 + }, + { + "epoch": 0.3033614335659044, + "grad_norm": 0.12746794521808624, + "learning_rate": 8.734458731194874e-05, + "loss": 1.524437713623047, + "step": 100220 + }, + { + "epoch": 0.3033917031162502, + "grad_norm": 0.12464816123247147, + "learning_rate": 8.734079213031136e-05, + "loss": 1.5592117309570312, + "step": 100230 + }, + { + "epoch": 0.30342197266659604, + "grad_norm": 0.11557003110647202, + "learning_rate": 8.733699694867396e-05, + "loss": 1.5776677131652832, + "step": 100240 + }, + { + "epoch": 0.3034522422169419, + "grad_norm": 0.10612774640321732, + "learning_rate": 8.733320176703658e-05, + "loss": 1.5764894485473633, + "step": 100250 + }, + { + "epoch": 0.3034825117672877, + "grad_norm": 0.1225271001458168, + "learning_rate": 8.732940658539917e-05, + "loss": 1.5550583839416503, + "step": 100260 + }, + { + "epoch": 0.30351278131763354, + "grad_norm": 0.11255133897066116, + "learning_rate": 8.732561140376179e-05, + "loss": 1.5910354614257813, + "step": 100270 + }, + { + "epoch": 0.30354305086797934, + "grad_norm": 0.12026481330394745, + "learning_rate": 8.732181622212438e-05, + "loss": 1.5372753143310547, + "step": 100280 + }, + { + "epoch": 0.3035733204183252, + "grad_norm": 0.11872109025716782, + "learning_rate": 8.7318021040487e-05, + "loss": 1.5384786605834961, + "step": 100290 + }, + { + "epoch": 0.30360358996867104, + "grad_norm": 0.11269819736480713, + "learning_rate": 8.73142258588496e-05, + "loss": 1.5721117019653321, + "step": 100300 + }, + { + "epoch": 0.30363385951901684, + "grad_norm": 0.1251063495874405, + "learning_rate": 8.731043067721221e-05, + "loss": 1.548013973236084, + "step": 100310 + }, + { + "epoch": 0.3036641290693627, + "grad_norm": 0.12221727520227432, + "learning_rate": 8.730663549557483e-05, + "loss": 1.5898015022277832, + "step": 100320 + }, + { + "epoch": 0.3036943986197085, + "grad_norm": 0.11507797241210938, + "learning_rate": 8.730284031393742e-05, + "loss": 1.5862255096435547, + "step": 100330 + }, + { + "epoch": 0.30372466817005433, + "grad_norm": 0.10776644200086594, + "learning_rate": 8.729904513230004e-05, + "loss": 1.5878169059753418, + "step": 100340 + }, + { + "epoch": 0.3037549377204002, + "grad_norm": 0.1278967708349228, + "learning_rate": 8.729524995066264e-05, + "loss": 1.5284931182861328, + "step": 100350 + }, + { + "epoch": 0.303785207270746, + "grad_norm": 0.10212644189596176, + "learning_rate": 8.729145476902526e-05, + "loss": 1.5582393646240233, + "step": 100360 + }, + { + "epoch": 0.30381547682109183, + "grad_norm": 0.11948090046644211, + "learning_rate": 8.728765958738785e-05, + "loss": 1.546999454498291, + "step": 100370 + }, + { + "epoch": 0.30384574637143763, + "grad_norm": 0.12404468655586243, + "learning_rate": 8.728386440575047e-05, + "loss": 1.5468480110168457, + "step": 100380 + }, + { + "epoch": 0.3038760159217835, + "grad_norm": 0.1158132255077362, + "learning_rate": 8.728006922411306e-05, + "loss": 1.5238940238952636, + "step": 100390 + }, + { + "epoch": 0.30390628547212933, + "grad_norm": 0.11119039356708527, + "learning_rate": 8.727627404247568e-05, + "loss": 1.5474229812622071, + "step": 100400 + }, + { + "epoch": 0.30393655502247513, + "grad_norm": 0.11081712692975998, + "learning_rate": 8.727247886083828e-05, + "loss": 1.5435765266418457, + "step": 100410 + }, + { + "epoch": 0.303966824572821, + "grad_norm": 0.12291832268238068, + "learning_rate": 8.726868367920089e-05, + "loss": 1.5403122901916504, + "step": 100420 + }, + { + "epoch": 0.3039970941231668, + "grad_norm": 0.11801508814096451, + "learning_rate": 8.72648884975635e-05, + "loss": 1.5267057418823242, + "step": 100430 + }, + { + "epoch": 0.30402736367351263, + "grad_norm": 0.10885783284902573, + "learning_rate": 8.72610933159261e-05, + "loss": 1.544260025024414, + "step": 100440 + }, + { + "epoch": 0.3040576332238585, + "grad_norm": 0.13197745382785797, + "learning_rate": 8.725729813428871e-05, + "loss": 1.542202091217041, + "step": 100450 + }, + { + "epoch": 0.3040879027742043, + "grad_norm": 0.1104716956615448, + "learning_rate": 8.725350295265131e-05, + "loss": 1.5514362335205079, + "step": 100460 + }, + { + "epoch": 0.30411817232455013, + "grad_norm": 0.11870626360177994, + "learning_rate": 8.724970777101392e-05, + "loss": 1.5696893692016602, + "step": 100470 + }, + { + "epoch": 0.3041484418748959, + "grad_norm": 0.11690706759691238, + "learning_rate": 8.724591258937653e-05, + "loss": 1.5602706909179687, + "step": 100480 + }, + { + "epoch": 0.3041787114252418, + "grad_norm": 0.12402363866567612, + "learning_rate": 8.724211740773913e-05, + "loss": 1.5382610321044923, + "step": 100490 + }, + { + "epoch": 0.30420898097558763, + "grad_norm": 0.12461081147193909, + "learning_rate": 8.723832222610174e-05, + "loss": 1.5321722030639648, + "step": 100500 + }, + { + "epoch": 0.30420898097558763, + "eval_loss": 1.5673469305038452, + "eval_runtime": 27.9805, + "eval_samples_per_second": 17.87, + "eval_steps_per_second": 1.144, + "step": 100500 + }, + { + "epoch": 0.3042392505259334, + "grad_norm": 0.10801748931407928, + "learning_rate": 8.723452704446434e-05, + "loss": 1.5795034408569335, + "step": 100510 + }, + { + "epoch": 0.3042695200762793, + "grad_norm": 0.12437625229358673, + "learning_rate": 8.723073186282695e-05, + "loss": 1.5164935111999511, + "step": 100520 + }, + { + "epoch": 0.3042997896266251, + "grad_norm": 0.11316180229187012, + "learning_rate": 8.722693668118957e-05, + "loss": 1.5566390991210937, + "step": 100530 + }, + { + "epoch": 0.3043300591769709, + "grad_norm": 0.1052902489900589, + "learning_rate": 8.722314149955218e-05, + "loss": 1.5449755668640137, + "step": 100540 + }, + { + "epoch": 0.3043603287273168, + "grad_norm": 0.12793385982513428, + "learning_rate": 8.721934631791478e-05, + "loss": 1.5465734481811524, + "step": 100550 + }, + { + "epoch": 0.30439059827766257, + "grad_norm": 0.10635741055011749, + "learning_rate": 8.721555113627739e-05, + "loss": 1.5506385803222655, + "step": 100560 + }, + { + "epoch": 0.3044208678280084, + "grad_norm": 0.11545095592737198, + "learning_rate": 8.721175595464e-05, + "loss": 1.5964488983154297, + "step": 100570 + }, + { + "epoch": 0.3044511373783542, + "grad_norm": 0.10504081100225449, + "learning_rate": 8.72079607730026e-05, + "loss": 1.604155921936035, + "step": 100580 + }, + { + "epoch": 0.30448140692870007, + "grad_norm": 0.10036677122116089, + "learning_rate": 8.72041655913652e-05, + "loss": 1.5417967796325684, + "step": 100590 + }, + { + "epoch": 0.3045116764790459, + "grad_norm": 0.11372392624616623, + "learning_rate": 8.720037040972781e-05, + "loss": 1.5361509323120117, + "step": 100600 + }, + { + "epoch": 0.3045419460293917, + "grad_norm": 0.11913768947124481, + "learning_rate": 8.719657522809042e-05, + "loss": 1.569089126586914, + "step": 100610 + }, + { + "epoch": 0.30457221557973757, + "grad_norm": 0.1085590124130249, + "learning_rate": 8.719278004645302e-05, + "loss": 1.5509873390197755, + "step": 100620 + }, + { + "epoch": 0.30460248513008337, + "grad_norm": 0.11402425169944763, + "learning_rate": 8.718898486481563e-05, + "loss": 1.5291641235351563, + "step": 100630 + }, + { + "epoch": 0.3046327546804292, + "grad_norm": 0.1251644343137741, + "learning_rate": 8.718518968317824e-05, + "loss": 1.5658279418945313, + "step": 100640 + }, + { + "epoch": 0.30466302423077507, + "grad_norm": 0.10911048203706741, + "learning_rate": 8.718139450154084e-05, + "loss": 1.5467033386230469, + "step": 100650 + }, + { + "epoch": 0.30469329378112087, + "grad_norm": 0.11510470509529114, + "learning_rate": 8.717759931990345e-05, + "loss": 1.5490481376647949, + "step": 100660 + }, + { + "epoch": 0.3047235633314667, + "grad_norm": 0.1146899089217186, + "learning_rate": 8.717380413826607e-05, + "loss": 1.5637012481689454, + "step": 100670 + }, + { + "epoch": 0.3047538328818125, + "grad_norm": 0.10073599964380264, + "learning_rate": 8.717000895662866e-05, + "loss": 1.56611967086792, + "step": 100680 + }, + { + "epoch": 0.30478410243215837, + "grad_norm": 0.1223912388086319, + "learning_rate": 8.716621377499128e-05, + "loss": 1.5668672561645507, + "step": 100690 + }, + { + "epoch": 0.3048143719825042, + "grad_norm": 0.11488087475299835, + "learning_rate": 8.716241859335387e-05, + "loss": 1.5329533576965333, + "step": 100700 + }, + { + "epoch": 0.30484464153285, + "grad_norm": 0.10784576833248138, + "learning_rate": 8.715862341171649e-05, + "loss": 1.5663318634033203, + "step": 100710 + }, + { + "epoch": 0.30487491108319587, + "grad_norm": 0.10642234981060028, + "learning_rate": 8.71548282300791e-05, + "loss": 1.5688305854797364, + "step": 100720 + }, + { + "epoch": 0.30490518063354166, + "grad_norm": 0.1167360320687294, + "learning_rate": 8.71510330484417e-05, + "loss": 1.5561479568481444, + "step": 100730 + }, + { + "epoch": 0.3049354501838875, + "grad_norm": 0.11340968310832977, + "learning_rate": 8.714723786680431e-05, + "loss": 1.5661343574523925, + "step": 100740 + }, + { + "epoch": 0.30496571973423336, + "grad_norm": 0.10867142677307129, + "learning_rate": 8.714344268516691e-05, + "loss": 1.595226001739502, + "step": 100750 + }, + { + "epoch": 0.30499598928457916, + "grad_norm": 0.10601476579904556, + "learning_rate": 8.713964750352952e-05, + "loss": 1.5521015167236327, + "step": 100760 + }, + { + "epoch": 0.305026258834925, + "grad_norm": 0.11945517361164093, + "learning_rate": 8.713585232189213e-05, + "loss": 1.588823127746582, + "step": 100770 + }, + { + "epoch": 0.30505652838527086, + "grad_norm": 0.1126440241932869, + "learning_rate": 8.713205714025475e-05, + "loss": 1.569455909729004, + "step": 100780 + }, + { + "epoch": 0.30508679793561666, + "grad_norm": 0.12462125718593597, + "learning_rate": 8.712826195861734e-05, + "loss": 1.5825989723205567, + "step": 100790 + }, + { + "epoch": 0.3051170674859625, + "grad_norm": 0.11909661442041397, + "learning_rate": 8.712446677697996e-05, + "loss": 1.5424089431762695, + "step": 100800 + }, + { + "epoch": 0.3051473370363083, + "grad_norm": 0.11396150290966034, + "learning_rate": 8.712067159534255e-05, + "loss": 1.5452478408813477, + "step": 100810 + }, + { + "epoch": 0.30517760658665416, + "grad_norm": 0.10748883336782455, + "learning_rate": 8.711687641370517e-05, + "loss": 1.5305774688720704, + "step": 100820 + }, + { + "epoch": 0.305207876137, + "grad_norm": 0.10375325381755829, + "learning_rate": 8.711308123206776e-05, + "loss": 1.5328693389892578, + "step": 100830 + }, + { + "epoch": 0.3052381456873458, + "grad_norm": 0.11112654209136963, + "learning_rate": 8.710928605043038e-05, + "loss": 1.5716896057128906, + "step": 100840 + }, + { + "epoch": 0.30526841523769166, + "grad_norm": 0.127643421292305, + "learning_rate": 8.710549086879297e-05, + "loss": 1.5353057861328125, + "step": 100850 + }, + { + "epoch": 0.30529868478803746, + "grad_norm": 0.10588902980089188, + "learning_rate": 8.710169568715559e-05, + "loss": 1.5187026977539062, + "step": 100860 + }, + { + "epoch": 0.3053289543383833, + "grad_norm": 0.10949012637138367, + "learning_rate": 8.709790050551819e-05, + "loss": 1.556711483001709, + "step": 100870 + }, + { + "epoch": 0.30535922388872916, + "grad_norm": 0.12108560651540756, + "learning_rate": 8.70941053238808e-05, + "loss": 1.564926815032959, + "step": 100880 + }, + { + "epoch": 0.30538949343907495, + "grad_norm": 0.10725811123847961, + "learning_rate": 8.70903101422434e-05, + "loss": 1.5103529930114745, + "step": 100890 + }, + { + "epoch": 0.3054197629894208, + "grad_norm": 0.12881256639957428, + "learning_rate": 8.708651496060602e-05, + "loss": 1.5773096084594727, + "step": 100900 + }, + { + "epoch": 0.3054500325397666, + "grad_norm": 0.11054173856973648, + "learning_rate": 8.708271977896862e-05, + "loss": 1.535618782043457, + "step": 100910 + }, + { + "epoch": 0.30548030209011245, + "grad_norm": 0.11793096363544464, + "learning_rate": 8.707892459733123e-05, + "loss": 1.5897449493408202, + "step": 100920 + }, + { + "epoch": 0.3055105716404583, + "grad_norm": 0.11114887148141861, + "learning_rate": 8.707512941569385e-05, + "loss": 1.5347938537597656, + "step": 100930 + }, + { + "epoch": 0.3055408411908041, + "grad_norm": 0.10673592984676361, + "learning_rate": 8.707133423405644e-05, + "loss": 1.5536785125732422, + "step": 100940 + }, + { + "epoch": 0.30557111074114995, + "grad_norm": 0.11722925305366516, + "learning_rate": 8.706753905241906e-05, + "loss": 1.5820266723632812, + "step": 100950 + }, + { + "epoch": 0.30560138029149575, + "grad_norm": 0.12111838907003403, + "learning_rate": 8.706374387078165e-05, + "loss": 1.5179712295532226, + "step": 100960 + }, + { + "epoch": 0.3056316498418416, + "grad_norm": 0.1150786280632019, + "learning_rate": 8.705994868914427e-05, + "loss": 1.5525158882141112, + "step": 100970 + }, + { + "epoch": 0.30566191939218745, + "grad_norm": 0.10755264759063721, + "learning_rate": 8.705615350750686e-05, + "loss": 1.5586865425109864, + "step": 100980 + }, + { + "epoch": 0.30569218894253325, + "grad_norm": 0.11471973359584808, + "learning_rate": 8.705235832586948e-05, + "loss": 1.539527416229248, + "step": 100990 + }, + { + "epoch": 0.3057224584928791, + "grad_norm": 0.10158754140138626, + "learning_rate": 8.704856314423208e-05, + "loss": 1.581135368347168, + "step": 101000 + }, + { + "epoch": 0.3057224584928791, + "eval_loss": 1.5760947465896606, + "eval_runtime": 28.2416, + "eval_samples_per_second": 17.704, + "eval_steps_per_second": 1.133, + "step": 101000 + }, + { + "epoch": 0.3057527280432249, + "grad_norm": 0.11132040619850159, + "learning_rate": 8.70447679625947e-05, + "loss": 1.560932731628418, + "step": 101010 + }, + { + "epoch": 0.30578299759357075, + "grad_norm": 0.12708760797977448, + "learning_rate": 8.704097278095729e-05, + "loss": 1.5263026237487793, + "step": 101020 + }, + { + "epoch": 0.3058132671439166, + "grad_norm": 0.12288863956928253, + "learning_rate": 8.703717759931991e-05, + "loss": 1.5307439804077148, + "step": 101030 + }, + { + "epoch": 0.3058435366942624, + "grad_norm": 0.11044319719076157, + "learning_rate": 8.703338241768251e-05, + "loss": 1.538844394683838, + "step": 101040 + }, + { + "epoch": 0.30587380624460825, + "grad_norm": 0.11809375137090683, + "learning_rate": 8.702958723604512e-05, + "loss": 1.5513864517211915, + "step": 101050 + }, + { + "epoch": 0.30590407579495404, + "grad_norm": 0.10204991698265076, + "learning_rate": 8.702579205440773e-05, + "loss": 1.5709693908691407, + "step": 101060 + }, + { + "epoch": 0.3059343453452999, + "grad_norm": 0.10944082587957382, + "learning_rate": 8.702199687277033e-05, + "loss": 1.5955355644226075, + "step": 101070 + }, + { + "epoch": 0.30596461489564575, + "grad_norm": 0.1142532229423523, + "learning_rate": 8.701820169113294e-05, + "loss": 1.5657655715942382, + "step": 101080 + }, + { + "epoch": 0.30599488444599154, + "grad_norm": 0.11672250181436539, + "learning_rate": 8.701440650949554e-05, + "loss": 1.5639472961425782, + "step": 101090 + }, + { + "epoch": 0.3060251539963374, + "grad_norm": 0.12245632708072662, + "learning_rate": 8.701061132785815e-05, + "loss": 1.545200252532959, + "step": 101100 + }, + { + "epoch": 0.3060554235466832, + "grad_norm": 0.12813633680343628, + "learning_rate": 8.700681614622076e-05, + "loss": 1.5665716171264648, + "step": 101110 + }, + { + "epoch": 0.30608569309702904, + "grad_norm": 0.09493985027074814, + "learning_rate": 8.700302096458338e-05, + "loss": 1.5457740783691407, + "step": 101120 + }, + { + "epoch": 0.3061159626473749, + "grad_norm": 0.10796774923801422, + "learning_rate": 8.699922578294597e-05, + "loss": 1.5299669265747071, + "step": 101130 + }, + { + "epoch": 0.3061462321977207, + "grad_norm": 0.10374908149242401, + "learning_rate": 8.699543060130859e-05, + "loss": 1.507242202758789, + "step": 101140 + }, + { + "epoch": 0.30617650174806654, + "grad_norm": 0.11422653496265411, + "learning_rate": 8.699163541967119e-05, + "loss": 1.556401252746582, + "step": 101150 + }, + { + "epoch": 0.30620677129841234, + "grad_norm": 0.12810580432415009, + "learning_rate": 8.69878402380338e-05, + "loss": 1.5323007583618165, + "step": 101160 + }, + { + "epoch": 0.3062370408487582, + "grad_norm": 0.09810024499893188, + "learning_rate": 8.69840450563964e-05, + "loss": 1.5531967163085938, + "step": 101170 + }, + { + "epoch": 0.30626731039910404, + "grad_norm": 0.10432034730911255, + "learning_rate": 8.698024987475901e-05, + "loss": 1.5120767593383788, + "step": 101180 + }, + { + "epoch": 0.30629757994944984, + "grad_norm": 0.11832217127084732, + "learning_rate": 8.697645469312162e-05, + "loss": 1.5681004524230957, + "step": 101190 + }, + { + "epoch": 0.3063278494997957, + "grad_norm": 0.11743509769439697, + "learning_rate": 8.697265951148422e-05, + "loss": 1.5865503311157227, + "step": 101200 + }, + { + "epoch": 0.3063581190501415, + "grad_norm": 0.12333139032125473, + "learning_rate": 8.696886432984683e-05, + "loss": 1.5540332794189453, + "step": 101210 + }, + { + "epoch": 0.30638838860048734, + "grad_norm": 0.12679119408130646, + "learning_rate": 8.696506914820943e-05, + "loss": 1.5496133804321288, + "step": 101220 + }, + { + "epoch": 0.3064186581508332, + "grad_norm": 0.13026219606399536, + "learning_rate": 8.696127396657204e-05, + "loss": 1.5523103713989257, + "step": 101230 + }, + { + "epoch": 0.306448927701179, + "grad_norm": 0.11110961437225342, + "learning_rate": 8.695747878493465e-05, + "loss": 1.5345532417297363, + "step": 101240 + }, + { + "epoch": 0.30647919725152484, + "grad_norm": 0.1170019656419754, + "learning_rate": 8.695368360329725e-05, + "loss": 1.5864179611206055, + "step": 101250 + }, + { + "epoch": 0.30650946680187063, + "grad_norm": 0.1000094935297966, + "learning_rate": 8.694988842165986e-05, + "loss": 1.5769170761108398, + "step": 101260 + }, + { + "epoch": 0.3065397363522165, + "grad_norm": 0.10843905061483383, + "learning_rate": 8.694609324002246e-05, + "loss": 1.5502954483032227, + "step": 101270 + }, + { + "epoch": 0.30657000590256234, + "grad_norm": 0.11375702172517776, + "learning_rate": 8.694229805838508e-05, + "loss": 1.5861513137817382, + "step": 101280 + }, + { + "epoch": 0.30660027545290813, + "grad_norm": 0.10693491250276566, + "learning_rate": 8.693850287674768e-05, + "loss": 1.5670915603637696, + "step": 101290 + }, + { + "epoch": 0.306630545003254, + "grad_norm": 0.11820242553949356, + "learning_rate": 8.69347076951103e-05, + "loss": 1.5532342910766601, + "step": 101300 + }, + { + "epoch": 0.3066608145535998, + "grad_norm": 0.1359838992357254, + "learning_rate": 8.693091251347289e-05, + "loss": 1.5369829177856444, + "step": 101310 + }, + { + "epoch": 0.30669108410394563, + "grad_norm": 0.11191476881504059, + "learning_rate": 8.692711733183551e-05, + "loss": 1.525158405303955, + "step": 101320 + }, + { + "epoch": 0.3067213536542915, + "grad_norm": 0.11863332986831665, + "learning_rate": 8.692332215019811e-05, + "loss": 1.5577738761901856, + "step": 101330 + }, + { + "epoch": 0.3067516232046373, + "grad_norm": 0.114720918238163, + "learning_rate": 8.691952696856072e-05, + "loss": 1.5522953033447267, + "step": 101340 + }, + { + "epoch": 0.30678189275498313, + "grad_norm": 0.10972911864519119, + "learning_rate": 8.691573178692333e-05, + "loss": 1.5737058639526367, + "step": 101350 + }, + { + "epoch": 0.30681216230532893, + "grad_norm": 0.12526673078536987, + "learning_rate": 8.691193660528593e-05, + "loss": 1.5112171173095703, + "step": 101360 + }, + { + "epoch": 0.3068424318556748, + "grad_norm": 0.1185615062713623, + "learning_rate": 8.690814142364854e-05, + "loss": 1.5757102012634276, + "step": 101370 + }, + { + "epoch": 0.30687270140602063, + "grad_norm": 0.10909607261419296, + "learning_rate": 8.690434624201114e-05, + "loss": 1.5352235794067384, + "step": 101380 + }, + { + "epoch": 0.3069029709563664, + "grad_norm": 0.12168846279382706, + "learning_rate": 8.690055106037375e-05, + "loss": 1.5185806274414062, + "step": 101390 + }, + { + "epoch": 0.3069332405067123, + "grad_norm": 0.11279814690351486, + "learning_rate": 8.689675587873636e-05, + "loss": 1.5295244216918946, + "step": 101400 + }, + { + "epoch": 0.3069635100570581, + "grad_norm": 0.11436331272125244, + "learning_rate": 8.689296069709897e-05, + "loss": 1.555429458618164, + "step": 101410 + }, + { + "epoch": 0.3069937796074039, + "grad_norm": 0.1144469752907753, + "learning_rate": 8.688916551546157e-05, + "loss": 1.5407983779907226, + "step": 101420 + }, + { + "epoch": 0.3070240491577498, + "grad_norm": 0.1146693080663681, + "learning_rate": 8.688537033382419e-05, + "loss": 1.5588544845581054, + "step": 101430 + }, + { + "epoch": 0.3070543187080956, + "grad_norm": 0.11478713899850845, + "learning_rate": 8.688157515218678e-05, + "loss": 1.5506261825561523, + "step": 101440 + }, + { + "epoch": 0.3070845882584414, + "grad_norm": 0.10546843707561493, + "learning_rate": 8.68777799705494e-05, + "loss": 1.5588542938232421, + "step": 101450 + }, + { + "epoch": 0.3071148578087872, + "grad_norm": 0.11409458518028259, + "learning_rate": 8.687398478891199e-05, + "loss": 1.5466438293457032, + "step": 101460 + }, + { + "epoch": 0.3071451273591331, + "grad_norm": 0.11228950321674347, + "learning_rate": 8.687018960727461e-05, + "loss": 1.5431204795837403, + "step": 101470 + }, + { + "epoch": 0.3071753969094789, + "grad_norm": 0.11172039806842804, + "learning_rate": 8.68663944256372e-05, + "loss": 1.5696517944335937, + "step": 101480 + }, + { + "epoch": 0.3072056664598247, + "grad_norm": 0.11940664798021317, + "learning_rate": 8.686259924399982e-05, + "loss": 1.5543474197387694, + "step": 101490 + }, + { + "epoch": 0.3072359360101706, + "grad_norm": 0.1266724169254303, + "learning_rate": 8.685880406236241e-05, + "loss": 1.5582700729370118, + "step": 101500 + }, + { + "epoch": 0.3072359360101706, + "eval_loss": 1.5463429689407349, + "eval_runtime": 28.3898, + "eval_samples_per_second": 17.612, + "eval_steps_per_second": 1.127, + "step": 101500 + }, + { + "epoch": 0.3072662055605164, + "grad_norm": 0.10357379168272018, + "learning_rate": 8.685500888072503e-05, + "loss": 1.5643739700317383, + "step": 101510 + }, + { + "epoch": 0.3072964751108622, + "grad_norm": 0.12401896715164185, + "learning_rate": 8.685121369908764e-05, + "loss": 1.5421310424804688, + "step": 101520 + }, + { + "epoch": 0.3073267446612081, + "grad_norm": 0.11842209100723267, + "learning_rate": 8.684741851745025e-05, + "loss": 1.5643021583557128, + "step": 101530 + }, + { + "epoch": 0.30735701421155387, + "grad_norm": 0.10605555027723312, + "learning_rate": 8.684362333581287e-05, + "loss": 1.5336591720581054, + "step": 101540 + }, + { + "epoch": 0.3073872837618997, + "grad_norm": 0.11391457915306091, + "learning_rate": 8.683982815417546e-05, + "loss": 1.5098819732666016, + "step": 101550 + }, + { + "epoch": 0.3074175533122456, + "grad_norm": 0.10891532897949219, + "learning_rate": 8.683603297253808e-05, + "loss": 1.5680839538574218, + "step": 101560 + }, + { + "epoch": 0.30744782286259137, + "grad_norm": 0.10607525706291199, + "learning_rate": 8.683223779090067e-05, + "loss": 1.5576319694519043, + "step": 101570 + }, + { + "epoch": 0.3074780924129372, + "grad_norm": 0.11297324299812317, + "learning_rate": 8.682844260926329e-05, + "loss": 1.568591022491455, + "step": 101580 + }, + { + "epoch": 0.307508361963283, + "grad_norm": 0.10777346789836884, + "learning_rate": 8.682464742762588e-05, + "loss": 1.561873435974121, + "step": 101590 + }, + { + "epoch": 0.30753863151362887, + "grad_norm": 0.11764664947986603, + "learning_rate": 8.68208522459885e-05, + "loss": 1.5604225158691407, + "step": 101600 + }, + { + "epoch": 0.3075689010639747, + "grad_norm": 0.11277096718549728, + "learning_rate": 8.68170570643511e-05, + "loss": 1.57474365234375, + "step": 101610 + }, + { + "epoch": 0.3075991706143205, + "grad_norm": 0.10208506882190704, + "learning_rate": 8.681326188271371e-05, + "loss": 1.5724126815795898, + "step": 101620 + }, + { + "epoch": 0.30762944016466637, + "grad_norm": 0.11551698297262192, + "learning_rate": 8.68094667010763e-05, + "loss": 1.5684064865112304, + "step": 101630 + }, + { + "epoch": 0.30765970971501216, + "grad_norm": 0.11651203036308289, + "learning_rate": 8.680567151943893e-05, + "loss": 1.5273134231567382, + "step": 101640 + }, + { + "epoch": 0.307689979265358, + "grad_norm": 0.10829497873783112, + "learning_rate": 8.680187633780153e-05, + "loss": 1.574690055847168, + "step": 101650 + }, + { + "epoch": 0.30772024881570387, + "grad_norm": 0.12054659426212311, + "learning_rate": 8.679808115616414e-05, + "loss": 1.5632280349731444, + "step": 101660 + }, + { + "epoch": 0.30775051836604966, + "grad_norm": 0.10650847107172012, + "learning_rate": 8.679428597452674e-05, + "loss": 1.56689510345459, + "step": 101670 + }, + { + "epoch": 0.3077807879163955, + "grad_norm": 0.11107669770717621, + "learning_rate": 8.679049079288935e-05, + "loss": 1.5681476593017578, + "step": 101680 + }, + { + "epoch": 0.3078110574667413, + "grad_norm": 0.12182003259658813, + "learning_rate": 8.678669561125195e-05, + "loss": 1.5433252334594727, + "step": 101690 + }, + { + "epoch": 0.30784132701708716, + "grad_norm": 0.11052674800157547, + "learning_rate": 8.678290042961456e-05, + "loss": 1.585774803161621, + "step": 101700 + }, + { + "epoch": 0.307871596567433, + "grad_norm": 0.10952313244342804, + "learning_rate": 8.677910524797717e-05, + "loss": 1.559004783630371, + "step": 101710 + }, + { + "epoch": 0.3079018661177788, + "grad_norm": 0.10716374218463898, + "learning_rate": 8.677531006633977e-05, + "loss": 1.580965805053711, + "step": 101720 + }, + { + "epoch": 0.30793213566812466, + "grad_norm": 0.10151025652885437, + "learning_rate": 8.677151488470239e-05, + "loss": 1.5946038246154786, + "step": 101730 + }, + { + "epoch": 0.30796240521847046, + "grad_norm": 0.10215432941913605, + "learning_rate": 8.676771970306498e-05, + "loss": 1.5260429382324219, + "step": 101740 + }, + { + "epoch": 0.3079926747688163, + "grad_norm": 0.1405419558286667, + "learning_rate": 8.67639245214276e-05, + "loss": 1.534120464324951, + "step": 101750 + }, + { + "epoch": 0.30802294431916216, + "grad_norm": 0.10784609615802765, + "learning_rate": 8.67601293397902e-05, + "loss": 1.5629167556762695, + "step": 101760 + }, + { + "epoch": 0.30805321386950796, + "grad_norm": 0.11155661940574646, + "learning_rate": 8.675633415815282e-05, + "loss": 1.5610166549682618, + "step": 101770 + }, + { + "epoch": 0.3080834834198538, + "grad_norm": 0.11979544162750244, + "learning_rate": 8.675253897651542e-05, + "loss": 1.5293498992919923, + "step": 101780 + }, + { + "epoch": 0.3081137529701996, + "grad_norm": 0.11552350223064423, + "learning_rate": 8.674874379487803e-05, + "loss": 1.578652572631836, + "step": 101790 + }, + { + "epoch": 0.30814402252054546, + "grad_norm": 0.10377507656812668, + "learning_rate": 8.674494861324063e-05, + "loss": 1.587796401977539, + "step": 101800 + }, + { + "epoch": 0.3081742920708913, + "grad_norm": 0.11185547709465027, + "learning_rate": 8.674115343160324e-05, + "loss": 1.5659235954284667, + "step": 101810 + }, + { + "epoch": 0.3082045616212371, + "grad_norm": 0.10576851665973663, + "learning_rate": 8.673735824996585e-05, + "loss": 1.5710800170898438, + "step": 101820 + }, + { + "epoch": 0.30823483117158296, + "grad_norm": 0.11589597165584564, + "learning_rate": 8.673356306832845e-05, + "loss": 1.5298553466796876, + "step": 101830 + }, + { + "epoch": 0.30826510072192875, + "grad_norm": 0.1405874490737915, + "learning_rate": 8.672976788669106e-05, + "loss": 1.5687703132629394, + "step": 101840 + }, + { + "epoch": 0.3082953702722746, + "grad_norm": 0.11091183871030807, + "learning_rate": 8.672597270505366e-05, + "loss": 1.5726919174194336, + "step": 101850 + }, + { + "epoch": 0.30832563982262046, + "grad_norm": 0.10824929177761078, + "learning_rate": 8.672217752341627e-05, + "loss": 1.5697029113769532, + "step": 101860 + }, + { + "epoch": 0.30835590937296625, + "grad_norm": 0.10965477675199509, + "learning_rate": 8.671838234177888e-05, + "loss": 1.5681749343872071, + "step": 101870 + }, + { + "epoch": 0.3083861789233121, + "grad_norm": 0.10332339257001877, + "learning_rate": 8.671458716014148e-05, + "loss": 1.5935514450073243, + "step": 101880 + }, + { + "epoch": 0.3084164484736579, + "grad_norm": 0.11114182323217392, + "learning_rate": 8.67107919785041e-05, + "loss": 1.5844461441040039, + "step": 101890 + }, + { + "epoch": 0.30844671802400375, + "grad_norm": 0.1006772518157959, + "learning_rate": 8.67069967968667e-05, + "loss": 1.5451189994812011, + "step": 101900 + }, + { + "epoch": 0.3084769875743496, + "grad_norm": 0.12286848574876785, + "learning_rate": 8.670320161522931e-05, + "loss": 1.5712154388427735, + "step": 101910 + }, + { + "epoch": 0.3085072571246954, + "grad_norm": 0.11529436707496643, + "learning_rate": 8.66994064335919e-05, + "loss": 1.614396858215332, + "step": 101920 + }, + { + "epoch": 0.30853752667504125, + "grad_norm": 0.1142096221446991, + "learning_rate": 8.669561125195452e-05, + "loss": 1.556693935394287, + "step": 101930 + }, + { + "epoch": 0.30856779622538705, + "grad_norm": 0.11075925827026367, + "learning_rate": 8.669181607031713e-05, + "loss": 1.5829327583312989, + "step": 101940 + }, + { + "epoch": 0.3085980657757329, + "grad_norm": 0.10696698725223541, + "learning_rate": 8.668802088867974e-05, + "loss": 1.5450395584106444, + "step": 101950 + }, + { + "epoch": 0.30862833532607875, + "grad_norm": 0.11634724587202072, + "learning_rate": 8.668422570704234e-05, + "loss": 1.5929168701171874, + "step": 101960 + }, + { + "epoch": 0.30865860487642455, + "grad_norm": 0.10878940671682358, + "learning_rate": 8.668043052540495e-05, + "loss": 1.5479507446289062, + "step": 101970 + }, + { + "epoch": 0.3086888744267704, + "grad_norm": 0.11345970630645752, + "learning_rate": 8.667663534376755e-05, + "loss": 1.5453797340393067, + "step": 101980 + }, + { + "epoch": 0.3087191439771162, + "grad_norm": 0.11110608279705048, + "learning_rate": 8.667284016213016e-05, + "loss": 1.5205718994140625, + "step": 101990 + }, + { + "epoch": 0.30874941352746205, + "grad_norm": 0.12789322435855865, + "learning_rate": 8.666904498049277e-05, + "loss": 1.5344337463378905, + "step": 102000 + }, + { + "epoch": 0.30874941352746205, + "eval_loss": 1.5631630420684814, + "eval_runtime": 27.8893, + "eval_samples_per_second": 17.928, + "eval_steps_per_second": 1.147, + "step": 102000 + }, + { + "epoch": 0.3087796830778079, + "grad_norm": 0.10896281152963638, + "learning_rate": 8.666524979885537e-05, + "loss": 1.539496898651123, + "step": 102010 + }, + { + "epoch": 0.3088099526281537, + "grad_norm": 0.11475063115358353, + "learning_rate": 8.666145461721799e-05, + "loss": 1.5380027770996094, + "step": 102020 + }, + { + "epoch": 0.30884022217849955, + "grad_norm": 0.12449299544095993, + "learning_rate": 8.665765943558058e-05, + "loss": 1.5422072410583496, + "step": 102030 + }, + { + "epoch": 0.30887049172884534, + "grad_norm": 0.12068763375282288, + "learning_rate": 8.66538642539432e-05, + "loss": 1.569784450531006, + "step": 102040 + }, + { + "epoch": 0.3089007612791912, + "grad_norm": 0.11141689121723175, + "learning_rate": 8.66500690723058e-05, + "loss": 1.5787646293640136, + "step": 102050 + }, + { + "epoch": 0.30893103082953705, + "grad_norm": 0.09892053157091141, + "learning_rate": 8.664627389066842e-05, + "loss": 1.5326047897338868, + "step": 102060 + }, + { + "epoch": 0.30896130037988284, + "grad_norm": 0.10812222212553024, + "learning_rate": 8.664247870903101e-05, + "loss": 1.5931020736694337, + "step": 102070 + }, + { + "epoch": 0.3089915699302287, + "grad_norm": 0.10966718941926956, + "learning_rate": 8.663868352739363e-05, + "loss": 1.5725030899047852, + "step": 102080 + }, + { + "epoch": 0.3090218394805745, + "grad_norm": 0.1094961166381836, + "learning_rate": 8.663488834575622e-05, + "loss": 1.6280715942382813, + "step": 102090 + }, + { + "epoch": 0.30905210903092034, + "grad_norm": 0.10087104141712189, + "learning_rate": 8.663109316411884e-05, + "loss": 1.5559846878051757, + "step": 102100 + }, + { + "epoch": 0.3090823785812662, + "grad_norm": 0.12714065611362457, + "learning_rate": 8.662729798248143e-05, + "loss": 1.5308579444885253, + "step": 102110 + }, + { + "epoch": 0.309112648131612, + "grad_norm": 0.12532448768615723, + "learning_rate": 8.662350280084405e-05, + "loss": 1.5343350410461425, + "step": 102120 + }, + { + "epoch": 0.30914291768195784, + "grad_norm": 0.12732158601284027, + "learning_rate": 8.661970761920664e-05, + "loss": 1.5438995361328125, + "step": 102130 + }, + { + "epoch": 0.30917318723230364, + "grad_norm": 0.12642565369606018, + "learning_rate": 8.661591243756926e-05, + "loss": 1.5464988708496095, + "step": 102140 + }, + { + "epoch": 0.3092034567826495, + "grad_norm": 0.11787578463554382, + "learning_rate": 8.661211725593188e-05, + "loss": 1.5633213996887207, + "step": 102150 + }, + { + "epoch": 0.30923372633299534, + "grad_norm": 0.11359893530607224, + "learning_rate": 8.660832207429448e-05, + "loss": 1.5140768051147462, + "step": 102160 + }, + { + "epoch": 0.30926399588334114, + "grad_norm": 0.12922745943069458, + "learning_rate": 8.66045268926571e-05, + "loss": 1.57708797454834, + "step": 102170 + }, + { + "epoch": 0.309294265433687, + "grad_norm": 0.11356761306524277, + "learning_rate": 8.660073171101969e-05, + "loss": 1.56823787689209, + "step": 102180 + }, + { + "epoch": 0.30932453498403284, + "grad_norm": 0.11369595676660538, + "learning_rate": 8.65969365293823e-05, + "loss": 1.5500978469848632, + "step": 102190 + }, + { + "epoch": 0.30935480453437864, + "grad_norm": 0.11062028259038925, + "learning_rate": 8.65931413477449e-05, + "loss": 1.5093753814697266, + "step": 102200 + }, + { + "epoch": 0.3093850740847245, + "grad_norm": 0.10530821979045868, + "learning_rate": 8.658934616610752e-05, + "loss": 1.552131748199463, + "step": 102210 + }, + { + "epoch": 0.3094153436350703, + "grad_norm": 0.1252661496400833, + "learning_rate": 8.658555098447011e-05, + "loss": 1.5398958206176758, + "step": 102220 + }, + { + "epoch": 0.30944561318541614, + "grad_norm": 0.1260208934545517, + "learning_rate": 8.658175580283273e-05, + "loss": 1.5271512985229492, + "step": 102230 + }, + { + "epoch": 0.309475882735762, + "grad_norm": 0.10104943066835403, + "learning_rate": 8.657796062119532e-05, + "loss": 1.543980598449707, + "step": 102240 + }, + { + "epoch": 0.3095061522861078, + "grad_norm": 0.11324812471866608, + "learning_rate": 8.657416543955794e-05, + "loss": 1.5673126220703124, + "step": 102250 + }, + { + "epoch": 0.30953642183645363, + "grad_norm": 0.10102462023496628, + "learning_rate": 8.657037025792055e-05, + "loss": 1.5396541595458983, + "step": 102260 + }, + { + "epoch": 0.30956669138679943, + "grad_norm": 0.10595376044511795, + "learning_rate": 8.656657507628315e-05, + "loss": 1.5574822425842285, + "step": 102270 + }, + { + "epoch": 0.3095969609371453, + "grad_norm": 0.10573748499155045, + "learning_rate": 8.656277989464576e-05, + "loss": 1.5454328536987305, + "step": 102280 + }, + { + "epoch": 0.30962723048749113, + "grad_norm": 0.10646183043718338, + "learning_rate": 8.655898471300837e-05, + "loss": 1.5434820175170898, + "step": 102290 + }, + { + "epoch": 0.30965750003783693, + "grad_norm": 0.11585722118616104, + "learning_rate": 8.655518953137097e-05, + "loss": 1.514811134338379, + "step": 102300 + }, + { + "epoch": 0.3096877695881828, + "grad_norm": 0.11297880113124847, + "learning_rate": 8.655139434973358e-05, + "loss": 1.600626564025879, + "step": 102310 + }, + { + "epoch": 0.3097180391385286, + "grad_norm": 0.12127892673015594, + "learning_rate": 8.654759916809618e-05, + "loss": 1.5593013763427734, + "step": 102320 + }, + { + "epoch": 0.30974830868887443, + "grad_norm": 0.12689733505249023, + "learning_rate": 8.654380398645879e-05, + "loss": 1.5525409698486328, + "step": 102330 + }, + { + "epoch": 0.3097785782392203, + "grad_norm": 0.1027848869562149, + "learning_rate": 8.654000880482141e-05, + "loss": 1.5717056274414063, + "step": 102340 + }, + { + "epoch": 0.3098088477895661, + "grad_norm": 0.11470593512058258, + "learning_rate": 8.6536213623184e-05, + "loss": 1.5743954658508301, + "step": 102350 + }, + { + "epoch": 0.30983911733991193, + "grad_norm": 0.11091934889554977, + "learning_rate": 8.653241844154662e-05, + "loss": 1.6040786743164062, + "step": 102360 + }, + { + "epoch": 0.3098693868902577, + "grad_norm": 0.11706499010324478, + "learning_rate": 8.652862325990921e-05, + "loss": 1.5711641311645508, + "step": 102370 + }, + { + "epoch": 0.3098996564406036, + "grad_norm": 0.10517911612987518, + "learning_rate": 8.652482807827183e-05, + "loss": 1.5486051559448242, + "step": 102380 + }, + { + "epoch": 0.30992992599094943, + "grad_norm": 0.1314830482006073, + "learning_rate": 8.652103289663444e-05, + "loss": 1.5611863136291504, + "step": 102390 + }, + { + "epoch": 0.3099601955412952, + "grad_norm": 0.10670823603868484, + "learning_rate": 8.651723771499705e-05, + "loss": 1.5432424545288086, + "step": 102400 + }, + { + "epoch": 0.3099904650916411, + "grad_norm": 0.11004426330327988, + "learning_rate": 8.651344253335965e-05, + "loss": 1.5256317138671875, + "step": 102410 + }, + { + "epoch": 0.3100207346419869, + "grad_norm": 0.11199299991130829, + "learning_rate": 8.650964735172226e-05, + "loss": 1.5584517478942872, + "step": 102420 + }, + { + "epoch": 0.3100510041923327, + "grad_norm": 0.10548936575651169, + "learning_rate": 8.650585217008486e-05, + "loss": 1.5768177032470703, + "step": 102430 + }, + { + "epoch": 0.3100812737426786, + "grad_norm": 0.1104116439819336, + "learning_rate": 8.650205698844747e-05, + "loss": 1.5787303924560547, + "step": 102440 + }, + { + "epoch": 0.31011154329302437, + "grad_norm": 0.10428532212972641, + "learning_rate": 8.649826180681007e-05, + "loss": 1.55261173248291, + "step": 102450 + }, + { + "epoch": 0.3101418128433702, + "grad_norm": 0.11183751374483109, + "learning_rate": 8.649446662517268e-05, + "loss": 1.5585801124572753, + "step": 102460 + }, + { + "epoch": 0.310172082393716, + "grad_norm": 0.11440088599920273, + "learning_rate": 8.649067144353529e-05, + "loss": 1.558865451812744, + "step": 102470 + }, + { + "epoch": 0.31020235194406187, + "grad_norm": 0.10943913459777832, + "learning_rate": 8.648687626189789e-05, + "loss": 1.540189552307129, + "step": 102480 + }, + { + "epoch": 0.3102326214944077, + "grad_norm": 0.10656365752220154, + "learning_rate": 8.64830810802605e-05, + "loss": 1.534839153289795, + "step": 102490 + }, + { + "epoch": 0.3102628910447535, + "grad_norm": 0.10703905671834946, + "learning_rate": 8.64792858986231e-05, + "loss": 1.5653463363647462, + "step": 102500 + }, + { + "epoch": 0.3102628910447535, + "eval_loss": 1.5430970191955566, + "eval_runtime": 27.6889, + "eval_samples_per_second": 18.058, + "eval_steps_per_second": 1.156, + "step": 102500 + }, + { + "epoch": 0.31029316059509937, + "grad_norm": 0.11954774707555771, + "learning_rate": 8.647549071698571e-05, + "loss": 1.504832649230957, + "step": 102510 + }, + { + "epoch": 0.31032343014544517, + "grad_norm": 0.11968119442462921, + "learning_rate": 8.647169553534833e-05, + "loss": 1.5674997329711915, + "step": 102520 + }, + { + "epoch": 0.310353699695791, + "grad_norm": 0.11056827753782272, + "learning_rate": 8.646790035371092e-05, + "loss": 1.572108268737793, + "step": 102530 + }, + { + "epoch": 0.31038396924613687, + "grad_norm": 0.1049385666847229, + "learning_rate": 8.646410517207354e-05, + "loss": 1.545517635345459, + "step": 102540 + }, + { + "epoch": 0.31041423879648267, + "grad_norm": 0.1126776859164238, + "learning_rate": 8.646030999043615e-05, + "loss": 1.5291772842407227, + "step": 102550 + }, + { + "epoch": 0.3104445083468285, + "grad_norm": 0.10916128754615784, + "learning_rate": 8.645651480879875e-05, + "loss": 1.545830535888672, + "step": 102560 + }, + { + "epoch": 0.3104747778971743, + "grad_norm": 0.12138869613409042, + "learning_rate": 8.645271962716136e-05, + "loss": 1.5525307655334473, + "step": 102570 + }, + { + "epoch": 0.31050504744752017, + "grad_norm": 0.1302706003189087, + "learning_rate": 8.644892444552397e-05, + "loss": 1.5519710540771485, + "step": 102580 + }, + { + "epoch": 0.310535316997866, + "grad_norm": 0.11562852561473846, + "learning_rate": 8.644512926388657e-05, + "loss": 1.58801212310791, + "step": 102590 + }, + { + "epoch": 0.3105655865482118, + "grad_norm": 0.1262706220149994, + "learning_rate": 8.644133408224918e-05, + "loss": 1.565528106689453, + "step": 102600 + }, + { + "epoch": 0.31059585609855767, + "grad_norm": 0.11625572293996811, + "learning_rate": 8.643753890061178e-05, + "loss": 1.5166419982910155, + "step": 102610 + }, + { + "epoch": 0.31062612564890346, + "grad_norm": 0.11386460810899734, + "learning_rate": 8.643374371897439e-05, + "loss": 1.5338336944580078, + "step": 102620 + }, + { + "epoch": 0.3106563951992493, + "grad_norm": 0.10604279488325119, + "learning_rate": 8.642994853733701e-05, + "loss": 1.5563201904296875, + "step": 102630 + }, + { + "epoch": 0.31068666474959516, + "grad_norm": 0.11104555428028107, + "learning_rate": 8.64261533556996e-05, + "loss": 1.5638286590576171, + "step": 102640 + }, + { + "epoch": 0.31071693429994096, + "grad_norm": 0.10580076277256012, + "learning_rate": 8.642235817406222e-05, + "loss": 1.5708091735839844, + "step": 102650 + }, + { + "epoch": 0.3107472038502868, + "grad_norm": 0.1374293863773346, + "learning_rate": 8.641856299242481e-05, + "loss": 1.5562493324279785, + "step": 102660 + }, + { + "epoch": 0.3107774734006326, + "grad_norm": 0.11475688964128494, + "learning_rate": 8.641476781078743e-05, + "loss": 1.565526580810547, + "step": 102670 + }, + { + "epoch": 0.31080774295097846, + "grad_norm": 0.1064835786819458, + "learning_rate": 8.641097262915003e-05, + "loss": 1.5723457336425781, + "step": 102680 + }, + { + "epoch": 0.3108380125013243, + "grad_norm": 0.11427697539329529, + "learning_rate": 8.640717744751264e-05, + "loss": 1.5941551208496094, + "step": 102690 + }, + { + "epoch": 0.3108682820516701, + "grad_norm": 0.11358966678380966, + "learning_rate": 8.640338226587524e-05, + "loss": 1.549077606201172, + "step": 102700 + }, + { + "epoch": 0.31089855160201596, + "grad_norm": 0.11624912172555923, + "learning_rate": 8.639958708423786e-05, + "loss": 1.5952399253845215, + "step": 102710 + }, + { + "epoch": 0.31092882115236176, + "grad_norm": 0.11883185058832169, + "learning_rate": 8.639579190260045e-05, + "loss": 1.566103744506836, + "step": 102720 + }, + { + "epoch": 0.3109590907027076, + "grad_norm": 0.1112373098731041, + "learning_rate": 8.639199672096307e-05, + "loss": 1.5562768936157227, + "step": 102730 + }, + { + "epoch": 0.31098936025305346, + "grad_norm": 0.12240517884492874, + "learning_rate": 8.638820153932566e-05, + "loss": 1.578721809387207, + "step": 102740 + }, + { + "epoch": 0.31101962980339926, + "grad_norm": 0.12237244099378586, + "learning_rate": 8.638440635768828e-05, + "loss": 1.5703158378601074, + "step": 102750 + }, + { + "epoch": 0.3110498993537451, + "grad_norm": 0.12161123007535934, + "learning_rate": 8.63806111760509e-05, + "loss": 1.5600202560424805, + "step": 102760 + }, + { + "epoch": 0.3110801689040909, + "grad_norm": 0.1163460910320282, + "learning_rate": 8.637681599441349e-05, + "loss": 1.5385542869567872, + "step": 102770 + }, + { + "epoch": 0.31111043845443676, + "grad_norm": 0.11696846038103104, + "learning_rate": 8.637302081277611e-05, + "loss": 1.5626741409301759, + "step": 102780 + }, + { + "epoch": 0.3111407080047826, + "grad_norm": 0.10574682056903839, + "learning_rate": 8.63692256311387e-05, + "loss": 1.51373291015625, + "step": 102790 + }, + { + "epoch": 0.3111709775551284, + "grad_norm": 0.1233050748705864, + "learning_rate": 8.636543044950132e-05, + "loss": 1.5719274520874023, + "step": 102800 + }, + { + "epoch": 0.31120124710547425, + "grad_norm": 0.10217763483524323, + "learning_rate": 8.636163526786392e-05, + "loss": 1.5831342697143556, + "step": 102810 + }, + { + "epoch": 0.31123151665582005, + "grad_norm": 0.11952845752239227, + "learning_rate": 8.635784008622654e-05, + "loss": 1.531589889526367, + "step": 102820 + }, + { + "epoch": 0.3112617862061659, + "grad_norm": 0.10959848016500473, + "learning_rate": 8.635404490458913e-05, + "loss": 1.5189887046813966, + "step": 102830 + }, + { + "epoch": 0.31129205575651175, + "grad_norm": 0.1312137246131897, + "learning_rate": 8.635024972295175e-05, + "loss": 1.5490184783935548, + "step": 102840 + }, + { + "epoch": 0.31132232530685755, + "grad_norm": 0.1100323423743248, + "learning_rate": 8.634645454131434e-05, + "loss": 1.5305047988891602, + "step": 102850 + }, + { + "epoch": 0.3113525948572034, + "grad_norm": 0.12465833872556686, + "learning_rate": 8.634265935967696e-05, + "loss": 1.542707633972168, + "step": 102860 + }, + { + "epoch": 0.31138286440754925, + "grad_norm": 0.10562034696340561, + "learning_rate": 8.633886417803955e-05, + "loss": 1.5526588439941407, + "step": 102870 + }, + { + "epoch": 0.31141313395789505, + "grad_norm": 0.13014167547225952, + "learning_rate": 8.633506899640217e-05, + "loss": 1.561936378479004, + "step": 102880 + }, + { + "epoch": 0.3114434035082409, + "grad_norm": 0.10830078274011612, + "learning_rate": 8.633127381476478e-05, + "loss": 1.5794256210327149, + "step": 102890 + }, + { + "epoch": 0.3114736730585867, + "grad_norm": 0.12522099912166595, + "learning_rate": 8.632747863312738e-05, + "loss": 1.5580831527709962, + "step": 102900 + }, + { + "epoch": 0.31150394260893255, + "grad_norm": 0.10310259461402893, + "learning_rate": 8.632368345148999e-05, + "loss": 1.564407730102539, + "step": 102910 + }, + { + "epoch": 0.3115342121592784, + "grad_norm": 0.12089240550994873, + "learning_rate": 8.63198882698526e-05, + "loss": 1.568861198425293, + "step": 102920 + }, + { + "epoch": 0.3115644817096242, + "grad_norm": 0.12180888652801514, + "learning_rate": 8.63160930882152e-05, + "loss": 1.5804676055908202, + "step": 102930 + }, + { + "epoch": 0.31159475125997005, + "grad_norm": 0.11850482225418091, + "learning_rate": 8.631229790657781e-05, + "loss": 1.5451554298400878, + "step": 102940 + }, + { + "epoch": 0.31162502081031584, + "grad_norm": 0.10645578801631927, + "learning_rate": 8.630850272494043e-05, + "loss": 1.5539440155029296, + "step": 102950 + }, + { + "epoch": 0.3116552903606617, + "grad_norm": 0.09999266266822815, + "learning_rate": 8.630470754330302e-05, + "loss": 1.558951473236084, + "step": 102960 + }, + { + "epoch": 0.31168555991100755, + "grad_norm": 0.11187362670898438, + "learning_rate": 8.630091236166564e-05, + "loss": 1.563866424560547, + "step": 102970 + }, + { + "epoch": 0.31171582946135334, + "grad_norm": 0.12882842123508453, + "learning_rate": 8.629711718002823e-05, + "loss": 1.5288073539733886, + "step": 102980 + }, + { + "epoch": 0.3117460990116992, + "grad_norm": 0.12009299546480179, + "learning_rate": 8.629332199839085e-05, + "loss": 1.568249225616455, + "step": 102990 + }, + { + "epoch": 0.311776368562045, + "grad_norm": 0.10273818671703339, + "learning_rate": 8.628952681675346e-05, + "loss": 1.561086082458496, + "step": 103000 + }, + { + "epoch": 0.311776368562045, + "eval_loss": 1.5392661094665527, + "eval_runtime": 27.7378, + "eval_samples_per_second": 18.026, + "eval_steps_per_second": 1.154, + "step": 103000 + }, + { + "epoch": 0.31180663811239084, + "grad_norm": 0.10633689165115356, + "learning_rate": 8.628573163511606e-05, + "loss": 1.5618767738342285, + "step": 103010 + }, + { + "epoch": 0.3118369076627367, + "grad_norm": 0.1163407638669014, + "learning_rate": 8.628193645347867e-05, + "loss": 1.5548510551452637, + "step": 103020 + }, + { + "epoch": 0.3118671772130825, + "grad_norm": 0.11333541572093964, + "learning_rate": 8.627814127184127e-05, + "loss": 1.5978160858154298, + "step": 103030 + }, + { + "epoch": 0.31189744676342834, + "grad_norm": 0.11491749435663223, + "learning_rate": 8.627434609020388e-05, + "loss": 1.5613523483276368, + "step": 103040 + }, + { + "epoch": 0.31192771631377414, + "grad_norm": 0.11130768805742264, + "learning_rate": 8.627055090856649e-05, + "loss": 1.5063068389892578, + "step": 103050 + }, + { + "epoch": 0.31195798586412, + "grad_norm": 0.13182388246059418, + "learning_rate": 8.626675572692909e-05, + "loss": 1.5431984901428222, + "step": 103060 + }, + { + "epoch": 0.31198825541446584, + "grad_norm": 0.11051206290721893, + "learning_rate": 8.62629605452917e-05, + "loss": 1.5226382255554198, + "step": 103070 + }, + { + "epoch": 0.31201852496481164, + "grad_norm": 0.11608607321977615, + "learning_rate": 8.62591653636543e-05, + "loss": 1.572578239440918, + "step": 103080 + }, + { + "epoch": 0.3120487945151575, + "grad_norm": 0.11832758784294128, + "learning_rate": 8.625537018201691e-05, + "loss": 1.5428102493286133, + "step": 103090 + }, + { + "epoch": 0.3120790640655033, + "grad_norm": 0.11377986520528793, + "learning_rate": 8.625157500037952e-05, + "loss": 1.5567480087280274, + "step": 103100 + }, + { + "epoch": 0.31210933361584914, + "grad_norm": 0.10802648216485977, + "learning_rate": 8.624777981874212e-05, + "loss": 1.5112634658813477, + "step": 103110 + }, + { + "epoch": 0.312139603166195, + "grad_norm": 0.12152758985757828, + "learning_rate": 8.624398463710473e-05, + "loss": 1.5886682510375976, + "step": 103120 + }, + { + "epoch": 0.3121698727165408, + "grad_norm": 0.1301955133676529, + "learning_rate": 8.624018945546735e-05, + "loss": 1.5408559799194337, + "step": 103130 + }, + { + "epoch": 0.31220014226688664, + "grad_norm": 0.11649615317583084, + "learning_rate": 8.623639427382994e-05, + "loss": 1.5291643142700195, + "step": 103140 + }, + { + "epoch": 0.31223041181723243, + "grad_norm": 0.11823678016662598, + "learning_rate": 8.623259909219256e-05, + "loss": 1.5372959136962892, + "step": 103150 + }, + { + "epoch": 0.3122606813675783, + "grad_norm": 0.1210397407412529, + "learning_rate": 8.622880391055517e-05, + "loss": 1.5403714179992676, + "step": 103160 + }, + { + "epoch": 0.31229095091792414, + "grad_norm": 0.11281073838472366, + "learning_rate": 8.622500872891777e-05, + "loss": 1.582528781890869, + "step": 103170 + }, + { + "epoch": 0.31232122046826993, + "grad_norm": 0.11030139774084091, + "learning_rate": 8.622121354728038e-05, + "loss": 1.5083629608154296, + "step": 103180 + }, + { + "epoch": 0.3123514900186158, + "grad_norm": 0.1137920394539833, + "learning_rate": 8.621741836564298e-05, + "loss": 1.5327518463134766, + "step": 103190 + }, + { + "epoch": 0.3123817595689616, + "grad_norm": 0.11617007106542587, + "learning_rate": 8.621362318400559e-05, + "loss": 1.5392168045043946, + "step": 103200 + }, + { + "epoch": 0.31241202911930743, + "grad_norm": 0.1212673932313919, + "learning_rate": 8.62098280023682e-05, + "loss": 1.5459178924560546, + "step": 103210 + }, + { + "epoch": 0.3124422986696533, + "grad_norm": 0.10954564809799194, + "learning_rate": 8.62060328207308e-05, + "loss": 1.5940500259399415, + "step": 103220 + }, + { + "epoch": 0.3124725682199991, + "grad_norm": 0.10998205095529556, + "learning_rate": 8.62022376390934e-05, + "loss": 1.573108959197998, + "step": 103230 + }, + { + "epoch": 0.31250283777034493, + "grad_norm": 0.10489942133426666, + "learning_rate": 8.619844245745603e-05, + "loss": 1.5435641288757325, + "step": 103240 + }, + { + "epoch": 0.31253310732069073, + "grad_norm": 0.1297852247953415, + "learning_rate": 8.619464727581862e-05, + "loss": 1.5649679183959961, + "step": 103250 + }, + { + "epoch": 0.3125633768710366, + "grad_norm": 0.10394030064344406, + "learning_rate": 8.619085209418124e-05, + "loss": 1.5580565452575683, + "step": 103260 + }, + { + "epoch": 0.31259364642138243, + "grad_norm": 0.11907029896974564, + "learning_rate": 8.618705691254383e-05, + "loss": 1.5842944145202638, + "step": 103270 + }, + { + "epoch": 0.31262391597172823, + "grad_norm": 0.15582439303398132, + "learning_rate": 8.618326173090645e-05, + "loss": 1.558943748474121, + "step": 103280 + }, + { + "epoch": 0.3126541855220741, + "grad_norm": 0.12581832706928253, + "learning_rate": 8.617946654926904e-05, + "loss": 1.5539134979248046, + "step": 103290 + }, + { + "epoch": 0.3126844550724199, + "grad_norm": 0.10917089879512787, + "learning_rate": 8.617567136763166e-05, + "loss": 1.5815567016601562, + "step": 103300 + }, + { + "epoch": 0.3127147246227657, + "grad_norm": 0.10601239651441574, + "learning_rate": 8.617187618599425e-05, + "loss": 1.5611442565917968, + "step": 103310 + }, + { + "epoch": 0.3127449941731116, + "grad_norm": 0.1147860735654831, + "learning_rate": 8.616808100435687e-05, + "loss": 1.546725368499756, + "step": 103320 + }, + { + "epoch": 0.3127752637234574, + "grad_norm": 0.13626377284526825, + "learning_rate": 8.616428582271947e-05, + "loss": 1.5472776412963867, + "step": 103330 + }, + { + "epoch": 0.3128055332738032, + "grad_norm": 0.11532918363809586, + "learning_rate": 8.616049064108209e-05, + "loss": 1.5612085342407227, + "step": 103340 + }, + { + "epoch": 0.312835802824149, + "grad_norm": 0.12196211516857147, + "learning_rate": 8.615669545944468e-05, + "loss": 1.5659565925598145, + "step": 103350 + }, + { + "epoch": 0.3128660723744949, + "grad_norm": 0.12131441384553909, + "learning_rate": 8.61529002778073e-05, + "loss": 1.5540180206298828, + "step": 103360 + }, + { + "epoch": 0.3128963419248407, + "grad_norm": 0.11175347864627838, + "learning_rate": 8.614910509616992e-05, + "loss": 1.5646651268005372, + "step": 103370 + }, + { + "epoch": 0.3129266114751865, + "grad_norm": 0.11585664749145508, + "learning_rate": 8.614530991453251e-05, + "loss": 1.5416264533996582, + "step": 103380 + }, + { + "epoch": 0.3129568810255324, + "grad_norm": 0.12238862365484238, + "learning_rate": 8.614151473289513e-05, + "loss": 1.5154205322265626, + "step": 103390 + }, + { + "epoch": 0.31298715057587817, + "grad_norm": 0.12956134974956512, + "learning_rate": 8.613771955125772e-05, + "loss": 1.5245975494384765, + "step": 103400 + }, + { + "epoch": 0.313017420126224, + "grad_norm": 0.11940643191337585, + "learning_rate": 8.613392436962034e-05, + "loss": 1.5332706451416016, + "step": 103410 + }, + { + "epoch": 0.3130476896765699, + "grad_norm": 0.11670005321502686, + "learning_rate": 8.613012918798293e-05, + "loss": 1.5897056579589843, + "step": 103420 + }, + { + "epoch": 0.31307795922691567, + "grad_norm": 0.10873574763536453, + "learning_rate": 8.612633400634555e-05, + "loss": 1.5697004318237304, + "step": 103430 + }, + { + "epoch": 0.3131082287772615, + "grad_norm": 0.10769089311361313, + "learning_rate": 8.612253882470815e-05, + "loss": 1.5609819412231445, + "step": 103440 + }, + { + "epoch": 0.3131384983276073, + "grad_norm": 0.1188177689909935, + "learning_rate": 8.611874364307076e-05, + "loss": 1.548733615875244, + "step": 103450 + }, + { + "epoch": 0.31316876787795317, + "grad_norm": 0.10909000039100647, + "learning_rate": 8.611494846143336e-05, + "loss": 1.5529420852661133, + "step": 103460 + }, + { + "epoch": 0.313199037428299, + "grad_norm": 0.11992467939853668, + "learning_rate": 8.611115327979598e-05, + "loss": 1.5353422164916992, + "step": 103470 + }, + { + "epoch": 0.3132293069786448, + "grad_norm": 0.11668025702238083, + "learning_rate": 8.610735809815857e-05, + "loss": 1.5604087829589843, + "step": 103480 + }, + { + "epoch": 0.31325957652899067, + "grad_norm": 0.10707416385412216, + "learning_rate": 8.610356291652119e-05, + "loss": 1.5450809478759766, + "step": 103490 + }, + { + "epoch": 0.31328984607933646, + "grad_norm": 0.10334275662899017, + "learning_rate": 8.60997677348838e-05, + "loss": 1.5587704658508301, + "step": 103500 + }, + { + "epoch": 0.31328984607933646, + "eval_loss": 1.5446945428848267, + "eval_runtime": 28.08, + "eval_samples_per_second": 17.806, + "eval_steps_per_second": 1.14, + "step": 103500 + }, + { + "epoch": 0.3133201156296823, + "grad_norm": 0.11755956709384918, + "learning_rate": 8.60959725532464e-05, + "loss": 1.5493868827819823, + "step": 103510 + }, + { + "epoch": 0.31335038518002817, + "grad_norm": 0.11038632690906525, + "learning_rate": 8.6092177371609e-05, + "loss": 1.5492237091064454, + "step": 103520 + }, + { + "epoch": 0.31338065473037396, + "grad_norm": 0.11128049343824387, + "learning_rate": 8.608838218997161e-05, + "loss": 1.5451278686523438, + "step": 103530 + }, + { + "epoch": 0.3134109242807198, + "grad_norm": 0.10352671146392822, + "learning_rate": 8.608458700833422e-05, + "loss": 1.601308822631836, + "step": 103540 + }, + { + "epoch": 0.3134411938310656, + "grad_norm": 0.11205261200666428, + "learning_rate": 8.608079182669682e-05, + "loss": 1.5752227783203125, + "step": 103550 + }, + { + "epoch": 0.31347146338141146, + "grad_norm": 0.12106627970933914, + "learning_rate": 8.607699664505944e-05, + "loss": 1.5536452293395997, + "step": 103560 + }, + { + "epoch": 0.3135017329317573, + "grad_norm": 0.09847421199083328, + "learning_rate": 8.607320146342204e-05, + "loss": 1.5624849319458007, + "step": 103570 + }, + { + "epoch": 0.3135320024821031, + "grad_norm": 0.11109531670808792, + "learning_rate": 8.606940628178466e-05, + "loss": 1.5264104843139648, + "step": 103580 + }, + { + "epoch": 0.31356227203244896, + "grad_norm": 0.11009574681520462, + "learning_rate": 8.606561110014725e-05, + "loss": 1.5418840408325196, + "step": 103590 + }, + { + "epoch": 0.3135925415827948, + "grad_norm": 0.10490724444389343, + "learning_rate": 8.606181591850987e-05, + "loss": 1.5451541900634767, + "step": 103600 + }, + { + "epoch": 0.3136228111331406, + "grad_norm": 0.11646540462970734, + "learning_rate": 8.605802073687246e-05, + "loss": 1.538203525543213, + "step": 103610 + }, + { + "epoch": 0.31365308068348646, + "grad_norm": 0.13084572553634644, + "learning_rate": 8.605422555523508e-05, + "loss": 1.529627513885498, + "step": 103620 + }, + { + "epoch": 0.31368335023383226, + "grad_norm": 0.12296596169471741, + "learning_rate": 8.605043037359769e-05, + "loss": 1.5283547401428224, + "step": 103630 + }, + { + "epoch": 0.3137136197841781, + "grad_norm": 0.11334022134542465, + "learning_rate": 8.604663519196029e-05, + "loss": 1.5216121673583984, + "step": 103640 + }, + { + "epoch": 0.31374388933452396, + "grad_norm": 0.13072721660137177, + "learning_rate": 8.60428400103229e-05, + "loss": 1.5641220092773438, + "step": 103650 + }, + { + "epoch": 0.31377415888486976, + "grad_norm": 0.11481917649507523, + "learning_rate": 8.60390448286855e-05, + "loss": 1.5681612014770507, + "step": 103660 + }, + { + "epoch": 0.3138044284352156, + "grad_norm": 0.10255339741706848, + "learning_rate": 8.603524964704811e-05, + "loss": 1.5630335807800293, + "step": 103670 + }, + { + "epoch": 0.3138346979855614, + "grad_norm": 0.11205478012561798, + "learning_rate": 8.603145446541072e-05, + "loss": 1.5393245697021485, + "step": 103680 + }, + { + "epoch": 0.31386496753590726, + "grad_norm": 0.10309364646673203, + "learning_rate": 8.602765928377332e-05, + "loss": 1.572332000732422, + "step": 103690 + }, + { + "epoch": 0.3138952370862531, + "grad_norm": 0.11084075272083282, + "learning_rate": 8.602386410213593e-05, + "loss": 1.5542988777160645, + "step": 103700 + }, + { + "epoch": 0.3139255066365989, + "grad_norm": 0.10054653882980347, + "learning_rate": 8.602006892049853e-05, + "loss": 1.5363157272338868, + "step": 103710 + }, + { + "epoch": 0.31395577618694476, + "grad_norm": 0.11433236300945282, + "learning_rate": 8.601627373886114e-05, + "loss": 1.5653074264526368, + "step": 103720 + }, + { + "epoch": 0.31398604573729055, + "grad_norm": 0.11800725758075714, + "learning_rate": 8.601247855722374e-05, + "loss": 1.5663952827453613, + "step": 103730 + }, + { + "epoch": 0.3140163152876364, + "grad_norm": 0.1248251423239708, + "learning_rate": 8.600868337558636e-05, + "loss": 1.5444615364074707, + "step": 103740 + }, + { + "epoch": 0.31404658483798226, + "grad_norm": 0.10305801779031754, + "learning_rate": 8.600488819394896e-05, + "loss": 1.5730247497558594, + "step": 103750 + }, + { + "epoch": 0.31407685438832805, + "grad_norm": 0.10968896001577377, + "learning_rate": 8.600109301231158e-05, + "loss": 1.5260819435119628, + "step": 103760 + }, + { + "epoch": 0.3141071239386739, + "grad_norm": 0.10912235826253891, + "learning_rate": 8.599729783067418e-05, + "loss": 1.5699193000793457, + "step": 103770 + }, + { + "epoch": 0.3141373934890197, + "grad_norm": 0.11043926328420639, + "learning_rate": 8.599350264903679e-05, + "loss": 1.5606675148010254, + "step": 103780 + }, + { + "epoch": 0.31416766303936555, + "grad_norm": 0.12317589670419693, + "learning_rate": 8.59897074673994e-05, + "loss": 1.5614839553833009, + "step": 103790 + }, + { + "epoch": 0.3141979325897114, + "grad_norm": 0.11404818296432495, + "learning_rate": 8.5985912285762e-05, + "loss": 1.5503774642944337, + "step": 103800 + }, + { + "epoch": 0.3142282021400572, + "grad_norm": 0.1137910783290863, + "learning_rate": 8.59821171041246e-05, + "loss": 1.597555923461914, + "step": 103810 + }, + { + "epoch": 0.31425847169040305, + "grad_norm": 0.11615561693906784, + "learning_rate": 8.597832192248721e-05, + "loss": 1.6037235260009766, + "step": 103820 + }, + { + "epoch": 0.31428874124074885, + "grad_norm": 0.10552789270877838, + "learning_rate": 8.597452674084982e-05, + "loss": 1.550844955444336, + "step": 103830 + }, + { + "epoch": 0.3143190107910947, + "grad_norm": 0.12463755160570145, + "learning_rate": 8.597073155921242e-05, + "loss": 1.532270622253418, + "step": 103840 + }, + { + "epoch": 0.31434928034144055, + "grad_norm": 0.11945468932390213, + "learning_rate": 8.596693637757503e-05, + "loss": 1.5413032531738282, + "step": 103850 + }, + { + "epoch": 0.31437954989178635, + "grad_norm": 0.12219848483800888, + "learning_rate": 8.596314119593764e-05, + "loss": 1.5680780410766602, + "step": 103860 + }, + { + "epoch": 0.3144098194421322, + "grad_norm": 0.118254654109478, + "learning_rate": 8.595934601430026e-05, + "loss": 1.5699712753295898, + "step": 103870 + }, + { + "epoch": 0.314440088992478, + "grad_norm": 0.112447589635849, + "learning_rate": 8.595555083266285e-05, + "loss": 1.5599749565124512, + "step": 103880 + }, + { + "epoch": 0.31447035854282385, + "grad_norm": 0.10081096738576889, + "learning_rate": 8.595175565102547e-05, + "loss": 1.5287397384643555, + "step": 103890 + }, + { + "epoch": 0.3145006280931697, + "grad_norm": 0.10223580151796341, + "learning_rate": 8.594796046938806e-05, + "loss": 1.5742055892944335, + "step": 103900 + }, + { + "epoch": 0.3145308976435155, + "grad_norm": 0.11201246827840805, + "learning_rate": 8.594416528775068e-05, + "loss": 1.563363265991211, + "step": 103910 + }, + { + "epoch": 0.31456116719386135, + "grad_norm": 0.1128445565700531, + "learning_rate": 8.594037010611327e-05, + "loss": 1.5707561492919921, + "step": 103920 + }, + { + "epoch": 0.31459143674420714, + "grad_norm": 0.12451568990945816, + "learning_rate": 8.593657492447589e-05, + "loss": 1.583526039123535, + "step": 103930 + }, + { + "epoch": 0.314621706294553, + "grad_norm": 0.10846642404794693, + "learning_rate": 8.593277974283848e-05, + "loss": 1.527273178100586, + "step": 103940 + }, + { + "epoch": 0.31465197584489885, + "grad_norm": 0.1018255278468132, + "learning_rate": 8.59289845612011e-05, + "loss": 1.5566482543945312, + "step": 103950 + }, + { + "epoch": 0.31468224539524464, + "grad_norm": 0.10503357648849487, + "learning_rate": 8.592518937956371e-05, + "loss": 1.603081512451172, + "step": 103960 + }, + { + "epoch": 0.3147125149455905, + "grad_norm": 0.1169329360127449, + "learning_rate": 8.592139419792631e-05, + "loss": 1.5898205757141113, + "step": 103970 + }, + { + "epoch": 0.3147427844959363, + "grad_norm": 0.10936921089887619, + "learning_rate": 8.591759901628893e-05, + "loss": 1.5182481765747071, + "step": 103980 + }, + { + "epoch": 0.31477305404628214, + "grad_norm": 0.11031513661146164, + "learning_rate": 8.591380383465153e-05, + "loss": 1.5302196502685548, + "step": 103990 + }, + { + "epoch": 0.314803323596628, + "grad_norm": 0.10490848869085312, + "learning_rate": 8.591000865301415e-05, + "loss": 1.523312759399414, + "step": 104000 + }, + { + "epoch": 0.314803323596628, + "eval_loss": 1.5568854808807373, + "eval_runtime": 28.0542, + "eval_samples_per_second": 17.823, + "eval_steps_per_second": 1.141, + "step": 104000 + }, + { + "epoch": 0.3148335931469738, + "grad_norm": 0.11700917780399323, + "learning_rate": 8.590621347137674e-05, + "loss": 1.5463741302490235, + "step": 104010 + }, + { + "epoch": 0.31486386269731964, + "grad_norm": 0.1147376224398613, + "learning_rate": 8.590241828973936e-05, + "loss": 1.5529609680175782, + "step": 104020 + }, + { + "epoch": 0.31489413224766544, + "grad_norm": 0.11018363386392593, + "learning_rate": 8.589862310810195e-05, + "loss": 1.5379404067993163, + "step": 104030 + }, + { + "epoch": 0.3149244017980113, + "grad_norm": 0.1078985333442688, + "learning_rate": 8.589482792646457e-05, + "loss": 1.545968246459961, + "step": 104040 + }, + { + "epoch": 0.31495467134835714, + "grad_norm": 0.11429613083600998, + "learning_rate": 8.589103274482716e-05, + "loss": 1.5901260375976562, + "step": 104050 + }, + { + "epoch": 0.31498494089870294, + "grad_norm": 0.12220526486635208, + "learning_rate": 8.588723756318978e-05, + "loss": 1.5344687461853028, + "step": 104060 + }, + { + "epoch": 0.3150152104490488, + "grad_norm": 0.10356954485177994, + "learning_rate": 8.588344238155237e-05, + "loss": 1.5537678718566894, + "step": 104070 + }, + { + "epoch": 0.3150454799993946, + "grad_norm": 0.1123078390955925, + "learning_rate": 8.5879647199915e-05, + "loss": 1.576268196105957, + "step": 104080 + }, + { + "epoch": 0.31507574954974044, + "grad_norm": 0.11891559511423111, + "learning_rate": 8.587585201827759e-05, + "loss": 1.5063718795776366, + "step": 104090 + }, + { + "epoch": 0.3151060191000863, + "grad_norm": 0.10862046480178833, + "learning_rate": 8.58720568366402e-05, + "loss": 1.525557804107666, + "step": 104100 + }, + { + "epoch": 0.3151362886504321, + "grad_norm": 0.11618765443563461, + "learning_rate": 8.586826165500281e-05, + "loss": 1.5747200012207032, + "step": 104110 + }, + { + "epoch": 0.31516655820077794, + "grad_norm": 0.10087303072214127, + "learning_rate": 8.586446647336542e-05, + "loss": 1.5358929634094238, + "step": 104120 + }, + { + "epoch": 0.31519682775112373, + "grad_norm": 0.11087266355752945, + "learning_rate": 8.586067129172802e-05, + "loss": 1.5728775978088378, + "step": 104130 + }, + { + "epoch": 0.3152270973014696, + "grad_norm": 0.10739804059267044, + "learning_rate": 8.585687611009063e-05, + "loss": 1.5267637252807618, + "step": 104140 + }, + { + "epoch": 0.31525736685181543, + "grad_norm": 0.11308739334344864, + "learning_rate": 8.585308092845324e-05, + "loss": 1.5363286972045898, + "step": 104150 + }, + { + "epoch": 0.31528763640216123, + "grad_norm": 0.11121634393930435, + "learning_rate": 8.584928574681584e-05, + "loss": 1.5404508590698243, + "step": 104160 + }, + { + "epoch": 0.3153179059525071, + "grad_norm": 0.11276982724666595, + "learning_rate": 8.584549056517846e-05, + "loss": 1.5431226730346679, + "step": 104170 + }, + { + "epoch": 0.3153481755028529, + "grad_norm": 0.12348809093236923, + "learning_rate": 8.584169538354105e-05, + "loss": 1.5685165405273438, + "step": 104180 + }, + { + "epoch": 0.31537844505319873, + "grad_norm": 0.10616036504507065, + "learning_rate": 8.583790020190367e-05, + "loss": 1.5701467514038085, + "step": 104190 + }, + { + "epoch": 0.3154087146035446, + "grad_norm": 0.11319959163665771, + "learning_rate": 8.583410502026627e-05, + "loss": 1.5946916580200194, + "step": 104200 + }, + { + "epoch": 0.3154389841538904, + "grad_norm": 0.11414029449224472, + "learning_rate": 8.583030983862888e-05, + "loss": 1.5510669708251954, + "step": 104210 + }, + { + "epoch": 0.31546925370423623, + "grad_norm": 0.1132374182343483, + "learning_rate": 8.582651465699148e-05, + "loss": 1.5782442092895508, + "step": 104220 + }, + { + "epoch": 0.315499523254582, + "grad_norm": 0.11234383285045624, + "learning_rate": 8.58227194753541e-05, + "loss": 1.5382975578308105, + "step": 104230 + }, + { + "epoch": 0.3155297928049279, + "grad_norm": 0.11978480964899063, + "learning_rate": 8.58189242937167e-05, + "loss": 1.5683107376098633, + "step": 104240 + }, + { + "epoch": 0.31556006235527373, + "grad_norm": 0.10999283194541931, + "learning_rate": 8.581512911207931e-05, + "loss": 1.58839693069458, + "step": 104250 + }, + { + "epoch": 0.3155903319056195, + "grad_norm": 0.12514235079288483, + "learning_rate": 8.581133393044191e-05, + "loss": 1.5478328704833983, + "step": 104260 + }, + { + "epoch": 0.3156206014559654, + "grad_norm": 0.12501046061515808, + "learning_rate": 8.580753874880452e-05, + "loss": 1.5644416809082031, + "step": 104270 + }, + { + "epoch": 0.31565087100631123, + "grad_norm": 0.11096300184726715, + "learning_rate": 8.580374356716713e-05, + "loss": 1.5420333862304687, + "step": 104280 + }, + { + "epoch": 0.315681140556657, + "grad_norm": 0.12909267842769623, + "learning_rate": 8.579994838552973e-05, + "loss": 1.5142835617065429, + "step": 104290 + }, + { + "epoch": 0.3157114101070029, + "grad_norm": 0.1169179230928421, + "learning_rate": 8.579615320389234e-05, + "loss": 1.5573589324951171, + "step": 104300 + }, + { + "epoch": 0.3157416796573487, + "grad_norm": 0.1174662858247757, + "learning_rate": 8.579235802225494e-05, + "loss": 1.5320013999938964, + "step": 104310 + }, + { + "epoch": 0.3157719492076945, + "grad_norm": 0.12191323190927505, + "learning_rate": 8.578856284061755e-05, + "loss": 1.554922103881836, + "step": 104320 + }, + { + "epoch": 0.3158022187580404, + "grad_norm": 0.10787239670753479, + "learning_rate": 8.578476765898016e-05, + "loss": 1.5531682014465331, + "step": 104330 + }, + { + "epoch": 0.3158324883083862, + "grad_norm": 0.11591591686010361, + "learning_rate": 8.578097247734276e-05, + "loss": 1.5704227447509767, + "step": 104340 + }, + { + "epoch": 0.315862757858732, + "grad_norm": 0.11493439227342606, + "learning_rate": 8.577717729570537e-05, + "loss": 1.5514181137084961, + "step": 104350 + }, + { + "epoch": 0.3158930274090778, + "grad_norm": 0.10867372900247574, + "learning_rate": 8.577338211406797e-05, + "loss": 1.5303358078002929, + "step": 104360 + }, + { + "epoch": 0.31592329695942367, + "grad_norm": 0.11070860177278519, + "learning_rate": 8.57695869324306e-05, + "loss": 1.571281623840332, + "step": 104370 + }, + { + "epoch": 0.3159535665097695, + "grad_norm": 0.12332267314195633, + "learning_rate": 8.57657917507932e-05, + "loss": 1.5487915992736816, + "step": 104380 + }, + { + "epoch": 0.3159838360601153, + "grad_norm": 0.11662214249372482, + "learning_rate": 8.57619965691558e-05, + "loss": 1.6231634140014648, + "step": 104390 + }, + { + "epoch": 0.31601410561046117, + "grad_norm": 0.11104471236467361, + "learning_rate": 8.575820138751841e-05, + "loss": 1.5306923866271973, + "step": 104400 + }, + { + "epoch": 0.31604437516080697, + "grad_norm": 0.10913737863302231, + "learning_rate": 8.575440620588102e-05, + "loss": 1.529738998413086, + "step": 104410 + }, + { + "epoch": 0.3160746447111528, + "grad_norm": 0.11821223050355911, + "learning_rate": 8.575061102424362e-05, + "loss": 1.523350429534912, + "step": 104420 + }, + { + "epoch": 0.31610491426149867, + "grad_norm": 0.10388027131557465, + "learning_rate": 8.574681584260623e-05, + "loss": 1.5314180374145507, + "step": 104430 + }, + { + "epoch": 0.31613518381184447, + "grad_norm": 0.11081445217132568, + "learning_rate": 8.574302066096884e-05, + "loss": 1.5655507087707519, + "step": 104440 + }, + { + "epoch": 0.3161654533621903, + "grad_norm": 0.11495951563119888, + "learning_rate": 8.573922547933144e-05, + "loss": 1.5288761138916016, + "step": 104450 + }, + { + "epoch": 0.3161957229125361, + "grad_norm": 0.12373004853725433, + "learning_rate": 8.573543029769405e-05, + "loss": 1.5785980224609375, + "step": 104460 + }, + { + "epoch": 0.31622599246288197, + "grad_norm": 0.10689051449298859, + "learning_rate": 8.573163511605665e-05, + "loss": 1.5597664833068847, + "step": 104470 + }, + { + "epoch": 0.3162562620132278, + "grad_norm": 0.14317676424980164, + "learning_rate": 8.572783993441927e-05, + "loss": 1.5123510360717773, + "step": 104480 + }, + { + "epoch": 0.3162865315635736, + "grad_norm": 0.11402616649866104, + "learning_rate": 8.572404475278186e-05, + "loss": 1.524137306213379, + "step": 104490 + }, + { + "epoch": 0.31631680111391947, + "grad_norm": 0.13350805640220642, + "learning_rate": 8.572024957114448e-05, + "loss": 1.5545103073120117, + "step": 104500 + }, + { + "epoch": 0.31631680111391947, + "eval_loss": 1.5626473426818848, + "eval_runtime": 28.023, + "eval_samples_per_second": 17.842, + "eval_steps_per_second": 1.142, + "step": 104500 + }, + { + "epoch": 0.31634707066426526, + "grad_norm": 0.1306719034910202, + "learning_rate": 8.571645438950708e-05, + "loss": 1.5457167625427246, + "step": 104510 + }, + { + "epoch": 0.3163773402146111, + "grad_norm": 0.11050643771886826, + "learning_rate": 8.57126592078697e-05, + "loss": 1.577874755859375, + "step": 104520 + }, + { + "epoch": 0.31640760976495697, + "grad_norm": 0.11221389472484589, + "learning_rate": 8.570886402623229e-05, + "loss": 1.5754125595092774, + "step": 104530 + }, + { + "epoch": 0.31643787931530276, + "grad_norm": 0.10099491477012634, + "learning_rate": 8.570506884459491e-05, + "loss": 1.5823529243469239, + "step": 104540 + }, + { + "epoch": 0.3164681488656486, + "grad_norm": 0.12380848079919815, + "learning_rate": 8.57012736629575e-05, + "loss": 1.5658713340759278, + "step": 104550 + }, + { + "epoch": 0.3164984184159944, + "grad_norm": 0.10279393196105957, + "learning_rate": 8.569747848132012e-05, + "loss": 1.546176815032959, + "step": 104560 + }, + { + "epoch": 0.31652868796634026, + "grad_norm": 0.10588081181049347, + "learning_rate": 8.569368329968273e-05, + "loss": 1.574593734741211, + "step": 104570 + }, + { + "epoch": 0.3165589575166861, + "grad_norm": 0.13661505281925201, + "learning_rate": 8.568988811804533e-05, + "loss": 1.5238198280334472, + "step": 104580 + }, + { + "epoch": 0.3165892270670319, + "grad_norm": 0.12574070692062378, + "learning_rate": 8.568609293640794e-05, + "loss": 1.538181686401367, + "step": 104590 + }, + { + "epoch": 0.31661949661737776, + "grad_norm": 0.10324753075838089, + "learning_rate": 8.568229775477054e-05, + "loss": 1.60321102142334, + "step": 104600 + }, + { + "epoch": 0.31664976616772356, + "grad_norm": 0.11211711913347244, + "learning_rate": 8.567850257313316e-05, + "loss": 1.5530481338500977, + "step": 104610 + }, + { + "epoch": 0.3166800357180694, + "grad_norm": 0.10241544991731644, + "learning_rate": 8.567470739149576e-05, + "loss": 1.542947769165039, + "step": 104620 + }, + { + "epoch": 0.31671030526841526, + "grad_norm": 0.10886596888303757, + "learning_rate": 8.567091220985838e-05, + "loss": 1.5713054656982421, + "step": 104630 + }, + { + "epoch": 0.31674057481876106, + "grad_norm": 0.11769215017557144, + "learning_rate": 8.566711702822097e-05, + "loss": 1.5361408233642577, + "step": 104640 + }, + { + "epoch": 0.3167708443691069, + "grad_norm": 0.11079999059438705, + "learning_rate": 8.566332184658359e-05, + "loss": 1.5610601425170898, + "step": 104650 + }, + { + "epoch": 0.3168011139194527, + "grad_norm": 0.10829336196184158, + "learning_rate": 8.565952666494618e-05, + "loss": 1.5522903442382812, + "step": 104660 + }, + { + "epoch": 0.31683138346979856, + "grad_norm": 0.12334970384836197, + "learning_rate": 8.56557314833088e-05, + "loss": 1.5772521018981933, + "step": 104670 + }, + { + "epoch": 0.3168616530201444, + "grad_norm": 0.11006049066781998, + "learning_rate": 8.565193630167139e-05, + "loss": 1.5527440071105958, + "step": 104680 + }, + { + "epoch": 0.3168919225704902, + "grad_norm": 0.11231285333633423, + "learning_rate": 8.564814112003401e-05, + "loss": 1.5855717658996582, + "step": 104690 + }, + { + "epoch": 0.31692219212083605, + "grad_norm": 0.11141998320817947, + "learning_rate": 8.56443459383966e-05, + "loss": 1.5640482902526855, + "step": 104700 + }, + { + "epoch": 0.31695246167118185, + "grad_norm": 0.12083648145198822, + "learning_rate": 8.564055075675922e-05, + "loss": 1.5706192016601563, + "step": 104710 + }, + { + "epoch": 0.3169827312215277, + "grad_norm": 0.10648151487112045, + "learning_rate": 8.563675557512182e-05, + "loss": 1.5000597000122071, + "step": 104720 + }, + { + "epoch": 0.31701300077187355, + "grad_norm": 0.11270900070667267, + "learning_rate": 8.563296039348443e-05, + "loss": 1.5424970626831054, + "step": 104730 + }, + { + "epoch": 0.31704327032221935, + "grad_norm": 0.11284885555505753, + "learning_rate": 8.562916521184704e-05, + "loss": 1.5301362991333007, + "step": 104740 + }, + { + "epoch": 0.3170735398725652, + "grad_norm": 0.1148497685790062, + "learning_rate": 8.562537003020965e-05, + "loss": 1.534556007385254, + "step": 104750 + }, + { + "epoch": 0.317103809422911, + "grad_norm": 0.11220505833625793, + "learning_rate": 8.562157484857225e-05, + "loss": 1.5460174560546875, + "step": 104760 + }, + { + "epoch": 0.31713407897325685, + "grad_norm": 0.11405239254236221, + "learning_rate": 8.561777966693486e-05, + "loss": 1.5870539665222168, + "step": 104770 + }, + { + "epoch": 0.3171643485236027, + "grad_norm": 0.11665713042020798, + "learning_rate": 8.561398448529748e-05, + "loss": 1.5734318733215331, + "step": 104780 + }, + { + "epoch": 0.3171946180739485, + "grad_norm": 0.1246403232216835, + "learning_rate": 8.561018930366007e-05, + "loss": 1.573436164855957, + "step": 104790 + }, + { + "epoch": 0.31722488762429435, + "grad_norm": 0.10968825221061707, + "learning_rate": 8.560639412202269e-05, + "loss": 1.5513041496276856, + "step": 104800 + }, + { + "epoch": 0.31725515717464015, + "grad_norm": 0.11033026874065399, + "learning_rate": 8.560259894038528e-05, + "loss": 1.5454135894775392, + "step": 104810 + }, + { + "epoch": 0.317285426724986, + "grad_norm": 0.11443597823381424, + "learning_rate": 8.55988037587479e-05, + "loss": 1.5490614891052246, + "step": 104820 + }, + { + "epoch": 0.31731569627533185, + "grad_norm": 0.10326600819826126, + "learning_rate": 8.55950085771105e-05, + "loss": 1.581196403503418, + "step": 104830 + }, + { + "epoch": 0.31734596582567765, + "grad_norm": 0.13438403606414795, + "learning_rate": 8.559121339547311e-05, + "loss": 1.5762670516967774, + "step": 104840 + }, + { + "epoch": 0.3173762353760235, + "grad_norm": 0.1107650026679039, + "learning_rate": 8.558741821383572e-05, + "loss": 1.5965364456176758, + "step": 104850 + }, + { + "epoch": 0.3174065049263693, + "grad_norm": 0.09995104372501373, + "learning_rate": 8.558362303219833e-05, + "loss": 1.5240948677062989, + "step": 104860 + }, + { + "epoch": 0.31743677447671514, + "grad_norm": 0.12161048501729965, + "learning_rate": 8.557982785056093e-05, + "loss": 1.5592623710632325, + "step": 104870 + }, + { + "epoch": 0.317467044027061, + "grad_norm": 0.11942097544670105, + "learning_rate": 8.557603266892354e-05, + "loss": 1.5517067909240723, + "step": 104880 + }, + { + "epoch": 0.3174973135774068, + "grad_norm": 0.11209908872842789, + "learning_rate": 8.557223748728614e-05, + "loss": 1.5403011322021485, + "step": 104890 + }, + { + "epoch": 0.31752758312775264, + "grad_norm": 0.1166388988494873, + "learning_rate": 8.556844230564875e-05, + "loss": 1.5487936019897461, + "step": 104900 + }, + { + "epoch": 0.31755785267809844, + "grad_norm": 0.10368174314498901, + "learning_rate": 8.556464712401136e-05, + "loss": 1.5728803634643556, + "step": 104910 + }, + { + "epoch": 0.3175881222284443, + "grad_norm": 0.11104999482631683, + "learning_rate": 8.556085194237396e-05, + "loss": 1.558646011352539, + "step": 104920 + }, + { + "epoch": 0.31761839177879014, + "grad_norm": 0.11592673510313034, + "learning_rate": 8.555705676073657e-05, + "loss": 1.5501325607299805, + "step": 104930 + }, + { + "epoch": 0.31764866132913594, + "grad_norm": 0.11349943280220032, + "learning_rate": 8.555326157909917e-05, + "loss": 1.5729166984558105, + "step": 104940 + }, + { + "epoch": 0.3176789308794818, + "grad_norm": 0.11415130645036697, + "learning_rate": 8.554946639746178e-05, + "loss": 1.5642901420593263, + "step": 104950 + }, + { + "epoch": 0.3177092004298276, + "grad_norm": 0.11100303381681442, + "learning_rate": 8.554567121582439e-05, + "loss": 1.5563657760620118, + "step": 104960 + }, + { + "epoch": 0.31773946998017344, + "grad_norm": 0.13311851024627686, + "learning_rate": 8.554187603418699e-05, + "loss": 1.5384366989135743, + "step": 104970 + }, + { + "epoch": 0.3177697395305193, + "grad_norm": 0.10984355956315994, + "learning_rate": 8.553808085254961e-05, + "loss": 1.5512868881225585, + "step": 104980 + }, + { + "epoch": 0.3178000090808651, + "grad_norm": 0.12474004924297333, + "learning_rate": 8.553428567091222e-05, + "loss": 1.507007884979248, + "step": 104990 + }, + { + "epoch": 0.31783027863121094, + "grad_norm": 0.10752592980861664, + "learning_rate": 8.553049048927482e-05, + "loss": 1.5453036308288575, + "step": 105000 + }, + { + "epoch": 0.31783027863121094, + "eval_loss": 1.55938720703125, + "eval_runtime": 28.0402, + "eval_samples_per_second": 17.832, + "eval_steps_per_second": 1.141, + "step": 105000 + }, + { + "epoch": 0.3178605481815568, + "grad_norm": 0.11476723849773407, + "learning_rate": 8.552669530763743e-05, + "loss": 1.5790916442871095, + "step": 105010 + }, + { + "epoch": 0.3178908177319026, + "grad_norm": 0.11046860367059708, + "learning_rate": 8.552290012600003e-05, + "loss": 1.5524106979370118, + "step": 105020 + }, + { + "epoch": 0.31792108728224844, + "grad_norm": 0.10252860188484192, + "learning_rate": 8.551910494436264e-05, + "loss": 1.5832084655761718, + "step": 105030 + }, + { + "epoch": 0.31795135683259423, + "grad_norm": 0.11736368387937546, + "learning_rate": 8.551530976272525e-05, + "loss": 1.554990577697754, + "step": 105040 + }, + { + "epoch": 0.3179816263829401, + "grad_norm": 0.11872730404138565, + "learning_rate": 8.551151458108785e-05, + "loss": 1.5395825386047364, + "step": 105050 + }, + { + "epoch": 0.31801189593328594, + "grad_norm": 0.10898806154727936, + "learning_rate": 8.550771939945046e-05, + "loss": 1.55296049118042, + "step": 105060 + }, + { + "epoch": 0.31804216548363173, + "grad_norm": 0.12392731755971909, + "learning_rate": 8.550392421781306e-05, + "loss": 1.553055477142334, + "step": 105070 + }, + { + "epoch": 0.3180724350339776, + "grad_norm": 0.11322714388370514, + "learning_rate": 8.550012903617567e-05, + "loss": 1.5431087493896485, + "step": 105080 + }, + { + "epoch": 0.3181027045843234, + "grad_norm": 0.11912034451961517, + "learning_rate": 8.549633385453829e-05, + "loss": 1.5080154418945313, + "step": 105090 + }, + { + "epoch": 0.31813297413466923, + "grad_norm": 0.11386223882436752, + "learning_rate": 8.549253867290088e-05, + "loss": 1.5948167800903321, + "step": 105100 + }, + { + "epoch": 0.3181632436850151, + "grad_norm": 0.11265401542186737, + "learning_rate": 8.54887434912635e-05, + "loss": 1.5485387802124024, + "step": 105110 + }, + { + "epoch": 0.3181935132353609, + "grad_norm": 0.11904460191726685, + "learning_rate": 8.54849483096261e-05, + "loss": 1.5934021949768067, + "step": 105120 + }, + { + "epoch": 0.31822378278570673, + "grad_norm": 0.11686676740646362, + "learning_rate": 8.548115312798871e-05, + "loss": 1.5807793617248536, + "step": 105130 + }, + { + "epoch": 0.31825405233605253, + "grad_norm": 0.10075646638870239, + "learning_rate": 8.54773579463513e-05, + "loss": 1.5494611740112305, + "step": 105140 + }, + { + "epoch": 0.3182843218863984, + "grad_norm": 0.09944666922092438, + "learning_rate": 8.547356276471393e-05, + "loss": 1.5618995666503905, + "step": 105150 + }, + { + "epoch": 0.31831459143674423, + "grad_norm": 0.11365240812301636, + "learning_rate": 8.546976758307652e-05, + "loss": 1.5418134689331056, + "step": 105160 + }, + { + "epoch": 0.31834486098709003, + "grad_norm": 0.11992358416318893, + "learning_rate": 8.546597240143914e-05, + "loss": 1.5839569091796875, + "step": 105170 + }, + { + "epoch": 0.3183751305374359, + "grad_norm": 0.10478152334690094, + "learning_rate": 8.546217721980174e-05, + "loss": 1.5357202529907226, + "step": 105180 + }, + { + "epoch": 0.3184054000877817, + "grad_norm": 0.11329399794340134, + "learning_rate": 8.545838203816435e-05, + "loss": 1.5742127418518066, + "step": 105190 + }, + { + "epoch": 0.3184356696381275, + "grad_norm": 0.10145208984613419, + "learning_rate": 8.545458685652696e-05, + "loss": 1.5720305442810059, + "step": 105200 + }, + { + "epoch": 0.3184659391884734, + "grad_norm": 0.11935153603553772, + "learning_rate": 8.545079167488956e-05, + "loss": 1.5155181884765625, + "step": 105210 + }, + { + "epoch": 0.3184962087388192, + "grad_norm": 0.1034703254699707, + "learning_rate": 8.544699649325218e-05, + "loss": 1.5684507369995118, + "step": 105220 + }, + { + "epoch": 0.318526478289165, + "grad_norm": 0.11333246529102325, + "learning_rate": 8.544320131161477e-05, + "loss": 1.5425137519836425, + "step": 105230 + }, + { + "epoch": 0.3185567478395108, + "grad_norm": 0.10844188928604126, + "learning_rate": 8.543940612997739e-05, + "loss": 1.5501920700073242, + "step": 105240 + }, + { + "epoch": 0.3185870173898567, + "grad_norm": 0.10449690371751785, + "learning_rate": 8.543561094833998e-05, + "loss": 1.506490993499756, + "step": 105250 + }, + { + "epoch": 0.3186172869402025, + "grad_norm": 0.11785990744829178, + "learning_rate": 8.54318157667026e-05, + "loss": 1.5642494201660155, + "step": 105260 + }, + { + "epoch": 0.3186475564905483, + "grad_norm": 0.10669108480215073, + "learning_rate": 8.54280205850652e-05, + "loss": 1.5505821228027343, + "step": 105270 + }, + { + "epoch": 0.3186778260408942, + "grad_norm": 0.11496838927268982, + "learning_rate": 8.542422540342782e-05, + "loss": 1.5886008262634277, + "step": 105280 + }, + { + "epoch": 0.31870809559123997, + "grad_norm": 0.11664939671754837, + "learning_rate": 8.542043022179041e-05, + "loss": 1.5536788940429687, + "step": 105290 + }, + { + "epoch": 0.3187383651415858, + "grad_norm": 0.1076822280883789, + "learning_rate": 8.541663504015303e-05, + "loss": 1.5703790664672852, + "step": 105300 + }, + { + "epoch": 0.3187686346919317, + "grad_norm": 0.10723108053207397, + "learning_rate": 8.541283985851562e-05, + "loss": 1.562849521636963, + "step": 105310 + }, + { + "epoch": 0.31879890424227747, + "grad_norm": 0.11028552055358887, + "learning_rate": 8.540904467687824e-05, + "loss": 1.5845868110656738, + "step": 105320 + }, + { + "epoch": 0.3188291737926233, + "grad_norm": 0.1228800117969513, + "learning_rate": 8.540524949524083e-05, + "loss": 1.5447125434875488, + "step": 105330 + }, + { + "epoch": 0.3188594433429691, + "grad_norm": 0.10717020183801651, + "learning_rate": 8.540145431360345e-05, + "loss": 1.593952751159668, + "step": 105340 + }, + { + "epoch": 0.31888971289331497, + "grad_norm": 0.11663594096899033, + "learning_rate": 8.539765913196606e-05, + "loss": 1.5742517471313477, + "step": 105350 + }, + { + "epoch": 0.3189199824436608, + "grad_norm": 0.1098511815071106, + "learning_rate": 8.539386395032866e-05, + "loss": 1.5825756072998047, + "step": 105360 + }, + { + "epoch": 0.3189502519940066, + "grad_norm": 0.1117372065782547, + "learning_rate": 8.539006876869127e-05, + "loss": 1.5445821762084961, + "step": 105370 + }, + { + "epoch": 0.31898052154435247, + "grad_norm": 0.11983779817819595, + "learning_rate": 8.538627358705388e-05, + "loss": 1.5462669372558593, + "step": 105380 + }, + { + "epoch": 0.31901079109469827, + "grad_norm": 0.11022092401981354, + "learning_rate": 8.53824784054165e-05, + "loss": 1.5586663246154786, + "step": 105390 + }, + { + "epoch": 0.3190410606450441, + "grad_norm": 0.11033505201339722, + "learning_rate": 8.537868322377909e-05, + "loss": 1.5602442741394043, + "step": 105400 + }, + { + "epoch": 0.31907133019538997, + "grad_norm": 0.11178361624479294, + "learning_rate": 8.537488804214171e-05, + "loss": 1.5188949584960938, + "step": 105410 + }, + { + "epoch": 0.31910159974573576, + "grad_norm": 0.10900519043207169, + "learning_rate": 8.53710928605043e-05, + "loss": 1.5386077880859375, + "step": 105420 + }, + { + "epoch": 0.3191318692960816, + "grad_norm": 0.10606478899717331, + "learning_rate": 8.536729767886692e-05, + "loss": 1.5605717658996583, + "step": 105430 + }, + { + "epoch": 0.3191621388464274, + "grad_norm": 0.11166266351938248, + "learning_rate": 8.536350249722951e-05, + "loss": 1.5710501670837402, + "step": 105440 + }, + { + "epoch": 0.31919240839677326, + "grad_norm": 0.10460297018289566, + "learning_rate": 8.535970731559213e-05, + "loss": 1.5413506507873536, + "step": 105450 + }, + { + "epoch": 0.3192226779471191, + "grad_norm": 0.10355786234140396, + "learning_rate": 8.535591213395472e-05, + "loss": 1.5357123374938966, + "step": 105460 + }, + { + "epoch": 0.3192529474974649, + "grad_norm": 0.1272558867931366, + "learning_rate": 8.535211695231734e-05, + "loss": 1.5436358451843262, + "step": 105470 + }, + { + "epoch": 0.31928321704781076, + "grad_norm": 0.10312772542238235, + "learning_rate": 8.534832177067995e-05, + "loss": 1.5086268424987792, + "step": 105480 + }, + { + "epoch": 0.31931348659815656, + "grad_norm": 0.12263143062591553, + "learning_rate": 8.534452658904255e-05, + "loss": 1.568527889251709, + "step": 105490 + }, + { + "epoch": 0.3193437561485024, + "grad_norm": 0.11747146397829056, + "learning_rate": 8.534073140740516e-05, + "loss": 1.5465818405151368, + "step": 105500 + }, + { + "epoch": 0.3193437561485024, + "eval_loss": 1.5496094226837158, + "eval_runtime": 28.2255, + "eval_samples_per_second": 17.714, + "eval_steps_per_second": 1.134, + "step": 105500 + }, + { + "epoch": 0.31937402569884826, + "grad_norm": 0.12163633108139038, + "learning_rate": 8.533693622576777e-05, + "loss": 1.5222790718078614, + "step": 105510 + }, + { + "epoch": 0.31940429524919406, + "grad_norm": 0.10518769174814224, + "learning_rate": 8.533314104413037e-05, + "loss": 1.5582192420959473, + "step": 105520 + }, + { + "epoch": 0.3194345647995399, + "grad_norm": 0.11450622230768204, + "learning_rate": 8.532934586249298e-05, + "loss": 1.5831802368164063, + "step": 105530 + }, + { + "epoch": 0.3194648343498857, + "grad_norm": 0.11167770624160767, + "learning_rate": 8.532555068085558e-05, + "loss": 1.568953800201416, + "step": 105540 + }, + { + "epoch": 0.31949510390023156, + "grad_norm": 0.11321481317281723, + "learning_rate": 8.532175549921819e-05, + "loss": 1.581551742553711, + "step": 105550 + }, + { + "epoch": 0.3195253734505774, + "grad_norm": 0.10981014370918274, + "learning_rate": 8.53179603175808e-05, + "loss": 1.5779229164123536, + "step": 105560 + }, + { + "epoch": 0.3195556430009232, + "grad_norm": 0.10379976034164429, + "learning_rate": 8.53141651359434e-05, + "loss": 1.5586381912231446, + "step": 105570 + }, + { + "epoch": 0.31958591255126906, + "grad_norm": 0.11683934926986694, + "learning_rate": 8.531036995430601e-05, + "loss": 1.5914787292480468, + "step": 105580 + }, + { + "epoch": 0.31961618210161485, + "grad_norm": 0.10335937142372131, + "learning_rate": 8.530657477266863e-05, + "loss": 1.5551081657409669, + "step": 105590 + }, + { + "epoch": 0.3196464516519607, + "grad_norm": 0.10903038084506989, + "learning_rate": 8.530277959103123e-05, + "loss": 1.5972681045532227, + "step": 105600 + }, + { + "epoch": 0.31967672120230656, + "grad_norm": 0.12088033556938171, + "learning_rate": 8.529898440939384e-05, + "loss": 1.5758443832397462, + "step": 105610 + }, + { + "epoch": 0.31970699075265235, + "grad_norm": 0.11338132619857788, + "learning_rate": 8.529518922775645e-05, + "loss": 1.5281148910522462, + "step": 105620 + }, + { + "epoch": 0.3197372603029982, + "grad_norm": 0.10693684965372086, + "learning_rate": 8.529139404611905e-05, + "loss": 1.6055646896362306, + "step": 105630 + }, + { + "epoch": 0.319767529853344, + "grad_norm": 0.10207953304052353, + "learning_rate": 8.528759886448166e-05, + "loss": 1.5563944816589355, + "step": 105640 + }, + { + "epoch": 0.31979779940368985, + "grad_norm": 0.12977063655853271, + "learning_rate": 8.528380368284426e-05, + "loss": 1.5732328414916992, + "step": 105650 + }, + { + "epoch": 0.3198280689540357, + "grad_norm": 0.10960181802511215, + "learning_rate": 8.528000850120687e-05, + "loss": 1.567523193359375, + "step": 105660 + }, + { + "epoch": 0.3198583385043815, + "grad_norm": 0.1241360530257225, + "learning_rate": 8.527621331956948e-05, + "loss": 1.54245023727417, + "step": 105670 + }, + { + "epoch": 0.31988860805472735, + "grad_norm": 0.11270581185817719, + "learning_rate": 8.527241813793208e-05, + "loss": 1.5540691375732423, + "step": 105680 + }, + { + "epoch": 0.3199188776050732, + "grad_norm": 0.1154630184173584, + "learning_rate": 8.526862295629469e-05, + "loss": 1.566262435913086, + "step": 105690 + }, + { + "epoch": 0.319949147155419, + "grad_norm": 0.12129826843738556, + "learning_rate": 8.52648277746573e-05, + "loss": 1.5311601638793946, + "step": 105700 + }, + { + "epoch": 0.31997941670576485, + "grad_norm": 0.11878903210163116, + "learning_rate": 8.52610325930199e-05, + "loss": 1.5964832305908203, + "step": 105710 + }, + { + "epoch": 0.32000968625611065, + "grad_norm": 0.11831699311733246, + "learning_rate": 8.525723741138252e-05, + "loss": 1.5351573944091796, + "step": 105720 + }, + { + "epoch": 0.3200399558064565, + "grad_norm": 0.10141914337873459, + "learning_rate": 8.525344222974511e-05, + "loss": 1.56359806060791, + "step": 105730 + }, + { + "epoch": 0.32007022535680235, + "grad_norm": 0.1114875078201294, + "learning_rate": 8.524964704810773e-05, + "loss": 1.4979019165039062, + "step": 105740 + }, + { + "epoch": 0.32010049490714815, + "grad_norm": 0.10788848996162415, + "learning_rate": 8.524585186647032e-05, + "loss": 1.5375595092773438, + "step": 105750 + }, + { + "epoch": 0.320130764457494, + "grad_norm": 0.11311239749193192, + "learning_rate": 8.524205668483294e-05, + "loss": 1.5559347152709961, + "step": 105760 + }, + { + "epoch": 0.3201610340078398, + "grad_norm": 0.10617093741893768, + "learning_rate": 8.523826150319553e-05, + "loss": 1.531015968322754, + "step": 105770 + }, + { + "epoch": 0.32019130355818565, + "grad_norm": 0.10881582647562027, + "learning_rate": 8.523446632155815e-05, + "loss": 1.5510969161987305, + "step": 105780 + }, + { + "epoch": 0.3202215731085315, + "grad_norm": 0.1068200170993805, + "learning_rate": 8.523067113992076e-05, + "loss": 1.570778751373291, + "step": 105790 + }, + { + "epoch": 0.3202518426588773, + "grad_norm": 0.12082921713590622, + "learning_rate": 8.522687595828337e-05, + "loss": 1.5209651947021485, + "step": 105800 + }, + { + "epoch": 0.32028211220922315, + "grad_norm": 0.11059125512838364, + "learning_rate": 8.522308077664597e-05, + "loss": 1.5291425704956054, + "step": 105810 + }, + { + "epoch": 0.32031238175956894, + "grad_norm": 0.12112808227539062, + "learning_rate": 8.521928559500858e-05, + "loss": 1.5528459548950195, + "step": 105820 + }, + { + "epoch": 0.3203426513099148, + "grad_norm": 0.11121869087219238, + "learning_rate": 8.52154904133712e-05, + "loss": 1.5380406379699707, + "step": 105830 + }, + { + "epoch": 0.32037292086026065, + "grad_norm": 0.11434449255466461, + "learning_rate": 8.521169523173379e-05, + "loss": 1.5277116775512696, + "step": 105840 + }, + { + "epoch": 0.32040319041060644, + "grad_norm": 0.12449758499860764, + "learning_rate": 8.520790005009641e-05, + "loss": 1.511958694458008, + "step": 105850 + }, + { + "epoch": 0.3204334599609523, + "grad_norm": 0.10737667232751846, + "learning_rate": 8.5204104868459e-05, + "loss": 1.5507431030273438, + "step": 105860 + }, + { + "epoch": 0.3204637295112981, + "grad_norm": 0.1089923307299614, + "learning_rate": 8.520030968682162e-05, + "loss": 1.5474770545959473, + "step": 105870 + }, + { + "epoch": 0.32049399906164394, + "grad_norm": 0.11180303245782852, + "learning_rate": 8.519651450518421e-05, + "loss": 1.5286425590515136, + "step": 105880 + }, + { + "epoch": 0.3205242686119898, + "grad_norm": 0.10938763618469238, + "learning_rate": 8.519271932354683e-05, + "loss": 1.5557916641235352, + "step": 105890 + }, + { + "epoch": 0.3205545381623356, + "grad_norm": 0.10542891919612885, + "learning_rate": 8.518892414190943e-05, + "loss": 1.5877654075622558, + "step": 105900 + }, + { + "epoch": 0.32058480771268144, + "grad_norm": 0.12140415608882904, + "learning_rate": 8.518512896027205e-05, + "loss": 1.5614131927490233, + "step": 105910 + }, + { + "epoch": 0.32061507726302724, + "grad_norm": 0.1114545688033104, + "learning_rate": 8.518133377863464e-05, + "loss": 1.5405542373657226, + "step": 105920 + }, + { + "epoch": 0.3206453468133731, + "grad_norm": 0.11203286051750183, + "learning_rate": 8.517753859699726e-05, + "loss": 1.5374757766723632, + "step": 105930 + }, + { + "epoch": 0.32067561636371894, + "grad_norm": 0.10404261201620102, + "learning_rate": 8.517374341535985e-05, + "loss": 1.561332893371582, + "step": 105940 + }, + { + "epoch": 0.32070588591406474, + "grad_norm": 0.11594144999980927, + "learning_rate": 8.516994823372247e-05, + "loss": 1.5531631469726563, + "step": 105950 + }, + { + "epoch": 0.3207361554644106, + "grad_norm": 0.11440958827733994, + "learning_rate": 8.516615305208508e-05, + "loss": 1.5765484809875487, + "step": 105960 + }, + { + "epoch": 0.3207664250147564, + "grad_norm": 0.10173893719911575, + "learning_rate": 8.516235787044768e-05, + "loss": 1.5786092758178711, + "step": 105970 + }, + { + "epoch": 0.32079669456510224, + "grad_norm": 0.11337791383266449, + "learning_rate": 8.515856268881029e-05, + "loss": 1.5380247116088868, + "step": 105980 + }, + { + "epoch": 0.3208269641154481, + "grad_norm": 0.10881630331277847, + "learning_rate": 8.515476750717289e-05, + "loss": 1.5831881523132325, + "step": 105990 + }, + { + "epoch": 0.3208572336657939, + "grad_norm": 0.11410406231880188, + "learning_rate": 8.515097232553551e-05, + "loss": 1.5546786308288574, + "step": 106000 + }, + { + "epoch": 0.3208572336657939, + "eval_loss": 1.5570862293243408, + "eval_runtime": 27.8931, + "eval_samples_per_second": 17.926, + "eval_steps_per_second": 1.147, + "step": 106000 + }, + { + "epoch": 0.32088750321613974, + "grad_norm": 0.09486523270606995, + "learning_rate": 8.51471771438981e-05, + "loss": 1.5670005798339843, + "step": 106010 + }, + { + "epoch": 0.32091777276648553, + "grad_norm": 0.11028598248958588, + "learning_rate": 8.514338196226072e-05, + "loss": 1.5738896369934081, + "step": 106020 + }, + { + "epoch": 0.3209480423168314, + "grad_norm": 0.10627409815788269, + "learning_rate": 8.513958678062332e-05, + "loss": 1.5553377151489258, + "step": 106030 + }, + { + "epoch": 0.32097831186717724, + "grad_norm": 0.1104721948504448, + "learning_rate": 8.513579159898594e-05, + "loss": 1.5735519409179688, + "step": 106040 + }, + { + "epoch": 0.32100858141752303, + "grad_norm": 0.11576709151268005, + "learning_rate": 8.513199641734853e-05, + "loss": 1.5688379287719727, + "step": 106050 + }, + { + "epoch": 0.3210388509678689, + "grad_norm": 0.10783635824918747, + "learning_rate": 8.512820123571115e-05, + "loss": 1.5458906173706055, + "step": 106060 + }, + { + "epoch": 0.3210691205182147, + "grad_norm": 0.10883255302906036, + "learning_rate": 8.512440605407374e-05, + "loss": 1.5449727058410645, + "step": 106070 + }, + { + "epoch": 0.32109939006856053, + "grad_norm": 0.1106882095336914, + "learning_rate": 8.512061087243636e-05, + "loss": 1.5344409942626953, + "step": 106080 + }, + { + "epoch": 0.3211296596189064, + "grad_norm": 0.11875692009925842, + "learning_rate": 8.511681569079897e-05, + "loss": 1.5416084289550782, + "step": 106090 + }, + { + "epoch": 0.3211599291692522, + "grad_norm": 0.10639870166778564, + "learning_rate": 8.511302050916157e-05, + "loss": 1.5165712356567382, + "step": 106100 + }, + { + "epoch": 0.32119019871959803, + "grad_norm": 0.11280059069395065, + "learning_rate": 8.510922532752418e-05, + "loss": 1.57442626953125, + "step": 106110 + }, + { + "epoch": 0.3212204682699438, + "grad_norm": 0.11293133348226547, + "learning_rate": 8.510543014588678e-05, + "loss": 1.5596952438354492, + "step": 106120 + }, + { + "epoch": 0.3212507378202897, + "grad_norm": 0.11410648375749588, + "learning_rate": 8.510163496424939e-05, + "loss": 1.5281763076782227, + "step": 106130 + }, + { + "epoch": 0.32128100737063553, + "grad_norm": 0.11409741640090942, + "learning_rate": 8.5097839782612e-05, + "loss": 1.513899040222168, + "step": 106140 + }, + { + "epoch": 0.3213112769209813, + "grad_norm": 0.1171032190322876, + "learning_rate": 8.50940446009746e-05, + "loss": 1.5507542610168457, + "step": 106150 + }, + { + "epoch": 0.3213415464713272, + "grad_norm": 0.1112365573644638, + "learning_rate": 8.509024941933721e-05, + "loss": 1.5518115997314452, + "step": 106160 + }, + { + "epoch": 0.321371816021673, + "grad_norm": 0.12204629927873611, + "learning_rate": 8.508645423769981e-05, + "loss": 1.5289036750793457, + "step": 106170 + }, + { + "epoch": 0.3214020855720188, + "grad_norm": 0.11374742537736893, + "learning_rate": 8.508265905606242e-05, + "loss": 1.5138086318969726, + "step": 106180 + }, + { + "epoch": 0.3214323551223647, + "grad_norm": 0.10101073235273361, + "learning_rate": 8.507886387442503e-05, + "loss": 1.5777570724487304, + "step": 106190 + }, + { + "epoch": 0.3214626246727105, + "grad_norm": 0.10228735953569412, + "learning_rate": 8.507506869278764e-05, + "loss": 1.5387233734130858, + "step": 106200 + }, + { + "epoch": 0.3214928942230563, + "grad_norm": 0.1149791032075882, + "learning_rate": 8.507127351115025e-05, + "loss": 1.5194235801696778, + "step": 106210 + }, + { + "epoch": 0.3215231637734021, + "grad_norm": 0.11526423692703247, + "learning_rate": 8.506747832951286e-05, + "loss": 1.5723729133605957, + "step": 106220 + }, + { + "epoch": 0.321553433323748, + "grad_norm": 0.11220034211874008, + "learning_rate": 8.506368314787546e-05, + "loss": 1.544118309020996, + "step": 106230 + }, + { + "epoch": 0.3215837028740938, + "grad_norm": 0.10909895598888397, + "learning_rate": 8.505988796623807e-05, + "loss": 1.5452367782592773, + "step": 106240 + }, + { + "epoch": 0.3216139724244396, + "grad_norm": 0.10939609259366989, + "learning_rate": 8.505609278460067e-05, + "loss": 1.562373733520508, + "step": 106250 + }, + { + "epoch": 0.32164424197478547, + "grad_norm": 0.1076945811510086, + "learning_rate": 8.505229760296328e-05, + "loss": 1.5657842636108399, + "step": 106260 + }, + { + "epoch": 0.32167451152513127, + "grad_norm": 0.12243221700191498, + "learning_rate": 8.504850242132589e-05, + "loss": 1.5234530448913575, + "step": 106270 + }, + { + "epoch": 0.3217047810754771, + "grad_norm": 0.1092420220375061, + "learning_rate": 8.504470723968849e-05, + "loss": 1.5900091171264648, + "step": 106280 + }, + { + "epoch": 0.32173505062582297, + "grad_norm": 0.12024569511413574, + "learning_rate": 8.50409120580511e-05, + "loss": 1.5756969451904297, + "step": 106290 + }, + { + "epoch": 0.32176532017616877, + "grad_norm": 0.10138319432735443, + "learning_rate": 8.50371168764137e-05, + "loss": 1.5395851135253906, + "step": 106300 + }, + { + "epoch": 0.3217955897265146, + "grad_norm": 0.11812000721693039, + "learning_rate": 8.503332169477631e-05, + "loss": 1.532406711578369, + "step": 106310 + }, + { + "epoch": 0.3218258592768604, + "grad_norm": 0.11539289355278015, + "learning_rate": 8.502952651313892e-05, + "loss": 1.5379634857177735, + "step": 106320 + }, + { + "epoch": 0.32185612882720627, + "grad_norm": 0.11453180760145187, + "learning_rate": 8.502573133150154e-05, + "loss": 1.5462878227233887, + "step": 106330 + }, + { + "epoch": 0.3218863983775521, + "grad_norm": 0.1329059898853302, + "learning_rate": 8.502193614986413e-05, + "loss": 1.5815832138061523, + "step": 106340 + }, + { + "epoch": 0.3219166679278979, + "grad_norm": 0.09810105711221695, + "learning_rate": 8.501814096822675e-05, + "loss": 1.5881454467773437, + "step": 106350 + }, + { + "epoch": 0.32194693747824377, + "grad_norm": 0.11626982688903809, + "learning_rate": 8.501434578658934e-05, + "loss": 1.5674413681030273, + "step": 106360 + }, + { + "epoch": 0.32197720702858956, + "grad_norm": 0.11771168559789658, + "learning_rate": 8.501055060495196e-05, + "loss": 1.589951515197754, + "step": 106370 + }, + { + "epoch": 0.3220074765789354, + "grad_norm": 0.1201709657907486, + "learning_rate": 8.500675542331455e-05, + "loss": 1.5134431838989257, + "step": 106380 + }, + { + "epoch": 0.32203774612928127, + "grad_norm": 0.10985226184129715, + "learning_rate": 8.500296024167717e-05, + "loss": 1.5607875823974608, + "step": 106390 + }, + { + "epoch": 0.32206801567962706, + "grad_norm": 0.10164063423871994, + "learning_rate": 8.499916506003978e-05, + "loss": 1.5492363929748536, + "step": 106400 + }, + { + "epoch": 0.3220982852299729, + "grad_norm": 0.13350504636764526, + "learning_rate": 8.499536987840238e-05, + "loss": 1.5550134658813477, + "step": 106410 + }, + { + "epoch": 0.32212855478031877, + "grad_norm": 0.11970862001180649, + "learning_rate": 8.499157469676499e-05, + "loss": 1.5474560737609864, + "step": 106420 + }, + { + "epoch": 0.32215882433066456, + "grad_norm": 0.10644534975290298, + "learning_rate": 8.49877795151276e-05, + "loss": 1.552459716796875, + "step": 106430 + }, + { + "epoch": 0.3221890938810104, + "grad_norm": 0.11564648151397705, + "learning_rate": 8.49839843334902e-05, + "loss": 1.5701852798461915, + "step": 106440 + }, + { + "epoch": 0.3222193634313562, + "grad_norm": 0.12614475190639496, + "learning_rate": 8.498018915185281e-05, + "loss": 1.5284626007080078, + "step": 106450 + }, + { + "epoch": 0.32224963298170206, + "grad_norm": 0.1105615496635437, + "learning_rate": 8.497639397021543e-05, + "loss": 1.538651466369629, + "step": 106460 + }, + { + "epoch": 0.3222799025320479, + "grad_norm": 0.11611470580101013, + "learning_rate": 8.497259878857802e-05, + "loss": 1.571647834777832, + "step": 106470 + }, + { + "epoch": 0.3223101720823937, + "grad_norm": 0.11692236363887787, + "learning_rate": 8.496880360694064e-05, + "loss": 1.5553504943847656, + "step": 106480 + }, + { + "epoch": 0.32234044163273956, + "grad_norm": 0.11686546355485916, + "learning_rate": 8.496500842530323e-05, + "loss": 1.5695749282836915, + "step": 106490 + }, + { + "epoch": 0.32237071118308536, + "grad_norm": 0.10779576748609543, + "learning_rate": 8.496121324366585e-05, + "loss": 1.5195222854614259, + "step": 106500 + }, + { + "epoch": 0.32237071118308536, + "eval_loss": 1.5439443588256836, + "eval_runtime": 28.2335, + "eval_samples_per_second": 17.709, + "eval_steps_per_second": 1.133, + "step": 106500 + }, + { + "epoch": 0.3224009807334312, + "grad_norm": 0.11867312341928482, + "learning_rate": 8.495741806202844e-05, + "loss": 1.5668254852294923, + "step": 106510 + }, + { + "epoch": 0.32243125028377706, + "grad_norm": 0.12211868166923523, + "learning_rate": 8.495362288039106e-05, + "loss": 1.5343557357788087, + "step": 106520 + }, + { + "epoch": 0.32246151983412286, + "grad_norm": 0.11769658327102661, + "learning_rate": 8.494982769875365e-05, + "loss": 1.5623104095458984, + "step": 106530 + }, + { + "epoch": 0.3224917893844687, + "grad_norm": 0.11095906794071198, + "learning_rate": 8.494603251711627e-05, + "loss": 1.540877628326416, + "step": 106540 + }, + { + "epoch": 0.3225220589348145, + "grad_norm": 0.117475226521492, + "learning_rate": 8.494223733547887e-05, + "loss": 1.5381696701049805, + "step": 106550 + }, + { + "epoch": 0.32255232848516036, + "grad_norm": 0.11498638242483139, + "learning_rate": 8.493844215384149e-05, + "loss": 1.5429018974304198, + "step": 106560 + }, + { + "epoch": 0.3225825980355062, + "grad_norm": 0.10007456690073013, + "learning_rate": 8.493464697220408e-05, + "loss": 1.5505709648132324, + "step": 106570 + }, + { + "epoch": 0.322612867585852, + "grad_norm": 0.11404284089803696, + "learning_rate": 8.49308517905667e-05, + "loss": 1.5823567390441895, + "step": 106580 + }, + { + "epoch": 0.32264313713619786, + "grad_norm": 0.10795072466135025, + "learning_rate": 8.49270566089293e-05, + "loss": 1.5128764152526855, + "step": 106590 + }, + { + "epoch": 0.32267340668654365, + "grad_norm": 0.11675482243299484, + "learning_rate": 8.492326142729191e-05, + "loss": 1.5237236976623536, + "step": 106600 + }, + { + "epoch": 0.3227036762368895, + "grad_norm": 0.11235693842172623, + "learning_rate": 8.491946624565453e-05, + "loss": 1.5388298034667969, + "step": 106610 + }, + { + "epoch": 0.32273394578723535, + "grad_norm": 0.11233869194984436, + "learning_rate": 8.491567106401712e-05, + "loss": 1.5315654754638672, + "step": 106620 + }, + { + "epoch": 0.32276421533758115, + "grad_norm": 0.11133656650781631, + "learning_rate": 8.491187588237974e-05, + "loss": 1.5399385452270509, + "step": 106630 + }, + { + "epoch": 0.322794484887927, + "grad_norm": 0.1207449808716774, + "learning_rate": 8.490808070074233e-05, + "loss": 1.5626633644104004, + "step": 106640 + }, + { + "epoch": 0.3228247544382728, + "grad_norm": 0.11901000142097473, + "learning_rate": 8.490428551910495e-05, + "loss": 1.5408803939819335, + "step": 106650 + }, + { + "epoch": 0.32285502398861865, + "grad_norm": 0.1115756705403328, + "learning_rate": 8.490049033746755e-05, + "loss": 1.5655376434326171, + "step": 106660 + }, + { + "epoch": 0.3228852935389645, + "grad_norm": 0.11716095358133316, + "learning_rate": 8.489669515583017e-05, + "loss": 1.5668794631958007, + "step": 106670 + }, + { + "epoch": 0.3229155630893103, + "grad_norm": 0.10822681337594986, + "learning_rate": 8.489289997419276e-05, + "loss": 1.5889164924621582, + "step": 106680 + }, + { + "epoch": 0.32294583263965615, + "grad_norm": 0.0958167314529419, + "learning_rate": 8.488910479255538e-05, + "loss": 1.5455169677734375, + "step": 106690 + }, + { + "epoch": 0.32297610219000195, + "grad_norm": 0.10554589331150055, + "learning_rate": 8.488530961091798e-05, + "loss": 1.5371669769287108, + "step": 106700 + }, + { + "epoch": 0.3230063717403478, + "grad_norm": 0.10553944110870361, + "learning_rate": 8.488151442928059e-05, + "loss": 1.5673852920532227, + "step": 106710 + }, + { + "epoch": 0.32303664129069365, + "grad_norm": 0.10387488454580307, + "learning_rate": 8.48777192476432e-05, + "loss": 1.6017200469970703, + "step": 106720 + }, + { + "epoch": 0.32306691084103945, + "grad_norm": 0.10426986217498779, + "learning_rate": 8.48739240660058e-05, + "loss": 1.5783924102783202, + "step": 106730 + }, + { + "epoch": 0.3230971803913853, + "grad_norm": 0.10834144800901413, + "learning_rate": 8.487012888436841e-05, + "loss": 1.5680047035217286, + "step": 106740 + }, + { + "epoch": 0.3231274499417311, + "grad_norm": 0.11442893743515015, + "learning_rate": 8.486633370273101e-05, + "loss": 1.5408290863037108, + "step": 106750 + }, + { + "epoch": 0.32315771949207694, + "grad_norm": 0.11557423323392868, + "learning_rate": 8.486253852109362e-05, + "loss": 1.5551794052124024, + "step": 106760 + }, + { + "epoch": 0.3231879890424228, + "grad_norm": 0.11496882885694504, + "learning_rate": 8.485874333945622e-05, + "loss": 1.5192216873168944, + "step": 106770 + }, + { + "epoch": 0.3232182585927686, + "grad_norm": 0.11363452672958374, + "learning_rate": 8.485494815781883e-05, + "loss": 1.5936914443969727, + "step": 106780 + }, + { + "epoch": 0.32324852814311444, + "grad_norm": 0.10924993455410004, + "learning_rate": 8.485115297618144e-05, + "loss": 1.5657931327819825, + "step": 106790 + }, + { + "epoch": 0.32327879769346024, + "grad_norm": 0.11586164683103561, + "learning_rate": 8.484735779454406e-05, + "loss": 1.5919208526611328, + "step": 106800 + }, + { + "epoch": 0.3233090672438061, + "grad_norm": 0.12748941779136658, + "learning_rate": 8.484356261290665e-05, + "loss": 1.5531999588012695, + "step": 106810 + }, + { + "epoch": 0.32333933679415194, + "grad_norm": 0.12431839853525162, + "learning_rate": 8.483976743126927e-05, + "loss": 1.5595535278320312, + "step": 106820 + }, + { + "epoch": 0.32336960634449774, + "grad_norm": 0.11440899968147278, + "learning_rate": 8.483597224963187e-05, + "loss": 1.5964069366455078, + "step": 106830 + }, + { + "epoch": 0.3233998758948436, + "grad_norm": 0.13042543828487396, + "learning_rate": 8.483217706799448e-05, + "loss": 1.5580744743347168, + "step": 106840 + }, + { + "epoch": 0.3234301454451894, + "grad_norm": 0.10387568175792694, + "learning_rate": 8.482838188635709e-05, + "loss": 1.5292070388793946, + "step": 106850 + }, + { + "epoch": 0.32346041499553524, + "grad_norm": 0.11672750860452652, + "learning_rate": 8.482458670471969e-05, + "loss": 1.5608912467956544, + "step": 106860 + }, + { + "epoch": 0.3234906845458811, + "grad_norm": 0.1171424612402916, + "learning_rate": 8.48207915230823e-05, + "loss": 1.5775615692138671, + "step": 106870 + }, + { + "epoch": 0.3235209540962269, + "grad_norm": 0.1118973046541214, + "learning_rate": 8.48169963414449e-05, + "loss": 1.5377522468566895, + "step": 106880 + }, + { + "epoch": 0.32355122364657274, + "grad_norm": 0.11076196283102036, + "learning_rate": 8.481320115980751e-05, + "loss": 1.524721908569336, + "step": 106890 + }, + { + "epoch": 0.32358149319691853, + "grad_norm": 0.09981495887041092, + "learning_rate": 8.480940597817012e-05, + "loss": 1.5524277687072754, + "step": 106900 + }, + { + "epoch": 0.3236117627472644, + "grad_norm": 0.10769465565681458, + "learning_rate": 8.480561079653272e-05, + "loss": 1.6104759216308593, + "step": 106910 + }, + { + "epoch": 0.32364203229761024, + "grad_norm": 0.11536907404661179, + "learning_rate": 8.480181561489533e-05, + "loss": 1.551786518096924, + "step": 106920 + }, + { + "epoch": 0.32367230184795603, + "grad_norm": 0.11508223414421082, + "learning_rate": 8.479802043325793e-05, + "loss": 1.5417948722839356, + "step": 106930 + }, + { + "epoch": 0.3237025713983019, + "grad_norm": 0.10949062556028366, + "learning_rate": 8.479422525162055e-05, + "loss": 1.5400715827941895, + "step": 106940 + }, + { + "epoch": 0.3237328409486477, + "grad_norm": 0.12780046463012695, + "learning_rate": 8.479043006998315e-05, + "loss": 1.5572460174560547, + "step": 106950 + }, + { + "epoch": 0.32376311049899353, + "grad_norm": 0.10532614588737488, + "learning_rate": 8.478663488834576e-05, + "loss": 1.5925262451171875, + "step": 106960 + }, + { + "epoch": 0.3237933800493394, + "grad_norm": 0.11363811790943146, + "learning_rate": 8.478283970670836e-05, + "loss": 1.535608673095703, + "step": 106970 + }, + { + "epoch": 0.3238236495996852, + "grad_norm": 0.1064986139535904, + "learning_rate": 8.477904452507098e-05, + "loss": 1.5662267684936524, + "step": 106980 + }, + { + "epoch": 0.32385391915003103, + "grad_norm": 0.13605080544948578, + "learning_rate": 8.477524934343357e-05, + "loss": 1.564737606048584, + "step": 106990 + }, + { + "epoch": 0.32388418870037683, + "grad_norm": 0.11892124265432358, + "learning_rate": 8.477145416179619e-05, + "loss": 1.5836868286132812, + "step": 107000 + }, + { + "epoch": 0.32388418870037683, + "eval_loss": 1.5385884046554565, + "eval_runtime": 28.2236, + "eval_samples_per_second": 17.716, + "eval_steps_per_second": 1.134, + "step": 107000 + }, + { + "epoch": 0.3239144582507227, + "grad_norm": 0.12471079081296921, + "learning_rate": 8.47676589801588e-05, + "loss": 1.5216588973999023, + "step": 107010 + }, + { + "epoch": 0.32394472780106853, + "grad_norm": 0.10641510784626007, + "learning_rate": 8.47638637985214e-05, + "loss": 1.5872157096862793, + "step": 107020 + }, + { + "epoch": 0.32397499735141433, + "grad_norm": 0.11124477535486221, + "learning_rate": 8.4760068616884e-05, + "loss": 1.5443248748779297, + "step": 107030 + }, + { + "epoch": 0.3240052669017602, + "grad_norm": 0.11337310820817947, + "learning_rate": 8.475627343524661e-05, + "loss": 1.5534870147705078, + "step": 107040 + }, + { + "epoch": 0.324035536452106, + "grad_norm": 0.12784667313098907, + "learning_rate": 8.475247825360922e-05, + "loss": 1.5171402931213378, + "step": 107050 + }, + { + "epoch": 0.32406580600245183, + "grad_norm": 0.10369577258825302, + "learning_rate": 8.474868307197182e-05, + "loss": 1.589005470275879, + "step": 107060 + }, + { + "epoch": 0.3240960755527977, + "grad_norm": 0.10469701141119003, + "learning_rate": 8.474488789033444e-05, + "loss": 1.5668907165527344, + "step": 107070 + }, + { + "epoch": 0.3241263451031435, + "grad_norm": 0.10203417390584946, + "learning_rate": 8.474109270869704e-05, + "loss": 1.553495216369629, + "step": 107080 + }, + { + "epoch": 0.32415661465348933, + "grad_norm": 0.10874111205339432, + "learning_rate": 8.473729752705966e-05, + "loss": 1.5558479309082032, + "step": 107090 + }, + { + "epoch": 0.3241868842038352, + "grad_norm": 0.10756529122591019, + "learning_rate": 8.473350234542225e-05, + "loss": 1.5283611297607422, + "step": 107100 + }, + { + "epoch": 0.324217153754181, + "grad_norm": 0.11009801924228668, + "learning_rate": 8.472970716378487e-05, + "loss": 1.584954833984375, + "step": 107110 + }, + { + "epoch": 0.3242474233045268, + "grad_norm": 0.10314839333295822, + "learning_rate": 8.472591198214746e-05, + "loss": 1.5635544776916503, + "step": 107120 + }, + { + "epoch": 0.3242776928548726, + "grad_norm": 0.10924997925758362, + "learning_rate": 8.472211680051008e-05, + "loss": 1.5404080390930175, + "step": 107130 + }, + { + "epoch": 0.3243079624052185, + "grad_norm": 0.12400630861520767, + "learning_rate": 8.471832161887267e-05, + "loss": 1.5432361602783202, + "step": 107140 + }, + { + "epoch": 0.3243382319555643, + "grad_norm": 0.11889307200908661, + "learning_rate": 8.471452643723529e-05, + "loss": 1.5439685821533202, + "step": 107150 + }, + { + "epoch": 0.3243685015059101, + "grad_norm": 0.11474008858203888, + "learning_rate": 8.471073125559788e-05, + "loss": 1.536912727355957, + "step": 107160 + }, + { + "epoch": 0.324398771056256, + "grad_norm": 0.11346041411161423, + "learning_rate": 8.47069360739605e-05, + "loss": 1.5597923278808594, + "step": 107170 + }, + { + "epoch": 0.32442904060660177, + "grad_norm": 0.12192367762327194, + "learning_rate": 8.47031408923231e-05, + "loss": 1.5781538009643554, + "step": 107180 + }, + { + "epoch": 0.3244593101569476, + "grad_norm": 0.12481117248535156, + "learning_rate": 8.469934571068572e-05, + "loss": 1.56346435546875, + "step": 107190 + }, + { + "epoch": 0.3244895797072935, + "grad_norm": 0.1208580732345581, + "learning_rate": 8.469555052904832e-05, + "loss": 1.5612474441528321, + "step": 107200 + }, + { + "epoch": 0.32451984925763927, + "grad_norm": 0.11643679440021515, + "learning_rate": 8.469175534741093e-05, + "loss": 1.5756298065185548, + "step": 107210 + }, + { + "epoch": 0.3245501188079851, + "grad_norm": 0.11326640844345093, + "learning_rate": 8.468796016577355e-05, + "loss": 1.545234775543213, + "step": 107220 + }, + { + "epoch": 0.3245803883583309, + "grad_norm": 0.11485711485147476, + "learning_rate": 8.468416498413614e-05, + "loss": 1.5580665588378906, + "step": 107230 + }, + { + "epoch": 0.32461065790867677, + "grad_norm": 0.10990265756845474, + "learning_rate": 8.468036980249876e-05, + "loss": 1.5678682327270508, + "step": 107240 + }, + { + "epoch": 0.3246409274590226, + "grad_norm": 0.11614816635847092, + "learning_rate": 8.467657462086135e-05, + "loss": 1.5422563552856445, + "step": 107250 + }, + { + "epoch": 0.3246711970093684, + "grad_norm": 0.11818113178014755, + "learning_rate": 8.467277943922397e-05, + "loss": 1.5854723930358887, + "step": 107260 + }, + { + "epoch": 0.32470146655971427, + "grad_norm": 0.10339582711458206, + "learning_rate": 8.466898425758656e-05, + "loss": 1.554800796508789, + "step": 107270 + }, + { + "epoch": 0.32473173611006007, + "grad_norm": 0.1281658113002777, + "learning_rate": 8.466518907594918e-05, + "loss": 1.5418347358703612, + "step": 107280 + }, + { + "epoch": 0.3247620056604059, + "grad_norm": 0.11616236716508865, + "learning_rate": 8.466139389431177e-05, + "loss": 1.5352907180786133, + "step": 107290 + }, + { + "epoch": 0.32479227521075177, + "grad_norm": 0.11255808919668198, + "learning_rate": 8.46575987126744e-05, + "loss": 1.5252331733703612, + "step": 107300 + }, + { + "epoch": 0.32482254476109756, + "grad_norm": 0.10354077070951462, + "learning_rate": 8.4653803531037e-05, + "loss": 1.5227954864501954, + "step": 107310 + }, + { + "epoch": 0.3248528143114434, + "grad_norm": 0.11481250077486038, + "learning_rate": 8.46500083493996e-05, + "loss": 1.5750036239624023, + "step": 107320 + }, + { + "epoch": 0.3248830838617892, + "grad_norm": 0.1213885098695755, + "learning_rate": 8.464621316776221e-05, + "loss": 1.542110061645508, + "step": 107330 + }, + { + "epoch": 0.32491335341213506, + "grad_norm": 0.1071503683924675, + "learning_rate": 8.464241798612482e-05, + "loss": 1.5549111366271973, + "step": 107340 + }, + { + "epoch": 0.3249436229624809, + "grad_norm": 0.11294865608215332, + "learning_rate": 8.463862280448742e-05, + "loss": 1.5679450988769532, + "step": 107350 + }, + { + "epoch": 0.3249738925128267, + "grad_norm": 0.11730390787124634, + "learning_rate": 8.463482762285003e-05, + "loss": 1.5406545639038085, + "step": 107360 + }, + { + "epoch": 0.32500416206317256, + "grad_norm": 0.11045054346323013, + "learning_rate": 8.463103244121264e-05, + "loss": 1.5496764183044434, + "step": 107370 + }, + { + "epoch": 0.32503443161351836, + "grad_norm": 0.10334525257349014, + "learning_rate": 8.462723725957524e-05, + "loss": 1.5966235160827638, + "step": 107380 + }, + { + "epoch": 0.3250647011638642, + "grad_norm": 0.10933810472488403, + "learning_rate": 8.462344207793785e-05, + "loss": 1.5307701110839844, + "step": 107390 + }, + { + "epoch": 0.32509497071421006, + "grad_norm": 0.10926332324743271, + "learning_rate": 8.461964689630045e-05, + "loss": 1.5912405967712402, + "step": 107400 + }, + { + "epoch": 0.32512524026455586, + "grad_norm": 0.12780249118804932, + "learning_rate": 8.461585171466307e-05, + "loss": 1.5593975067138672, + "step": 107410 + }, + { + "epoch": 0.3251555098149017, + "grad_norm": 0.10614150017499924, + "learning_rate": 8.461205653302567e-05, + "loss": 1.5558658599853517, + "step": 107420 + }, + { + "epoch": 0.3251857793652475, + "grad_norm": 0.11192335933446884, + "learning_rate": 8.460826135138829e-05, + "loss": 1.5638051986694337, + "step": 107430 + }, + { + "epoch": 0.32521604891559336, + "grad_norm": 0.12209174782037735, + "learning_rate": 8.460446616975089e-05, + "loss": 1.5488430976867675, + "step": 107440 + }, + { + "epoch": 0.3252463184659392, + "grad_norm": 0.0941944420337677, + "learning_rate": 8.46006709881135e-05, + "loss": 1.5415910720825194, + "step": 107450 + }, + { + "epoch": 0.325276588016285, + "grad_norm": 0.12029748409986496, + "learning_rate": 8.45968758064761e-05, + "loss": 1.5315080642700196, + "step": 107460 + }, + { + "epoch": 0.32530685756663086, + "grad_norm": 0.11404424905776978, + "learning_rate": 8.459308062483871e-05, + "loss": 1.5347302436828614, + "step": 107470 + }, + { + "epoch": 0.32533712711697665, + "grad_norm": 0.11220111697912216, + "learning_rate": 8.458928544320132e-05, + "loss": 1.571141242980957, + "step": 107480 + }, + { + "epoch": 0.3253673966673225, + "grad_norm": 0.10344121605157852, + "learning_rate": 8.458549026156392e-05, + "loss": 1.569166088104248, + "step": 107490 + }, + { + "epoch": 0.32539766621766836, + "grad_norm": 0.11755859106779099, + "learning_rate": 8.458169507992653e-05, + "loss": 1.538792037963867, + "step": 107500 + }, + { + "epoch": 0.32539766621766836, + "eval_loss": 1.538748025894165, + "eval_runtime": 28.1816, + "eval_samples_per_second": 17.742, + "eval_steps_per_second": 1.135, + "step": 107500 + }, + { + "epoch": 0.32542793576801415, + "grad_norm": 0.1133030503988266, + "learning_rate": 8.457789989828913e-05, + "loss": 1.554861068725586, + "step": 107510 + }, + { + "epoch": 0.32545820531836, + "grad_norm": 0.11825347691774368, + "learning_rate": 8.457410471665174e-05, + "loss": 1.560710334777832, + "step": 107520 + }, + { + "epoch": 0.3254884748687058, + "grad_norm": 0.10539019852876663, + "learning_rate": 8.457030953501434e-05, + "loss": 1.5687023162841798, + "step": 107530 + }, + { + "epoch": 0.32551874441905165, + "grad_norm": 0.10977381467819214, + "learning_rate": 8.456651435337695e-05, + "loss": 1.5677387237548828, + "step": 107540 + }, + { + "epoch": 0.3255490139693975, + "grad_norm": 0.10628235340118408, + "learning_rate": 8.456271917173956e-05, + "loss": 1.57161283493042, + "step": 107550 + }, + { + "epoch": 0.3255792835197433, + "grad_norm": 0.11348819732666016, + "learning_rate": 8.455892399010216e-05, + "loss": 1.5557485580444337, + "step": 107560 + }, + { + "epoch": 0.32560955307008915, + "grad_norm": 0.11463379114866257, + "learning_rate": 8.455512880846478e-05, + "loss": 1.5582508087158202, + "step": 107570 + }, + { + "epoch": 0.32563982262043495, + "grad_norm": 0.10963170230388641, + "learning_rate": 8.455133362682737e-05, + "loss": 1.5752294540405274, + "step": 107580 + }, + { + "epoch": 0.3256700921707808, + "grad_norm": 0.10170713067054749, + "learning_rate": 8.454753844519e-05, + "loss": 1.5460655212402343, + "step": 107590 + }, + { + "epoch": 0.32570036172112665, + "grad_norm": 0.12245398759841919, + "learning_rate": 8.454374326355259e-05, + "loss": 1.5794877052307128, + "step": 107600 + }, + { + "epoch": 0.32573063127147245, + "grad_norm": 0.12137900292873383, + "learning_rate": 8.45399480819152e-05, + "loss": 1.5979642868041992, + "step": 107610 + }, + { + "epoch": 0.3257609008218183, + "grad_norm": 0.10457263141870499, + "learning_rate": 8.453615290027781e-05, + "loss": 1.5776042938232422, + "step": 107620 + }, + { + "epoch": 0.3257911703721641, + "grad_norm": 0.11913798004388809, + "learning_rate": 8.453235771864042e-05, + "loss": 1.5525825500488282, + "step": 107630 + }, + { + "epoch": 0.32582143992250995, + "grad_norm": 0.11477978527545929, + "learning_rate": 8.452856253700302e-05, + "loss": 1.5853447914123535, + "step": 107640 + }, + { + "epoch": 0.3258517094728558, + "grad_norm": 0.10394687950611115, + "learning_rate": 8.452476735536563e-05, + "loss": 1.5566933631896973, + "step": 107650 + }, + { + "epoch": 0.3258819790232016, + "grad_norm": 0.14654266834259033, + "learning_rate": 8.452097217372824e-05, + "loss": 1.5678558349609375, + "step": 107660 + }, + { + "epoch": 0.32591224857354745, + "grad_norm": 0.10717437416315079, + "learning_rate": 8.451717699209084e-05, + "loss": 1.5341312408447265, + "step": 107670 + }, + { + "epoch": 0.32594251812389324, + "grad_norm": 0.11599712073802948, + "learning_rate": 8.451338181045346e-05, + "loss": 1.5129230499267579, + "step": 107680 + }, + { + "epoch": 0.3259727876742391, + "grad_norm": 0.10763084143400192, + "learning_rate": 8.450958662881605e-05, + "loss": 1.5616184234619142, + "step": 107690 + }, + { + "epoch": 0.32600305722458495, + "grad_norm": 0.12657296657562256, + "learning_rate": 8.450579144717867e-05, + "loss": 1.558635139465332, + "step": 107700 + }, + { + "epoch": 0.32603332677493074, + "grad_norm": 0.11776208132505417, + "learning_rate": 8.450199626554127e-05, + "loss": 1.5277236938476562, + "step": 107710 + }, + { + "epoch": 0.3260635963252766, + "grad_norm": 0.12417036294937134, + "learning_rate": 8.449820108390388e-05, + "loss": 1.5569921493530274, + "step": 107720 + }, + { + "epoch": 0.3260938658756224, + "grad_norm": 0.11944499611854553, + "learning_rate": 8.449440590226648e-05, + "loss": 1.538112258911133, + "step": 107730 + }, + { + "epoch": 0.32612413542596824, + "grad_norm": 0.10445145517587662, + "learning_rate": 8.44906107206291e-05, + "loss": 1.518389892578125, + "step": 107740 + }, + { + "epoch": 0.3261544049763141, + "grad_norm": 0.111770398914814, + "learning_rate": 8.448681553899169e-05, + "loss": 1.5299121856689453, + "step": 107750 + }, + { + "epoch": 0.3261846745266599, + "grad_norm": 0.10530165582895279, + "learning_rate": 8.448302035735431e-05, + "loss": 1.544790267944336, + "step": 107760 + }, + { + "epoch": 0.32621494407700574, + "grad_norm": 0.10265879333019257, + "learning_rate": 8.44792251757169e-05, + "loss": 1.5427715301513671, + "step": 107770 + }, + { + "epoch": 0.32624521362735154, + "grad_norm": 0.10993529856204987, + "learning_rate": 8.447542999407952e-05, + "loss": 1.5170347213745117, + "step": 107780 + }, + { + "epoch": 0.3262754831776974, + "grad_norm": 0.11560960859060287, + "learning_rate": 8.447163481244211e-05, + "loss": 1.544219398498535, + "step": 107790 + }, + { + "epoch": 0.32630575272804324, + "grad_norm": 0.10527660697698593, + "learning_rate": 8.446783963080473e-05, + "loss": 1.6027734756469727, + "step": 107800 + }, + { + "epoch": 0.32633602227838904, + "grad_norm": 0.1276099979877472, + "learning_rate": 8.446404444916734e-05, + "loss": 1.5751089096069335, + "step": 107810 + }, + { + "epoch": 0.3263662918287349, + "grad_norm": 0.10276598483324051, + "learning_rate": 8.446024926752994e-05, + "loss": 1.5524664878845216, + "step": 107820 + }, + { + "epoch": 0.32639656137908074, + "grad_norm": 0.11492718756198883, + "learning_rate": 8.445645408589256e-05, + "loss": 1.533485221862793, + "step": 107830 + }, + { + "epoch": 0.32642683092942654, + "grad_norm": 0.11796115338802338, + "learning_rate": 8.445265890425516e-05, + "loss": 1.5267412185668945, + "step": 107840 + }, + { + "epoch": 0.3264571004797724, + "grad_norm": 0.1267784833908081, + "learning_rate": 8.444886372261778e-05, + "loss": 1.5875677108764648, + "step": 107850 + }, + { + "epoch": 0.3264873700301182, + "grad_norm": 0.13662345707416534, + "learning_rate": 8.444506854098037e-05, + "loss": 1.5257403373718261, + "step": 107860 + }, + { + "epoch": 0.32651763958046404, + "grad_norm": 0.12554463744163513, + "learning_rate": 8.444127335934299e-05, + "loss": 1.5438475608825684, + "step": 107870 + }, + { + "epoch": 0.3265479091308099, + "grad_norm": 0.10923077911138535, + "learning_rate": 8.443747817770558e-05, + "loss": 1.5430798530578613, + "step": 107880 + }, + { + "epoch": 0.3265781786811557, + "grad_norm": 0.11882461607456207, + "learning_rate": 8.44336829960682e-05, + "loss": 1.5494384765625, + "step": 107890 + }, + { + "epoch": 0.32660844823150154, + "grad_norm": 0.11180181801319122, + "learning_rate": 8.442988781443079e-05, + "loss": 1.5778886795043945, + "step": 107900 + }, + { + "epoch": 0.32663871778184733, + "grad_norm": 0.12285012006759644, + "learning_rate": 8.442609263279341e-05, + "loss": 1.5758331298828125, + "step": 107910 + }, + { + "epoch": 0.3266689873321932, + "grad_norm": 0.10411880910396576, + "learning_rate": 8.4422297451156e-05, + "loss": 1.5571365356445312, + "step": 107920 + }, + { + "epoch": 0.32669925688253904, + "grad_norm": 0.10932592302560806, + "learning_rate": 8.441850226951862e-05, + "loss": 1.504819679260254, + "step": 107930 + }, + { + "epoch": 0.32672952643288483, + "grad_norm": 0.11828684061765671, + "learning_rate": 8.441470708788123e-05, + "loss": 1.5435302734375, + "step": 107940 + }, + { + "epoch": 0.3267597959832307, + "grad_norm": 0.11122605949640274, + "learning_rate": 8.441091190624384e-05, + "loss": 1.5349667549133301, + "step": 107950 + }, + { + "epoch": 0.3267900655335765, + "grad_norm": 0.11019597947597504, + "learning_rate": 8.440711672460644e-05, + "loss": 1.5461267471313476, + "step": 107960 + }, + { + "epoch": 0.32682033508392233, + "grad_norm": 0.11236410588026047, + "learning_rate": 8.440332154296905e-05, + "loss": 1.5136045455932616, + "step": 107970 + }, + { + "epoch": 0.3268506046342682, + "grad_norm": 0.11793636530637741, + "learning_rate": 8.439952636133165e-05, + "loss": 1.5276708602905273, + "step": 107980 + }, + { + "epoch": 0.326880874184614, + "grad_norm": 0.11715684831142426, + "learning_rate": 8.439573117969426e-05, + "loss": 1.5610424041748048, + "step": 107990 + }, + { + "epoch": 0.32691114373495983, + "grad_norm": 0.1047619953751564, + "learning_rate": 8.439193599805687e-05, + "loss": 1.5552370071411132, + "step": 108000 + }, + { + "epoch": 0.32691114373495983, + "eval_loss": 1.5408329963684082, + "eval_runtime": 28.1166, + "eval_samples_per_second": 17.783, + "eval_steps_per_second": 1.138, + "step": 108000 + }, + { + "epoch": 0.3269414132853056, + "grad_norm": 0.1019783690571785, + "learning_rate": 8.438814081641947e-05, + "loss": 1.5494562149047852, + "step": 108010 + }, + { + "epoch": 0.3269716828356515, + "grad_norm": 0.11102164536714554, + "learning_rate": 8.438434563478209e-05, + "loss": 1.5376250267028808, + "step": 108020 + }, + { + "epoch": 0.32700195238599733, + "grad_norm": 0.10154542326927185, + "learning_rate": 8.438055045314468e-05, + "loss": 1.5833839416503905, + "step": 108030 + }, + { + "epoch": 0.3270322219363431, + "grad_norm": 0.12226877361536026, + "learning_rate": 8.43767552715073e-05, + "loss": 1.570855712890625, + "step": 108040 + }, + { + "epoch": 0.327062491486689, + "grad_norm": 0.11692699790000916, + "learning_rate": 8.437296008986991e-05, + "loss": 1.5553779602050781, + "step": 108050 + }, + { + "epoch": 0.3270927610370348, + "grad_norm": 0.10861697047948837, + "learning_rate": 8.436916490823251e-05, + "loss": 1.5406171798706054, + "step": 108060 + }, + { + "epoch": 0.3271230305873806, + "grad_norm": 0.1104934960603714, + "learning_rate": 8.436536972659512e-05, + "loss": 1.5513764381408692, + "step": 108070 + }, + { + "epoch": 0.3271533001377265, + "grad_norm": 0.11172476410865784, + "learning_rate": 8.436157454495773e-05, + "loss": 1.5439861297607422, + "step": 108080 + }, + { + "epoch": 0.3271835696880723, + "grad_norm": 0.10948118567466736, + "learning_rate": 8.435777936332033e-05, + "loss": 1.5464941024780274, + "step": 108090 + }, + { + "epoch": 0.3272138392384181, + "grad_norm": 0.10705840587615967, + "learning_rate": 8.435398418168294e-05, + "loss": 1.5606313705444337, + "step": 108100 + }, + { + "epoch": 0.3272441087887639, + "grad_norm": 0.1063515916466713, + "learning_rate": 8.435018900004554e-05, + "loss": 1.598567008972168, + "step": 108110 + }, + { + "epoch": 0.3272743783391098, + "grad_norm": 0.11009655892848969, + "learning_rate": 8.434639381840815e-05, + "loss": 1.533134365081787, + "step": 108120 + }, + { + "epoch": 0.3273046478894556, + "grad_norm": 0.10362493991851807, + "learning_rate": 8.434259863677076e-05, + "loss": 1.5226015090942382, + "step": 108130 + }, + { + "epoch": 0.3273349174398014, + "grad_norm": 0.10555144399404526, + "learning_rate": 8.433880345513336e-05, + "loss": 1.5546032905578613, + "step": 108140 + }, + { + "epoch": 0.32736518699014727, + "grad_norm": 0.11467213928699493, + "learning_rate": 8.433500827349597e-05, + "loss": 1.5605081558227538, + "step": 108150 + }, + { + "epoch": 0.32739545654049307, + "grad_norm": 0.11044991761445999, + "learning_rate": 8.433121309185857e-05, + "loss": 1.5583667755126953, + "step": 108160 + }, + { + "epoch": 0.3274257260908389, + "grad_norm": 0.13188335299491882, + "learning_rate": 8.432741791022118e-05, + "loss": 1.5443460464477539, + "step": 108170 + }, + { + "epoch": 0.32745599564118477, + "grad_norm": 0.11276594549417496, + "learning_rate": 8.43236227285838e-05, + "loss": 1.5033687591552733, + "step": 108180 + }, + { + "epoch": 0.32748626519153057, + "grad_norm": 0.10983823239803314, + "learning_rate": 8.431982754694639e-05, + "loss": 1.5414864540100097, + "step": 108190 + }, + { + "epoch": 0.3275165347418764, + "grad_norm": 0.10996516793966293, + "learning_rate": 8.431603236530901e-05, + "loss": 1.5276060104370117, + "step": 108200 + }, + { + "epoch": 0.3275468042922222, + "grad_norm": 0.12070051580667496, + "learning_rate": 8.43122371836716e-05, + "loss": 1.5966163635253907, + "step": 108210 + }, + { + "epoch": 0.32757707384256807, + "grad_norm": 0.11199315637350082, + "learning_rate": 8.430844200203422e-05, + "loss": 1.5129959106445312, + "step": 108220 + }, + { + "epoch": 0.3276073433929139, + "grad_norm": 0.11434880644083023, + "learning_rate": 8.430464682039683e-05, + "loss": 1.5719952583312988, + "step": 108230 + }, + { + "epoch": 0.3276376129432597, + "grad_norm": 0.1162671446800232, + "learning_rate": 8.430085163875943e-05, + "loss": 1.5397161483764648, + "step": 108240 + }, + { + "epoch": 0.32766788249360557, + "grad_norm": 0.10685240477323532, + "learning_rate": 8.429705645712204e-05, + "loss": 1.5828887939453125, + "step": 108250 + }, + { + "epoch": 0.32769815204395136, + "grad_norm": 0.11138104647397995, + "learning_rate": 8.429326127548465e-05, + "loss": 1.567145824432373, + "step": 108260 + }, + { + "epoch": 0.3277284215942972, + "grad_norm": 0.10546175390481949, + "learning_rate": 8.428946609384725e-05, + "loss": 1.5405792236328124, + "step": 108270 + }, + { + "epoch": 0.32775869114464307, + "grad_norm": 0.10899098217487335, + "learning_rate": 8.428567091220986e-05, + "loss": 1.5588314056396484, + "step": 108280 + }, + { + "epoch": 0.32778896069498886, + "grad_norm": 0.11578428745269775, + "learning_rate": 8.428187573057246e-05, + "loss": 1.5275001525878906, + "step": 108290 + }, + { + "epoch": 0.3278192302453347, + "grad_norm": 0.11305437237024307, + "learning_rate": 8.427808054893507e-05, + "loss": 1.547550582885742, + "step": 108300 + }, + { + "epoch": 0.3278494997956805, + "grad_norm": 0.11474453657865524, + "learning_rate": 8.427428536729769e-05, + "loss": 1.5462251663208009, + "step": 108310 + }, + { + "epoch": 0.32787976934602636, + "grad_norm": 0.11327585577964783, + "learning_rate": 8.427049018566028e-05, + "loss": 1.525139617919922, + "step": 108320 + }, + { + "epoch": 0.3279100388963722, + "grad_norm": 0.11547761410474777, + "learning_rate": 8.42666950040229e-05, + "loss": 1.541559600830078, + "step": 108330 + }, + { + "epoch": 0.327940308446718, + "grad_norm": 0.09859748929738998, + "learning_rate": 8.42628998223855e-05, + "loss": 1.5168553352355958, + "step": 108340 + }, + { + "epoch": 0.32797057799706386, + "grad_norm": 0.10469634085893631, + "learning_rate": 8.425910464074811e-05, + "loss": 1.551607894897461, + "step": 108350 + }, + { + "epoch": 0.32800084754740966, + "grad_norm": 0.10012280941009521, + "learning_rate": 8.42553094591107e-05, + "loss": 1.5504436492919922, + "step": 108360 + }, + { + "epoch": 0.3280311170977555, + "grad_norm": 0.11465339362621307, + "learning_rate": 8.425151427747333e-05, + "loss": 1.5635353088378907, + "step": 108370 + }, + { + "epoch": 0.32806138664810136, + "grad_norm": 0.1084839254617691, + "learning_rate": 8.424771909583592e-05, + "loss": 1.523574447631836, + "step": 108380 + }, + { + "epoch": 0.32809165619844716, + "grad_norm": 0.1081690713763237, + "learning_rate": 8.424392391419854e-05, + "loss": 1.5573450088500977, + "step": 108390 + }, + { + "epoch": 0.328121925748793, + "grad_norm": 0.11044526100158691, + "learning_rate": 8.424012873256113e-05, + "loss": 1.5354844093322755, + "step": 108400 + }, + { + "epoch": 0.3281521952991388, + "grad_norm": 0.1119207814335823, + "learning_rate": 8.423633355092375e-05, + "loss": 1.5195676803588867, + "step": 108410 + }, + { + "epoch": 0.32818246484948466, + "grad_norm": 0.1058531105518341, + "learning_rate": 8.423253836928636e-05, + "loss": 1.5515607833862304, + "step": 108420 + }, + { + "epoch": 0.3282127343998305, + "grad_norm": 0.11427891254425049, + "learning_rate": 8.422874318764896e-05, + "loss": 1.53477144241333, + "step": 108430 + }, + { + "epoch": 0.3282430039501763, + "grad_norm": 0.11928419768810272, + "learning_rate": 8.422494800601158e-05, + "loss": 1.5574256896972656, + "step": 108440 + }, + { + "epoch": 0.32827327350052216, + "grad_norm": 0.12337780743837357, + "learning_rate": 8.422115282437417e-05, + "loss": 1.5318540573120116, + "step": 108450 + }, + { + "epoch": 0.32830354305086795, + "grad_norm": 0.10124904662370682, + "learning_rate": 8.421735764273679e-05, + "loss": 1.5730920791625977, + "step": 108460 + }, + { + "epoch": 0.3283338126012138, + "grad_norm": 0.10943453013896942, + "learning_rate": 8.421356246109939e-05, + "loss": 1.5922075271606446, + "step": 108470 + }, + { + "epoch": 0.32836408215155966, + "grad_norm": 0.10955066978931427, + "learning_rate": 8.4209767279462e-05, + "loss": 1.539773941040039, + "step": 108480 + }, + { + "epoch": 0.32839435170190545, + "grad_norm": 0.11587370932102203, + "learning_rate": 8.42059720978246e-05, + "loss": 1.548246192932129, + "step": 108490 + }, + { + "epoch": 0.3284246212522513, + "grad_norm": 0.10974331200122833, + "learning_rate": 8.420217691618722e-05, + "loss": 1.533061408996582, + "step": 108500 + }, + { + "epoch": 0.3284246212522513, + "eval_loss": 1.5374919176101685, + "eval_runtime": 28.0143, + "eval_samples_per_second": 17.848, + "eval_steps_per_second": 1.142, + "step": 108500 + }, + { + "epoch": 0.32845489080259715, + "grad_norm": 0.10851246863603592, + "learning_rate": 8.419838173454981e-05, + "loss": 1.508121871948242, + "step": 108510 + }, + { + "epoch": 0.32848516035294295, + "grad_norm": 0.11471357941627502, + "learning_rate": 8.419458655291243e-05, + "loss": 1.571352767944336, + "step": 108520 + }, + { + "epoch": 0.3285154299032888, + "grad_norm": 0.10717874765396118, + "learning_rate": 8.419079137127502e-05, + "loss": 1.5697340011596679, + "step": 108530 + }, + { + "epoch": 0.3285456994536346, + "grad_norm": 0.11609414219856262, + "learning_rate": 8.418699618963764e-05, + "loss": 1.559580135345459, + "step": 108540 + }, + { + "epoch": 0.32857596900398045, + "grad_norm": 0.11996123939752579, + "learning_rate": 8.418320100800025e-05, + "loss": 1.5472976684570312, + "step": 108550 + }, + { + "epoch": 0.3286062385543263, + "grad_norm": 0.1015649363398552, + "learning_rate": 8.417940582636285e-05, + "loss": 1.5151567459106445, + "step": 108560 + }, + { + "epoch": 0.3286365081046721, + "grad_norm": 0.11671001464128494, + "learning_rate": 8.417561064472546e-05, + "loss": 1.558837890625, + "step": 108570 + }, + { + "epoch": 0.32866677765501795, + "grad_norm": 0.1191878616809845, + "learning_rate": 8.417181546308806e-05, + "loss": 1.600827407836914, + "step": 108580 + }, + { + "epoch": 0.32869704720536375, + "grad_norm": 0.11506853252649307, + "learning_rate": 8.416802028145067e-05, + "loss": 1.5622724533081054, + "step": 108590 + }, + { + "epoch": 0.3287273167557096, + "grad_norm": 0.11005867272615433, + "learning_rate": 8.416422509981328e-05, + "loss": 1.5475450515747071, + "step": 108600 + }, + { + "epoch": 0.32875758630605545, + "grad_norm": 0.11554653942584991, + "learning_rate": 8.416042991817588e-05, + "loss": 1.5440765380859376, + "step": 108610 + }, + { + "epoch": 0.32878785585640125, + "grad_norm": 0.11961233615875244, + "learning_rate": 8.415663473653849e-05, + "loss": 1.5446568489074708, + "step": 108620 + }, + { + "epoch": 0.3288181254067471, + "grad_norm": 0.10637546330690384, + "learning_rate": 8.415283955490111e-05, + "loss": 1.636343765258789, + "step": 108630 + }, + { + "epoch": 0.3288483949570929, + "grad_norm": 0.11205253005027771, + "learning_rate": 8.41490443732637e-05, + "loss": 1.5474944114685059, + "step": 108640 + }, + { + "epoch": 0.32887866450743874, + "grad_norm": 0.10220044106245041, + "learning_rate": 8.414524919162632e-05, + "loss": 1.5258119583129883, + "step": 108650 + }, + { + "epoch": 0.3289089340577846, + "grad_norm": 0.10036265850067139, + "learning_rate": 8.414145400998891e-05, + "loss": 1.5751424789428712, + "step": 108660 + }, + { + "epoch": 0.3289392036081304, + "grad_norm": 0.11681239306926727, + "learning_rate": 8.413765882835153e-05, + "loss": 1.5147289276123046, + "step": 108670 + }, + { + "epoch": 0.32896947315847624, + "grad_norm": 0.11809075623750687, + "learning_rate": 8.413386364671414e-05, + "loss": 1.5233238220214844, + "step": 108680 + }, + { + "epoch": 0.32899974270882204, + "grad_norm": 0.11274313181638718, + "learning_rate": 8.413006846507674e-05, + "loss": 1.551029109954834, + "step": 108690 + }, + { + "epoch": 0.3290300122591679, + "grad_norm": 0.11342982947826385, + "learning_rate": 8.412627328343935e-05, + "loss": 1.5257994651794433, + "step": 108700 + }, + { + "epoch": 0.32906028180951374, + "grad_norm": 0.11531764268875122, + "learning_rate": 8.412247810180196e-05, + "loss": 1.573137378692627, + "step": 108710 + }, + { + "epoch": 0.32909055135985954, + "grad_norm": 0.10640960931777954, + "learning_rate": 8.411868292016456e-05, + "loss": 1.571721649169922, + "step": 108720 + }, + { + "epoch": 0.3291208209102054, + "grad_norm": 0.1354512870311737, + "learning_rate": 8.411488773852717e-05, + "loss": 1.500452423095703, + "step": 108730 + }, + { + "epoch": 0.3291510904605512, + "grad_norm": 0.11688805371522903, + "learning_rate": 8.411109255688977e-05, + "loss": 1.5224358558654785, + "step": 108740 + }, + { + "epoch": 0.32918136001089704, + "grad_norm": 0.1095249205827713, + "learning_rate": 8.410729737525238e-05, + "loss": 1.5720160484313965, + "step": 108750 + }, + { + "epoch": 0.3292116295612429, + "grad_norm": 0.10702230781316757, + "learning_rate": 8.410350219361499e-05, + "loss": 1.5565092086791992, + "step": 108760 + }, + { + "epoch": 0.3292418991115887, + "grad_norm": 0.10572709888219833, + "learning_rate": 8.409970701197759e-05, + "loss": 1.5575370788574219, + "step": 108770 + }, + { + "epoch": 0.32927216866193454, + "grad_norm": 0.11961912363767624, + "learning_rate": 8.40959118303402e-05, + "loss": 1.5704033851623536, + "step": 108780 + }, + { + "epoch": 0.32930243821228034, + "grad_norm": 0.11656098812818527, + "learning_rate": 8.409211664870282e-05, + "loss": 1.539285659790039, + "step": 108790 + }, + { + "epoch": 0.3293327077626262, + "grad_norm": 0.10264855623245239, + "learning_rate": 8.408832146706541e-05, + "loss": 1.5276442527770997, + "step": 108800 + }, + { + "epoch": 0.32936297731297204, + "grad_norm": 0.12750960886478424, + "learning_rate": 8.408452628542803e-05, + "loss": 1.5380115509033203, + "step": 108810 + }, + { + "epoch": 0.32939324686331783, + "grad_norm": 0.1103363111615181, + "learning_rate": 8.408073110379062e-05, + "loss": 1.5557051658630372, + "step": 108820 + }, + { + "epoch": 0.3294235164136637, + "grad_norm": 0.11505033075809479, + "learning_rate": 8.407693592215324e-05, + "loss": 1.5423927307128906, + "step": 108830 + }, + { + "epoch": 0.3294537859640095, + "grad_norm": 0.10277579724788666, + "learning_rate": 8.407314074051585e-05, + "loss": 1.5388526916503906, + "step": 108840 + }, + { + "epoch": 0.32948405551435533, + "grad_norm": 0.10459300875663757, + "learning_rate": 8.406934555887845e-05, + "loss": 1.544134521484375, + "step": 108850 + }, + { + "epoch": 0.3295143250647012, + "grad_norm": 0.11127249896526337, + "learning_rate": 8.406555037724106e-05, + "loss": 1.5396942138671874, + "step": 108860 + }, + { + "epoch": 0.329544594615047, + "grad_norm": 0.10575830936431885, + "learning_rate": 8.406175519560366e-05, + "loss": 1.50491943359375, + "step": 108870 + }, + { + "epoch": 0.32957486416539283, + "grad_norm": 0.13457101583480835, + "learning_rate": 8.405796001396627e-05, + "loss": 1.5533140182495118, + "step": 108880 + }, + { + "epoch": 0.32960513371573863, + "grad_norm": 0.10452594608068466, + "learning_rate": 8.405416483232888e-05, + "loss": 1.5345458984375, + "step": 108890 + }, + { + "epoch": 0.3296354032660845, + "grad_norm": 0.10770867764949799, + "learning_rate": 8.405036965069148e-05, + "loss": 1.5553060531616212, + "step": 108900 + }, + { + "epoch": 0.32966567281643033, + "grad_norm": 0.1061197966337204, + "learning_rate": 8.404657446905409e-05, + "loss": 1.520640754699707, + "step": 108910 + }, + { + "epoch": 0.32969594236677613, + "grad_norm": 0.11134180426597595, + "learning_rate": 8.404277928741671e-05, + "loss": 1.5378253936767579, + "step": 108920 + }, + { + "epoch": 0.329726211917122, + "grad_norm": 0.10509271919727325, + "learning_rate": 8.40389841057793e-05, + "loss": 1.5436578750610352, + "step": 108930 + }, + { + "epoch": 0.3297564814674678, + "grad_norm": 0.12332543730735779, + "learning_rate": 8.403518892414192e-05, + "loss": 1.5451014518737793, + "step": 108940 + }, + { + "epoch": 0.32978675101781363, + "grad_norm": 0.11261942237615585, + "learning_rate": 8.403139374250451e-05, + "loss": 1.5466291427612304, + "step": 108950 + }, + { + "epoch": 0.3298170205681595, + "grad_norm": 0.1039477214217186, + "learning_rate": 8.402759856086713e-05, + "loss": 1.5575674057006836, + "step": 108960 + }, + { + "epoch": 0.3298472901185053, + "grad_norm": 0.12345263361930847, + "learning_rate": 8.402380337922972e-05, + "loss": 1.5607828140258788, + "step": 108970 + }, + { + "epoch": 0.32987755966885113, + "grad_norm": 0.11915048211812973, + "learning_rate": 8.402000819759234e-05, + "loss": 1.5582599639892578, + "step": 108980 + }, + { + "epoch": 0.3299078292191969, + "grad_norm": 0.10771261155605316, + "learning_rate": 8.401621301595494e-05, + "loss": 1.5505725860595703, + "step": 108990 + }, + { + "epoch": 0.3299380987695428, + "grad_norm": 0.1064261868596077, + "learning_rate": 8.401241783431755e-05, + "loss": 1.573471450805664, + "step": 109000 + }, + { + "epoch": 0.3299380987695428, + "eval_loss": 1.550203561782837, + "eval_runtime": 28.0164, + "eval_samples_per_second": 17.847, + "eval_steps_per_second": 1.142, + "step": 109000 + }, + { + "epoch": 0.3299683683198886, + "grad_norm": 0.10571601986885071, + "learning_rate": 8.400862265268015e-05, + "loss": 1.5622503280639648, + "step": 109010 + }, + { + "epoch": 0.3299986378702344, + "grad_norm": 0.10655394196510315, + "learning_rate": 8.400482747104277e-05, + "loss": 1.5205623626708984, + "step": 109020 + }, + { + "epoch": 0.3300289074205803, + "grad_norm": 0.11906874924898148, + "learning_rate": 8.400103228940536e-05, + "loss": 1.549822998046875, + "step": 109030 + }, + { + "epoch": 0.33005917697092607, + "grad_norm": 0.09679925441741943, + "learning_rate": 8.399723710776798e-05, + "loss": 1.52806396484375, + "step": 109040 + }, + { + "epoch": 0.3300894465212719, + "grad_norm": 0.10469970852136612, + "learning_rate": 8.39934419261306e-05, + "loss": 1.5823352813720704, + "step": 109050 + }, + { + "epoch": 0.3301197160716178, + "grad_norm": 0.11422359198331833, + "learning_rate": 8.398964674449319e-05, + "loss": 1.5569480895996093, + "step": 109060 + }, + { + "epoch": 0.33014998562196357, + "grad_norm": 0.1069643422961235, + "learning_rate": 8.398585156285581e-05, + "loss": 1.5564865112304687, + "step": 109070 + }, + { + "epoch": 0.3301802551723094, + "grad_norm": 0.12262126803398132, + "learning_rate": 8.39820563812184e-05, + "loss": 1.5405982971191405, + "step": 109080 + }, + { + "epoch": 0.3302105247226552, + "grad_norm": 0.1068011075258255, + "learning_rate": 8.397826119958102e-05, + "loss": 1.5607534408569337, + "step": 109090 + }, + { + "epoch": 0.33024079427300107, + "grad_norm": 0.1245110034942627, + "learning_rate": 8.397446601794361e-05, + "loss": 1.5655508041381836, + "step": 109100 + }, + { + "epoch": 0.3302710638233469, + "grad_norm": 0.10521836578845978, + "learning_rate": 8.397067083630623e-05, + "loss": 1.582719326019287, + "step": 109110 + }, + { + "epoch": 0.3303013333736927, + "grad_norm": 0.11097513139247894, + "learning_rate": 8.396687565466883e-05, + "loss": 1.5371549606323243, + "step": 109120 + }, + { + "epoch": 0.33033160292403857, + "grad_norm": 0.10547510534524918, + "learning_rate": 8.396308047303145e-05, + "loss": 1.559196376800537, + "step": 109130 + }, + { + "epoch": 0.33036187247438437, + "grad_norm": 0.11831007152795792, + "learning_rate": 8.395928529139404e-05, + "loss": 1.5180627822875976, + "step": 109140 + }, + { + "epoch": 0.3303921420247302, + "grad_norm": 0.11089703440666199, + "learning_rate": 8.395549010975666e-05, + "loss": 1.5270437240600585, + "step": 109150 + }, + { + "epoch": 0.33042241157507607, + "grad_norm": 0.11407959461212158, + "learning_rate": 8.395169492811926e-05, + "loss": 1.530622673034668, + "step": 109160 + }, + { + "epoch": 0.33045268112542187, + "grad_norm": 0.11398307979106903, + "learning_rate": 8.394789974648187e-05, + "loss": 1.588759994506836, + "step": 109170 + }, + { + "epoch": 0.3304829506757677, + "grad_norm": 0.11202333122491837, + "learning_rate": 8.394410456484448e-05, + "loss": 1.530456256866455, + "step": 109180 + }, + { + "epoch": 0.33051322022611357, + "grad_norm": 0.11427394300699234, + "learning_rate": 8.394030938320708e-05, + "loss": 1.5507378578186035, + "step": 109190 + }, + { + "epoch": 0.33054348977645936, + "grad_norm": 0.10449637472629547, + "learning_rate": 8.393651420156969e-05, + "loss": 1.526169776916504, + "step": 109200 + }, + { + "epoch": 0.3305737593268052, + "grad_norm": 0.10003367811441422, + "learning_rate": 8.39327190199323e-05, + "loss": 1.548872184753418, + "step": 109210 + }, + { + "epoch": 0.330604028877151, + "grad_norm": 0.11491875350475311, + "learning_rate": 8.39289238382949e-05, + "loss": 1.533284568786621, + "step": 109220 + }, + { + "epoch": 0.33063429842749686, + "grad_norm": 0.11278131604194641, + "learning_rate": 8.39251286566575e-05, + "loss": 1.5617401123046875, + "step": 109230 + }, + { + "epoch": 0.3306645679778427, + "grad_norm": 0.10744967311620712, + "learning_rate": 8.392133347502012e-05, + "loss": 1.568599319458008, + "step": 109240 + }, + { + "epoch": 0.3306948375281885, + "grad_norm": 0.10841472446918488, + "learning_rate": 8.391753829338272e-05, + "loss": 1.5519580841064453, + "step": 109250 + }, + { + "epoch": 0.33072510707853436, + "grad_norm": 0.11778503656387329, + "learning_rate": 8.391374311174534e-05, + "loss": 1.5234187126159668, + "step": 109260 + }, + { + "epoch": 0.33075537662888016, + "grad_norm": 0.11554296314716339, + "learning_rate": 8.390994793010793e-05, + "loss": 1.5561837196350097, + "step": 109270 + }, + { + "epoch": 0.330785646179226, + "grad_norm": 0.12389218062162399, + "learning_rate": 8.390615274847055e-05, + "loss": 1.5679779052734375, + "step": 109280 + }, + { + "epoch": 0.33081591572957186, + "grad_norm": 0.09912097454071045, + "learning_rate": 8.390235756683315e-05, + "loss": 1.595038604736328, + "step": 109290 + }, + { + "epoch": 0.33084618527991766, + "grad_norm": 0.12578922510147095, + "learning_rate": 8.389856238519576e-05, + "loss": 1.5238250732421874, + "step": 109300 + }, + { + "epoch": 0.3308764548302635, + "grad_norm": 0.10001876205205917, + "learning_rate": 8.389476720355837e-05, + "loss": 1.531515884399414, + "step": 109310 + }, + { + "epoch": 0.3309067243806093, + "grad_norm": 0.10733284801244736, + "learning_rate": 8.389097202192097e-05, + "loss": 1.536741065979004, + "step": 109320 + }, + { + "epoch": 0.33093699393095516, + "grad_norm": 0.10957902669906616, + "learning_rate": 8.388717684028358e-05, + "loss": 1.5210397720336915, + "step": 109330 + }, + { + "epoch": 0.330967263481301, + "grad_norm": 0.10446406900882721, + "learning_rate": 8.388338165864618e-05, + "loss": 1.5572086334228517, + "step": 109340 + }, + { + "epoch": 0.3309975330316468, + "grad_norm": 0.1046091765165329, + "learning_rate": 8.387958647700879e-05, + "loss": 1.5378215789794922, + "step": 109350 + }, + { + "epoch": 0.33102780258199266, + "grad_norm": 0.11517059803009033, + "learning_rate": 8.38757912953714e-05, + "loss": 1.532561683654785, + "step": 109360 + }, + { + "epoch": 0.33105807213233845, + "grad_norm": 0.1127048134803772, + "learning_rate": 8.3871996113734e-05, + "loss": 1.5877304077148438, + "step": 109370 + }, + { + "epoch": 0.3310883416826843, + "grad_norm": 0.12787236273288727, + "learning_rate": 8.386820093209661e-05, + "loss": 1.5679835319519042, + "step": 109380 + }, + { + "epoch": 0.33111861123303016, + "grad_norm": 0.10688778758049011, + "learning_rate": 8.386440575045921e-05, + "loss": 1.5167486190795898, + "step": 109390 + }, + { + "epoch": 0.33114888078337595, + "grad_norm": 0.1059037446975708, + "learning_rate": 8.386061056882182e-05, + "loss": 1.5859455108642577, + "step": 109400 + }, + { + "epoch": 0.3311791503337218, + "grad_norm": 0.1107911691069603, + "learning_rate": 8.385681538718443e-05, + "loss": 1.5315269470214843, + "step": 109410 + }, + { + "epoch": 0.3312094198840676, + "grad_norm": 0.10744232684373856, + "learning_rate": 8.385302020554705e-05, + "loss": 1.5289751052856446, + "step": 109420 + }, + { + "epoch": 0.33123968943441345, + "grad_norm": 0.10091898590326309, + "learning_rate": 8.384922502390964e-05, + "loss": 1.5528444290161132, + "step": 109430 + }, + { + "epoch": 0.3312699589847593, + "grad_norm": 0.12193950265645981, + "learning_rate": 8.384542984227226e-05, + "loss": 1.5383871078491211, + "step": 109440 + }, + { + "epoch": 0.3313002285351051, + "grad_norm": 0.12171461433172226, + "learning_rate": 8.384163466063486e-05, + "loss": 1.561897373199463, + "step": 109450 + }, + { + "epoch": 0.33133049808545095, + "grad_norm": 0.11792240291833878, + "learning_rate": 8.383783947899747e-05, + "loss": 1.5169219970703125, + "step": 109460 + }, + { + "epoch": 0.33136076763579675, + "grad_norm": 0.12050198763608932, + "learning_rate": 8.383404429736008e-05, + "loss": 1.5595022201538087, + "step": 109470 + }, + { + "epoch": 0.3313910371861426, + "grad_norm": 0.1081637591123581, + "learning_rate": 8.383024911572268e-05, + "loss": 1.5586816787719726, + "step": 109480 + }, + { + "epoch": 0.33142130673648845, + "grad_norm": 0.11160393804311752, + "learning_rate": 8.382645393408529e-05, + "loss": 1.5737299919128418, + "step": 109490 + }, + { + "epoch": 0.33145157628683425, + "grad_norm": 0.11426607519388199, + "learning_rate": 8.382265875244789e-05, + "loss": 1.557196044921875, + "step": 109500 + }, + { + "epoch": 0.33145157628683425, + "eval_loss": 1.542800784111023, + "eval_runtime": 28.0701, + "eval_samples_per_second": 17.813, + "eval_steps_per_second": 1.14, + "step": 109500 + }, + { + "epoch": 0.3314818458371801, + "grad_norm": 0.11811621487140656, + "learning_rate": 8.38188635708105e-05, + "loss": 1.5444808959960938, + "step": 109510 + }, + { + "epoch": 0.3315121153875259, + "grad_norm": 0.1045193001627922, + "learning_rate": 8.38150683891731e-05, + "loss": 1.5646773338317872, + "step": 109520 + }, + { + "epoch": 0.33154238493787175, + "grad_norm": 0.1079912781715393, + "learning_rate": 8.381127320753572e-05, + "loss": 1.5432418823242187, + "step": 109530 + }, + { + "epoch": 0.3315726544882176, + "grad_norm": 0.10141560435295105, + "learning_rate": 8.380747802589832e-05, + "loss": 1.567509365081787, + "step": 109540 + }, + { + "epoch": 0.3316029240385634, + "grad_norm": 0.10471371561288834, + "learning_rate": 8.380368284426094e-05, + "loss": 1.5298741340637207, + "step": 109550 + }, + { + "epoch": 0.33163319358890925, + "grad_norm": 0.11849423497915268, + "learning_rate": 8.379988766262353e-05, + "loss": 1.5578522682189941, + "step": 109560 + }, + { + "epoch": 0.33166346313925504, + "grad_norm": 0.12237055599689484, + "learning_rate": 8.379609248098615e-05, + "loss": 1.5361398696899413, + "step": 109570 + }, + { + "epoch": 0.3316937326896009, + "grad_norm": 0.12626343965530396, + "learning_rate": 8.379229729934874e-05, + "loss": 1.5597400665283203, + "step": 109580 + }, + { + "epoch": 0.33172400223994675, + "grad_norm": 0.1103600338101387, + "learning_rate": 8.378850211771136e-05, + "loss": 1.507626724243164, + "step": 109590 + }, + { + "epoch": 0.33175427179029254, + "grad_norm": 0.11177096515893936, + "learning_rate": 8.378470693607395e-05, + "loss": 1.5447419166564942, + "step": 109600 + }, + { + "epoch": 0.3317845413406384, + "grad_norm": 0.11186598986387253, + "learning_rate": 8.378091175443657e-05, + "loss": 1.5231549263000488, + "step": 109610 + }, + { + "epoch": 0.3318148108909842, + "grad_norm": 0.10911738127470016, + "learning_rate": 8.377711657279916e-05, + "loss": 1.6235328674316407, + "step": 109620 + }, + { + "epoch": 0.33184508044133004, + "grad_norm": 0.10002374649047852, + "learning_rate": 8.377332139116178e-05, + "loss": 1.5853205680847169, + "step": 109630 + }, + { + "epoch": 0.3318753499916759, + "grad_norm": 0.1164117082953453, + "learning_rate": 8.376952620952439e-05, + "loss": 1.542329978942871, + "step": 109640 + }, + { + "epoch": 0.3319056195420217, + "grad_norm": 0.11435796320438385, + "learning_rate": 8.3765731027887e-05, + "loss": 1.5450871467590332, + "step": 109650 + }, + { + "epoch": 0.33193588909236754, + "grad_norm": 0.11244206875562668, + "learning_rate": 8.376193584624962e-05, + "loss": 1.5351892471313477, + "step": 109660 + }, + { + "epoch": 0.33196615864271334, + "grad_norm": 0.11530164629220963, + "learning_rate": 8.375814066461221e-05, + "loss": 1.5439729690551758, + "step": 109670 + }, + { + "epoch": 0.3319964281930592, + "grad_norm": 0.1175859272480011, + "learning_rate": 8.375434548297483e-05, + "loss": 1.534271812438965, + "step": 109680 + }, + { + "epoch": 0.33202669774340504, + "grad_norm": 0.1125638484954834, + "learning_rate": 8.375055030133742e-05, + "loss": 1.561293125152588, + "step": 109690 + }, + { + "epoch": 0.33205696729375084, + "grad_norm": 0.13537871837615967, + "learning_rate": 8.374675511970004e-05, + "loss": 1.5321683883666992, + "step": 109700 + }, + { + "epoch": 0.3320872368440967, + "grad_norm": 0.11568476259708405, + "learning_rate": 8.374295993806263e-05, + "loss": 1.5768482208251953, + "step": 109710 + }, + { + "epoch": 0.3321175063944425, + "grad_norm": 0.10525667667388916, + "learning_rate": 8.373916475642525e-05, + "loss": 1.5408489227294921, + "step": 109720 + }, + { + "epoch": 0.33214777594478834, + "grad_norm": 0.10620619356632233, + "learning_rate": 8.373536957478784e-05, + "loss": 1.5451045036315918, + "step": 109730 + }, + { + "epoch": 0.3321780454951342, + "grad_norm": 0.13396789133548737, + "learning_rate": 8.373157439315046e-05, + "loss": 1.5645299911499024, + "step": 109740 + }, + { + "epoch": 0.33220831504548, + "grad_norm": 0.10344427078962326, + "learning_rate": 8.372777921151306e-05, + "loss": 1.5625058174133302, + "step": 109750 + }, + { + "epoch": 0.33223858459582584, + "grad_norm": 0.115743488073349, + "learning_rate": 8.372398402987567e-05, + "loss": 1.542611789703369, + "step": 109760 + }, + { + "epoch": 0.33226885414617163, + "grad_norm": 0.12857264280319214, + "learning_rate": 8.372018884823827e-05, + "loss": 1.5550704956054688, + "step": 109770 + }, + { + "epoch": 0.3322991236965175, + "grad_norm": 0.1170407086610794, + "learning_rate": 8.371639366660089e-05, + "loss": 1.5308223724365235, + "step": 109780 + }, + { + "epoch": 0.33232939324686334, + "grad_norm": 0.107276052236557, + "learning_rate": 8.371259848496349e-05, + "loss": 1.5360525131225586, + "step": 109790 + }, + { + "epoch": 0.33235966279720913, + "grad_norm": 0.11627934128046036, + "learning_rate": 8.37088033033261e-05, + "loss": 1.5695098876953124, + "step": 109800 + }, + { + "epoch": 0.332389932347555, + "grad_norm": 0.10829116404056549, + "learning_rate": 8.37050081216887e-05, + "loss": 1.5491806030273438, + "step": 109810 + }, + { + "epoch": 0.3324202018979008, + "grad_norm": 0.12147510796785355, + "learning_rate": 8.370121294005131e-05, + "loss": 1.5333797454833984, + "step": 109820 + }, + { + "epoch": 0.33245047144824663, + "grad_norm": 0.13350681960582733, + "learning_rate": 8.369741775841392e-05, + "loss": 1.5633691787719726, + "step": 109830 + }, + { + "epoch": 0.3324807409985925, + "grad_norm": 0.11526650935411453, + "learning_rate": 8.369362257677652e-05, + "loss": 1.5669414520263671, + "step": 109840 + }, + { + "epoch": 0.3325110105489383, + "grad_norm": 0.11207402497529984, + "learning_rate": 8.368982739513914e-05, + "loss": 1.5477797508239746, + "step": 109850 + }, + { + "epoch": 0.33254128009928413, + "grad_norm": 0.10588620603084564, + "learning_rate": 8.368603221350173e-05, + "loss": 1.520745849609375, + "step": 109860 + }, + { + "epoch": 0.3325715496496299, + "grad_norm": 0.10867232829332352, + "learning_rate": 8.368223703186435e-05, + "loss": 1.5200664520263671, + "step": 109870 + }, + { + "epoch": 0.3326018191999758, + "grad_norm": 0.10985899716615677, + "learning_rate": 8.367844185022695e-05, + "loss": 1.559064769744873, + "step": 109880 + }, + { + "epoch": 0.33263208875032163, + "grad_norm": 0.10985369235277176, + "learning_rate": 8.367464666858957e-05, + "loss": 1.5424936294555665, + "step": 109890 + }, + { + "epoch": 0.3326623583006674, + "grad_norm": 0.11293847858905792, + "learning_rate": 8.367085148695217e-05, + "loss": 1.5431090354919434, + "step": 109900 + }, + { + "epoch": 0.3326926278510133, + "grad_norm": 0.10464490205049515, + "learning_rate": 8.366705630531478e-05, + "loss": 1.5656063079833984, + "step": 109910 + }, + { + "epoch": 0.33272289740135913, + "grad_norm": 0.1028682067990303, + "learning_rate": 8.366326112367738e-05, + "loss": 1.5379217147827149, + "step": 109920 + }, + { + "epoch": 0.3327531669517049, + "grad_norm": 0.10880139470100403, + "learning_rate": 8.365946594203999e-05, + "loss": 1.5648983001708985, + "step": 109930 + }, + { + "epoch": 0.3327834365020508, + "grad_norm": 0.1302536427974701, + "learning_rate": 8.36556707604026e-05, + "loss": 1.5291661262512206, + "step": 109940 + }, + { + "epoch": 0.3328137060523966, + "grad_norm": 0.11503733694553375, + "learning_rate": 8.36518755787652e-05, + "loss": 1.5696527481079101, + "step": 109950 + }, + { + "epoch": 0.3328439756027424, + "grad_norm": 0.10875767469406128, + "learning_rate": 8.364808039712781e-05, + "loss": 1.5374065399169923, + "step": 109960 + }, + { + "epoch": 0.3328742451530883, + "grad_norm": 0.12768414616584778, + "learning_rate": 8.364428521549041e-05, + "loss": 1.5427183151245116, + "step": 109970 + }, + { + "epoch": 0.3329045147034341, + "grad_norm": 0.11428327858448029, + "learning_rate": 8.364049003385302e-05, + "loss": 1.5716670036315918, + "step": 109980 + }, + { + "epoch": 0.3329347842537799, + "grad_norm": 0.11996351927518845, + "learning_rate": 8.363669485221563e-05, + "loss": 1.5367170333862306, + "step": 109990 + }, + { + "epoch": 0.3329650538041257, + "grad_norm": 0.10791052132844925, + "learning_rate": 8.363289967057823e-05, + "loss": 1.5464845657348634, + "step": 110000 + }, + { + "epoch": 0.3329650538041257, + "eval_loss": 1.559951901435852, + "eval_runtime": 27.9144, + "eval_samples_per_second": 17.912, + "eval_steps_per_second": 1.146, + "step": 110000 + }, + { + "epoch": 0.3329953233544716, + "grad_norm": 0.13089905679225922, + "learning_rate": 8.362910448894084e-05, + "loss": 1.575335693359375, + "step": 110010 + }, + { + "epoch": 0.3330255929048174, + "grad_norm": 0.12056464701890945, + "learning_rate": 8.362530930730344e-05, + "loss": 1.5513124465942383, + "step": 110020 + }, + { + "epoch": 0.3330558624551632, + "grad_norm": 0.10966560989618301, + "learning_rate": 8.362151412566606e-05, + "loss": 1.5603922843933105, + "step": 110030 + }, + { + "epoch": 0.3330861320055091, + "grad_norm": 0.11926840245723724, + "learning_rate": 8.361771894402866e-05, + "loss": 1.5139240264892577, + "step": 110040 + }, + { + "epoch": 0.33311640155585487, + "grad_norm": 0.12079682946205139, + "learning_rate": 8.361392376239127e-05, + "loss": 1.5539039611816405, + "step": 110050 + }, + { + "epoch": 0.3331466711062007, + "grad_norm": 0.12844790518283844, + "learning_rate": 8.361012858075388e-05, + "loss": 1.5510982513427733, + "step": 110060 + }, + { + "epoch": 0.33317694065654657, + "grad_norm": 0.12576639652252197, + "learning_rate": 8.360633339911649e-05, + "loss": 1.5475305557250976, + "step": 110070 + }, + { + "epoch": 0.33320721020689237, + "grad_norm": 0.11335477232933044, + "learning_rate": 8.360253821747909e-05, + "loss": 1.5793722152709961, + "step": 110080 + }, + { + "epoch": 0.3332374797572382, + "grad_norm": 0.11679816991090775, + "learning_rate": 8.35987430358417e-05, + "loss": 1.5871973037719727, + "step": 110090 + }, + { + "epoch": 0.333267749307584, + "grad_norm": 0.1017133966088295, + "learning_rate": 8.35949478542043e-05, + "loss": 1.5178824424743653, + "step": 110100 + }, + { + "epoch": 0.33329801885792987, + "grad_norm": 0.11517930030822754, + "learning_rate": 8.359115267256691e-05, + "loss": 1.492123794555664, + "step": 110110 + }, + { + "epoch": 0.3333282884082757, + "grad_norm": 0.10518720746040344, + "learning_rate": 8.358735749092952e-05, + "loss": 1.5168142318725586, + "step": 110120 + }, + { + "epoch": 0.3333585579586215, + "grad_norm": 0.1065361276268959, + "learning_rate": 8.358356230929212e-05, + "loss": 1.5537144660949707, + "step": 110130 + }, + { + "epoch": 0.33338882750896737, + "grad_norm": 0.12221421301364899, + "learning_rate": 8.357976712765474e-05, + "loss": 1.5194338798522948, + "step": 110140 + }, + { + "epoch": 0.33341909705931316, + "grad_norm": 0.11221233755350113, + "learning_rate": 8.357597194601733e-05, + "loss": 1.573250961303711, + "step": 110150 + }, + { + "epoch": 0.333449366609659, + "grad_norm": 0.10341521352529526, + "learning_rate": 8.357217676437995e-05, + "loss": 1.5435050964355468, + "step": 110160 + }, + { + "epoch": 0.33347963616000487, + "grad_norm": 0.11199132353067398, + "learning_rate": 8.356838158274255e-05, + "loss": 1.5704946517944336, + "step": 110170 + }, + { + "epoch": 0.33350990571035066, + "grad_norm": 0.12369635701179504, + "learning_rate": 8.356458640110517e-05, + "loss": 1.5699541091918945, + "step": 110180 + }, + { + "epoch": 0.3335401752606965, + "grad_norm": 0.11781914532184601, + "learning_rate": 8.356079121946776e-05, + "loss": 1.5449838638305664, + "step": 110190 + }, + { + "epoch": 0.3335704448110423, + "grad_norm": 0.10712311416864395, + "learning_rate": 8.355699603783038e-05, + "loss": 1.5425291061401367, + "step": 110200 + }, + { + "epoch": 0.33360071436138816, + "grad_norm": 0.11603633314371109, + "learning_rate": 8.355320085619297e-05, + "loss": 1.5535674095153809, + "step": 110210 + }, + { + "epoch": 0.333630983911734, + "grad_norm": 0.10259659588336945, + "learning_rate": 8.354940567455559e-05, + "loss": 1.5573103904724122, + "step": 110220 + }, + { + "epoch": 0.3336612534620798, + "grad_norm": 0.11374779045581818, + "learning_rate": 8.354561049291818e-05, + "loss": 1.5336529731750488, + "step": 110230 + }, + { + "epoch": 0.33369152301242566, + "grad_norm": 0.11189666390419006, + "learning_rate": 8.35418153112808e-05, + "loss": 1.5394786834716796, + "step": 110240 + }, + { + "epoch": 0.33372179256277146, + "grad_norm": 0.10920913517475128, + "learning_rate": 8.353802012964341e-05, + "loss": 1.5642868041992188, + "step": 110250 + }, + { + "epoch": 0.3337520621131173, + "grad_norm": 0.112762950360775, + "learning_rate": 8.353422494800601e-05, + "loss": 1.5259453773498535, + "step": 110260 + }, + { + "epoch": 0.33378233166346316, + "grad_norm": 0.10759206116199493, + "learning_rate": 8.353042976636863e-05, + "loss": 1.5787004470825194, + "step": 110270 + }, + { + "epoch": 0.33381260121380896, + "grad_norm": 0.10790981352329254, + "learning_rate": 8.352663458473123e-05, + "loss": 1.5390830993652345, + "step": 110280 + }, + { + "epoch": 0.3338428707641548, + "grad_norm": 0.10265638679265976, + "learning_rate": 8.352283940309384e-05, + "loss": 1.5386186599731446, + "step": 110290 + }, + { + "epoch": 0.3338731403145006, + "grad_norm": 0.11225966364145279, + "learning_rate": 8.351904422145644e-05, + "loss": 1.5257747650146485, + "step": 110300 + }, + { + "epoch": 0.33390340986484646, + "grad_norm": 0.1119127869606018, + "learning_rate": 8.351524903981906e-05, + "loss": 1.551043701171875, + "step": 110310 + }, + { + "epoch": 0.3339336794151923, + "grad_norm": 0.1253066211938858, + "learning_rate": 8.351145385818165e-05, + "loss": 1.5550702095031739, + "step": 110320 + }, + { + "epoch": 0.3339639489655381, + "grad_norm": 0.11020347476005554, + "learning_rate": 8.350765867654427e-05, + "loss": 1.5763537406921386, + "step": 110330 + }, + { + "epoch": 0.33399421851588396, + "grad_norm": 0.11075540632009506, + "learning_rate": 8.350386349490686e-05, + "loss": 1.6031944274902343, + "step": 110340 + }, + { + "epoch": 0.33402448806622975, + "grad_norm": 0.12363892048597336, + "learning_rate": 8.350006831326948e-05, + "loss": 1.5406574249267577, + "step": 110350 + }, + { + "epoch": 0.3340547576165756, + "grad_norm": 0.09582661092281342, + "learning_rate": 8.349627313163207e-05, + "loss": 1.5251852035522462, + "step": 110360 + }, + { + "epoch": 0.33408502716692146, + "grad_norm": 0.11630558222532272, + "learning_rate": 8.349247794999469e-05, + "loss": 1.567453670501709, + "step": 110370 + }, + { + "epoch": 0.33411529671726725, + "grad_norm": 0.12330266833305359, + "learning_rate": 8.348868276835728e-05, + "loss": 1.5255250930786133, + "step": 110380 + }, + { + "epoch": 0.3341455662676131, + "grad_norm": 0.1194864884018898, + "learning_rate": 8.34848875867199e-05, + "loss": 1.539903450012207, + "step": 110390 + }, + { + "epoch": 0.3341758358179589, + "grad_norm": 0.10651438683271408, + "learning_rate": 8.348109240508251e-05, + "loss": 1.5282564163208008, + "step": 110400 + }, + { + "epoch": 0.33420610536830475, + "grad_norm": 0.12044034898281097, + "learning_rate": 8.347729722344512e-05, + "loss": 1.519439697265625, + "step": 110410 + }, + { + "epoch": 0.3342363749186506, + "grad_norm": 0.09210017323493958, + "learning_rate": 8.347350204180772e-05, + "loss": 1.5804277420043946, + "step": 110420 + }, + { + "epoch": 0.3342666444689964, + "grad_norm": 0.10514512658119202, + "learning_rate": 8.346970686017033e-05, + "loss": 1.536455249786377, + "step": 110430 + }, + { + "epoch": 0.33429691401934225, + "grad_norm": 0.11366036534309387, + "learning_rate": 8.346591167853293e-05, + "loss": 1.5488086700439454, + "step": 110440 + }, + { + "epoch": 0.33432718356968805, + "grad_norm": 0.12163782864809036, + "learning_rate": 8.346211649689554e-05, + "loss": 1.568277359008789, + "step": 110450 + }, + { + "epoch": 0.3343574531200339, + "grad_norm": 0.10913636535406113, + "learning_rate": 8.345832131525816e-05, + "loss": 1.5421472549438477, + "step": 110460 + }, + { + "epoch": 0.33438772267037975, + "grad_norm": 0.1369517743587494, + "learning_rate": 8.345452613362075e-05, + "loss": 1.5770343780517577, + "step": 110470 + }, + { + "epoch": 0.33441799222072555, + "grad_norm": 0.11117662489414215, + "learning_rate": 8.345073095198337e-05, + "loss": 1.530350112915039, + "step": 110480 + }, + { + "epoch": 0.3344482617710714, + "grad_norm": 0.10088016092777252, + "learning_rate": 8.344693577034596e-05, + "loss": 1.5651016235351562, + "step": 110490 + }, + { + "epoch": 0.3344785313214172, + "grad_norm": 0.10154804587364197, + "learning_rate": 8.344314058870858e-05, + "loss": 1.5491666793823242, + "step": 110500 + }, + { + "epoch": 0.3344785313214172, + "eval_loss": 1.5657849311828613, + "eval_runtime": 28.3701, + "eval_samples_per_second": 17.624, + "eval_steps_per_second": 1.128, + "step": 110500 + }, + { + "epoch": 0.33450880087176305, + "grad_norm": 0.1193327009677887, + "learning_rate": 8.343934540707118e-05, + "loss": 1.5912835121154785, + "step": 110510 + }, + { + "epoch": 0.3345390704221089, + "grad_norm": 0.10323596745729446, + "learning_rate": 8.34355502254338e-05, + "loss": 1.5863287925720215, + "step": 110520 + }, + { + "epoch": 0.3345693399724547, + "grad_norm": 0.1116332858800888, + "learning_rate": 8.34317550437964e-05, + "loss": 1.5380104064941407, + "step": 110530 + }, + { + "epoch": 0.33459960952280055, + "grad_norm": 0.10818158835172653, + "learning_rate": 8.342795986215901e-05, + "loss": 1.5898635864257813, + "step": 110540 + }, + { + "epoch": 0.33462987907314634, + "grad_norm": 0.12744420766830444, + "learning_rate": 8.342416468052161e-05, + "loss": 1.5560091018676758, + "step": 110550 + }, + { + "epoch": 0.3346601486234922, + "grad_norm": 0.10888466238975525, + "learning_rate": 8.342036949888422e-05, + "loss": 1.5483814239501954, + "step": 110560 + }, + { + "epoch": 0.33469041817383804, + "grad_norm": 0.11630565673112869, + "learning_rate": 8.341657431724682e-05, + "loss": 1.5756902694702148, + "step": 110570 + }, + { + "epoch": 0.33472068772418384, + "grad_norm": 0.1189764216542244, + "learning_rate": 8.341277913560943e-05, + "loss": 1.572422981262207, + "step": 110580 + }, + { + "epoch": 0.3347509572745297, + "grad_norm": 0.12171078473329544, + "learning_rate": 8.340898395397204e-05, + "loss": 1.5298357963562013, + "step": 110590 + }, + { + "epoch": 0.33478122682487554, + "grad_norm": 0.12125702202320099, + "learning_rate": 8.340518877233464e-05, + "loss": 1.5369623184204102, + "step": 110600 + }, + { + "epoch": 0.33481149637522134, + "grad_norm": 0.11593180894851685, + "learning_rate": 8.340139359069725e-05, + "loss": 1.5348048210144043, + "step": 110610 + }, + { + "epoch": 0.3348417659255672, + "grad_norm": 0.1118454560637474, + "learning_rate": 8.339759840905985e-05, + "loss": 1.5255895614624024, + "step": 110620 + }, + { + "epoch": 0.334872035475913, + "grad_norm": 0.12095983326435089, + "learning_rate": 8.339380322742246e-05, + "loss": 1.541917324066162, + "step": 110630 + }, + { + "epoch": 0.33490230502625884, + "grad_norm": 0.11043280363082886, + "learning_rate": 8.339000804578508e-05, + "loss": 1.5477397918701172, + "step": 110640 + }, + { + "epoch": 0.3349325745766047, + "grad_norm": 0.10534217953681946, + "learning_rate": 8.338621286414767e-05, + "loss": 1.5882129669189453, + "step": 110650 + }, + { + "epoch": 0.3349628441269505, + "grad_norm": 0.11084705591201782, + "learning_rate": 8.338241768251029e-05, + "loss": 1.5324005126953124, + "step": 110660 + }, + { + "epoch": 0.33499311367729634, + "grad_norm": 0.10899293422698975, + "learning_rate": 8.33786225008729e-05, + "loss": 1.5529821395874024, + "step": 110670 + }, + { + "epoch": 0.33502338322764214, + "grad_norm": 0.1110139787197113, + "learning_rate": 8.33748273192355e-05, + "loss": 1.5568883895874024, + "step": 110680 + }, + { + "epoch": 0.335053652777988, + "grad_norm": 0.12751983106136322, + "learning_rate": 8.337103213759811e-05, + "loss": 1.5689725875854492, + "step": 110690 + }, + { + "epoch": 0.33508392232833384, + "grad_norm": 0.10466240346431732, + "learning_rate": 8.336723695596072e-05, + "loss": 1.5728041648864746, + "step": 110700 + }, + { + "epoch": 0.33511419187867963, + "grad_norm": 0.12077855318784714, + "learning_rate": 8.336344177432332e-05, + "loss": 1.5571344375610352, + "step": 110710 + }, + { + "epoch": 0.3351444614290255, + "grad_norm": 0.11717038601636887, + "learning_rate": 8.335964659268593e-05, + "loss": 1.546478843688965, + "step": 110720 + }, + { + "epoch": 0.3351747309793713, + "grad_norm": 0.11085941642522812, + "learning_rate": 8.335585141104853e-05, + "loss": 1.5428650856018067, + "step": 110730 + }, + { + "epoch": 0.33520500052971713, + "grad_norm": 0.10907004028558731, + "learning_rate": 8.335205622941114e-05, + "loss": 1.570058536529541, + "step": 110740 + }, + { + "epoch": 0.335235270080063, + "grad_norm": 0.11909619718790054, + "learning_rate": 8.334826104777375e-05, + "loss": 1.5643392562866212, + "step": 110750 + }, + { + "epoch": 0.3352655396304088, + "grad_norm": 0.12592823803424835, + "learning_rate": 8.334446586613635e-05, + "loss": 1.5085023880004882, + "step": 110760 + }, + { + "epoch": 0.33529580918075463, + "grad_norm": 0.1139678806066513, + "learning_rate": 8.334067068449897e-05, + "loss": 1.5205588340759277, + "step": 110770 + }, + { + "epoch": 0.33532607873110043, + "grad_norm": 0.10953903943300247, + "learning_rate": 8.333687550286156e-05, + "loss": 1.5433374404907227, + "step": 110780 + }, + { + "epoch": 0.3353563482814463, + "grad_norm": 0.13250771164894104, + "learning_rate": 8.333308032122418e-05, + "loss": 1.5155750274658204, + "step": 110790 + }, + { + "epoch": 0.33538661783179213, + "grad_norm": 0.11671850830316544, + "learning_rate": 8.332928513958678e-05, + "loss": 1.5780061721801757, + "step": 110800 + }, + { + "epoch": 0.33541688738213793, + "grad_norm": 0.10551997274160385, + "learning_rate": 8.33254899579494e-05, + "loss": 1.5398731231689453, + "step": 110810 + }, + { + "epoch": 0.3354471569324838, + "grad_norm": 0.1128925234079361, + "learning_rate": 8.332169477631199e-05, + "loss": 1.5926374435424804, + "step": 110820 + }, + { + "epoch": 0.3354774264828296, + "grad_norm": 0.13288021087646484, + "learning_rate": 8.33178995946746e-05, + "loss": 1.5412357330322266, + "step": 110830 + }, + { + "epoch": 0.33550769603317543, + "grad_norm": 0.09845757484436035, + "learning_rate": 8.33141044130372e-05, + "loss": 1.5674838066101073, + "step": 110840 + }, + { + "epoch": 0.3355379655835213, + "grad_norm": 0.10182130336761475, + "learning_rate": 8.331030923139982e-05, + "loss": 1.5533138275146485, + "step": 110850 + }, + { + "epoch": 0.3355682351338671, + "grad_norm": 0.10390400141477585, + "learning_rate": 8.330651404976242e-05, + "loss": 1.5896495819091796, + "step": 110860 + }, + { + "epoch": 0.33559850468421293, + "grad_norm": 0.11433672159910202, + "learning_rate": 8.330271886812503e-05, + "loss": 1.518914794921875, + "step": 110870 + }, + { + "epoch": 0.3356287742345587, + "grad_norm": 0.11092797666788101, + "learning_rate": 8.329892368648765e-05, + "loss": 1.5317045211791993, + "step": 110880 + }, + { + "epoch": 0.3356590437849046, + "grad_norm": 0.11412051320075989, + "learning_rate": 8.329512850485024e-05, + "loss": 1.556135368347168, + "step": 110890 + }, + { + "epoch": 0.3356893133352504, + "grad_norm": 0.0969814658164978, + "learning_rate": 8.329133332321286e-05, + "loss": 1.5158806800842286, + "step": 110900 + }, + { + "epoch": 0.3357195828855962, + "grad_norm": 0.12038736045360565, + "learning_rate": 8.328753814157545e-05, + "loss": 1.5535609245300293, + "step": 110910 + }, + { + "epoch": 0.3357498524359421, + "grad_norm": 0.12107282876968384, + "learning_rate": 8.328374295993807e-05, + "loss": 1.5038704872131348, + "step": 110920 + }, + { + "epoch": 0.33578012198628787, + "grad_norm": 0.11687467992305756, + "learning_rate": 8.327994777830067e-05, + "loss": 1.5642423629760742, + "step": 110930 + }, + { + "epoch": 0.3358103915366337, + "grad_norm": 0.12124450504779816, + "learning_rate": 8.327615259666329e-05, + "loss": 1.5680519104003907, + "step": 110940 + }, + { + "epoch": 0.3358406610869796, + "grad_norm": 0.10775662958621979, + "learning_rate": 8.327235741502588e-05, + "loss": 1.5582509994506837, + "step": 110950 + }, + { + "epoch": 0.33587093063732537, + "grad_norm": 0.10970763862133026, + "learning_rate": 8.32685622333885e-05, + "loss": 1.5358161926269531, + "step": 110960 + }, + { + "epoch": 0.3359012001876712, + "grad_norm": 0.1224963590502739, + "learning_rate": 8.326476705175109e-05, + "loss": 1.5251734733581543, + "step": 110970 + }, + { + "epoch": 0.335931469738017, + "grad_norm": 0.11848948150873184, + "learning_rate": 8.326097187011371e-05, + "loss": 1.54844970703125, + "step": 110980 + }, + { + "epoch": 0.33596173928836287, + "grad_norm": 0.11032547056674957, + "learning_rate": 8.32571766884763e-05, + "loss": 1.563725471496582, + "step": 110990 + }, + { + "epoch": 0.3359920088387087, + "grad_norm": 0.12016294151544571, + "learning_rate": 8.325338150683892e-05, + "loss": 1.5286588668823242, + "step": 111000 + }, + { + "epoch": 0.3359920088387087, + "eval_loss": 1.552341103553772, + "eval_runtime": 28.0963, + "eval_samples_per_second": 17.796, + "eval_steps_per_second": 1.139, + "step": 111000 + }, + { + "epoch": 0.3360222783890545, + "grad_norm": 0.10380539298057556, + "learning_rate": 8.324958632520153e-05, + "loss": 1.5617802619934082, + "step": 111010 + }, + { + "epoch": 0.33605254793940037, + "grad_norm": 0.12932096421718597, + "learning_rate": 8.324579114356413e-05, + "loss": 1.5576985359191895, + "step": 111020 + }, + { + "epoch": 0.33608281748974617, + "grad_norm": 0.11241142451763153, + "learning_rate": 8.324199596192674e-05, + "loss": 1.5360298156738281, + "step": 111030 + }, + { + "epoch": 0.336113087040092, + "grad_norm": 0.11299403756856918, + "learning_rate": 8.323820078028935e-05, + "loss": 1.5848987579345704, + "step": 111040 + }, + { + "epoch": 0.33614335659043787, + "grad_norm": 0.11631329357624054, + "learning_rate": 8.323440559865195e-05, + "loss": 1.5949131965637207, + "step": 111050 + }, + { + "epoch": 0.33617362614078367, + "grad_norm": 0.11077162623405457, + "learning_rate": 8.323061041701456e-05, + "loss": 1.5309192657470703, + "step": 111060 + }, + { + "epoch": 0.3362038956911295, + "grad_norm": 0.11430106312036514, + "learning_rate": 8.322681523537718e-05, + "loss": 1.5308677673339843, + "step": 111070 + }, + { + "epoch": 0.3362341652414753, + "grad_norm": 0.09580356627702713, + "learning_rate": 8.322302005373977e-05, + "loss": 1.5525168418884276, + "step": 111080 + }, + { + "epoch": 0.33626443479182117, + "grad_norm": 0.11014488339424133, + "learning_rate": 8.321922487210239e-05, + "loss": 1.5405917167663574, + "step": 111090 + }, + { + "epoch": 0.336294704342167, + "grad_norm": 0.1031746119260788, + "learning_rate": 8.321542969046498e-05, + "loss": 1.573097038269043, + "step": 111100 + }, + { + "epoch": 0.3363249738925128, + "grad_norm": 0.11218687891960144, + "learning_rate": 8.32116345088276e-05, + "loss": 1.5263290405273438, + "step": 111110 + }, + { + "epoch": 0.33635524344285866, + "grad_norm": 0.10710398107767105, + "learning_rate": 8.320783932719019e-05, + "loss": 1.5333430290222168, + "step": 111120 + }, + { + "epoch": 0.33638551299320446, + "grad_norm": 0.11740676313638687, + "learning_rate": 8.320404414555281e-05, + "loss": 1.553253746032715, + "step": 111130 + }, + { + "epoch": 0.3364157825435503, + "grad_norm": 0.10717936605215073, + "learning_rate": 8.320024896391542e-05, + "loss": 1.570179557800293, + "step": 111140 + }, + { + "epoch": 0.33644605209389616, + "grad_norm": 0.11808014661073685, + "learning_rate": 8.319645378227802e-05, + "loss": 1.5453166007995605, + "step": 111150 + }, + { + "epoch": 0.33647632164424196, + "grad_norm": 0.10678138583898544, + "learning_rate": 8.319265860064063e-05, + "loss": 1.5346952438354493, + "step": 111160 + }, + { + "epoch": 0.3365065911945878, + "grad_norm": 0.11159148067235947, + "learning_rate": 8.318886341900324e-05, + "loss": 1.5640024185180663, + "step": 111170 + }, + { + "epoch": 0.3365368607449336, + "grad_norm": 0.11656559258699417, + "learning_rate": 8.318506823736584e-05, + "loss": 1.5762588500976562, + "step": 111180 + }, + { + "epoch": 0.33656713029527946, + "grad_norm": 0.11490477621555328, + "learning_rate": 8.318127305572845e-05, + "loss": 1.5612720489501952, + "step": 111190 + }, + { + "epoch": 0.3365973998456253, + "grad_norm": 0.12131303548812866, + "learning_rate": 8.317747787409105e-05, + "loss": 1.5676128387451171, + "step": 111200 + }, + { + "epoch": 0.3366276693959711, + "grad_norm": 0.12482691556215286, + "learning_rate": 8.317368269245366e-05, + "loss": 1.541788387298584, + "step": 111210 + }, + { + "epoch": 0.33665793894631696, + "grad_norm": 0.11950389295816422, + "learning_rate": 8.316988751081627e-05, + "loss": 1.5549113273620605, + "step": 111220 + }, + { + "epoch": 0.33668820849666276, + "grad_norm": 0.10763613134622574, + "learning_rate": 8.316609232917887e-05, + "loss": 1.5727887153625488, + "step": 111230 + }, + { + "epoch": 0.3367184780470086, + "grad_norm": 0.10695358365774155, + "learning_rate": 8.316229714754148e-05, + "loss": 1.5716357231140137, + "step": 111240 + }, + { + "epoch": 0.33674874759735446, + "grad_norm": 0.1071905568242073, + "learning_rate": 8.31585019659041e-05, + "loss": 1.5141199111938477, + "step": 111250 + }, + { + "epoch": 0.33677901714770025, + "grad_norm": 0.11550989747047424, + "learning_rate": 8.315470678426669e-05, + "loss": 1.5416013717651367, + "step": 111260 + }, + { + "epoch": 0.3368092866980461, + "grad_norm": 0.11161991208791733, + "learning_rate": 8.315091160262931e-05, + "loss": 1.5726161003112793, + "step": 111270 + }, + { + "epoch": 0.3368395562483919, + "grad_norm": 0.1102621778845787, + "learning_rate": 8.314711642099191e-05, + "loss": 1.5267368316650392, + "step": 111280 + }, + { + "epoch": 0.33686982579873775, + "grad_norm": 0.10187079012393951, + "learning_rate": 8.314332123935452e-05, + "loss": 1.5876066207885742, + "step": 111290 + }, + { + "epoch": 0.3369000953490836, + "grad_norm": 0.10898243635892868, + "learning_rate": 8.313952605771713e-05, + "loss": 1.5459172248840332, + "step": 111300 + }, + { + "epoch": 0.3369303648994294, + "grad_norm": 0.11326660960912704, + "learning_rate": 8.313573087607973e-05, + "loss": 1.5422380447387696, + "step": 111310 + }, + { + "epoch": 0.33696063444977525, + "grad_norm": 0.12451761215925217, + "learning_rate": 8.313193569444234e-05, + "loss": 1.5387076377868651, + "step": 111320 + }, + { + "epoch": 0.3369909040001211, + "grad_norm": 0.1104671061038971, + "learning_rate": 8.312814051280494e-05, + "loss": 1.5488166809082031, + "step": 111330 + }, + { + "epoch": 0.3370211735504669, + "grad_norm": 0.10387113690376282, + "learning_rate": 8.312434533116755e-05, + "loss": 1.5719290733337403, + "step": 111340 + }, + { + "epoch": 0.33705144310081275, + "grad_norm": 0.10531473159790039, + "learning_rate": 8.312055014953016e-05, + "loss": 1.5624405860900878, + "step": 111350 + }, + { + "epoch": 0.33708171265115855, + "grad_norm": 0.1113891676068306, + "learning_rate": 8.311675496789276e-05, + "loss": 1.54276762008667, + "step": 111360 + }, + { + "epoch": 0.3371119822015044, + "grad_norm": 0.09145475924015045, + "learning_rate": 8.311295978625537e-05, + "loss": 1.5853805541992188, + "step": 111370 + }, + { + "epoch": 0.33714225175185025, + "grad_norm": 0.12485500425100327, + "learning_rate": 8.310916460461799e-05, + "loss": 1.5041221618652343, + "step": 111380 + }, + { + "epoch": 0.33717252130219605, + "grad_norm": 0.11662283539772034, + "learning_rate": 8.310536942298058e-05, + "loss": 1.5686895370483398, + "step": 111390 + }, + { + "epoch": 0.3372027908525419, + "grad_norm": 0.11288826912641525, + "learning_rate": 8.31015742413432e-05, + "loss": 1.5841533660888671, + "step": 111400 + }, + { + "epoch": 0.3372330604028877, + "grad_norm": 0.10792495310306549, + "learning_rate": 8.309777905970579e-05, + "loss": 1.5511667251586914, + "step": 111410 + }, + { + "epoch": 0.33726332995323355, + "grad_norm": 0.10884613543748856, + "learning_rate": 8.309398387806841e-05, + "loss": 1.5850685119628907, + "step": 111420 + }, + { + "epoch": 0.3372935995035794, + "grad_norm": 0.12826323509216309, + "learning_rate": 8.3090188696431e-05, + "loss": 1.5266172409057617, + "step": 111430 + }, + { + "epoch": 0.3373238690539252, + "grad_norm": 0.10834353417158127, + "learning_rate": 8.308639351479362e-05, + "loss": 1.547147274017334, + "step": 111440 + }, + { + "epoch": 0.33735413860427105, + "grad_norm": 0.13633690774440765, + "learning_rate": 8.308259833315622e-05, + "loss": 1.545871639251709, + "step": 111450 + }, + { + "epoch": 0.33738440815461684, + "grad_norm": 0.11034814268350601, + "learning_rate": 8.307880315151884e-05, + "loss": 1.5534820556640625, + "step": 111460 + }, + { + "epoch": 0.3374146777049627, + "grad_norm": 0.11241936683654785, + "learning_rate": 8.307500796988144e-05, + "loss": 1.540135383605957, + "step": 111470 + }, + { + "epoch": 0.33744494725530855, + "grad_norm": 0.11231846362352371, + "learning_rate": 8.307121278824405e-05, + "loss": 1.549159049987793, + "step": 111480 + }, + { + "epoch": 0.33747521680565434, + "grad_norm": 0.12202200293540955, + "learning_rate": 8.306741760660665e-05, + "loss": 1.5229063034057617, + "step": 111490 + }, + { + "epoch": 0.3375054863560002, + "grad_norm": 0.12549518048763275, + "learning_rate": 8.306362242496926e-05, + "loss": 1.543367290496826, + "step": 111500 + }, + { + "epoch": 0.3375054863560002, + "eval_loss": 1.5557249784469604, + "eval_runtime": 28.159, + "eval_samples_per_second": 17.756, + "eval_steps_per_second": 1.136, + "step": 111500 + }, + { + "epoch": 0.337535755906346, + "grad_norm": 0.12222358584403992, + "learning_rate": 8.305982724333188e-05, + "loss": 1.5151498794555665, + "step": 111510 + }, + { + "epoch": 0.33756602545669184, + "grad_norm": 0.11172331869602203, + "learning_rate": 8.305603206169447e-05, + "loss": 1.533156681060791, + "step": 111520 + }, + { + "epoch": 0.3375962950070377, + "grad_norm": 0.10775793343782425, + "learning_rate": 8.305223688005709e-05, + "loss": 1.5452310562133789, + "step": 111530 + }, + { + "epoch": 0.3376265645573835, + "grad_norm": 0.11305350065231323, + "learning_rate": 8.304844169841968e-05, + "loss": 1.4990092277526856, + "step": 111540 + }, + { + "epoch": 0.33765683410772934, + "grad_norm": 0.114424929022789, + "learning_rate": 8.30446465167823e-05, + "loss": 1.5824580192565918, + "step": 111550 + }, + { + "epoch": 0.33768710365807514, + "grad_norm": 0.1138230487704277, + "learning_rate": 8.30408513351449e-05, + "loss": 1.5479228973388672, + "step": 111560 + }, + { + "epoch": 0.337717373208421, + "grad_norm": 0.10127777606248856, + "learning_rate": 8.303705615350751e-05, + "loss": 1.5027339935302735, + "step": 111570 + }, + { + "epoch": 0.33774764275876684, + "grad_norm": 0.12150021642446518, + "learning_rate": 8.303326097187011e-05, + "loss": 1.556961441040039, + "step": 111580 + }, + { + "epoch": 0.33777791230911264, + "grad_norm": 0.10918215662240982, + "learning_rate": 8.302946579023273e-05, + "loss": 1.5607112884521483, + "step": 111590 + }, + { + "epoch": 0.3378081818594585, + "grad_norm": 0.11218148469924927, + "learning_rate": 8.302567060859532e-05, + "loss": 1.5564313888549806, + "step": 111600 + }, + { + "epoch": 0.3378384514098043, + "grad_norm": 0.10821329802274704, + "learning_rate": 8.302187542695794e-05, + "loss": 1.560265064239502, + "step": 111610 + }, + { + "epoch": 0.33786872096015014, + "grad_norm": 0.10265529155731201, + "learning_rate": 8.301808024532053e-05, + "loss": 1.5652334213256835, + "step": 111620 + }, + { + "epoch": 0.337898990510496, + "grad_norm": 0.10286907106637955, + "learning_rate": 8.301428506368315e-05, + "loss": 1.5426618576049804, + "step": 111630 + }, + { + "epoch": 0.3379292600608418, + "grad_norm": 0.10606898367404938, + "learning_rate": 8.301048988204576e-05, + "loss": 1.5580427169799804, + "step": 111640 + }, + { + "epoch": 0.33795952961118764, + "grad_norm": 0.11722953617572784, + "learning_rate": 8.300669470040836e-05, + "loss": 1.540604019165039, + "step": 111650 + }, + { + "epoch": 0.33798979916153343, + "grad_norm": 0.1269609034061432, + "learning_rate": 8.300289951877097e-05, + "loss": 1.5275375366210937, + "step": 111660 + }, + { + "epoch": 0.3380200687118793, + "grad_norm": 0.1204223707318306, + "learning_rate": 8.299910433713357e-05, + "loss": 1.5545404434204102, + "step": 111670 + }, + { + "epoch": 0.33805033826222514, + "grad_norm": 0.10435327142477036, + "learning_rate": 8.29953091554962e-05, + "loss": 1.5732189178466798, + "step": 111680 + }, + { + "epoch": 0.33808060781257093, + "grad_norm": 0.11176731437444687, + "learning_rate": 8.299151397385879e-05, + "loss": 1.570522403717041, + "step": 111690 + }, + { + "epoch": 0.3381108773629168, + "grad_norm": 0.10214479267597198, + "learning_rate": 8.29877187922214e-05, + "loss": 1.5482696533203124, + "step": 111700 + }, + { + "epoch": 0.3381411469132626, + "grad_norm": 0.12188461422920227, + "learning_rate": 8.2983923610584e-05, + "loss": 1.563735294342041, + "step": 111710 + }, + { + "epoch": 0.33817141646360843, + "grad_norm": 0.11611524969339371, + "learning_rate": 8.298012842894662e-05, + "loss": 1.55814208984375, + "step": 111720 + }, + { + "epoch": 0.3382016860139543, + "grad_norm": 0.10694239288568497, + "learning_rate": 8.297633324730921e-05, + "loss": 1.5385412216186523, + "step": 111730 + }, + { + "epoch": 0.3382319555643001, + "grad_norm": 0.1098765954375267, + "learning_rate": 8.297253806567183e-05, + "loss": 1.5546661376953126, + "step": 111740 + }, + { + "epoch": 0.33826222511464593, + "grad_norm": 0.12147825211286545, + "learning_rate": 8.296874288403444e-05, + "loss": 1.5558226585388184, + "step": 111750 + }, + { + "epoch": 0.3382924946649917, + "grad_norm": 0.10438057035207748, + "learning_rate": 8.296494770239704e-05, + "loss": 1.5913404464721679, + "step": 111760 + }, + { + "epoch": 0.3383227642153376, + "grad_norm": 0.10921104997396469, + "learning_rate": 8.296115252075965e-05, + "loss": 1.5658930778503417, + "step": 111770 + }, + { + "epoch": 0.33835303376568343, + "grad_norm": 0.11914832144975662, + "learning_rate": 8.295735733912225e-05, + "loss": 1.5262039184570313, + "step": 111780 + }, + { + "epoch": 0.3383833033160292, + "grad_norm": 0.10358944535255432, + "learning_rate": 8.295356215748486e-05, + "loss": 1.498898983001709, + "step": 111790 + }, + { + "epoch": 0.3384135728663751, + "grad_norm": 0.13656331598758698, + "learning_rate": 8.294976697584746e-05, + "loss": 1.553373146057129, + "step": 111800 + }, + { + "epoch": 0.3384438424167209, + "grad_norm": 0.12144914269447327, + "learning_rate": 8.294597179421007e-05, + "loss": 1.5313020706176759, + "step": 111810 + }, + { + "epoch": 0.3384741119670667, + "grad_norm": 0.10758458077907562, + "learning_rate": 8.294217661257268e-05, + "loss": 1.5880449295043946, + "step": 111820 + }, + { + "epoch": 0.3385043815174126, + "grad_norm": 0.11882859468460083, + "learning_rate": 8.293838143093528e-05, + "loss": 1.52905216217041, + "step": 111830 + }, + { + "epoch": 0.3385346510677584, + "grad_norm": 0.09464267641305923, + "learning_rate": 8.293458624929789e-05, + "loss": 1.5624802589416504, + "step": 111840 + }, + { + "epoch": 0.3385649206181042, + "grad_norm": 0.0976070836186409, + "learning_rate": 8.29307910676605e-05, + "loss": 1.5612679481506349, + "step": 111850 + }, + { + "epoch": 0.33859519016845, + "grad_norm": 0.1153448224067688, + "learning_rate": 8.29269958860231e-05, + "loss": 1.5472314834594727, + "step": 111860 + }, + { + "epoch": 0.3386254597187959, + "grad_norm": 0.10966090112924576, + "learning_rate": 8.29232007043857e-05, + "loss": 1.5327754974365235, + "step": 111870 + }, + { + "epoch": 0.3386557292691417, + "grad_norm": 0.11454950273036957, + "learning_rate": 8.291940552274833e-05, + "loss": 1.5359716415405273, + "step": 111880 + }, + { + "epoch": 0.3386859988194875, + "grad_norm": 0.10114434361457825, + "learning_rate": 8.291561034111093e-05, + "loss": 1.5450517654418945, + "step": 111890 + }, + { + "epoch": 0.3387162683698334, + "grad_norm": 0.11358701437711716, + "learning_rate": 8.291181515947354e-05, + "loss": 1.5194847106933593, + "step": 111900 + }, + { + "epoch": 0.33874653792017917, + "grad_norm": 0.11163344234228134, + "learning_rate": 8.290801997783614e-05, + "loss": 1.5329065322875977, + "step": 111910 + }, + { + "epoch": 0.338776807470525, + "grad_norm": 0.10767209529876709, + "learning_rate": 8.290422479619875e-05, + "loss": 1.570925235748291, + "step": 111920 + }, + { + "epoch": 0.3388070770208709, + "grad_norm": 0.10550300776958466, + "learning_rate": 8.290042961456136e-05, + "loss": 1.5829689979553223, + "step": 111930 + }, + { + "epoch": 0.33883734657121667, + "grad_norm": 0.10418960452079773, + "learning_rate": 8.289663443292396e-05, + "loss": 1.5273195266723634, + "step": 111940 + }, + { + "epoch": 0.3388676161215625, + "grad_norm": 0.11581622064113617, + "learning_rate": 8.289283925128657e-05, + "loss": 1.544675636291504, + "step": 111950 + }, + { + "epoch": 0.3388978856719083, + "grad_norm": 0.11287892609834671, + "learning_rate": 8.288904406964917e-05, + "loss": 1.5472775459289552, + "step": 111960 + }, + { + "epoch": 0.33892815522225417, + "grad_norm": 0.11249828338623047, + "learning_rate": 8.288524888801178e-05, + "loss": 1.534860610961914, + "step": 111970 + }, + { + "epoch": 0.3389584247726, + "grad_norm": 0.10140936821699142, + "learning_rate": 8.288145370637439e-05, + "loss": 1.5404781341552733, + "step": 111980 + }, + { + "epoch": 0.3389886943229458, + "grad_norm": 0.10836168378591537, + "learning_rate": 8.2877658524737e-05, + "loss": 1.5773391723632812, + "step": 111990 + }, + { + "epoch": 0.33901896387329167, + "grad_norm": 0.11725209653377533, + "learning_rate": 8.28738633430996e-05, + "loss": 1.5634234428405762, + "step": 112000 + }, + { + "epoch": 0.33901896387329167, + "eval_loss": 1.5775736570358276, + "eval_runtime": 28.2278, + "eval_samples_per_second": 17.713, + "eval_steps_per_second": 1.134, + "step": 112000 + }, + { + "epoch": 0.3390492334236375, + "grad_norm": 0.11595287173986435, + "learning_rate": 8.287006816146222e-05, + "loss": 1.5422176361083983, + "step": 112010 + }, + { + "epoch": 0.3390795029739833, + "grad_norm": 0.11241267621517181, + "learning_rate": 8.286627297982481e-05, + "loss": 1.5493172645568847, + "step": 112020 + }, + { + "epoch": 0.33910977252432917, + "grad_norm": 0.11563792079687119, + "learning_rate": 8.286247779818743e-05, + "loss": 1.5418523788452148, + "step": 112030 + }, + { + "epoch": 0.33914004207467496, + "grad_norm": 0.10377643257379532, + "learning_rate": 8.285868261655002e-05, + "loss": 1.5771293640136719, + "step": 112040 + }, + { + "epoch": 0.3391703116250208, + "grad_norm": 0.13011741638183594, + "learning_rate": 8.285488743491264e-05, + "loss": 1.5666056632995606, + "step": 112050 + }, + { + "epoch": 0.33920058117536667, + "grad_norm": 0.10828563570976257, + "learning_rate": 8.285109225327523e-05, + "loss": 1.5440902709960938, + "step": 112060 + }, + { + "epoch": 0.33923085072571246, + "grad_norm": 0.11740902066230774, + "learning_rate": 8.284729707163785e-05, + "loss": 1.5626462936401366, + "step": 112070 + }, + { + "epoch": 0.3392611202760583, + "grad_norm": 0.09918080270290375, + "learning_rate": 8.284350189000046e-05, + "loss": 1.5652189254760742, + "step": 112080 + }, + { + "epoch": 0.3392913898264041, + "grad_norm": 0.11693225800991058, + "learning_rate": 8.283970670836306e-05, + "loss": 1.5446324348449707, + "step": 112090 + }, + { + "epoch": 0.33932165937674996, + "grad_norm": 0.11823077499866486, + "learning_rate": 8.283591152672567e-05, + "loss": 1.5721248626708983, + "step": 112100 + }, + { + "epoch": 0.3393519289270958, + "grad_norm": 0.10761095583438873, + "learning_rate": 8.283211634508828e-05, + "loss": 1.543163013458252, + "step": 112110 + }, + { + "epoch": 0.3393821984774416, + "grad_norm": 0.10746365040540695, + "learning_rate": 8.28283211634509e-05, + "loss": 1.5644073486328125, + "step": 112120 + }, + { + "epoch": 0.33941246802778746, + "grad_norm": 0.10434862226247787, + "learning_rate": 8.282452598181349e-05, + "loss": 1.5510770797729492, + "step": 112130 + }, + { + "epoch": 0.33944273757813326, + "grad_norm": 0.1127530112862587, + "learning_rate": 8.282073080017611e-05, + "loss": 1.5378934860229492, + "step": 112140 + }, + { + "epoch": 0.3394730071284791, + "grad_norm": 0.1174112856388092, + "learning_rate": 8.28169356185387e-05, + "loss": 1.5389513969421387, + "step": 112150 + }, + { + "epoch": 0.33950327667882496, + "grad_norm": 0.11964373290538788, + "learning_rate": 8.281314043690132e-05, + "loss": 1.5269670486450195, + "step": 112160 + }, + { + "epoch": 0.33953354622917076, + "grad_norm": 0.11969422549009323, + "learning_rate": 8.280934525526391e-05, + "loss": 1.567702293395996, + "step": 112170 + }, + { + "epoch": 0.3395638157795166, + "grad_norm": 0.12306946516036987, + "learning_rate": 8.280555007362653e-05, + "loss": 1.571034049987793, + "step": 112180 + }, + { + "epoch": 0.3395940853298624, + "grad_norm": 0.1076774075627327, + "learning_rate": 8.280175489198912e-05, + "loss": 1.5848840713500976, + "step": 112190 + }, + { + "epoch": 0.33962435488020826, + "grad_norm": 0.1185174435377121, + "learning_rate": 8.279795971035174e-05, + "loss": 1.5183720588684082, + "step": 112200 + }, + { + "epoch": 0.3396546244305541, + "grad_norm": 0.10584788024425507, + "learning_rate": 8.279416452871434e-05, + "loss": 1.555307960510254, + "step": 112210 + }, + { + "epoch": 0.3396848939808999, + "grad_norm": 0.11044026911258698, + "learning_rate": 8.279036934707696e-05, + "loss": 1.556056594848633, + "step": 112220 + }, + { + "epoch": 0.33971516353124576, + "grad_norm": 0.11669547855854034, + "learning_rate": 8.278657416543955e-05, + "loss": 1.5882222175598144, + "step": 112230 + }, + { + "epoch": 0.33974543308159155, + "grad_norm": 0.10776166617870331, + "learning_rate": 8.278277898380217e-05, + "loss": 1.5454142570495606, + "step": 112240 + }, + { + "epoch": 0.3397757026319374, + "grad_norm": 0.11840583384037018, + "learning_rate": 8.277898380216477e-05, + "loss": 1.521446132659912, + "step": 112250 + }, + { + "epoch": 0.33980597218228326, + "grad_norm": 0.10121075809001923, + "learning_rate": 8.277518862052738e-05, + "loss": 1.5713730812072755, + "step": 112260 + }, + { + "epoch": 0.33983624173262905, + "grad_norm": 0.11158760637044907, + "learning_rate": 8.277139343888999e-05, + "loss": 1.5257168769836427, + "step": 112270 + }, + { + "epoch": 0.3398665112829749, + "grad_norm": 0.11611245572566986, + "learning_rate": 8.276759825725259e-05, + "loss": 1.5586673736572265, + "step": 112280 + }, + { + "epoch": 0.3398967808333207, + "grad_norm": 0.12423072755336761, + "learning_rate": 8.276380307561521e-05, + "loss": 1.5060588836669921, + "step": 112290 + }, + { + "epoch": 0.33992705038366655, + "grad_norm": 0.10539095103740692, + "learning_rate": 8.27600078939778e-05, + "loss": 1.5599838256835938, + "step": 112300 + }, + { + "epoch": 0.3399573199340124, + "grad_norm": 0.10824764519929886, + "learning_rate": 8.275621271234042e-05, + "loss": 1.552662467956543, + "step": 112310 + }, + { + "epoch": 0.3399875894843582, + "grad_norm": 0.12619774043560028, + "learning_rate": 8.275241753070302e-05, + "loss": 1.5643155097961425, + "step": 112320 + }, + { + "epoch": 0.34001785903470405, + "grad_norm": 0.11704540997743607, + "learning_rate": 8.274862234906563e-05, + "loss": 1.5635242462158203, + "step": 112330 + }, + { + "epoch": 0.34004812858504985, + "grad_norm": 0.1118406131863594, + "learning_rate": 8.274482716742823e-05, + "loss": 1.5737051963806152, + "step": 112340 + }, + { + "epoch": 0.3400783981353957, + "grad_norm": 0.11693616956472397, + "learning_rate": 8.274103198579085e-05, + "loss": 1.5585622787475586, + "step": 112350 + }, + { + "epoch": 0.34010866768574155, + "grad_norm": 0.10649574548006058, + "learning_rate": 8.273723680415345e-05, + "loss": 1.5824738502502442, + "step": 112360 + }, + { + "epoch": 0.34013893723608735, + "grad_norm": 0.09626923501491547, + "learning_rate": 8.273344162251606e-05, + "loss": 1.548594093322754, + "step": 112370 + }, + { + "epoch": 0.3401692067864332, + "grad_norm": 0.105513796210289, + "learning_rate": 8.272964644087866e-05, + "loss": 1.5766563415527344, + "step": 112380 + }, + { + "epoch": 0.340199476336779, + "grad_norm": 0.09846003353595734, + "learning_rate": 8.272585125924127e-05, + "loss": 1.5584508895874023, + "step": 112390 + }, + { + "epoch": 0.34022974588712485, + "grad_norm": 0.09894295036792755, + "learning_rate": 8.272205607760388e-05, + "loss": 1.5777698516845704, + "step": 112400 + }, + { + "epoch": 0.3402600154374707, + "grad_norm": 0.10965729504823685, + "learning_rate": 8.271826089596648e-05, + "loss": 1.5792564392089843, + "step": 112410 + }, + { + "epoch": 0.3402902849878165, + "grad_norm": 0.10771358013153076, + "learning_rate": 8.271446571432909e-05, + "loss": 1.549722385406494, + "step": 112420 + }, + { + "epoch": 0.34032055453816235, + "grad_norm": 0.1184953823685646, + "learning_rate": 8.27106705326917e-05, + "loss": 1.5675546646118164, + "step": 112430 + }, + { + "epoch": 0.34035082408850814, + "grad_norm": 0.1217147707939148, + "learning_rate": 8.27068753510543e-05, + "loss": 1.5277462959289552, + "step": 112440 + }, + { + "epoch": 0.340381093638854, + "grad_norm": 0.1108672097325325, + "learning_rate": 8.27030801694169e-05, + "loss": 1.5814451217651366, + "step": 112450 + }, + { + "epoch": 0.34041136318919984, + "grad_norm": 0.11842767149209976, + "learning_rate": 8.269928498777951e-05, + "loss": 1.5546934127807617, + "step": 112460 + }, + { + "epoch": 0.34044163273954564, + "grad_norm": 0.11587280035018921, + "learning_rate": 8.269548980614212e-05, + "loss": 1.5559881210327149, + "step": 112470 + }, + { + "epoch": 0.3404719022898915, + "grad_norm": 0.11574268341064453, + "learning_rate": 8.269169462450474e-05, + "loss": 1.5449069023132325, + "step": 112480 + }, + { + "epoch": 0.3405021718402373, + "grad_norm": 0.11968383938074112, + "learning_rate": 8.268789944286734e-05, + "loss": 1.5242491722106934, + "step": 112490 + }, + { + "epoch": 0.34053244139058314, + "grad_norm": 0.10742185264825821, + "learning_rate": 8.268410426122995e-05, + "loss": 1.5387605667114257, + "step": 112500 + }, + { + "epoch": 0.34053244139058314, + "eval_loss": 1.5548079013824463, + "eval_runtime": 28.1512, + "eval_samples_per_second": 17.761, + "eval_steps_per_second": 1.137, + "step": 112500 + }, + { + "epoch": 0.340562710940929, + "grad_norm": 0.10380307585000992, + "learning_rate": 8.268030907959256e-05, + "loss": 1.5641109466552734, + "step": 112510 + }, + { + "epoch": 0.3405929804912748, + "grad_norm": 0.10746974498033524, + "learning_rate": 8.267651389795516e-05, + "loss": 1.580215835571289, + "step": 112520 + }, + { + "epoch": 0.34062325004162064, + "grad_norm": 0.11052223294973373, + "learning_rate": 8.267271871631777e-05, + "loss": 1.5464458465576172, + "step": 112530 + }, + { + "epoch": 0.34065351959196644, + "grad_norm": 0.11292585730552673, + "learning_rate": 8.266892353468037e-05, + "loss": 1.5327073097229005, + "step": 112540 + }, + { + "epoch": 0.3406837891423123, + "grad_norm": 0.12387735396623611, + "learning_rate": 8.266512835304298e-05, + "loss": 1.5083656311035156, + "step": 112550 + }, + { + "epoch": 0.34071405869265814, + "grad_norm": 0.11863313615322113, + "learning_rate": 8.266133317140558e-05, + "loss": 1.5318614006042481, + "step": 112560 + }, + { + "epoch": 0.34074432824300394, + "grad_norm": 0.1171584203839302, + "learning_rate": 8.265753798976819e-05, + "loss": 1.5602077484130858, + "step": 112570 + }, + { + "epoch": 0.3407745977933498, + "grad_norm": 0.12081994116306305, + "learning_rate": 8.26537428081308e-05, + "loss": 1.5515605926513671, + "step": 112580 + }, + { + "epoch": 0.3408048673436956, + "grad_norm": 0.10192123055458069, + "learning_rate": 8.26499476264934e-05, + "loss": 1.5249162673950196, + "step": 112590 + }, + { + "epoch": 0.34083513689404143, + "grad_norm": 0.11296963691711426, + "learning_rate": 8.264615244485601e-05, + "loss": 1.5586935043334962, + "step": 112600 + }, + { + "epoch": 0.3408654064443873, + "grad_norm": 0.1174931600689888, + "learning_rate": 8.264235726321861e-05, + "loss": 1.5549489974975585, + "step": 112610 + }, + { + "epoch": 0.3408956759947331, + "grad_norm": 0.103338822722435, + "learning_rate": 8.263856208158123e-05, + "loss": 1.5833117485046386, + "step": 112620 + }, + { + "epoch": 0.34092594554507893, + "grad_norm": 0.11136865615844727, + "learning_rate": 8.263476689994383e-05, + "loss": 1.531633186340332, + "step": 112630 + }, + { + "epoch": 0.34095621509542473, + "grad_norm": 0.11011228710412979, + "learning_rate": 8.263097171830645e-05, + "loss": 1.5552730560302734, + "step": 112640 + }, + { + "epoch": 0.3409864846457706, + "grad_norm": 0.10471894592046738, + "learning_rate": 8.262717653666904e-05, + "loss": 1.5337895393371581, + "step": 112650 + }, + { + "epoch": 0.34101675419611643, + "grad_norm": 0.11970746517181396, + "learning_rate": 8.262338135503166e-05, + "loss": 1.545506191253662, + "step": 112660 + }, + { + "epoch": 0.34104702374646223, + "grad_norm": 0.1086559146642685, + "learning_rate": 8.261958617339425e-05, + "loss": 1.5444022178649903, + "step": 112670 + }, + { + "epoch": 0.3410772932968081, + "grad_norm": 0.10648176819086075, + "learning_rate": 8.261579099175687e-05, + "loss": 1.5381065368652345, + "step": 112680 + }, + { + "epoch": 0.3411075628471539, + "grad_norm": 0.11025211960077286, + "learning_rate": 8.261199581011948e-05, + "loss": 1.512507438659668, + "step": 112690 + }, + { + "epoch": 0.34113783239749973, + "grad_norm": 0.11829427629709244, + "learning_rate": 8.260820062848208e-05, + "loss": 1.5576654434204102, + "step": 112700 + }, + { + "epoch": 0.3411681019478456, + "grad_norm": 0.1055668517947197, + "learning_rate": 8.260440544684469e-05, + "loss": 1.5727783203125, + "step": 112710 + }, + { + "epoch": 0.3411983714981914, + "grad_norm": 0.11845002323389053, + "learning_rate": 8.26006102652073e-05, + "loss": 1.5582446098327636, + "step": 112720 + }, + { + "epoch": 0.34122864104853723, + "grad_norm": 0.13137300312519073, + "learning_rate": 8.259681508356991e-05, + "loss": 1.563417911529541, + "step": 112730 + }, + { + "epoch": 0.3412589105988831, + "grad_norm": 0.11272192746400833, + "learning_rate": 8.25930199019325e-05, + "loss": 1.5173210144042968, + "step": 112740 + }, + { + "epoch": 0.3412891801492289, + "grad_norm": 0.1060873344540596, + "learning_rate": 8.258922472029513e-05, + "loss": 1.5410411834716797, + "step": 112750 + }, + { + "epoch": 0.34131944969957473, + "grad_norm": 0.11185275763273239, + "learning_rate": 8.258542953865772e-05, + "loss": 1.53469820022583, + "step": 112760 + }, + { + "epoch": 0.3413497192499205, + "grad_norm": 0.11055658012628555, + "learning_rate": 8.258163435702034e-05, + "loss": 1.559725284576416, + "step": 112770 + }, + { + "epoch": 0.3413799888002664, + "grad_norm": 0.12532955408096313, + "learning_rate": 8.257783917538293e-05, + "loss": 1.5397357940673828, + "step": 112780 + }, + { + "epoch": 0.34141025835061223, + "grad_norm": 0.10989788174629211, + "learning_rate": 8.257404399374555e-05, + "loss": 1.5837873458862304, + "step": 112790 + }, + { + "epoch": 0.341440527900958, + "grad_norm": 0.10424651205539703, + "learning_rate": 8.257024881210814e-05, + "loss": 1.5545134544372559, + "step": 112800 + }, + { + "epoch": 0.3414707974513039, + "grad_norm": 0.11748314648866653, + "learning_rate": 8.256645363047076e-05, + "loss": 1.562275505065918, + "step": 112810 + }, + { + "epoch": 0.34150106700164967, + "grad_norm": 0.10371940582990646, + "learning_rate": 8.256265844883335e-05, + "loss": 1.5412351608276367, + "step": 112820 + }, + { + "epoch": 0.3415313365519955, + "grad_norm": 0.10360236465930939, + "learning_rate": 8.255886326719597e-05, + "loss": 1.6187042236328124, + "step": 112830 + }, + { + "epoch": 0.3415616061023414, + "grad_norm": 0.12561118602752686, + "learning_rate": 8.255506808555857e-05, + "loss": 1.5891804695129395, + "step": 112840 + }, + { + "epoch": 0.34159187565268717, + "grad_norm": 0.10992337018251419, + "learning_rate": 8.255127290392118e-05, + "loss": 1.531224250793457, + "step": 112850 + }, + { + "epoch": 0.341622145203033, + "grad_norm": 0.10845345258712769, + "learning_rate": 8.254747772228379e-05, + "loss": 1.5568170547485352, + "step": 112860 + }, + { + "epoch": 0.3416524147533788, + "grad_norm": 0.10862688720226288, + "learning_rate": 8.25436825406464e-05, + "loss": 1.5323729515075684, + "step": 112870 + }, + { + "epoch": 0.34168268430372467, + "grad_norm": 0.15766939520835876, + "learning_rate": 8.2539887359009e-05, + "loss": 1.5386003494262694, + "step": 112880 + }, + { + "epoch": 0.3417129538540705, + "grad_norm": 0.12248963117599487, + "learning_rate": 8.253609217737161e-05, + "loss": 1.5681212425231934, + "step": 112890 + }, + { + "epoch": 0.3417432234044163, + "grad_norm": 0.11795379966497421, + "learning_rate": 8.253229699573423e-05, + "loss": 1.5516554832458496, + "step": 112900 + }, + { + "epoch": 0.34177349295476217, + "grad_norm": 0.1330743134021759, + "learning_rate": 8.252850181409682e-05, + "loss": 1.526611614227295, + "step": 112910 + }, + { + "epoch": 0.34180376250510797, + "grad_norm": 0.10763130336999893, + "learning_rate": 8.252470663245944e-05, + "loss": 1.5466675758361816, + "step": 112920 + }, + { + "epoch": 0.3418340320554538, + "grad_norm": 0.11628161370754242, + "learning_rate": 8.252091145082203e-05, + "loss": 1.535309410095215, + "step": 112930 + }, + { + "epoch": 0.34186430160579967, + "grad_norm": 0.12408044934272766, + "learning_rate": 8.251711626918465e-05, + "loss": 1.5396961212158202, + "step": 112940 + }, + { + "epoch": 0.34189457115614547, + "grad_norm": 0.10778218507766724, + "learning_rate": 8.251332108754724e-05, + "loss": 1.5457698822021484, + "step": 112950 + }, + { + "epoch": 0.3419248407064913, + "grad_norm": 0.0964360460639, + "learning_rate": 8.250952590590986e-05, + "loss": 1.5408164978027343, + "step": 112960 + }, + { + "epoch": 0.3419551102568371, + "grad_norm": 0.11160928010940552, + "learning_rate": 8.250573072427246e-05, + "loss": 1.5922292709350585, + "step": 112970 + }, + { + "epoch": 0.34198537980718297, + "grad_norm": 0.10776132345199585, + "learning_rate": 8.250193554263508e-05, + "loss": 1.5692191123962402, + "step": 112980 + }, + { + "epoch": 0.3420156493575288, + "grad_norm": 0.11306195706129074, + "learning_rate": 8.249814036099768e-05, + "loss": 1.5703689575195312, + "step": 112990 + }, + { + "epoch": 0.3420459189078746, + "grad_norm": 0.12136677652597427, + "learning_rate": 8.249434517936029e-05, + "loss": 1.5128597259521483, + "step": 113000 + }, + { + "epoch": 0.3420459189078746, + "eval_loss": 1.5600355863571167, + "eval_runtime": 28.2043, + "eval_samples_per_second": 17.728, + "eval_steps_per_second": 1.135, + "step": 113000 + }, + { + "epoch": 0.34207618845822046, + "grad_norm": 0.11000486463308334, + "learning_rate": 8.24905499977229e-05, + "loss": 1.5657976150512696, + "step": 113010 + }, + { + "epoch": 0.34210645800856626, + "grad_norm": 0.11618854105472565, + "learning_rate": 8.24867548160855e-05, + "loss": 1.5573183059692384, + "step": 113020 + }, + { + "epoch": 0.3421367275589121, + "grad_norm": 0.10317562520503998, + "learning_rate": 8.24829596344481e-05, + "loss": 1.5119503021240235, + "step": 113030 + }, + { + "epoch": 0.34216699710925796, + "grad_norm": 0.11169318109750748, + "learning_rate": 8.247916445281071e-05, + "loss": 1.567086410522461, + "step": 113040 + }, + { + "epoch": 0.34219726665960376, + "grad_norm": 0.107546366751194, + "learning_rate": 8.247536927117332e-05, + "loss": 1.5546934127807617, + "step": 113050 + }, + { + "epoch": 0.3422275362099496, + "grad_norm": 0.10344716906547546, + "learning_rate": 8.247157408953592e-05, + "loss": 1.5503111839294434, + "step": 113060 + }, + { + "epoch": 0.3422578057602954, + "grad_norm": 0.12959952652454376, + "learning_rate": 8.246777890789853e-05, + "loss": 1.5466313362121582, + "step": 113070 + }, + { + "epoch": 0.34228807531064126, + "grad_norm": 0.11125483363866806, + "learning_rate": 8.246398372626114e-05, + "loss": 1.5604351043701172, + "step": 113080 + }, + { + "epoch": 0.3423183448609871, + "grad_norm": 0.10205106437206268, + "learning_rate": 8.246018854462375e-05, + "loss": 1.5369817733764648, + "step": 113090 + }, + { + "epoch": 0.3423486144113329, + "grad_norm": 0.10857799649238586, + "learning_rate": 8.245639336298636e-05, + "loss": 1.5230871200561524, + "step": 113100 + }, + { + "epoch": 0.34237888396167876, + "grad_norm": 0.10258685052394867, + "learning_rate": 8.245259818134897e-05, + "loss": 1.5403623580932617, + "step": 113110 + }, + { + "epoch": 0.34240915351202456, + "grad_norm": 0.11062093079090118, + "learning_rate": 8.244880299971157e-05, + "loss": 1.5871645927429199, + "step": 113120 + }, + { + "epoch": 0.3424394230623704, + "grad_norm": 0.11550971120595932, + "learning_rate": 8.244500781807418e-05, + "loss": 1.5788875579833985, + "step": 113130 + }, + { + "epoch": 0.34246969261271626, + "grad_norm": 0.10868508368730545, + "learning_rate": 8.244121263643678e-05, + "loss": 1.5571992874145508, + "step": 113140 + }, + { + "epoch": 0.34249996216306205, + "grad_norm": 0.11231011152267456, + "learning_rate": 8.243741745479939e-05, + "loss": 1.554062271118164, + "step": 113150 + }, + { + "epoch": 0.3425302317134079, + "grad_norm": 0.10350959748029709, + "learning_rate": 8.2433622273162e-05, + "loss": 1.5459096908569336, + "step": 113160 + }, + { + "epoch": 0.3425605012637537, + "grad_norm": 0.1064138188958168, + "learning_rate": 8.24298270915246e-05, + "loss": 1.5365104675292969, + "step": 113170 + }, + { + "epoch": 0.34259077081409955, + "grad_norm": 0.11891116201877594, + "learning_rate": 8.242603190988721e-05, + "loss": 1.5436599731445313, + "step": 113180 + }, + { + "epoch": 0.3426210403644454, + "grad_norm": 0.11301659792661667, + "learning_rate": 8.242223672824981e-05, + "loss": 1.5123815536499023, + "step": 113190 + }, + { + "epoch": 0.3426513099147912, + "grad_norm": 0.11651924252510071, + "learning_rate": 8.241844154661242e-05, + "loss": 1.5628536224365235, + "step": 113200 + }, + { + "epoch": 0.34268157946513705, + "grad_norm": 0.09640410542488098, + "learning_rate": 8.241464636497503e-05, + "loss": 1.5372915267944336, + "step": 113210 + }, + { + "epoch": 0.34271184901548285, + "grad_norm": 0.10964725911617279, + "learning_rate": 8.241085118333763e-05, + "loss": 1.5095537185668946, + "step": 113220 + }, + { + "epoch": 0.3427421185658287, + "grad_norm": 0.09843658655881882, + "learning_rate": 8.240705600170025e-05, + "loss": 1.5358343124389648, + "step": 113230 + }, + { + "epoch": 0.34277238811617455, + "grad_norm": 0.10650654882192612, + "learning_rate": 8.240326082006284e-05, + "loss": 1.51260347366333, + "step": 113240 + }, + { + "epoch": 0.34280265766652035, + "grad_norm": 0.10275600105524063, + "learning_rate": 8.239946563842546e-05, + "loss": 1.582535171508789, + "step": 113250 + }, + { + "epoch": 0.3428329272168662, + "grad_norm": 0.11991056799888611, + "learning_rate": 8.239567045678806e-05, + "loss": 1.5429752349853516, + "step": 113260 + }, + { + "epoch": 0.342863196767212, + "grad_norm": 0.11363334208726883, + "learning_rate": 8.239187527515068e-05, + "loss": 1.5568223953247071, + "step": 113270 + }, + { + "epoch": 0.34289346631755785, + "grad_norm": 0.10120759904384613, + "learning_rate": 8.238808009351327e-05, + "loss": 1.5456062316894532, + "step": 113280 + }, + { + "epoch": 0.3429237358679037, + "grad_norm": 0.11303368955850601, + "learning_rate": 8.238428491187589e-05, + "loss": 1.5358147621154785, + "step": 113290 + }, + { + "epoch": 0.3429540054182495, + "grad_norm": 0.10661356151103973, + "learning_rate": 8.238048973023849e-05, + "loss": 1.5795753479003907, + "step": 113300 + }, + { + "epoch": 0.34298427496859535, + "grad_norm": 0.09941590577363968, + "learning_rate": 8.23766945486011e-05, + "loss": 1.5411980628967286, + "step": 113310 + }, + { + "epoch": 0.34301454451894114, + "grad_norm": 0.11294852942228317, + "learning_rate": 8.23728993669637e-05, + "loss": 1.560795497894287, + "step": 113320 + }, + { + "epoch": 0.343044814069287, + "grad_norm": 0.11623241007328033, + "learning_rate": 8.236910418532631e-05, + "loss": 1.501305389404297, + "step": 113330 + }, + { + "epoch": 0.34307508361963285, + "grad_norm": 0.11825346201658249, + "learning_rate": 8.236530900368892e-05, + "loss": 1.581961441040039, + "step": 113340 + }, + { + "epoch": 0.34310535316997864, + "grad_norm": 0.12606140971183777, + "learning_rate": 8.236151382205152e-05, + "loss": 1.5298026084899903, + "step": 113350 + }, + { + "epoch": 0.3431356227203245, + "grad_norm": 0.11727332323789597, + "learning_rate": 8.235771864041414e-05, + "loss": 1.5102888107299806, + "step": 113360 + }, + { + "epoch": 0.3431658922706703, + "grad_norm": 0.12134698033332825, + "learning_rate": 8.235392345877673e-05, + "loss": 1.53442440032959, + "step": 113370 + }, + { + "epoch": 0.34319616182101614, + "grad_norm": 0.10370121151208878, + "learning_rate": 8.235012827713935e-05, + "loss": 1.5499378204345704, + "step": 113380 + }, + { + "epoch": 0.343226431371362, + "grad_norm": 0.1252046525478363, + "learning_rate": 8.234633309550195e-05, + "loss": 1.557332992553711, + "step": 113390 + }, + { + "epoch": 0.3432567009217078, + "grad_norm": 0.11406399309635162, + "learning_rate": 8.234253791386457e-05, + "loss": 1.5357125282287598, + "step": 113400 + }, + { + "epoch": 0.34328697047205364, + "grad_norm": 0.1114252507686615, + "learning_rate": 8.233874273222716e-05, + "loss": 1.5097550392150878, + "step": 113410 + }, + { + "epoch": 0.3433172400223995, + "grad_norm": 0.11495660245418549, + "learning_rate": 8.233494755058978e-05, + "loss": 1.5595561981201171, + "step": 113420 + }, + { + "epoch": 0.3433475095727453, + "grad_norm": 0.11285289376974106, + "learning_rate": 8.233115236895237e-05, + "loss": 1.5643039703369142, + "step": 113430 + }, + { + "epoch": 0.34337777912309114, + "grad_norm": 0.11964327096939087, + "learning_rate": 8.232735718731499e-05, + "loss": 1.5847683906555177, + "step": 113440 + }, + { + "epoch": 0.34340804867343694, + "grad_norm": 0.125456765294075, + "learning_rate": 8.232356200567758e-05, + "loss": 1.5430766105651856, + "step": 113450 + }, + { + "epoch": 0.3434383182237828, + "grad_norm": 0.10256822407245636, + "learning_rate": 8.23197668240402e-05, + "loss": 1.537337875366211, + "step": 113460 + }, + { + "epoch": 0.34346858777412864, + "grad_norm": 0.10640078037977219, + "learning_rate": 8.231597164240281e-05, + "loss": 1.526646614074707, + "step": 113470 + }, + { + "epoch": 0.34349885732447444, + "grad_norm": 0.1154962107539177, + "learning_rate": 8.231217646076541e-05, + "loss": 1.5366622924804687, + "step": 113480 + }, + { + "epoch": 0.3435291268748203, + "grad_norm": 0.11137954145669937, + "learning_rate": 8.230838127912802e-05, + "loss": 1.5351232528686523, + "step": 113490 + }, + { + "epoch": 0.3435593964251661, + "grad_norm": 0.11495887488126755, + "learning_rate": 8.230458609749063e-05, + "loss": 1.5291418075561523, + "step": 113500 + }, + { + "epoch": 0.3435593964251661, + "eval_loss": 1.5499993562698364, + "eval_runtime": 28.1113, + "eval_samples_per_second": 17.786, + "eval_steps_per_second": 1.138, + "step": 113500 + }, + { + "epoch": 0.34358966597551194, + "grad_norm": 0.11840999126434326, + "learning_rate": 8.230079091585325e-05, + "loss": 1.5339717864990234, + "step": 113510 + }, + { + "epoch": 0.3436199355258578, + "grad_norm": 0.11151717603206635, + "learning_rate": 8.229699573421584e-05, + "loss": 1.5398243904113769, + "step": 113520 + }, + { + "epoch": 0.3436502050762036, + "grad_norm": 0.10849689692258835, + "learning_rate": 8.229320055257846e-05, + "loss": 1.5294259071350098, + "step": 113530 + }, + { + "epoch": 0.34368047462654944, + "grad_norm": 0.11748957633972168, + "learning_rate": 8.228940537094105e-05, + "loss": 1.5199752807617188, + "step": 113540 + }, + { + "epoch": 0.34371074417689523, + "grad_norm": 0.11150659620761871, + "learning_rate": 8.228561018930367e-05, + "loss": 1.5331711769104004, + "step": 113550 + }, + { + "epoch": 0.3437410137272411, + "grad_norm": 0.1079990491271019, + "learning_rate": 8.228181500766626e-05, + "loss": 1.5694345474243163, + "step": 113560 + }, + { + "epoch": 0.34377128327758694, + "grad_norm": 0.11654652655124664, + "learning_rate": 8.227801982602888e-05, + "loss": 1.5334081649780273, + "step": 113570 + }, + { + "epoch": 0.34380155282793273, + "grad_norm": 0.12146848440170288, + "learning_rate": 8.227422464439147e-05, + "loss": 1.5566682815551758, + "step": 113580 + }, + { + "epoch": 0.3438318223782786, + "grad_norm": 0.1114257425069809, + "learning_rate": 8.227042946275409e-05, + "loss": 1.5510006904602052, + "step": 113590 + }, + { + "epoch": 0.3438620919286244, + "grad_norm": 0.12771324813365936, + "learning_rate": 8.22666342811167e-05, + "loss": 1.5568048477172851, + "step": 113600 + }, + { + "epoch": 0.34389236147897023, + "grad_norm": 0.11267583072185516, + "learning_rate": 8.22628390994793e-05, + "loss": 1.5480427742004395, + "step": 113610 + }, + { + "epoch": 0.3439226310293161, + "grad_norm": 0.12566997110843658, + "learning_rate": 8.225904391784191e-05, + "loss": 1.54982328414917, + "step": 113620 + }, + { + "epoch": 0.3439529005796619, + "grad_norm": 0.09340367466211319, + "learning_rate": 8.225524873620452e-05, + "loss": 1.551753044128418, + "step": 113630 + }, + { + "epoch": 0.34398317013000773, + "grad_norm": 0.11139193922281265, + "learning_rate": 8.225145355456712e-05, + "loss": 1.5269281387329101, + "step": 113640 + }, + { + "epoch": 0.3440134396803535, + "grad_norm": 0.10854550451040268, + "learning_rate": 8.224765837292973e-05, + "loss": 1.5886417388916017, + "step": 113650 + }, + { + "epoch": 0.3440437092306994, + "grad_norm": 0.11085686087608337, + "learning_rate": 8.224386319129233e-05, + "loss": 1.549015998840332, + "step": 113660 + }, + { + "epoch": 0.34407397878104523, + "grad_norm": 0.10743767768144608, + "learning_rate": 8.224006800965494e-05, + "loss": 1.5537532806396483, + "step": 113670 + }, + { + "epoch": 0.344104248331391, + "grad_norm": 0.11206179112195969, + "learning_rate": 8.223627282801755e-05, + "loss": 1.5471141815185547, + "step": 113680 + }, + { + "epoch": 0.3441345178817369, + "grad_norm": 0.10133299231529236, + "learning_rate": 8.223247764638015e-05, + "loss": 1.574486541748047, + "step": 113690 + }, + { + "epoch": 0.3441647874320827, + "grad_norm": 0.12361204624176025, + "learning_rate": 8.222868246474277e-05, + "loss": 1.5712121963500976, + "step": 113700 + }, + { + "epoch": 0.3441950569824285, + "grad_norm": 0.11376991122961044, + "learning_rate": 8.222488728310536e-05, + "loss": 1.56589937210083, + "step": 113710 + }, + { + "epoch": 0.3442253265327744, + "grad_norm": 0.12332667410373688, + "learning_rate": 8.222109210146798e-05, + "loss": 1.565873908996582, + "step": 113720 + }, + { + "epoch": 0.3442555960831202, + "grad_norm": 0.11972975730895996, + "learning_rate": 8.221729691983059e-05, + "loss": 1.5569067001342773, + "step": 113730 + }, + { + "epoch": 0.344285865633466, + "grad_norm": 0.10380086302757263, + "learning_rate": 8.22135017381932e-05, + "loss": 1.551212692260742, + "step": 113740 + }, + { + "epoch": 0.3443161351838118, + "grad_norm": 0.11266782879829407, + "learning_rate": 8.22097065565558e-05, + "loss": 1.5330097198486328, + "step": 113750 + }, + { + "epoch": 0.3443464047341577, + "grad_norm": 0.099466472864151, + "learning_rate": 8.220591137491841e-05, + "loss": 1.5394600868225097, + "step": 113760 + }, + { + "epoch": 0.3443766742845035, + "grad_norm": 0.12327396124601364, + "learning_rate": 8.220211619328101e-05, + "loss": 1.5485206604003907, + "step": 113770 + }, + { + "epoch": 0.3444069438348493, + "grad_norm": 0.10433614253997803, + "learning_rate": 8.219832101164362e-05, + "loss": 1.5366453170776366, + "step": 113780 + }, + { + "epoch": 0.3444372133851952, + "grad_norm": 0.11099225282669067, + "learning_rate": 8.219452583000623e-05, + "loss": 1.5581586837768555, + "step": 113790 + }, + { + "epoch": 0.34446748293554097, + "grad_norm": 0.100807324051857, + "learning_rate": 8.219073064836883e-05, + "loss": 1.552991771697998, + "step": 113800 + }, + { + "epoch": 0.3444977524858868, + "grad_norm": 0.12069308012723923, + "learning_rate": 8.218693546673144e-05, + "loss": 1.5441892623901368, + "step": 113810 + }, + { + "epoch": 0.3445280220362327, + "grad_norm": 0.12788870930671692, + "learning_rate": 8.218314028509404e-05, + "loss": 1.5523554801940918, + "step": 113820 + }, + { + "epoch": 0.34455829158657847, + "grad_norm": 0.12581199407577515, + "learning_rate": 8.217934510345665e-05, + "loss": 1.5315131187438964, + "step": 113830 + }, + { + "epoch": 0.3445885611369243, + "grad_norm": 0.11590871959924698, + "learning_rate": 8.217554992181927e-05, + "loss": 1.5007305145263672, + "step": 113840 + }, + { + "epoch": 0.3446188306872701, + "grad_norm": 0.10397226363420486, + "learning_rate": 8.217175474018186e-05, + "loss": 1.563509464263916, + "step": 113850 + }, + { + "epoch": 0.34464910023761597, + "grad_norm": 0.1182510182261467, + "learning_rate": 8.216795955854448e-05, + "loss": 1.5418581008911132, + "step": 113860 + }, + { + "epoch": 0.3446793697879618, + "grad_norm": 0.10305152833461761, + "learning_rate": 8.216416437690707e-05, + "loss": 1.5229273796081544, + "step": 113870 + }, + { + "epoch": 0.3447096393383076, + "grad_norm": 0.10924948751926422, + "learning_rate": 8.216036919526969e-05, + "loss": 1.5649259567260743, + "step": 113880 + }, + { + "epoch": 0.34473990888865347, + "grad_norm": 0.12421973049640656, + "learning_rate": 8.215657401363228e-05, + "loss": 1.5646894454956055, + "step": 113890 + }, + { + "epoch": 0.34477017843899926, + "grad_norm": 0.10715369135141373, + "learning_rate": 8.21527788319949e-05, + "loss": 1.5352319717407226, + "step": 113900 + }, + { + "epoch": 0.3448004479893451, + "grad_norm": 0.11293147504329681, + "learning_rate": 8.214898365035751e-05, + "loss": 1.5460302352905273, + "step": 113910 + }, + { + "epoch": 0.34483071753969097, + "grad_norm": 0.12648847699165344, + "learning_rate": 8.214518846872012e-05, + "loss": 1.5152149200439453, + "step": 113920 + }, + { + "epoch": 0.34486098709003676, + "grad_norm": 0.10682683438062668, + "learning_rate": 8.214139328708272e-05, + "loss": 1.5699872970581055, + "step": 113930 + }, + { + "epoch": 0.3448912566403826, + "grad_norm": 0.10215946286916733, + "learning_rate": 8.213759810544533e-05, + "loss": 1.5724171638488769, + "step": 113940 + }, + { + "epoch": 0.3449215261907284, + "grad_norm": 0.10315641760826111, + "learning_rate": 8.213380292380793e-05, + "loss": 1.5415914535522461, + "step": 113950 + }, + { + "epoch": 0.34495179574107426, + "grad_norm": 0.10254257917404175, + "learning_rate": 8.213000774217054e-05, + "loss": 1.5721750259399414, + "step": 113960 + }, + { + "epoch": 0.3449820652914201, + "grad_norm": 0.10857440531253815, + "learning_rate": 8.212621256053316e-05, + "loss": 1.585007095336914, + "step": 113970 + }, + { + "epoch": 0.3450123348417659, + "grad_norm": 0.12114433944225311, + "learning_rate": 8.212241737889575e-05, + "loss": 1.5630012512207032, + "step": 113980 + }, + { + "epoch": 0.34504260439211176, + "grad_norm": 0.11074238270521164, + "learning_rate": 8.211862219725837e-05, + "loss": 1.5387822151184083, + "step": 113990 + }, + { + "epoch": 0.34507287394245756, + "grad_norm": 0.11560465395450592, + "learning_rate": 8.211482701562096e-05, + "loss": 1.5399627685546875, + "step": 114000 + }, + { + "epoch": 0.34507287394245756, + "eval_loss": 1.5266203880310059, + "eval_runtime": 28.0067, + "eval_samples_per_second": 17.853, + "eval_steps_per_second": 1.143, + "step": 114000 + }, + { + "epoch": 0.3451031434928034, + "grad_norm": 0.1027488261461258, + "learning_rate": 8.211103183398358e-05, + "loss": 1.5630712509155273, + "step": 114010 + }, + { + "epoch": 0.34513341304314926, + "grad_norm": 0.10088122636079788, + "learning_rate": 8.210723665234618e-05, + "loss": 1.5568683624267579, + "step": 114020 + }, + { + "epoch": 0.34516368259349506, + "grad_norm": 0.11298540979623795, + "learning_rate": 8.21034414707088e-05, + "loss": 1.5649019241333009, + "step": 114030 + }, + { + "epoch": 0.3451939521438409, + "grad_norm": 0.11323502659797668, + "learning_rate": 8.209964628907139e-05, + "loss": 1.573890781402588, + "step": 114040 + }, + { + "epoch": 0.3452242216941867, + "grad_norm": 0.11323880404233932, + "learning_rate": 8.209585110743401e-05, + "loss": 1.5392890930175782, + "step": 114050 + }, + { + "epoch": 0.34525449124453256, + "grad_norm": 0.11388684809207916, + "learning_rate": 8.20920559257966e-05, + "loss": 1.5380983352661133, + "step": 114060 + }, + { + "epoch": 0.3452847607948784, + "grad_norm": 0.11615724861621857, + "learning_rate": 8.208826074415922e-05, + "loss": 1.583035659790039, + "step": 114070 + }, + { + "epoch": 0.3453150303452242, + "grad_norm": 0.10375328361988068, + "learning_rate": 8.208446556252181e-05, + "loss": 1.5708717346191405, + "step": 114080 + }, + { + "epoch": 0.34534529989557006, + "grad_norm": 0.12358730286359787, + "learning_rate": 8.208067038088443e-05, + "loss": 1.5800518989562988, + "step": 114090 + }, + { + "epoch": 0.34537556944591585, + "grad_norm": 0.1268094778060913, + "learning_rate": 8.207687519924704e-05, + "loss": 1.5419345855712892, + "step": 114100 + }, + { + "epoch": 0.3454058389962617, + "grad_norm": 0.10587864369153976, + "learning_rate": 8.207308001760964e-05, + "loss": 1.6022018432617187, + "step": 114110 + }, + { + "epoch": 0.34543610854660756, + "grad_norm": 0.1152617484331131, + "learning_rate": 8.206928483597226e-05, + "loss": 1.547142505645752, + "step": 114120 + }, + { + "epoch": 0.34546637809695335, + "grad_norm": 0.10654909163713455, + "learning_rate": 8.206548965433485e-05, + "loss": 1.569260025024414, + "step": 114130 + }, + { + "epoch": 0.3454966476472992, + "grad_norm": 0.10639423877000809, + "learning_rate": 8.206169447269747e-05, + "loss": 1.543818473815918, + "step": 114140 + }, + { + "epoch": 0.34552691719764506, + "grad_norm": 0.11445478349924088, + "learning_rate": 8.205789929106007e-05, + "loss": 1.538779067993164, + "step": 114150 + }, + { + "epoch": 0.34555718674799085, + "grad_norm": 0.11318904161453247, + "learning_rate": 8.205410410942269e-05, + "loss": 1.5443460464477539, + "step": 114160 + }, + { + "epoch": 0.3455874562983367, + "grad_norm": 0.11445572227239609, + "learning_rate": 8.205030892778528e-05, + "loss": 1.5353775024414062, + "step": 114170 + }, + { + "epoch": 0.3456177258486825, + "grad_norm": 0.11182455718517303, + "learning_rate": 8.20465137461479e-05, + "loss": 1.5634122848510743, + "step": 114180 + }, + { + "epoch": 0.34564799539902835, + "grad_norm": 0.1078961119055748, + "learning_rate": 8.204271856451049e-05, + "loss": 1.534108543395996, + "step": 114190 + }, + { + "epoch": 0.3456782649493742, + "grad_norm": 0.1076040267944336, + "learning_rate": 8.203892338287311e-05, + "loss": 1.5336764335632325, + "step": 114200 + }, + { + "epoch": 0.34570853449972, + "grad_norm": 0.11187849938869476, + "learning_rate": 8.203512820123572e-05, + "loss": 1.534587287902832, + "step": 114210 + }, + { + "epoch": 0.34573880405006585, + "grad_norm": 0.10535034537315369, + "learning_rate": 8.203133301959832e-05, + "loss": 1.5401765823364257, + "step": 114220 + }, + { + "epoch": 0.34576907360041165, + "grad_norm": 0.09968730062246323, + "learning_rate": 8.202753783796093e-05, + "loss": 1.592494010925293, + "step": 114230 + }, + { + "epoch": 0.3457993431507575, + "grad_norm": 0.10912331193685532, + "learning_rate": 8.202374265632353e-05, + "loss": 1.5075319290161133, + "step": 114240 + }, + { + "epoch": 0.34582961270110335, + "grad_norm": 0.11637597531080246, + "learning_rate": 8.201994747468614e-05, + "loss": 1.5533303260803222, + "step": 114250 + }, + { + "epoch": 0.34585988225144915, + "grad_norm": 0.10601435601711273, + "learning_rate": 8.201615229304875e-05, + "loss": 1.544313907623291, + "step": 114260 + }, + { + "epoch": 0.345890151801795, + "grad_norm": 0.11711001396179199, + "learning_rate": 8.201235711141135e-05, + "loss": 1.518801784515381, + "step": 114270 + }, + { + "epoch": 0.3459204213521408, + "grad_norm": 0.10820426046848297, + "learning_rate": 8.200856192977396e-05, + "loss": 1.5743768692016602, + "step": 114280 + }, + { + "epoch": 0.34595069090248665, + "grad_norm": 0.1035388633608818, + "learning_rate": 8.200476674813656e-05, + "loss": 1.5473924636840821, + "step": 114290 + }, + { + "epoch": 0.3459809604528325, + "grad_norm": 0.10346098989248276, + "learning_rate": 8.200097156649917e-05, + "loss": 1.4804146766662598, + "step": 114300 + }, + { + "epoch": 0.3460112300031783, + "grad_norm": 0.0978798046708107, + "learning_rate": 8.199717638486179e-05, + "loss": 1.5445998191833497, + "step": 114310 + }, + { + "epoch": 0.34604149955352415, + "grad_norm": 0.12126898020505905, + "learning_rate": 8.199338120322438e-05, + "loss": 1.5597146987915038, + "step": 114320 + }, + { + "epoch": 0.34607176910386994, + "grad_norm": 0.11533761769533157, + "learning_rate": 8.1989586021587e-05, + "loss": 1.584981346130371, + "step": 114330 + }, + { + "epoch": 0.3461020386542158, + "grad_norm": 0.116157665848732, + "learning_rate": 8.19857908399496e-05, + "loss": 1.5572723388671874, + "step": 114340 + }, + { + "epoch": 0.34613230820456165, + "grad_norm": 0.1251891404390335, + "learning_rate": 8.198199565831221e-05, + "loss": 1.5471255302429199, + "step": 114350 + }, + { + "epoch": 0.34616257775490744, + "grad_norm": 0.10505328327417374, + "learning_rate": 8.197820047667482e-05, + "loss": 1.563544464111328, + "step": 114360 + }, + { + "epoch": 0.3461928473052533, + "grad_norm": 0.11484270542860031, + "learning_rate": 8.197440529503742e-05, + "loss": 1.5452348709106445, + "step": 114370 + }, + { + "epoch": 0.3462231168555991, + "grad_norm": 0.11546546220779419, + "learning_rate": 8.197061011340003e-05, + "loss": 1.5338626861572267, + "step": 114380 + }, + { + "epoch": 0.34625338640594494, + "grad_norm": 0.11468487232923508, + "learning_rate": 8.196681493176264e-05, + "loss": 1.5546769142150878, + "step": 114390 + }, + { + "epoch": 0.3462836559562908, + "grad_norm": 0.13098359107971191, + "learning_rate": 8.196301975012524e-05, + "loss": 1.5061918258666993, + "step": 114400 + }, + { + "epoch": 0.3463139255066366, + "grad_norm": 0.12438579648733139, + "learning_rate": 8.195922456848785e-05, + "loss": 1.5346384048461914, + "step": 114410 + }, + { + "epoch": 0.34634419505698244, + "grad_norm": 0.1292995810508728, + "learning_rate": 8.195542938685045e-05, + "loss": 1.5716800689697266, + "step": 114420 + }, + { + "epoch": 0.34637446460732824, + "grad_norm": 0.11535672098398209, + "learning_rate": 8.195163420521306e-05, + "loss": 1.5586215019226075, + "step": 114430 + }, + { + "epoch": 0.3464047341576741, + "grad_norm": 0.09915538877248764, + "learning_rate": 8.194783902357567e-05, + "loss": 1.554744052886963, + "step": 114440 + }, + { + "epoch": 0.34643500370801994, + "grad_norm": 0.11785702407360077, + "learning_rate": 8.194404384193827e-05, + "loss": 1.5771408081054688, + "step": 114450 + }, + { + "epoch": 0.34646527325836574, + "grad_norm": 0.11323380470275879, + "learning_rate": 8.194024866030088e-05, + "loss": 1.5614333152770996, + "step": 114460 + }, + { + "epoch": 0.3464955428087116, + "grad_norm": 0.10033658146858215, + "learning_rate": 8.19364534786635e-05, + "loss": 1.5433876037597656, + "step": 114470 + }, + { + "epoch": 0.3465258123590574, + "grad_norm": 0.10164307802915573, + "learning_rate": 8.193265829702609e-05, + "loss": 1.532973289489746, + "step": 114480 + }, + { + "epoch": 0.34655608190940324, + "grad_norm": 0.10115101933479309, + "learning_rate": 8.192886311538871e-05, + "loss": 1.54762601852417, + "step": 114490 + }, + { + "epoch": 0.3465863514597491, + "grad_norm": 0.11568359285593033, + "learning_rate": 8.19250679337513e-05, + "loss": 1.566659164428711, + "step": 114500 + }, + { + "epoch": 0.3465863514597491, + "eval_loss": 1.5402847528457642, + "eval_runtime": 28.4402, + "eval_samples_per_second": 17.581, + "eval_steps_per_second": 1.125, + "step": 114500 + }, + { + "epoch": 0.3466166210100949, + "grad_norm": 0.10038360208272934, + "learning_rate": 8.192127275211392e-05, + "loss": 1.5424839019775392, + "step": 114510 + }, + { + "epoch": 0.34664689056044073, + "grad_norm": 0.1101004108786583, + "learning_rate": 8.191747757047653e-05, + "loss": 1.5358379364013672, + "step": 114520 + }, + { + "epoch": 0.34667716011078653, + "grad_norm": 0.11789222061634064, + "learning_rate": 8.191368238883913e-05, + "loss": 1.5314214706420899, + "step": 114530 + }, + { + "epoch": 0.3467074296611324, + "grad_norm": 0.09772457927465439, + "learning_rate": 8.190988720720174e-05, + "loss": 1.5682729721069335, + "step": 114540 + }, + { + "epoch": 0.34673769921147823, + "grad_norm": 0.11939311027526855, + "learning_rate": 8.190609202556435e-05, + "loss": 1.5627775192260742, + "step": 114550 + }, + { + "epoch": 0.34676796876182403, + "grad_norm": 0.10769281536340714, + "learning_rate": 8.190229684392695e-05, + "loss": 1.5456132888793945, + "step": 114560 + }, + { + "epoch": 0.3467982383121699, + "grad_norm": 0.09783390164375305, + "learning_rate": 8.189850166228956e-05, + "loss": 1.5728116035461426, + "step": 114570 + }, + { + "epoch": 0.3468285078625157, + "grad_norm": 0.12743113934993744, + "learning_rate": 8.189470648065218e-05, + "loss": 1.543100357055664, + "step": 114580 + }, + { + "epoch": 0.34685877741286153, + "grad_norm": 0.10659096390008926, + "learning_rate": 8.189091129901477e-05, + "loss": 1.5256364822387696, + "step": 114590 + }, + { + "epoch": 0.3468890469632074, + "grad_norm": 0.12340978533029556, + "learning_rate": 8.188711611737739e-05, + "loss": 1.5472984313964844, + "step": 114600 + }, + { + "epoch": 0.3469193165135532, + "grad_norm": 0.10448917001485825, + "learning_rate": 8.188332093573998e-05, + "loss": 1.5852519989013671, + "step": 114610 + }, + { + "epoch": 0.34694958606389903, + "grad_norm": 0.10763595998287201, + "learning_rate": 8.18795257541026e-05, + "loss": 1.5468015670776367, + "step": 114620 + }, + { + "epoch": 0.3469798556142448, + "grad_norm": 0.10770562291145325, + "learning_rate": 8.187573057246519e-05, + "loss": 1.5420955657958983, + "step": 114630 + }, + { + "epoch": 0.3470101251645907, + "grad_norm": 0.11886940896511078, + "learning_rate": 8.187193539082781e-05, + "loss": 1.5587051391601563, + "step": 114640 + }, + { + "epoch": 0.34704039471493653, + "grad_norm": 0.1169392466545105, + "learning_rate": 8.18681402091904e-05, + "loss": 1.537592315673828, + "step": 114650 + }, + { + "epoch": 0.3470706642652823, + "grad_norm": 0.11077699810266495, + "learning_rate": 8.186434502755302e-05, + "loss": 1.5198049545288086, + "step": 114660 + }, + { + "epoch": 0.3471009338156282, + "grad_norm": 0.12094049155712128, + "learning_rate": 8.186054984591562e-05, + "loss": 1.5763988494873047, + "step": 114670 + }, + { + "epoch": 0.347131203365974, + "grad_norm": 0.11592935025691986, + "learning_rate": 8.185675466427824e-05, + "loss": 1.5352142333984375, + "step": 114680 + }, + { + "epoch": 0.3471614729163198, + "grad_norm": 0.12489645183086395, + "learning_rate": 8.185295948264083e-05, + "loss": 1.5780662536621093, + "step": 114690 + }, + { + "epoch": 0.3471917424666657, + "grad_norm": 0.11291420459747314, + "learning_rate": 8.184916430100345e-05, + "loss": 1.5638340950012206, + "step": 114700 + }, + { + "epoch": 0.34722201201701147, + "grad_norm": 0.10967476665973663, + "learning_rate": 8.184536911936605e-05, + "loss": 1.5594947814941407, + "step": 114710 + }, + { + "epoch": 0.3472522815673573, + "grad_norm": 0.10897624492645264, + "learning_rate": 8.184157393772866e-05, + "loss": 1.551940631866455, + "step": 114720 + }, + { + "epoch": 0.3472825511177031, + "grad_norm": 0.10485775768756866, + "learning_rate": 8.183777875609128e-05, + "loss": 1.546843910217285, + "step": 114730 + }, + { + "epoch": 0.34731282066804897, + "grad_norm": 0.13317979872226715, + "learning_rate": 8.183398357445387e-05, + "loss": 1.5233720779418944, + "step": 114740 + }, + { + "epoch": 0.3473430902183948, + "grad_norm": 0.1140693873167038, + "learning_rate": 8.183018839281649e-05, + "loss": 1.538187313079834, + "step": 114750 + }, + { + "epoch": 0.3473733597687406, + "grad_norm": 0.11463115364313126, + "learning_rate": 8.182639321117908e-05, + "loss": 1.53765811920166, + "step": 114760 + }, + { + "epoch": 0.34740362931908647, + "grad_norm": 0.10614625364542007, + "learning_rate": 8.18225980295417e-05, + "loss": 1.5558961868286132, + "step": 114770 + }, + { + "epoch": 0.34743389886943227, + "grad_norm": 0.12891553342342377, + "learning_rate": 8.18188028479043e-05, + "loss": 1.5015060424804687, + "step": 114780 + }, + { + "epoch": 0.3474641684197781, + "grad_norm": 0.12200554460287094, + "learning_rate": 8.181500766626692e-05, + "loss": 1.5623196601867675, + "step": 114790 + }, + { + "epoch": 0.34749443797012397, + "grad_norm": 0.11548402160406113, + "learning_rate": 8.181121248462951e-05, + "loss": 1.587274932861328, + "step": 114800 + }, + { + "epoch": 0.34752470752046977, + "grad_norm": 0.10609462112188339, + "learning_rate": 8.180741730299213e-05, + "loss": 1.5615103721618653, + "step": 114810 + }, + { + "epoch": 0.3475549770708156, + "grad_norm": 0.12140347808599472, + "learning_rate": 8.180362212135472e-05, + "loss": 1.5384058952331543, + "step": 114820 + }, + { + "epoch": 0.34758524662116147, + "grad_norm": 0.10832623392343521, + "learning_rate": 8.179982693971734e-05, + "loss": 1.5788320541381835, + "step": 114830 + }, + { + "epoch": 0.34761551617150727, + "grad_norm": 0.11288990825414658, + "learning_rate": 8.179603175807994e-05, + "loss": 1.5172540664672851, + "step": 114840 + }, + { + "epoch": 0.3476457857218531, + "grad_norm": 0.11738721281290054, + "learning_rate": 8.179223657644255e-05, + "loss": 1.5201705932617187, + "step": 114850 + }, + { + "epoch": 0.3476760552721989, + "grad_norm": 0.10535185039043427, + "learning_rate": 8.178844139480516e-05, + "loss": 1.5554766654968262, + "step": 114860 + }, + { + "epoch": 0.34770632482254477, + "grad_norm": 0.10800962150096893, + "learning_rate": 8.178464621316776e-05, + "loss": 1.5509668350219727, + "step": 114870 + }, + { + "epoch": 0.3477365943728906, + "grad_norm": 0.10635542869567871, + "learning_rate": 8.178085103153037e-05, + "loss": 1.5537790298461913, + "step": 114880 + }, + { + "epoch": 0.3477668639232364, + "grad_norm": 0.10908117145299911, + "learning_rate": 8.177705584989297e-05, + "loss": 1.5482260704040527, + "step": 114890 + }, + { + "epoch": 0.34779713347358227, + "grad_norm": 0.10691504925489426, + "learning_rate": 8.177326066825558e-05, + "loss": 1.56536865234375, + "step": 114900 + }, + { + "epoch": 0.34782740302392806, + "grad_norm": 0.11928685754537582, + "learning_rate": 8.176946548661819e-05, + "loss": 1.5635356903076172, + "step": 114910 + }, + { + "epoch": 0.3478576725742739, + "grad_norm": 0.11952587962150574, + "learning_rate": 8.17656703049808e-05, + "loss": 1.5600024223327638, + "step": 114920 + }, + { + "epoch": 0.34788794212461976, + "grad_norm": 0.10619081556797028, + "learning_rate": 8.17618751233434e-05, + "loss": 1.5512483596801758, + "step": 114930 + }, + { + "epoch": 0.34791821167496556, + "grad_norm": 0.11010315269231796, + "learning_rate": 8.175807994170602e-05, + "loss": 1.5353029251098633, + "step": 114940 + }, + { + "epoch": 0.3479484812253114, + "grad_norm": 0.12460380047559738, + "learning_rate": 8.175428476006862e-05, + "loss": 1.5510164260864259, + "step": 114950 + }, + { + "epoch": 0.3479787507756572, + "grad_norm": 0.11682308465242386, + "learning_rate": 8.175048957843123e-05, + "loss": 1.5526182174682617, + "step": 114960 + }, + { + "epoch": 0.34800902032600306, + "grad_norm": 0.11524612456560135, + "learning_rate": 8.174669439679384e-05, + "loss": 1.551541519165039, + "step": 114970 + }, + { + "epoch": 0.3480392898763489, + "grad_norm": 0.1056981235742569, + "learning_rate": 8.174289921515644e-05, + "loss": 1.5680898666381835, + "step": 114980 + }, + { + "epoch": 0.3480695594266947, + "grad_norm": 0.127493217587471, + "learning_rate": 8.173910403351905e-05, + "loss": 1.549384880065918, + "step": 114990 + }, + { + "epoch": 0.34809982897704056, + "grad_norm": 0.13557493686676025, + "learning_rate": 8.173530885188165e-05, + "loss": 1.5654600143432618, + "step": 115000 + }, + { + "epoch": 0.34809982897704056, + "eval_loss": 1.5562899112701416, + "eval_runtime": 28.3996, + "eval_samples_per_second": 17.606, + "eval_steps_per_second": 1.127, + "step": 115000 + }, + { + "epoch": 0.34813009852738636, + "grad_norm": 0.10388673096895218, + "learning_rate": 8.173151367024426e-05, + "loss": 1.5464887619018555, + "step": 115010 + }, + { + "epoch": 0.3481603680777322, + "grad_norm": 0.12780416011810303, + "learning_rate": 8.172771848860687e-05, + "loss": 1.5364545822143554, + "step": 115020 + }, + { + "epoch": 0.34819063762807806, + "grad_norm": 0.10545925796031952, + "learning_rate": 8.172392330696947e-05, + "loss": 1.5819616317749023, + "step": 115030 + }, + { + "epoch": 0.34822090717842386, + "grad_norm": 0.0958024337887764, + "learning_rate": 8.172012812533208e-05, + "loss": 1.5617003440856934, + "step": 115040 + }, + { + "epoch": 0.3482511767287697, + "grad_norm": 0.11429962515830994, + "learning_rate": 8.171633294369468e-05, + "loss": 1.561635398864746, + "step": 115050 + }, + { + "epoch": 0.3482814462791155, + "grad_norm": 0.11441723257303238, + "learning_rate": 8.171253776205729e-05, + "loss": 1.5499774932861328, + "step": 115060 + }, + { + "epoch": 0.34831171582946135, + "grad_norm": 0.10417946428060532, + "learning_rate": 8.17087425804199e-05, + "loss": 1.577174758911133, + "step": 115070 + }, + { + "epoch": 0.3483419853798072, + "grad_norm": 0.10594546049833298, + "learning_rate": 8.170494739878251e-05, + "loss": 1.554180908203125, + "step": 115080 + }, + { + "epoch": 0.348372254930153, + "grad_norm": 0.11596428602933884, + "learning_rate": 8.170115221714511e-05, + "loss": 1.5715351104736328, + "step": 115090 + }, + { + "epoch": 0.34840252448049885, + "grad_norm": 0.10948649793863297, + "learning_rate": 8.169735703550773e-05, + "loss": 1.5978153228759766, + "step": 115100 + }, + { + "epoch": 0.34843279403084465, + "grad_norm": 0.10686524957418442, + "learning_rate": 8.169356185387032e-05, + "loss": 1.5473591804504394, + "step": 115110 + }, + { + "epoch": 0.3484630635811905, + "grad_norm": 0.12029226124286652, + "learning_rate": 8.168976667223294e-05, + "loss": 1.5398344993591309, + "step": 115120 + }, + { + "epoch": 0.34849333313153635, + "grad_norm": 0.11334892362356186, + "learning_rate": 8.168597149059554e-05, + "loss": 1.56099853515625, + "step": 115130 + }, + { + "epoch": 0.34852360268188215, + "grad_norm": 0.10697115957736969, + "learning_rate": 8.168217630895815e-05, + "loss": 1.5708047866821289, + "step": 115140 + }, + { + "epoch": 0.348553872232228, + "grad_norm": 0.11262315511703491, + "learning_rate": 8.167838112732076e-05, + "loss": 1.5554004669189454, + "step": 115150 + }, + { + "epoch": 0.3485841417825738, + "grad_norm": 0.10700418800115585, + "learning_rate": 8.167458594568336e-05, + "loss": 1.5091423034667968, + "step": 115160 + }, + { + "epoch": 0.34861441133291965, + "grad_norm": 0.11312159895896912, + "learning_rate": 8.167079076404597e-05, + "loss": 1.5842880249023437, + "step": 115170 + }, + { + "epoch": 0.3486446808832655, + "grad_norm": 0.11434954404830933, + "learning_rate": 8.166699558240857e-05, + "loss": 1.5377370834350585, + "step": 115180 + }, + { + "epoch": 0.3486749504336113, + "grad_norm": 0.10849221050739288, + "learning_rate": 8.16632004007712e-05, + "loss": 1.5465662956237793, + "step": 115190 + }, + { + "epoch": 0.34870521998395715, + "grad_norm": 0.10762923955917358, + "learning_rate": 8.165940521913379e-05, + "loss": 1.590591812133789, + "step": 115200 + }, + { + "epoch": 0.34873548953430294, + "grad_norm": 0.09823522716760635, + "learning_rate": 8.16556100374964e-05, + "loss": 1.5510244369506836, + "step": 115210 + }, + { + "epoch": 0.3487657590846488, + "grad_norm": 0.11598179489374161, + "learning_rate": 8.1651814855859e-05, + "loss": 1.5462632179260254, + "step": 115220 + }, + { + "epoch": 0.34879602863499465, + "grad_norm": 0.1293971836566925, + "learning_rate": 8.164801967422162e-05, + "loss": 1.5483052253723144, + "step": 115230 + }, + { + "epoch": 0.34882629818534044, + "grad_norm": 0.1095220148563385, + "learning_rate": 8.164422449258421e-05, + "loss": 1.5613594055175781, + "step": 115240 + }, + { + "epoch": 0.3488565677356863, + "grad_norm": 0.11483293771743774, + "learning_rate": 8.164042931094683e-05, + "loss": 1.583589744567871, + "step": 115250 + }, + { + "epoch": 0.3488868372860321, + "grad_norm": 0.11738141626119614, + "learning_rate": 8.163663412930942e-05, + "loss": 1.5630704879760742, + "step": 115260 + }, + { + "epoch": 0.34891710683637794, + "grad_norm": 0.11356476694345474, + "learning_rate": 8.163283894767204e-05, + "loss": 1.569369411468506, + "step": 115270 + }, + { + "epoch": 0.3489473763867238, + "grad_norm": 0.1046757623553276, + "learning_rate": 8.162904376603463e-05, + "loss": 1.5377281188964844, + "step": 115280 + }, + { + "epoch": 0.3489776459370696, + "grad_norm": 0.10603249818086624, + "learning_rate": 8.162524858439725e-05, + "loss": 1.566320037841797, + "step": 115290 + }, + { + "epoch": 0.34900791548741544, + "grad_norm": 0.10579895228147507, + "learning_rate": 8.162145340275985e-05, + "loss": 1.510287094116211, + "step": 115300 + }, + { + "epoch": 0.34903818503776124, + "grad_norm": 0.10983327776193619, + "learning_rate": 8.161765822112247e-05, + "loss": 1.5130719184875487, + "step": 115310 + }, + { + "epoch": 0.3490684545881071, + "grad_norm": 0.13097718358039856, + "learning_rate": 8.161386303948507e-05, + "loss": 1.5525495529174804, + "step": 115320 + }, + { + "epoch": 0.34909872413845294, + "grad_norm": 0.11307166516780853, + "learning_rate": 8.161006785784768e-05, + "loss": 1.5330076217651367, + "step": 115330 + }, + { + "epoch": 0.34912899368879874, + "grad_norm": 0.11894198507070541, + "learning_rate": 8.16062726762103e-05, + "loss": 1.5211559295654298, + "step": 115340 + }, + { + "epoch": 0.3491592632391446, + "grad_norm": 0.1013600155711174, + "learning_rate": 8.160247749457289e-05, + "loss": 1.5448674201965331, + "step": 115350 + }, + { + "epoch": 0.3491895327894904, + "grad_norm": 0.11054769903421402, + "learning_rate": 8.159868231293551e-05, + "loss": 1.505851936340332, + "step": 115360 + }, + { + "epoch": 0.34921980233983624, + "grad_norm": 0.12336968630552292, + "learning_rate": 8.15948871312981e-05, + "loss": 1.571314811706543, + "step": 115370 + }, + { + "epoch": 0.3492500718901821, + "grad_norm": 0.09750834852457047, + "learning_rate": 8.159109194966072e-05, + "loss": 1.574721908569336, + "step": 115380 + }, + { + "epoch": 0.3492803414405279, + "grad_norm": 0.11187414824962616, + "learning_rate": 8.158729676802331e-05, + "loss": 1.5437880516052247, + "step": 115390 + }, + { + "epoch": 0.34931061099087374, + "grad_norm": 0.12246581166982651, + "learning_rate": 8.158350158638593e-05, + "loss": 1.5142499923706054, + "step": 115400 + }, + { + "epoch": 0.34934088054121953, + "grad_norm": 0.10863666236400604, + "learning_rate": 8.157970640474852e-05, + "loss": 1.523172378540039, + "step": 115410 + }, + { + "epoch": 0.3493711500915654, + "grad_norm": 0.11766966432332993, + "learning_rate": 8.157591122311114e-05, + "loss": 1.5369115829467774, + "step": 115420 + }, + { + "epoch": 0.34940141964191124, + "grad_norm": 0.10549464076757431, + "learning_rate": 8.157211604147374e-05, + "loss": 1.5567508697509767, + "step": 115430 + }, + { + "epoch": 0.34943168919225703, + "grad_norm": 0.12388282269239426, + "learning_rate": 8.156832085983636e-05, + "loss": 1.5188188552856445, + "step": 115440 + }, + { + "epoch": 0.3494619587426029, + "grad_norm": 0.12920939922332764, + "learning_rate": 8.156452567819896e-05, + "loss": 1.5476568222045899, + "step": 115450 + }, + { + "epoch": 0.3494922282929487, + "grad_norm": 0.1163044422864914, + "learning_rate": 8.156073049656157e-05, + "loss": 1.5613181114196777, + "step": 115460 + }, + { + "epoch": 0.34952249784329453, + "grad_norm": 0.09950166195631027, + "learning_rate": 8.155693531492417e-05, + "loss": 1.5993873596191406, + "step": 115470 + }, + { + "epoch": 0.3495527673936404, + "grad_norm": 0.11008954793214798, + "learning_rate": 8.155314013328678e-05, + "loss": 1.5358405113220215, + "step": 115480 + }, + { + "epoch": 0.3495830369439862, + "grad_norm": 0.10394155234098434, + "learning_rate": 8.154934495164939e-05, + "loss": 1.5557012557983398, + "step": 115490 + }, + { + "epoch": 0.34961330649433203, + "grad_norm": 0.09735383093357086, + "learning_rate": 8.154554977001199e-05, + "loss": 1.5514210700988769, + "step": 115500 + }, + { + "epoch": 0.34961330649433203, + "eval_loss": 1.5497307777404785, + "eval_runtime": 27.8129, + "eval_samples_per_second": 17.977, + "eval_steps_per_second": 1.151, + "step": 115500 + }, + { + "epoch": 0.3496435760446779, + "grad_norm": 0.11117827892303467, + "learning_rate": 8.15417545883746e-05, + "loss": 1.509957981109619, + "step": 115510 + }, + { + "epoch": 0.3496738455950237, + "grad_norm": 0.10875048488378525, + "learning_rate": 8.15379594067372e-05, + "loss": 1.5491077423095703, + "step": 115520 + }, + { + "epoch": 0.34970411514536953, + "grad_norm": 0.11234467476606369, + "learning_rate": 8.153416422509982e-05, + "loss": 1.5424004554748536, + "step": 115530 + }, + { + "epoch": 0.34973438469571533, + "grad_norm": 0.11498042941093445, + "learning_rate": 8.153036904346242e-05, + "loss": 1.5422853469848632, + "step": 115540 + }, + { + "epoch": 0.3497646542460612, + "grad_norm": 0.10614578425884247, + "learning_rate": 8.152657386182504e-05, + "loss": 1.5661161422729493, + "step": 115550 + }, + { + "epoch": 0.34979492379640703, + "grad_norm": 0.11626644432544708, + "learning_rate": 8.152277868018763e-05, + "loss": 1.5454317092895509, + "step": 115560 + }, + { + "epoch": 0.3498251933467528, + "grad_norm": 0.11650706827640533, + "learning_rate": 8.151898349855025e-05, + "loss": 1.5468242645263672, + "step": 115570 + }, + { + "epoch": 0.3498554628970987, + "grad_norm": 0.1126556545495987, + "learning_rate": 8.151518831691285e-05, + "loss": 1.5469476699829101, + "step": 115580 + }, + { + "epoch": 0.3498857324474445, + "grad_norm": 0.12408701330423355, + "learning_rate": 8.151139313527546e-05, + "loss": 1.5812746047973634, + "step": 115590 + }, + { + "epoch": 0.3499160019977903, + "grad_norm": 0.09535133838653564, + "learning_rate": 8.150759795363806e-05, + "loss": 1.54893798828125, + "step": 115600 + }, + { + "epoch": 0.3499462715481362, + "grad_norm": 0.10887310653924942, + "learning_rate": 8.150380277200067e-05, + "loss": 1.5738683700561524, + "step": 115610 + }, + { + "epoch": 0.349976541098482, + "grad_norm": 0.10657015442848206, + "learning_rate": 8.150000759036328e-05, + "loss": 1.5649107933044433, + "step": 115620 + }, + { + "epoch": 0.3500068106488278, + "grad_norm": 0.12341603636741638, + "learning_rate": 8.149621240872588e-05, + "loss": 1.5639987945556642, + "step": 115630 + }, + { + "epoch": 0.3500370801991736, + "grad_norm": 0.11751145869493484, + "learning_rate": 8.149241722708849e-05, + "loss": 1.554435634613037, + "step": 115640 + }, + { + "epoch": 0.3500673497495195, + "grad_norm": 0.10894903540611267, + "learning_rate": 8.14886220454511e-05, + "loss": 1.5363687515258788, + "step": 115650 + }, + { + "epoch": 0.3500976192998653, + "grad_norm": 0.10809887945652008, + "learning_rate": 8.14848268638137e-05, + "loss": 1.5705928802490234, + "step": 115660 + }, + { + "epoch": 0.3501278888502111, + "grad_norm": 0.10693210363388062, + "learning_rate": 8.14810316821763e-05, + "loss": 1.5442509651184082, + "step": 115670 + }, + { + "epoch": 0.350158158400557, + "grad_norm": 0.11665411293506622, + "learning_rate": 8.147723650053891e-05, + "loss": 1.5828163146972656, + "step": 115680 + }, + { + "epoch": 0.35018842795090277, + "grad_norm": 0.10228244215250015, + "learning_rate": 8.147344131890153e-05, + "loss": 1.5627123832702636, + "step": 115690 + }, + { + "epoch": 0.3502186975012486, + "grad_norm": 0.10365840792655945, + "learning_rate": 8.146964613726412e-05, + "loss": 1.539426040649414, + "step": 115700 + }, + { + "epoch": 0.3502489670515945, + "grad_norm": 0.10881578177213669, + "learning_rate": 8.146585095562674e-05, + "loss": 1.538622760772705, + "step": 115710 + }, + { + "epoch": 0.35027923660194027, + "grad_norm": 0.1033654659986496, + "learning_rate": 8.146205577398934e-05, + "loss": 1.5678049087524415, + "step": 115720 + }, + { + "epoch": 0.3503095061522861, + "grad_norm": 0.1003948375582695, + "learning_rate": 8.145826059235196e-05, + "loss": 1.5365776062011718, + "step": 115730 + }, + { + "epoch": 0.3503397757026319, + "grad_norm": 0.1045340746641159, + "learning_rate": 8.145446541071456e-05, + "loss": 1.5773748397827148, + "step": 115740 + }, + { + "epoch": 0.35037004525297777, + "grad_norm": 0.11391831189393997, + "learning_rate": 8.145067022907717e-05, + "loss": 1.5642656326293944, + "step": 115750 + }, + { + "epoch": 0.3504003148033236, + "grad_norm": 0.10398385673761368, + "learning_rate": 8.144687504743977e-05, + "loss": 1.5405763626098632, + "step": 115760 + }, + { + "epoch": 0.3504305843536694, + "grad_norm": 0.10779151320457458, + "learning_rate": 8.144307986580238e-05, + "loss": 1.5497047424316406, + "step": 115770 + }, + { + "epoch": 0.35046085390401527, + "grad_norm": 0.11011253297328949, + "learning_rate": 8.143928468416499e-05, + "loss": 1.538458251953125, + "step": 115780 + }, + { + "epoch": 0.35049112345436106, + "grad_norm": 0.13276565074920654, + "learning_rate": 8.143548950252759e-05, + "loss": 1.5538055419921875, + "step": 115790 + }, + { + "epoch": 0.3505213930047069, + "grad_norm": 0.10982127487659454, + "learning_rate": 8.14316943208902e-05, + "loss": 1.5300817489624023, + "step": 115800 + }, + { + "epoch": 0.35055166255505277, + "grad_norm": 0.11114419251680374, + "learning_rate": 8.14278991392528e-05, + "loss": 1.5592370986938477, + "step": 115810 + }, + { + "epoch": 0.35058193210539856, + "grad_norm": 0.10444442927837372, + "learning_rate": 8.142410395761542e-05, + "loss": 1.5617180824279786, + "step": 115820 + }, + { + "epoch": 0.3506122016557444, + "grad_norm": 0.10497213900089264, + "learning_rate": 8.142030877597802e-05, + "loss": 1.573464584350586, + "step": 115830 + }, + { + "epoch": 0.3506424712060902, + "grad_norm": 0.11699441820383072, + "learning_rate": 8.141651359434063e-05, + "loss": 1.5207239151000977, + "step": 115840 + }, + { + "epoch": 0.35067274075643606, + "grad_norm": 0.11441045254468918, + "learning_rate": 8.141271841270323e-05, + "loss": 1.5670841217041016, + "step": 115850 + }, + { + "epoch": 0.3507030103067819, + "grad_norm": 0.1174272894859314, + "learning_rate": 8.140892323106585e-05, + "loss": 1.5555601119995117, + "step": 115860 + }, + { + "epoch": 0.3507332798571277, + "grad_norm": 0.12061025202274323, + "learning_rate": 8.140512804942844e-05, + "loss": 1.5314598083496094, + "step": 115870 + }, + { + "epoch": 0.35076354940747356, + "grad_norm": 0.11122078448534012, + "learning_rate": 8.140133286779106e-05, + "loss": 1.5499916076660156, + "step": 115880 + }, + { + "epoch": 0.35079381895781936, + "grad_norm": 0.10573960840702057, + "learning_rate": 8.139753768615365e-05, + "loss": 1.5686476707458497, + "step": 115890 + }, + { + "epoch": 0.3508240885081652, + "grad_norm": 0.10657400637865067, + "learning_rate": 8.139374250451627e-05, + "loss": 1.5546296119689942, + "step": 115900 + }, + { + "epoch": 0.35085435805851106, + "grad_norm": 0.11101938039064407, + "learning_rate": 8.138994732287886e-05, + "loss": 1.5656688690185547, + "step": 115910 + }, + { + "epoch": 0.35088462760885686, + "grad_norm": 0.1125943660736084, + "learning_rate": 8.138615214124148e-05, + "loss": 1.541491985321045, + "step": 115920 + }, + { + "epoch": 0.3509148971592027, + "grad_norm": 0.11390543729066849, + "learning_rate": 8.13823569596041e-05, + "loss": 1.5492921829223634, + "step": 115930 + }, + { + "epoch": 0.3509451667095485, + "grad_norm": 0.10789213329553604, + "learning_rate": 8.13785617779667e-05, + "loss": 1.5363456726074218, + "step": 115940 + }, + { + "epoch": 0.35097543625989436, + "grad_norm": 0.11138736456632614, + "learning_rate": 8.137476659632931e-05, + "loss": 1.5333471298217773, + "step": 115950 + }, + { + "epoch": 0.3510057058102402, + "grad_norm": 0.110197514295578, + "learning_rate": 8.13709714146919e-05, + "loss": 1.5716157913208009, + "step": 115960 + }, + { + "epoch": 0.351035975360586, + "grad_norm": 0.11575590074062347, + "learning_rate": 8.136717623305453e-05, + "loss": 1.5178274154663085, + "step": 115970 + }, + { + "epoch": 0.35106624491093186, + "grad_norm": 0.10860932618379593, + "learning_rate": 8.136338105141712e-05, + "loss": 1.554461669921875, + "step": 115980 + }, + { + "epoch": 0.35109651446127765, + "grad_norm": 0.1162826344370842, + "learning_rate": 8.135958586977974e-05, + "loss": 1.5325891494750976, + "step": 115990 + }, + { + "epoch": 0.3511267840116235, + "grad_norm": 0.1114468052983284, + "learning_rate": 8.135579068814233e-05, + "loss": 1.5395150184631348, + "step": 116000 + }, + { + "epoch": 0.3511267840116235, + "eval_loss": 1.5432929992675781, + "eval_runtime": 28.2381, + "eval_samples_per_second": 17.707, + "eval_steps_per_second": 1.133, + "step": 116000 + }, + { + "epoch": 0.35115705356196936, + "grad_norm": 0.12840092182159424, + "learning_rate": 8.135199550650495e-05, + "loss": 1.5398854255676269, + "step": 116010 + }, + { + "epoch": 0.35118732311231515, + "grad_norm": 0.10838059335947037, + "learning_rate": 8.134820032486754e-05, + "loss": 1.5204190254211425, + "step": 116020 + }, + { + "epoch": 0.351217592662661, + "grad_norm": 0.12019508332014084, + "learning_rate": 8.134440514323016e-05, + "loss": 1.5374749183654786, + "step": 116030 + }, + { + "epoch": 0.3512478622130068, + "grad_norm": 0.11761113256216049, + "learning_rate": 8.134060996159275e-05, + "loss": 1.539820384979248, + "step": 116040 + }, + { + "epoch": 0.35127813176335265, + "grad_norm": 0.10577341169118881, + "learning_rate": 8.133681477995537e-05, + "loss": 1.5351895332336425, + "step": 116050 + }, + { + "epoch": 0.3513084013136985, + "grad_norm": 0.11524684727191925, + "learning_rate": 8.133301959831798e-05, + "loss": 1.5341894149780273, + "step": 116060 + }, + { + "epoch": 0.3513386708640443, + "grad_norm": 0.10900822281837463, + "learning_rate": 8.132922441668059e-05, + "loss": 1.5523475646972655, + "step": 116070 + }, + { + "epoch": 0.35136894041439015, + "grad_norm": 0.12007908523082733, + "learning_rate": 8.132542923504319e-05, + "loss": 1.5220420837402344, + "step": 116080 + }, + { + "epoch": 0.35139920996473595, + "grad_norm": 0.10799164324998856, + "learning_rate": 8.13216340534058e-05, + "loss": 1.553741455078125, + "step": 116090 + }, + { + "epoch": 0.3514294795150818, + "grad_norm": 0.11709444224834442, + "learning_rate": 8.13178388717684e-05, + "loss": 1.5194194793701172, + "step": 116100 + }, + { + "epoch": 0.35145974906542765, + "grad_norm": 0.14154799282550812, + "learning_rate": 8.131404369013101e-05, + "loss": 1.5615230560302735, + "step": 116110 + }, + { + "epoch": 0.35149001861577345, + "grad_norm": 0.12149035930633545, + "learning_rate": 8.131024850849361e-05, + "loss": 1.5409660339355469, + "step": 116120 + }, + { + "epoch": 0.3515202881661193, + "grad_norm": 0.12079236656427383, + "learning_rate": 8.130645332685622e-05, + "loss": 1.5599534034729003, + "step": 116130 + }, + { + "epoch": 0.3515505577164651, + "grad_norm": 0.10492508858442307, + "learning_rate": 8.130265814521884e-05, + "loss": 1.534787082672119, + "step": 116140 + }, + { + "epoch": 0.35158082726681095, + "grad_norm": 0.11758754402399063, + "learning_rate": 8.129886296358143e-05, + "loss": 1.5785451889038087, + "step": 116150 + }, + { + "epoch": 0.3516110968171568, + "grad_norm": 0.11693596094846725, + "learning_rate": 8.129506778194405e-05, + "loss": 1.5501861572265625, + "step": 116160 + }, + { + "epoch": 0.3516413663675026, + "grad_norm": 0.11541333794593811, + "learning_rate": 8.129127260030664e-05, + "loss": 1.5578025817871093, + "step": 116170 + }, + { + "epoch": 0.35167163591784845, + "grad_norm": 0.12558794021606445, + "learning_rate": 8.128747741866926e-05, + "loss": 1.5423103332519532, + "step": 116180 + }, + { + "epoch": 0.35170190546819424, + "grad_norm": 0.10863244533538818, + "learning_rate": 8.128368223703187e-05, + "loss": 1.5287546157836913, + "step": 116190 + }, + { + "epoch": 0.3517321750185401, + "grad_norm": 0.12366731464862823, + "learning_rate": 8.127988705539448e-05, + "loss": 1.5686448097229004, + "step": 116200 + }, + { + "epoch": 0.35176244456888595, + "grad_norm": 0.11789864301681519, + "learning_rate": 8.127609187375708e-05, + "loss": 1.5424827575683593, + "step": 116210 + }, + { + "epoch": 0.35179271411923174, + "grad_norm": 0.11119237542152405, + "learning_rate": 8.127229669211969e-05, + "loss": 1.5583338737487793, + "step": 116220 + }, + { + "epoch": 0.3518229836695776, + "grad_norm": 0.10304149985313416, + "learning_rate": 8.12685015104823e-05, + "loss": 1.557259750366211, + "step": 116230 + }, + { + "epoch": 0.35185325321992345, + "grad_norm": 0.11416561901569366, + "learning_rate": 8.12647063288449e-05, + "loss": 1.5328261375427246, + "step": 116240 + }, + { + "epoch": 0.35188352277026924, + "grad_norm": 0.10489648580551147, + "learning_rate": 8.12609111472075e-05, + "loss": 1.5527873992919923, + "step": 116250 + }, + { + "epoch": 0.3519137923206151, + "grad_norm": 0.11117430031299591, + "learning_rate": 8.125711596557011e-05, + "loss": 1.5772367477416993, + "step": 116260 + }, + { + "epoch": 0.3519440618709609, + "grad_norm": 0.10749296098947525, + "learning_rate": 8.125332078393272e-05, + "loss": 1.5609000205993653, + "step": 116270 + }, + { + "epoch": 0.35197433142130674, + "grad_norm": 0.10843974351882935, + "learning_rate": 8.124952560229532e-05, + "loss": 1.5424649238586425, + "step": 116280 + }, + { + "epoch": 0.3520046009716526, + "grad_norm": 0.1033255010843277, + "learning_rate": 8.124573042065793e-05, + "loss": 1.5134210586547852, + "step": 116290 + }, + { + "epoch": 0.3520348705219984, + "grad_norm": 0.1146908700466156, + "learning_rate": 8.124193523902055e-05, + "loss": 1.5400413513183593, + "step": 116300 + }, + { + "epoch": 0.35206514007234424, + "grad_norm": 0.09725040197372437, + "learning_rate": 8.123814005738314e-05, + "loss": 1.5317121505737306, + "step": 116310 + }, + { + "epoch": 0.35209540962269004, + "grad_norm": 0.10039086639881134, + "learning_rate": 8.123434487574576e-05, + "loss": 1.5303258895874023, + "step": 116320 + }, + { + "epoch": 0.3521256791730359, + "grad_norm": 0.10435637086629868, + "learning_rate": 8.123054969410835e-05, + "loss": 1.5198529243469239, + "step": 116330 + }, + { + "epoch": 0.35215594872338174, + "grad_norm": 0.10747870802879333, + "learning_rate": 8.122675451247097e-05, + "loss": 1.531204891204834, + "step": 116340 + }, + { + "epoch": 0.35218621827372754, + "grad_norm": 0.10494881123304367, + "learning_rate": 8.122295933083358e-05, + "loss": 1.5581064224243164, + "step": 116350 + }, + { + "epoch": 0.3522164878240734, + "grad_norm": 0.12092122435569763, + "learning_rate": 8.121916414919618e-05, + "loss": 1.5459524154663087, + "step": 116360 + }, + { + "epoch": 0.3522467573744192, + "grad_norm": 0.12624810636043549, + "learning_rate": 8.121536896755879e-05, + "loss": 1.5628791809082032, + "step": 116370 + }, + { + "epoch": 0.35227702692476504, + "grad_norm": 0.1160685271024704, + "learning_rate": 8.12115737859214e-05, + "loss": 1.551163673400879, + "step": 116380 + }, + { + "epoch": 0.3523072964751109, + "grad_norm": 0.10364405065774918, + "learning_rate": 8.1207778604284e-05, + "loss": 1.5230998992919922, + "step": 116390 + }, + { + "epoch": 0.3523375660254567, + "grad_norm": 0.10772712528705597, + "learning_rate": 8.120398342264661e-05, + "loss": 1.5401880264282226, + "step": 116400 + }, + { + "epoch": 0.35236783557580253, + "grad_norm": 0.11278712004423141, + "learning_rate": 8.120018824100921e-05, + "loss": 1.5501066207885743, + "step": 116410 + }, + { + "epoch": 0.35239810512614833, + "grad_norm": 0.10805637389421463, + "learning_rate": 8.119639305937182e-05, + "loss": 1.587669563293457, + "step": 116420 + }, + { + "epoch": 0.3524283746764942, + "grad_norm": 0.10675302147865295, + "learning_rate": 8.119259787773444e-05, + "loss": 1.5730798721313477, + "step": 116430 + }, + { + "epoch": 0.35245864422684003, + "grad_norm": 0.09726641327142715, + "learning_rate": 8.118880269609703e-05, + "loss": 1.5741540908813476, + "step": 116440 + }, + { + "epoch": 0.35248891377718583, + "grad_norm": 0.12590840458869934, + "learning_rate": 8.118500751445965e-05, + "loss": 1.5757932662963867, + "step": 116450 + }, + { + "epoch": 0.3525191833275317, + "grad_norm": 0.10305875539779663, + "learning_rate": 8.118121233282224e-05, + "loss": 1.5286096572875976, + "step": 116460 + }, + { + "epoch": 0.3525494528778775, + "grad_norm": 0.11605048179626465, + "learning_rate": 8.117741715118486e-05, + "loss": 1.5880245208740233, + "step": 116470 + }, + { + "epoch": 0.35257972242822333, + "grad_norm": 0.09836923331022263, + "learning_rate": 8.117362196954746e-05, + "loss": 1.5286304473876953, + "step": 116480 + }, + { + "epoch": 0.3526099919785692, + "grad_norm": 0.12926433980464935, + "learning_rate": 8.116982678791008e-05, + "loss": 1.5834043502807618, + "step": 116490 + }, + { + "epoch": 0.352640261528915, + "grad_norm": 0.12925484776496887, + "learning_rate": 8.116603160627267e-05, + "loss": 1.569377326965332, + "step": 116500 + }, + { + "epoch": 0.352640261528915, + "eval_loss": 1.5420193672180176, + "eval_runtime": 27.9037, + "eval_samples_per_second": 17.919, + "eval_steps_per_second": 1.147, + "step": 116500 + }, + { + "epoch": 0.35267053107926083, + "grad_norm": 0.1144455298781395, + "learning_rate": 8.116223642463529e-05, + "loss": 1.559107780456543, + "step": 116510 + }, + { + "epoch": 0.3527008006296066, + "grad_norm": 0.10659563541412354, + "learning_rate": 8.115844124299788e-05, + "loss": 1.5606240272521972, + "step": 116520 + }, + { + "epoch": 0.3527310701799525, + "grad_norm": 0.11247869580984116, + "learning_rate": 8.11546460613605e-05, + "loss": 1.5754525184631347, + "step": 116530 + }, + { + "epoch": 0.35276133973029833, + "grad_norm": 0.1292455941438675, + "learning_rate": 8.11508508797231e-05, + "loss": 1.5885831832885742, + "step": 116540 + }, + { + "epoch": 0.3527916092806441, + "grad_norm": 0.10752569139003754, + "learning_rate": 8.114705569808571e-05, + "loss": 1.5461033821105956, + "step": 116550 + }, + { + "epoch": 0.35282187883099, + "grad_norm": 0.10443295538425446, + "learning_rate": 8.114326051644833e-05, + "loss": 1.525766944885254, + "step": 116560 + }, + { + "epoch": 0.3528521483813358, + "grad_norm": 0.11641719937324524, + "learning_rate": 8.113946533481092e-05, + "loss": 1.5370458602905273, + "step": 116570 + }, + { + "epoch": 0.3528824179316816, + "grad_norm": 0.11469447612762451, + "learning_rate": 8.113567015317354e-05, + "loss": 1.562420654296875, + "step": 116580 + }, + { + "epoch": 0.3529126874820275, + "grad_norm": 0.10878154635429382, + "learning_rate": 8.113187497153614e-05, + "loss": 1.5895294189453124, + "step": 116590 + }, + { + "epoch": 0.3529429570323733, + "grad_norm": 0.10290996730327606, + "learning_rate": 8.112807978989875e-05, + "loss": 1.5567935943603515, + "step": 116600 + }, + { + "epoch": 0.3529732265827191, + "grad_norm": 0.10854531079530716, + "learning_rate": 8.112428460826135e-05, + "loss": 1.5791379928588867, + "step": 116610 + }, + { + "epoch": 0.3530034961330649, + "grad_norm": 0.1121574342250824, + "learning_rate": 8.112048942662397e-05, + "loss": 1.5758584976196288, + "step": 116620 + }, + { + "epoch": 0.35303376568341077, + "grad_norm": 0.12605522572994232, + "learning_rate": 8.111669424498656e-05, + "loss": 1.5330083847045899, + "step": 116630 + }, + { + "epoch": 0.3530640352337566, + "grad_norm": 0.11570524424314499, + "learning_rate": 8.111289906334918e-05, + "loss": 1.5600811004638673, + "step": 116640 + }, + { + "epoch": 0.3530943047841024, + "grad_norm": 0.11916638910770416, + "learning_rate": 8.110910388171177e-05, + "loss": 1.5246994018554687, + "step": 116650 + }, + { + "epoch": 0.35312457433444827, + "grad_norm": 0.12647077441215515, + "learning_rate": 8.110530870007439e-05, + "loss": 1.5610491752624511, + "step": 116660 + }, + { + "epoch": 0.35315484388479407, + "grad_norm": 0.10095149278640747, + "learning_rate": 8.110151351843698e-05, + "loss": 1.556015682220459, + "step": 116670 + }, + { + "epoch": 0.3531851134351399, + "grad_norm": 0.10925806313753128, + "learning_rate": 8.10977183367996e-05, + "loss": 1.541293239593506, + "step": 116680 + }, + { + "epoch": 0.35321538298548577, + "grad_norm": 0.1268284171819687, + "learning_rate": 8.109392315516221e-05, + "loss": 1.573346996307373, + "step": 116690 + }, + { + "epoch": 0.35324565253583157, + "grad_norm": 0.12206058949232101, + "learning_rate": 8.109012797352481e-05, + "loss": 1.5850841522216796, + "step": 116700 + }, + { + "epoch": 0.3532759220861774, + "grad_norm": 0.12012016028165817, + "learning_rate": 8.108633279188742e-05, + "loss": 1.5225875854492188, + "step": 116710 + }, + { + "epoch": 0.3533061916365232, + "grad_norm": 0.0945289358496666, + "learning_rate": 8.108253761025003e-05, + "loss": 1.5578492164611817, + "step": 116720 + }, + { + "epoch": 0.35333646118686907, + "grad_norm": 0.10042925179004669, + "learning_rate": 8.107874242861263e-05, + "loss": 1.5589529037475587, + "step": 116730 + }, + { + "epoch": 0.3533667307372149, + "grad_norm": 0.1083127111196518, + "learning_rate": 8.107494724697524e-05, + "loss": 1.529630184173584, + "step": 116740 + }, + { + "epoch": 0.3533970002875607, + "grad_norm": 0.11662591248750687, + "learning_rate": 8.107115206533786e-05, + "loss": 1.563600730895996, + "step": 116750 + }, + { + "epoch": 0.35342726983790657, + "grad_norm": 0.11510861665010452, + "learning_rate": 8.106735688370045e-05, + "loss": 1.5582837104797362, + "step": 116760 + }, + { + "epoch": 0.35345753938825236, + "grad_norm": 0.1097324788570404, + "learning_rate": 8.106356170206307e-05, + "loss": 1.579455852508545, + "step": 116770 + }, + { + "epoch": 0.3534878089385982, + "grad_norm": 0.10553288459777832, + "learning_rate": 8.105976652042566e-05, + "loss": 1.5189204216003418, + "step": 116780 + }, + { + "epoch": 0.35351807848894407, + "grad_norm": 0.11482713371515274, + "learning_rate": 8.105597133878828e-05, + "loss": 1.543268871307373, + "step": 116790 + }, + { + "epoch": 0.35354834803928986, + "grad_norm": 0.10337066650390625, + "learning_rate": 8.105217615715089e-05, + "loss": 1.5707196235656737, + "step": 116800 + }, + { + "epoch": 0.3535786175896357, + "grad_norm": 0.1116461455821991, + "learning_rate": 8.10483809755135e-05, + "loss": 1.5589926719665528, + "step": 116810 + }, + { + "epoch": 0.3536088871399815, + "grad_norm": 0.11279202252626419, + "learning_rate": 8.10445857938761e-05, + "loss": 1.541618251800537, + "step": 116820 + }, + { + "epoch": 0.35363915669032736, + "grad_norm": 0.12537522614002228, + "learning_rate": 8.10407906122387e-05, + "loss": 1.545609188079834, + "step": 116830 + }, + { + "epoch": 0.3536694262406732, + "grad_norm": 0.10958054661750793, + "learning_rate": 8.103699543060131e-05, + "loss": 1.565177059173584, + "step": 116840 + }, + { + "epoch": 0.353699695791019, + "grad_norm": 0.10214512050151825, + "learning_rate": 8.103320024896392e-05, + "loss": 1.5624530792236329, + "step": 116850 + }, + { + "epoch": 0.35372996534136486, + "grad_norm": 0.10695712268352509, + "learning_rate": 8.102940506732652e-05, + "loss": 1.548435115814209, + "step": 116860 + }, + { + "epoch": 0.35376023489171066, + "grad_norm": 0.1254853755235672, + "learning_rate": 8.102560988568913e-05, + "loss": 1.5670612335205079, + "step": 116870 + }, + { + "epoch": 0.3537905044420565, + "grad_norm": 0.1200990378856659, + "learning_rate": 8.102181470405173e-05, + "loss": 1.5234453201293945, + "step": 116880 + }, + { + "epoch": 0.35382077399240236, + "grad_norm": 0.10853859037160873, + "learning_rate": 8.101801952241434e-05, + "loss": 1.5360366821289062, + "step": 116890 + }, + { + "epoch": 0.35385104354274816, + "grad_norm": 0.10570839047431946, + "learning_rate": 8.101422434077695e-05, + "loss": 1.562606430053711, + "step": 116900 + }, + { + "epoch": 0.353881313093094, + "grad_norm": 0.11072774976491928, + "learning_rate": 8.101042915913955e-05, + "loss": 1.5584117889404296, + "step": 116910 + }, + { + "epoch": 0.35391158264343986, + "grad_norm": 0.11139386147260666, + "learning_rate": 8.100663397750216e-05, + "loss": 1.5606870651245117, + "step": 116920 + }, + { + "epoch": 0.35394185219378566, + "grad_norm": 0.11413414031267166, + "learning_rate": 8.100283879586478e-05, + "loss": 1.53851375579834, + "step": 116930 + }, + { + "epoch": 0.3539721217441315, + "grad_norm": 0.1195688247680664, + "learning_rate": 8.099904361422737e-05, + "loss": 1.568901252746582, + "step": 116940 + }, + { + "epoch": 0.3540023912944773, + "grad_norm": 0.11407356709241867, + "learning_rate": 8.099524843258999e-05, + "loss": 1.5538537979125977, + "step": 116950 + }, + { + "epoch": 0.35403266084482315, + "grad_norm": 0.10694380104541779, + "learning_rate": 8.09914532509526e-05, + "loss": 1.5397531509399414, + "step": 116960 + }, + { + "epoch": 0.354062930395169, + "grad_norm": 0.10668822377920151, + "learning_rate": 8.09876580693152e-05, + "loss": 1.5374604225158692, + "step": 116970 + }, + { + "epoch": 0.3540931999455148, + "grad_norm": 0.11226066201925278, + "learning_rate": 8.098386288767781e-05, + "loss": 1.5405988693237305, + "step": 116980 + }, + { + "epoch": 0.35412346949586065, + "grad_norm": 0.11521930247545242, + "learning_rate": 8.098006770604041e-05, + "loss": 1.539804458618164, + "step": 116990 + }, + { + "epoch": 0.35415373904620645, + "grad_norm": 0.1119568794965744, + "learning_rate": 8.097627252440302e-05, + "loss": 1.5107104301452636, + "step": 117000 + }, + { + "epoch": 0.35415373904620645, + "eval_loss": 1.5601584911346436, + "eval_runtime": 28.4437, + "eval_samples_per_second": 17.579, + "eval_steps_per_second": 1.125, + "step": 117000 + }, + { + "epoch": 0.3541840085965523, + "grad_norm": 0.1049027219414711, + "learning_rate": 8.097247734276563e-05, + "loss": 1.5652692794799805, + "step": 117010 + }, + { + "epoch": 0.35421427814689815, + "grad_norm": 0.10222364962100983, + "learning_rate": 8.096868216112823e-05, + "loss": 1.5663902282714843, + "step": 117020 + }, + { + "epoch": 0.35424454769724395, + "grad_norm": 0.1254952996969223, + "learning_rate": 8.096488697949084e-05, + "loss": 1.5168928146362304, + "step": 117030 + }, + { + "epoch": 0.3542748172475898, + "grad_norm": 0.11022058129310608, + "learning_rate": 8.096109179785346e-05, + "loss": 1.534010124206543, + "step": 117040 + }, + { + "epoch": 0.3543050867979356, + "grad_norm": 0.11270318925380707, + "learning_rate": 8.095729661621605e-05, + "loss": 1.5236311912536622, + "step": 117050 + }, + { + "epoch": 0.35433535634828145, + "grad_norm": 0.11067009717226028, + "learning_rate": 8.095350143457867e-05, + "loss": 1.5408145904541015, + "step": 117060 + }, + { + "epoch": 0.3543656258986273, + "grad_norm": 0.1056990772485733, + "learning_rate": 8.094970625294126e-05, + "loss": 1.5500124931335448, + "step": 117070 + }, + { + "epoch": 0.3543958954489731, + "grad_norm": 0.10832004994153976, + "learning_rate": 8.094591107130388e-05, + "loss": 1.565192699432373, + "step": 117080 + }, + { + "epoch": 0.35442616499931895, + "grad_norm": 0.10770431905984879, + "learning_rate": 8.094211588966647e-05, + "loss": 1.561495018005371, + "step": 117090 + }, + { + "epoch": 0.35445643454966475, + "grad_norm": 0.12357375025749207, + "learning_rate": 8.093832070802909e-05, + "loss": 1.5671899795532227, + "step": 117100 + }, + { + "epoch": 0.3544867041000106, + "grad_norm": 0.12010407447814941, + "learning_rate": 8.093452552639169e-05, + "loss": 1.6011348724365235, + "step": 117110 + }, + { + "epoch": 0.35451697365035645, + "grad_norm": 0.11117582768201828, + "learning_rate": 8.09307303447543e-05, + "loss": 1.5113260269165039, + "step": 117120 + }, + { + "epoch": 0.35454724320070224, + "grad_norm": 0.11946380883455276, + "learning_rate": 8.09269351631169e-05, + "loss": 1.5470145225524903, + "step": 117130 + }, + { + "epoch": 0.3545775127510481, + "grad_norm": 0.10828098654747009, + "learning_rate": 8.092313998147952e-05, + "loss": 1.530278778076172, + "step": 117140 + }, + { + "epoch": 0.3546077823013939, + "grad_norm": 0.11343096196651459, + "learning_rate": 8.091934479984212e-05, + "loss": 1.528437042236328, + "step": 117150 + }, + { + "epoch": 0.35463805185173974, + "grad_norm": 0.11804863065481186, + "learning_rate": 8.091554961820473e-05, + "loss": 1.5545092582702638, + "step": 117160 + }, + { + "epoch": 0.3546683214020856, + "grad_norm": 0.12402091920375824, + "learning_rate": 8.091175443656735e-05, + "loss": 1.567348575592041, + "step": 117170 + }, + { + "epoch": 0.3546985909524314, + "grad_norm": 0.11641396582126617, + "learning_rate": 8.090795925492994e-05, + "loss": 1.531291389465332, + "step": 117180 + }, + { + "epoch": 0.35472886050277724, + "grad_norm": 0.12409710139036179, + "learning_rate": 8.090416407329256e-05, + "loss": 1.5380095481872558, + "step": 117190 + }, + { + "epoch": 0.35475913005312304, + "grad_norm": 0.11770972609519958, + "learning_rate": 8.090036889165515e-05, + "loss": 1.5361036300659179, + "step": 117200 + }, + { + "epoch": 0.3547893996034689, + "grad_norm": 0.11687033623456955, + "learning_rate": 8.089657371001777e-05, + "loss": 1.5287864685058594, + "step": 117210 + }, + { + "epoch": 0.35481966915381474, + "grad_norm": 0.11740656942129135, + "learning_rate": 8.089277852838036e-05, + "loss": 1.5594736099243165, + "step": 117220 + }, + { + "epoch": 0.35484993870416054, + "grad_norm": 0.10878437012434006, + "learning_rate": 8.088898334674298e-05, + "loss": 1.5801393508911132, + "step": 117230 + }, + { + "epoch": 0.3548802082545064, + "grad_norm": 0.11135841906070709, + "learning_rate": 8.088518816510558e-05, + "loss": 1.5064261436462403, + "step": 117240 + }, + { + "epoch": 0.3549104778048522, + "grad_norm": 0.11471399664878845, + "learning_rate": 8.08813929834682e-05, + "loss": 1.5408466339111329, + "step": 117250 + }, + { + "epoch": 0.35494074735519804, + "grad_norm": 0.10984589904546738, + "learning_rate": 8.087759780183079e-05, + "loss": 1.5282394409179687, + "step": 117260 + }, + { + "epoch": 0.3549710169055439, + "grad_norm": 0.12583519518375397, + "learning_rate": 8.087380262019341e-05, + "loss": 1.496718692779541, + "step": 117270 + }, + { + "epoch": 0.3550012864558897, + "grad_norm": 0.12207451462745667, + "learning_rate": 8.0870007438556e-05, + "loss": 1.5527558326721191, + "step": 117280 + }, + { + "epoch": 0.35503155600623554, + "grad_norm": 0.10507147759199142, + "learning_rate": 8.086621225691862e-05, + "loss": 1.5767333030700683, + "step": 117290 + }, + { + "epoch": 0.35506182555658133, + "grad_norm": 0.1033494770526886, + "learning_rate": 8.086241707528123e-05, + "loss": 1.5554817199707032, + "step": 117300 + }, + { + "epoch": 0.3550920951069272, + "grad_norm": 0.09760309010744095, + "learning_rate": 8.085862189364383e-05, + "loss": 1.5040307998657227, + "step": 117310 + }, + { + "epoch": 0.35512236465727304, + "grad_norm": 0.11466652899980545, + "learning_rate": 8.085482671200644e-05, + "loss": 1.5200929641723633, + "step": 117320 + }, + { + "epoch": 0.35515263420761883, + "grad_norm": 0.0965743139386177, + "learning_rate": 8.085103153036904e-05, + "loss": 1.5328849792480468, + "step": 117330 + }, + { + "epoch": 0.3551829037579647, + "grad_norm": 0.125007763504982, + "learning_rate": 8.084723634873165e-05, + "loss": 1.562791347503662, + "step": 117340 + }, + { + "epoch": 0.3552131733083105, + "grad_norm": 0.1085902750492096, + "learning_rate": 8.084344116709426e-05, + "loss": 1.5234893798828124, + "step": 117350 + }, + { + "epoch": 0.35524344285865633, + "grad_norm": 0.11900077015161514, + "learning_rate": 8.083964598545687e-05, + "loss": 1.5420236587524414, + "step": 117360 + }, + { + "epoch": 0.3552737124090022, + "grad_norm": 0.1063215583562851, + "learning_rate": 8.083585080381947e-05, + "loss": 1.5470499038696288, + "step": 117370 + }, + { + "epoch": 0.355303981959348, + "grad_norm": 0.12286349385976791, + "learning_rate": 8.083205562218209e-05, + "loss": 1.5211262702941895, + "step": 117380 + }, + { + "epoch": 0.35533425150969383, + "grad_norm": 0.10440851002931595, + "learning_rate": 8.082826044054468e-05, + "loss": 1.5372509956359863, + "step": 117390 + }, + { + "epoch": 0.35536452106003963, + "grad_norm": 0.11437888443470001, + "learning_rate": 8.08244652589073e-05, + "loss": 1.5178277969360352, + "step": 117400 + }, + { + "epoch": 0.3553947906103855, + "grad_norm": 0.09444029629230499, + "learning_rate": 8.08206700772699e-05, + "loss": 1.5811975479125977, + "step": 117410 + }, + { + "epoch": 0.35542506016073133, + "grad_norm": 0.09823501110076904, + "learning_rate": 8.081687489563251e-05, + "loss": 1.5462080001831056, + "step": 117420 + }, + { + "epoch": 0.35545532971107713, + "grad_norm": 0.0955592542886734, + "learning_rate": 8.081307971399512e-05, + "loss": 1.5763895034790039, + "step": 117430 + }, + { + "epoch": 0.355485599261423, + "grad_norm": 0.12417721003293991, + "learning_rate": 8.080928453235772e-05, + "loss": 1.5173236846923828, + "step": 117440 + }, + { + "epoch": 0.3555158688117688, + "grad_norm": 0.10511324554681778, + "learning_rate": 8.080548935072033e-05, + "loss": 1.5622430801391602, + "step": 117450 + }, + { + "epoch": 0.3555461383621146, + "grad_norm": 0.11872933804988861, + "learning_rate": 8.080169416908293e-05, + "loss": 1.5467697143554688, + "step": 117460 + }, + { + "epoch": 0.3555764079124605, + "grad_norm": 0.09564049541950226, + "learning_rate": 8.079789898744554e-05, + "loss": 1.554274082183838, + "step": 117470 + }, + { + "epoch": 0.3556066774628063, + "grad_norm": 0.11209923774003983, + "learning_rate": 8.079410380580815e-05, + "loss": 1.550203227996826, + "step": 117480 + }, + { + "epoch": 0.3556369470131521, + "grad_norm": 0.11421475559473038, + "learning_rate": 8.079030862417075e-05, + "loss": 1.5269624710083007, + "step": 117490 + }, + { + "epoch": 0.3556672165634979, + "grad_norm": 0.10714684426784515, + "learning_rate": 8.078651344253336e-05, + "loss": 1.5325736999511719, + "step": 117500 + }, + { + "epoch": 0.3556672165634979, + "eval_loss": 1.543700933456421, + "eval_runtime": 28.012, + "eval_samples_per_second": 17.85, + "eval_steps_per_second": 1.142, + "step": 117500 + }, + { + "epoch": 0.3556974861138438, + "grad_norm": 0.11411651968955994, + "learning_rate": 8.078271826089596e-05, + "loss": 1.562923049926758, + "step": 117510 + }, + { + "epoch": 0.3557277556641896, + "grad_norm": 0.10201306641101837, + "learning_rate": 8.077892307925857e-05, + "loss": 1.5513774871826171, + "step": 117520 + }, + { + "epoch": 0.3557580252145354, + "grad_norm": 0.11216483265161514, + "learning_rate": 8.077512789762118e-05, + "loss": 1.5250043869018555, + "step": 117530 + }, + { + "epoch": 0.3557882947648813, + "grad_norm": 0.10475098341703415, + "learning_rate": 8.07713327159838e-05, + "loss": 1.566086769104004, + "step": 117540 + }, + { + "epoch": 0.35581856431522707, + "grad_norm": 0.1261386275291443, + "learning_rate": 8.076753753434639e-05, + "loss": 1.5810102462768554, + "step": 117550 + }, + { + "epoch": 0.3558488338655729, + "grad_norm": 0.10359098017215729, + "learning_rate": 8.076374235270901e-05, + "loss": 1.557546901702881, + "step": 117560 + }, + { + "epoch": 0.3558791034159188, + "grad_norm": 0.10363148897886276, + "learning_rate": 8.075994717107161e-05, + "loss": 1.544516658782959, + "step": 117570 + }, + { + "epoch": 0.35590937296626457, + "grad_norm": 0.10475935786962509, + "learning_rate": 8.075615198943422e-05, + "loss": 1.5462239265441895, + "step": 117580 + }, + { + "epoch": 0.3559396425166104, + "grad_norm": 0.11317400634288788, + "learning_rate": 8.075235680779683e-05, + "loss": 1.5252087593078614, + "step": 117590 + }, + { + "epoch": 0.3559699120669562, + "grad_norm": 0.11128649115562439, + "learning_rate": 8.074856162615943e-05, + "loss": 1.537705421447754, + "step": 117600 + }, + { + "epoch": 0.35600018161730207, + "grad_norm": 0.10239724814891815, + "learning_rate": 8.074476644452204e-05, + "loss": 1.5535043716430663, + "step": 117610 + }, + { + "epoch": 0.3560304511676479, + "grad_norm": 0.11155588179826736, + "learning_rate": 8.074097126288464e-05, + "loss": 1.555091667175293, + "step": 117620 + }, + { + "epoch": 0.3560607207179937, + "grad_norm": 0.1020553931593895, + "learning_rate": 8.073717608124725e-05, + "loss": 1.521888828277588, + "step": 117630 + }, + { + "epoch": 0.35609099026833957, + "grad_norm": 0.10748498141765594, + "learning_rate": 8.073338089960985e-05, + "loss": 1.5575237274169922, + "step": 117640 + }, + { + "epoch": 0.3561212598186854, + "grad_norm": 0.11191708594560623, + "learning_rate": 8.072958571797246e-05, + "loss": 1.539698600769043, + "step": 117650 + }, + { + "epoch": 0.3561515293690312, + "grad_norm": 0.10709306597709656, + "learning_rate": 8.072579053633507e-05, + "loss": 1.56350679397583, + "step": 117660 + }, + { + "epoch": 0.35618179891937707, + "grad_norm": 0.11856818199157715, + "learning_rate": 8.072199535469769e-05, + "loss": 1.5581531524658203, + "step": 117670 + }, + { + "epoch": 0.35621206846972286, + "grad_norm": 0.10100698471069336, + "learning_rate": 8.071820017306028e-05, + "loss": 1.5781625747680663, + "step": 117680 + }, + { + "epoch": 0.3562423380200687, + "grad_norm": 0.10590217262506485, + "learning_rate": 8.07144049914229e-05, + "loss": 1.545957374572754, + "step": 117690 + }, + { + "epoch": 0.35627260757041457, + "grad_norm": 0.11337421089410782, + "learning_rate": 8.071060980978549e-05, + "loss": 1.5127345085144044, + "step": 117700 + }, + { + "epoch": 0.35630287712076036, + "grad_norm": 0.11387749761343002, + "learning_rate": 8.070681462814811e-05, + "loss": 1.5544836044311523, + "step": 117710 + }, + { + "epoch": 0.3563331466711062, + "grad_norm": 0.12092979997396469, + "learning_rate": 8.07030194465107e-05, + "loss": 1.5364896774291992, + "step": 117720 + }, + { + "epoch": 0.356363416221452, + "grad_norm": 0.10446657985448837, + "learning_rate": 8.069922426487332e-05, + "loss": 1.5682332992553711, + "step": 117730 + }, + { + "epoch": 0.35639368577179786, + "grad_norm": 0.1065073236823082, + "learning_rate": 8.069542908323591e-05, + "loss": 1.5596874237060547, + "step": 117740 + }, + { + "epoch": 0.3564239553221437, + "grad_norm": 0.11473976820707321, + "learning_rate": 8.069163390159853e-05, + "loss": 1.5415928840637207, + "step": 117750 + }, + { + "epoch": 0.3564542248724895, + "grad_norm": 0.10768993198871613, + "learning_rate": 8.068783871996114e-05, + "loss": 1.5666401863098145, + "step": 117760 + }, + { + "epoch": 0.35648449442283536, + "grad_norm": 0.11068008840084076, + "learning_rate": 8.068404353832375e-05, + "loss": 1.5327947616577149, + "step": 117770 + }, + { + "epoch": 0.35651476397318116, + "grad_norm": 0.1057344451546669, + "learning_rate": 8.068024835668637e-05, + "loss": 1.5261260986328125, + "step": 117780 + }, + { + "epoch": 0.356545033523527, + "grad_norm": 0.103028304874897, + "learning_rate": 8.067645317504896e-05, + "loss": 1.538180160522461, + "step": 117790 + }, + { + "epoch": 0.35657530307387286, + "grad_norm": 0.1048494428396225, + "learning_rate": 8.067265799341158e-05, + "loss": 1.5416049003601073, + "step": 117800 + }, + { + "epoch": 0.35660557262421866, + "grad_norm": 0.10998067259788513, + "learning_rate": 8.066886281177417e-05, + "loss": 1.5810518264770508, + "step": 117810 + }, + { + "epoch": 0.3566358421745645, + "grad_norm": 0.11998461931943893, + "learning_rate": 8.066506763013679e-05, + "loss": 1.5528390884399415, + "step": 117820 + }, + { + "epoch": 0.3566661117249103, + "grad_norm": 0.1120472103357315, + "learning_rate": 8.066127244849938e-05, + "loss": 1.5957916259765625, + "step": 117830 + }, + { + "epoch": 0.35669638127525616, + "grad_norm": 0.10642720758914948, + "learning_rate": 8.0657477266862e-05, + "loss": 1.549784278869629, + "step": 117840 + }, + { + "epoch": 0.356726650825602, + "grad_norm": 0.11820223182439804, + "learning_rate": 8.06536820852246e-05, + "loss": 1.5823522567749024, + "step": 117850 + }, + { + "epoch": 0.3567569203759478, + "grad_norm": 0.10770977288484573, + "learning_rate": 8.064988690358721e-05, + "loss": 1.5401037216186524, + "step": 117860 + }, + { + "epoch": 0.35678718992629366, + "grad_norm": 0.11512473225593567, + "learning_rate": 8.06460917219498e-05, + "loss": 1.5571310997009278, + "step": 117870 + }, + { + "epoch": 0.35681745947663945, + "grad_norm": 0.10934273898601532, + "learning_rate": 8.064229654031242e-05, + "loss": 1.530438232421875, + "step": 117880 + }, + { + "epoch": 0.3568477290269853, + "grad_norm": 0.10445963591337204, + "learning_rate": 8.063850135867502e-05, + "loss": 1.5133384704589843, + "step": 117890 + }, + { + "epoch": 0.35687799857733116, + "grad_norm": 0.1025567501783371, + "learning_rate": 8.063470617703764e-05, + "loss": 1.5754159927368163, + "step": 117900 + }, + { + "epoch": 0.35690826812767695, + "grad_norm": 0.11212094873189926, + "learning_rate": 8.063091099540024e-05, + "loss": 1.559041976928711, + "step": 117910 + }, + { + "epoch": 0.3569385376780228, + "grad_norm": 0.10567240417003632, + "learning_rate": 8.062711581376285e-05, + "loss": 1.5394376754760741, + "step": 117920 + }, + { + "epoch": 0.3569688072283686, + "grad_norm": 0.10704560577869415, + "learning_rate": 8.062332063212545e-05, + "loss": 1.6117887496948242, + "step": 117930 + }, + { + "epoch": 0.35699907677871445, + "grad_norm": 0.10202019661664963, + "learning_rate": 8.061952545048806e-05, + "loss": 1.5325105667114258, + "step": 117940 + }, + { + "epoch": 0.3570293463290603, + "grad_norm": 0.11126946657896042, + "learning_rate": 8.061573026885067e-05, + "loss": 1.5216270446777345, + "step": 117950 + }, + { + "epoch": 0.3570596158794061, + "grad_norm": 0.10167333483695984, + "learning_rate": 8.061193508721327e-05, + "loss": 1.5293296813964843, + "step": 117960 + }, + { + "epoch": 0.35708988542975195, + "grad_norm": 0.10242287814617157, + "learning_rate": 8.060813990557589e-05, + "loss": 1.5711301803588866, + "step": 117970 + }, + { + "epoch": 0.35712015498009775, + "grad_norm": 0.11766462028026581, + "learning_rate": 8.060434472393848e-05, + "loss": 1.5552892684936523, + "step": 117980 + }, + { + "epoch": 0.3571504245304436, + "grad_norm": 0.1140078455209732, + "learning_rate": 8.06005495423011e-05, + "loss": 1.5211291313171387, + "step": 117990 + }, + { + "epoch": 0.35718069408078945, + "grad_norm": 0.10514318943023682, + "learning_rate": 8.05967543606637e-05, + "loss": 1.5148373603820802, + "step": 118000 + }, + { + "epoch": 0.35718069408078945, + "eval_loss": 1.5373222827911377, + "eval_runtime": 28.1904, + "eval_samples_per_second": 17.737, + "eval_steps_per_second": 1.135, + "step": 118000 + }, + { + "epoch": 0.35721096363113525, + "grad_norm": 0.10986055433750153, + "learning_rate": 8.059295917902632e-05, + "loss": 1.5343586921691894, + "step": 118010 + }, + { + "epoch": 0.3572412331814811, + "grad_norm": 0.10463886708021164, + "learning_rate": 8.058916399738891e-05, + "loss": 1.5376492500305177, + "step": 118020 + }, + { + "epoch": 0.3572715027318269, + "grad_norm": 0.11807078123092651, + "learning_rate": 8.058536881575153e-05, + "loss": 1.538493824005127, + "step": 118030 + }, + { + "epoch": 0.35730177228217275, + "grad_norm": 0.11220957338809967, + "learning_rate": 8.058157363411413e-05, + "loss": 1.5448888778686523, + "step": 118040 + }, + { + "epoch": 0.3573320418325186, + "grad_norm": 0.10195960104465485, + "learning_rate": 8.057777845247674e-05, + "loss": 1.5400097846984864, + "step": 118050 + }, + { + "epoch": 0.3573623113828644, + "grad_norm": 0.10125380754470825, + "learning_rate": 8.057398327083935e-05, + "loss": 1.5635395050048828, + "step": 118060 + }, + { + "epoch": 0.35739258093321025, + "grad_norm": 0.11424839496612549, + "learning_rate": 8.057018808920195e-05, + "loss": 1.5708932876586914, + "step": 118070 + }, + { + "epoch": 0.35742285048355604, + "grad_norm": 0.11497610062360764, + "learning_rate": 8.056639290756456e-05, + "loss": 1.529989242553711, + "step": 118080 + }, + { + "epoch": 0.3574531200339019, + "grad_norm": 0.11470682173967361, + "learning_rate": 8.056259772592716e-05, + "loss": 1.5709959030151368, + "step": 118090 + }, + { + "epoch": 0.35748338958424775, + "grad_norm": 0.1130116656422615, + "learning_rate": 8.055880254428977e-05, + "loss": 1.558479118347168, + "step": 118100 + }, + { + "epoch": 0.35751365913459354, + "grad_norm": 0.11338021606206894, + "learning_rate": 8.055500736265238e-05, + "loss": 1.5573315620422363, + "step": 118110 + }, + { + "epoch": 0.3575439286849394, + "grad_norm": 0.11301683634519577, + "learning_rate": 8.055121218101498e-05, + "loss": 1.5402788162231444, + "step": 118120 + }, + { + "epoch": 0.3575741982352852, + "grad_norm": 0.10141494125127792, + "learning_rate": 8.054741699937759e-05, + "loss": 1.5556757926940918, + "step": 118130 + }, + { + "epoch": 0.35760446778563104, + "grad_norm": 0.10652372986078262, + "learning_rate": 8.054362181774019e-05, + "loss": 1.5679404258728027, + "step": 118140 + }, + { + "epoch": 0.3576347373359769, + "grad_norm": 0.10663438588380814, + "learning_rate": 8.053982663610281e-05, + "loss": 1.5354998588562012, + "step": 118150 + }, + { + "epoch": 0.3576650068863227, + "grad_norm": 0.11961542814970016, + "learning_rate": 8.05360314544654e-05, + "loss": 1.555187702178955, + "step": 118160 + }, + { + "epoch": 0.35769527643666854, + "grad_norm": 0.11952144652605057, + "learning_rate": 8.053223627282802e-05, + "loss": 1.5360063552856444, + "step": 118170 + }, + { + "epoch": 0.35772554598701434, + "grad_norm": 0.10715395957231522, + "learning_rate": 8.052844109119063e-05, + "loss": 1.5637575149536134, + "step": 118180 + }, + { + "epoch": 0.3577558155373602, + "grad_norm": 0.10643669962882996, + "learning_rate": 8.052464590955324e-05, + "loss": 1.5379859924316406, + "step": 118190 + }, + { + "epoch": 0.35778608508770604, + "grad_norm": 0.12069394439458847, + "learning_rate": 8.052085072791584e-05, + "loss": 1.5639796257019043, + "step": 118200 + }, + { + "epoch": 0.35781635463805184, + "grad_norm": 0.10137154906988144, + "learning_rate": 8.051705554627845e-05, + "loss": 1.5498353004455567, + "step": 118210 + }, + { + "epoch": 0.3578466241883977, + "grad_norm": 0.09880975633859634, + "learning_rate": 8.051326036464105e-05, + "loss": 1.5667868614196778, + "step": 118220 + }, + { + "epoch": 0.3578768937387435, + "grad_norm": 0.11059058457612991, + "learning_rate": 8.050946518300366e-05, + "loss": 1.562178611755371, + "step": 118230 + }, + { + "epoch": 0.35790716328908934, + "grad_norm": 0.11098871380090714, + "learning_rate": 8.050567000136627e-05, + "loss": 1.5459040641784667, + "step": 118240 + }, + { + "epoch": 0.3579374328394352, + "grad_norm": 0.11586934328079224, + "learning_rate": 8.050187481972887e-05, + "loss": 1.5520401000976562, + "step": 118250 + }, + { + "epoch": 0.357967702389781, + "grad_norm": 0.1036146730184555, + "learning_rate": 8.049807963809148e-05, + "loss": 1.5598539352416991, + "step": 118260 + }, + { + "epoch": 0.35799797194012684, + "grad_norm": 0.10895901173353195, + "learning_rate": 8.049428445645408e-05, + "loss": 1.5339910507202148, + "step": 118270 + }, + { + "epoch": 0.35802824149047263, + "grad_norm": 0.11064320057630539, + "learning_rate": 8.04904892748167e-05, + "loss": 1.553986167907715, + "step": 118280 + }, + { + "epoch": 0.3580585110408185, + "grad_norm": 0.1188540980219841, + "learning_rate": 8.04866940931793e-05, + "loss": 1.5848745346069335, + "step": 118290 + }, + { + "epoch": 0.35808878059116434, + "grad_norm": 0.11093083024024963, + "learning_rate": 8.048289891154192e-05, + "loss": 1.5519401550292968, + "step": 118300 + }, + { + "epoch": 0.35811905014151013, + "grad_norm": 0.1070975512266159, + "learning_rate": 8.047910372990451e-05, + "loss": 1.5430517196655273, + "step": 118310 + }, + { + "epoch": 0.358149319691856, + "grad_norm": 0.1036636233329773, + "learning_rate": 8.047530854826713e-05, + "loss": 1.5322252273559571, + "step": 118320 + }, + { + "epoch": 0.35817958924220183, + "grad_norm": 0.12428324669599533, + "learning_rate": 8.047151336662972e-05, + "loss": 1.5643457412719726, + "step": 118330 + }, + { + "epoch": 0.35820985879254763, + "grad_norm": 0.10870435833930969, + "learning_rate": 8.046771818499234e-05, + "loss": 1.5667770385742188, + "step": 118340 + }, + { + "epoch": 0.3582401283428935, + "grad_norm": 0.10639969259500504, + "learning_rate": 8.046392300335493e-05, + "loss": 1.5733075141906738, + "step": 118350 + }, + { + "epoch": 0.3582703978932393, + "grad_norm": 0.11467064917087555, + "learning_rate": 8.046012782171755e-05, + "loss": 1.5721628189086914, + "step": 118360 + }, + { + "epoch": 0.35830066744358513, + "grad_norm": 0.12373402714729309, + "learning_rate": 8.045633264008016e-05, + "loss": 1.5171880722045898, + "step": 118370 + }, + { + "epoch": 0.358330936993931, + "grad_norm": 0.115155890583992, + "learning_rate": 8.045253745844276e-05, + "loss": 1.5634796142578125, + "step": 118380 + }, + { + "epoch": 0.3583612065442768, + "grad_norm": 0.1146104484796524, + "learning_rate": 8.044874227680537e-05, + "loss": 1.569190788269043, + "step": 118390 + }, + { + "epoch": 0.35839147609462263, + "grad_norm": 0.115401491522789, + "learning_rate": 8.044494709516797e-05, + "loss": 1.539889144897461, + "step": 118400 + }, + { + "epoch": 0.3584217456449684, + "grad_norm": 0.10967392474412918, + "learning_rate": 8.04411519135306e-05, + "loss": 1.5569005012512207, + "step": 118410 + }, + { + "epoch": 0.3584520151953143, + "grad_norm": 0.10078799724578857, + "learning_rate": 8.043735673189319e-05, + "loss": 1.566675853729248, + "step": 118420 + }, + { + "epoch": 0.35848228474566013, + "grad_norm": 0.1128738671541214, + "learning_rate": 8.04335615502558e-05, + "loss": 1.5394554138183594, + "step": 118430 + }, + { + "epoch": 0.3585125542960059, + "grad_norm": 0.13707192242145538, + "learning_rate": 8.04297663686184e-05, + "loss": 1.5606071472167968, + "step": 118440 + }, + { + "epoch": 0.3585428238463518, + "grad_norm": 0.12039776146411896, + "learning_rate": 8.042597118698102e-05, + "loss": 1.5455266952514648, + "step": 118450 + }, + { + "epoch": 0.3585730933966976, + "grad_norm": 0.11454665660858154, + "learning_rate": 8.042217600534361e-05, + "loss": 1.525475311279297, + "step": 118460 + }, + { + "epoch": 0.3586033629470434, + "grad_norm": 0.10721751302480698, + "learning_rate": 8.041838082370623e-05, + "loss": 1.5444632530212403, + "step": 118470 + }, + { + "epoch": 0.3586336324973893, + "grad_norm": 0.11773433536291122, + "learning_rate": 8.041458564206882e-05, + "loss": 1.5191262245178223, + "step": 118480 + }, + { + "epoch": 0.3586639020477351, + "grad_norm": 0.10900865495204926, + "learning_rate": 8.041079046043144e-05, + "loss": 1.5463420867919921, + "step": 118490 + }, + { + "epoch": 0.3586941715980809, + "grad_norm": 0.10951420664787292, + "learning_rate": 8.040699527879403e-05, + "loss": 1.5230727195739746, + "step": 118500 + }, + { + "epoch": 0.3586941715980809, + "eval_loss": 1.5603187084197998, + "eval_runtime": 28.1699, + "eval_samples_per_second": 17.749, + "eval_steps_per_second": 1.136, + "step": 118500 + }, + { + "epoch": 0.3587244411484267, + "grad_norm": 0.10659606009721756, + "learning_rate": 8.040320009715665e-05, + "loss": 1.5929004669189453, + "step": 118510 + }, + { + "epoch": 0.35875471069877257, + "grad_norm": 0.13059185445308685, + "learning_rate": 8.039940491551925e-05, + "loss": 1.55834903717041, + "step": 118520 + }, + { + "epoch": 0.3587849802491184, + "grad_norm": 0.10894738882780075, + "learning_rate": 8.039560973388187e-05, + "loss": 1.5487674713134765, + "step": 118530 + }, + { + "epoch": 0.3588152497994642, + "grad_norm": 0.10585493594408035, + "learning_rate": 8.039181455224447e-05, + "loss": 1.5594977378845214, + "step": 118540 + }, + { + "epoch": 0.35884551934981007, + "grad_norm": 0.11678700149059296, + "learning_rate": 8.038801937060708e-05, + "loss": 1.542520523071289, + "step": 118550 + }, + { + "epoch": 0.35887578890015587, + "grad_norm": 0.1061149537563324, + "learning_rate": 8.038422418896968e-05, + "loss": 1.5549300193786622, + "step": 118560 + }, + { + "epoch": 0.3589060584505017, + "grad_norm": 0.10480128228664398, + "learning_rate": 8.038042900733229e-05, + "loss": 1.539202880859375, + "step": 118570 + }, + { + "epoch": 0.35893632800084757, + "grad_norm": 0.10668682307004929, + "learning_rate": 8.037663382569491e-05, + "loss": 1.5166224479675292, + "step": 118580 + }, + { + "epoch": 0.35896659755119337, + "grad_norm": 0.11553844064474106, + "learning_rate": 8.03728386440575e-05, + "loss": 1.5760982513427735, + "step": 118590 + }, + { + "epoch": 0.3589968671015392, + "grad_norm": 0.11193574965000153, + "learning_rate": 8.036904346242012e-05, + "loss": 1.558128547668457, + "step": 118600 + }, + { + "epoch": 0.359027136651885, + "grad_norm": 0.11575190722942352, + "learning_rate": 8.036524828078271e-05, + "loss": 1.5517796516418456, + "step": 118610 + }, + { + "epoch": 0.35905740620223087, + "grad_norm": 0.12417321652173996, + "learning_rate": 8.036145309914533e-05, + "loss": 1.5426639556884765, + "step": 118620 + }, + { + "epoch": 0.3590876757525767, + "grad_norm": 0.10515143722295761, + "learning_rate": 8.035765791750793e-05, + "loss": 1.5148622512817382, + "step": 118630 + }, + { + "epoch": 0.3591179453029225, + "grad_norm": 0.1041489765048027, + "learning_rate": 8.035386273587054e-05, + "loss": 1.5560158729553222, + "step": 118640 + }, + { + "epoch": 0.35914821485326837, + "grad_norm": 0.10850361734628677, + "learning_rate": 8.035006755423315e-05, + "loss": 1.545844841003418, + "step": 118650 + }, + { + "epoch": 0.35917848440361416, + "grad_norm": 0.11474861204624176, + "learning_rate": 8.034627237259576e-05, + "loss": 1.5424772262573243, + "step": 118660 + }, + { + "epoch": 0.35920875395396, + "grad_norm": 0.10861927270889282, + "learning_rate": 8.034247719095836e-05, + "loss": 1.571477508544922, + "step": 118670 + }, + { + "epoch": 0.35923902350430587, + "grad_norm": 0.11671943962574005, + "learning_rate": 8.033868200932097e-05, + "loss": 1.5556915283203125, + "step": 118680 + }, + { + "epoch": 0.35926929305465166, + "grad_norm": 0.11674879491329193, + "learning_rate": 8.033488682768357e-05, + "loss": 1.5570239067077636, + "step": 118690 + }, + { + "epoch": 0.3592995626049975, + "grad_norm": 0.1203356683254242, + "learning_rate": 8.033109164604618e-05, + "loss": 1.542665386199951, + "step": 118700 + }, + { + "epoch": 0.3593298321553433, + "grad_norm": 0.1188371554017067, + "learning_rate": 8.032729646440879e-05, + "loss": 1.5445321083068848, + "step": 118710 + }, + { + "epoch": 0.35936010170568916, + "grad_norm": 0.10429948568344116, + "learning_rate": 8.032350128277139e-05, + "loss": 1.537776565551758, + "step": 118720 + }, + { + "epoch": 0.359390371256035, + "grad_norm": 0.11750392615795135, + "learning_rate": 8.0319706101134e-05, + "loss": 1.581406307220459, + "step": 118730 + }, + { + "epoch": 0.3594206408063808, + "grad_norm": 0.12413232028484344, + "learning_rate": 8.03159109194966e-05, + "loss": 1.5483269691467285, + "step": 118740 + }, + { + "epoch": 0.35945091035672666, + "grad_norm": 0.11707937717437744, + "learning_rate": 8.031211573785921e-05, + "loss": 1.5695106506347656, + "step": 118750 + }, + { + "epoch": 0.35948117990707246, + "grad_norm": 0.1093582957983017, + "learning_rate": 8.030832055622182e-05, + "loss": 1.526803970336914, + "step": 118760 + }, + { + "epoch": 0.3595114494574183, + "grad_norm": 0.12422195076942444, + "learning_rate": 8.030452537458444e-05, + "loss": 1.5241966247558594, + "step": 118770 + }, + { + "epoch": 0.35954171900776416, + "grad_norm": 0.10239122062921524, + "learning_rate": 8.030073019294704e-05, + "loss": 1.5202302932739258, + "step": 118780 + }, + { + "epoch": 0.35957198855810996, + "grad_norm": 0.11108333617448807, + "learning_rate": 8.029693501130965e-05, + "loss": 1.5459554672241211, + "step": 118790 + }, + { + "epoch": 0.3596022581084558, + "grad_norm": 0.11776399612426758, + "learning_rate": 8.029313982967225e-05, + "loss": 1.6086246490478515, + "step": 118800 + }, + { + "epoch": 0.3596325276588016, + "grad_norm": 0.11409489810466766, + "learning_rate": 8.028934464803486e-05, + "loss": 1.5731595039367676, + "step": 118810 + }, + { + "epoch": 0.35966279720914746, + "grad_norm": 0.10291177034378052, + "learning_rate": 8.028554946639747e-05, + "loss": 1.568985939025879, + "step": 118820 + }, + { + "epoch": 0.3596930667594933, + "grad_norm": 0.11540049314498901, + "learning_rate": 8.028175428476007e-05, + "loss": 1.5714350700378419, + "step": 118830 + }, + { + "epoch": 0.3597233363098391, + "grad_norm": 0.11048747599124908, + "learning_rate": 8.027795910312268e-05, + "loss": 1.5436052322387694, + "step": 118840 + }, + { + "epoch": 0.35975360586018496, + "grad_norm": 0.11413058638572693, + "learning_rate": 8.027416392148528e-05, + "loss": 1.5424235343933106, + "step": 118850 + }, + { + "epoch": 0.35978387541053075, + "grad_norm": 0.10820233076810837, + "learning_rate": 8.027036873984789e-05, + "loss": 1.5340024948120117, + "step": 118860 + }, + { + "epoch": 0.3598141449608766, + "grad_norm": 0.1120234802365303, + "learning_rate": 8.02665735582105e-05, + "loss": 1.5403312683105468, + "step": 118870 + }, + { + "epoch": 0.35984441451122245, + "grad_norm": 0.12583978474140167, + "learning_rate": 8.02627783765731e-05, + "loss": 1.5591896057128907, + "step": 118880 + }, + { + "epoch": 0.35987468406156825, + "grad_norm": 0.10671689361333847, + "learning_rate": 8.025898319493572e-05, + "loss": 1.565491008758545, + "step": 118890 + }, + { + "epoch": 0.3599049536119141, + "grad_norm": 0.1067379042506218, + "learning_rate": 8.025518801329831e-05, + "loss": 1.544021224975586, + "step": 118900 + }, + { + "epoch": 0.3599352231622599, + "grad_norm": 0.11974871903657913, + "learning_rate": 8.025139283166093e-05, + "loss": 1.508094310760498, + "step": 118910 + }, + { + "epoch": 0.35996549271260575, + "grad_norm": 0.10383105278015137, + "learning_rate": 8.024759765002352e-05, + "loss": 1.556045150756836, + "step": 118920 + }, + { + "epoch": 0.3599957622629516, + "grad_norm": 0.11698047071695328, + "learning_rate": 8.024380246838614e-05, + "loss": 1.5306418418884278, + "step": 118930 + }, + { + "epoch": 0.3600260318132974, + "grad_norm": 0.10245173424482346, + "learning_rate": 8.024000728674874e-05, + "loss": 1.5634612083435058, + "step": 118940 + }, + { + "epoch": 0.36005630136364325, + "grad_norm": 0.11584983021020889, + "learning_rate": 8.023621210511136e-05, + "loss": 1.5789477348327636, + "step": 118950 + }, + { + "epoch": 0.36008657091398905, + "grad_norm": 0.12122167646884918, + "learning_rate": 8.023241692347395e-05, + "loss": 1.5304442405700684, + "step": 118960 + }, + { + "epoch": 0.3601168404643349, + "grad_norm": 0.10938269644975662, + "learning_rate": 8.022862174183657e-05, + "loss": 1.551835060119629, + "step": 118970 + }, + { + "epoch": 0.36014711001468075, + "grad_norm": 0.10733843594789505, + "learning_rate": 8.022482656019917e-05, + "loss": 1.561720371246338, + "step": 118980 + }, + { + "epoch": 0.36017737956502655, + "grad_norm": 0.1100572794675827, + "learning_rate": 8.022103137856178e-05, + "loss": 1.545011043548584, + "step": 118990 + }, + { + "epoch": 0.3602076491153724, + "grad_norm": 0.10779521614313126, + "learning_rate": 8.021723619692439e-05, + "loss": 1.584876251220703, + "step": 119000 + }, + { + "epoch": 0.3602076491153724, + "eval_loss": 1.5397509336471558, + "eval_runtime": 28.3218, + "eval_samples_per_second": 17.654, + "eval_steps_per_second": 1.13, + "step": 119000 + }, + { + "epoch": 0.3602379186657182, + "grad_norm": 0.10535000264644623, + "learning_rate": 8.021344101528699e-05, + "loss": 1.5359648704528808, + "step": 119010 + }, + { + "epoch": 0.36026818821606404, + "grad_norm": 0.10101112723350525, + "learning_rate": 8.020964583364961e-05, + "loss": 1.5545830726623535, + "step": 119020 + }, + { + "epoch": 0.3602984577664099, + "grad_norm": 0.10492098331451416, + "learning_rate": 8.02058506520122e-05, + "loss": 1.5569393157958984, + "step": 119030 + }, + { + "epoch": 0.3603287273167557, + "grad_norm": 0.10594990104436874, + "learning_rate": 8.020205547037482e-05, + "loss": 1.5442066192626953, + "step": 119040 + }, + { + "epoch": 0.36035899686710154, + "grad_norm": 0.1095825731754303, + "learning_rate": 8.019826028873742e-05, + "loss": 1.541459846496582, + "step": 119050 + }, + { + "epoch": 0.3603892664174474, + "grad_norm": 0.10999222099781036, + "learning_rate": 8.019446510710004e-05, + "loss": 1.5619283676147462, + "step": 119060 + }, + { + "epoch": 0.3604195359677932, + "grad_norm": 0.12308865040540695, + "learning_rate": 8.019066992546263e-05, + "loss": 1.5268346786499023, + "step": 119070 + }, + { + "epoch": 0.36044980551813904, + "grad_norm": 0.10743635892868042, + "learning_rate": 8.018687474382525e-05, + "loss": 1.5369568824768067, + "step": 119080 + }, + { + "epoch": 0.36048007506848484, + "grad_norm": 0.11580324918031693, + "learning_rate": 8.018307956218784e-05, + "loss": 1.5475194931030274, + "step": 119090 + }, + { + "epoch": 0.3605103446188307, + "grad_norm": 0.10181858390569687, + "learning_rate": 8.017928438055046e-05, + "loss": 1.5593225479125976, + "step": 119100 + }, + { + "epoch": 0.36054061416917654, + "grad_norm": 0.10247373580932617, + "learning_rate": 8.017548919891305e-05, + "loss": 1.5809375762939453, + "step": 119110 + }, + { + "epoch": 0.36057088371952234, + "grad_norm": 0.1068606823682785, + "learning_rate": 8.017169401727567e-05, + "loss": 1.5638836860656737, + "step": 119120 + }, + { + "epoch": 0.3606011532698682, + "grad_norm": 0.11991730332374573, + "learning_rate": 8.016789883563826e-05, + "loss": 1.5302240371704101, + "step": 119130 + }, + { + "epoch": 0.360631422820214, + "grad_norm": 0.12755262851715088, + "learning_rate": 8.016410365400088e-05, + "loss": 1.5478200912475586, + "step": 119140 + }, + { + "epoch": 0.36066169237055984, + "grad_norm": 0.10999947041273117, + "learning_rate": 8.016030847236349e-05, + "loss": 1.5210655212402344, + "step": 119150 + }, + { + "epoch": 0.3606919619209057, + "grad_norm": 0.11459418386220932, + "learning_rate": 8.01565132907261e-05, + "loss": 1.5753562927246094, + "step": 119160 + }, + { + "epoch": 0.3607222314712515, + "grad_norm": 0.11525199562311172, + "learning_rate": 8.01527181090887e-05, + "loss": 1.553800392150879, + "step": 119170 + }, + { + "epoch": 0.36075250102159734, + "grad_norm": 0.11599893867969513, + "learning_rate": 8.01489229274513e-05, + "loss": 1.524226188659668, + "step": 119180 + }, + { + "epoch": 0.36078277057194313, + "grad_norm": 0.10480435937643051, + "learning_rate": 8.014512774581393e-05, + "loss": 1.523748016357422, + "step": 119190 + }, + { + "epoch": 0.360813040122289, + "grad_norm": 0.11226927489042282, + "learning_rate": 8.014133256417652e-05, + "loss": 1.5580236434936523, + "step": 119200 + }, + { + "epoch": 0.36084330967263484, + "grad_norm": 0.11905810981988907, + "learning_rate": 8.013753738253914e-05, + "loss": 1.5581953048706054, + "step": 119210 + }, + { + "epoch": 0.36087357922298063, + "grad_norm": 0.11733328551054001, + "learning_rate": 8.013374220090173e-05, + "loss": 1.5576784133911132, + "step": 119220 + }, + { + "epoch": 0.3609038487733265, + "grad_norm": 0.10228148102760315, + "learning_rate": 8.012994701926435e-05, + "loss": 1.5306854248046875, + "step": 119230 + }, + { + "epoch": 0.3609341183236723, + "grad_norm": 0.106119304895401, + "learning_rate": 8.012615183762694e-05, + "loss": 1.556002426147461, + "step": 119240 + }, + { + "epoch": 0.36096438787401813, + "grad_norm": 0.10574963688850403, + "learning_rate": 8.012235665598956e-05, + "loss": 1.567222499847412, + "step": 119250 + }, + { + "epoch": 0.360994657424364, + "grad_norm": 0.10410715639591217, + "learning_rate": 8.011856147435217e-05, + "loss": 1.557873821258545, + "step": 119260 + }, + { + "epoch": 0.3610249269747098, + "grad_norm": 0.0940556526184082, + "learning_rate": 8.011476629271477e-05, + "loss": 1.539161777496338, + "step": 119270 + }, + { + "epoch": 0.36105519652505563, + "grad_norm": 0.09560229629278183, + "learning_rate": 8.011097111107738e-05, + "loss": 1.5388148307800293, + "step": 119280 + }, + { + "epoch": 0.36108546607540143, + "grad_norm": 0.12065691500902176, + "learning_rate": 8.010717592943999e-05, + "loss": 1.5200884819030762, + "step": 119290 + }, + { + "epoch": 0.3611157356257473, + "grad_norm": 0.11535551398992538, + "learning_rate": 8.010338074780259e-05, + "loss": 1.5622239112854004, + "step": 119300 + }, + { + "epoch": 0.36114600517609313, + "grad_norm": 0.10811971873044968, + "learning_rate": 8.00995855661652e-05, + "loss": 1.558092975616455, + "step": 119310 + }, + { + "epoch": 0.36117627472643893, + "grad_norm": 0.11116522550582886, + "learning_rate": 8.00957903845278e-05, + "loss": 1.5833358764648438, + "step": 119320 + }, + { + "epoch": 0.3612065442767848, + "grad_norm": 0.11708463728427887, + "learning_rate": 8.009199520289041e-05, + "loss": 1.5146352767944335, + "step": 119330 + }, + { + "epoch": 0.3612368138271306, + "grad_norm": 0.11345544457435608, + "learning_rate": 8.008820002125302e-05, + "loss": 1.5167396545410157, + "step": 119340 + }, + { + "epoch": 0.36126708337747643, + "grad_norm": 0.12214601784944534, + "learning_rate": 8.008440483961562e-05, + "loss": 1.58370361328125, + "step": 119350 + }, + { + "epoch": 0.3612973529278223, + "grad_norm": 0.12792888283729553, + "learning_rate": 8.008060965797823e-05, + "loss": 1.5620831489562987, + "step": 119360 + }, + { + "epoch": 0.3613276224781681, + "grad_norm": 0.1274002641439438, + "learning_rate": 8.007681447634083e-05, + "loss": 1.5352458953857422, + "step": 119370 + }, + { + "epoch": 0.3613578920285139, + "grad_norm": 0.10333256423473358, + "learning_rate": 8.007301929470345e-05, + "loss": 1.5305648803710938, + "step": 119380 + }, + { + "epoch": 0.3613881615788597, + "grad_norm": 0.10802481323480606, + "learning_rate": 8.006922411306606e-05, + "loss": 1.5534652709960937, + "step": 119390 + }, + { + "epoch": 0.3614184311292056, + "grad_norm": 0.10365410894155502, + "learning_rate": 8.006542893142866e-05, + "loss": 1.5449790954589844, + "step": 119400 + }, + { + "epoch": 0.3614487006795514, + "grad_norm": 0.10680419206619263, + "learning_rate": 8.006163374979127e-05, + "loss": 1.5740352630615235, + "step": 119410 + }, + { + "epoch": 0.3614789702298972, + "grad_norm": 0.10452201217412949, + "learning_rate": 8.005783856815388e-05, + "loss": 1.5553197860717773, + "step": 119420 + }, + { + "epoch": 0.3615092397802431, + "grad_norm": 0.11256306618452072, + "learning_rate": 8.005404338651648e-05, + "loss": 1.5490467071533203, + "step": 119430 + }, + { + "epoch": 0.36153950933058887, + "grad_norm": 0.11950647085905075, + "learning_rate": 8.005024820487909e-05, + "loss": 1.5434118270874024, + "step": 119440 + }, + { + "epoch": 0.3615697788809347, + "grad_norm": 0.10994976758956909, + "learning_rate": 8.00464530232417e-05, + "loss": 1.5791141510009765, + "step": 119450 + }, + { + "epoch": 0.3616000484312806, + "grad_norm": 0.1210106611251831, + "learning_rate": 8.00426578416043e-05, + "loss": 1.5230523109436036, + "step": 119460 + }, + { + "epoch": 0.36163031798162637, + "grad_norm": 0.10273458808660507, + "learning_rate": 8.00388626599669e-05, + "loss": 1.5766429901123047, + "step": 119470 + }, + { + "epoch": 0.3616605875319722, + "grad_norm": 0.12044283002614975, + "learning_rate": 8.003506747832951e-05, + "loss": 1.5667357444763184, + "step": 119480 + }, + { + "epoch": 0.361690857082318, + "grad_norm": 0.11415160447359085, + "learning_rate": 8.003127229669212e-05, + "loss": 1.5826781272888184, + "step": 119490 + }, + { + "epoch": 0.36172112663266387, + "grad_norm": 0.1058766096830368, + "learning_rate": 8.002747711505472e-05, + "loss": 1.535222053527832, + "step": 119500 + }, + { + "epoch": 0.36172112663266387, + "eval_loss": 1.5508830547332764, + "eval_runtime": 27.6903, + "eval_samples_per_second": 18.057, + "eval_steps_per_second": 1.156, + "step": 119500 + }, + { + "epoch": 0.3617513961830097, + "grad_norm": 0.10479218512773514, + "learning_rate": 8.002368193341733e-05, + "loss": 1.5477566719055176, + "step": 119510 + }, + { + "epoch": 0.3617816657333555, + "grad_norm": 0.11572083830833435, + "learning_rate": 8.001988675177995e-05, + "loss": 1.5618963241577148, + "step": 119520 + }, + { + "epoch": 0.36181193528370137, + "grad_norm": 0.10788474977016449, + "learning_rate": 8.001609157014254e-05, + "loss": 1.5260778427124024, + "step": 119530 + }, + { + "epoch": 0.36184220483404717, + "grad_norm": 0.1301324963569641, + "learning_rate": 8.001229638850516e-05, + "loss": 1.5530221939086915, + "step": 119540 + }, + { + "epoch": 0.361872474384393, + "grad_norm": 0.10905373096466064, + "learning_rate": 8.000850120686775e-05, + "loss": 1.5625744819641114, + "step": 119550 + }, + { + "epoch": 0.36190274393473887, + "grad_norm": 0.11829006671905518, + "learning_rate": 8.000470602523037e-05, + "loss": 1.540328025817871, + "step": 119560 + }, + { + "epoch": 0.36193301348508466, + "grad_norm": 0.12261106073856354, + "learning_rate": 8.000091084359297e-05, + "loss": 1.5323948860168457, + "step": 119570 + }, + { + "epoch": 0.3619632830354305, + "grad_norm": 0.1094733253121376, + "learning_rate": 7.999711566195559e-05, + "loss": 1.5044619560241699, + "step": 119580 + }, + { + "epoch": 0.3619935525857763, + "grad_norm": 0.13034740090370178, + "learning_rate": 7.999332048031819e-05, + "loss": 1.5531928062438964, + "step": 119590 + }, + { + "epoch": 0.36202382213612216, + "grad_norm": 0.11732690036296844, + "learning_rate": 7.99895252986808e-05, + "loss": 1.5643078804016113, + "step": 119600 + }, + { + "epoch": 0.362054091686468, + "grad_norm": 0.1248534545302391, + "learning_rate": 7.99857301170434e-05, + "loss": 1.5420892715454102, + "step": 119610 + }, + { + "epoch": 0.3620843612368138, + "grad_norm": 0.11128196120262146, + "learning_rate": 7.998193493540601e-05, + "loss": 1.5532312393188477, + "step": 119620 + }, + { + "epoch": 0.36211463078715966, + "grad_norm": 0.12841148674488068, + "learning_rate": 7.997813975376863e-05, + "loss": 1.5858747482299804, + "step": 119630 + }, + { + "epoch": 0.36214490033750546, + "grad_norm": 0.11707067489624023, + "learning_rate": 7.997434457213122e-05, + "loss": 1.5354080200195312, + "step": 119640 + }, + { + "epoch": 0.3621751698878513, + "grad_norm": 0.111728236079216, + "learning_rate": 7.997054939049384e-05, + "loss": 1.5090301513671875, + "step": 119650 + }, + { + "epoch": 0.36220543943819716, + "grad_norm": 0.11520233750343323, + "learning_rate": 7.996675420885643e-05, + "loss": 1.533413028717041, + "step": 119660 + }, + { + "epoch": 0.36223570898854296, + "grad_norm": 0.11058712005615234, + "learning_rate": 7.996295902721905e-05, + "loss": 1.531287956237793, + "step": 119670 + }, + { + "epoch": 0.3622659785388888, + "grad_norm": 0.11108653992414474, + "learning_rate": 7.995916384558164e-05, + "loss": 1.5094976425170898, + "step": 119680 + }, + { + "epoch": 0.3622962480892346, + "grad_norm": 0.09924820065498352, + "learning_rate": 7.995536866394426e-05, + "loss": 1.541605281829834, + "step": 119690 + }, + { + "epoch": 0.36232651763958046, + "grad_norm": 0.10353604704141617, + "learning_rate": 7.995157348230686e-05, + "loss": 1.527510643005371, + "step": 119700 + }, + { + "epoch": 0.3623567871899263, + "grad_norm": 0.12071474641561508, + "learning_rate": 7.994777830066948e-05, + "loss": 1.5398981094360351, + "step": 119710 + }, + { + "epoch": 0.3623870567402721, + "grad_norm": 0.1050884947180748, + "learning_rate": 7.994398311903207e-05, + "loss": 1.5240770339965821, + "step": 119720 + }, + { + "epoch": 0.36241732629061796, + "grad_norm": 0.10574010014533997, + "learning_rate": 7.994018793739469e-05, + "loss": 1.5819211959838868, + "step": 119730 + }, + { + "epoch": 0.3624475958409638, + "grad_norm": 0.11677144467830658, + "learning_rate": 7.993639275575728e-05, + "loss": 1.5551767349243164, + "step": 119740 + }, + { + "epoch": 0.3624778653913096, + "grad_norm": 0.10449542105197906, + "learning_rate": 7.99325975741199e-05, + "loss": 1.547894287109375, + "step": 119750 + }, + { + "epoch": 0.36250813494165546, + "grad_norm": 0.11728370934724808, + "learning_rate": 7.99288023924825e-05, + "loss": 1.5413959503173829, + "step": 119760 + }, + { + "epoch": 0.36253840449200125, + "grad_norm": 0.1114061251282692, + "learning_rate": 7.992500721084511e-05, + "loss": 1.5184553146362305, + "step": 119770 + }, + { + "epoch": 0.3625686740423471, + "grad_norm": 0.11015912890434265, + "learning_rate": 7.992121202920772e-05, + "loss": 1.5219108581542968, + "step": 119780 + }, + { + "epoch": 0.36259894359269296, + "grad_norm": 0.11109960079193115, + "learning_rate": 7.991741684757032e-05, + "loss": 1.5129501342773437, + "step": 119790 + }, + { + "epoch": 0.36262921314303875, + "grad_norm": 0.11143983900547028, + "learning_rate": 7.991362166593294e-05, + "loss": 1.5689414978027343, + "step": 119800 + }, + { + "epoch": 0.3626594826933846, + "grad_norm": 0.11739973723888397, + "learning_rate": 7.990982648429554e-05, + "loss": 1.525314712524414, + "step": 119810 + }, + { + "epoch": 0.3626897522437304, + "grad_norm": 0.12619513273239136, + "learning_rate": 7.990603130265816e-05, + "loss": 1.5503244400024414, + "step": 119820 + }, + { + "epoch": 0.36272002179407625, + "grad_norm": 0.11365983635187149, + "learning_rate": 7.990223612102075e-05, + "loss": 1.5607755661010743, + "step": 119830 + }, + { + "epoch": 0.3627502913444221, + "grad_norm": 0.10361116379499435, + "learning_rate": 7.989844093938337e-05, + "loss": 1.5388957977294921, + "step": 119840 + }, + { + "epoch": 0.3627805608947679, + "grad_norm": 0.1040213331580162, + "learning_rate": 7.989464575774596e-05, + "loss": 1.5402842521667481, + "step": 119850 + }, + { + "epoch": 0.36281083044511375, + "grad_norm": 0.10467816144227982, + "learning_rate": 7.989085057610858e-05, + "loss": 1.5489728927612305, + "step": 119860 + }, + { + "epoch": 0.36284109999545955, + "grad_norm": 0.09818840771913528, + "learning_rate": 7.988705539447117e-05, + "loss": 1.5280406951904297, + "step": 119870 + }, + { + "epoch": 0.3628713695458054, + "grad_norm": 0.10832653939723969, + "learning_rate": 7.988326021283379e-05, + "loss": 1.5666074752807617, + "step": 119880 + }, + { + "epoch": 0.36290163909615125, + "grad_norm": 0.1028154194355011, + "learning_rate": 7.98794650311964e-05, + "loss": 1.5353026390075684, + "step": 119890 + }, + { + "epoch": 0.36293190864649705, + "grad_norm": 0.10901278257369995, + "learning_rate": 7.9875669849559e-05, + "loss": 1.534797477722168, + "step": 119900 + }, + { + "epoch": 0.3629621781968429, + "grad_norm": 0.10746384412050247, + "learning_rate": 7.987187466792161e-05, + "loss": 1.514370822906494, + "step": 119910 + }, + { + "epoch": 0.3629924477471887, + "grad_norm": 0.10292394459247589, + "learning_rate": 7.986807948628421e-05, + "loss": 1.519106674194336, + "step": 119920 + }, + { + "epoch": 0.36302271729753455, + "grad_norm": 0.10460837185382843, + "learning_rate": 7.986428430464682e-05, + "loss": 1.5434364318847655, + "step": 119930 + }, + { + "epoch": 0.3630529868478804, + "grad_norm": 0.12203651666641235, + "learning_rate": 7.986048912300943e-05, + "loss": 1.56736478805542, + "step": 119940 + }, + { + "epoch": 0.3630832563982262, + "grad_norm": 0.10569170862436295, + "learning_rate": 7.985669394137203e-05, + "loss": 1.5407014846801759, + "step": 119950 + }, + { + "epoch": 0.36311352594857205, + "grad_norm": 0.10330574959516525, + "learning_rate": 7.985289875973464e-05, + "loss": 1.5655102729797363, + "step": 119960 + }, + { + "epoch": 0.36314379549891784, + "grad_norm": 0.10665087401866913, + "learning_rate": 7.984910357809724e-05, + "loss": 1.5313864707946778, + "step": 119970 + }, + { + "epoch": 0.3631740650492637, + "grad_norm": 0.11436541378498077, + "learning_rate": 7.984530839645985e-05, + "loss": 1.5213737487792969, + "step": 119980 + }, + { + "epoch": 0.36320433459960955, + "grad_norm": 0.10457946360111237, + "learning_rate": 7.984151321482247e-05, + "loss": 1.5689496994018555, + "step": 119990 + }, + { + "epoch": 0.36323460414995534, + "grad_norm": 0.1160978153347969, + "learning_rate": 7.983771803318508e-05, + "loss": 1.5200769424438476, + "step": 120000 + }, + { + "epoch": 0.36323460414995534, + "eval_loss": 1.5633267164230347, + "eval_runtime": 27.9366, + "eval_samples_per_second": 17.898, + "eval_steps_per_second": 1.145, + "step": 120000 + }, + { + "epoch": 0.3632648737003012, + "grad_norm": 0.10124709457159042, + "learning_rate": 7.983392285154768e-05, + "loss": 1.5225582122802734, + "step": 120010 + }, + { + "epoch": 0.363295143250647, + "grad_norm": 0.1108529195189476, + "learning_rate": 7.983012766991029e-05, + "loss": 1.5772748947143556, + "step": 120020 + }, + { + "epoch": 0.36332541280099284, + "grad_norm": 0.10730913281440735, + "learning_rate": 7.98263324882729e-05, + "loss": 1.5573532104492187, + "step": 120030 + }, + { + "epoch": 0.3633556823513387, + "grad_norm": 0.11869019269943237, + "learning_rate": 7.98225373066355e-05, + "loss": 1.5249086380004884, + "step": 120040 + }, + { + "epoch": 0.3633859519016845, + "grad_norm": 0.11273865401744843, + "learning_rate": 7.98187421249981e-05, + "loss": 1.5361847877502441, + "step": 120050 + }, + { + "epoch": 0.36341622145203034, + "grad_norm": 0.10496655851602554, + "learning_rate": 7.981494694336071e-05, + "loss": 1.5416032791137695, + "step": 120060 + }, + { + "epoch": 0.36344649100237614, + "grad_norm": 0.10579323023557663, + "learning_rate": 7.981115176172332e-05, + "loss": 1.5589673042297363, + "step": 120070 + }, + { + "epoch": 0.363476760552722, + "grad_norm": 0.12291567027568817, + "learning_rate": 7.980735658008592e-05, + "loss": 1.5082245826721192, + "step": 120080 + }, + { + "epoch": 0.36350703010306784, + "grad_norm": 0.1167830228805542, + "learning_rate": 7.980356139844853e-05, + "loss": 1.56365966796875, + "step": 120090 + }, + { + "epoch": 0.36353729965341364, + "grad_norm": 0.11483872681856155, + "learning_rate": 7.979976621681114e-05, + "loss": 1.519981575012207, + "step": 120100 + }, + { + "epoch": 0.3635675692037595, + "grad_norm": 0.1238923966884613, + "learning_rate": 7.979597103517374e-05, + "loss": 1.5500875473022462, + "step": 120110 + }, + { + "epoch": 0.3635978387541053, + "grad_norm": 0.11245619505643845, + "learning_rate": 7.979217585353635e-05, + "loss": 1.5412589073181153, + "step": 120120 + }, + { + "epoch": 0.36362810830445114, + "grad_norm": 0.10687452554702759, + "learning_rate": 7.978838067189897e-05, + "loss": 1.5355572700500488, + "step": 120130 + }, + { + "epoch": 0.363658377854797, + "grad_norm": 0.107281893491745, + "learning_rate": 7.978458549026156e-05, + "loss": 1.5429903030395509, + "step": 120140 + }, + { + "epoch": 0.3636886474051428, + "grad_norm": 0.1149822250008583, + "learning_rate": 7.978079030862418e-05, + "loss": 1.5542694091796876, + "step": 120150 + }, + { + "epoch": 0.36371891695548864, + "grad_norm": 0.10144532471895218, + "learning_rate": 7.977699512698677e-05, + "loss": 1.5434074401855469, + "step": 120160 + }, + { + "epoch": 0.36374918650583443, + "grad_norm": 0.09848067164421082, + "learning_rate": 7.977319994534939e-05, + "loss": 1.546915626525879, + "step": 120170 + }, + { + "epoch": 0.3637794560561803, + "grad_norm": 0.11533541232347488, + "learning_rate": 7.976940476371198e-05, + "loss": 1.5576745986938476, + "step": 120180 + }, + { + "epoch": 0.36380972560652614, + "grad_norm": 0.11215078830718994, + "learning_rate": 7.97656095820746e-05, + "loss": 1.5259885787963867, + "step": 120190 + }, + { + "epoch": 0.36383999515687193, + "grad_norm": 0.12489821016788483, + "learning_rate": 7.976181440043721e-05, + "loss": 1.5026268005371093, + "step": 120200 + }, + { + "epoch": 0.3638702647072178, + "grad_norm": 0.12430969625711441, + "learning_rate": 7.975801921879981e-05, + "loss": 1.5235472679138184, + "step": 120210 + }, + { + "epoch": 0.3639005342575636, + "grad_norm": 0.10172007232904434, + "learning_rate": 7.975422403716242e-05, + "loss": 1.5489336013793946, + "step": 120220 + }, + { + "epoch": 0.36393080380790943, + "grad_norm": 0.110166534781456, + "learning_rate": 7.975042885552503e-05, + "loss": 1.5473941802978515, + "step": 120230 + }, + { + "epoch": 0.3639610733582553, + "grad_norm": 0.10802951455116272, + "learning_rate": 7.974663367388763e-05, + "loss": 1.5490357398986816, + "step": 120240 + }, + { + "epoch": 0.3639913429086011, + "grad_norm": 0.09839242696762085, + "learning_rate": 7.974283849225024e-05, + "loss": 1.5846036911010741, + "step": 120250 + }, + { + "epoch": 0.36402161245894693, + "grad_norm": 0.11214000731706619, + "learning_rate": 7.973904331061286e-05, + "loss": 1.544135856628418, + "step": 120260 + }, + { + "epoch": 0.3640518820092927, + "grad_norm": 0.09944590926170349, + "learning_rate": 7.973524812897545e-05, + "loss": 1.5089101791381836, + "step": 120270 + }, + { + "epoch": 0.3640821515596386, + "grad_norm": 0.09933601319789886, + "learning_rate": 7.973145294733807e-05, + "loss": 1.5275936126708984, + "step": 120280 + }, + { + "epoch": 0.36411242110998443, + "grad_norm": 0.11640585958957672, + "learning_rate": 7.972765776570066e-05, + "loss": 1.556563377380371, + "step": 120290 + }, + { + "epoch": 0.3641426906603302, + "grad_norm": 0.11263760179281235, + "learning_rate": 7.972386258406328e-05, + "loss": 1.5296772003173829, + "step": 120300 + }, + { + "epoch": 0.3641729602106761, + "grad_norm": 0.12029590457677841, + "learning_rate": 7.972006740242587e-05, + "loss": 1.5592116355895995, + "step": 120310 + }, + { + "epoch": 0.3642032297610219, + "grad_norm": 0.10950545966625214, + "learning_rate": 7.97162722207885e-05, + "loss": 1.5567362785339356, + "step": 120320 + }, + { + "epoch": 0.3642334993113677, + "grad_norm": 0.11748248338699341, + "learning_rate": 7.971247703915109e-05, + "loss": 1.545548152923584, + "step": 120330 + }, + { + "epoch": 0.3642637688617136, + "grad_norm": 0.11333977431058884, + "learning_rate": 7.97086818575137e-05, + "loss": 1.525937271118164, + "step": 120340 + }, + { + "epoch": 0.3642940384120594, + "grad_norm": 0.11071894317865372, + "learning_rate": 7.97048866758763e-05, + "loss": 1.5244229316711426, + "step": 120350 + }, + { + "epoch": 0.3643243079624052, + "grad_norm": 0.10957242548465729, + "learning_rate": 7.970109149423892e-05, + "loss": 1.4957061767578126, + "step": 120360 + }, + { + "epoch": 0.364354577512751, + "grad_norm": 0.11086999624967575, + "learning_rate": 7.969729631260152e-05, + "loss": 1.5505098342895507, + "step": 120370 + }, + { + "epoch": 0.3643848470630969, + "grad_norm": 0.09627987444400787, + "learning_rate": 7.969350113096413e-05, + "loss": 1.5253780364990235, + "step": 120380 + }, + { + "epoch": 0.3644151166134427, + "grad_norm": 0.10348851978778839, + "learning_rate": 7.968970594932674e-05, + "loss": 1.5135655403137207, + "step": 120390 + }, + { + "epoch": 0.3644453861637885, + "grad_norm": 0.11579791456460953, + "learning_rate": 7.968591076768934e-05, + "loss": 1.5632316589355468, + "step": 120400 + }, + { + "epoch": 0.36447565571413437, + "grad_norm": 0.1215868666768074, + "learning_rate": 7.968211558605196e-05, + "loss": 1.5639989852905274, + "step": 120410 + }, + { + "epoch": 0.3645059252644802, + "grad_norm": 0.10190602391958237, + "learning_rate": 7.967832040441455e-05, + "loss": 1.5493000984191894, + "step": 120420 + }, + { + "epoch": 0.364536194814826, + "grad_norm": 0.13177433609962463, + "learning_rate": 7.967452522277717e-05, + "loss": 1.5790801048278809, + "step": 120430 + }, + { + "epoch": 0.36456646436517187, + "grad_norm": 0.11105908453464508, + "learning_rate": 7.967073004113976e-05, + "loss": 1.5259069442749023, + "step": 120440 + }, + { + "epoch": 0.36459673391551767, + "grad_norm": 0.11665879189968109, + "learning_rate": 7.966693485950238e-05, + "loss": 1.564550018310547, + "step": 120450 + }, + { + "epoch": 0.3646270034658635, + "grad_norm": 0.10266700387001038, + "learning_rate": 7.966313967786498e-05, + "loss": 1.5184951782226563, + "step": 120460 + }, + { + "epoch": 0.36465727301620937, + "grad_norm": 0.11235041171312332, + "learning_rate": 7.96593444962276e-05, + "loss": 1.556034469604492, + "step": 120470 + }, + { + "epoch": 0.36468754256655517, + "grad_norm": 0.11506946384906769, + "learning_rate": 7.965554931459019e-05, + "loss": 1.5536802291870118, + "step": 120480 + }, + { + "epoch": 0.364717812116901, + "grad_norm": 0.09986856579780579, + "learning_rate": 7.965175413295281e-05, + "loss": 1.5766958236694335, + "step": 120490 + }, + { + "epoch": 0.3647480816672468, + "grad_norm": 0.11113684624433517, + "learning_rate": 7.964795895131541e-05, + "loss": 1.5618319511413574, + "step": 120500 + }, + { + "epoch": 0.3647480816672468, + "eval_loss": 1.550973653793335, + "eval_runtime": 27.9785, + "eval_samples_per_second": 17.871, + "eval_steps_per_second": 1.144, + "step": 120500 + }, + { + "epoch": 0.36477835121759267, + "grad_norm": 0.12142378836870193, + "learning_rate": 7.964416376967802e-05, + "loss": 1.568485641479492, + "step": 120510 + }, + { + "epoch": 0.3648086207679385, + "grad_norm": 0.1150793805718422, + "learning_rate": 7.964036858804063e-05, + "loss": 1.5435701370239259, + "step": 120520 + }, + { + "epoch": 0.3648388903182843, + "grad_norm": 0.10476506501436234, + "learning_rate": 7.963657340640323e-05, + "loss": 1.5604364395141601, + "step": 120530 + }, + { + "epoch": 0.36486915986863017, + "grad_norm": 0.11137660592794418, + "learning_rate": 7.963277822476584e-05, + "loss": 1.5539305686950684, + "step": 120540 + }, + { + "epoch": 0.36489942941897596, + "grad_norm": 0.1218765527009964, + "learning_rate": 7.962898304312844e-05, + "loss": 1.5731151580810547, + "step": 120550 + }, + { + "epoch": 0.3649296989693218, + "grad_norm": 0.10209383070468903, + "learning_rate": 7.962518786149105e-05, + "loss": 1.5396872520446778, + "step": 120560 + }, + { + "epoch": 0.36495996851966767, + "grad_norm": 0.10293935984373093, + "learning_rate": 7.962139267985366e-05, + "loss": 1.548442268371582, + "step": 120570 + }, + { + "epoch": 0.36499023807001346, + "grad_norm": 0.10637928545475006, + "learning_rate": 7.961759749821626e-05, + "loss": 1.5240255355834962, + "step": 120580 + }, + { + "epoch": 0.3650205076203593, + "grad_norm": 0.10584153234958649, + "learning_rate": 7.961380231657887e-05, + "loss": 1.5317360877990722, + "step": 120590 + }, + { + "epoch": 0.3650507771707051, + "grad_norm": 0.11998672783374786, + "learning_rate": 7.961000713494149e-05, + "loss": 1.494852638244629, + "step": 120600 + }, + { + "epoch": 0.36508104672105096, + "grad_norm": 0.110096774995327, + "learning_rate": 7.960621195330408e-05, + "loss": 1.5842903137207032, + "step": 120610 + }, + { + "epoch": 0.3651113162713968, + "grad_norm": 0.10795463621616364, + "learning_rate": 7.96024167716667e-05, + "loss": 1.5731733322143555, + "step": 120620 + }, + { + "epoch": 0.3651415858217426, + "grad_norm": 0.10755354166030884, + "learning_rate": 7.95986215900293e-05, + "loss": 1.5591679573059083, + "step": 120630 + }, + { + "epoch": 0.36517185537208846, + "grad_norm": 0.10681460797786713, + "learning_rate": 7.959482640839191e-05, + "loss": 1.5783162117004395, + "step": 120640 + }, + { + "epoch": 0.36520212492243426, + "grad_norm": 0.11120951175689697, + "learning_rate": 7.959103122675452e-05, + "loss": 1.5520078659057617, + "step": 120650 + }, + { + "epoch": 0.3652323944727801, + "grad_norm": 0.1176246926188469, + "learning_rate": 7.958723604511712e-05, + "loss": 1.5669817924499512, + "step": 120660 + }, + { + "epoch": 0.36526266402312596, + "grad_norm": 0.1111312210559845, + "learning_rate": 7.958344086347973e-05, + "loss": 1.5791728973388672, + "step": 120670 + }, + { + "epoch": 0.36529293357347176, + "grad_norm": 0.11033046245574951, + "learning_rate": 7.957964568184233e-05, + "loss": 1.5475598335266114, + "step": 120680 + }, + { + "epoch": 0.3653232031238176, + "grad_norm": 0.12299161404371262, + "learning_rate": 7.957585050020494e-05, + "loss": 1.525613498687744, + "step": 120690 + }, + { + "epoch": 0.3653534726741634, + "grad_norm": 0.10249972343444824, + "learning_rate": 7.957205531856755e-05, + "loss": 1.571918773651123, + "step": 120700 + }, + { + "epoch": 0.36538374222450926, + "grad_norm": 0.1003580093383789, + "learning_rate": 7.956826013693015e-05, + "loss": 1.5582473754882813, + "step": 120710 + }, + { + "epoch": 0.3654140117748551, + "grad_norm": 0.1130504161119461, + "learning_rate": 7.956446495529276e-05, + "loss": 1.5504773139953614, + "step": 120720 + }, + { + "epoch": 0.3654442813252009, + "grad_norm": 0.11256568133831024, + "learning_rate": 7.956066977365536e-05, + "loss": 1.5360151290893556, + "step": 120730 + }, + { + "epoch": 0.36547455087554676, + "grad_norm": 0.10831775516271591, + "learning_rate": 7.955687459201798e-05, + "loss": 1.5601024627685547, + "step": 120740 + }, + { + "epoch": 0.36550482042589255, + "grad_norm": 0.1115284264087677, + "learning_rate": 7.955307941038058e-05, + "loss": 1.535411262512207, + "step": 120750 + }, + { + "epoch": 0.3655350899762384, + "grad_norm": 0.10392200201749802, + "learning_rate": 7.95492842287432e-05, + "loss": 1.5227228164672852, + "step": 120760 + }, + { + "epoch": 0.36556535952658425, + "grad_norm": 0.10916021466255188, + "learning_rate": 7.954548904710579e-05, + "loss": 1.5627289772033692, + "step": 120770 + }, + { + "epoch": 0.36559562907693005, + "grad_norm": 0.10869880020618439, + "learning_rate": 7.954169386546841e-05, + "loss": 1.5544707298278808, + "step": 120780 + }, + { + "epoch": 0.3656258986272759, + "grad_norm": 0.10361681133508682, + "learning_rate": 7.9537898683831e-05, + "loss": 1.5361621856689454, + "step": 120790 + }, + { + "epoch": 0.3656561681776217, + "grad_norm": 0.11336758732795715, + "learning_rate": 7.953410350219362e-05, + "loss": 1.504345989227295, + "step": 120800 + }, + { + "epoch": 0.36568643772796755, + "grad_norm": 0.10484078526496887, + "learning_rate": 7.953030832055623e-05, + "loss": 1.5674380302429198, + "step": 120810 + }, + { + "epoch": 0.3657167072783134, + "grad_norm": 0.10523830354213715, + "learning_rate": 7.952651313891883e-05, + "loss": 1.5296738624572754, + "step": 120820 + }, + { + "epoch": 0.3657469768286592, + "grad_norm": 0.10920606553554535, + "learning_rate": 7.952271795728144e-05, + "loss": 1.5466577529907226, + "step": 120830 + }, + { + "epoch": 0.36577724637900505, + "grad_norm": 0.10608847439289093, + "learning_rate": 7.951892277564404e-05, + "loss": 1.5460431098937988, + "step": 120840 + }, + { + "epoch": 0.36580751592935085, + "grad_norm": 0.11173973232507706, + "learning_rate": 7.951512759400665e-05, + "loss": 1.5571560859680176, + "step": 120850 + }, + { + "epoch": 0.3658377854796967, + "grad_norm": 0.11765037477016449, + "learning_rate": 7.951133241236926e-05, + "loss": 1.5718464851379395, + "step": 120860 + }, + { + "epoch": 0.36586805503004255, + "grad_norm": 0.13725413382053375, + "learning_rate": 7.950753723073187e-05, + "loss": 1.524782371520996, + "step": 120870 + }, + { + "epoch": 0.36589832458038835, + "grad_norm": 0.10552018135786057, + "learning_rate": 7.950374204909447e-05, + "loss": 1.5478731155395509, + "step": 120880 + }, + { + "epoch": 0.3659285941307342, + "grad_norm": 0.11558979749679565, + "learning_rate": 7.949994686745709e-05, + "loss": 1.561782169342041, + "step": 120890 + }, + { + "epoch": 0.36595886368108, + "grad_norm": 0.10541361570358276, + "learning_rate": 7.949615168581968e-05, + "loss": 1.534773349761963, + "step": 120900 + }, + { + "epoch": 0.36598913323142584, + "grad_norm": 0.11552467942237854, + "learning_rate": 7.94923565041823e-05, + "loss": 1.5536455154418944, + "step": 120910 + }, + { + "epoch": 0.3660194027817717, + "grad_norm": 0.11472722142934799, + "learning_rate": 7.948856132254489e-05, + "loss": 1.5656822204589844, + "step": 120920 + }, + { + "epoch": 0.3660496723321175, + "grad_norm": 0.10478921979665756, + "learning_rate": 7.948476614090751e-05, + "loss": 1.548710250854492, + "step": 120930 + }, + { + "epoch": 0.36607994188246334, + "grad_norm": 0.1150604858994484, + "learning_rate": 7.94809709592701e-05, + "loss": 1.606058692932129, + "step": 120940 + }, + { + "epoch": 0.36611021143280914, + "grad_norm": 0.11389435827732086, + "learning_rate": 7.947717577763272e-05, + "loss": 1.5576642990112304, + "step": 120950 + }, + { + "epoch": 0.366140480983155, + "grad_norm": 0.09773943573236465, + "learning_rate": 7.947338059599531e-05, + "loss": 1.574719524383545, + "step": 120960 + }, + { + "epoch": 0.36617075053350084, + "grad_norm": 0.1093868762254715, + "learning_rate": 7.946958541435793e-05, + "loss": 1.5656917572021485, + "step": 120970 + }, + { + "epoch": 0.36620102008384664, + "grad_norm": 0.11978507786989212, + "learning_rate": 7.946579023272053e-05, + "loss": 1.529879093170166, + "step": 120980 + }, + { + "epoch": 0.3662312896341925, + "grad_norm": 0.11136634647846222, + "learning_rate": 7.946199505108315e-05, + "loss": 1.5193841934204102, + "step": 120990 + }, + { + "epoch": 0.3662615591845383, + "grad_norm": 0.1220119297504425, + "learning_rate": 7.945819986944575e-05, + "loss": 1.5277981758117676, + "step": 121000 + }, + { + "epoch": 0.3662615591845383, + "eval_loss": 1.5413918495178223, + "eval_runtime": 28.0119, + "eval_samples_per_second": 17.85, + "eval_steps_per_second": 1.142, + "step": 121000 + }, + { + "epoch": 0.36629182873488414, + "grad_norm": 0.11625485867261887, + "learning_rate": 7.945440468780836e-05, + "loss": 1.5949392318725586, + "step": 121010 + }, + { + "epoch": 0.36632209828523, + "grad_norm": 0.10736967623233795, + "learning_rate": 7.945060950617098e-05, + "loss": 1.5687997817993165, + "step": 121020 + }, + { + "epoch": 0.3663523678355758, + "grad_norm": 0.11732961237430573, + "learning_rate": 7.944681432453357e-05, + "loss": 1.5418173789978027, + "step": 121030 + }, + { + "epoch": 0.36638263738592164, + "grad_norm": 0.10867082327604294, + "learning_rate": 7.944301914289619e-05, + "loss": 1.5171241760253906, + "step": 121040 + }, + { + "epoch": 0.36641290693626744, + "grad_norm": 0.11778106540441513, + "learning_rate": 7.943922396125878e-05, + "loss": 1.5353311538696288, + "step": 121050 + }, + { + "epoch": 0.3664431764866133, + "grad_norm": 0.0982089638710022, + "learning_rate": 7.94354287796214e-05, + "loss": 1.537977695465088, + "step": 121060 + }, + { + "epoch": 0.36647344603695914, + "grad_norm": 0.1152745932340622, + "learning_rate": 7.9431633597984e-05, + "loss": 1.5167266845703125, + "step": 121070 + }, + { + "epoch": 0.36650371558730493, + "grad_norm": 0.11956680566072464, + "learning_rate": 7.942783841634661e-05, + "loss": 1.535594367980957, + "step": 121080 + }, + { + "epoch": 0.3665339851376508, + "grad_norm": 0.11688853800296783, + "learning_rate": 7.94240432347092e-05, + "loss": 1.5343825340270996, + "step": 121090 + }, + { + "epoch": 0.3665642546879966, + "grad_norm": 0.1159057542681694, + "learning_rate": 7.942024805307183e-05, + "loss": 1.5273802757263184, + "step": 121100 + }, + { + "epoch": 0.36659452423834243, + "grad_norm": 0.11993812024593353, + "learning_rate": 7.941645287143443e-05, + "loss": 1.5806733131408692, + "step": 121110 + }, + { + "epoch": 0.3666247937886883, + "grad_norm": 0.1226794645190239, + "learning_rate": 7.941265768979704e-05, + "loss": 1.5057061195373536, + "step": 121120 + }, + { + "epoch": 0.3666550633390341, + "grad_norm": 0.11386077105998993, + "learning_rate": 7.940886250815964e-05, + "loss": 1.543593406677246, + "step": 121130 + }, + { + "epoch": 0.36668533288937993, + "grad_norm": 0.10809510201215744, + "learning_rate": 7.940506732652225e-05, + "loss": 1.5651536941528321, + "step": 121140 + }, + { + "epoch": 0.3667156024397258, + "grad_norm": 0.10951322317123413, + "learning_rate": 7.940127214488486e-05, + "loss": 1.5529452323913575, + "step": 121150 + }, + { + "epoch": 0.3667458719900716, + "grad_norm": 0.1118670180439949, + "learning_rate": 7.939747696324746e-05, + "loss": 1.5465265274047852, + "step": 121160 + }, + { + "epoch": 0.36677614154041743, + "grad_norm": 0.11959612369537354, + "learning_rate": 7.939368178161007e-05, + "loss": 1.5482637405395507, + "step": 121170 + }, + { + "epoch": 0.36680641109076323, + "grad_norm": 0.12018473446369171, + "learning_rate": 7.938988659997267e-05, + "loss": 1.5279788017272948, + "step": 121180 + }, + { + "epoch": 0.3668366806411091, + "grad_norm": 0.09948904067277908, + "learning_rate": 7.938609141833528e-05, + "loss": 1.5558364868164063, + "step": 121190 + }, + { + "epoch": 0.36686695019145493, + "grad_norm": 0.10478974878787994, + "learning_rate": 7.938229623669788e-05, + "loss": 1.5603118896484376, + "step": 121200 + }, + { + "epoch": 0.36689721974180073, + "grad_norm": 0.11125880479812622, + "learning_rate": 7.93785010550605e-05, + "loss": 1.4998064041137695, + "step": 121210 + }, + { + "epoch": 0.3669274892921466, + "grad_norm": 0.11443865299224854, + "learning_rate": 7.93747058734231e-05, + "loss": 1.5420538902282714, + "step": 121220 + }, + { + "epoch": 0.3669577588424924, + "grad_norm": 0.11169029027223587, + "learning_rate": 7.937091069178572e-05, + "loss": 1.568712043762207, + "step": 121230 + }, + { + "epoch": 0.36698802839283823, + "grad_norm": 0.10948793590068817, + "learning_rate": 7.936711551014832e-05, + "loss": 1.557958984375, + "step": 121240 + }, + { + "epoch": 0.3670182979431841, + "grad_norm": 0.10046643018722534, + "learning_rate": 7.936332032851093e-05, + "loss": 1.5824163436889649, + "step": 121250 + }, + { + "epoch": 0.3670485674935299, + "grad_norm": 0.11191291362047195, + "learning_rate": 7.935952514687353e-05, + "loss": 1.5768251419067383, + "step": 121260 + }, + { + "epoch": 0.3670788370438757, + "grad_norm": 0.11360278725624084, + "learning_rate": 7.935572996523614e-05, + "loss": 1.5508540153503418, + "step": 121270 + }, + { + "epoch": 0.3671091065942215, + "grad_norm": 0.10874755680561066, + "learning_rate": 7.935193478359875e-05, + "loss": 1.5385693550109862, + "step": 121280 + }, + { + "epoch": 0.3671393761445674, + "grad_norm": 0.11539915949106216, + "learning_rate": 7.934813960196135e-05, + "loss": 1.546433448791504, + "step": 121290 + }, + { + "epoch": 0.3671696456949132, + "grad_norm": 0.11043154448270798, + "learning_rate": 7.934434442032396e-05, + "loss": 1.5142074584960938, + "step": 121300 + }, + { + "epoch": 0.367199915245259, + "grad_norm": 0.12096555531024933, + "learning_rate": 7.934054923868656e-05, + "loss": 1.5458633422851562, + "step": 121310 + }, + { + "epoch": 0.3672301847956049, + "grad_norm": 0.1101599633693695, + "learning_rate": 7.933675405704917e-05, + "loss": 1.5713300704956055, + "step": 121320 + }, + { + "epoch": 0.36726045434595067, + "grad_norm": 0.10140800476074219, + "learning_rate": 7.933295887541178e-05, + "loss": 1.5646126747131348, + "step": 121330 + }, + { + "epoch": 0.3672907238962965, + "grad_norm": 0.1004246175289154, + "learning_rate": 7.932916369377438e-05, + "loss": 1.4969993591308595, + "step": 121340 + }, + { + "epoch": 0.3673209934466424, + "grad_norm": 0.11967848986387253, + "learning_rate": 7.932536851213699e-05, + "loss": 1.5638341903686523, + "step": 121350 + }, + { + "epoch": 0.36735126299698817, + "grad_norm": 0.12819305062294006, + "learning_rate": 7.93215733304996e-05, + "loss": 1.5379382133483888, + "step": 121360 + }, + { + "epoch": 0.367381532547334, + "grad_norm": 0.11479643732309341, + "learning_rate": 7.931777814886221e-05, + "loss": 1.5305692672729492, + "step": 121370 + }, + { + "epoch": 0.3674118020976798, + "grad_norm": 0.09799011051654816, + "learning_rate": 7.93139829672248e-05, + "loss": 1.53649959564209, + "step": 121380 + }, + { + "epoch": 0.36744207164802567, + "grad_norm": 0.10336767882108688, + "learning_rate": 7.931018778558743e-05, + "loss": 1.5557086944580079, + "step": 121390 + }, + { + "epoch": 0.3674723411983715, + "grad_norm": 0.111362986266613, + "learning_rate": 7.930639260395002e-05, + "loss": 1.5476001739501952, + "step": 121400 + }, + { + "epoch": 0.3675026107487173, + "grad_norm": 0.10849712044000626, + "learning_rate": 7.930259742231264e-05, + "loss": 1.5233266830444336, + "step": 121410 + }, + { + "epoch": 0.36753288029906317, + "grad_norm": 0.11260806024074554, + "learning_rate": 7.929880224067524e-05, + "loss": 1.5204008102416993, + "step": 121420 + }, + { + "epoch": 0.36756314984940897, + "grad_norm": 0.10141921788454056, + "learning_rate": 7.929500705903785e-05, + "loss": 1.5333399772644043, + "step": 121430 + }, + { + "epoch": 0.3675934193997548, + "grad_norm": 0.11176460981369019, + "learning_rate": 7.929121187740045e-05, + "loss": 1.5681941986083985, + "step": 121440 + }, + { + "epoch": 0.36762368895010067, + "grad_norm": 0.11704893410205841, + "learning_rate": 7.928741669576306e-05, + "loss": 1.5593008995056152, + "step": 121450 + }, + { + "epoch": 0.36765395850044646, + "grad_norm": 0.13241077959537506, + "learning_rate": 7.928362151412567e-05, + "loss": 1.5569483757019043, + "step": 121460 + }, + { + "epoch": 0.3676842280507923, + "grad_norm": 0.10922811925411224, + "learning_rate": 7.927982633248827e-05, + "loss": 1.5289775848388671, + "step": 121470 + }, + { + "epoch": 0.3677144976011381, + "grad_norm": 0.10191714018583298, + "learning_rate": 7.927603115085089e-05, + "loss": 1.5387109756469726, + "step": 121480 + }, + { + "epoch": 0.36774476715148396, + "grad_norm": 0.11022002249956131, + "learning_rate": 7.927223596921348e-05, + "loss": 1.5458318710327148, + "step": 121490 + }, + { + "epoch": 0.3677750367018298, + "grad_norm": 0.10148750990629196, + "learning_rate": 7.92684407875761e-05, + "loss": 1.5493069648742677, + "step": 121500 + }, + { + "epoch": 0.3677750367018298, + "eval_loss": 1.5408555269241333, + "eval_runtime": 28.325, + "eval_samples_per_second": 17.652, + "eval_steps_per_second": 1.13, + "step": 121500 + }, + { + "epoch": 0.3678053062521756, + "grad_norm": 0.10759423673152924, + "learning_rate": 7.92646456059387e-05, + "loss": 1.5064889907836914, + "step": 121510 + }, + { + "epoch": 0.36783557580252146, + "grad_norm": 0.10132208466529846, + "learning_rate": 7.926085042430132e-05, + "loss": 1.5264044761657716, + "step": 121520 + }, + { + "epoch": 0.36786584535286726, + "grad_norm": 0.12227798253297806, + "learning_rate": 7.925705524266391e-05, + "loss": 1.5661041259765625, + "step": 121530 + }, + { + "epoch": 0.3678961149032131, + "grad_norm": 0.10615949332714081, + "learning_rate": 7.925326006102653e-05, + "loss": 1.5331290245056153, + "step": 121540 + }, + { + "epoch": 0.36792638445355896, + "grad_norm": 0.10903488099575043, + "learning_rate": 7.924946487938912e-05, + "loss": 1.5344558715820313, + "step": 121550 + }, + { + "epoch": 0.36795665400390476, + "grad_norm": 0.1183042898774147, + "learning_rate": 7.924566969775174e-05, + "loss": 1.5302552223205566, + "step": 121560 + }, + { + "epoch": 0.3679869235542506, + "grad_norm": 0.10882695764303207, + "learning_rate": 7.924187451611433e-05, + "loss": 1.5840234756469727, + "step": 121570 + }, + { + "epoch": 0.3680171931045964, + "grad_norm": 0.12431253492832184, + "learning_rate": 7.923807933447695e-05, + "loss": 1.5287742614746094, + "step": 121580 + }, + { + "epoch": 0.36804746265494226, + "grad_norm": 0.12548808753490448, + "learning_rate": 7.923428415283954e-05, + "loss": 1.5276636123657226, + "step": 121590 + }, + { + "epoch": 0.3680777322052881, + "grad_norm": 0.10964632779359818, + "learning_rate": 7.923048897120216e-05, + "loss": 1.5875213623046875, + "step": 121600 + }, + { + "epoch": 0.3681080017556339, + "grad_norm": 0.11524340510368347, + "learning_rate": 7.922669378956478e-05, + "loss": 1.5607707977294922, + "step": 121610 + }, + { + "epoch": 0.36813827130597976, + "grad_norm": 0.10291913896799088, + "learning_rate": 7.922289860792738e-05, + "loss": 1.5107549667358398, + "step": 121620 + }, + { + "epoch": 0.36816854085632555, + "grad_norm": 0.10143952071666718, + "learning_rate": 7.921910342629e-05, + "loss": 1.532886791229248, + "step": 121630 + }, + { + "epoch": 0.3681988104066714, + "grad_norm": 0.12100222706794739, + "learning_rate": 7.921530824465259e-05, + "loss": 1.5214537620544433, + "step": 121640 + }, + { + "epoch": 0.36822907995701726, + "grad_norm": 0.12114809453487396, + "learning_rate": 7.921151306301521e-05, + "loss": 1.585560417175293, + "step": 121650 + }, + { + "epoch": 0.36825934950736305, + "grad_norm": 0.10845267027616501, + "learning_rate": 7.92077178813778e-05, + "loss": 1.5792501449584961, + "step": 121660 + }, + { + "epoch": 0.3682896190577089, + "grad_norm": 0.11358881741762161, + "learning_rate": 7.920392269974042e-05, + "loss": 1.5425817489624023, + "step": 121670 + }, + { + "epoch": 0.3683198886080547, + "grad_norm": 0.11378277093172073, + "learning_rate": 7.920012751810301e-05, + "loss": 1.507070541381836, + "step": 121680 + }, + { + "epoch": 0.36835015815840055, + "grad_norm": 0.10717252641916275, + "learning_rate": 7.919633233646563e-05, + "loss": 1.5559466361999512, + "step": 121690 + }, + { + "epoch": 0.3683804277087464, + "grad_norm": 0.10942478477954865, + "learning_rate": 7.919253715482822e-05, + "loss": 1.5020963668823242, + "step": 121700 + }, + { + "epoch": 0.3684106972590922, + "grad_norm": 0.10279570519924164, + "learning_rate": 7.918874197319084e-05, + "loss": 1.5471355438232421, + "step": 121710 + }, + { + "epoch": 0.36844096680943805, + "grad_norm": 0.10676228255033493, + "learning_rate": 7.918494679155343e-05, + "loss": 1.5778305053710937, + "step": 121720 + }, + { + "epoch": 0.36847123635978385, + "grad_norm": 0.10254205018281937, + "learning_rate": 7.918115160991605e-05, + "loss": 1.503959846496582, + "step": 121730 + }, + { + "epoch": 0.3685015059101297, + "grad_norm": 0.0993991270661354, + "learning_rate": 7.917735642827866e-05, + "loss": 1.5488264083862304, + "step": 121740 + }, + { + "epoch": 0.36853177546047555, + "grad_norm": 0.10939555615186691, + "learning_rate": 7.917356124664127e-05, + "loss": 1.5625238418579102, + "step": 121750 + }, + { + "epoch": 0.36856204501082135, + "grad_norm": 0.12551866471767426, + "learning_rate": 7.916976606500387e-05, + "loss": 1.5317464828491212, + "step": 121760 + }, + { + "epoch": 0.3685923145611672, + "grad_norm": 0.10028201341629028, + "learning_rate": 7.916597088336648e-05, + "loss": 1.5826276779174804, + "step": 121770 + }, + { + "epoch": 0.368622584111513, + "grad_norm": 0.11248858273029327, + "learning_rate": 7.916217570172908e-05, + "loss": 1.5367273330688476, + "step": 121780 + }, + { + "epoch": 0.36865285366185885, + "grad_norm": 0.10055982321500778, + "learning_rate": 7.915838052009169e-05, + "loss": 1.5394169807434082, + "step": 121790 + }, + { + "epoch": 0.3686831232122047, + "grad_norm": 0.11250074207782745, + "learning_rate": 7.91545853384543e-05, + "loss": 1.570627212524414, + "step": 121800 + }, + { + "epoch": 0.3687133927625505, + "grad_norm": 0.10962216556072235, + "learning_rate": 7.91507901568169e-05, + "loss": 1.5593594551086425, + "step": 121810 + }, + { + "epoch": 0.36874366231289635, + "grad_norm": 0.12247055023908615, + "learning_rate": 7.914699497517952e-05, + "loss": 1.546356773376465, + "step": 121820 + }, + { + "epoch": 0.3687739318632422, + "grad_norm": 0.11205495148897171, + "learning_rate": 7.914319979354211e-05, + "loss": 1.5465252876281739, + "step": 121830 + }, + { + "epoch": 0.368804201413588, + "grad_norm": 0.10634899139404297, + "learning_rate": 7.913940461190473e-05, + "loss": 1.5436829566955566, + "step": 121840 + }, + { + "epoch": 0.36883447096393385, + "grad_norm": 0.10750797390937805, + "learning_rate": 7.913560943026734e-05, + "loss": 1.5440485954284668, + "step": 121850 + }, + { + "epoch": 0.36886474051427964, + "grad_norm": 0.1174682006239891, + "learning_rate": 7.913181424862995e-05, + "loss": 1.502757740020752, + "step": 121860 + }, + { + "epoch": 0.3688950100646255, + "grad_norm": 0.09959378093481064, + "learning_rate": 7.912801906699255e-05, + "loss": 1.4904390335083009, + "step": 121870 + }, + { + "epoch": 0.36892527961497135, + "grad_norm": 0.11302772164344788, + "learning_rate": 7.912422388535516e-05, + "loss": 1.5356632232666017, + "step": 121880 + }, + { + "epoch": 0.36895554916531714, + "grad_norm": 0.11127744615077972, + "learning_rate": 7.912042870371776e-05, + "loss": 1.5314473152160644, + "step": 121890 + }, + { + "epoch": 0.368985818715663, + "grad_norm": 0.10524328052997589, + "learning_rate": 7.911663352208037e-05, + "loss": 1.5764188766479492, + "step": 121900 + }, + { + "epoch": 0.3690160882660088, + "grad_norm": 0.10708094388246536, + "learning_rate": 7.911283834044298e-05, + "loss": 1.5254743576049805, + "step": 121910 + }, + { + "epoch": 0.36904635781635464, + "grad_norm": 0.11997707933187485, + "learning_rate": 7.910904315880558e-05, + "loss": 1.5528639793395995, + "step": 121920 + }, + { + "epoch": 0.3690766273667005, + "grad_norm": 0.11097965389490128, + "learning_rate": 7.910524797716819e-05, + "loss": 1.5299516677856446, + "step": 121930 + }, + { + "epoch": 0.3691068969170463, + "grad_norm": 0.10963863134384155, + "learning_rate": 7.910145279553079e-05, + "loss": 1.55627384185791, + "step": 121940 + }, + { + "epoch": 0.36913716646739214, + "grad_norm": 0.11156019568443298, + "learning_rate": 7.90976576138934e-05, + "loss": 1.5258990287780763, + "step": 121950 + }, + { + "epoch": 0.36916743601773794, + "grad_norm": 0.11152108758687973, + "learning_rate": 7.9093862432256e-05, + "loss": 1.5226699829101562, + "step": 121960 + }, + { + "epoch": 0.3691977055680838, + "grad_norm": 0.11568670719861984, + "learning_rate": 7.909006725061861e-05, + "loss": 1.5238221168518067, + "step": 121970 + }, + { + "epoch": 0.36922797511842964, + "grad_norm": 0.11384882777929306, + "learning_rate": 7.908627206898123e-05, + "loss": 1.5534112930297852, + "step": 121980 + }, + { + "epoch": 0.36925824466877544, + "grad_norm": 0.11453069746494293, + "learning_rate": 7.908247688734382e-05, + "loss": 1.5106845855712892, + "step": 121990 + }, + { + "epoch": 0.3692885142191213, + "grad_norm": 0.10644388198852539, + "learning_rate": 7.907868170570644e-05, + "loss": 1.5445466041564941, + "step": 122000 + }, + { + "epoch": 0.3692885142191213, + "eval_loss": 1.5484116077423096, + "eval_runtime": 27.2932, + "eval_samples_per_second": 18.32, + "eval_steps_per_second": 1.172, + "step": 122000 + }, + { + "epoch": 0.3693187837694671, + "grad_norm": 0.11427250504493713, + "learning_rate": 7.907488652406903e-05, + "loss": 1.553147315979004, + "step": 122010 + }, + { + "epoch": 0.36934905331981294, + "grad_norm": 0.10993396490812302, + "learning_rate": 7.907109134243165e-05, + "loss": 1.5345660209655763, + "step": 122020 + }, + { + "epoch": 0.3693793228701588, + "grad_norm": 0.122330442070961, + "learning_rate": 7.906729616079426e-05, + "loss": 1.5128704071044923, + "step": 122030 + }, + { + "epoch": 0.3694095924205046, + "grad_norm": 0.11085987836122513, + "learning_rate": 7.906350097915687e-05, + "loss": 1.554330062866211, + "step": 122040 + }, + { + "epoch": 0.36943986197085044, + "grad_norm": 0.10586876422166824, + "learning_rate": 7.905970579751947e-05, + "loss": 1.5411888122558595, + "step": 122050 + }, + { + "epoch": 0.36947013152119623, + "grad_norm": 0.11986511945724487, + "learning_rate": 7.905591061588208e-05, + "loss": 1.532672119140625, + "step": 122060 + }, + { + "epoch": 0.3695004010715421, + "grad_norm": 0.10202478617429733, + "learning_rate": 7.905211543424468e-05, + "loss": 1.5673242568969727, + "step": 122070 + }, + { + "epoch": 0.36953067062188794, + "grad_norm": 0.10262223333120346, + "learning_rate": 7.904832025260729e-05, + "loss": 1.5412943840026856, + "step": 122080 + }, + { + "epoch": 0.36956094017223373, + "grad_norm": 0.10415371507406235, + "learning_rate": 7.904452507096991e-05, + "loss": 1.5640762329101563, + "step": 122090 + }, + { + "epoch": 0.3695912097225796, + "grad_norm": 0.09989596903324127, + "learning_rate": 7.90407298893325e-05, + "loss": 1.5614599227905273, + "step": 122100 + }, + { + "epoch": 0.3696214792729254, + "grad_norm": 0.10245871543884277, + "learning_rate": 7.903693470769512e-05, + "loss": 1.564773178100586, + "step": 122110 + }, + { + "epoch": 0.36965174882327123, + "grad_norm": 0.10156626999378204, + "learning_rate": 7.903313952605771e-05, + "loss": 1.5454068183898926, + "step": 122120 + }, + { + "epoch": 0.3696820183736171, + "grad_norm": 0.11086229979991913, + "learning_rate": 7.902934434442033e-05, + "loss": 1.506918716430664, + "step": 122130 + }, + { + "epoch": 0.3697122879239629, + "grad_norm": 0.13419127464294434, + "learning_rate": 7.902554916278293e-05, + "loss": 1.5274198532104493, + "step": 122140 + }, + { + "epoch": 0.36974255747430873, + "grad_norm": 0.12014234066009521, + "learning_rate": 7.902175398114555e-05, + "loss": 1.567855167388916, + "step": 122150 + }, + { + "epoch": 0.3697728270246545, + "grad_norm": 0.10803484171628952, + "learning_rate": 7.901795879950814e-05, + "loss": 1.5395217895507813, + "step": 122160 + }, + { + "epoch": 0.3698030965750004, + "grad_norm": 0.1181894987821579, + "learning_rate": 7.901416361787076e-05, + "loss": 1.549666976928711, + "step": 122170 + }, + { + "epoch": 0.36983336612534623, + "grad_norm": 0.11521488428115845, + "learning_rate": 7.901036843623335e-05, + "loss": 1.5736785888671876, + "step": 122180 + }, + { + "epoch": 0.369863635675692, + "grad_norm": 0.1176970899105072, + "learning_rate": 7.900657325459597e-05, + "loss": 1.5464102745056152, + "step": 122190 + }, + { + "epoch": 0.3698939052260379, + "grad_norm": 0.10450426489114761, + "learning_rate": 7.900277807295856e-05, + "loss": 1.5548590660095214, + "step": 122200 + }, + { + "epoch": 0.3699241747763837, + "grad_norm": 0.10875042527914047, + "learning_rate": 7.899898289132118e-05, + "loss": 1.5797649383544923, + "step": 122210 + }, + { + "epoch": 0.3699544443267295, + "grad_norm": 0.11004681140184402, + "learning_rate": 7.89951877096838e-05, + "loss": 1.5411713600158692, + "step": 122220 + }, + { + "epoch": 0.3699847138770754, + "grad_norm": 0.10774191468954086, + "learning_rate": 7.899139252804639e-05, + "loss": 1.5483120918273925, + "step": 122230 + }, + { + "epoch": 0.3700149834274212, + "grad_norm": 0.11094417423009872, + "learning_rate": 7.898759734640901e-05, + "loss": 1.5030275344848634, + "step": 122240 + }, + { + "epoch": 0.370045252977767, + "grad_norm": 0.10439768433570862, + "learning_rate": 7.89838021647716e-05, + "loss": 1.5360549926757812, + "step": 122250 + }, + { + "epoch": 0.3700755225281128, + "grad_norm": 0.11219077557325363, + "learning_rate": 7.898000698313422e-05, + "loss": 1.5176158905029298, + "step": 122260 + }, + { + "epoch": 0.3701057920784587, + "grad_norm": 0.10783301293849945, + "learning_rate": 7.897621180149682e-05, + "loss": 1.5507708549499513, + "step": 122270 + }, + { + "epoch": 0.3701360616288045, + "grad_norm": 0.12051955610513687, + "learning_rate": 7.897241661985944e-05, + "loss": 1.5484156608581543, + "step": 122280 + }, + { + "epoch": 0.3701663311791503, + "grad_norm": 0.13103652000427246, + "learning_rate": 7.896862143822203e-05, + "loss": 1.5166220664978027, + "step": 122290 + }, + { + "epoch": 0.3701966007294962, + "grad_norm": 0.11583051830530167, + "learning_rate": 7.896482625658465e-05, + "loss": 1.5599235534667968, + "step": 122300 + }, + { + "epoch": 0.37022687027984197, + "grad_norm": 0.11727794259786606, + "learning_rate": 7.896103107494724e-05, + "loss": 1.5427690505981446, + "step": 122310 + }, + { + "epoch": 0.3702571398301878, + "grad_norm": 0.1072007492184639, + "learning_rate": 7.895723589330986e-05, + "loss": 1.5380434036254882, + "step": 122320 + }, + { + "epoch": 0.37028740938053367, + "grad_norm": 0.1062387228012085, + "learning_rate": 7.895344071167245e-05, + "loss": 1.5723862648010254, + "step": 122330 + }, + { + "epoch": 0.37031767893087947, + "grad_norm": 0.11533685028553009, + "learning_rate": 7.894964553003507e-05, + "loss": 1.522743034362793, + "step": 122340 + }, + { + "epoch": 0.3703479484812253, + "grad_norm": 0.10378537327051163, + "learning_rate": 7.894585034839768e-05, + "loss": 1.5424856185913085, + "step": 122350 + }, + { + "epoch": 0.3703782180315711, + "grad_norm": 0.09172936528921127, + "learning_rate": 7.894205516676028e-05, + "loss": 1.5566478729248048, + "step": 122360 + }, + { + "epoch": 0.37040848758191697, + "grad_norm": 0.10992442071437836, + "learning_rate": 7.893825998512289e-05, + "loss": 1.5285406112670898, + "step": 122370 + }, + { + "epoch": 0.3704387571322628, + "grad_norm": 0.09275504946708679, + "learning_rate": 7.89344648034855e-05, + "loss": 1.535636043548584, + "step": 122380 + }, + { + "epoch": 0.3704690266826086, + "grad_norm": 0.1079263836145401, + "learning_rate": 7.89306696218481e-05, + "loss": 1.5487369537353515, + "step": 122390 + }, + { + "epoch": 0.37049929623295447, + "grad_norm": 0.12001155316829681, + "learning_rate": 7.892687444021071e-05, + "loss": 1.577598762512207, + "step": 122400 + }, + { + "epoch": 0.37052956578330026, + "grad_norm": 0.11082439124584198, + "learning_rate": 7.892307925857331e-05, + "loss": 1.5094077110290527, + "step": 122410 + }, + { + "epoch": 0.3705598353336461, + "grad_norm": 0.10763595253229141, + "learning_rate": 7.891928407693592e-05, + "loss": 1.5221047401428223, + "step": 122420 + }, + { + "epoch": 0.37059010488399197, + "grad_norm": 0.10990901291370392, + "learning_rate": 7.891548889529854e-05, + "loss": 1.5378063201904297, + "step": 122430 + }, + { + "epoch": 0.37062037443433776, + "grad_norm": 0.1165250763297081, + "learning_rate": 7.891169371366113e-05, + "loss": 1.517630386352539, + "step": 122440 + }, + { + "epoch": 0.3706506439846836, + "grad_norm": 0.11133051663637161, + "learning_rate": 7.890789853202375e-05, + "loss": 1.5582498550415038, + "step": 122450 + }, + { + "epoch": 0.3706809135350294, + "grad_norm": 0.1184321716427803, + "learning_rate": 7.890410335038634e-05, + "loss": 1.544774627685547, + "step": 122460 + }, + { + "epoch": 0.37071118308537526, + "grad_norm": 0.12668946385383606, + "learning_rate": 7.890030816874896e-05, + "loss": 1.564579963684082, + "step": 122470 + }, + { + "epoch": 0.3707414526357211, + "grad_norm": 0.11095356941223145, + "learning_rate": 7.889651298711157e-05, + "loss": 1.5396720886230468, + "step": 122480 + }, + { + "epoch": 0.3707717221860669, + "grad_norm": 0.10568206012248993, + "learning_rate": 7.889271780547417e-05, + "loss": 1.5133660316467286, + "step": 122490 + }, + { + "epoch": 0.37080199173641276, + "grad_norm": 0.11597555875778198, + "learning_rate": 7.888892262383678e-05, + "loss": 1.522661018371582, + "step": 122500 + }, + { + "epoch": 0.37080199173641276, + "eval_loss": 1.542797327041626, + "eval_runtime": 27.9916, + "eval_samples_per_second": 17.862, + "eval_steps_per_second": 1.143, + "step": 122500 + }, + { + "epoch": 0.37083226128675856, + "grad_norm": 0.11331702023744583, + "learning_rate": 7.888512744219939e-05, + "loss": 1.5356351852416992, + "step": 122510 + }, + { + "epoch": 0.3708625308371044, + "grad_norm": 0.1060602143406868, + "learning_rate": 7.888133226056199e-05, + "loss": 1.5758251190185546, + "step": 122520 + }, + { + "epoch": 0.37089280038745026, + "grad_norm": 0.10946846008300781, + "learning_rate": 7.88775370789246e-05, + "loss": 1.5340424537658692, + "step": 122530 + }, + { + "epoch": 0.37092306993779606, + "grad_norm": 0.09972263127565384, + "learning_rate": 7.88737418972872e-05, + "loss": 1.514272117614746, + "step": 122540 + }, + { + "epoch": 0.3709533394881419, + "grad_norm": 0.1109563410282135, + "learning_rate": 7.886994671564981e-05, + "loss": 1.5681854248046876, + "step": 122550 + }, + { + "epoch": 0.37098360903848776, + "grad_norm": 0.1003333032131195, + "learning_rate": 7.886615153401242e-05, + "loss": 1.5604994773864747, + "step": 122560 + }, + { + "epoch": 0.37101387858883356, + "grad_norm": 0.10965106636285782, + "learning_rate": 7.886235635237502e-05, + "loss": 1.5415636062622071, + "step": 122570 + }, + { + "epoch": 0.3710441481391794, + "grad_norm": 0.11284690350294113, + "learning_rate": 7.885856117073763e-05, + "loss": 1.5657370567321778, + "step": 122580 + }, + { + "epoch": 0.3710744176895252, + "grad_norm": 0.10455892980098724, + "learning_rate": 7.885476598910025e-05, + "loss": 1.5299186706542969, + "step": 122590 + }, + { + "epoch": 0.37110468723987106, + "grad_norm": 0.10808619856834412, + "learning_rate": 7.885097080746284e-05, + "loss": 1.5129524230957032, + "step": 122600 + }, + { + "epoch": 0.3711349567902169, + "grad_norm": 0.11250172555446625, + "learning_rate": 7.884717562582546e-05, + "loss": 1.5533564567565918, + "step": 122610 + }, + { + "epoch": 0.3711652263405627, + "grad_norm": 0.12647411227226257, + "learning_rate": 7.884338044418805e-05, + "loss": 1.561160182952881, + "step": 122620 + }, + { + "epoch": 0.37119549589090856, + "grad_norm": 0.10972438007593155, + "learning_rate": 7.883958526255067e-05, + "loss": 1.5680359840393066, + "step": 122630 + }, + { + "epoch": 0.37122576544125435, + "grad_norm": 0.1053023412823677, + "learning_rate": 7.883579008091328e-05, + "loss": 1.5473146438598633, + "step": 122640 + }, + { + "epoch": 0.3712560349916002, + "grad_norm": 0.0938107892870903, + "learning_rate": 7.883199489927588e-05, + "loss": 1.5431078910827636, + "step": 122650 + }, + { + "epoch": 0.37128630454194606, + "grad_norm": 0.10354308784008026, + "learning_rate": 7.882819971763849e-05, + "loss": 1.5397640228271485, + "step": 122660 + }, + { + "epoch": 0.37131657409229185, + "grad_norm": 0.10153347998857498, + "learning_rate": 7.88244045360011e-05, + "loss": 1.535489845275879, + "step": 122670 + }, + { + "epoch": 0.3713468436426377, + "grad_norm": 0.11456096172332764, + "learning_rate": 7.88206093543637e-05, + "loss": 1.525717830657959, + "step": 122680 + }, + { + "epoch": 0.3713771131929835, + "grad_norm": 0.11047118902206421, + "learning_rate": 7.881681417272631e-05, + "loss": 1.572191047668457, + "step": 122690 + }, + { + "epoch": 0.37140738274332935, + "grad_norm": 0.11531922966241837, + "learning_rate": 7.881301899108891e-05, + "loss": 1.5383878707885743, + "step": 122700 + }, + { + "epoch": 0.3714376522936752, + "grad_norm": 0.1337405890226364, + "learning_rate": 7.880922380945152e-05, + "loss": 1.5577932357788087, + "step": 122710 + }, + { + "epoch": 0.371467921844021, + "grad_norm": 0.10101579874753952, + "learning_rate": 7.880542862781414e-05, + "loss": 1.5521891593933106, + "step": 122720 + }, + { + "epoch": 0.37149819139436685, + "grad_norm": 0.12106704711914062, + "learning_rate": 7.880163344617673e-05, + "loss": 1.532008171081543, + "step": 122730 + }, + { + "epoch": 0.37152846094471265, + "grad_norm": 0.10082364827394485, + "learning_rate": 7.879783826453935e-05, + "loss": 1.5366467475891112, + "step": 122740 + }, + { + "epoch": 0.3715587304950585, + "grad_norm": 0.09915362298488617, + "learning_rate": 7.879404308290194e-05, + "loss": 1.5325197219848632, + "step": 122750 + }, + { + "epoch": 0.37158900004540435, + "grad_norm": 0.11066340655088425, + "learning_rate": 7.879024790126456e-05, + "loss": 1.5546692848205566, + "step": 122760 + }, + { + "epoch": 0.37161926959575015, + "grad_norm": 0.10562209784984589, + "learning_rate": 7.878645271962715e-05, + "loss": 1.5584044456481934, + "step": 122770 + }, + { + "epoch": 0.371649539146096, + "grad_norm": 0.1096421480178833, + "learning_rate": 7.878265753798977e-05, + "loss": 1.5320000648498535, + "step": 122780 + }, + { + "epoch": 0.3716798086964418, + "grad_norm": 0.11445793509483337, + "learning_rate": 7.877886235635237e-05, + "loss": 1.5530364990234375, + "step": 122790 + }, + { + "epoch": 0.37171007824678765, + "grad_norm": 0.11603336781263351, + "learning_rate": 7.877506717471499e-05, + "loss": 1.5371074676513672, + "step": 122800 + }, + { + "epoch": 0.3717403477971335, + "grad_norm": 0.1164972260594368, + "learning_rate": 7.877127199307758e-05, + "loss": 1.5505498886108398, + "step": 122810 + }, + { + "epoch": 0.3717706173474793, + "grad_norm": 0.10260080546140671, + "learning_rate": 7.87674768114402e-05, + "loss": 1.561577320098877, + "step": 122820 + }, + { + "epoch": 0.37180088689782514, + "grad_norm": 0.1325543224811554, + "learning_rate": 7.876368162980282e-05, + "loss": 1.5724560737609863, + "step": 122830 + }, + { + "epoch": 0.37183115644817094, + "grad_norm": 0.10824090987443924, + "learning_rate": 7.875988644816541e-05, + "loss": 1.569845962524414, + "step": 122840 + }, + { + "epoch": 0.3718614259985168, + "grad_norm": 0.10874127596616745, + "learning_rate": 7.875609126652803e-05, + "loss": 1.5462231636047363, + "step": 122850 + }, + { + "epoch": 0.37189169554886264, + "grad_norm": 0.11215042322874069, + "learning_rate": 7.875229608489062e-05, + "loss": 1.5421218872070312, + "step": 122860 + }, + { + "epoch": 0.37192196509920844, + "grad_norm": 0.12162323296070099, + "learning_rate": 7.874850090325324e-05, + "loss": 1.565872573852539, + "step": 122870 + }, + { + "epoch": 0.3719522346495543, + "grad_norm": 0.09854784607887268, + "learning_rate": 7.874470572161583e-05, + "loss": 1.5631929397583009, + "step": 122880 + }, + { + "epoch": 0.3719825041999001, + "grad_norm": 0.10759894549846649, + "learning_rate": 7.874091053997845e-05, + "loss": 1.5385337829589845, + "step": 122890 + }, + { + "epoch": 0.37201277375024594, + "grad_norm": 0.10215164721012115, + "learning_rate": 7.873711535834105e-05, + "loss": 1.5533771514892578, + "step": 122900 + }, + { + "epoch": 0.3720430433005918, + "grad_norm": 0.10437124967575073, + "learning_rate": 7.873332017670367e-05, + "loss": 1.5431921005249023, + "step": 122910 + }, + { + "epoch": 0.3720733128509376, + "grad_norm": 0.10395858436822891, + "learning_rate": 7.872952499506626e-05, + "loss": 1.5617002487182616, + "step": 122920 + }, + { + "epoch": 0.37210358240128344, + "grad_norm": 0.11367291212081909, + "learning_rate": 7.872572981342888e-05, + "loss": 1.5356195449829102, + "step": 122930 + }, + { + "epoch": 0.37213385195162924, + "grad_norm": 0.10823238641023636, + "learning_rate": 7.872193463179147e-05, + "loss": 1.5658823013305665, + "step": 122940 + }, + { + "epoch": 0.3721641215019751, + "grad_norm": 0.1288941353559494, + "learning_rate": 7.871813945015409e-05, + "loss": 1.5314104080200195, + "step": 122950 + }, + { + "epoch": 0.37219439105232094, + "grad_norm": 0.10202673077583313, + "learning_rate": 7.87143442685167e-05, + "loss": 1.5672290802001954, + "step": 122960 + }, + { + "epoch": 0.37222466060266673, + "grad_norm": 0.10573437809944153, + "learning_rate": 7.87105490868793e-05, + "loss": 1.529643726348877, + "step": 122970 + }, + { + "epoch": 0.3722549301530126, + "grad_norm": 0.09908882528543472, + "learning_rate": 7.87067539052419e-05, + "loss": 1.5371820449829101, + "step": 122980 + }, + { + "epoch": 0.3722851997033584, + "grad_norm": 0.12895114719867706, + "learning_rate": 7.870295872360451e-05, + "loss": 1.58398380279541, + "step": 122990 + }, + { + "epoch": 0.37231546925370423, + "grad_norm": 0.10864640027284622, + "learning_rate": 7.869916354196712e-05, + "loss": 1.5600201606750488, + "step": 123000 + }, + { + "epoch": 0.37231546925370423, + "eval_loss": 1.5646554231643677, + "eval_runtime": 27.8489, + "eval_samples_per_second": 17.954, + "eval_steps_per_second": 1.149, + "step": 123000 + }, + { + "epoch": 0.3723457388040501, + "grad_norm": 0.10826973617076874, + "learning_rate": 7.869536836032972e-05, + "loss": 1.5693367004394532, + "step": 123010 + }, + { + "epoch": 0.3723760083543959, + "grad_norm": 0.12650960683822632, + "learning_rate": 7.869157317869233e-05, + "loss": 1.5627859115600586, + "step": 123020 + }, + { + "epoch": 0.37240627790474173, + "grad_norm": 0.11235778778791428, + "learning_rate": 7.868777799705494e-05, + "loss": 1.5460891723632812, + "step": 123030 + }, + { + "epoch": 0.37243654745508753, + "grad_norm": 0.10978718847036362, + "learning_rate": 7.868398281541756e-05, + "loss": 1.5377115249633788, + "step": 123040 + }, + { + "epoch": 0.3724668170054334, + "grad_norm": 0.11883910000324249, + "learning_rate": 7.868018763378015e-05, + "loss": 1.5973701477050781, + "step": 123050 + }, + { + "epoch": 0.37249708655577923, + "grad_norm": 0.11148910224437714, + "learning_rate": 7.867639245214277e-05, + "loss": 1.5180067062377929, + "step": 123060 + }, + { + "epoch": 0.37252735610612503, + "grad_norm": 0.10214729607105255, + "learning_rate": 7.867259727050536e-05, + "loss": 1.5497915267944335, + "step": 123070 + }, + { + "epoch": 0.3725576256564709, + "grad_norm": 0.11432183533906937, + "learning_rate": 7.866880208886798e-05, + "loss": 1.5442977905273438, + "step": 123080 + }, + { + "epoch": 0.3725878952068167, + "grad_norm": 0.12232266366481781, + "learning_rate": 7.866500690723059e-05, + "loss": 1.5278331756591796, + "step": 123090 + }, + { + "epoch": 0.37261816475716253, + "grad_norm": 0.10335355252027512, + "learning_rate": 7.866121172559319e-05, + "loss": 1.570138931274414, + "step": 123100 + }, + { + "epoch": 0.3726484343075084, + "grad_norm": 0.11750483512878418, + "learning_rate": 7.86574165439558e-05, + "loss": 1.5038269996643066, + "step": 123110 + }, + { + "epoch": 0.3726787038578542, + "grad_norm": 0.11712535470724106, + "learning_rate": 7.86536213623184e-05, + "loss": 1.5222230911254884, + "step": 123120 + }, + { + "epoch": 0.37270897340820003, + "grad_norm": 0.1332809180021286, + "learning_rate": 7.864982618068101e-05, + "loss": 1.5337800979614258, + "step": 123130 + }, + { + "epoch": 0.3727392429585458, + "grad_norm": 0.10893937945365906, + "learning_rate": 7.864603099904362e-05, + "loss": 1.5331941604614259, + "step": 123140 + }, + { + "epoch": 0.3727695125088917, + "grad_norm": 0.1094285398721695, + "learning_rate": 7.864223581740622e-05, + "loss": 1.5529167175292968, + "step": 123150 + }, + { + "epoch": 0.37279978205923753, + "grad_norm": 0.10788581520318985, + "learning_rate": 7.863844063576883e-05, + "loss": 1.541152000427246, + "step": 123160 + }, + { + "epoch": 0.3728300516095833, + "grad_norm": 0.11359256505966187, + "learning_rate": 7.863464545413143e-05, + "loss": 1.5707727432250977, + "step": 123170 + }, + { + "epoch": 0.3728603211599292, + "grad_norm": 0.10886427015066147, + "learning_rate": 7.863085027249404e-05, + "loss": 1.5382240295410157, + "step": 123180 + }, + { + "epoch": 0.37289059071027497, + "grad_norm": 0.11577894538640976, + "learning_rate": 7.862705509085665e-05, + "loss": 1.5312394142150878, + "step": 123190 + }, + { + "epoch": 0.3729208602606208, + "grad_norm": 0.10717403143644333, + "learning_rate": 7.862325990921926e-05, + "loss": 1.550644874572754, + "step": 123200 + }, + { + "epoch": 0.3729511298109667, + "grad_norm": 0.0980646014213562, + "learning_rate": 7.861946472758186e-05, + "loss": 1.571796989440918, + "step": 123210 + }, + { + "epoch": 0.37298139936131247, + "grad_norm": 0.12686997652053833, + "learning_rate": 7.861566954594448e-05, + "loss": 1.5289222717285156, + "step": 123220 + }, + { + "epoch": 0.3730116689116583, + "grad_norm": 0.10440755635499954, + "learning_rate": 7.861187436430707e-05, + "loss": 1.5291994094848633, + "step": 123230 + }, + { + "epoch": 0.3730419384620042, + "grad_norm": 0.1140744686126709, + "learning_rate": 7.860807918266969e-05, + "loss": 1.5186685562133788, + "step": 123240 + }, + { + "epoch": 0.37307220801234997, + "grad_norm": 0.10755728930234909, + "learning_rate": 7.86042840010323e-05, + "loss": 1.5347867965698243, + "step": 123250 + }, + { + "epoch": 0.3731024775626958, + "grad_norm": 0.11477361619472504, + "learning_rate": 7.86004888193949e-05, + "loss": 1.532104778289795, + "step": 123260 + }, + { + "epoch": 0.3731327471130416, + "grad_norm": 0.11177795380353928, + "learning_rate": 7.85966936377575e-05, + "loss": 1.5298343658447267, + "step": 123270 + }, + { + "epoch": 0.37316301666338747, + "grad_norm": 0.1149945929646492, + "learning_rate": 7.859289845612011e-05, + "loss": 1.5428872108459473, + "step": 123280 + }, + { + "epoch": 0.3731932862137333, + "grad_norm": 0.10515456646680832, + "learning_rate": 7.858910327448272e-05, + "loss": 1.5773712158203126, + "step": 123290 + }, + { + "epoch": 0.3732235557640791, + "grad_norm": 0.11182114481925964, + "learning_rate": 7.858530809284532e-05, + "loss": 1.4955804824829102, + "step": 123300 + }, + { + "epoch": 0.37325382531442497, + "grad_norm": 0.12219427525997162, + "learning_rate": 7.858151291120793e-05, + "loss": 1.5043851852416992, + "step": 123310 + }, + { + "epoch": 0.37328409486477077, + "grad_norm": 0.10417310893535614, + "learning_rate": 7.857771772957054e-05, + "loss": 1.5564970970153809, + "step": 123320 + }, + { + "epoch": 0.3733143644151166, + "grad_norm": 0.11810820549726486, + "learning_rate": 7.857392254793316e-05, + "loss": 1.5506546020507812, + "step": 123330 + }, + { + "epoch": 0.37334463396546247, + "grad_norm": 0.11912750452756882, + "learning_rate": 7.857012736629575e-05, + "loss": 1.5365083694458008, + "step": 123340 + }, + { + "epoch": 0.37337490351580827, + "grad_norm": 0.11438721418380737, + "learning_rate": 7.856633218465837e-05, + "loss": 1.5429906845092773, + "step": 123350 + }, + { + "epoch": 0.3734051730661541, + "grad_norm": 0.11403699964284897, + "learning_rate": 7.856253700302096e-05, + "loss": 1.503271484375, + "step": 123360 + }, + { + "epoch": 0.3734354426164999, + "grad_norm": 0.1227453276515007, + "learning_rate": 7.855874182138358e-05, + "loss": 1.5246219635009766, + "step": 123370 + }, + { + "epoch": 0.37346571216684576, + "grad_norm": 0.09697453677654266, + "learning_rate": 7.855494663974617e-05, + "loss": 1.5310007095336915, + "step": 123380 + }, + { + "epoch": 0.3734959817171916, + "grad_norm": 0.09810136258602142, + "learning_rate": 7.855115145810879e-05, + "loss": 1.5188600540161132, + "step": 123390 + }, + { + "epoch": 0.3735262512675374, + "grad_norm": 0.11460928618907928, + "learning_rate": 7.854735627647138e-05, + "loss": 1.5469343185424804, + "step": 123400 + }, + { + "epoch": 0.37355652081788326, + "grad_norm": 0.11144246906042099, + "learning_rate": 7.8543561094834e-05, + "loss": 1.5850408554077149, + "step": 123410 + }, + { + "epoch": 0.37358679036822906, + "grad_norm": 0.1182708591222763, + "learning_rate": 7.85397659131966e-05, + "loss": 1.572454071044922, + "step": 123420 + }, + { + "epoch": 0.3736170599185749, + "grad_norm": 0.10123807936906815, + "learning_rate": 7.853597073155922e-05, + "loss": 1.5371448516845703, + "step": 123430 + }, + { + "epoch": 0.37364732946892076, + "grad_norm": 0.09471231698989868, + "learning_rate": 7.853217554992182e-05, + "loss": 1.5519033432006837, + "step": 123440 + }, + { + "epoch": 0.37367759901926656, + "grad_norm": 0.11954084783792496, + "learning_rate": 7.852838036828443e-05, + "loss": 1.5548792839050294, + "step": 123450 + }, + { + "epoch": 0.3737078685696124, + "grad_norm": 0.1125955656170845, + "learning_rate": 7.852458518664705e-05, + "loss": 1.5758683204650878, + "step": 123460 + }, + { + "epoch": 0.3737381381199582, + "grad_norm": 0.10409846901893616, + "learning_rate": 7.852079000500964e-05, + "loss": 1.5797367095947266, + "step": 123470 + }, + { + "epoch": 0.37376840767030406, + "grad_norm": 0.11275631189346313, + "learning_rate": 7.851699482337226e-05, + "loss": 1.5358877182006836, + "step": 123480 + }, + { + "epoch": 0.3737986772206499, + "grad_norm": 0.10386069118976593, + "learning_rate": 7.851319964173485e-05, + "loss": 1.5426721572875977, + "step": 123490 + }, + { + "epoch": 0.3738289467709957, + "grad_norm": 0.1120787262916565, + "learning_rate": 7.850940446009747e-05, + "loss": 1.5575876235961914, + "step": 123500 + }, + { + "epoch": 0.3738289467709957, + "eval_loss": 1.556755781173706, + "eval_runtime": 27.8836, + "eval_samples_per_second": 17.932, + "eval_steps_per_second": 1.148, + "step": 123500 + }, + { + "epoch": 0.37385921632134156, + "grad_norm": 0.12140476703643799, + "learning_rate": 7.850560927846006e-05, + "loss": 1.5399725914001465, + "step": 123510 + }, + { + "epoch": 0.37388948587168735, + "grad_norm": 0.12762309610843658, + "learning_rate": 7.850181409682268e-05, + "loss": 1.5578112602233887, + "step": 123520 + }, + { + "epoch": 0.3739197554220332, + "grad_norm": 0.11494474858045578, + "learning_rate": 7.849801891518527e-05, + "loss": 1.550445556640625, + "step": 123530 + }, + { + "epoch": 0.37395002497237906, + "grad_norm": 0.10591481626033783, + "learning_rate": 7.84942237335479e-05, + "loss": 1.5602755546569824, + "step": 123540 + }, + { + "epoch": 0.37398029452272485, + "grad_norm": 0.11555200070142746, + "learning_rate": 7.849042855191049e-05, + "loss": 1.5343549728393555, + "step": 123550 + }, + { + "epoch": 0.3740105640730707, + "grad_norm": 0.117301344871521, + "learning_rate": 7.84866333702731e-05, + "loss": 1.5501106262207032, + "step": 123560 + }, + { + "epoch": 0.3740408336234165, + "grad_norm": 0.1044548973441124, + "learning_rate": 7.84828381886357e-05, + "loss": 1.5741615295410156, + "step": 123570 + }, + { + "epoch": 0.37407110317376235, + "grad_norm": 0.10530611872673035, + "learning_rate": 7.847904300699832e-05, + "loss": 1.5617206573486329, + "step": 123580 + }, + { + "epoch": 0.3741013727241082, + "grad_norm": 0.11129415780305862, + "learning_rate": 7.847524782536092e-05, + "loss": 1.5359984397888184, + "step": 123590 + }, + { + "epoch": 0.374131642274454, + "grad_norm": 0.10411923378705978, + "learning_rate": 7.847145264372353e-05, + "loss": 1.5665621757507324, + "step": 123600 + }, + { + "epoch": 0.37416191182479985, + "grad_norm": 0.11101610958576202, + "learning_rate": 7.846765746208614e-05, + "loss": 1.5653287887573242, + "step": 123610 + }, + { + "epoch": 0.37419218137514565, + "grad_norm": 0.11377190798521042, + "learning_rate": 7.846386228044874e-05, + "loss": 1.5322527885437012, + "step": 123620 + }, + { + "epoch": 0.3742224509254915, + "grad_norm": 0.11239922046661377, + "learning_rate": 7.846006709881135e-05, + "loss": 1.5705942153930663, + "step": 123630 + }, + { + "epoch": 0.37425272047583735, + "grad_norm": 0.1089606061577797, + "learning_rate": 7.845627191717395e-05, + "loss": 1.557175350189209, + "step": 123640 + }, + { + "epoch": 0.37428299002618315, + "grad_norm": 0.10722646117210388, + "learning_rate": 7.845247673553657e-05, + "loss": 1.5454980850219726, + "step": 123650 + }, + { + "epoch": 0.374313259576529, + "grad_norm": 0.12721727788448334, + "learning_rate": 7.844868155389917e-05, + "loss": 1.5158191680908204, + "step": 123660 + }, + { + "epoch": 0.3743435291268748, + "grad_norm": 0.12428539991378784, + "learning_rate": 7.844488637226179e-05, + "loss": 1.5434328079223634, + "step": 123670 + }, + { + "epoch": 0.37437379867722065, + "grad_norm": 0.11999691277742386, + "learning_rate": 7.844109119062438e-05, + "loss": 1.533533477783203, + "step": 123680 + }, + { + "epoch": 0.3744040682275665, + "grad_norm": 0.11484848707914352, + "learning_rate": 7.8437296008987e-05, + "loss": 1.586699867248535, + "step": 123690 + }, + { + "epoch": 0.3744343377779123, + "grad_norm": 0.11217370629310608, + "learning_rate": 7.84335008273496e-05, + "loss": 1.5665618896484375, + "step": 123700 + }, + { + "epoch": 0.37446460732825815, + "grad_norm": 0.1133379116654396, + "learning_rate": 7.842970564571221e-05, + "loss": 1.5520843505859374, + "step": 123710 + }, + { + "epoch": 0.37449487687860394, + "grad_norm": 0.0971280112862587, + "learning_rate": 7.842591046407481e-05, + "loss": 1.5455121994018555, + "step": 123720 + }, + { + "epoch": 0.3745251464289498, + "grad_norm": 0.10068482905626297, + "learning_rate": 7.842211528243742e-05, + "loss": 1.537571907043457, + "step": 123730 + }, + { + "epoch": 0.37455541597929565, + "grad_norm": 0.10235607624053955, + "learning_rate": 7.841832010080003e-05, + "loss": 1.5437291145324707, + "step": 123740 + }, + { + "epoch": 0.37458568552964144, + "grad_norm": 0.10745368152856827, + "learning_rate": 7.841452491916263e-05, + "loss": 1.5591668128967284, + "step": 123750 + }, + { + "epoch": 0.3746159550799873, + "grad_norm": 0.13446369767189026, + "learning_rate": 7.841072973752524e-05, + "loss": 1.5475915908813476, + "step": 123760 + }, + { + "epoch": 0.3746462246303331, + "grad_norm": 0.10779766738414764, + "learning_rate": 7.840693455588784e-05, + "loss": 1.5455661773681642, + "step": 123770 + }, + { + "epoch": 0.37467649418067894, + "grad_norm": 0.10464919358491898, + "learning_rate": 7.840313937425045e-05, + "loss": 1.5493980407714845, + "step": 123780 + }, + { + "epoch": 0.3747067637310248, + "grad_norm": 0.12320031225681305, + "learning_rate": 7.839934419261306e-05, + "loss": 1.5375271797180177, + "step": 123790 + }, + { + "epoch": 0.3747370332813706, + "grad_norm": 0.09626539051532745, + "learning_rate": 7.839554901097566e-05, + "loss": 1.545501708984375, + "step": 123800 + }, + { + "epoch": 0.37476730283171644, + "grad_norm": 0.10225016623735428, + "learning_rate": 7.839175382933827e-05, + "loss": 1.5658184051513673, + "step": 123810 + }, + { + "epoch": 0.37479757238206224, + "grad_norm": 0.11442342400550842, + "learning_rate": 7.838795864770087e-05, + "loss": 1.5349555969238282, + "step": 123820 + }, + { + "epoch": 0.3748278419324081, + "grad_norm": 0.11511227488517761, + "learning_rate": 7.83841634660635e-05, + "loss": 1.5459070205688477, + "step": 123830 + }, + { + "epoch": 0.37485811148275394, + "grad_norm": 0.12076841294765472, + "learning_rate": 7.838036828442609e-05, + "loss": 1.5302238464355469, + "step": 123840 + }, + { + "epoch": 0.37488838103309974, + "grad_norm": 0.12279901653528214, + "learning_rate": 7.83765731027887e-05, + "loss": 1.5515507698059081, + "step": 123850 + }, + { + "epoch": 0.3749186505834456, + "grad_norm": 0.11920282244682312, + "learning_rate": 7.837277792115131e-05, + "loss": 1.5170434951782226, + "step": 123860 + }, + { + "epoch": 0.3749489201337914, + "grad_norm": 0.10639677196741104, + "learning_rate": 7.836898273951392e-05, + "loss": 1.5537397384643554, + "step": 123870 + }, + { + "epoch": 0.37497918968413724, + "grad_norm": 0.10681329667568207, + "learning_rate": 7.836518755787652e-05, + "loss": 1.543430995941162, + "step": 123880 + }, + { + "epoch": 0.3750094592344831, + "grad_norm": 0.11657264083623886, + "learning_rate": 7.836139237623913e-05, + "loss": 1.5256211280822753, + "step": 123890 + }, + { + "epoch": 0.3750397287848289, + "grad_norm": 0.101152703166008, + "learning_rate": 7.835759719460174e-05, + "loss": 1.5511279106140137, + "step": 123900 + }, + { + "epoch": 0.37506999833517474, + "grad_norm": 0.12227314710617065, + "learning_rate": 7.835380201296434e-05, + "loss": 1.5838029861450196, + "step": 123910 + }, + { + "epoch": 0.37510026788552053, + "grad_norm": 0.11185334622859955, + "learning_rate": 7.835000683132695e-05, + "loss": 1.532557487487793, + "step": 123920 + }, + { + "epoch": 0.3751305374358664, + "grad_norm": 0.11232585459947586, + "learning_rate": 7.834621164968955e-05, + "loss": 1.5571337699890138, + "step": 123930 + }, + { + "epoch": 0.37516080698621224, + "grad_norm": 0.10108406841754913, + "learning_rate": 7.834241646805217e-05, + "loss": 1.5408939361572265, + "step": 123940 + }, + { + "epoch": 0.37519107653655803, + "grad_norm": 0.1051301434636116, + "learning_rate": 7.833862128641477e-05, + "loss": 1.5624615669250488, + "step": 123950 + }, + { + "epoch": 0.3752213460869039, + "grad_norm": 0.10420669615268707, + "learning_rate": 7.833482610477738e-05, + "loss": 1.5542362213134766, + "step": 123960 + }, + { + "epoch": 0.37525161563724974, + "grad_norm": 0.10503990948200226, + "learning_rate": 7.833103092313998e-05, + "loss": 1.5448945999145507, + "step": 123970 + }, + { + "epoch": 0.37528188518759553, + "grad_norm": 0.10291137546300888, + "learning_rate": 7.83272357415026e-05, + "loss": 1.5528759002685546, + "step": 123980 + }, + { + "epoch": 0.3753121547379414, + "grad_norm": 0.1076933890581131, + "learning_rate": 7.832344055986519e-05, + "loss": 1.5441478729248046, + "step": 123990 + }, + { + "epoch": 0.3753424242882872, + "grad_norm": 0.10333768278360367, + "learning_rate": 7.831964537822781e-05, + "loss": 1.528702163696289, + "step": 124000 + }, + { + "epoch": 0.3753424242882872, + "eval_loss": 1.5553114414215088, + "eval_runtime": 28.334, + "eval_samples_per_second": 17.647, + "eval_steps_per_second": 1.129, + "step": 124000 + }, + { + "epoch": 0.37537269383863303, + "grad_norm": 0.11849364638328552, + "learning_rate": 7.83158501965904e-05, + "loss": 1.5391289710998535, + "step": 124010 + }, + { + "epoch": 0.3754029633889789, + "grad_norm": 0.10512015968561172, + "learning_rate": 7.831205501495302e-05, + "loss": 1.5811758995056153, + "step": 124020 + }, + { + "epoch": 0.3754332329393247, + "grad_norm": 0.11173853278160095, + "learning_rate": 7.830825983331561e-05, + "loss": 1.5535810470581055, + "step": 124030 + }, + { + "epoch": 0.37546350248967053, + "grad_norm": 0.10671116411685944, + "learning_rate": 7.830446465167823e-05, + "loss": 1.5410212516784667, + "step": 124040 + }, + { + "epoch": 0.3754937720400163, + "grad_norm": 0.12143916636705399, + "learning_rate": 7.830066947004084e-05, + "loss": 1.5750380516052247, + "step": 124050 + }, + { + "epoch": 0.3755240415903622, + "grad_norm": 0.11480878293514252, + "learning_rate": 7.829687428840344e-05, + "loss": 1.5451374053955078, + "step": 124060 + }, + { + "epoch": 0.37555431114070803, + "grad_norm": 0.09744883328676224, + "learning_rate": 7.829307910676606e-05, + "loss": 1.5817306518554688, + "step": 124070 + }, + { + "epoch": 0.3755845806910538, + "grad_norm": 0.11143193393945694, + "learning_rate": 7.828928392512866e-05, + "loss": 1.5298145294189454, + "step": 124080 + }, + { + "epoch": 0.3756148502413997, + "grad_norm": 0.11042953282594681, + "learning_rate": 7.828548874349128e-05, + "loss": 1.553391170501709, + "step": 124090 + }, + { + "epoch": 0.3756451197917455, + "grad_norm": 0.11371395736932755, + "learning_rate": 7.828169356185387e-05, + "loss": 1.5344701766967774, + "step": 124100 + }, + { + "epoch": 0.3756753893420913, + "grad_norm": 0.1006670668721199, + "learning_rate": 7.827789838021649e-05, + "loss": 1.593736457824707, + "step": 124110 + }, + { + "epoch": 0.3757056588924372, + "grad_norm": 0.11610671132802963, + "learning_rate": 7.827410319857908e-05, + "loss": 1.5153194427490235, + "step": 124120 + }, + { + "epoch": 0.375735928442783, + "grad_norm": 0.13031314313411713, + "learning_rate": 7.82703080169417e-05, + "loss": 1.5075580596923828, + "step": 124130 + }, + { + "epoch": 0.3757661979931288, + "grad_norm": 0.12364786863327026, + "learning_rate": 7.826651283530429e-05, + "loss": 1.5406024932861329, + "step": 124140 + }, + { + "epoch": 0.3757964675434746, + "grad_norm": 0.1141030490398407, + "learning_rate": 7.826271765366691e-05, + "loss": 1.510844898223877, + "step": 124150 + }, + { + "epoch": 0.3758267370938205, + "grad_norm": 0.10967309027910233, + "learning_rate": 7.82589224720295e-05, + "loss": 1.5137542724609374, + "step": 124160 + }, + { + "epoch": 0.3758570066441663, + "grad_norm": 0.10233749449253082, + "learning_rate": 7.825512729039212e-05, + "loss": 1.5336215019226074, + "step": 124170 + }, + { + "epoch": 0.3758872761945121, + "grad_norm": 0.12463299185037613, + "learning_rate": 7.825133210875472e-05, + "loss": 1.5675820350646972, + "step": 124180 + }, + { + "epoch": 0.375917545744858, + "grad_norm": 0.10740918666124344, + "learning_rate": 7.824753692711734e-05, + "loss": 1.5310824394226075, + "step": 124190 + }, + { + "epoch": 0.37594781529520377, + "grad_norm": 0.09786801040172577, + "learning_rate": 7.824374174547994e-05, + "loss": 1.552826499938965, + "step": 124200 + }, + { + "epoch": 0.3759780848455496, + "grad_norm": 0.11690684407949448, + "learning_rate": 7.823994656384255e-05, + "loss": 1.5487610816955566, + "step": 124210 + }, + { + "epoch": 0.37600835439589547, + "grad_norm": 0.11129148304462433, + "learning_rate": 7.823615138220515e-05, + "loss": 1.5204258918762208, + "step": 124220 + }, + { + "epoch": 0.37603862394624127, + "grad_norm": 0.11126837134361267, + "learning_rate": 7.823235620056776e-05, + "loss": 1.500037956237793, + "step": 124230 + }, + { + "epoch": 0.3760688934965871, + "grad_norm": 0.09936875849962234, + "learning_rate": 7.822856101893036e-05, + "loss": 1.531236743927002, + "step": 124240 + }, + { + "epoch": 0.3760991630469329, + "grad_norm": 0.10821028053760529, + "learning_rate": 7.822476583729297e-05, + "loss": 1.5122676849365235, + "step": 124250 + }, + { + "epoch": 0.37612943259727877, + "grad_norm": 0.1158105880022049, + "learning_rate": 7.822097065565559e-05, + "loss": 1.5646239280700684, + "step": 124260 + }, + { + "epoch": 0.3761597021476246, + "grad_norm": 0.10655972361564636, + "learning_rate": 7.821717547401818e-05, + "loss": 1.51913423538208, + "step": 124270 + }, + { + "epoch": 0.3761899716979704, + "grad_norm": 0.0967874825000763, + "learning_rate": 7.82133802923808e-05, + "loss": 1.529617691040039, + "step": 124280 + }, + { + "epoch": 0.37622024124831627, + "grad_norm": 0.11575155705213547, + "learning_rate": 7.82095851107434e-05, + "loss": 1.5578388214111327, + "step": 124290 + }, + { + "epoch": 0.37625051079866206, + "grad_norm": 0.11542347073554993, + "learning_rate": 7.820578992910601e-05, + "loss": 1.5289918899536132, + "step": 124300 + }, + { + "epoch": 0.3762807803490079, + "grad_norm": 0.10616964101791382, + "learning_rate": 7.820199474746862e-05, + "loss": 1.5360715866088868, + "step": 124310 + }, + { + "epoch": 0.37631104989935377, + "grad_norm": 0.11154253780841827, + "learning_rate": 7.819819956583123e-05, + "loss": 1.5802847862243652, + "step": 124320 + }, + { + "epoch": 0.37634131944969956, + "grad_norm": 0.10336585342884064, + "learning_rate": 7.819440438419383e-05, + "loss": 1.565964412689209, + "step": 124330 + }, + { + "epoch": 0.3763715890000454, + "grad_norm": 0.11914172023534775, + "learning_rate": 7.819060920255644e-05, + "loss": 1.5055810928344726, + "step": 124340 + }, + { + "epoch": 0.3764018585503912, + "grad_norm": 0.10449978709220886, + "learning_rate": 7.818681402091904e-05, + "loss": 1.5565418243408202, + "step": 124350 + }, + { + "epoch": 0.37643212810073706, + "grad_norm": 0.11761687695980072, + "learning_rate": 7.818301883928165e-05, + "loss": 1.5398515701293944, + "step": 124360 + }, + { + "epoch": 0.3764623976510829, + "grad_norm": 0.10472166538238525, + "learning_rate": 7.817922365764426e-05, + "loss": 1.4851364135742187, + "step": 124370 + }, + { + "epoch": 0.3764926672014287, + "grad_norm": 0.10967394709587097, + "learning_rate": 7.817542847600686e-05, + "loss": 1.5558411598205566, + "step": 124380 + }, + { + "epoch": 0.37652293675177456, + "grad_norm": 0.1080736294388771, + "learning_rate": 7.817163329436947e-05, + "loss": 1.540559959411621, + "step": 124390 + }, + { + "epoch": 0.37655320630212036, + "grad_norm": 0.10807117074728012, + "learning_rate": 7.816783811273207e-05, + "loss": 1.553148365020752, + "step": 124400 + }, + { + "epoch": 0.3765834758524662, + "grad_norm": 0.10990668088197708, + "learning_rate": 7.816404293109468e-05, + "loss": 1.5106281280517577, + "step": 124410 + }, + { + "epoch": 0.37661374540281206, + "grad_norm": 0.10938109457492828, + "learning_rate": 7.816024774945729e-05, + "loss": 1.5053293228149414, + "step": 124420 + }, + { + "epoch": 0.37664401495315786, + "grad_norm": 0.11533699184656143, + "learning_rate": 7.815645256781989e-05, + "loss": 1.5515031814575195, + "step": 124430 + }, + { + "epoch": 0.3766742845035037, + "grad_norm": 0.10806475579738617, + "learning_rate": 7.815265738618251e-05, + "loss": 1.56032075881958, + "step": 124440 + }, + { + "epoch": 0.3767045540538495, + "grad_norm": 0.12926408648490906, + "learning_rate": 7.814886220454512e-05, + "loss": 1.5768905639648438, + "step": 124450 + }, + { + "epoch": 0.37673482360419536, + "grad_norm": 0.10194261372089386, + "learning_rate": 7.814506702290772e-05, + "loss": 1.5251350402832031, + "step": 124460 + }, + { + "epoch": 0.3767650931545412, + "grad_norm": 0.10907705873250961, + "learning_rate": 7.814127184127033e-05, + "loss": 1.5223281860351563, + "step": 124470 + }, + { + "epoch": 0.376795362704887, + "grad_norm": 0.10098656266927719, + "learning_rate": 7.813747665963293e-05, + "loss": 1.5467851638793946, + "step": 124480 + }, + { + "epoch": 0.37682563225523286, + "grad_norm": 0.11448768526315689, + "learning_rate": 7.813368147799554e-05, + "loss": 1.570875358581543, + "step": 124490 + }, + { + "epoch": 0.37685590180557865, + "grad_norm": 0.1143069788813591, + "learning_rate": 7.812988629635815e-05, + "loss": 1.552708911895752, + "step": 124500 + }, + { + "epoch": 0.37685590180557865, + "eval_loss": 1.547326683998108, + "eval_runtime": 28.5154, + "eval_samples_per_second": 17.534, + "eval_steps_per_second": 1.122, + "step": 124500 + }, + { + "epoch": 0.3768861713559245, + "grad_norm": 0.11194559931755066, + "learning_rate": 7.812609111472075e-05, + "loss": 1.5518945693969726, + "step": 124510 + }, + { + "epoch": 0.37691644090627036, + "grad_norm": 0.10997824370861053, + "learning_rate": 7.812229593308336e-05, + "loss": 1.5180925369262694, + "step": 124520 + }, + { + "epoch": 0.37694671045661615, + "grad_norm": 0.1303548812866211, + "learning_rate": 7.811850075144596e-05, + "loss": 1.5698184967041016, + "step": 124530 + }, + { + "epoch": 0.376976980006962, + "grad_norm": 0.11665280908346176, + "learning_rate": 7.811470556980857e-05, + "loss": 1.492668914794922, + "step": 124540 + }, + { + "epoch": 0.3770072495573078, + "grad_norm": 0.11669379472732544, + "learning_rate": 7.811091038817118e-05, + "loss": 1.5364575386047363, + "step": 124550 + }, + { + "epoch": 0.37703751910765365, + "grad_norm": 0.10748206824064255, + "learning_rate": 7.810711520653378e-05, + "loss": 1.548086166381836, + "step": 124560 + }, + { + "epoch": 0.3770677886579995, + "grad_norm": 0.10381444543600082, + "learning_rate": 7.81033200248964e-05, + "loss": 1.5382197380065918, + "step": 124570 + }, + { + "epoch": 0.3770980582083453, + "grad_norm": 0.0997549295425415, + "learning_rate": 7.8099524843259e-05, + "loss": 1.510352897644043, + "step": 124580 + }, + { + "epoch": 0.37712832775869115, + "grad_norm": 0.10391174256801605, + "learning_rate": 7.809572966162161e-05, + "loss": 1.5695722579956055, + "step": 124590 + }, + { + "epoch": 0.37715859730903695, + "grad_norm": 0.11247923970222473, + "learning_rate": 7.80919344799842e-05, + "loss": 1.5310997009277343, + "step": 124600 + }, + { + "epoch": 0.3771888668593828, + "grad_norm": 0.10180564224720001, + "learning_rate": 7.808813929834683e-05, + "loss": 1.5263407707214356, + "step": 124610 + }, + { + "epoch": 0.37721913640972865, + "grad_norm": 0.11927827447652817, + "learning_rate": 7.808434411670942e-05, + "loss": 1.5308331489562987, + "step": 124620 + }, + { + "epoch": 0.37724940596007445, + "grad_norm": 0.12423498928546906, + "learning_rate": 7.808054893507204e-05, + "loss": 1.5112530708312988, + "step": 124630 + }, + { + "epoch": 0.3772796755104203, + "grad_norm": 0.10743376612663269, + "learning_rate": 7.807675375343463e-05, + "loss": 1.5468029975891113, + "step": 124640 + }, + { + "epoch": 0.37730994506076615, + "grad_norm": 0.09128275513648987, + "learning_rate": 7.807295857179725e-05, + "loss": 1.5476810455322265, + "step": 124650 + }, + { + "epoch": 0.37734021461111195, + "grad_norm": 0.10955742001533508, + "learning_rate": 7.806916339015986e-05, + "loss": 1.5440237998962403, + "step": 124660 + }, + { + "epoch": 0.3773704841614578, + "grad_norm": 0.10436690598726273, + "learning_rate": 7.806536820852246e-05, + "loss": 1.53558406829834, + "step": 124670 + }, + { + "epoch": 0.3774007537118036, + "grad_norm": 0.11703242361545563, + "learning_rate": 7.806157302688508e-05, + "loss": 1.5504987716674805, + "step": 124680 + }, + { + "epoch": 0.37743102326214945, + "grad_norm": 0.11628508567810059, + "learning_rate": 7.805777784524767e-05, + "loss": 1.5444955825805664, + "step": 124690 + }, + { + "epoch": 0.3774612928124953, + "grad_norm": 0.11117485165596008, + "learning_rate": 7.805398266361029e-05, + "loss": 1.5311675071716309, + "step": 124700 + }, + { + "epoch": 0.3774915623628411, + "grad_norm": 0.11484196037054062, + "learning_rate": 7.805018748197289e-05, + "loss": 1.5405492782592773, + "step": 124710 + }, + { + "epoch": 0.37752183191318694, + "grad_norm": 0.11601205170154572, + "learning_rate": 7.80463923003355e-05, + "loss": 1.5517692565917969, + "step": 124720 + }, + { + "epoch": 0.37755210146353274, + "grad_norm": 0.11481926590204239, + "learning_rate": 7.80425971186981e-05, + "loss": 1.5502387046813966, + "step": 124730 + }, + { + "epoch": 0.3775823710138786, + "grad_norm": 0.11100143194198608, + "learning_rate": 7.803880193706072e-05, + "loss": 1.5281716346740724, + "step": 124740 + }, + { + "epoch": 0.37761264056422444, + "grad_norm": 0.12310591340065002, + "learning_rate": 7.803500675542331e-05, + "loss": 1.539762306213379, + "step": 124750 + }, + { + "epoch": 0.37764291011457024, + "grad_norm": 0.1134185642004013, + "learning_rate": 7.803121157378593e-05, + "loss": 1.5585182189941407, + "step": 124760 + }, + { + "epoch": 0.3776731796649161, + "grad_norm": 0.10520292073488235, + "learning_rate": 7.802741639214852e-05, + "loss": 1.5385441780090332, + "step": 124770 + }, + { + "epoch": 0.3777034492152619, + "grad_norm": 0.10667470842599869, + "learning_rate": 7.802362121051114e-05, + "loss": 1.527846336364746, + "step": 124780 + }, + { + "epoch": 0.37773371876560774, + "grad_norm": 0.10279656201601028, + "learning_rate": 7.801982602887373e-05, + "loss": 1.575141716003418, + "step": 124790 + }, + { + "epoch": 0.3777639883159536, + "grad_norm": 0.10194802284240723, + "learning_rate": 7.801603084723635e-05, + "loss": 1.561882781982422, + "step": 124800 + }, + { + "epoch": 0.3777942578662994, + "grad_norm": 0.10328776389360428, + "learning_rate": 7.801223566559896e-05, + "loss": 1.517100715637207, + "step": 124810 + }, + { + "epoch": 0.37782452741664524, + "grad_norm": 0.11131805181503296, + "learning_rate": 7.800844048396156e-05, + "loss": 1.522416877746582, + "step": 124820 + }, + { + "epoch": 0.37785479696699104, + "grad_norm": 0.10591702908277512, + "learning_rate": 7.800464530232417e-05, + "loss": 1.5796699523925781, + "step": 124830 + }, + { + "epoch": 0.3778850665173369, + "grad_norm": 0.10633210092782974, + "learning_rate": 7.800085012068678e-05, + "loss": 1.5376873016357422, + "step": 124840 + }, + { + "epoch": 0.37791533606768274, + "grad_norm": 0.1184820905327797, + "learning_rate": 7.799705493904938e-05, + "loss": 1.5121170043945313, + "step": 124850 + }, + { + "epoch": 0.37794560561802854, + "grad_norm": 0.1142907440662384, + "learning_rate": 7.799325975741199e-05, + "loss": 1.557832908630371, + "step": 124860 + }, + { + "epoch": 0.3779758751683744, + "grad_norm": 0.10865312069654465, + "learning_rate": 7.798946457577461e-05, + "loss": 1.5754276275634767, + "step": 124870 + }, + { + "epoch": 0.3780061447187202, + "grad_norm": 0.11212467402219772, + "learning_rate": 7.79856693941372e-05, + "loss": 1.5288816452026368, + "step": 124880 + }, + { + "epoch": 0.37803641426906603, + "grad_norm": 0.0982433632016182, + "learning_rate": 7.798187421249982e-05, + "loss": 1.5646652221679687, + "step": 124890 + }, + { + "epoch": 0.3780666838194119, + "grad_norm": 0.1155661940574646, + "learning_rate": 7.797807903086241e-05, + "loss": 1.563053512573242, + "step": 124900 + }, + { + "epoch": 0.3780969533697577, + "grad_norm": 0.11124426871538162, + "learning_rate": 7.797428384922503e-05, + "loss": 1.569890594482422, + "step": 124910 + }, + { + "epoch": 0.37812722292010353, + "grad_norm": 0.10543617606163025, + "learning_rate": 7.797048866758762e-05, + "loss": 1.548901081085205, + "step": 124920 + }, + { + "epoch": 0.37815749247044933, + "grad_norm": 0.11520721018314362, + "learning_rate": 7.796669348595024e-05, + "loss": 1.5508520126342773, + "step": 124930 + }, + { + "epoch": 0.3781877620207952, + "grad_norm": 0.11166295409202576, + "learning_rate": 7.796289830431285e-05, + "loss": 1.5626482009887694, + "step": 124940 + }, + { + "epoch": 0.37821803157114103, + "grad_norm": 0.10535787045955658, + "learning_rate": 7.795910312267546e-05, + "loss": 1.5624653816223144, + "step": 124950 + }, + { + "epoch": 0.37824830112148683, + "grad_norm": 0.112161785364151, + "learning_rate": 7.795530794103806e-05, + "loss": 1.5552425384521484, + "step": 124960 + }, + { + "epoch": 0.3782785706718327, + "grad_norm": 0.10682575404644012, + "learning_rate": 7.795151275940067e-05, + "loss": 1.5523662567138672, + "step": 124970 + }, + { + "epoch": 0.3783088402221785, + "grad_norm": 0.10168129205703735, + "learning_rate": 7.794771757776327e-05, + "loss": 1.5370966911315918, + "step": 124980 + }, + { + "epoch": 0.37833910977252433, + "grad_norm": 0.11750657111406326, + "learning_rate": 7.794392239612588e-05, + "loss": 1.5617979049682618, + "step": 124990 + }, + { + "epoch": 0.3783693793228702, + "grad_norm": 0.12950672209262848, + "learning_rate": 7.794012721448848e-05, + "loss": 1.531644058227539, + "step": 125000 + }, + { + "epoch": 0.3783693793228702, + "eval_loss": 1.544757604598999, + "eval_runtime": 28.2887, + "eval_samples_per_second": 17.675, + "eval_steps_per_second": 1.131, + "step": 125000 + }, + { + "epoch": 0.378399648873216, + "grad_norm": 0.10717077553272247, + "learning_rate": 7.793633203285109e-05, + "loss": 1.5311068534851073, + "step": 125010 + }, + { + "epoch": 0.37842991842356183, + "grad_norm": 0.11261434853076935, + "learning_rate": 7.79325368512137e-05, + "loss": 1.5603097915649413, + "step": 125020 + }, + { + "epoch": 0.3784601879739076, + "grad_norm": 0.1175759881734848, + "learning_rate": 7.79287416695763e-05, + "loss": 1.5508617401123046, + "step": 125030 + }, + { + "epoch": 0.3784904575242535, + "grad_norm": 0.1113395020365715, + "learning_rate": 7.792494648793891e-05, + "loss": 1.5460955619812011, + "step": 125040 + }, + { + "epoch": 0.37852072707459933, + "grad_norm": 0.11420425027608871, + "learning_rate": 7.792115130630153e-05, + "loss": 1.5521387100219726, + "step": 125050 + }, + { + "epoch": 0.3785509966249451, + "grad_norm": 0.11187806725502014, + "learning_rate": 7.791735612466413e-05, + "loss": 1.551070785522461, + "step": 125060 + }, + { + "epoch": 0.378581266175291, + "grad_norm": 0.10934755951166153, + "learning_rate": 7.791356094302674e-05, + "loss": 1.5481518745422362, + "step": 125070 + }, + { + "epoch": 0.37861153572563677, + "grad_norm": 0.10165052115917206, + "learning_rate": 7.790976576138935e-05, + "loss": 1.5595223426818847, + "step": 125080 + }, + { + "epoch": 0.3786418052759826, + "grad_norm": 0.11516346782445908, + "learning_rate": 7.790597057975195e-05, + "loss": 1.5247312545776368, + "step": 125090 + }, + { + "epoch": 0.3786720748263285, + "grad_norm": 0.11809370666742325, + "learning_rate": 7.790217539811456e-05, + "loss": 1.5590259552001953, + "step": 125100 + }, + { + "epoch": 0.37870234437667427, + "grad_norm": 0.10426908731460571, + "learning_rate": 7.789838021647716e-05, + "loss": 1.540440845489502, + "step": 125110 + }, + { + "epoch": 0.3787326139270201, + "grad_norm": 0.11147189140319824, + "learning_rate": 7.789458503483977e-05, + "loss": 1.509716510772705, + "step": 125120 + }, + { + "epoch": 0.3787628834773659, + "grad_norm": 0.10715442150831223, + "learning_rate": 7.789078985320238e-05, + "loss": 1.5565351486206054, + "step": 125130 + }, + { + "epoch": 0.37879315302771177, + "grad_norm": 0.11217945069074631, + "learning_rate": 7.788699467156498e-05, + "loss": 1.525954532623291, + "step": 125140 + }, + { + "epoch": 0.3788234225780576, + "grad_norm": 0.10205994546413422, + "learning_rate": 7.788319948992759e-05, + "loss": 1.4961772918701173, + "step": 125150 + }, + { + "epoch": 0.3788536921284034, + "grad_norm": 0.1025727242231369, + "learning_rate": 7.78794043082902e-05, + "loss": 1.5318023681640625, + "step": 125160 + }, + { + "epoch": 0.37888396167874927, + "grad_norm": 0.10395292937755585, + "learning_rate": 7.78756091266528e-05, + "loss": 1.526664924621582, + "step": 125170 + }, + { + "epoch": 0.37891423122909507, + "grad_norm": 0.111647829413414, + "learning_rate": 7.787181394501542e-05, + "loss": 1.5392663955688477, + "step": 125180 + }, + { + "epoch": 0.3789445007794409, + "grad_norm": 0.11627734452486038, + "learning_rate": 7.786801876337801e-05, + "loss": 1.529076385498047, + "step": 125190 + }, + { + "epoch": 0.37897477032978677, + "grad_norm": 0.11482066661119461, + "learning_rate": 7.786422358174063e-05, + "loss": 1.5547357559204102, + "step": 125200 + }, + { + "epoch": 0.37900503988013257, + "grad_norm": 0.10436350852251053, + "learning_rate": 7.786042840010322e-05, + "loss": 1.5016433715820312, + "step": 125210 + }, + { + "epoch": 0.3790353094304784, + "grad_norm": 0.1014622151851654, + "learning_rate": 7.785663321846584e-05, + "loss": 1.5302824020385741, + "step": 125220 + }, + { + "epoch": 0.3790655789808242, + "grad_norm": 0.10599348694086075, + "learning_rate": 7.785283803682844e-05, + "loss": 1.5481544494628907, + "step": 125230 + }, + { + "epoch": 0.37909584853117007, + "grad_norm": 0.13239145278930664, + "learning_rate": 7.784904285519105e-05, + "loss": 1.5229955673217774, + "step": 125240 + }, + { + "epoch": 0.3791261180815159, + "grad_norm": 0.11710264533758163, + "learning_rate": 7.784524767355365e-05, + "loss": 1.5315679550170898, + "step": 125250 + }, + { + "epoch": 0.3791563876318617, + "grad_norm": 0.0998777449131012, + "learning_rate": 7.784145249191627e-05, + "loss": 1.5028230667114257, + "step": 125260 + }, + { + "epoch": 0.37918665718220756, + "grad_norm": 0.10576969385147095, + "learning_rate": 7.783765731027887e-05, + "loss": 1.5487714767456056, + "step": 125270 + }, + { + "epoch": 0.37921692673255336, + "grad_norm": 0.10094334185123444, + "learning_rate": 7.783386212864148e-05, + "loss": 1.5514753341674805, + "step": 125280 + }, + { + "epoch": 0.3792471962828992, + "grad_norm": 0.1088874489068985, + "learning_rate": 7.783006694700408e-05, + "loss": 1.558816623687744, + "step": 125290 + }, + { + "epoch": 0.37927746583324506, + "grad_norm": 0.11225857585668564, + "learning_rate": 7.782627176536669e-05, + "loss": 1.5077857971191406, + "step": 125300 + }, + { + "epoch": 0.37930773538359086, + "grad_norm": 0.10694155842065811, + "learning_rate": 7.782247658372931e-05, + "loss": 1.5073369026184082, + "step": 125310 + }, + { + "epoch": 0.3793380049339367, + "grad_norm": 0.11704748123884201, + "learning_rate": 7.78186814020919e-05, + "loss": 1.5473827362060546, + "step": 125320 + }, + { + "epoch": 0.3793682744842825, + "grad_norm": 0.10734642297029495, + "learning_rate": 7.781488622045452e-05, + "loss": 1.540577507019043, + "step": 125330 + }, + { + "epoch": 0.37939854403462836, + "grad_norm": 0.12816832959651947, + "learning_rate": 7.781109103881711e-05, + "loss": 1.5530333518981934, + "step": 125340 + }, + { + "epoch": 0.3794288135849742, + "grad_norm": 0.1024065911769867, + "learning_rate": 7.780729585717973e-05, + "loss": 1.5283116340637206, + "step": 125350 + }, + { + "epoch": 0.37945908313532, + "grad_norm": 0.10772881656885147, + "learning_rate": 7.780350067554233e-05, + "loss": 1.5495876312255858, + "step": 125360 + }, + { + "epoch": 0.37948935268566586, + "grad_norm": 0.11166694015264511, + "learning_rate": 7.779970549390495e-05, + "loss": 1.5344598770141602, + "step": 125370 + }, + { + "epoch": 0.3795196222360117, + "grad_norm": 0.10709843784570694, + "learning_rate": 7.779591031226754e-05, + "loss": 1.5511358261108399, + "step": 125380 + }, + { + "epoch": 0.3795498917863575, + "grad_norm": 0.12927114963531494, + "learning_rate": 7.779211513063016e-05, + "loss": 1.540926456451416, + "step": 125390 + }, + { + "epoch": 0.37958016133670336, + "grad_norm": 0.11250777542591095, + "learning_rate": 7.778831994899275e-05, + "loss": 1.553605842590332, + "step": 125400 + }, + { + "epoch": 0.37961043088704916, + "grad_norm": 0.10972805321216583, + "learning_rate": 7.778452476735537e-05, + "loss": 1.5745100021362304, + "step": 125410 + }, + { + "epoch": 0.379640700437395, + "grad_norm": 0.11138995736837387, + "learning_rate": 7.778072958571798e-05, + "loss": 1.5237184524536134, + "step": 125420 + }, + { + "epoch": 0.37967096998774086, + "grad_norm": 0.10780816525220871, + "learning_rate": 7.777693440408058e-05, + "loss": 1.55552339553833, + "step": 125430 + }, + { + "epoch": 0.37970123953808665, + "grad_norm": 0.10621531307697296, + "learning_rate": 7.777313922244319e-05, + "loss": 1.5587340354919434, + "step": 125440 + }, + { + "epoch": 0.3797315090884325, + "grad_norm": 0.12105099111795425, + "learning_rate": 7.776934404080579e-05, + "loss": 1.5179021835327149, + "step": 125450 + }, + { + "epoch": 0.3797617786387783, + "grad_norm": 0.11664652824401855, + "learning_rate": 7.77655488591684e-05, + "loss": 1.5581480026245118, + "step": 125460 + }, + { + "epoch": 0.37979204818912415, + "grad_norm": 0.1089872345328331, + "learning_rate": 7.7761753677531e-05, + "loss": 1.5371572494506835, + "step": 125470 + }, + { + "epoch": 0.37982231773947, + "grad_norm": 0.10838381201028824, + "learning_rate": 7.775795849589362e-05, + "loss": 1.5638675689697266, + "step": 125480 + }, + { + "epoch": 0.3798525872898158, + "grad_norm": 0.10919343680143356, + "learning_rate": 7.775416331425622e-05, + "loss": 1.52598237991333, + "step": 125490 + }, + { + "epoch": 0.37988285684016165, + "grad_norm": 0.11269760876893997, + "learning_rate": 7.775036813261884e-05, + "loss": 1.500404167175293, + "step": 125500 + }, + { + "epoch": 0.37988285684016165, + "eval_loss": 1.5240540504455566, + "eval_runtime": 27.7774, + "eval_samples_per_second": 18.0, + "eval_steps_per_second": 1.152, + "step": 125500 + } + ], + "logging_steps": 10, + "max_steps": 330365, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.599435725170331e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}