diff --git "a/checkpoints/checkpoint-28325/trainer_state.json" "b/checkpoints/checkpoint-28325/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/checkpoint-28325/trainer_state.json" @@ -0,0 +1,20109 @@ +{ + "best_metric": 0.6229148507118225, + "best_model_checkpoint": "model/checkpoints/run1-csharp-codegen/checkpoint-22000", + "epoch": 4.999558732680258, + "eval_steps": 1000, + "global_step": 28325, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017650692789691995, + "grad_norm": 5.503284931182861, + "learning_rate": 4.9982347749338044e-05, + "loss": 1.3127, + "step": 10 + }, + { + "epoch": 0.003530138557938399, + "grad_norm": 1.4239877462387085, + "learning_rate": 4.9964695498676085e-05, + "loss": 1.2442, + "step": 20 + }, + { + "epoch": 0.0052952078369075985, + "grad_norm": 1.141628384590149, + "learning_rate": 4.9947043248014126e-05, + "loss": 1.1426, + "step": 30 + }, + { + "epoch": 0.007060277115876798, + "grad_norm": 4.818049907684326, + "learning_rate": 4.992939099735217e-05, + "loss": 1.2138, + "step": 40 + }, + { + "epoch": 0.008825346394845997, + "grad_norm": 2.6751058101654053, + "learning_rate": 4.991173874669021e-05, + "loss": 1.2369, + "step": 50 + }, + { + "epoch": 0.010590415673815197, + "grad_norm": 1.0971022844314575, + "learning_rate": 4.989408649602825e-05, + "loss": 1.1062, + "step": 60 + }, + { + "epoch": 0.012355484952784396, + "grad_norm": 3.7238683700561523, + "learning_rate": 4.987643424536629e-05, + "loss": 1.1359, + "step": 70 + }, + { + "epoch": 0.014120554231753596, + "grad_norm": 0.988836944103241, + "learning_rate": 4.985878199470433e-05, + "loss": 1.0044, + "step": 80 + }, + { + "epoch": 0.015885623510722795, + "grad_norm": 3.4970245361328125, + "learning_rate": 4.984112974404237e-05, + "loss": 1.0781, + "step": 90 + }, + { + "epoch": 0.017650692789691995, + "grad_norm": 1.4898335933685303, + "learning_rate": 4.9823477493380414e-05, + "loss": 1.0369, + "step": 100 + }, + { + "epoch": 0.019415762068661194, + "grad_norm": 3.7124125957489014, + "learning_rate": 4.980582524271845e-05, + "loss": 1.0744, + "step": 110 + }, + { + "epoch": 0.021180831347630394, + "grad_norm": 1.2735360860824585, + "learning_rate": 4.978817299205649e-05, + "loss": 1.0761, + "step": 120 + }, + { + "epoch": 0.022945900626599593, + "grad_norm": 1.1731593608856201, + "learning_rate": 4.977052074139453e-05, + "loss": 1.0761, + "step": 130 + }, + { + "epoch": 0.024710969905568793, + "grad_norm": 1.4130672216415405, + "learning_rate": 4.9752868490732565e-05, + "loss": 0.9775, + "step": 140 + }, + { + "epoch": 0.026476039184537992, + "grad_norm": 3.3941397666931152, + "learning_rate": 4.9735216240070606e-05, + "loss": 1.1248, + "step": 150 + }, + { + "epoch": 0.028241108463507192, + "grad_norm": 2.613243818283081, + "learning_rate": 4.971756398940865e-05, + "loss": 1.0177, + "step": 160 + }, + { + "epoch": 0.03000617774247639, + "grad_norm": 4.421411991119385, + "learning_rate": 4.969991173874669e-05, + "loss": 1.039, + "step": 170 + }, + { + "epoch": 0.03177124702144559, + "grad_norm": 3.040614128112793, + "learning_rate": 4.968225948808473e-05, + "loss": 1.0389, + "step": 180 + }, + { + "epoch": 0.033536316300414794, + "grad_norm": 2.769402265548706, + "learning_rate": 4.966460723742277e-05, + "loss": 0.9449, + "step": 190 + }, + { + "epoch": 0.03530138557938399, + "grad_norm": 1.115365743637085, + "learning_rate": 4.964695498676081e-05, + "loss": 0.9773, + "step": 200 + }, + { + "epoch": 0.03706645485835319, + "grad_norm": 1.0562186241149902, + "learning_rate": 4.962930273609885e-05, + "loss": 1.0351, + "step": 210 + }, + { + "epoch": 0.03883152413732239, + "grad_norm": 1.2428004741668701, + "learning_rate": 4.9611650485436894e-05, + "loss": 1.0004, + "step": 220 + }, + { + "epoch": 0.04059659341629159, + "grad_norm": 2.263096809387207, + "learning_rate": 4.9593998234774935e-05, + "loss": 1.1012, + "step": 230 + }, + { + "epoch": 0.04236166269526079, + "grad_norm": 3.0100183486938477, + "learning_rate": 4.9576345984112976e-05, + "loss": 1.0645, + "step": 240 + }, + { + "epoch": 0.04412673197422999, + "grad_norm": 1.0053493976593018, + "learning_rate": 4.955869373345102e-05, + "loss": 0.9729, + "step": 250 + }, + { + "epoch": 0.04589180125319919, + "grad_norm": 1.2269765138626099, + "learning_rate": 4.954104148278906e-05, + "loss": 0.94, + "step": 260 + }, + { + "epoch": 0.04765687053216839, + "grad_norm": 0.9635799527168274, + "learning_rate": 4.95233892321271e-05, + "loss": 0.9649, + "step": 270 + }, + { + "epoch": 0.049421939811137586, + "grad_norm": 1.4033010005950928, + "learning_rate": 4.950573698146514e-05, + "loss": 1.0486, + "step": 280 + }, + { + "epoch": 0.05118700909010679, + "grad_norm": 1.3257986307144165, + "learning_rate": 4.948808473080318e-05, + "loss": 0.9739, + "step": 290 + }, + { + "epoch": 0.052952078369075985, + "grad_norm": 3.656306028366089, + "learning_rate": 4.947043248014122e-05, + "loss": 0.9864, + "step": 300 + }, + { + "epoch": 0.05471714764804519, + "grad_norm": 3.2791707515716553, + "learning_rate": 4.9452780229479264e-05, + "loss": 0.9734, + "step": 310 + }, + { + "epoch": 0.056482216927014384, + "grad_norm": 1.0598105192184448, + "learning_rate": 4.9435127978817306e-05, + "loss": 0.9014, + "step": 320 + }, + { + "epoch": 0.05824728620598359, + "grad_norm": 2.3721909523010254, + "learning_rate": 4.941747572815535e-05, + "loss": 1.0495, + "step": 330 + }, + { + "epoch": 0.06001235548495278, + "grad_norm": 1.3134008646011353, + "learning_rate": 4.939982347749338e-05, + "loss": 0.9764, + "step": 340 + }, + { + "epoch": 0.061777424763921986, + "grad_norm": 2.561619758605957, + "learning_rate": 4.938217122683142e-05, + "loss": 0.9945, + "step": 350 + }, + { + "epoch": 0.06354249404289118, + "grad_norm": 4.891090393066406, + "learning_rate": 4.936451897616946e-05, + "loss": 1.1421, + "step": 360 + }, + { + "epoch": 0.06530756332186038, + "grad_norm": 1.0825132131576538, + "learning_rate": 4.9346866725507504e-05, + "loss": 0.9436, + "step": 370 + }, + { + "epoch": 0.06707263260082959, + "grad_norm": 2.776716947555542, + "learning_rate": 4.9329214474845546e-05, + "loss": 1.0025, + "step": 380 + }, + { + "epoch": 0.06883770187979878, + "grad_norm": 0.9759344458580017, + "learning_rate": 4.931156222418359e-05, + "loss": 0.8608, + "step": 390 + }, + { + "epoch": 0.07060277115876798, + "grad_norm": 1.0649555921554565, + "learning_rate": 4.929390997352162e-05, + "loss": 0.9983, + "step": 400 + }, + { + "epoch": 0.07236784043773718, + "grad_norm": 2.2306652069091797, + "learning_rate": 4.927625772285966e-05, + "loss": 0.9451, + "step": 410 + }, + { + "epoch": 0.07413290971670639, + "grad_norm": 3.1723175048828125, + "learning_rate": 4.9258605472197703e-05, + "loss": 0.8851, + "step": 420 + }, + { + "epoch": 0.07589797899567557, + "grad_norm": 1.0387526750564575, + "learning_rate": 4.9240953221535745e-05, + "loss": 0.8624, + "step": 430 + }, + { + "epoch": 0.07766304827464478, + "grad_norm": 2.3691160678863525, + "learning_rate": 4.9223300970873786e-05, + "loss": 0.9315, + "step": 440 + }, + { + "epoch": 0.07942811755361398, + "grad_norm": 2.692945718765259, + "learning_rate": 4.920564872021183e-05, + "loss": 0.8513, + "step": 450 + }, + { + "epoch": 0.08119318683258318, + "grad_norm": 3.980057954788208, + "learning_rate": 4.918799646954987e-05, + "loss": 0.8854, + "step": 460 + }, + { + "epoch": 0.08295825611155237, + "grad_norm": 2.8114423751831055, + "learning_rate": 4.917034421888791e-05, + "loss": 1.0698, + "step": 470 + }, + { + "epoch": 0.08472332539052158, + "grad_norm": 1.0136396884918213, + "learning_rate": 4.915269196822595e-05, + "loss": 0.9141, + "step": 480 + }, + { + "epoch": 0.08648839466949078, + "grad_norm": 0.8035016059875488, + "learning_rate": 4.913503971756399e-05, + "loss": 0.9708, + "step": 490 + }, + { + "epoch": 0.08825346394845998, + "grad_norm": 3.6212997436523438, + "learning_rate": 4.911738746690203e-05, + "loss": 0.9765, + "step": 500 + }, + { + "epoch": 0.09001853322742917, + "grad_norm": 1.0750012397766113, + "learning_rate": 4.9099735216240074e-05, + "loss": 0.9412, + "step": 510 + }, + { + "epoch": 0.09178360250639837, + "grad_norm": 0.9042845368385315, + "learning_rate": 4.9082082965578115e-05, + "loss": 0.863, + "step": 520 + }, + { + "epoch": 0.09354867178536758, + "grad_norm": 1.1494684219360352, + "learning_rate": 4.9064430714916156e-05, + "loss": 0.9759, + "step": 530 + }, + { + "epoch": 0.09531374106433678, + "grad_norm": 1.2344112396240234, + "learning_rate": 4.90467784642542e-05, + "loss": 0.9703, + "step": 540 + }, + { + "epoch": 0.09707881034330597, + "grad_norm": 3.4947280883789062, + "learning_rate": 4.902912621359224e-05, + "loss": 0.9009, + "step": 550 + }, + { + "epoch": 0.09884387962227517, + "grad_norm": 2.9842255115509033, + "learning_rate": 4.901147396293028e-05, + "loss": 0.955, + "step": 560 + }, + { + "epoch": 0.10060894890124437, + "grad_norm": 2.9984028339385986, + "learning_rate": 4.899382171226832e-05, + "loss": 0.8698, + "step": 570 + }, + { + "epoch": 0.10237401818021358, + "grad_norm": 1.9957551956176758, + "learning_rate": 4.8976169461606355e-05, + "loss": 0.9207, + "step": 580 + }, + { + "epoch": 0.10413908745918277, + "grad_norm": 2.6019134521484375, + "learning_rate": 4.8958517210944396e-05, + "loss": 0.9672, + "step": 590 + }, + { + "epoch": 0.10590415673815197, + "grad_norm": 1.1441612243652344, + "learning_rate": 4.894086496028244e-05, + "loss": 0.904, + "step": 600 + }, + { + "epoch": 0.10766922601712117, + "grad_norm": 1.24358069896698, + "learning_rate": 4.892321270962048e-05, + "loss": 0.9852, + "step": 610 + }, + { + "epoch": 0.10943429529609038, + "grad_norm": 1.109772801399231, + "learning_rate": 4.890556045895852e-05, + "loss": 0.9899, + "step": 620 + }, + { + "epoch": 0.11119936457505956, + "grad_norm": 3.4650866985321045, + "learning_rate": 4.888790820829656e-05, + "loss": 0.8353, + "step": 630 + }, + { + "epoch": 0.11296443385402877, + "grad_norm": 2.650848865509033, + "learning_rate": 4.88702559576346e-05, + "loss": 0.8749, + "step": 640 + }, + { + "epoch": 0.11472950313299797, + "grad_norm": 0.7682311534881592, + "learning_rate": 4.885260370697264e-05, + "loss": 0.8335, + "step": 650 + }, + { + "epoch": 0.11649457241196717, + "grad_norm": 4.478682994842529, + "learning_rate": 4.8834951456310684e-05, + "loss": 0.8194, + "step": 660 + }, + { + "epoch": 0.11825964169093636, + "grad_norm": 1.0624170303344727, + "learning_rate": 4.881729920564872e-05, + "loss": 0.8129, + "step": 670 + }, + { + "epoch": 0.12002471096990557, + "grad_norm": 0.9311354160308838, + "learning_rate": 4.879964695498676e-05, + "loss": 0.8108, + "step": 680 + }, + { + "epoch": 0.12178978024887477, + "grad_norm": 0.8768683671951294, + "learning_rate": 4.87819947043248e-05, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.12355484952784397, + "grad_norm": 2.0419557094573975, + "learning_rate": 4.876434245366284e-05, + "loss": 0.9701, + "step": 700 + }, + { + "epoch": 0.12531991880681317, + "grad_norm": 2.4168951511383057, + "learning_rate": 4.874669020300088e-05, + "loss": 0.9429, + "step": 710 + }, + { + "epoch": 0.12708498808578236, + "grad_norm": 0.8814497590065002, + "learning_rate": 4.8729037952338924e-05, + "loss": 0.8575, + "step": 720 + }, + { + "epoch": 0.12885005736475158, + "grad_norm": 1.1425260305404663, + "learning_rate": 4.8711385701676965e-05, + "loss": 0.8663, + "step": 730 + }, + { + "epoch": 0.13061512664372077, + "grad_norm": 0.9289985299110413, + "learning_rate": 4.8693733451015007e-05, + "loss": 0.9826, + "step": 740 + }, + { + "epoch": 0.13238019592268996, + "grad_norm": 3.505676746368408, + "learning_rate": 4.867608120035305e-05, + "loss": 1.0716, + "step": 750 + }, + { + "epoch": 0.13414526520165918, + "grad_norm": 1.1043481826782227, + "learning_rate": 4.865842894969109e-05, + "loss": 0.9505, + "step": 760 + }, + { + "epoch": 0.13591033448062836, + "grad_norm": 2.4004900455474854, + "learning_rate": 4.864077669902913e-05, + "loss": 0.9706, + "step": 770 + }, + { + "epoch": 0.13767540375959755, + "grad_norm": 2.9325432777404785, + "learning_rate": 4.862312444836717e-05, + "loss": 0.8697, + "step": 780 + }, + { + "epoch": 0.13944047303856677, + "grad_norm": 2.5764336585998535, + "learning_rate": 4.860547219770521e-05, + "loss": 0.8022, + "step": 790 + }, + { + "epoch": 0.14120554231753596, + "grad_norm": 1.0109184980392456, + "learning_rate": 4.8587819947043253e-05, + "loss": 0.8748, + "step": 800 + }, + { + "epoch": 0.14297061159650518, + "grad_norm": 1.110236406326294, + "learning_rate": 4.8570167696381295e-05, + "loss": 0.8522, + "step": 810 + }, + { + "epoch": 0.14473568087547437, + "grad_norm": 3.5498907566070557, + "learning_rate": 4.855251544571933e-05, + "loss": 0.9105, + "step": 820 + }, + { + "epoch": 0.14650075015444355, + "grad_norm": 3.878492832183838, + "learning_rate": 4.853486319505737e-05, + "loss": 0.8466, + "step": 830 + }, + { + "epoch": 0.14826581943341277, + "grad_norm": 2.681072235107422, + "learning_rate": 4.851721094439541e-05, + "loss": 0.8786, + "step": 840 + }, + { + "epoch": 0.15003088871238196, + "grad_norm": 4.230712890625, + "learning_rate": 4.849955869373345e-05, + "loss": 0.8864, + "step": 850 + }, + { + "epoch": 0.15179595799135115, + "grad_norm": 3.1931087970733643, + "learning_rate": 4.8481906443071494e-05, + "loss": 0.9766, + "step": 860 + }, + { + "epoch": 0.15356102727032037, + "grad_norm": 2.8019542694091797, + "learning_rate": 4.8464254192409535e-05, + "loss": 0.908, + "step": 870 + }, + { + "epoch": 0.15532609654928956, + "grad_norm": 2.4553959369659424, + "learning_rate": 4.8446601941747576e-05, + "loss": 0.9001, + "step": 880 + }, + { + "epoch": 0.15709116582825877, + "grad_norm": 3.005300283432007, + "learning_rate": 4.842894969108562e-05, + "loss": 0.9223, + "step": 890 + }, + { + "epoch": 0.15885623510722796, + "grad_norm": 1.1253565549850464, + "learning_rate": 4.841129744042366e-05, + "loss": 0.8084, + "step": 900 + }, + { + "epoch": 0.16062130438619715, + "grad_norm": 2.9257609844207764, + "learning_rate": 4.83936451897617e-05, + "loss": 0.8003, + "step": 910 + }, + { + "epoch": 0.16238637366516637, + "grad_norm": 2.794377326965332, + "learning_rate": 4.837599293909974e-05, + "loss": 0.7103, + "step": 920 + }, + { + "epoch": 0.16415144294413556, + "grad_norm": 0.9524262547492981, + "learning_rate": 4.8358340688437775e-05, + "loss": 0.8484, + "step": 930 + }, + { + "epoch": 0.16591651222310475, + "grad_norm": 3.1264827251434326, + "learning_rate": 4.8340688437775816e-05, + "loss": 0.8437, + "step": 940 + }, + { + "epoch": 0.16768158150207396, + "grad_norm": 3.1434571743011475, + "learning_rate": 4.832303618711386e-05, + "loss": 0.9207, + "step": 950 + }, + { + "epoch": 0.16944665078104315, + "grad_norm": 3.0784549713134766, + "learning_rate": 4.83053839364519e-05, + "loss": 0.8078, + "step": 960 + }, + { + "epoch": 0.17121172006001237, + "grad_norm": 0.7679011821746826, + "learning_rate": 4.828773168578994e-05, + "loss": 0.7875, + "step": 970 + }, + { + "epoch": 0.17297678933898156, + "grad_norm": 2.534778356552124, + "learning_rate": 4.827007943512798e-05, + "loss": 0.9637, + "step": 980 + }, + { + "epoch": 0.17474185861795075, + "grad_norm": 0.9244309067726135, + "learning_rate": 4.825242718446602e-05, + "loss": 0.9574, + "step": 990 + }, + { + "epoch": 0.17650692789691996, + "grad_norm": 3.150660753250122, + "learning_rate": 4.823477493380406e-05, + "loss": 0.9187, + "step": 1000 + }, + { + "epoch": 0.17650692789691996, + "eval_loss": 0.8387640714645386, + "eval_runtime": 591.4545, + "eval_samples_per_second": 47.894, + "eval_steps_per_second": 2.396, + "eval_token_accuracy": 0.0005117090228854299, + "step": 1000 + }, + { + "epoch": 0.17827199717588915, + "grad_norm": 1.0762163400650024, + "learning_rate": 4.8217122683142104e-05, + "loss": 0.9378, + "step": 1010 + }, + { + "epoch": 0.18003706645485834, + "grad_norm": 2.823103904724121, + "learning_rate": 4.8199470432480145e-05, + "loss": 0.7404, + "step": 1020 + }, + { + "epoch": 0.18180213573382756, + "grad_norm": 2.402489423751831, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.9427, + "step": 1030 + }, + { + "epoch": 0.18356720501279675, + "grad_norm": 2.4162659645080566, + "learning_rate": 4.816416593115623e-05, + "loss": 0.8083, + "step": 1040 + }, + { + "epoch": 0.18533227429176596, + "grad_norm": 2.9245927333831787, + "learning_rate": 4.814651368049427e-05, + "loss": 0.9984, + "step": 1050 + }, + { + "epoch": 0.18709734357073515, + "grad_norm": 0.8261721730232239, + "learning_rate": 4.81288614298323e-05, + "loss": 0.7679, + "step": 1060 + }, + { + "epoch": 0.18886241284970434, + "grad_norm": 2.2313833236694336, + "learning_rate": 4.8111209179170344e-05, + "loss": 0.7864, + "step": 1070 + }, + { + "epoch": 0.19062748212867356, + "grad_norm": 2.17639422416687, + "learning_rate": 4.8093556928508385e-05, + "loss": 0.8615, + "step": 1080 + }, + { + "epoch": 0.19239255140764275, + "grad_norm": 1.0018310546875, + "learning_rate": 4.8075904677846426e-05, + "loss": 0.955, + "step": 1090 + }, + { + "epoch": 0.19415762068661194, + "grad_norm": 0.93125319480896, + "learning_rate": 4.805825242718447e-05, + "loss": 0.9129, + "step": 1100 + }, + { + "epoch": 0.19592268996558115, + "grad_norm": 2.5495829582214355, + "learning_rate": 4.804060017652251e-05, + "loss": 0.8793, + "step": 1110 + }, + { + "epoch": 0.19768775924455034, + "grad_norm": 0.8696863055229187, + "learning_rate": 4.802294792586055e-05, + "loss": 0.9327, + "step": 1120 + }, + { + "epoch": 0.19945282852351956, + "grad_norm": 0.8448121547698975, + "learning_rate": 4.800529567519859e-05, + "loss": 1.016, + "step": 1130 + }, + { + "epoch": 0.20121789780248875, + "grad_norm": 2.590965747833252, + "learning_rate": 4.798764342453663e-05, + "loss": 1.0176, + "step": 1140 + }, + { + "epoch": 0.20298296708145794, + "grad_norm": 2.4967727661132812, + "learning_rate": 4.796999117387467e-05, + "loss": 0.894, + "step": 1150 + }, + { + "epoch": 0.20474803636042715, + "grad_norm": 2.591536521911621, + "learning_rate": 4.7952338923212714e-05, + "loss": 0.8355, + "step": 1160 + }, + { + "epoch": 0.20651310563939634, + "grad_norm": 0.718768298625946, + "learning_rate": 4.7934686672550755e-05, + "loss": 0.8619, + "step": 1170 + }, + { + "epoch": 0.20827817491836553, + "grad_norm": 2.2282583713531494, + "learning_rate": 4.79170344218888e-05, + "loss": 0.9338, + "step": 1180 + }, + { + "epoch": 0.21004324419733475, + "grad_norm": 0.8208143711090088, + "learning_rate": 4.789938217122684e-05, + "loss": 0.8617, + "step": 1190 + }, + { + "epoch": 0.21180831347630394, + "grad_norm": 0.9473330974578857, + "learning_rate": 4.788172992056487e-05, + "loss": 0.8861, + "step": 1200 + }, + { + "epoch": 0.21357338275527316, + "grad_norm": 1.0840715169906616, + "learning_rate": 4.786407766990291e-05, + "loss": 0.966, + "step": 1210 + }, + { + "epoch": 0.21533845203424234, + "grad_norm": 1.0690734386444092, + "learning_rate": 4.7846425419240954e-05, + "loss": 0.8333, + "step": 1220 + }, + { + "epoch": 0.21710352131321153, + "grad_norm": 2.4799931049346924, + "learning_rate": 4.7828773168578996e-05, + "loss": 0.7794, + "step": 1230 + }, + { + "epoch": 0.21886859059218075, + "grad_norm": 1.1045414209365845, + "learning_rate": 4.781112091791704e-05, + "loss": 0.9114, + "step": 1240 + }, + { + "epoch": 0.22063365987114994, + "grad_norm": 3.1106817722320557, + "learning_rate": 4.779346866725508e-05, + "loss": 0.8363, + "step": 1250 + }, + { + "epoch": 0.22239872915011913, + "grad_norm": 3.011002779006958, + "learning_rate": 4.777581641659312e-05, + "loss": 0.8383, + "step": 1260 + }, + { + "epoch": 0.22416379842908835, + "grad_norm": 2.440241575241089, + "learning_rate": 4.775816416593116e-05, + "loss": 0.8405, + "step": 1270 + }, + { + "epoch": 0.22592886770805753, + "grad_norm": 2.4338467121124268, + "learning_rate": 4.77405119152692e-05, + "loss": 0.8317, + "step": 1280 + }, + { + "epoch": 0.22769393698702675, + "grad_norm": 0.8372032642364502, + "learning_rate": 4.7722859664607236e-05, + "loss": 0.8259, + "step": 1290 + }, + { + "epoch": 0.22945900626599594, + "grad_norm": 2.346604824066162, + "learning_rate": 4.770520741394528e-05, + "loss": 0.8471, + "step": 1300 + }, + { + "epoch": 0.23122407554496513, + "grad_norm": 1.1082093715667725, + "learning_rate": 4.768755516328332e-05, + "loss": 0.8924, + "step": 1310 + }, + { + "epoch": 0.23298914482393435, + "grad_norm": 0.9796193838119507, + "learning_rate": 4.766990291262136e-05, + "loss": 0.8913, + "step": 1320 + }, + { + "epoch": 0.23475421410290354, + "grad_norm": 2.4977235794067383, + "learning_rate": 4.76522506619594e-05, + "loss": 0.9109, + "step": 1330 + }, + { + "epoch": 0.23651928338187272, + "grad_norm": 3.6340599060058594, + "learning_rate": 4.763459841129744e-05, + "loss": 1.0204, + "step": 1340 + }, + { + "epoch": 0.23828435266084194, + "grad_norm": 0.9401459097862244, + "learning_rate": 4.761694616063548e-05, + "loss": 0.922, + "step": 1350 + }, + { + "epoch": 0.24004942193981113, + "grad_norm": 3.333784580230713, + "learning_rate": 4.7599293909973524e-05, + "loss": 0.7972, + "step": 1360 + }, + { + "epoch": 0.24181449121878035, + "grad_norm": 2.373687505722046, + "learning_rate": 4.7581641659311565e-05, + "loss": 0.8351, + "step": 1370 + }, + { + "epoch": 0.24357956049774954, + "grad_norm": 1.3305423259735107, + "learning_rate": 4.7563989408649606e-05, + "loss": 1.09, + "step": 1380 + }, + { + "epoch": 0.24534462977671873, + "grad_norm": 0.9417628049850464, + "learning_rate": 4.754633715798765e-05, + "loss": 0.8717, + "step": 1390 + }, + { + "epoch": 0.24710969905568794, + "grad_norm": 1.0593584775924683, + "learning_rate": 4.752868490732569e-05, + "loss": 0.7458, + "step": 1400 + }, + { + "epoch": 0.24887476833465713, + "grad_norm": 0.8886284232139587, + "learning_rate": 4.751103265666373e-05, + "loss": 0.8889, + "step": 1410 + }, + { + "epoch": 0.25063983761362635, + "grad_norm": 2.407033681869507, + "learning_rate": 4.749338040600177e-05, + "loss": 0.9037, + "step": 1420 + }, + { + "epoch": 0.25240490689259554, + "grad_norm": 1.047439694404602, + "learning_rate": 4.747572815533981e-05, + "loss": 0.8975, + "step": 1430 + }, + { + "epoch": 0.2541699761715647, + "grad_norm": 3.1553778648376465, + "learning_rate": 4.745807590467785e-05, + "loss": 0.9644, + "step": 1440 + }, + { + "epoch": 0.2559350454505339, + "grad_norm": 3.748414993286133, + "learning_rate": 4.7440423654015894e-05, + "loss": 0.9116, + "step": 1450 + }, + { + "epoch": 0.25770011472950316, + "grad_norm": 1.0617207288742065, + "learning_rate": 4.742277140335393e-05, + "loss": 0.9632, + "step": 1460 + }, + { + "epoch": 0.25946518400847235, + "grad_norm": 1.1011401414871216, + "learning_rate": 4.740511915269197e-05, + "loss": 0.9103, + "step": 1470 + }, + { + "epoch": 0.26123025328744154, + "grad_norm": 3.420823097229004, + "learning_rate": 4.738746690203001e-05, + "loss": 0.849, + "step": 1480 + }, + { + "epoch": 0.2629953225664107, + "grad_norm": 3.4054012298583984, + "learning_rate": 4.736981465136805e-05, + "loss": 0.9037, + "step": 1490 + }, + { + "epoch": 0.2647603918453799, + "grad_norm": 0.7947477698326111, + "learning_rate": 4.735216240070609e-05, + "loss": 0.8944, + "step": 1500 + }, + { + "epoch": 0.2665254611243491, + "grad_norm": 1.9546200037002563, + "learning_rate": 4.7334510150044134e-05, + "loss": 0.9005, + "step": 1510 + }, + { + "epoch": 0.26829053040331835, + "grad_norm": 3.5012011528015137, + "learning_rate": 4.7316857899382175e-05, + "loss": 0.7747, + "step": 1520 + }, + { + "epoch": 0.27005559968228754, + "grad_norm": 3.4226977825164795, + "learning_rate": 4.729920564872021e-05, + "loss": 0.9032, + "step": 1530 + }, + { + "epoch": 0.27182066896125673, + "grad_norm": 2.116934299468994, + "learning_rate": 4.728155339805825e-05, + "loss": 0.8319, + "step": 1540 + }, + { + "epoch": 0.2735857382402259, + "grad_norm": 0.7913022041320801, + "learning_rate": 4.726390114739629e-05, + "loss": 0.9007, + "step": 1550 + }, + { + "epoch": 0.2753508075191951, + "grad_norm": 1.7177999019622803, + "learning_rate": 4.724624889673433e-05, + "loss": 0.8825, + "step": 1560 + }, + { + "epoch": 0.27711587679816435, + "grad_norm": 0.7861548662185669, + "learning_rate": 4.7228596646072374e-05, + "loss": 0.9701, + "step": 1570 + }, + { + "epoch": 0.27888094607713354, + "grad_norm": 0.988394021987915, + "learning_rate": 4.7210944395410415e-05, + "loss": 0.8454, + "step": 1580 + }, + { + "epoch": 0.28064601535610273, + "grad_norm": 1.0350430011749268, + "learning_rate": 4.7193292144748457e-05, + "loss": 0.9811, + "step": 1590 + }, + { + "epoch": 0.2824110846350719, + "grad_norm": 2.351750373840332, + "learning_rate": 4.71756398940865e-05, + "loss": 0.8091, + "step": 1600 + }, + { + "epoch": 0.2841761539140411, + "grad_norm": 1.85820472240448, + "learning_rate": 4.715798764342454e-05, + "loss": 0.9372, + "step": 1610 + }, + { + "epoch": 0.28594122319301035, + "grad_norm": 0.8532019257545471, + "learning_rate": 4.714033539276258e-05, + "loss": 0.9284, + "step": 1620 + }, + { + "epoch": 0.28770629247197954, + "grad_norm": 1.9938181638717651, + "learning_rate": 4.712268314210062e-05, + "loss": 0.7879, + "step": 1630 + }, + { + "epoch": 0.28947136175094873, + "grad_norm": 2.600447177886963, + "learning_rate": 4.710503089143866e-05, + "loss": 0.8553, + "step": 1640 + }, + { + "epoch": 0.2912364310299179, + "grad_norm": 1.0016512870788574, + "learning_rate": 4.7087378640776703e-05, + "loss": 0.9343, + "step": 1650 + }, + { + "epoch": 0.2930015003088871, + "grad_norm": 2.372873544692993, + "learning_rate": 4.7069726390114745e-05, + "loss": 0.9583, + "step": 1660 + }, + { + "epoch": 0.2947665695878563, + "grad_norm": 0.9608777761459351, + "learning_rate": 4.7052074139452786e-05, + "loss": 0.7909, + "step": 1670 + }, + { + "epoch": 0.29653163886682554, + "grad_norm": 3.220609664916992, + "learning_rate": 4.703442188879083e-05, + "loss": 0.879, + "step": 1680 + }, + { + "epoch": 0.29829670814579473, + "grad_norm": 1.023384690284729, + "learning_rate": 4.701676963812887e-05, + "loss": 0.8636, + "step": 1690 + }, + { + "epoch": 0.3000617774247639, + "grad_norm": 1.031020998954773, + "learning_rate": 4.699911738746691e-05, + "loss": 0.9112, + "step": 1700 + }, + { + "epoch": 0.3018268467037331, + "grad_norm": 1.9837015867233276, + "learning_rate": 4.698146513680495e-05, + "loss": 0.8857, + "step": 1710 + }, + { + "epoch": 0.3035919159827023, + "grad_norm": 1.0206444263458252, + "learning_rate": 4.696381288614299e-05, + "loss": 0.8305, + "step": 1720 + }, + { + "epoch": 0.30535698526167154, + "grad_norm": 0.805181086063385, + "learning_rate": 4.6946160635481026e-05, + "loss": 0.7456, + "step": 1730 + }, + { + "epoch": 0.30712205454064073, + "grad_norm": 2.465508460998535, + "learning_rate": 4.692850838481907e-05, + "loss": 0.8171, + "step": 1740 + }, + { + "epoch": 0.3088871238196099, + "grad_norm": 1.6663717031478882, + "learning_rate": 4.691085613415711e-05, + "loss": 0.8766, + "step": 1750 + }, + { + "epoch": 0.3106521930985791, + "grad_norm": 3.0242748260498047, + "learning_rate": 4.689320388349515e-05, + "loss": 0.8909, + "step": 1760 + }, + { + "epoch": 0.3124172623775483, + "grad_norm": 1.4774634838104248, + "learning_rate": 4.6875551632833184e-05, + "loss": 0.7763, + "step": 1770 + }, + { + "epoch": 0.31418233165651754, + "grad_norm": 1.7824957370758057, + "learning_rate": 4.6857899382171225e-05, + "loss": 0.7331, + "step": 1780 + }, + { + "epoch": 0.31594740093548673, + "grad_norm": 3.031038522720337, + "learning_rate": 4.6840247131509266e-05, + "loss": 0.7964, + "step": 1790 + }, + { + "epoch": 0.3177124702144559, + "grad_norm": 4.109992504119873, + "learning_rate": 4.682259488084731e-05, + "loss": 0.8113, + "step": 1800 + }, + { + "epoch": 0.3194775394934251, + "grad_norm": 2.4531877040863037, + "learning_rate": 4.680494263018535e-05, + "loss": 0.902, + "step": 1810 + }, + { + "epoch": 0.3212426087723943, + "grad_norm": 1.009549617767334, + "learning_rate": 4.678729037952339e-05, + "loss": 0.9048, + "step": 1820 + }, + { + "epoch": 0.3230076780513635, + "grad_norm": 3.0832602977752686, + "learning_rate": 4.676963812886143e-05, + "loss": 0.8321, + "step": 1830 + }, + { + "epoch": 0.32477274733033273, + "grad_norm": 2.7193832397460938, + "learning_rate": 4.675198587819947e-05, + "loss": 0.9515, + "step": 1840 + }, + { + "epoch": 0.3265378166093019, + "grad_norm": 2.153125286102295, + "learning_rate": 4.673433362753751e-05, + "loss": 0.8171, + "step": 1850 + }, + { + "epoch": 0.3283028858882711, + "grad_norm": 2.327843427658081, + "learning_rate": 4.6716681376875554e-05, + "loss": 0.8299, + "step": 1860 + }, + { + "epoch": 0.3300679551672403, + "grad_norm": 2.4244372844696045, + "learning_rate": 4.6699029126213595e-05, + "loss": 0.8345, + "step": 1870 + }, + { + "epoch": 0.3318330244462095, + "grad_norm": 2.6205737590789795, + "learning_rate": 4.6681376875551636e-05, + "loss": 0.7703, + "step": 1880 + }, + { + "epoch": 0.33359809372517873, + "grad_norm": 3.1626133918762207, + "learning_rate": 4.666372462488968e-05, + "loss": 0.8627, + "step": 1890 + }, + { + "epoch": 0.3353631630041479, + "grad_norm": 2.5088164806365967, + "learning_rate": 4.664607237422772e-05, + "loss": 0.8964, + "step": 1900 + }, + { + "epoch": 0.3371282322831171, + "grad_norm": 0.9049986600875854, + "learning_rate": 4.662842012356576e-05, + "loss": 0.8445, + "step": 1910 + }, + { + "epoch": 0.3388933015620863, + "grad_norm": 1.1667076349258423, + "learning_rate": 4.66107678729038e-05, + "loss": 0.8793, + "step": 1920 + }, + { + "epoch": 0.3406583708410555, + "grad_norm": 1.1097145080566406, + "learning_rate": 4.659311562224184e-05, + "loss": 0.8007, + "step": 1930 + }, + { + "epoch": 0.34242344012002474, + "grad_norm": 2.5024774074554443, + "learning_rate": 4.657546337157988e-05, + "loss": 0.7753, + "step": 1940 + }, + { + "epoch": 0.3441885093989939, + "grad_norm": 0.8330347537994385, + "learning_rate": 4.6557811120917924e-05, + "loss": 0.9424, + "step": 1950 + }, + { + "epoch": 0.3459535786779631, + "grad_norm": 1.0398578643798828, + "learning_rate": 4.6540158870255965e-05, + "loss": 0.7939, + "step": 1960 + }, + { + "epoch": 0.3477186479569323, + "grad_norm": 0.8610934019088745, + "learning_rate": 4.6522506619594e-05, + "loss": 0.7586, + "step": 1970 + }, + { + "epoch": 0.3494837172359015, + "grad_norm": 1.0551927089691162, + "learning_rate": 4.650485436893204e-05, + "loss": 0.7738, + "step": 1980 + }, + { + "epoch": 0.3512487865148707, + "grad_norm": 2.8230621814727783, + "learning_rate": 4.648720211827008e-05, + "loss": 0.8079, + "step": 1990 + }, + { + "epoch": 0.3530138557938399, + "grad_norm": 2.029458999633789, + "learning_rate": 4.646954986760812e-05, + "loss": 0.9508, + "step": 2000 + }, + { + "epoch": 0.3530138557938399, + "eval_loss": 0.7845870852470398, + "eval_runtime": 591.5769, + "eval_samples_per_second": 47.884, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0005102881727623844, + "step": 2000 + }, + { + "epoch": 0.3547789250728091, + "grad_norm": 0.8111034631729126, + "learning_rate": 4.645189761694616e-05, + "loss": 0.886, + "step": 2010 + }, + { + "epoch": 0.3565439943517783, + "grad_norm": 1.2739574909210205, + "learning_rate": 4.64342453662842e-05, + "loss": 0.7933, + "step": 2020 + }, + { + "epoch": 0.3583090636307475, + "grad_norm": 1.3890713453292847, + "learning_rate": 4.641659311562224e-05, + "loss": 0.7444, + "step": 2030 + }, + { + "epoch": 0.3600741329097167, + "grad_norm": 4.180551052093506, + "learning_rate": 4.639894086496028e-05, + "loss": 0.8468, + "step": 2040 + }, + { + "epoch": 0.3618392021886859, + "grad_norm": 2.901968240737915, + "learning_rate": 4.638128861429832e-05, + "loss": 0.8847, + "step": 2050 + }, + { + "epoch": 0.3636042714676551, + "grad_norm": 0.9198481440544128, + "learning_rate": 4.636363636363636e-05, + "loss": 0.8842, + "step": 2060 + }, + { + "epoch": 0.3653693407466243, + "grad_norm": 1.019461750984192, + "learning_rate": 4.6345984112974404e-05, + "loss": 0.6959, + "step": 2070 + }, + { + "epoch": 0.3671344100255935, + "grad_norm": 1.0316898822784424, + "learning_rate": 4.6328331862312446e-05, + "loss": 0.8417, + "step": 2080 + }, + { + "epoch": 0.3688994793045627, + "grad_norm": 0.7056171894073486, + "learning_rate": 4.631067961165049e-05, + "loss": 0.8592, + "step": 2090 + }, + { + "epoch": 0.3706645485835319, + "grad_norm": 2.2394204139709473, + "learning_rate": 4.629302736098853e-05, + "loss": 0.8624, + "step": 2100 + }, + { + "epoch": 0.3724296178625011, + "grad_norm": 0.519085168838501, + "learning_rate": 4.627537511032657e-05, + "loss": 0.8056, + "step": 2110 + }, + { + "epoch": 0.3741946871414703, + "grad_norm": 1.0584418773651123, + "learning_rate": 4.625772285966461e-05, + "loss": 0.8155, + "step": 2120 + }, + { + "epoch": 0.3759597564204395, + "grad_norm": 3.4729137420654297, + "learning_rate": 4.624007060900265e-05, + "loss": 0.8597, + "step": 2130 + }, + { + "epoch": 0.3777248256994087, + "grad_norm": 3.2471015453338623, + "learning_rate": 4.622241835834069e-05, + "loss": 0.908, + "step": 2140 + }, + { + "epoch": 0.37948989497837793, + "grad_norm": 3.068171977996826, + "learning_rate": 4.6204766107678734e-05, + "loss": 0.7869, + "step": 2150 + }, + { + "epoch": 0.3812549642573471, + "grad_norm": 0.8335369825363159, + "learning_rate": 4.6187113857016775e-05, + "loss": 0.8426, + "step": 2160 + }, + { + "epoch": 0.3830200335363163, + "grad_norm": 0.7075872421264648, + "learning_rate": 4.6169461606354816e-05, + "loss": 0.7434, + "step": 2170 + }, + { + "epoch": 0.3847851028152855, + "grad_norm": 0.8802381753921509, + "learning_rate": 4.615180935569286e-05, + "loss": 0.9235, + "step": 2180 + }, + { + "epoch": 0.3865501720942547, + "grad_norm": 0.650245189666748, + "learning_rate": 4.61341571050309e-05, + "loss": 0.8441, + "step": 2190 + }, + { + "epoch": 0.3883152413732239, + "grad_norm": 0.9095965623855591, + "learning_rate": 4.611650485436894e-05, + "loss": 0.9275, + "step": 2200 + }, + { + "epoch": 0.3900803106521931, + "grad_norm": 2.9897871017456055, + "learning_rate": 4.6098852603706974e-05, + "loss": 0.8002, + "step": 2210 + }, + { + "epoch": 0.3918453799311623, + "grad_norm": 2.078740119934082, + "learning_rate": 4.6081200353045015e-05, + "loss": 0.7567, + "step": 2220 + }, + { + "epoch": 0.3936104492101315, + "grad_norm": 2.397658586502075, + "learning_rate": 4.6063548102383056e-05, + "loss": 0.808, + "step": 2230 + }, + { + "epoch": 0.3953755184891007, + "grad_norm": 3.447932720184326, + "learning_rate": 4.60458958517211e-05, + "loss": 0.8827, + "step": 2240 + }, + { + "epoch": 0.3971405877680699, + "grad_norm": 0.8672122359275818, + "learning_rate": 4.602824360105914e-05, + "loss": 0.8806, + "step": 2250 + }, + { + "epoch": 0.3989056570470391, + "grad_norm": 1.9333924055099487, + "learning_rate": 4.601059135039717e-05, + "loss": 0.8342, + "step": 2260 + }, + { + "epoch": 0.4006707263260083, + "grad_norm": 0.8829795718193054, + "learning_rate": 4.5992939099735214e-05, + "loss": 0.7546, + "step": 2270 + }, + { + "epoch": 0.4024357956049775, + "grad_norm": 0.8878025412559509, + "learning_rate": 4.5975286849073255e-05, + "loss": 0.7602, + "step": 2280 + }, + { + "epoch": 0.4042008648839467, + "grad_norm": 2.6549007892608643, + "learning_rate": 4.5957634598411296e-05, + "loss": 0.8296, + "step": 2290 + }, + { + "epoch": 0.4059659341629159, + "grad_norm": 3.5660505294799805, + "learning_rate": 4.593998234774934e-05, + "loss": 0.8291, + "step": 2300 + }, + { + "epoch": 0.4077310034418851, + "grad_norm": 2.9781057834625244, + "learning_rate": 4.592233009708738e-05, + "loss": 0.8012, + "step": 2310 + }, + { + "epoch": 0.4094960727208543, + "grad_norm": 2.6867148876190186, + "learning_rate": 4.590467784642542e-05, + "loss": 0.8668, + "step": 2320 + }, + { + "epoch": 0.4112611419998235, + "grad_norm": 2.252251386642456, + "learning_rate": 4.588702559576346e-05, + "loss": 0.8445, + "step": 2330 + }, + { + "epoch": 0.4130262112787927, + "grad_norm": 0.8506266474723816, + "learning_rate": 4.58693733451015e-05, + "loss": 0.8879, + "step": 2340 + }, + { + "epoch": 0.4147912805577619, + "grad_norm": 2.8754031658172607, + "learning_rate": 4.585172109443954e-05, + "loss": 0.996, + "step": 2350 + }, + { + "epoch": 0.41655634983673107, + "grad_norm": 2.471627712249756, + "learning_rate": 4.5834068843777584e-05, + "loss": 0.8848, + "step": 2360 + }, + { + "epoch": 0.4183214191157003, + "grad_norm": 2.1100962162017822, + "learning_rate": 4.5816416593115625e-05, + "loss": 0.8333, + "step": 2370 + }, + { + "epoch": 0.4200864883946695, + "grad_norm": 2.6194207668304443, + "learning_rate": 4.5798764342453666e-05, + "loss": 0.8444, + "step": 2380 + }, + { + "epoch": 0.4218515576736387, + "grad_norm": 1.025685429573059, + "learning_rate": 4.578111209179171e-05, + "loss": 0.8992, + "step": 2390 + }, + { + "epoch": 0.4236166269526079, + "grad_norm": 2.1514077186584473, + "learning_rate": 4.576345984112975e-05, + "loss": 0.7833, + "step": 2400 + }, + { + "epoch": 0.42538169623157707, + "grad_norm": 1.8769290447235107, + "learning_rate": 4.574580759046779e-05, + "loss": 0.7602, + "step": 2410 + }, + { + "epoch": 0.4271467655105463, + "grad_norm": 0.990746796131134, + "learning_rate": 4.572815533980583e-05, + "loss": 0.7384, + "step": 2420 + }, + { + "epoch": 0.4289118347895155, + "grad_norm": 2.712986946105957, + "learning_rate": 4.571050308914387e-05, + "loss": 0.8197, + "step": 2430 + }, + { + "epoch": 0.4306769040684847, + "grad_norm": 2.931225061416626, + "learning_rate": 4.5692850838481906e-05, + "loss": 0.8079, + "step": 2440 + }, + { + "epoch": 0.4324419733474539, + "grad_norm": 0.9967415928840637, + "learning_rate": 4.567519858781995e-05, + "loss": 0.7264, + "step": 2450 + }, + { + "epoch": 0.43420704262642307, + "grad_norm": 1.942150354385376, + "learning_rate": 4.565754633715799e-05, + "loss": 0.8067, + "step": 2460 + }, + { + "epoch": 0.4359721119053923, + "grad_norm": 2.0994186401367188, + "learning_rate": 4.563989408649603e-05, + "loss": 0.8242, + "step": 2470 + }, + { + "epoch": 0.4377371811843615, + "grad_norm": 1.0104529857635498, + "learning_rate": 4.562224183583407e-05, + "loss": 0.7889, + "step": 2480 + }, + { + "epoch": 0.4395022504633307, + "grad_norm": 1.2194324731826782, + "learning_rate": 4.560458958517211e-05, + "loss": 0.841, + "step": 2490 + }, + { + "epoch": 0.4412673197422999, + "grad_norm": 1.023699164390564, + "learning_rate": 4.558693733451015e-05, + "loss": 0.8935, + "step": 2500 + }, + { + "epoch": 0.44303238902126907, + "grad_norm": 1.049811601638794, + "learning_rate": 4.5569285083848195e-05, + "loss": 0.9363, + "step": 2510 + }, + { + "epoch": 0.44479745830023826, + "grad_norm": 1.1310874223709106, + "learning_rate": 4.555163283318623e-05, + "loss": 0.8604, + "step": 2520 + }, + { + "epoch": 0.4465625275792075, + "grad_norm": 2.1629273891448975, + "learning_rate": 4.553398058252427e-05, + "loss": 0.8933, + "step": 2530 + }, + { + "epoch": 0.4483275968581767, + "grad_norm": 1.2044459581375122, + "learning_rate": 4.551632833186231e-05, + "loss": 0.9179, + "step": 2540 + }, + { + "epoch": 0.4500926661371459, + "grad_norm": 0.8064510822296143, + "learning_rate": 4.549867608120035e-05, + "loss": 0.7586, + "step": 2550 + }, + { + "epoch": 0.45185773541611507, + "grad_norm": 3.98811936378479, + "learning_rate": 4.5481023830538393e-05, + "loss": 0.8881, + "step": 2560 + }, + { + "epoch": 0.45362280469508426, + "grad_norm": 1.326183795928955, + "learning_rate": 4.5463371579876435e-05, + "loss": 0.8303, + "step": 2570 + }, + { + "epoch": 0.4553878739740535, + "grad_norm": 2.2841854095458984, + "learning_rate": 4.5445719329214476e-05, + "loss": 0.808, + "step": 2580 + }, + { + "epoch": 0.4571529432530227, + "grad_norm": 0.9683240652084351, + "learning_rate": 4.542806707855252e-05, + "loss": 0.9093, + "step": 2590 + }, + { + "epoch": 0.4589180125319919, + "grad_norm": 0.8216995000839233, + "learning_rate": 4.541041482789056e-05, + "loss": 0.7946, + "step": 2600 + }, + { + "epoch": 0.46068308181096107, + "grad_norm": 0.9047673940658569, + "learning_rate": 4.53927625772286e-05, + "loss": 0.7807, + "step": 2610 + }, + { + "epoch": 0.46244815108993026, + "grad_norm": 0.7651693820953369, + "learning_rate": 4.537511032656664e-05, + "loss": 0.7052, + "step": 2620 + }, + { + "epoch": 0.4642132203688995, + "grad_norm": 3.0374672412872314, + "learning_rate": 4.535745807590468e-05, + "loss": 0.8134, + "step": 2630 + }, + { + "epoch": 0.4659782896478687, + "grad_norm": 4.012673854827881, + "learning_rate": 4.533980582524272e-05, + "loss": 0.833, + "step": 2640 + }, + { + "epoch": 0.4677433589268379, + "grad_norm": 2.6433727741241455, + "learning_rate": 4.5322153574580764e-05, + "loss": 0.9264, + "step": 2650 + }, + { + "epoch": 0.46950842820580707, + "grad_norm": 2.4709348678588867, + "learning_rate": 4.5304501323918805e-05, + "loss": 0.9154, + "step": 2660 + }, + { + "epoch": 0.47127349748477626, + "grad_norm": 0.8282108902931213, + "learning_rate": 4.5286849073256846e-05, + "loss": 0.7335, + "step": 2670 + }, + { + "epoch": 0.47303856676374545, + "grad_norm": 1.0846011638641357, + "learning_rate": 4.526919682259488e-05, + "loss": 0.9035, + "step": 2680 + }, + { + "epoch": 0.4748036360427147, + "grad_norm": 2.7011947631835938, + "learning_rate": 4.525154457193292e-05, + "loss": 0.7915, + "step": 2690 + }, + { + "epoch": 0.4765687053216839, + "grad_norm": 1.912846565246582, + "learning_rate": 4.523389232127096e-05, + "loss": 0.8141, + "step": 2700 + }, + { + "epoch": 0.47833377460065307, + "grad_norm": 0.9846749305725098, + "learning_rate": 4.5216240070609004e-05, + "loss": 0.8167, + "step": 2710 + }, + { + "epoch": 0.48009884387962226, + "grad_norm": 1.9632867574691772, + "learning_rate": 4.5198587819947045e-05, + "loss": 0.8177, + "step": 2720 + }, + { + "epoch": 0.48186391315859145, + "grad_norm": 1.979436993598938, + "learning_rate": 4.5180935569285086e-05, + "loss": 0.753, + "step": 2730 + }, + { + "epoch": 0.4836289824375607, + "grad_norm": 2.5256102085113525, + "learning_rate": 4.516328331862313e-05, + "loss": 0.8481, + "step": 2740 + }, + { + "epoch": 0.4853940517165299, + "grad_norm": 3.493546962738037, + "learning_rate": 4.514563106796117e-05, + "loss": 0.8997, + "step": 2750 + }, + { + "epoch": 0.4871591209954991, + "grad_norm": 1.9048209190368652, + "learning_rate": 4.512797881729921e-05, + "loss": 0.7698, + "step": 2760 + }, + { + "epoch": 0.48892419027446826, + "grad_norm": 2.712498664855957, + "learning_rate": 4.511032656663725e-05, + "loss": 0.8042, + "step": 2770 + }, + { + "epoch": 0.49068925955343745, + "grad_norm": 1.0537688732147217, + "learning_rate": 4.5092674315975285e-05, + "loss": 0.9004, + "step": 2780 + }, + { + "epoch": 0.4924543288324067, + "grad_norm": 2.128462553024292, + "learning_rate": 4.5075022065313326e-05, + "loss": 0.9807, + "step": 2790 + }, + { + "epoch": 0.4942193981113759, + "grad_norm": 0.7416483759880066, + "learning_rate": 4.505736981465137e-05, + "loss": 0.8408, + "step": 2800 + }, + { + "epoch": 0.4959844673903451, + "grad_norm": 1.0257169008255005, + "learning_rate": 4.503971756398941e-05, + "loss": 0.8744, + "step": 2810 + }, + { + "epoch": 0.49774953666931426, + "grad_norm": 2.0702273845672607, + "learning_rate": 4.502206531332745e-05, + "loss": 0.8238, + "step": 2820 + }, + { + "epoch": 0.49951460594828345, + "grad_norm": 0.9203521013259888, + "learning_rate": 4.500441306266549e-05, + "loss": 0.7613, + "step": 2830 + }, + { + "epoch": 0.5012796752272527, + "grad_norm": 2.582195281982422, + "learning_rate": 4.498676081200353e-05, + "loss": 0.8405, + "step": 2840 + }, + { + "epoch": 0.5030447445062218, + "grad_norm": 2.5932891368865967, + "learning_rate": 4.496910856134157e-05, + "loss": 0.8282, + "step": 2850 + }, + { + "epoch": 0.5048098137851911, + "grad_norm": 0.9128146171569824, + "learning_rate": 4.4951456310679614e-05, + "loss": 0.8337, + "step": 2860 + }, + { + "epoch": 0.5065748830641603, + "grad_norm": 1.1323065757751465, + "learning_rate": 4.4933804060017655e-05, + "loss": 0.7668, + "step": 2870 + }, + { + "epoch": 0.5083399523431295, + "grad_norm": 2.5009121894836426, + "learning_rate": 4.4916151809355697e-05, + "loss": 0.9654, + "step": 2880 + }, + { + "epoch": 0.5101050216220987, + "grad_norm": 0.845964252948761, + "learning_rate": 4.489849955869374e-05, + "loss": 0.8481, + "step": 2890 + }, + { + "epoch": 0.5118700909010678, + "grad_norm": 3.271153450012207, + "learning_rate": 4.488084730803178e-05, + "loss": 0.8701, + "step": 2900 + }, + { + "epoch": 0.5136351601800371, + "grad_norm": 1.1637883186340332, + "learning_rate": 4.486319505736982e-05, + "loss": 0.835, + "step": 2910 + }, + { + "epoch": 0.5154002294590063, + "grad_norm": 0.9242041707038879, + "learning_rate": 4.4845542806707854e-05, + "loss": 0.7817, + "step": 2920 + }, + { + "epoch": 0.5171652987379755, + "grad_norm": 3.017683267593384, + "learning_rate": 4.4827890556045896e-05, + "loss": 0.8759, + "step": 2930 + }, + { + "epoch": 0.5189303680169447, + "grad_norm": 0.7924336194992065, + "learning_rate": 4.481023830538394e-05, + "loss": 0.8159, + "step": 2940 + }, + { + "epoch": 0.5206954372959138, + "grad_norm": 0.8962329030036926, + "learning_rate": 4.479258605472198e-05, + "loss": 0.835, + "step": 2950 + }, + { + "epoch": 0.5224605065748831, + "grad_norm": 1.6473637819290161, + "learning_rate": 4.477493380406002e-05, + "loss": 0.7439, + "step": 2960 + }, + { + "epoch": 0.5242255758538522, + "grad_norm": 2.8031015396118164, + "learning_rate": 4.475728155339806e-05, + "loss": 0.6378, + "step": 2970 + }, + { + "epoch": 0.5259906451328215, + "grad_norm": 3.2415006160736084, + "learning_rate": 4.47396293027361e-05, + "loss": 0.7761, + "step": 2980 + }, + { + "epoch": 0.5277557144117907, + "grad_norm": 0.971165120601654, + "learning_rate": 4.472197705207414e-05, + "loss": 0.8751, + "step": 2990 + }, + { + "epoch": 0.5295207836907598, + "grad_norm": 0.7631526589393616, + "learning_rate": 4.4704324801412184e-05, + "loss": 0.8916, + "step": 3000 + }, + { + "epoch": 0.5295207836907598, + "eval_loss": 0.7506680488586426, + "eval_runtime": 591.5692, + "eval_samples_per_second": 47.885, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0005326158175531012, + "step": 3000 + }, + { + "epoch": 0.5312858529697291, + "grad_norm": 2.212510108947754, + "learning_rate": 4.4686672550750225e-05, + "loss": 0.837, + "step": 3010 + }, + { + "epoch": 0.5330509222486982, + "grad_norm": 0.9668548703193665, + "learning_rate": 4.4669020300088266e-05, + "loss": 0.8005, + "step": 3020 + }, + { + "epoch": 0.5348159915276675, + "grad_norm": 2.5976197719573975, + "learning_rate": 4.465136804942631e-05, + "loss": 0.7103, + "step": 3030 + }, + { + "epoch": 0.5365810608066367, + "grad_norm": 1.0848029851913452, + "learning_rate": 4.463371579876435e-05, + "loss": 0.8199, + "step": 3040 + }, + { + "epoch": 0.5383461300856058, + "grad_norm": 0.8735620975494385, + "learning_rate": 4.461606354810238e-05, + "loss": 0.8422, + "step": 3050 + }, + { + "epoch": 0.5401111993645751, + "grad_norm": 0.9805198907852173, + "learning_rate": 4.4598411297440424e-05, + "loss": 0.848, + "step": 3060 + }, + { + "epoch": 0.5418762686435442, + "grad_norm": 1.2983773946762085, + "learning_rate": 4.4580759046778465e-05, + "loss": 0.84, + "step": 3070 + }, + { + "epoch": 0.5436413379225135, + "grad_norm": 2.475261688232422, + "learning_rate": 4.4563106796116506e-05, + "loss": 0.7414, + "step": 3080 + }, + { + "epoch": 0.5454064072014827, + "grad_norm": 0.8302130699157715, + "learning_rate": 4.454545454545455e-05, + "loss": 0.7616, + "step": 3090 + }, + { + "epoch": 0.5471714764804518, + "grad_norm": 0.9707878828048706, + "learning_rate": 4.452780229479259e-05, + "loss": 0.8007, + "step": 3100 + }, + { + "epoch": 0.5489365457594211, + "grad_norm": 0.8669869303703308, + "learning_rate": 4.451015004413063e-05, + "loss": 0.7382, + "step": 3110 + }, + { + "epoch": 0.5507016150383902, + "grad_norm": 2.2172632217407227, + "learning_rate": 4.449249779346867e-05, + "loss": 0.8489, + "step": 3120 + }, + { + "epoch": 0.5524666843173595, + "grad_norm": 0.8614495396614075, + "learning_rate": 4.447484554280671e-05, + "loss": 0.8115, + "step": 3130 + }, + { + "epoch": 0.5542317535963287, + "grad_norm": 2.0771589279174805, + "learning_rate": 4.445719329214475e-05, + "loss": 0.7712, + "step": 3140 + }, + { + "epoch": 0.5559968228752978, + "grad_norm": 0.7658421993255615, + "learning_rate": 4.4439541041482794e-05, + "loss": 0.737, + "step": 3150 + }, + { + "epoch": 0.5577618921542671, + "grad_norm": 1.0527628660202026, + "learning_rate": 4.442188879082083e-05, + "loss": 0.7613, + "step": 3160 + }, + { + "epoch": 0.5595269614332362, + "grad_norm": 1.8799755573272705, + "learning_rate": 4.440423654015887e-05, + "loss": 0.8304, + "step": 3170 + }, + { + "epoch": 0.5612920307122055, + "grad_norm": 0.8906537294387817, + "learning_rate": 4.438658428949691e-05, + "loss": 0.8177, + "step": 3180 + }, + { + "epoch": 0.5630570999911747, + "grad_norm": 3.328421115875244, + "learning_rate": 4.436893203883495e-05, + "loss": 0.8311, + "step": 3190 + }, + { + "epoch": 0.5648221692701438, + "grad_norm": 0.9287519454956055, + "learning_rate": 4.435127978817299e-05, + "loss": 0.7043, + "step": 3200 + }, + { + "epoch": 0.5665872385491131, + "grad_norm": 1.993821144104004, + "learning_rate": 4.4333627537511034e-05, + "loss": 0.8357, + "step": 3210 + }, + { + "epoch": 0.5683523078280822, + "grad_norm": 1.6915552616119385, + "learning_rate": 4.4315975286849075e-05, + "loss": 0.766, + "step": 3220 + }, + { + "epoch": 0.5701173771070515, + "grad_norm": 0.865930438041687, + "learning_rate": 4.4298323036187116e-05, + "loss": 0.8084, + "step": 3230 + }, + { + "epoch": 0.5718824463860207, + "grad_norm": 2.868046760559082, + "learning_rate": 4.428067078552516e-05, + "loss": 0.8383, + "step": 3240 + }, + { + "epoch": 0.5736475156649898, + "grad_norm": 0.9126543402671814, + "learning_rate": 4.42630185348632e-05, + "loss": 0.8596, + "step": 3250 + }, + { + "epoch": 0.5754125849439591, + "grad_norm": 1.795299768447876, + "learning_rate": 4.424536628420124e-05, + "loss": 0.7732, + "step": 3260 + }, + { + "epoch": 0.5771776542229282, + "grad_norm": 2.774261474609375, + "learning_rate": 4.422771403353928e-05, + "loss": 0.7921, + "step": 3270 + }, + { + "epoch": 0.5789427235018975, + "grad_norm": 0.7389079928398132, + "learning_rate": 4.421006178287732e-05, + "loss": 0.7926, + "step": 3280 + }, + { + "epoch": 0.5807077927808666, + "grad_norm": 0.9272739291191101, + "learning_rate": 4.419240953221536e-05, + "loss": 0.7412, + "step": 3290 + }, + { + "epoch": 0.5824728620598358, + "grad_norm": 2.9061577320098877, + "learning_rate": 4.4174757281553404e-05, + "loss": 0.8038, + "step": 3300 + }, + { + "epoch": 0.5842379313388051, + "grad_norm": 0.781683087348938, + "learning_rate": 4.415710503089144e-05, + "loss": 0.6972, + "step": 3310 + }, + { + "epoch": 0.5860030006177742, + "grad_norm": 2.3418874740600586, + "learning_rate": 4.413945278022948e-05, + "loss": 0.7838, + "step": 3320 + }, + { + "epoch": 0.5877680698967435, + "grad_norm": 1.735093355178833, + "learning_rate": 4.412180052956752e-05, + "loss": 0.7661, + "step": 3330 + }, + { + "epoch": 0.5895331391757126, + "grad_norm": 1.6221524477005005, + "learning_rate": 4.410414827890556e-05, + "loss": 0.8704, + "step": 3340 + }, + { + "epoch": 0.5912982084546818, + "grad_norm": 3.7049639225006104, + "learning_rate": 4.40864960282436e-05, + "loss": 0.7667, + "step": 3350 + }, + { + "epoch": 0.5930632777336511, + "grad_norm": 0.7311994433403015, + "learning_rate": 4.4068843777581644e-05, + "loss": 0.7703, + "step": 3360 + }, + { + "epoch": 0.5948283470126202, + "grad_norm": 0.9005844593048096, + "learning_rate": 4.4051191526919686e-05, + "loss": 0.7228, + "step": 3370 + }, + { + "epoch": 0.5965934162915895, + "grad_norm": 0.8378689289093018, + "learning_rate": 4.403353927625773e-05, + "loss": 0.799, + "step": 3380 + }, + { + "epoch": 0.5983584855705586, + "grad_norm": 2.0568249225616455, + "learning_rate": 4.401588702559576e-05, + "loss": 0.8938, + "step": 3390 + }, + { + "epoch": 0.6001235548495278, + "grad_norm": 3.126661539077759, + "learning_rate": 4.39982347749338e-05, + "loss": 0.7579, + "step": 3400 + }, + { + "epoch": 0.6018886241284971, + "grad_norm": 0.8094497323036194, + "learning_rate": 4.3980582524271843e-05, + "loss": 0.752, + "step": 3410 + }, + { + "epoch": 0.6036536934074662, + "grad_norm": 0.8366499543190002, + "learning_rate": 4.3962930273609885e-05, + "loss": 0.6863, + "step": 3420 + }, + { + "epoch": 0.6054187626864355, + "grad_norm": 0.9357757568359375, + "learning_rate": 4.3945278022947926e-05, + "loss": 0.7945, + "step": 3430 + }, + { + "epoch": 0.6071838319654046, + "grad_norm": 2.3124871253967285, + "learning_rate": 4.392762577228597e-05, + "loss": 0.898, + "step": 3440 + }, + { + "epoch": 0.6089489012443738, + "grad_norm": 2.69716215133667, + "learning_rate": 4.390997352162401e-05, + "loss": 0.7801, + "step": 3450 + }, + { + "epoch": 0.6107139705233431, + "grad_norm": 1.620527982711792, + "learning_rate": 4.389232127096205e-05, + "loss": 0.7479, + "step": 3460 + }, + { + "epoch": 0.6124790398023122, + "grad_norm": 2.4881954193115234, + "learning_rate": 4.387466902030009e-05, + "loss": 0.7216, + "step": 3470 + }, + { + "epoch": 0.6142441090812815, + "grad_norm": 0.8429247736930847, + "learning_rate": 4.385701676963813e-05, + "loss": 0.7887, + "step": 3480 + }, + { + "epoch": 0.6160091783602506, + "grad_norm": 1.0181946754455566, + "learning_rate": 4.383936451897617e-05, + "loss": 0.6728, + "step": 3490 + }, + { + "epoch": 0.6177742476392198, + "grad_norm": 2.189239740371704, + "learning_rate": 4.3821712268314214e-05, + "loss": 0.761, + "step": 3500 + }, + { + "epoch": 0.6195393169181891, + "grad_norm": 2.9036712646484375, + "learning_rate": 4.3804060017652255e-05, + "loss": 0.8686, + "step": 3510 + }, + { + "epoch": 0.6213043861971582, + "grad_norm": 2.4876108169555664, + "learning_rate": 4.3786407766990296e-05, + "loss": 0.8295, + "step": 3520 + }, + { + "epoch": 0.6230694554761275, + "grad_norm": 3.2689085006713867, + "learning_rate": 4.376875551632834e-05, + "loss": 0.933, + "step": 3530 + }, + { + "epoch": 0.6248345247550966, + "grad_norm": 1.3380885124206543, + "learning_rate": 4.375110326566638e-05, + "loss": 0.7828, + "step": 3540 + }, + { + "epoch": 0.6265995940340658, + "grad_norm": 3.1748392581939697, + "learning_rate": 4.373345101500442e-05, + "loss": 0.824, + "step": 3550 + }, + { + "epoch": 0.6283646633130351, + "grad_norm": 0.660198450088501, + "learning_rate": 4.371579876434246e-05, + "loss": 0.7187, + "step": 3560 + }, + { + "epoch": 0.6301297325920042, + "grad_norm": 1.0571256875991821, + "learning_rate": 4.36981465136805e-05, + "loss": 0.8839, + "step": 3570 + }, + { + "epoch": 0.6318948018709735, + "grad_norm": 2.345848560333252, + "learning_rate": 4.3680494263018536e-05, + "loss": 0.8157, + "step": 3580 + }, + { + "epoch": 0.6336598711499426, + "grad_norm": 2.5955724716186523, + "learning_rate": 4.366284201235658e-05, + "loss": 0.9024, + "step": 3590 + }, + { + "epoch": 0.6354249404289118, + "grad_norm": 0.6851534247398376, + "learning_rate": 4.364518976169462e-05, + "loss": 0.7143, + "step": 3600 + }, + { + "epoch": 0.6371900097078811, + "grad_norm": 1.5733120441436768, + "learning_rate": 4.362753751103266e-05, + "loss": 0.8427, + "step": 3610 + }, + { + "epoch": 0.6389550789868502, + "grad_norm": 1.0154445171356201, + "learning_rate": 4.36098852603707e-05, + "loss": 0.7195, + "step": 3620 + }, + { + "epoch": 0.6407201482658195, + "grad_norm": 1.6782732009887695, + "learning_rate": 4.3592233009708735e-05, + "loss": 0.7455, + "step": 3630 + }, + { + "epoch": 0.6424852175447886, + "grad_norm": 0.9354893565177917, + "learning_rate": 4.3574580759046776e-05, + "loss": 0.7633, + "step": 3640 + }, + { + "epoch": 0.6442502868237578, + "grad_norm": 2.326085090637207, + "learning_rate": 4.355692850838482e-05, + "loss": 0.7908, + "step": 3650 + }, + { + "epoch": 0.646015356102727, + "grad_norm": 0.8648369312286377, + "learning_rate": 4.353927625772286e-05, + "loss": 0.6926, + "step": 3660 + }, + { + "epoch": 0.6477804253816962, + "grad_norm": 1.0704703330993652, + "learning_rate": 4.35216240070609e-05, + "loss": 0.7622, + "step": 3670 + }, + { + "epoch": 0.6495454946606655, + "grad_norm": 0.9948635101318359, + "learning_rate": 4.350397175639894e-05, + "loss": 0.7132, + "step": 3680 + }, + { + "epoch": 0.6513105639396346, + "grad_norm": 3.173682689666748, + "learning_rate": 4.348631950573698e-05, + "loss": 0.7653, + "step": 3690 + }, + { + "epoch": 0.6530756332186038, + "grad_norm": 0.9430578351020813, + "learning_rate": 4.346866725507502e-05, + "loss": 0.786, + "step": 3700 + }, + { + "epoch": 0.654840702497573, + "grad_norm": 2.426671266555786, + "learning_rate": 4.3451015004413064e-05, + "loss": 0.7977, + "step": 3710 + }, + { + "epoch": 0.6566057717765422, + "grad_norm": 2.2651498317718506, + "learning_rate": 4.3433362753751105e-05, + "loss": 0.6994, + "step": 3720 + }, + { + "epoch": 0.6583708410555115, + "grad_norm": 2.3418796062469482, + "learning_rate": 4.3415710503089147e-05, + "loss": 0.7567, + "step": 3730 + }, + { + "epoch": 0.6601359103344806, + "grad_norm": 0.9187758564949036, + "learning_rate": 4.339805825242719e-05, + "loss": 0.7054, + "step": 3740 + }, + { + "epoch": 0.6619009796134498, + "grad_norm": 1.027400016784668, + "learning_rate": 4.338040600176523e-05, + "loss": 0.8261, + "step": 3750 + }, + { + "epoch": 0.663666048892419, + "grad_norm": 3.1188466548919678, + "learning_rate": 4.336275375110327e-05, + "loss": 0.8198, + "step": 3760 + }, + { + "epoch": 0.6654311181713882, + "grad_norm": 0.9015699028968811, + "learning_rate": 4.334510150044131e-05, + "loss": 0.7301, + "step": 3770 + }, + { + "epoch": 0.6671961874503575, + "grad_norm": 1.2164896726608276, + "learning_rate": 4.332744924977935e-05, + "loss": 0.7977, + "step": 3780 + }, + { + "epoch": 0.6689612567293266, + "grad_norm": 0.9232011437416077, + "learning_rate": 4.3309796999117393e-05, + "loss": 0.7386, + "step": 3790 + }, + { + "epoch": 0.6707263260082958, + "grad_norm": 0.8261239528656006, + "learning_rate": 4.3292144748455435e-05, + "loss": 0.7747, + "step": 3800 + }, + { + "epoch": 0.672491395287265, + "grad_norm": 1.8265053033828735, + "learning_rate": 4.3274492497793476e-05, + "loss": 0.8045, + "step": 3810 + }, + { + "epoch": 0.6742564645662342, + "grad_norm": 1.030633807182312, + "learning_rate": 4.325684024713152e-05, + "loss": 0.7423, + "step": 3820 + }, + { + "epoch": 0.6760215338452035, + "grad_norm": 0.8075785040855408, + "learning_rate": 4.323918799646955e-05, + "loss": 0.7633, + "step": 3830 + }, + { + "epoch": 0.6777866031241726, + "grad_norm": 1.1641757488250732, + "learning_rate": 4.322153574580759e-05, + "loss": 0.7748, + "step": 3840 + }, + { + "epoch": 0.6795516724031418, + "grad_norm": 0.6394134163856506, + "learning_rate": 4.3203883495145634e-05, + "loss": 0.7864, + "step": 3850 + }, + { + "epoch": 0.681316741682111, + "grad_norm": 0.7961482405662537, + "learning_rate": 4.3186231244483675e-05, + "loss": 0.7264, + "step": 3860 + }, + { + "epoch": 0.6830818109610802, + "grad_norm": 1.5039088726043701, + "learning_rate": 4.316857899382171e-05, + "loss": 0.7005, + "step": 3870 + }, + { + "epoch": 0.6848468802400495, + "grad_norm": 2.984955072402954, + "learning_rate": 4.315092674315975e-05, + "loss": 0.756, + "step": 3880 + }, + { + "epoch": 0.6866119495190186, + "grad_norm": 0.5771501660346985, + "learning_rate": 4.313327449249779e-05, + "loss": 0.7099, + "step": 3890 + }, + { + "epoch": 0.6883770187979878, + "grad_norm": 3.850680112838745, + "learning_rate": 4.311562224183583e-05, + "loss": 0.768, + "step": 3900 + }, + { + "epoch": 0.690142088076957, + "grad_norm": 3.665675163269043, + "learning_rate": 4.3097969991173874e-05, + "loss": 0.8096, + "step": 3910 + }, + { + "epoch": 0.6919071573559262, + "grad_norm": 3.480576515197754, + "learning_rate": 4.3080317740511915e-05, + "loss": 0.8829, + "step": 3920 + }, + { + "epoch": 0.6936722266348955, + "grad_norm": 0.6120189428329468, + "learning_rate": 4.3062665489849956e-05, + "loss": 0.6944, + "step": 3930 + }, + { + "epoch": 0.6954372959138646, + "grad_norm": 2.2129323482513428, + "learning_rate": 4.3045013239188e-05, + "loss": 0.7401, + "step": 3940 + }, + { + "epoch": 0.6972023651928339, + "grad_norm": 1.296460747718811, + "learning_rate": 4.302736098852604e-05, + "loss": 0.792, + "step": 3950 + }, + { + "epoch": 0.698967434471803, + "grad_norm": 2.8519787788391113, + "learning_rate": 4.300970873786408e-05, + "loss": 0.7726, + "step": 3960 + }, + { + "epoch": 0.7007325037507722, + "grad_norm": 0.8078942894935608, + "learning_rate": 4.299205648720212e-05, + "loss": 0.8331, + "step": 3970 + }, + { + "epoch": 0.7024975730297414, + "grad_norm": 3.126859426498413, + "learning_rate": 4.297440423654016e-05, + "loss": 0.845, + "step": 3980 + }, + { + "epoch": 0.7042626423087106, + "grad_norm": 0.9351972937583923, + "learning_rate": 4.29567519858782e-05, + "loss": 0.761, + "step": 3990 + }, + { + "epoch": 0.7060277115876799, + "grad_norm": 0.8094435334205627, + "learning_rate": 4.2939099735216244e-05, + "loss": 0.7853, + "step": 4000 + }, + { + "epoch": 0.7060277115876799, + "eval_loss": 0.7302612662315369, + "eval_runtime": 591.667, + "eval_samples_per_second": 47.877, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0005275413528279383, + "step": 4000 + }, + { + "epoch": 0.707792780866649, + "grad_norm": 3.202349901199341, + "learning_rate": 4.2921447484554285e-05, + "loss": 0.8615, + "step": 4010 + }, + { + "epoch": 0.7095578501456182, + "grad_norm": 1.095604658126831, + "learning_rate": 4.2903795233892326e-05, + "loss": 0.7358, + "step": 4020 + }, + { + "epoch": 0.7113229194245874, + "grad_norm": 1.876907229423523, + "learning_rate": 4.288614298323037e-05, + "loss": 0.7144, + "step": 4030 + }, + { + "epoch": 0.7130879887035566, + "grad_norm": 1.2887446880340576, + "learning_rate": 4.286849073256841e-05, + "loss": 0.6641, + "step": 4040 + }, + { + "epoch": 0.7148530579825259, + "grad_norm": 2.4284539222717285, + "learning_rate": 4.285083848190645e-05, + "loss": 0.8055, + "step": 4050 + }, + { + "epoch": 0.716618127261495, + "grad_norm": 1.0811606645584106, + "learning_rate": 4.283318623124449e-05, + "loss": 0.7444, + "step": 4060 + }, + { + "epoch": 0.7183831965404642, + "grad_norm": 2.0655746459960938, + "learning_rate": 4.2815533980582525e-05, + "loss": 0.75, + "step": 4070 + }, + { + "epoch": 0.7201482658194334, + "grad_norm": 1.7863682508468628, + "learning_rate": 4.2797881729920566e-05, + "loss": 0.7308, + "step": 4080 + }, + { + "epoch": 0.7219133350984026, + "grad_norm": 0.7258486747741699, + "learning_rate": 4.278022947925861e-05, + "loss": 0.7459, + "step": 4090 + }, + { + "epoch": 0.7236784043773719, + "grad_norm": 2.0655391216278076, + "learning_rate": 4.276257722859665e-05, + "loss": 0.7109, + "step": 4100 + }, + { + "epoch": 0.725443473656341, + "grad_norm": 0.7749966979026794, + "learning_rate": 4.274492497793468e-05, + "loss": 0.7652, + "step": 4110 + }, + { + "epoch": 0.7272085429353102, + "grad_norm": 0.8433781862258911, + "learning_rate": 4.2727272727272724e-05, + "loss": 0.7187, + "step": 4120 + }, + { + "epoch": 0.7289736122142794, + "grad_norm": 2.252690315246582, + "learning_rate": 4.2709620476610765e-05, + "loss": 0.7223, + "step": 4130 + }, + { + "epoch": 0.7307386814932486, + "grad_norm": 0.876118540763855, + "learning_rate": 4.2691968225948806e-05, + "loss": 0.7532, + "step": 4140 + }, + { + "epoch": 0.7325037507722179, + "grad_norm": 2.4882471561431885, + "learning_rate": 4.267431597528685e-05, + "loss": 0.7415, + "step": 4150 + }, + { + "epoch": 0.734268820051187, + "grad_norm": 2.053722381591797, + "learning_rate": 4.265666372462489e-05, + "loss": 0.8302, + "step": 4160 + }, + { + "epoch": 0.7360338893301562, + "grad_norm": 2.7460062503814697, + "learning_rate": 4.263901147396293e-05, + "loss": 0.8077, + "step": 4170 + }, + { + "epoch": 0.7377989586091254, + "grad_norm": 2.6140687465667725, + "learning_rate": 4.262135922330097e-05, + "loss": 0.7162, + "step": 4180 + }, + { + "epoch": 0.7395640278880946, + "grad_norm": 0.9406307935714722, + "learning_rate": 4.260370697263901e-05, + "loss": 0.7295, + "step": 4190 + }, + { + "epoch": 0.7413290971670639, + "grad_norm": 1.2313580513000488, + "learning_rate": 4.258605472197705e-05, + "loss": 0.7561, + "step": 4200 + }, + { + "epoch": 0.743094166446033, + "grad_norm": 1.6174222230911255, + "learning_rate": 4.2568402471315094e-05, + "loss": 0.7096, + "step": 4210 + }, + { + "epoch": 0.7448592357250022, + "grad_norm": 2.9496357440948486, + "learning_rate": 4.2550750220653136e-05, + "loss": 0.8138, + "step": 4220 + }, + { + "epoch": 0.7466243050039714, + "grad_norm": 1.0731405019760132, + "learning_rate": 4.253309796999118e-05, + "loss": 0.7358, + "step": 4230 + }, + { + "epoch": 0.7483893742829406, + "grad_norm": 1.912284016609192, + "learning_rate": 4.251544571932922e-05, + "loss": 0.6613, + "step": 4240 + }, + { + "epoch": 0.7501544435619099, + "grad_norm": 2.178107261657715, + "learning_rate": 4.249779346866726e-05, + "loss": 0.7444, + "step": 4250 + }, + { + "epoch": 0.751919512840879, + "grad_norm": 3.0223817825317383, + "learning_rate": 4.24801412180053e-05, + "loss": 0.7841, + "step": 4260 + }, + { + "epoch": 0.7536845821198482, + "grad_norm": 2.2032418251037598, + "learning_rate": 4.246248896734334e-05, + "loss": 0.7662, + "step": 4270 + }, + { + "epoch": 0.7554496513988174, + "grad_norm": 2.5964224338531494, + "learning_rate": 4.244483671668138e-05, + "loss": 0.7513, + "step": 4280 + }, + { + "epoch": 0.7572147206777866, + "grad_norm": 0.7769788503646851, + "learning_rate": 4.2427184466019424e-05, + "loss": 0.753, + "step": 4290 + }, + { + "epoch": 0.7589797899567559, + "grad_norm": 2.543841600418091, + "learning_rate": 4.2409532215357465e-05, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 0.760744859235725, + "grad_norm": 1.1544684171676636, + "learning_rate": 4.23918799646955e-05, + "loss": 0.7067, + "step": 4310 + }, + { + "epoch": 0.7625099285146942, + "grad_norm": 3.0461089611053467, + "learning_rate": 4.237422771403354e-05, + "loss": 0.7107, + "step": 4320 + }, + { + "epoch": 0.7642749977936634, + "grad_norm": 2.073349952697754, + "learning_rate": 4.235657546337158e-05, + "loss": 0.7259, + "step": 4330 + }, + { + "epoch": 0.7660400670726326, + "grad_norm": 2.9507546424865723, + "learning_rate": 4.233892321270962e-05, + "loss": 0.781, + "step": 4340 + }, + { + "epoch": 0.7678051363516017, + "grad_norm": 0.9811722636222839, + "learning_rate": 4.2321270962047664e-05, + "loss": 0.7541, + "step": 4350 + }, + { + "epoch": 0.769570205630571, + "grad_norm": 2.324629068374634, + "learning_rate": 4.2303618711385705e-05, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.7713352749095402, + "grad_norm": 2.0748445987701416, + "learning_rate": 4.228596646072374e-05, + "loss": 0.8305, + "step": 4370 + }, + { + "epoch": 0.7731003441885094, + "grad_norm": 2.2968170642852783, + "learning_rate": 4.226831421006178e-05, + "loss": 0.7492, + "step": 4380 + }, + { + "epoch": 0.7748654134674786, + "grad_norm": 2.235989570617676, + "learning_rate": 4.225066195939982e-05, + "loss": 0.9504, + "step": 4390 + }, + { + "epoch": 0.7766304827464477, + "grad_norm": 1.7711191177368164, + "learning_rate": 4.223300970873786e-05, + "loss": 0.7801, + "step": 4400 + }, + { + "epoch": 0.778395552025417, + "grad_norm": 1.8657867908477783, + "learning_rate": 4.2215357458075904e-05, + "loss": 0.7403, + "step": 4410 + }, + { + "epoch": 0.7801606213043862, + "grad_norm": 0.9536270499229431, + "learning_rate": 4.2197705207413945e-05, + "loss": 0.7349, + "step": 4420 + }, + { + "epoch": 0.7819256905833554, + "grad_norm": 1.7488056421279907, + "learning_rate": 4.2180052956751986e-05, + "loss": 0.9128, + "step": 4430 + }, + { + "epoch": 0.7836907598623246, + "grad_norm": 1.809354543685913, + "learning_rate": 4.216240070609003e-05, + "loss": 0.8036, + "step": 4440 + }, + { + "epoch": 0.7854558291412937, + "grad_norm": 1.8551990985870361, + "learning_rate": 4.214474845542807e-05, + "loss": 0.7554, + "step": 4450 + }, + { + "epoch": 0.787220898420263, + "grad_norm": 0.8702948093414307, + "learning_rate": 4.212709620476611e-05, + "loss": 0.7256, + "step": 4460 + }, + { + "epoch": 0.7889859676992322, + "grad_norm": 1.093543291091919, + "learning_rate": 4.210944395410415e-05, + "loss": 0.705, + "step": 4470 + }, + { + "epoch": 0.7907510369782014, + "grad_norm": 2.765627384185791, + "learning_rate": 4.209179170344219e-05, + "loss": 0.6901, + "step": 4480 + }, + { + "epoch": 0.7925161062571706, + "grad_norm": 1.0054060220718384, + "learning_rate": 4.207413945278023e-05, + "loss": 0.7003, + "step": 4490 + }, + { + "epoch": 0.7942811755361397, + "grad_norm": 2.850806951522827, + "learning_rate": 4.2056487202118274e-05, + "loss": 0.7657, + "step": 4500 + }, + { + "epoch": 0.796046244815109, + "grad_norm": 2.02577543258667, + "learning_rate": 4.2038834951456315e-05, + "loss": 0.7876, + "step": 4510 + }, + { + "epoch": 0.7978113140940782, + "grad_norm": 2.8630881309509277, + "learning_rate": 4.2021182700794356e-05, + "loss": 0.8187, + "step": 4520 + }, + { + "epoch": 0.7995763833730474, + "grad_norm": 1.5044877529144287, + "learning_rate": 4.20035304501324e-05, + "loss": 0.7504, + "step": 4530 + }, + { + "epoch": 0.8013414526520166, + "grad_norm": 1.9010065793991089, + "learning_rate": 4.198587819947043e-05, + "loss": 0.7958, + "step": 4540 + }, + { + "epoch": 0.8031065219309858, + "grad_norm": 2.720659017562866, + "learning_rate": 4.196822594880847e-05, + "loss": 0.7737, + "step": 4550 + }, + { + "epoch": 0.804871591209955, + "grad_norm": 0.9726505875587463, + "learning_rate": 4.1950573698146514e-05, + "loss": 0.8296, + "step": 4560 + }, + { + "epoch": 0.8066366604889242, + "grad_norm": 2.710341215133667, + "learning_rate": 4.1932921447484555e-05, + "loss": 0.7509, + "step": 4570 + }, + { + "epoch": 0.8084017297678934, + "grad_norm": 0.9819344282150269, + "learning_rate": 4.1915269196822597e-05, + "loss": 0.7451, + "step": 4580 + }, + { + "epoch": 0.8101667990468626, + "grad_norm": 0.9601898193359375, + "learning_rate": 4.189761694616064e-05, + "loss": 0.929, + "step": 4590 + }, + { + "epoch": 0.8119318683258318, + "grad_norm": 2.9946250915527344, + "learning_rate": 4.187996469549868e-05, + "loss": 0.7867, + "step": 4600 + }, + { + "epoch": 0.813696937604801, + "grad_norm": 1.9155701398849487, + "learning_rate": 4.186231244483672e-05, + "loss": 0.664, + "step": 4610 + }, + { + "epoch": 0.8154620068837702, + "grad_norm": 2.1461758613586426, + "learning_rate": 4.184466019417476e-05, + "loss": 0.7129, + "step": 4620 + }, + { + "epoch": 0.8172270761627394, + "grad_norm": 3.015730381011963, + "learning_rate": 4.18270079435128e-05, + "loss": 0.8257, + "step": 4630 + }, + { + "epoch": 0.8189921454417086, + "grad_norm": 0.8726127743721008, + "learning_rate": 4.180935569285084e-05, + "loss": 0.6789, + "step": 4640 + }, + { + "epoch": 0.8207572147206778, + "grad_norm": 3.004166603088379, + "learning_rate": 4.179170344218888e-05, + "loss": 0.7675, + "step": 4650 + }, + { + "epoch": 0.822522283999647, + "grad_norm": 0.798729419708252, + "learning_rate": 4.177405119152692e-05, + "loss": 0.702, + "step": 4660 + }, + { + "epoch": 0.8242873532786161, + "grad_norm": 0.7195820212364197, + "learning_rate": 4.175639894086496e-05, + "loss": 0.7232, + "step": 4670 + }, + { + "epoch": 0.8260524225575854, + "grad_norm": 0.9878723621368408, + "learning_rate": 4.1738746690203e-05, + "loss": 0.7543, + "step": 4680 + }, + { + "epoch": 0.8278174918365546, + "grad_norm": 2.5027530193328857, + "learning_rate": 4.172109443954104e-05, + "loss": 0.7169, + "step": 4690 + }, + { + "epoch": 0.8295825611155238, + "grad_norm": 0.9524794220924377, + "learning_rate": 4.1703442188879084e-05, + "loss": 0.8512, + "step": 4700 + }, + { + "epoch": 0.831347630394493, + "grad_norm": 0.9306320548057556, + "learning_rate": 4.1685789938217125e-05, + "loss": 0.8466, + "step": 4710 + }, + { + "epoch": 0.8331126996734621, + "grad_norm": 0.6997801661491394, + "learning_rate": 4.1668137687555166e-05, + "loss": 0.6713, + "step": 4720 + }, + { + "epoch": 0.8348777689524314, + "grad_norm": 1.0483647584915161, + "learning_rate": 4.165048543689321e-05, + "loss": 0.7824, + "step": 4730 + }, + { + "epoch": 0.8366428382314006, + "grad_norm": 1.7400377988815308, + "learning_rate": 4.163283318623125e-05, + "loss": 0.7119, + "step": 4740 + }, + { + "epoch": 0.8384079075103698, + "grad_norm": 2.9384422302246094, + "learning_rate": 4.161518093556929e-05, + "loss": 0.837, + "step": 4750 + }, + { + "epoch": 0.840172976789339, + "grad_norm": 0.8086302280426025, + "learning_rate": 4.159752868490733e-05, + "loss": 0.7704, + "step": 4760 + }, + { + "epoch": 0.8419380460683081, + "grad_norm": 2.740748405456543, + "learning_rate": 4.157987643424537e-05, + "loss": 0.8491, + "step": 4770 + }, + { + "epoch": 0.8437031153472774, + "grad_norm": 2.4292073249816895, + "learning_rate": 4.1562224183583406e-05, + "loss": 0.7163, + "step": 4780 + }, + { + "epoch": 0.8454681846262466, + "grad_norm": 0.8519812226295471, + "learning_rate": 4.154457193292145e-05, + "loss": 0.7669, + "step": 4790 + }, + { + "epoch": 0.8472332539052158, + "grad_norm": 2.4483604431152344, + "learning_rate": 4.152691968225949e-05, + "loss": 0.6766, + "step": 4800 + }, + { + "epoch": 0.848998323184185, + "grad_norm": 0.7039122581481934, + "learning_rate": 4.150926743159753e-05, + "loss": 0.7462, + "step": 4810 + }, + { + "epoch": 0.8507633924631541, + "grad_norm": 1.0183981657028198, + "learning_rate": 4.149161518093557e-05, + "loss": 0.7203, + "step": 4820 + }, + { + "epoch": 0.8525284617421234, + "grad_norm": 2.9556922912597656, + "learning_rate": 4.147396293027361e-05, + "loss": 0.7431, + "step": 4830 + }, + { + "epoch": 0.8542935310210926, + "grad_norm": 1.9758566617965698, + "learning_rate": 4.145631067961165e-05, + "loss": 0.6689, + "step": 4840 + }, + { + "epoch": 0.8560586003000618, + "grad_norm": 1.072682499885559, + "learning_rate": 4.1438658428949694e-05, + "loss": 0.7904, + "step": 4850 + }, + { + "epoch": 0.857823669579031, + "grad_norm": 1.078199028968811, + "learning_rate": 4.1421006178287735e-05, + "loss": 0.7404, + "step": 4860 + }, + { + "epoch": 0.8595887388580001, + "grad_norm": 1.5657134056091309, + "learning_rate": 4.1403353927625776e-05, + "loss": 0.7326, + "step": 4870 + }, + { + "epoch": 0.8613538081369694, + "grad_norm": 2.0855634212493896, + "learning_rate": 4.138570167696382e-05, + "loss": 0.7502, + "step": 4880 + }, + { + "epoch": 0.8631188774159386, + "grad_norm": 0.7354227304458618, + "learning_rate": 4.136804942630186e-05, + "loss": 0.6875, + "step": 4890 + }, + { + "epoch": 0.8648839466949078, + "grad_norm": 0.7951927781105042, + "learning_rate": 4.135039717563989e-05, + "loss": 0.7122, + "step": 4900 + }, + { + "epoch": 0.866649015973877, + "grad_norm": 1.1246849298477173, + "learning_rate": 4.1332744924977934e-05, + "loss": 0.8425, + "step": 4910 + }, + { + "epoch": 0.8684140852528461, + "grad_norm": 1.5327008962631226, + "learning_rate": 4.1315092674315975e-05, + "loss": 0.7479, + "step": 4920 + }, + { + "epoch": 0.8701791545318154, + "grad_norm": 2.5594542026519775, + "learning_rate": 4.1297440423654016e-05, + "loss": 0.7138, + "step": 4930 + }, + { + "epoch": 0.8719442238107846, + "grad_norm": 1.085647702217102, + "learning_rate": 4.127978817299206e-05, + "loss": 0.7799, + "step": 4940 + }, + { + "epoch": 0.8737092930897538, + "grad_norm": 0.8347994685173035, + "learning_rate": 4.12621359223301e-05, + "loss": 0.6876, + "step": 4950 + }, + { + "epoch": 0.875474362368723, + "grad_norm": 1.6027971506118774, + "learning_rate": 4.124448367166814e-05, + "loss": 0.7479, + "step": 4960 + }, + { + "epoch": 0.8772394316476921, + "grad_norm": 0.7663355469703674, + "learning_rate": 4.122683142100618e-05, + "loss": 0.6946, + "step": 4970 + }, + { + "epoch": 0.8790045009266614, + "grad_norm": 0.6748294234275818, + "learning_rate": 4.120917917034422e-05, + "loss": 0.623, + "step": 4980 + }, + { + "epoch": 0.8807695702056306, + "grad_norm": 0.9014782905578613, + "learning_rate": 4.119152691968226e-05, + "loss": 0.8502, + "step": 4990 + }, + { + "epoch": 0.8825346394845998, + "grad_norm": 0.7959718108177185, + "learning_rate": 4.1173874669020304e-05, + "loss": 0.6841, + "step": 5000 + }, + { + "epoch": 0.8825346394845998, + "eval_loss": 0.7080652713775635, + "eval_runtime": 591.5822, + "eval_samples_per_second": 47.883, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0005015600934351041, + "step": 5000 + }, + { + "epoch": 0.884299708763569, + "grad_norm": 2.963513135910034, + "learning_rate": 4.1156222418358345e-05, + "loss": 0.8104, + "step": 5010 + }, + { + "epoch": 0.8860647780425381, + "grad_norm": 1.7745774984359741, + "learning_rate": 4.113857016769638e-05, + "loss": 0.7374, + "step": 5020 + }, + { + "epoch": 0.8878298473215074, + "grad_norm": 0.7581727504730225, + "learning_rate": 4.112091791703442e-05, + "loss": 0.6546, + "step": 5030 + }, + { + "epoch": 0.8895949166004765, + "grad_norm": 0.9805993437767029, + "learning_rate": 4.110326566637246e-05, + "loss": 0.7799, + "step": 5040 + }, + { + "epoch": 0.8913599858794458, + "grad_norm": 2.659174919128418, + "learning_rate": 4.10856134157105e-05, + "loss": 0.8281, + "step": 5050 + }, + { + "epoch": 0.893125055158415, + "grad_norm": 1.8057835102081299, + "learning_rate": 4.1067961165048544e-05, + "loss": 0.7072, + "step": 5060 + }, + { + "epoch": 0.8948901244373841, + "grad_norm": 0.9309649467468262, + "learning_rate": 4.1050308914386586e-05, + "loss": 0.8069, + "step": 5070 + }, + { + "epoch": 0.8966551937163534, + "grad_norm": 2.148200273513794, + "learning_rate": 4.103265666372463e-05, + "loss": 0.7694, + "step": 5080 + }, + { + "epoch": 0.8984202629953225, + "grad_norm": 0.7988691329956055, + "learning_rate": 4.101500441306267e-05, + "loss": 0.6688, + "step": 5090 + }, + { + "epoch": 0.9001853322742918, + "grad_norm": 1.795934796333313, + "learning_rate": 4.099735216240071e-05, + "loss": 0.7281, + "step": 5100 + }, + { + "epoch": 0.901950401553261, + "grad_norm": 1.2138944864273071, + "learning_rate": 4.097969991173875e-05, + "loss": 0.6958, + "step": 5110 + }, + { + "epoch": 0.9037154708322301, + "grad_norm": 0.8563159704208374, + "learning_rate": 4.096204766107679e-05, + "loss": 0.7041, + "step": 5120 + }, + { + "epoch": 0.9054805401111994, + "grad_norm": 1.0345391035079956, + "learning_rate": 4.094439541041483e-05, + "loss": 0.6316, + "step": 5130 + }, + { + "epoch": 0.9072456093901685, + "grad_norm": 1.0438239574432373, + "learning_rate": 4.0926743159752874e-05, + "loss": 0.7822, + "step": 5140 + }, + { + "epoch": 0.9090106786691378, + "grad_norm": 2.5692899227142334, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.8251, + "step": 5150 + }, + { + "epoch": 0.910775747948107, + "grad_norm": 0.9563241004943848, + "learning_rate": 4.0891438658428956e-05, + "loss": 0.8355, + "step": 5160 + }, + { + "epoch": 0.9125408172270761, + "grad_norm": 2.4111404418945312, + "learning_rate": 4.087378640776699e-05, + "loss": 0.7656, + "step": 5170 + }, + { + "epoch": 0.9143058865060454, + "grad_norm": 2.1064178943634033, + "learning_rate": 4.085613415710503e-05, + "loss": 0.6923, + "step": 5180 + }, + { + "epoch": 0.9160709557850145, + "grad_norm": 1.022700548171997, + "learning_rate": 4.083848190644307e-05, + "loss": 0.7944, + "step": 5190 + }, + { + "epoch": 0.9178360250639838, + "grad_norm": 1.8107175827026367, + "learning_rate": 4.0820829655781114e-05, + "loss": 0.7349, + "step": 5200 + }, + { + "epoch": 0.919601094342953, + "grad_norm": 2.170214891433716, + "learning_rate": 4.0803177405119155e-05, + "loss": 0.7676, + "step": 5210 + }, + { + "epoch": 0.9213661636219221, + "grad_norm": 2.0516133308410645, + "learning_rate": 4.0785525154457196e-05, + "loss": 0.8233, + "step": 5220 + }, + { + "epoch": 0.9231312329008914, + "grad_norm": 2.494417428970337, + "learning_rate": 4.076787290379524e-05, + "loss": 0.7054, + "step": 5230 + }, + { + "epoch": 0.9248963021798605, + "grad_norm": 2.1171908378601074, + "learning_rate": 4.075022065313328e-05, + "loss": 0.6854, + "step": 5240 + }, + { + "epoch": 0.9266613714588298, + "grad_norm": 0.9174603223800659, + "learning_rate": 4.073256840247132e-05, + "loss": 0.6768, + "step": 5250 + }, + { + "epoch": 0.928426440737799, + "grad_norm": 2.3948967456817627, + "learning_rate": 4.0714916151809354e-05, + "loss": 0.8037, + "step": 5260 + }, + { + "epoch": 0.9301915100167681, + "grad_norm": 1.9202173948287964, + "learning_rate": 4.0697263901147395e-05, + "loss": 0.747, + "step": 5270 + }, + { + "epoch": 0.9319565792957374, + "grad_norm": 2.5543434619903564, + "learning_rate": 4.0679611650485436e-05, + "loss": 0.7877, + "step": 5280 + }, + { + "epoch": 0.9337216485747065, + "grad_norm": 2.527691602706909, + "learning_rate": 4.066195939982348e-05, + "loss": 0.7361, + "step": 5290 + }, + { + "epoch": 0.9354867178536758, + "grad_norm": 1.1392663717269897, + "learning_rate": 4.064430714916152e-05, + "loss": 0.7346, + "step": 5300 + }, + { + "epoch": 0.937251787132645, + "grad_norm": 2.4553403854370117, + "learning_rate": 4.062665489849956e-05, + "loss": 0.8383, + "step": 5310 + }, + { + "epoch": 0.9390168564116141, + "grad_norm": 1.9993935823440552, + "learning_rate": 4.06090026478376e-05, + "loss": 0.7392, + "step": 5320 + }, + { + "epoch": 0.9407819256905834, + "grad_norm": 0.6954941749572754, + "learning_rate": 4.059135039717564e-05, + "loss": 0.7452, + "step": 5330 + }, + { + "epoch": 0.9425469949695525, + "grad_norm": 1.6930773258209229, + "learning_rate": 4.057369814651368e-05, + "loss": 0.7071, + "step": 5340 + }, + { + "epoch": 0.9443120642485218, + "grad_norm": 0.7938792705535889, + "learning_rate": 4.0556045895851724e-05, + "loss": 0.6892, + "step": 5350 + }, + { + "epoch": 0.9460771335274909, + "grad_norm": 0.7790454030036926, + "learning_rate": 4.0538393645189765e-05, + "loss": 0.6887, + "step": 5360 + }, + { + "epoch": 0.9478422028064601, + "grad_norm": 1.0746580362319946, + "learning_rate": 4.0520741394527806e-05, + "loss": 0.7402, + "step": 5370 + }, + { + "epoch": 0.9496072720854294, + "grad_norm": 2.610752582550049, + "learning_rate": 4.050308914386585e-05, + "loss": 0.7915, + "step": 5380 + }, + { + "epoch": 0.9513723413643985, + "grad_norm": 2.0769665241241455, + "learning_rate": 4.048543689320389e-05, + "loss": 0.6858, + "step": 5390 + }, + { + "epoch": 0.9531374106433678, + "grad_norm": 0.7704333662986755, + "learning_rate": 4.046778464254193e-05, + "loss": 0.7199, + "step": 5400 + }, + { + "epoch": 0.9549024799223369, + "grad_norm": 1.770251750946045, + "learning_rate": 4.045013239187997e-05, + "loss": 0.6665, + "step": 5410 + }, + { + "epoch": 0.9566675492013061, + "grad_norm": 3.3667657375335693, + "learning_rate": 4.043248014121801e-05, + "loss": 0.6992, + "step": 5420 + }, + { + "epoch": 0.9584326184802754, + "grad_norm": 2.0577900409698486, + "learning_rate": 4.0414827890556047e-05, + "loss": 0.728, + "step": 5430 + }, + { + "epoch": 0.9601976877592445, + "grad_norm": 0.8403862118721008, + "learning_rate": 4.039717563989409e-05, + "loss": 0.8615, + "step": 5440 + }, + { + "epoch": 0.9619627570382138, + "grad_norm": 2.633230686187744, + "learning_rate": 4.037952338923213e-05, + "loss": 0.7057, + "step": 5450 + }, + { + "epoch": 0.9637278263171829, + "grad_norm": 1.1049001216888428, + "learning_rate": 4.036187113857017e-05, + "loss": 0.755, + "step": 5460 + }, + { + "epoch": 0.9654928955961521, + "grad_norm": 2.386627435684204, + "learning_rate": 4.034421888790821e-05, + "loss": 0.7846, + "step": 5470 + }, + { + "epoch": 0.9672579648751214, + "grad_norm": 2.9040069580078125, + "learning_rate": 4.032656663724625e-05, + "loss": 0.7653, + "step": 5480 + }, + { + "epoch": 0.9690230341540905, + "grad_norm": 2.9959592819213867, + "learning_rate": 4.0308914386584287e-05, + "loss": 0.7296, + "step": 5490 + }, + { + "epoch": 0.9707881034330598, + "grad_norm": 3.807882785797119, + "learning_rate": 4.029126213592233e-05, + "loss": 0.7168, + "step": 5500 + }, + { + "epoch": 0.9725531727120289, + "grad_norm": 1.1350913047790527, + "learning_rate": 4.027360988526037e-05, + "loss": 0.7343, + "step": 5510 + }, + { + "epoch": 0.9743182419909981, + "grad_norm": 0.6422159671783447, + "learning_rate": 4.025595763459841e-05, + "loss": 0.6593, + "step": 5520 + }, + { + "epoch": 0.9760833112699674, + "grad_norm": 1.7712031602859497, + "learning_rate": 4.023830538393645e-05, + "loss": 0.6703, + "step": 5530 + }, + { + "epoch": 0.9778483805489365, + "grad_norm": 3.241684675216675, + "learning_rate": 4.022065313327449e-05, + "loss": 0.7294, + "step": 5540 + }, + { + "epoch": 0.9796134498279058, + "grad_norm": 2.6312735080718994, + "learning_rate": 4.0203000882612533e-05, + "loss": 0.6784, + "step": 5550 + }, + { + "epoch": 0.9813785191068749, + "grad_norm": 0.8754311800003052, + "learning_rate": 4.0185348631950575e-05, + "loss": 0.7842, + "step": 5560 + }, + { + "epoch": 0.9831435883858441, + "grad_norm": 2.187657594680786, + "learning_rate": 4.0167696381288616e-05, + "loss": 0.7507, + "step": 5570 + }, + { + "epoch": 0.9849086576648134, + "grad_norm": 0.8857598304748535, + "learning_rate": 4.015004413062666e-05, + "loss": 0.761, + "step": 5580 + }, + { + "epoch": 0.9866737269437825, + "grad_norm": 1.6948868036270142, + "learning_rate": 4.01323918799647e-05, + "loss": 0.7893, + "step": 5590 + }, + { + "epoch": 0.9884387962227518, + "grad_norm": 2.731844902038574, + "learning_rate": 4.011473962930274e-05, + "loss": 0.752, + "step": 5600 + }, + { + "epoch": 0.9902038655017209, + "grad_norm": 2.4988324642181396, + "learning_rate": 4.009708737864078e-05, + "loss": 0.6898, + "step": 5610 + }, + { + "epoch": 0.9919689347806901, + "grad_norm": 2.6746983528137207, + "learning_rate": 4.007943512797882e-05, + "loss": 0.7696, + "step": 5620 + }, + { + "epoch": 0.9937340040596594, + "grad_norm": 0.6741234064102173, + "learning_rate": 4.006178287731686e-05, + "loss": 0.6574, + "step": 5630 + }, + { + "epoch": 0.9954990733386285, + "grad_norm": 0.7766237258911133, + "learning_rate": 4.0044130626654904e-05, + "loss": 0.6623, + "step": 5640 + }, + { + "epoch": 0.9972641426175978, + "grad_norm": 0.9466866254806519, + "learning_rate": 4.0026478375992945e-05, + "loss": 0.7363, + "step": 5650 + }, + { + "epoch": 0.9990292118965669, + "grad_norm": 2.129058599472046, + "learning_rate": 4.0008826125330986e-05, + "loss": 0.6279, + "step": 5660 + }, + { + "epoch": 1.0007942811755361, + "grad_norm": 1.1645121574401855, + "learning_rate": 3.999117387466903e-05, + "loss": 0.7561, + "step": 5670 + }, + { + "epoch": 1.0025593504545054, + "grad_norm": 0.7823759913444519, + "learning_rate": 3.997352162400707e-05, + "loss": 0.5656, + "step": 5680 + }, + { + "epoch": 1.0043244197334746, + "grad_norm": 1.0414769649505615, + "learning_rate": 3.995586937334511e-05, + "loss": 0.567, + "step": 5690 + }, + { + "epoch": 1.0060894890124437, + "grad_norm": 1.495110273361206, + "learning_rate": 3.9938217122683144e-05, + "loss": 0.6325, + "step": 5700 + }, + { + "epoch": 1.007854558291413, + "grad_norm": 0.8955295085906982, + "learning_rate": 3.9920564872021185e-05, + "loss": 0.7344, + "step": 5710 + }, + { + "epoch": 1.0096196275703821, + "grad_norm": 2.1746819019317627, + "learning_rate": 3.9902912621359226e-05, + "loss": 0.6629, + "step": 5720 + }, + { + "epoch": 1.0113846968493514, + "grad_norm": 0.9131489396095276, + "learning_rate": 3.988526037069726e-05, + "loss": 0.5815, + "step": 5730 + }, + { + "epoch": 1.0131497661283206, + "grad_norm": 1.0117508172988892, + "learning_rate": 3.98676081200353e-05, + "loss": 0.5926, + "step": 5740 + }, + { + "epoch": 1.0149148354072897, + "grad_norm": 1.6003496646881104, + "learning_rate": 3.984995586937334e-05, + "loss": 0.5605, + "step": 5750 + }, + { + "epoch": 1.016679904686259, + "grad_norm": 1.622113823890686, + "learning_rate": 3.9832303618711384e-05, + "loss": 0.6624, + "step": 5760 + }, + { + "epoch": 1.0184449739652282, + "grad_norm": 2.4824321269989014, + "learning_rate": 3.9814651368049425e-05, + "loss": 0.6636, + "step": 5770 + }, + { + "epoch": 1.0202100432441974, + "grad_norm": 3.0618515014648438, + "learning_rate": 3.9796999117387466e-05, + "loss": 0.7127, + "step": 5780 + }, + { + "epoch": 1.0219751125231666, + "grad_norm": 1.2835907936096191, + "learning_rate": 3.977934686672551e-05, + "loss": 0.5987, + "step": 5790 + }, + { + "epoch": 1.0237401818021357, + "grad_norm": 2.8061492443084717, + "learning_rate": 3.976169461606355e-05, + "loss": 0.7181, + "step": 5800 + }, + { + "epoch": 1.025505251081105, + "grad_norm": 1.2812072038650513, + "learning_rate": 3.974404236540159e-05, + "loss": 0.6901, + "step": 5810 + }, + { + "epoch": 1.0272703203600742, + "grad_norm": 4.024072647094727, + "learning_rate": 3.972639011473963e-05, + "loss": 0.5881, + "step": 5820 + }, + { + "epoch": 1.0290353896390434, + "grad_norm": 2.0519351959228516, + "learning_rate": 3.970873786407767e-05, + "loss": 0.6031, + "step": 5830 + }, + { + "epoch": 1.0308004589180126, + "grad_norm": 1.9414854049682617, + "learning_rate": 3.969108561341571e-05, + "loss": 0.659, + "step": 5840 + }, + { + "epoch": 1.0325655281969817, + "grad_norm": 0.7736496925354004, + "learning_rate": 3.9673433362753754e-05, + "loss": 0.6354, + "step": 5850 + }, + { + "epoch": 1.034330597475951, + "grad_norm": 2.337108850479126, + "learning_rate": 3.9655781112091795e-05, + "loss": 0.6873, + "step": 5860 + }, + { + "epoch": 1.0360956667549202, + "grad_norm": 2.7758662700653076, + "learning_rate": 3.9638128861429837e-05, + "loss": 0.619, + "step": 5870 + }, + { + "epoch": 1.0378607360338894, + "grad_norm": 0.7028997540473938, + "learning_rate": 3.962047661076788e-05, + "loss": 0.5763, + "step": 5880 + }, + { + "epoch": 1.0396258053128586, + "grad_norm": 1.7901548147201538, + "learning_rate": 3.960282436010592e-05, + "loss": 0.5724, + "step": 5890 + }, + { + "epoch": 1.0413908745918277, + "grad_norm": 2.513047218322754, + "learning_rate": 3.958517210944396e-05, + "loss": 0.7337, + "step": 5900 + }, + { + "epoch": 1.043155943870797, + "grad_norm": 1.1275721788406372, + "learning_rate": 3.9567519858782e-05, + "loss": 0.6653, + "step": 5910 + }, + { + "epoch": 1.0449210131497662, + "grad_norm": 0.7849834561347961, + "learning_rate": 3.954986760812004e-05, + "loss": 0.6944, + "step": 5920 + }, + { + "epoch": 1.0466860824287354, + "grad_norm": 2.418022632598877, + "learning_rate": 3.953221535745808e-05, + "loss": 0.6294, + "step": 5930 + }, + { + "epoch": 1.0484511517077044, + "grad_norm": 1.0242151021957397, + "learning_rate": 3.951456310679612e-05, + "loss": 0.6257, + "step": 5940 + }, + { + "epoch": 1.0502162209866737, + "grad_norm": 1.2218064069747925, + "learning_rate": 3.949691085613416e-05, + "loss": 0.6824, + "step": 5950 + }, + { + "epoch": 1.051981290265643, + "grad_norm": 2.2518460750579834, + "learning_rate": 3.94792586054722e-05, + "loss": 0.7105, + "step": 5960 + }, + { + "epoch": 1.0537463595446122, + "grad_norm": 0.7626746892929077, + "learning_rate": 3.9461606354810235e-05, + "loss": 0.5903, + "step": 5970 + }, + { + "epoch": 1.0555114288235814, + "grad_norm": 2.1651909351348877, + "learning_rate": 3.9443954104148276e-05, + "loss": 0.5997, + "step": 5980 + }, + { + "epoch": 1.0572764981025504, + "grad_norm": 3.152777910232544, + "learning_rate": 3.942630185348632e-05, + "loss": 0.7502, + "step": 5990 + }, + { + "epoch": 1.0590415673815197, + "grad_norm": 0.6665771007537842, + "learning_rate": 3.940864960282436e-05, + "loss": 0.6785, + "step": 6000 + }, + { + "epoch": 1.0590415673815197, + "eval_loss": 0.6956175565719604, + "eval_runtime": 591.8022, + "eval_samples_per_second": 47.866, + "eval_steps_per_second": 2.394, + "eval_token_accuracy": 0.0005115060442964234, + "step": 6000 + }, + { + "epoch": 1.060806636660489, + "grad_norm": 1.2140997648239136, + "learning_rate": 3.93909973521624e-05, + "loss": 0.6932, + "step": 6010 + }, + { + "epoch": 1.0625717059394582, + "grad_norm": 1.9795242547988892, + "learning_rate": 3.937511032656664e-05, + "loss": 0.6777, + "step": 6020 + }, + { + "epoch": 1.0643367752184274, + "grad_norm": 2.0826520919799805, + "learning_rate": 3.935745807590468e-05, + "loss": 0.7064, + "step": 6030 + }, + { + "epoch": 1.0661018444973964, + "grad_norm": 2.58691143989563, + "learning_rate": 3.933980582524272e-05, + "loss": 0.7037, + "step": 6040 + }, + { + "epoch": 1.0678669137763657, + "grad_norm": 2.3377749919891357, + "learning_rate": 3.932215357458076e-05, + "loss": 0.6285, + "step": 6050 + }, + { + "epoch": 1.069631983055335, + "grad_norm": 2.8440327644348145, + "learning_rate": 3.93045013239188e-05, + "loss": 0.6828, + "step": 6060 + }, + { + "epoch": 1.0713970523343042, + "grad_norm": 1.790805459022522, + "learning_rate": 3.928684907325684e-05, + "loss": 0.5888, + "step": 6070 + }, + { + "epoch": 1.0731621216132734, + "grad_norm": 0.8189541697502136, + "learning_rate": 3.9269196822594884e-05, + "loss": 0.6213, + "step": 6080 + }, + { + "epoch": 1.0749271908922424, + "grad_norm": 1.0348247289657593, + "learning_rate": 3.9251544571932925e-05, + "loss": 0.6106, + "step": 6090 + }, + { + "epoch": 1.0766922601712117, + "grad_norm": 1.8420778512954712, + "learning_rate": 3.9233892321270966e-05, + "loss": 0.6458, + "step": 6100 + }, + { + "epoch": 1.078457329450181, + "grad_norm": 0.9982885122299194, + "learning_rate": 3.921624007060901e-05, + "loss": 0.6307, + "step": 6110 + }, + { + "epoch": 1.0802223987291502, + "grad_norm": 3.139690399169922, + "learning_rate": 3.919858781994705e-05, + "loss": 0.6454, + "step": 6120 + }, + { + "epoch": 1.0819874680081194, + "grad_norm": 0.9377104640007019, + "learning_rate": 3.918093556928509e-05, + "loss": 0.7181, + "step": 6130 + }, + { + "epoch": 1.0837525372870884, + "grad_norm": 0.8897203207015991, + "learning_rate": 3.916328331862313e-05, + "loss": 0.5477, + "step": 6140 + }, + { + "epoch": 1.0855176065660577, + "grad_norm": 2.0230560302734375, + "learning_rate": 3.914563106796117e-05, + "loss": 0.6154, + "step": 6150 + }, + { + "epoch": 1.087282675845027, + "grad_norm": 1.1352148056030273, + "learning_rate": 3.912797881729921e-05, + "loss": 0.658, + "step": 6160 + }, + { + "epoch": 1.0890477451239962, + "grad_norm": 1.4181119203567505, + "learning_rate": 3.911032656663725e-05, + "loss": 0.5686, + "step": 6170 + }, + { + "epoch": 1.0908128144029654, + "grad_norm": 2.281613349914551, + "learning_rate": 3.909267431597529e-05, + "loss": 0.6839, + "step": 6180 + }, + { + "epoch": 1.0925778836819344, + "grad_norm": 2.092272996902466, + "learning_rate": 3.907502206531333e-05, + "loss": 0.6573, + "step": 6190 + }, + { + "epoch": 1.0943429529609037, + "grad_norm": 1.9302388429641724, + "learning_rate": 3.905736981465137e-05, + "loss": 0.5752, + "step": 6200 + }, + { + "epoch": 1.096108022239873, + "grad_norm": 0.6753134727478027, + "learning_rate": 3.903971756398941e-05, + "loss": 0.6363, + "step": 6210 + }, + { + "epoch": 1.0978730915188422, + "grad_norm": 3.038670539855957, + "learning_rate": 3.9022065313327446e-05, + "loss": 0.606, + "step": 6220 + }, + { + "epoch": 1.0996381607978114, + "grad_norm": 0.8844441175460815, + "learning_rate": 3.900441306266549e-05, + "loss": 0.5935, + "step": 6230 + }, + { + "epoch": 1.1014032300767804, + "grad_norm": 3.831115245819092, + "learning_rate": 3.898676081200353e-05, + "loss": 0.7762, + "step": 6240 + }, + { + "epoch": 1.1031682993557497, + "grad_norm": 0.9423919916152954, + "learning_rate": 3.896910856134157e-05, + "loss": 0.6149, + "step": 6250 + }, + { + "epoch": 1.104933368634719, + "grad_norm": 0.7882218956947327, + "learning_rate": 3.895145631067961e-05, + "loss": 0.5544, + "step": 6260 + }, + { + "epoch": 1.1066984379136882, + "grad_norm": 3.2080211639404297, + "learning_rate": 3.893380406001765e-05, + "loss": 0.6712, + "step": 6270 + }, + { + "epoch": 1.1084635071926574, + "grad_norm": 0.7996103167533875, + "learning_rate": 3.891615180935569e-05, + "loss": 0.5802, + "step": 6280 + }, + { + "epoch": 1.1102285764716264, + "grad_norm": 1.8485013246536255, + "learning_rate": 3.8898499558693734e-05, + "loss": 0.6356, + "step": 6290 + }, + { + "epoch": 1.1119936457505957, + "grad_norm": 1.0424914360046387, + "learning_rate": 3.8880847308031775e-05, + "loss": 0.6124, + "step": 6300 + }, + { + "epoch": 1.113758715029565, + "grad_norm": 1.83061683177948, + "learning_rate": 3.8863195057369816e-05, + "loss": 0.718, + "step": 6310 + }, + { + "epoch": 1.1155237843085342, + "grad_norm": 1.0995100736618042, + "learning_rate": 3.884554280670786e-05, + "loss": 0.6101, + "step": 6320 + }, + { + "epoch": 1.1172888535875034, + "grad_norm": 1.6366883516311646, + "learning_rate": 3.88278905560459e-05, + "loss": 0.6742, + "step": 6330 + }, + { + "epoch": 1.1190539228664724, + "grad_norm": 2.9243485927581787, + "learning_rate": 3.881023830538394e-05, + "loss": 0.6337, + "step": 6340 + }, + { + "epoch": 1.1208189921454417, + "grad_norm": 3.8549795150756836, + "learning_rate": 3.879258605472198e-05, + "loss": 0.649, + "step": 6350 + }, + { + "epoch": 1.122584061424411, + "grad_norm": 2.9887771606445312, + "learning_rate": 3.877493380406002e-05, + "loss": 0.6472, + "step": 6360 + }, + { + "epoch": 1.1243491307033802, + "grad_norm": 3.433417797088623, + "learning_rate": 3.875728155339806e-05, + "loss": 0.7064, + "step": 6370 + }, + { + "epoch": 1.1261141999823492, + "grad_norm": 1.2327500581741333, + "learning_rate": 3.8739629302736105e-05, + "loss": 0.8526, + "step": 6380 + }, + { + "epoch": 1.1278792692613184, + "grad_norm": 1.535632848739624, + "learning_rate": 3.8721977052074146e-05, + "loss": 0.7019, + "step": 6390 + }, + { + "epoch": 1.1296443385402877, + "grad_norm": 2.638556718826294, + "learning_rate": 3.870432480141219e-05, + "loss": 0.6715, + "step": 6400 + }, + { + "epoch": 1.131409407819257, + "grad_norm": 2.2223639488220215, + "learning_rate": 3.868667255075023e-05, + "loss": 0.628, + "step": 6410 + }, + { + "epoch": 1.1331744770982262, + "grad_norm": 0.6442582011222839, + "learning_rate": 3.866902030008826e-05, + "loss": 0.5226, + "step": 6420 + }, + { + "epoch": 1.1349395463771952, + "grad_norm": 0.9084598422050476, + "learning_rate": 3.8651368049426303e-05, + "loss": 0.6445, + "step": 6430 + }, + { + "epoch": 1.1367046156561644, + "grad_norm": 0.926341712474823, + "learning_rate": 3.8633715798764345e-05, + "loss": 0.6636, + "step": 6440 + }, + { + "epoch": 1.1384696849351337, + "grad_norm": 0.653100848197937, + "learning_rate": 3.8616063548102386e-05, + "loss": 0.5924, + "step": 6450 + }, + { + "epoch": 1.140234754214103, + "grad_norm": 1.7288234233856201, + "learning_rate": 3.859841129744042e-05, + "loss": 0.7612, + "step": 6460 + }, + { + "epoch": 1.1419998234930722, + "grad_norm": 2.005732536315918, + "learning_rate": 3.858075904677846e-05, + "loss": 0.6961, + "step": 6470 + }, + { + "epoch": 1.1437648927720412, + "grad_norm": 1.5119543075561523, + "learning_rate": 3.85631067961165e-05, + "loss": 0.5927, + "step": 6480 + }, + { + "epoch": 1.1455299620510104, + "grad_norm": 0.7860826849937439, + "learning_rate": 3.8545454545454544e-05, + "loss": 0.6218, + "step": 6490 + }, + { + "epoch": 1.1472950313299797, + "grad_norm": 0.9668664336204529, + "learning_rate": 3.8527802294792585e-05, + "loss": 0.6406, + "step": 6500 + }, + { + "epoch": 1.149060100608949, + "grad_norm": 1.3885242938995361, + "learning_rate": 3.8510150044130626e-05, + "loss": 0.6488, + "step": 6510 + }, + { + "epoch": 1.1508251698879182, + "grad_norm": 2.342130422592163, + "learning_rate": 3.849249779346867e-05, + "loss": 0.6748, + "step": 6520 + }, + { + "epoch": 1.1525902391668872, + "grad_norm": 1.0215253829956055, + "learning_rate": 3.847484554280671e-05, + "loss": 0.6612, + "step": 6530 + }, + { + "epoch": 1.1543553084458564, + "grad_norm": 1.2095513343811035, + "learning_rate": 3.845719329214475e-05, + "loss": 0.6519, + "step": 6540 + }, + { + "epoch": 1.1561203777248257, + "grad_norm": 2.6220827102661133, + "learning_rate": 3.843954104148279e-05, + "loss": 0.6615, + "step": 6550 + }, + { + "epoch": 1.157885447003795, + "grad_norm": 1.501598596572876, + "learning_rate": 3.842188879082083e-05, + "loss": 0.6628, + "step": 6560 + }, + { + "epoch": 1.1596505162827642, + "grad_norm": 0.7498106360435486, + "learning_rate": 3.840423654015887e-05, + "loss": 0.5555, + "step": 6570 + }, + { + "epoch": 1.1614155855617332, + "grad_norm": 0.6536363363265991, + "learning_rate": 3.8386584289496914e-05, + "loss": 0.6255, + "step": 6580 + }, + { + "epoch": 1.1631806548407024, + "grad_norm": 2.851771831512451, + "learning_rate": 3.8368932038834955e-05, + "loss": 0.7082, + "step": 6590 + }, + { + "epoch": 1.1649457241196717, + "grad_norm": 1.1639115810394287, + "learning_rate": 3.8351279788172996e-05, + "loss": 0.6362, + "step": 6600 + }, + { + "epoch": 1.166710793398641, + "grad_norm": 2.0477452278137207, + "learning_rate": 3.833362753751104e-05, + "loss": 0.6903, + "step": 6610 + }, + { + "epoch": 1.1684758626776102, + "grad_norm": 0.8935057520866394, + "learning_rate": 3.831597528684908e-05, + "loss": 0.69, + "step": 6620 + }, + { + "epoch": 1.1702409319565792, + "grad_norm": 0.8473203778266907, + "learning_rate": 3.829832303618712e-05, + "loss": 0.5369, + "step": 6630 + }, + { + "epoch": 1.1720060012355484, + "grad_norm": 0.7587894797325134, + "learning_rate": 3.828067078552516e-05, + "loss": 0.6086, + "step": 6640 + }, + { + "epoch": 1.1737710705145177, + "grad_norm": 2.5984623432159424, + "learning_rate": 3.82630185348632e-05, + "loss": 0.6499, + "step": 6650 + }, + { + "epoch": 1.175536139793487, + "grad_norm": 1.1337202787399292, + "learning_rate": 3.8245366284201236e-05, + "loss": 0.6571, + "step": 6660 + }, + { + "epoch": 1.1773012090724562, + "grad_norm": 0.6955274939537048, + "learning_rate": 3.822771403353928e-05, + "loss": 0.6734, + "step": 6670 + }, + { + "epoch": 1.1790662783514252, + "grad_norm": 2.8740875720977783, + "learning_rate": 3.821006178287732e-05, + "loss": 0.7027, + "step": 6680 + }, + { + "epoch": 1.1808313476303944, + "grad_norm": 2.8766727447509766, + "learning_rate": 3.819240953221536e-05, + "loss": 0.7541, + "step": 6690 + }, + { + "epoch": 1.1825964169093637, + "grad_norm": 3.3576607704162598, + "learning_rate": 3.8174757281553394e-05, + "loss": 0.6901, + "step": 6700 + }, + { + "epoch": 1.184361486188333, + "grad_norm": 1.2829562425613403, + "learning_rate": 3.8157105030891435e-05, + "loss": 0.6446, + "step": 6710 + }, + { + "epoch": 1.1861265554673022, + "grad_norm": 0.8178977370262146, + "learning_rate": 3.8139452780229476e-05, + "loss": 0.5423, + "step": 6720 + }, + { + "epoch": 1.1878916247462712, + "grad_norm": 0.8948667645454407, + "learning_rate": 3.812180052956752e-05, + "loss": 0.6114, + "step": 6730 + }, + { + "epoch": 1.1896566940252404, + "grad_norm": 0.9846989512443542, + "learning_rate": 3.810414827890556e-05, + "loss": 0.6515, + "step": 6740 + }, + { + "epoch": 1.1914217633042097, + "grad_norm": 0.9987642765045166, + "learning_rate": 3.80864960282436e-05, + "loss": 0.7103, + "step": 6750 + }, + { + "epoch": 1.193186832583179, + "grad_norm": 2.2832117080688477, + "learning_rate": 3.806884377758164e-05, + "loss": 0.6537, + "step": 6760 + }, + { + "epoch": 1.1949519018621482, + "grad_norm": 1.747611403465271, + "learning_rate": 3.805119152691968e-05, + "loss": 0.5134, + "step": 6770 + }, + { + "epoch": 1.1967169711411172, + "grad_norm": 2.011439561843872, + "learning_rate": 3.803353927625772e-05, + "loss": 0.6191, + "step": 6780 + }, + { + "epoch": 1.1984820404200864, + "grad_norm": 1.4673908948898315, + "learning_rate": 3.8015887025595764e-05, + "loss": 0.601, + "step": 6790 + }, + { + "epoch": 1.2002471096990557, + "grad_norm": 0.9508843421936035, + "learning_rate": 3.7998234774933806e-05, + "loss": 0.5438, + "step": 6800 + }, + { + "epoch": 1.202012178978025, + "grad_norm": 0.8046093583106995, + "learning_rate": 3.798058252427185e-05, + "loss": 0.6197, + "step": 6810 + }, + { + "epoch": 1.2037772482569942, + "grad_norm": 0.741568386554718, + "learning_rate": 3.796293027360989e-05, + "loss": 0.6795, + "step": 6820 + }, + { + "epoch": 1.2055423175359632, + "grad_norm": 0.751842200756073, + "learning_rate": 3.794527802294793e-05, + "loss": 0.626, + "step": 6830 + }, + { + "epoch": 1.2073073868149324, + "grad_norm": 1.0445666313171387, + "learning_rate": 3.792762577228597e-05, + "loss": 0.7252, + "step": 6840 + }, + { + "epoch": 1.2090724560939017, + "grad_norm": 0.7413420677185059, + "learning_rate": 3.790997352162401e-05, + "loss": 0.6268, + "step": 6850 + }, + { + "epoch": 1.210837525372871, + "grad_norm": 0.6899815201759338, + "learning_rate": 3.789232127096205e-05, + "loss": 0.643, + "step": 6860 + }, + { + "epoch": 1.2126025946518402, + "grad_norm": 2.5429368019104004, + "learning_rate": 3.7874669020300094e-05, + "loss": 0.6941, + "step": 6870 + }, + { + "epoch": 1.2143676639308092, + "grad_norm": 0.969836950302124, + "learning_rate": 3.7857016769638135e-05, + "loss": 0.6907, + "step": 6880 + }, + { + "epoch": 1.2161327332097784, + "grad_norm": 0.7424036860466003, + "learning_rate": 3.7839364518976176e-05, + "loss": 0.6324, + "step": 6890 + }, + { + "epoch": 1.2178978024887477, + "grad_norm": 2.565000057220459, + "learning_rate": 3.782171226831421e-05, + "loss": 0.6324, + "step": 6900 + }, + { + "epoch": 1.219662871767717, + "grad_norm": 1.3518567085266113, + "learning_rate": 3.780406001765225e-05, + "loss": 0.6398, + "step": 6910 + }, + { + "epoch": 1.2214279410466862, + "grad_norm": 0.7438364624977112, + "learning_rate": 3.778640776699029e-05, + "loss": 0.6396, + "step": 6920 + }, + { + "epoch": 1.2231930103256552, + "grad_norm": 0.72452712059021, + "learning_rate": 3.7768755516328334e-05, + "loss": 0.6357, + "step": 6930 + }, + { + "epoch": 1.2249580796046244, + "grad_norm": 0.9610331058502197, + "learning_rate": 3.7751103265666375e-05, + "loss": 0.6202, + "step": 6940 + }, + { + "epoch": 1.2267231488835937, + "grad_norm": 0.8247737288475037, + "learning_rate": 3.7733451015004416e-05, + "loss": 0.5992, + "step": 6950 + }, + { + "epoch": 1.228488218162563, + "grad_norm": 1.0646226406097412, + "learning_rate": 3.771579876434246e-05, + "loss": 0.6119, + "step": 6960 + }, + { + "epoch": 1.2302532874415322, + "grad_norm": 0.8103353977203369, + "learning_rate": 3.769814651368049e-05, + "loss": 0.6086, + "step": 6970 + }, + { + "epoch": 1.2320183567205012, + "grad_norm": 0.9309709072113037, + "learning_rate": 3.768049426301853e-05, + "loss": 0.6288, + "step": 6980 + }, + { + "epoch": 1.2337834259994704, + "grad_norm": 0.6081012487411499, + "learning_rate": 3.7662842012356574e-05, + "loss": 0.5723, + "step": 6990 + }, + { + "epoch": 1.2355484952784397, + "grad_norm": 2.909627914428711, + "learning_rate": 3.7645189761694615e-05, + "loss": 0.7148, + "step": 7000 + }, + { + "epoch": 1.2355484952784397, + "eval_loss": 0.6834109425544739, + "eval_runtime": 591.6374, + "eval_samples_per_second": 47.879, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004970945644769607, + "step": 7000 + }, + { + "epoch": 1.237313564557409, + "grad_norm": 3.0048515796661377, + "learning_rate": 3.7627537511032656e-05, + "loss": 0.574, + "step": 7010 + }, + { + "epoch": 1.2390786338363782, + "grad_norm": 0.748912513256073, + "learning_rate": 3.76098852603707e-05, + "loss": 0.6341, + "step": 7020 + }, + { + "epoch": 1.2408437031153472, + "grad_norm": 0.8313178420066833, + "learning_rate": 3.759223300970874e-05, + "loss": 0.6569, + "step": 7030 + }, + { + "epoch": 1.2426087723943164, + "grad_norm": 0.7276086211204529, + "learning_rate": 3.757458075904678e-05, + "loss": 0.6759, + "step": 7040 + }, + { + "epoch": 1.2443738416732857, + "grad_norm": 2.707221508026123, + "learning_rate": 3.755692850838482e-05, + "loss": 0.6932, + "step": 7050 + }, + { + "epoch": 1.246138910952255, + "grad_norm": 0.9770756959915161, + "learning_rate": 3.753927625772286e-05, + "loss": 0.6494, + "step": 7060 + }, + { + "epoch": 1.2479039802312242, + "grad_norm": 0.7149855494499207, + "learning_rate": 3.75216240070609e-05, + "loss": 0.615, + "step": 7070 + }, + { + "epoch": 1.2496690495101932, + "grad_norm": 0.9433127045631409, + "learning_rate": 3.7503971756398944e-05, + "loss": 0.6613, + "step": 7080 + }, + { + "epoch": 1.2514341187891624, + "grad_norm": 0.9925957322120667, + "learning_rate": 3.7486319505736985e-05, + "loss": 0.7038, + "step": 7090 + }, + { + "epoch": 1.2531991880681317, + "grad_norm": 2.517293691635132, + "learning_rate": 3.7468667255075026e-05, + "loss": 0.7463, + "step": 7100 + }, + { + "epoch": 1.254964257347101, + "grad_norm": 2.6264331340789795, + "learning_rate": 3.745101500441307e-05, + "loss": 0.6378, + "step": 7110 + }, + { + "epoch": 1.2567293266260702, + "grad_norm": 1.0302218198776245, + "learning_rate": 3.743336275375111e-05, + "loss": 0.6998, + "step": 7120 + }, + { + "epoch": 1.2584943959050392, + "grad_norm": 0.9635699391365051, + "learning_rate": 3.741571050308914e-05, + "loss": 0.5448, + "step": 7130 + }, + { + "epoch": 1.2602594651840084, + "grad_norm": 0.65135258436203, + "learning_rate": 3.7398058252427184e-05, + "loss": 0.619, + "step": 7140 + }, + { + "epoch": 1.2620245344629777, + "grad_norm": 0.7986999154090881, + "learning_rate": 3.7380406001765225e-05, + "loss": 0.7834, + "step": 7150 + }, + { + "epoch": 1.263789603741947, + "grad_norm": 0.9011120200157166, + "learning_rate": 3.7362753751103266e-05, + "loss": 0.6495, + "step": 7160 + }, + { + "epoch": 1.2655546730209162, + "grad_norm": 2.6666321754455566, + "learning_rate": 3.734510150044131e-05, + "loss": 0.6368, + "step": 7170 + }, + { + "epoch": 1.2673197422998852, + "grad_norm": 2.041754722595215, + "learning_rate": 3.732744924977935e-05, + "loss": 0.6377, + "step": 7180 + }, + { + "epoch": 1.2690848115788544, + "grad_norm": 0.9585306644439697, + "learning_rate": 3.730979699911739e-05, + "loss": 0.6688, + "step": 7190 + }, + { + "epoch": 1.2708498808578237, + "grad_norm": 2.1470253467559814, + "learning_rate": 3.729214474845543e-05, + "loss": 0.6592, + "step": 7200 + }, + { + "epoch": 1.272614950136793, + "grad_norm": 3.2827067375183105, + "learning_rate": 3.727449249779347e-05, + "loss": 0.6111, + "step": 7210 + }, + { + "epoch": 1.2743800194157622, + "grad_norm": 0.5595322251319885, + "learning_rate": 3.725684024713151e-05, + "loss": 0.5442, + "step": 7220 + }, + { + "epoch": 1.2761450886947312, + "grad_norm": 1.8486113548278809, + "learning_rate": 3.723918799646955e-05, + "loss": 0.6346, + "step": 7230 + }, + { + "epoch": 1.2779101579737004, + "grad_norm": 2.427776336669922, + "learning_rate": 3.722153574580759e-05, + "loss": 0.6365, + "step": 7240 + }, + { + "epoch": 1.2796752272526697, + "grad_norm": 0.7573685646057129, + "learning_rate": 3.720388349514563e-05, + "loss": 0.5591, + "step": 7250 + }, + { + "epoch": 1.281440296531639, + "grad_norm": 3.019432783126831, + "learning_rate": 3.718623124448367e-05, + "loss": 0.6491, + "step": 7260 + }, + { + "epoch": 1.2832053658106082, + "grad_norm": 0.7502275109291077, + "learning_rate": 3.716857899382171e-05, + "loss": 0.5882, + "step": 7270 + }, + { + "epoch": 1.2849704350895772, + "grad_norm": 0.9686793684959412, + "learning_rate": 3.7150926743159753e-05, + "loss": 0.5767, + "step": 7280 + }, + { + "epoch": 1.2867355043685464, + "grad_norm": 1.0646675825119019, + "learning_rate": 3.7133274492497795e-05, + "loss": 0.5611, + "step": 7290 + }, + { + "epoch": 1.2885005736475157, + "grad_norm": 0.9285513162612915, + "learning_rate": 3.7115622241835836e-05, + "loss": 0.5345, + "step": 7300 + }, + { + "epoch": 1.290265642926485, + "grad_norm": 2.482447624206543, + "learning_rate": 3.709796999117388e-05, + "loss": 0.6679, + "step": 7310 + }, + { + "epoch": 1.2920307122054542, + "grad_norm": 0.7033592462539673, + "learning_rate": 3.708031774051192e-05, + "loss": 0.657, + "step": 7320 + }, + { + "epoch": 1.2937957814844232, + "grad_norm": 1.1254754066467285, + "learning_rate": 3.706266548984996e-05, + "loss": 0.6271, + "step": 7330 + }, + { + "epoch": 1.2955608507633924, + "grad_norm": 2.424238920211792, + "learning_rate": 3.7045013239188e-05, + "loss": 0.6206, + "step": 7340 + }, + { + "epoch": 1.2973259200423617, + "grad_norm": 1.2514790296554565, + "learning_rate": 3.702736098852604e-05, + "loss": 0.6148, + "step": 7350 + }, + { + "epoch": 1.299090989321331, + "grad_norm": 2.183605909347534, + "learning_rate": 3.700970873786408e-05, + "loss": 0.6309, + "step": 7360 + }, + { + "epoch": 1.3008560586003002, + "grad_norm": 2.991091251373291, + "learning_rate": 3.699205648720212e-05, + "loss": 0.6264, + "step": 7370 + }, + { + "epoch": 1.3026211278792692, + "grad_norm": 2.2229976654052734, + "learning_rate": 3.697440423654016e-05, + "loss": 0.665, + "step": 7380 + }, + { + "epoch": 1.3043861971582384, + "grad_norm": 0.7638149857521057, + "learning_rate": 3.69567519858782e-05, + "loss": 0.682, + "step": 7390 + }, + { + "epoch": 1.3061512664372077, + "grad_norm": 2.2151546478271484, + "learning_rate": 3.693909973521624e-05, + "loss": 0.6459, + "step": 7400 + }, + { + "epoch": 1.307916335716177, + "grad_norm": 1.0009431838989258, + "learning_rate": 3.692144748455428e-05, + "loss": 0.6326, + "step": 7410 + }, + { + "epoch": 1.3096814049951462, + "grad_norm": 2.3861541748046875, + "learning_rate": 3.690379523389232e-05, + "loss": 0.6182, + "step": 7420 + }, + { + "epoch": 1.3114464742741152, + "grad_norm": 0.7620774507522583, + "learning_rate": 3.6886142983230364e-05, + "loss": 0.6425, + "step": 7430 + }, + { + "epoch": 1.3132115435530844, + "grad_norm": 1.1126807928085327, + "learning_rate": 3.6868490732568405e-05, + "loss": 0.6115, + "step": 7440 + }, + { + "epoch": 1.3149766128320537, + "grad_norm": 1.0040702819824219, + "learning_rate": 3.6850838481906446e-05, + "loss": 0.644, + "step": 7450 + }, + { + "epoch": 1.316741682111023, + "grad_norm": 2.9938085079193115, + "learning_rate": 3.683318623124449e-05, + "loss": 0.6142, + "step": 7460 + }, + { + "epoch": 1.3185067513899922, + "grad_norm": 2.7956087589263916, + "learning_rate": 3.681553398058253e-05, + "loss": 0.6253, + "step": 7470 + }, + { + "epoch": 1.3202718206689612, + "grad_norm": 1.8580163717269897, + "learning_rate": 3.679788172992057e-05, + "loss": 0.607, + "step": 7480 + }, + { + "epoch": 1.3220368899479304, + "grad_norm": 2.707329511642456, + "learning_rate": 3.678022947925861e-05, + "loss": 0.6795, + "step": 7490 + }, + { + "epoch": 1.3238019592268997, + "grad_norm": 0.6467975378036499, + "learning_rate": 3.6762577228596645e-05, + "loss": 0.5137, + "step": 7500 + }, + { + "epoch": 1.325567028505869, + "grad_norm": 0.9823447465896606, + "learning_rate": 3.6744924977934686e-05, + "loss": 0.74, + "step": 7510 + }, + { + "epoch": 1.3273320977848382, + "grad_norm": 0.680709183216095, + "learning_rate": 3.672727272727273e-05, + "loss": 0.6245, + "step": 7520 + }, + { + "epoch": 1.3290971670638072, + "grad_norm": 4.53438663482666, + "learning_rate": 3.670962047661077e-05, + "loss": 0.6091, + "step": 7530 + }, + { + "epoch": 1.3308622363427765, + "grad_norm": 0.66978919506073, + "learning_rate": 3.669196822594881e-05, + "loss": 0.5938, + "step": 7540 + }, + { + "epoch": 1.3326273056217457, + "grad_norm": 2.9042139053344727, + "learning_rate": 3.667431597528685e-05, + "loss": 0.6434, + "step": 7550 + }, + { + "epoch": 1.3343923749007147, + "grad_norm": 0.6210140585899353, + "learning_rate": 3.665666372462489e-05, + "loss": 0.6021, + "step": 7560 + }, + { + "epoch": 1.3361574441796842, + "grad_norm": 3.4923391342163086, + "learning_rate": 3.663901147396293e-05, + "loss": 0.6562, + "step": 7570 + }, + { + "epoch": 1.3379225134586532, + "grad_norm": 0.7961561679840088, + "learning_rate": 3.6621359223300974e-05, + "loss": 0.611, + "step": 7580 + }, + { + "epoch": 1.3396875827376225, + "grad_norm": 1.0654687881469727, + "learning_rate": 3.6603706972639015e-05, + "loss": 0.6824, + "step": 7590 + }, + { + "epoch": 1.3414526520165917, + "grad_norm": 3.1214218139648438, + "learning_rate": 3.6586054721977057e-05, + "loss": 0.7111, + "step": 7600 + }, + { + "epoch": 1.3432177212955607, + "grad_norm": 2.3251359462738037, + "learning_rate": 3.656840247131509e-05, + "loss": 0.5993, + "step": 7610 + }, + { + "epoch": 1.3449827905745302, + "grad_norm": 0.804865837097168, + "learning_rate": 3.655075022065313e-05, + "loss": 0.6413, + "step": 7620 + }, + { + "epoch": 1.3467478598534992, + "grad_norm": 3.0983431339263916, + "learning_rate": 3.653309796999117e-05, + "loss": 0.6574, + "step": 7630 + }, + { + "epoch": 1.3485129291324685, + "grad_norm": 0.9018577933311462, + "learning_rate": 3.6515445719329214e-05, + "loss": 0.6052, + "step": 7640 + }, + { + "epoch": 1.3502779984114377, + "grad_norm": 0.7807791233062744, + "learning_rate": 3.6497793468667256e-05, + "loss": 0.5871, + "step": 7650 + }, + { + "epoch": 1.3520430676904067, + "grad_norm": 0.8855323195457458, + "learning_rate": 3.64801412180053e-05, + "loss": 0.6492, + "step": 7660 + }, + { + "epoch": 1.3538081369693762, + "grad_norm": 2.0354232788085938, + "learning_rate": 3.646248896734334e-05, + "loss": 0.6495, + "step": 7670 + }, + { + "epoch": 1.3555732062483452, + "grad_norm": 3.2508060932159424, + "learning_rate": 3.644483671668138e-05, + "loss": 0.7525, + "step": 7680 + }, + { + "epoch": 1.3573382755273145, + "grad_norm": 2.6301016807556152, + "learning_rate": 3.642718446601942e-05, + "loss": 0.5911, + "step": 7690 + }, + { + "epoch": 1.3591033448062837, + "grad_norm": 0.9165176153182983, + "learning_rate": 3.640953221535746e-05, + "loss": 0.6071, + "step": 7700 + }, + { + "epoch": 1.3608684140852527, + "grad_norm": 0.8968174457550049, + "learning_rate": 3.63918799646955e-05, + "loss": 0.6456, + "step": 7710 + }, + { + "epoch": 1.362633483364222, + "grad_norm": 0.6524578332901001, + "learning_rate": 3.6374227714033544e-05, + "loss": 0.5536, + "step": 7720 + }, + { + "epoch": 1.3643985526431912, + "grad_norm": 0.7832716107368469, + "learning_rate": 3.6356575463371585e-05, + "loss": 0.6569, + "step": 7730 + }, + { + "epoch": 1.3661636219221605, + "grad_norm": 2.0386695861816406, + "learning_rate": 3.6338923212709626e-05, + "loss": 0.7066, + "step": 7740 + }, + { + "epoch": 1.3679286912011297, + "grad_norm": 0.8649764657020569, + "learning_rate": 3.632127096204767e-05, + "loss": 0.7365, + "step": 7750 + }, + { + "epoch": 1.3696937604800987, + "grad_norm": 1.356444001197815, + "learning_rate": 3.63036187113857e-05, + "loss": 0.6324, + "step": 7760 + }, + { + "epoch": 1.371458829759068, + "grad_norm": 1.923880934715271, + "learning_rate": 3.628596646072374e-05, + "loss": 0.5376, + "step": 7770 + }, + { + "epoch": 1.3732238990380372, + "grad_norm": 0.9914042353630066, + "learning_rate": 3.6268314210061784e-05, + "loss": 0.5838, + "step": 7780 + }, + { + "epoch": 1.3749889683170065, + "grad_norm": 2.0924298763275146, + "learning_rate": 3.6250661959399825e-05, + "loss": 0.5839, + "step": 7790 + }, + { + "epoch": 1.3767540375959757, + "grad_norm": 1.959044098854065, + "learning_rate": 3.6233009708737866e-05, + "loss": 0.6112, + "step": 7800 + }, + { + "epoch": 1.3785191068749447, + "grad_norm": 2.142839193344116, + "learning_rate": 3.621535745807591e-05, + "loss": 0.6222, + "step": 7810 + }, + { + "epoch": 1.380284176153914, + "grad_norm": 1.0231504440307617, + "learning_rate": 3.619770520741395e-05, + "loss": 0.6006, + "step": 7820 + }, + { + "epoch": 1.3820492454328832, + "grad_norm": 4.729889869689941, + "learning_rate": 3.618005295675199e-05, + "loss": 0.5953, + "step": 7830 + }, + { + "epoch": 1.3838143147118525, + "grad_norm": 1.9787018299102783, + "learning_rate": 3.616240070609003e-05, + "loss": 0.6817, + "step": 7840 + }, + { + "epoch": 1.3855793839908217, + "grad_norm": 0.9792063236236572, + "learning_rate": 3.6144748455428065e-05, + "loss": 0.6927, + "step": 7850 + }, + { + "epoch": 1.3873444532697907, + "grad_norm": 0.8229073882102966, + "learning_rate": 3.6127096204766106e-05, + "loss": 0.6544, + "step": 7860 + }, + { + "epoch": 1.38910952254876, + "grad_norm": 0.9874571561813354, + "learning_rate": 3.610944395410415e-05, + "loss": 0.7415, + "step": 7870 + }, + { + "epoch": 1.3908745918277292, + "grad_norm": 2.853695869445801, + "learning_rate": 3.609179170344219e-05, + "loss": 0.6804, + "step": 7880 + }, + { + "epoch": 1.3926396611066985, + "grad_norm": 0.6411375403404236, + "learning_rate": 3.607413945278023e-05, + "loss": 0.6304, + "step": 7890 + }, + { + "epoch": 1.3944047303856677, + "grad_norm": 0.9372557997703552, + "learning_rate": 3.605648720211827e-05, + "loss": 0.6058, + "step": 7900 + }, + { + "epoch": 1.3961697996646367, + "grad_norm": 2.166592836380005, + "learning_rate": 3.603883495145631e-05, + "loss": 0.694, + "step": 7910 + }, + { + "epoch": 1.397934868943606, + "grad_norm": 2.2341320514678955, + "learning_rate": 3.602118270079435e-05, + "loss": 0.6103, + "step": 7920 + }, + { + "epoch": 1.3996999382225752, + "grad_norm": 3.3916139602661133, + "learning_rate": 3.6003530450132394e-05, + "loss": 0.6624, + "step": 7930 + }, + { + "epoch": 1.4014650075015445, + "grad_norm": 0.7535905838012695, + "learning_rate": 3.5985878199470435e-05, + "loss": 0.5625, + "step": 7940 + }, + { + "epoch": 1.4032300767805137, + "grad_norm": 2.7306909561157227, + "learning_rate": 3.5968225948808476e-05, + "loss": 0.6289, + "step": 7950 + }, + { + "epoch": 1.4049951460594827, + "grad_norm": 3.3049874305725098, + "learning_rate": 3.595057369814652e-05, + "loss": 0.6652, + "step": 7960 + }, + { + "epoch": 1.406760215338452, + "grad_norm": 1.124943494796753, + "learning_rate": 3.593292144748456e-05, + "loss": 0.6471, + "step": 7970 + }, + { + "epoch": 1.4085252846174212, + "grad_norm": 1.1680623292922974, + "learning_rate": 3.59152691968226e-05, + "loss": 0.661, + "step": 7980 + }, + { + "epoch": 1.4102903538963905, + "grad_norm": 0.8950367569923401, + "learning_rate": 3.589761694616064e-05, + "loss": 0.6447, + "step": 7990 + }, + { + "epoch": 1.4120554231753597, + "grad_norm": 3.3598592281341553, + "learning_rate": 3.587996469549868e-05, + "loss": 0.6222, + "step": 8000 + }, + { + "epoch": 1.4120554231753597, + "eval_loss": 0.6748311519622803, + "eval_runtime": 591.9003, + "eval_samples_per_second": 47.858, + "eval_steps_per_second": 2.394, + "eval_token_accuracy": 0.0005295711387180035, + "step": 8000 + }, + { + "epoch": 1.4138204924543287, + "grad_norm": 3.1322696208953857, + "learning_rate": 3.586231244483672e-05, + "loss": 0.5731, + "step": 8010 + }, + { + "epoch": 1.415585561733298, + "grad_norm": 0.8525201082229614, + "learning_rate": 3.5846425419240954e-05, + "loss": 0.5651, + "step": 8020 + }, + { + "epoch": 1.4173506310122672, + "grad_norm": 1.0389692783355713, + "learning_rate": 3.5828773168578995e-05, + "loss": 0.674, + "step": 8030 + }, + { + "epoch": 1.4191157002912365, + "grad_norm": 1.0880883932113647, + "learning_rate": 3.5811120917917036e-05, + "loss": 0.5929, + "step": 8040 + }, + { + "epoch": 1.4208807695702057, + "grad_norm": 2.932532548904419, + "learning_rate": 3.579346866725508e-05, + "loss": 0.6861, + "step": 8050 + }, + { + "epoch": 1.4226458388491747, + "grad_norm": 1.5305405855178833, + "learning_rate": 3.577581641659312e-05, + "loss": 0.5335, + "step": 8060 + }, + { + "epoch": 1.424410908128144, + "grad_norm": 3.449867010116577, + "learning_rate": 3.575816416593116e-05, + "loss": 0.7106, + "step": 8070 + }, + { + "epoch": 1.4261759774071132, + "grad_norm": 1.029525637626648, + "learning_rate": 3.57405119152692e-05, + "loss": 0.6468, + "step": 8080 + }, + { + "epoch": 1.4279410466860825, + "grad_norm": 3.1664063930511475, + "learning_rate": 3.572285966460724e-05, + "loss": 0.6735, + "step": 8090 + }, + { + "epoch": 1.4297061159650517, + "grad_norm": 2.8890814781188965, + "learning_rate": 3.5705207413945277e-05, + "loss": 0.6517, + "step": 8100 + }, + { + "epoch": 1.4314711852440207, + "grad_norm": 1.0854856967926025, + "learning_rate": 3.568755516328332e-05, + "loss": 0.6061, + "step": 8110 + }, + { + "epoch": 1.43323625452299, + "grad_norm": 0.7709710597991943, + "learning_rate": 3.566990291262136e-05, + "loss": 0.5545, + "step": 8120 + }, + { + "epoch": 1.4350013238019592, + "grad_norm": 2.4035892486572266, + "learning_rate": 3.56522506619594e-05, + "loss": 0.5354, + "step": 8130 + }, + { + "epoch": 1.4367663930809285, + "grad_norm": 2.355015754699707, + "learning_rate": 3.563459841129744e-05, + "loss": 0.5083, + "step": 8140 + }, + { + "epoch": 1.4385314623598977, + "grad_norm": 2.647833824157715, + "learning_rate": 3.561694616063548e-05, + "loss": 0.5592, + "step": 8150 + }, + { + "epoch": 1.4402965316388667, + "grad_norm": 0.4458978772163391, + "learning_rate": 3.5599293909973523e-05, + "loss": 0.5751, + "step": 8160 + }, + { + "epoch": 1.442061600917836, + "grad_norm": 1.8528035879135132, + "learning_rate": 3.5581641659311565e-05, + "loss": 0.5605, + "step": 8170 + }, + { + "epoch": 1.4438266701968052, + "grad_norm": 2.4769816398620605, + "learning_rate": 3.5563989408649606e-05, + "loss": 0.6521, + "step": 8180 + }, + { + "epoch": 1.4455917394757745, + "grad_norm": 0.6114835739135742, + "learning_rate": 3.554633715798765e-05, + "loss": 0.6659, + "step": 8190 + }, + { + "epoch": 1.4473568087547437, + "grad_norm": 0.9413579106330872, + "learning_rate": 3.552868490732569e-05, + "loss": 0.6604, + "step": 8200 + }, + { + "epoch": 1.4491218780337127, + "grad_norm": 2.2076611518859863, + "learning_rate": 3.551103265666373e-05, + "loss": 0.5319, + "step": 8210 + }, + { + "epoch": 1.450886947312682, + "grad_norm": 2.3304405212402344, + "learning_rate": 3.549338040600177e-05, + "loss": 0.7263, + "step": 8220 + }, + { + "epoch": 1.4526520165916512, + "grad_norm": 3.872177839279175, + "learning_rate": 3.547572815533981e-05, + "loss": 0.6536, + "step": 8230 + }, + { + "epoch": 1.4544170858706205, + "grad_norm": 0.8268742561340332, + "learning_rate": 3.5458075904677846e-05, + "loss": 0.4964, + "step": 8240 + }, + { + "epoch": 1.4561821551495897, + "grad_norm": 0.9529076218605042, + "learning_rate": 3.544042365401589e-05, + "loss": 0.5737, + "step": 8250 + }, + { + "epoch": 1.4579472244285587, + "grad_norm": 0.7376457452774048, + "learning_rate": 3.542277140335393e-05, + "loss": 0.5622, + "step": 8260 + }, + { + "epoch": 1.459712293707528, + "grad_norm": 0.8693493008613586, + "learning_rate": 3.540511915269197e-05, + "loss": 0.5919, + "step": 8270 + }, + { + "epoch": 1.4614773629864972, + "grad_norm": 1.8074113130569458, + "learning_rate": 3.538746690203001e-05, + "loss": 0.5726, + "step": 8280 + }, + { + "epoch": 1.4632424322654665, + "grad_norm": 1.9681881666183472, + "learning_rate": 3.536981465136805e-05, + "loss": 0.605, + "step": 8290 + }, + { + "epoch": 1.4650075015444357, + "grad_norm": 2.3759384155273438, + "learning_rate": 3.535216240070609e-05, + "loss": 0.7572, + "step": 8300 + }, + { + "epoch": 1.4667725708234047, + "grad_norm": 0.8262273669242859, + "learning_rate": 3.5334510150044134e-05, + "loss": 0.6218, + "step": 8310 + }, + { + "epoch": 1.468537640102374, + "grad_norm": 3.165436267852783, + "learning_rate": 3.5316857899382175e-05, + "loss": 0.7645, + "step": 8320 + }, + { + "epoch": 1.4703027093813432, + "grad_norm": 0.9559675455093384, + "learning_rate": 3.5299205648720216e-05, + "loss": 0.5479, + "step": 8330 + }, + { + "epoch": 1.4720677786603125, + "grad_norm": 2.0745654106140137, + "learning_rate": 3.528155339805825e-05, + "loss": 0.6754, + "step": 8340 + }, + { + "epoch": 1.4738328479392817, + "grad_norm": 0.9734734892845154, + "learning_rate": 3.526390114739629e-05, + "loss": 0.5967, + "step": 8350 + }, + { + "epoch": 1.4755979172182507, + "grad_norm": 0.9553613066673279, + "learning_rate": 3.524624889673433e-05, + "loss": 0.6862, + "step": 8360 + }, + { + "epoch": 1.47736298649722, + "grad_norm": 0.822285532951355, + "learning_rate": 3.5228596646072374e-05, + "loss": 0.6204, + "step": 8370 + }, + { + "epoch": 1.4791280557761892, + "grad_norm": 0.9431090950965881, + "learning_rate": 3.5210944395410415e-05, + "loss": 0.6096, + "step": 8380 + }, + { + "epoch": 1.4808931250551585, + "grad_norm": 4.019901275634766, + "learning_rate": 3.5193292144748456e-05, + "loss": 0.5686, + "step": 8390 + }, + { + "epoch": 1.4826581943341277, + "grad_norm": 3.979301691055298, + "learning_rate": 3.51756398940865e-05, + "loss": 0.665, + "step": 8400 + }, + { + "epoch": 1.4844232636130967, + "grad_norm": 0.7126337885856628, + "learning_rate": 3.515798764342454e-05, + "loss": 0.7266, + "step": 8410 + }, + { + "epoch": 1.486188332892066, + "grad_norm": 1.1517317295074463, + "learning_rate": 3.514033539276258e-05, + "loss": 0.6027, + "step": 8420 + }, + { + "epoch": 1.4879534021710352, + "grad_norm": 1.9708514213562012, + "learning_rate": 3.512268314210062e-05, + "loss": 0.707, + "step": 8430 + }, + { + "epoch": 1.4897184714500045, + "grad_norm": 0.8406355381011963, + "learning_rate": 3.510503089143866e-05, + "loss": 0.5988, + "step": 8440 + }, + { + "epoch": 1.4914835407289737, + "grad_norm": 3.593660354614258, + "learning_rate": 3.50873786407767e-05, + "loss": 0.6455, + "step": 8450 + }, + { + "epoch": 1.4932486100079427, + "grad_norm": 3.105415105819702, + "learning_rate": 3.5069726390114744e-05, + "loss": 0.6632, + "step": 8460 + }, + { + "epoch": 1.495013679286912, + "grad_norm": 2.8463072776794434, + "learning_rate": 3.5052074139452785e-05, + "loss": 0.6502, + "step": 8470 + }, + { + "epoch": 1.4967787485658812, + "grad_norm": 1.0606666803359985, + "learning_rate": 3.5034421888790827e-05, + "loss": 0.5616, + "step": 8480 + }, + { + "epoch": 1.4985438178448505, + "grad_norm": 5.110599517822266, + "learning_rate": 3.501676963812887e-05, + "loss": 0.6141, + "step": 8490 + }, + { + "epoch": 1.5003088871238197, + "grad_norm": 1.8309544324874878, + "learning_rate": 3.49991173874669e-05, + "loss": 0.603, + "step": 8500 + }, + { + "epoch": 1.5020739564027887, + "grad_norm": 3.568061113357544, + "learning_rate": 3.498146513680494e-05, + "loss": 0.6794, + "step": 8510 + }, + { + "epoch": 1.503839025681758, + "grad_norm": 2.1609065532684326, + "learning_rate": 3.4963812886142984e-05, + "loss": 0.6598, + "step": 8520 + }, + { + "epoch": 1.5056040949607272, + "grad_norm": 1.8619052171707153, + "learning_rate": 3.4946160635481025e-05, + "loss": 0.6009, + "step": 8530 + }, + { + "epoch": 1.5073691642396962, + "grad_norm": 1.1925305128097534, + "learning_rate": 3.492850838481907e-05, + "loss": 0.7325, + "step": 8540 + }, + { + "epoch": 1.5091342335186657, + "grad_norm": 0.7662707567214966, + "learning_rate": 3.491085613415711e-05, + "loss": 0.6295, + "step": 8550 + }, + { + "epoch": 1.5108993027976347, + "grad_norm": 1.0044121742248535, + "learning_rate": 3.489320388349515e-05, + "loss": 0.6156, + "step": 8560 + }, + { + "epoch": 1.512664372076604, + "grad_norm": 0.935528039932251, + "learning_rate": 3.487555163283318e-05, + "loss": 0.6138, + "step": 8570 + }, + { + "epoch": 1.5144294413555732, + "grad_norm": 2.4792351722717285, + "learning_rate": 3.4857899382171224e-05, + "loss": 0.6002, + "step": 8580 + }, + { + "epoch": 1.5161945106345422, + "grad_norm": 2.8710122108459473, + "learning_rate": 3.4840247131509266e-05, + "loss": 0.6942, + "step": 8590 + }, + { + "epoch": 1.5179595799135117, + "grad_norm": 0.8079431653022766, + "learning_rate": 3.482259488084731e-05, + "loss": 0.6515, + "step": 8600 + }, + { + "epoch": 1.5197246491924807, + "grad_norm": 1.9361704587936401, + "learning_rate": 3.480494263018535e-05, + "loss": 0.5994, + "step": 8610 + }, + { + "epoch": 1.52148971847145, + "grad_norm": 1.108770728111267, + "learning_rate": 3.478729037952339e-05, + "loss": 0.6323, + "step": 8620 + }, + { + "epoch": 1.5232547877504192, + "grad_norm": 0.8695691227912903, + "learning_rate": 3.476963812886143e-05, + "loss": 0.5698, + "step": 8630 + }, + { + "epoch": 1.5250198570293882, + "grad_norm": 1.988014578819275, + "learning_rate": 3.475198587819947e-05, + "loss": 0.598, + "step": 8640 + }, + { + "epoch": 1.5267849263083577, + "grad_norm": 2.2199532985687256, + "learning_rate": 3.473433362753751e-05, + "loss": 0.6659, + "step": 8650 + }, + { + "epoch": 1.5285499955873267, + "grad_norm": 0.845320463180542, + "learning_rate": 3.4716681376875554e-05, + "loss": 0.593, + "step": 8660 + }, + { + "epoch": 1.530315064866296, + "grad_norm": 2.645056962966919, + "learning_rate": 3.4699029126213595e-05, + "loss": 0.6378, + "step": 8670 + }, + { + "epoch": 1.5320801341452652, + "grad_norm": 1.788921594619751, + "learning_rate": 3.4681376875551636e-05, + "loss": 0.6099, + "step": 8680 + }, + { + "epoch": 1.5338452034242342, + "grad_norm": 0.826831579208374, + "learning_rate": 3.466372462488968e-05, + "loss": 0.7102, + "step": 8690 + }, + { + "epoch": 1.5356102727032037, + "grad_norm": 2.5500268936157227, + "learning_rate": 3.464607237422772e-05, + "loss": 0.5822, + "step": 8700 + }, + { + "epoch": 1.5373753419821727, + "grad_norm": 3.20348858833313, + "learning_rate": 3.462842012356576e-05, + "loss": 0.6996, + "step": 8710 + }, + { + "epoch": 1.539140411261142, + "grad_norm": 0.9246511459350586, + "learning_rate": 3.46107678729038e-05, + "loss": 0.6416, + "step": 8720 + }, + { + "epoch": 1.5409054805401112, + "grad_norm": 2.670889139175415, + "learning_rate": 3.459311562224184e-05, + "loss": 0.5498, + "step": 8730 + }, + { + "epoch": 1.5426705498190803, + "grad_norm": 2.474168062210083, + "learning_rate": 3.457546337157988e-05, + "loss": 0.6324, + "step": 8740 + }, + { + "epoch": 1.5444356190980497, + "grad_norm": 3.8397858142852783, + "learning_rate": 3.4557811120917924e-05, + "loss": 0.6154, + "step": 8750 + }, + { + "epoch": 1.5462006883770187, + "grad_norm": 2.40657901763916, + "learning_rate": 3.4540158870255965e-05, + "loss": 0.6554, + "step": 8760 + }, + { + "epoch": 1.547965757655988, + "grad_norm": 0.9760830998420715, + "learning_rate": 3.4522506619594e-05, + "loss": 0.5496, + "step": 8770 + }, + { + "epoch": 1.5497308269349572, + "grad_norm": 0.8230709433555603, + "learning_rate": 3.450485436893204e-05, + "loss": 0.4997, + "step": 8780 + }, + { + "epoch": 1.5514958962139263, + "grad_norm": 1.9407001733779907, + "learning_rate": 3.448720211827008e-05, + "loss": 0.6807, + "step": 8790 + }, + { + "epoch": 1.5532609654928957, + "grad_norm": 3.2647931575775146, + "learning_rate": 3.446954986760812e-05, + "loss": 0.5814, + "step": 8800 + }, + { + "epoch": 1.5550260347718647, + "grad_norm": 1.772679328918457, + "learning_rate": 3.445189761694616e-05, + "loss": 0.6516, + "step": 8810 + }, + { + "epoch": 1.556791104050834, + "grad_norm": 1.2060052156448364, + "learning_rate": 3.44342453662842e-05, + "loss": 0.6775, + "step": 8820 + }, + { + "epoch": 1.5585561733298032, + "grad_norm": 1.1109216213226318, + "learning_rate": 3.441659311562224e-05, + "loss": 0.6459, + "step": 8830 + }, + { + "epoch": 1.5603212426087723, + "grad_norm": 2.8640549182891846, + "learning_rate": 3.439894086496028e-05, + "loss": 0.5936, + "step": 8840 + }, + { + "epoch": 1.5620863118877417, + "grad_norm": 1.027740478515625, + "learning_rate": 3.438128861429832e-05, + "loss": 0.6354, + "step": 8850 + }, + { + "epoch": 1.5638513811667107, + "grad_norm": 1.7239662408828735, + "learning_rate": 3.436363636363636e-05, + "loss": 0.6095, + "step": 8860 + }, + { + "epoch": 1.56561645044568, + "grad_norm": 0.9941009283065796, + "learning_rate": 3.4345984112974404e-05, + "loss": 0.6207, + "step": 8870 + }, + { + "epoch": 1.5673815197246492, + "grad_norm": 2.983370542526245, + "learning_rate": 3.4328331862312445e-05, + "loss": 0.5937, + "step": 8880 + }, + { + "epoch": 1.5691465890036183, + "grad_norm": 1.0334148406982422, + "learning_rate": 3.4310679611650486e-05, + "loss": 0.733, + "step": 8890 + }, + { + "epoch": 1.5709116582825877, + "grad_norm": 0.7821390628814697, + "learning_rate": 3.429302736098853e-05, + "loss": 0.6771, + "step": 8900 + }, + { + "epoch": 1.5726767275615567, + "grad_norm": 3.0682766437530518, + "learning_rate": 3.427537511032657e-05, + "loss": 0.6754, + "step": 8910 + }, + { + "epoch": 1.574441796840526, + "grad_norm": 1.0227667093276978, + "learning_rate": 3.425772285966461e-05, + "loss": 0.5639, + "step": 8920 + }, + { + "epoch": 1.5762068661194952, + "grad_norm": 0.9798213839530945, + "learning_rate": 3.424007060900265e-05, + "loss": 0.6775, + "step": 8930 + }, + { + "epoch": 1.5779719353984643, + "grad_norm": 0.8554840087890625, + "learning_rate": 3.422241835834069e-05, + "loss": 0.6324, + "step": 8940 + }, + { + "epoch": 1.5797370046774337, + "grad_norm": 0.913659393787384, + "learning_rate": 3.420476610767873e-05, + "loss": 0.6402, + "step": 8950 + }, + { + "epoch": 1.5815020739564027, + "grad_norm": 2.2020037174224854, + "learning_rate": 3.4187113857016774e-05, + "loss": 0.6192, + "step": 8960 + }, + { + "epoch": 1.583267143235372, + "grad_norm": 1.2366520166397095, + "learning_rate": 3.4169461606354816e-05, + "loss": 0.6373, + "step": 8970 + }, + { + "epoch": 1.5850322125143412, + "grad_norm": 0.6150069236755371, + "learning_rate": 3.415180935569286e-05, + "loss": 0.6285, + "step": 8980 + }, + { + "epoch": 1.5867972817933103, + "grad_norm": 3.2270278930664062, + "learning_rate": 3.41341571050309e-05, + "loss": 0.5097, + "step": 8990 + }, + { + "epoch": 1.5885623510722797, + "grad_norm": 1.8538926839828491, + "learning_rate": 3.411650485436894e-05, + "loss": 0.629, + "step": 9000 + }, + { + "epoch": 1.5885623510722797, + "eval_loss": 0.6662057638168335, + "eval_runtime": 591.8171, + "eval_samples_per_second": 47.864, + "eval_steps_per_second": 2.394, + "eval_token_accuracy": 0.000523887738225821, + "step": 9000 + }, + { + "epoch": 1.5903274203512487, + "grad_norm": 2.432816982269287, + "learning_rate": 3.409885260370697e-05, + "loss": 0.5923, + "step": 9010 + }, + { + "epoch": 1.592092489630218, + "grad_norm": 2.547600507736206, + "learning_rate": 3.4081200353045015e-05, + "loss": 0.6395, + "step": 9020 + }, + { + "epoch": 1.5938575589091872, + "grad_norm": 1.0481719970703125, + "learning_rate": 3.4063548102383056e-05, + "loss": 0.6421, + "step": 9030 + }, + { + "epoch": 1.5956226281881563, + "grad_norm": 2.210798740386963, + "learning_rate": 3.40458958517211e-05, + "loss": 0.6653, + "step": 9040 + }, + { + "epoch": 1.5973876974671257, + "grad_norm": 1.0804709196090698, + "learning_rate": 3.402824360105913e-05, + "loss": 0.6169, + "step": 9050 + }, + { + "epoch": 1.5991527667460947, + "grad_norm": 0.8143006563186646, + "learning_rate": 3.401059135039717e-05, + "loss": 0.637, + "step": 9060 + }, + { + "epoch": 1.600917836025064, + "grad_norm": 1.9847668409347534, + "learning_rate": 3.3992939099735213e-05, + "loss": 0.5887, + "step": 9070 + }, + { + "epoch": 1.6026829053040332, + "grad_norm": 1.3877208232879639, + "learning_rate": 3.3975286849073255e-05, + "loss": 0.626, + "step": 9080 + }, + { + "epoch": 1.6044479745830023, + "grad_norm": 0.9856419563293457, + "learning_rate": 3.3957634598411296e-05, + "loss": 0.5996, + "step": 9090 + }, + { + "epoch": 1.6062130438619717, + "grad_norm": 2.6040806770324707, + "learning_rate": 3.393998234774934e-05, + "loss": 0.6237, + "step": 9100 + }, + { + "epoch": 1.6079781131409407, + "grad_norm": 0.9981382489204407, + "learning_rate": 3.392233009708738e-05, + "loss": 0.5975, + "step": 9110 + }, + { + "epoch": 1.60974318241991, + "grad_norm": 0.9611056447029114, + "learning_rate": 3.390467784642542e-05, + "loss": 0.5722, + "step": 9120 + }, + { + "epoch": 1.6115082516988792, + "grad_norm": 1.7118070125579834, + "learning_rate": 3.388702559576346e-05, + "loss": 0.5698, + "step": 9130 + }, + { + "epoch": 1.6132733209778483, + "grad_norm": 0.9777098894119263, + "learning_rate": 3.38693733451015e-05, + "loss": 0.6284, + "step": 9140 + }, + { + "epoch": 1.6150383902568177, + "grad_norm": 3.165189743041992, + "learning_rate": 3.385172109443954e-05, + "loss": 0.6531, + "step": 9150 + }, + { + "epoch": 1.6168034595357867, + "grad_norm": 2.1566078662872314, + "learning_rate": 3.3834068843777584e-05, + "loss": 0.6539, + "step": 9160 + }, + { + "epoch": 1.618568528814756, + "grad_norm": 0.7770845890045166, + "learning_rate": 3.3816416593115625e-05, + "loss": 0.6576, + "step": 9170 + }, + { + "epoch": 1.6203335980937252, + "grad_norm": 0.9254570603370667, + "learning_rate": 3.3798764342453666e-05, + "loss": 0.6781, + "step": 9180 + }, + { + "epoch": 1.6220986673726943, + "grad_norm": 3.901735782623291, + "learning_rate": 3.378111209179171e-05, + "loss": 0.6293, + "step": 9190 + }, + { + "epoch": 1.6238637366516637, + "grad_norm": 3.071284770965576, + "learning_rate": 3.376345984112975e-05, + "loss": 0.6078, + "step": 9200 + }, + { + "epoch": 1.6256288059306327, + "grad_norm": 0.9238028526306152, + "learning_rate": 3.374580759046779e-05, + "loss": 0.6236, + "step": 9210 + }, + { + "epoch": 1.627393875209602, + "grad_norm": 0.923682451248169, + "learning_rate": 3.372815533980583e-05, + "loss": 0.6485, + "step": 9220 + }, + { + "epoch": 1.6291589444885712, + "grad_norm": 0.8640393018722534, + "learning_rate": 3.371050308914387e-05, + "loss": 0.6288, + "step": 9230 + }, + { + "epoch": 1.6309240137675403, + "grad_norm": 1.036797046661377, + "learning_rate": 3.369285083848191e-05, + "loss": 0.6533, + "step": 9240 + }, + { + "epoch": 1.6326890830465097, + "grad_norm": 1.9658279418945312, + "learning_rate": 3.367519858781995e-05, + "loss": 0.6105, + "step": 9250 + }, + { + "epoch": 1.6344541523254787, + "grad_norm": 1.4766429662704468, + "learning_rate": 3.365754633715799e-05, + "loss": 0.6158, + "step": 9260 + }, + { + "epoch": 1.636219221604448, + "grad_norm": 0.8438056707382202, + "learning_rate": 3.363989408649603e-05, + "loss": 0.6446, + "step": 9270 + }, + { + "epoch": 1.6379842908834172, + "grad_norm": 0.8339599370956421, + "learning_rate": 3.362224183583407e-05, + "loss": 0.6512, + "step": 9280 + }, + { + "epoch": 1.6397493601623863, + "grad_norm": 1.8318166732788086, + "learning_rate": 3.360458958517211e-05, + "loss": 0.5804, + "step": 9290 + }, + { + "epoch": 1.6415144294413557, + "grad_norm": 1.179258108139038, + "learning_rate": 3.3586937334510146e-05, + "loss": 0.6192, + "step": 9300 + }, + { + "epoch": 1.6432794987203247, + "grad_norm": 2.5670175552368164, + "learning_rate": 3.356928508384819e-05, + "loss": 0.6171, + "step": 9310 + }, + { + "epoch": 1.645044567999294, + "grad_norm": 0.7155138254165649, + "learning_rate": 3.355163283318623e-05, + "loss": 0.6379, + "step": 9320 + }, + { + "epoch": 1.6468096372782632, + "grad_norm": 3.0688700675964355, + "learning_rate": 3.353398058252427e-05, + "loss": 0.659, + "step": 9330 + }, + { + "epoch": 1.6485747065572323, + "grad_norm": 0.8237749934196472, + "learning_rate": 3.351632833186231e-05, + "loss": 0.5595, + "step": 9340 + }, + { + "epoch": 1.6503397758362017, + "grad_norm": 0.6362634301185608, + "learning_rate": 3.349867608120035e-05, + "loss": 0.5071, + "step": 9350 + }, + { + "epoch": 1.6521048451151708, + "grad_norm": 2.7345876693725586, + "learning_rate": 3.348102383053839e-05, + "loss": 0.5936, + "step": 9360 + }, + { + "epoch": 1.65386991439414, + "grad_norm": 0.9333705902099609, + "learning_rate": 3.3463371579876434e-05, + "loss": 0.6701, + "step": 9370 + }, + { + "epoch": 1.6556349836731092, + "grad_norm": 2.97668194770813, + "learning_rate": 3.3445719329214475e-05, + "loss": 0.5406, + "step": 9380 + }, + { + "epoch": 1.6574000529520783, + "grad_norm": 0.6854531168937683, + "learning_rate": 3.3428067078552517e-05, + "loss": 0.6807, + "step": 9390 + }, + { + "epoch": 1.6591651222310477, + "grad_norm": 2.8641855716705322, + "learning_rate": 3.341041482789056e-05, + "loss": 0.6658, + "step": 9400 + }, + { + "epoch": 1.6609301915100168, + "grad_norm": 2.481201410293579, + "learning_rate": 3.33927625772286e-05, + "loss": 0.6206, + "step": 9410 + }, + { + "epoch": 1.662695260788986, + "grad_norm": 2.451979160308838, + "learning_rate": 3.337511032656664e-05, + "loss": 0.6998, + "step": 9420 + }, + { + "epoch": 1.6644603300679552, + "grad_norm": 0.9651976823806763, + "learning_rate": 3.335745807590468e-05, + "loss": 0.7207, + "step": 9430 + }, + { + "epoch": 1.6662253993469243, + "grad_norm": 1.954372525215149, + "learning_rate": 3.333980582524272e-05, + "loss": 0.6195, + "step": 9440 + }, + { + "epoch": 1.6679904686258937, + "grad_norm": 1.883323073387146, + "learning_rate": 3.3322153574580763e-05, + "loss": 0.5419, + "step": 9450 + }, + { + "epoch": 1.6697555379048628, + "grad_norm": 2.2481348514556885, + "learning_rate": 3.3304501323918805e-05, + "loss": 0.582, + "step": 9460 + }, + { + "epoch": 1.671520607183832, + "grad_norm": 3.2739031314849854, + "learning_rate": 3.3286849073256846e-05, + "loss": 0.6003, + "step": 9470 + }, + { + "epoch": 1.6732856764628012, + "grad_norm": 2.612467050552368, + "learning_rate": 3.326919682259489e-05, + "loss": 0.6108, + "step": 9480 + }, + { + "epoch": 1.6750507457417703, + "grad_norm": 2.2844624519348145, + "learning_rate": 3.325154457193292e-05, + "loss": 0.6185, + "step": 9490 + }, + { + "epoch": 1.6768158150207397, + "grad_norm": 0.9374117851257324, + "learning_rate": 3.323389232127096e-05, + "loss": 0.6983, + "step": 9500 + }, + { + "epoch": 1.6785808842997088, + "grad_norm": 2.6882810592651367, + "learning_rate": 3.3216240070609004e-05, + "loss": 0.5724, + "step": 9510 + }, + { + "epoch": 1.680345953578678, + "grad_norm": 1.042275309562683, + "learning_rate": 3.3198587819947045e-05, + "loss": 0.6018, + "step": 9520 + }, + { + "epoch": 1.6821110228576472, + "grad_norm": 2.711000442504883, + "learning_rate": 3.3180935569285086e-05, + "loss": 0.6355, + "step": 9530 + }, + { + "epoch": 1.6838760921366163, + "grad_norm": 1.005644679069519, + "learning_rate": 3.316328331862313e-05, + "loss": 0.5572, + "step": 9540 + }, + { + "epoch": 1.6856411614155855, + "grad_norm": 0.8825269937515259, + "learning_rate": 3.314563106796117e-05, + "loss": 0.6376, + "step": 9550 + }, + { + "epoch": 1.6874062306945548, + "grad_norm": 4.650322914123535, + "learning_rate": 3.31279788172992e-05, + "loss": 0.5378, + "step": 9560 + }, + { + "epoch": 1.689171299973524, + "grad_norm": 0.8221828937530518, + "learning_rate": 3.3110326566637244e-05, + "loss": 0.7414, + "step": 9570 + }, + { + "epoch": 1.6909363692524932, + "grad_norm": 1.6978652477264404, + "learning_rate": 3.3092674315975285e-05, + "loss": 0.5823, + "step": 9580 + }, + { + "epoch": 1.6927014385314623, + "grad_norm": 0.978359043598175, + "learning_rate": 3.3075022065313326e-05, + "loss": 0.6359, + "step": 9590 + }, + { + "epoch": 1.6944665078104315, + "grad_norm": 2.7753124237060547, + "learning_rate": 3.305736981465137e-05, + "loss": 0.6715, + "step": 9600 + }, + { + "epoch": 1.6962315770894008, + "grad_norm": 3.3159520626068115, + "learning_rate": 3.303971756398941e-05, + "loss": 0.5691, + "step": 9610 + }, + { + "epoch": 1.69799664636837, + "grad_norm": 2.1915853023529053, + "learning_rate": 3.302206531332745e-05, + "loss": 0.6413, + "step": 9620 + }, + { + "epoch": 1.6997617156473392, + "grad_norm": 0.6985222697257996, + "learning_rate": 3.300441306266549e-05, + "loss": 0.5339, + "step": 9630 + }, + { + "epoch": 1.7015267849263083, + "grad_norm": 1.000430703163147, + "learning_rate": 3.298676081200353e-05, + "loss": 0.627, + "step": 9640 + }, + { + "epoch": 1.7032918542052775, + "grad_norm": 2.2707552909851074, + "learning_rate": 3.296910856134157e-05, + "loss": 0.5694, + "step": 9650 + }, + { + "epoch": 1.7050569234842468, + "grad_norm": 0.9370206594467163, + "learning_rate": 3.2951456310679614e-05, + "loss": 0.6173, + "step": 9660 + }, + { + "epoch": 1.706821992763216, + "grad_norm": 0.840900719165802, + "learning_rate": 3.2933804060017655e-05, + "loss": 0.6081, + "step": 9670 + }, + { + "epoch": 1.7085870620421852, + "grad_norm": 0.9345539212226868, + "learning_rate": 3.2916151809355696e-05, + "loss": 0.5849, + "step": 9680 + }, + { + "epoch": 1.7103521313211543, + "grad_norm": 1.4001808166503906, + "learning_rate": 3.289849955869374e-05, + "loss": 0.6198, + "step": 9690 + }, + { + "epoch": 1.7121172006001235, + "grad_norm": 2.205587387084961, + "learning_rate": 3.288084730803178e-05, + "loss": 0.6937, + "step": 9700 + }, + { + "epoch": 1.7138822698790928, + "grad_norm": 2.924020290374756, + "learning_rate": 3.286319505736982e-05, + "loss": 0.5451, + "step": 9710 + }, + { + "epoch": 1.715647339158062, + "grad_norm": 2.550149917602539, + "learning_rate": 3.2845542806707854e-05, + "loss": 0.5779, + "step": 9720 + }, + { + "epoch": 1.7174124084370312, + "grad_norm": 0.8188669085502625, + "learning_rate": 3.2827890556045895e-05, + "loss": 0.6391, + "step": 9730 + }, + { + "epoch": 1.7191774777160003, + "grad_norm": 2.1378254890441895, + "learning_rate": 3.2810238305383936e-05, + "loss": 0.5491, + "step": 9740 + }, + { + "epoch": 1.7209425469949695, + "grad_norm": 1.0728440284729004, + "learning_rate": 3.279258605472198e-05, + "loss": 0.6991, + "step": 9750 + }, + { + "epoch": 1.7227076162739388, + "grad_norm": 2.225074529647827, + "learning_rate": 3.277493380406002e-05, + "loss": 0.6602, + "step": 9760 + }, + { + "epoch": 1.724472685552908, + "grad_norm": 2.452317714691162, + "learning_rate": 3.275728155339806e-05, + "loss": 0.6031, + "step": 9770 + }, + { + "epoch": 1.7262377548318772, + "grad_norm": 1.0104994773864746, + "learning_rate": 3.27396293027361e-05, + "loss": 0.65, + "step": 9780 + }, + { + "epoch": 1.7280028241108463, + "grad_norm": 2.9075286388397217, + "learning_rate": 3.272197705207414e-05, + "loss": 0.66, + "step": 9790 + }, + { + "epoch": 1.7297678933898155, + "grad_norm": 3.247784376144409, + "learning_rate": 3.270432480141218e-05, + "loss": 0.6073, + "step": 9800 + }, + { + "epoch": 1.7315329626687848, + "grad_norm": 1.802399754524231, + "learning_rate": 3.2686672550750224e-05, + "loss": 0.7026, + "step": 9810 + }, + { + "epoch": 1.733298031947754, + "grad_norm": 1.8000247478485107, + "learning_rate": 3.2669020300088266e-05, + "loss": 0.6974, + "step": 9820 + }, + { + "epoch": 1.7350631012267232, + "grad_norm": 0.6736454367637634, + "learning_rate": 3.26513680494263e-05, + "loss": 0.6313, + "step": 9830 + }, + { + "epoch": 1.7368281705056923, + "grad_norm": 0.8809193968772888, + "learning_rate": 3.263371579876434e-05, + "loss": 0.5773, + "step": 9840 + }, + { + "epoch": 1.7385932397846615, + "grad_norm": 1.3804481029510498, + "learning_rate": 3.261606354810238e-05, + "loss": 0.5644, + "step": 9850 + }, + { + "epoch": 1.7403583090636308, + "grad_norm": 2.855910301208496, + "learning_rate": 3.259841129744042e-05, + "loss": 0.7065, + "step": 9860 + }, + { + "epoch": 1.7421233783425998, + "grad_norm": 0.8118297457695007, + "learning_rate": 3.2580759046778465e-05, + "loss": 0.5712, + "step": 9870 + }, + { + "epoch": 1.7438884476215692, + "grad_norm": 1.8637617826461792, + "learning_rate": 3.2563106796116506e-05, + "loss": 0.615, + "step": 9880 + }, + { + "epoch": 1.7456535169005383, + "grad_norm": 0.7027237415313721, + "learning_rate": 3.254545454545455e-05, + "loss": 0.585, + "step": 9890 + }, + { + "epoch": 1.7474185861795075, + "grad_norm": 1.9294718503952026, + "learning_rate": 3.252780229479259e-05, + "loss": 0.721, + "step": 9900 + }, + { + "epoch": 1.7491836554584768, + "grad_norm": 0.7499776482582092, + "learning_rate": 3.251015004413063e-05, + "loss": 0.5693, + "step": 9910 + }, + { + "epoch": 1.7509487247374458, + "grad_norm": 0.7779677510261536, + "learning_rate": 3.249249779346867e-05, + "loss": 0.6228, + "step": 9920 + }, + { + "epoch": 1.7527137940164152, + "grad_norm": 0.8223459720611572, + "learning_rate": 3.247484554280671e-05, + "loss": 0.5272, + "step": 9930 + }, + { + "epoch": 1.7544788632953843, + "grad_norm": 2.417341709136963, + "learning_rate": 3.245719329214475e-05, + "loss": 0.6413, + "step": 9940 + }, + { + "epoch": 1.7562439325743535, + "grad_norm": 1.4994111061096191, + "learning_rate": 3.2439541041482794e-05, + "loss": 0.6917, + "step": 9950 + }, + { + "epoch": 1.7580090018533228, + "grad_norm": 1.04421865940094, + "learning_rate": 3.242188879082083e-05, + "loss": 0.6192, + "step": 9960 + }, + { + "epoch": 1.7597740711322918, + "grad_norm": 0.8585307598114014, + "learning_rate": 3.240423654015887e-05, + "loss": 0.5649, + "step": 9970 + }, + { + "epoch": 1.7615391404112613, + "grad_norm": 2.860748529434204, + "learning_rate": 3.238658428949691e-05, + "loss": 0.6115, + "step": 9980 + }, + { + "epoch": 1.7633042096902303, + "grad_norm": 1.2423299551010132, + "learning_rate": 3.236893203883495e-05, + "loss": 0.5609, + "step": 9990 + }, + { + "epoch": 1.7650692789691995, + "grad_norm": 1.2183946371078491, + "learning_rate": 3.235127978817299e-05, + "loss": 0.6757, + "step": 10000 + }, + { + "epoch": 1.7650692789691995, + "eval_loss": 0.6546699404716492, + "eval_runtime": 592.0492, + "eval_samples_per_second": 47.846, + "eval_steps_per_second": 2.393, + "eval_token_accuracy": 0.0004970945644769607, + "step": 10000 + }, + { + "epoch": 1.7668343482481688, + "grad_norm": 2.420112133026123, + "learning_rate": 3.2333627537511034e-05, + "loss": 0.6171, + "step": 10010 + }, + { + "epoch": 1.7685994175271378, + "grad_norm": 1.9335789680480957, + "learning_rate": 3.2315975286849075e-05, + "loss": 0.5806, + "step": 10020 + }, + { + "epoch": 1.7703644868061073, + "grad_norm": 0.945406973361969, + "learning_rate": 3.2298323036187116e-05, + "loss": 0.6006, + "step": 10030 + }, + { + "epoch": 1.7721295560850763, + "grad_norm": 4.3664445877075195, + "learning_rate": 3.228067078552516e-05, + "loss": 0.638, + "step": 10040 + }, + { + "epoch": 1.7738946253640455, + "grad_norm": 3.256833076477051, + "learning_rate": 3.22630185348632e-05, + "loss": 0.6134, + "step": 10050 + }, + { + "epoch": 1.7756596946430148, + "grad_norm": 1.7355093955993652, + "learning_rate": 3.224713150926743e-05, + "loss": 0.5735, + "step": 10060 + }, + { + "epoch": 1.7774247639219838, + "grad_norm": 0.8020845651626587, + "learning_rate": 3.222947925860547e-05, + "loss": 0.6668, + "step": 10070 + }, + { + "epoch": 1.7791898332009533, + "grad_norm": 2.6556622982025146, + "learning_rate": 3.221182700794351e-05, + "loss": 0.5895, + "step": 10080 + }, + { + "epoch": 1.7809549024799223, + "grad_norm": 0.7236629724502563, + "learning_rate": 3.219417475728155e-05, + "loss": 0.7201, + "step": 10090 + }, + { + "epoch": 1.7827199717588915, + "grad_norm": 2.882829427719116, + "learning_rate": 3.2176522506619594e-05, + "loss": 0.5578, + "step": 10100 + }, + { + "epoch": 1.7844850410378608, + "grad_norm": 0.6686634421348572, + "learning_rate": 3.2158870255957635e-05, + "loss": 0.6551, + "step": 10110 + }, + { + "epoch": 1.7862501103168298, + "grad_norm": 1.0659387111663818, + "learning_rate": 3.2141218005295676e-05, + "loss": 0.6214, + "step": 10120 + }, + { + "epoch": 1.7880151795957993, + "grad_norm": 0.9529328942298889, + "learning_rate": 3.212356575463372e-05, + "loss": 0.5799, + "step": 10130 + }, + { + "epoch": 1.7897802488747683, + "grad_norm": 0.9451195001602173, + "learning_rate": 3.210591350397176e-05, + "loss": 0.6425, + "step": 10140 + }, + { + "epoch": 1.7915453181537375, + "grad_norm": 0.9170915484428406, + "learning_rate": 3.20882612533098e-05, + "loss": 0.6067, + "step": 10150 + }, + { + "epoch": 1.7933103874327068, + "grad_norm": 0.7924943566322327, + "learning_rate": 3.207060900264784e-05, + "loss": 0.6592, + "step": 10160 + }, + { + "epoch": 1.7950754567116758, + "grad_norm": 0.8339998722076416, + "learning_rate": 3.205295675198588e-05, + "loss": 0.5662, + "step": 10170 + }, + { + "epoch": 1.7968405259906453, + "grad_norm": 2.43949818611145, + "learning_rate": 3.203530450132392e-05, + "loss": 0.6113, + "step": 10180 + }, + { + "epoch": 1.7986055952696143, + "grad_norm": 2.2304933071136475, + "learning_rate": 3.2017652250661964e-05, + "loss": 0.6392, + "step": 10190 + }, + { + "epoch": 1.8003706645485835, + "grad_norm": 0.8846359252929688, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.6161, + "step": 10200 + }, + { + "epoch": 1.8021357338275528, + "grad_norm": 3.043703317642212, + "learning_rate": 3.198234774933804e-05, + "loss": 0.6005, + "step": 10210 + }, + { + "epoch": 1.8039008031065218, + "grad_norm": 3.343468427658081, + "learning_rate": 3.196469549867608e-05, + "loss": 0.6702, + "step": 10220 + }, + { + "epoch": 1.8056658723854913, + "grad_norm": 0.95237797498703, + "learning_rate": 3.194704324801412e-05, + "loss": 0.6036, + "step": 10230 + }, + { + "epoch": 1.8074309416644603, + "grad_norm": 2.6117990016937256, + "learning_rate": 3.192939099735216e-05, + "loss": 0.6074, + "step": 10240 + }, + { + "epoch": 1.8091960109434295, + "grad_norm": 1.5844181776046753, + "learning_rate": 3.1911738746690204e-05, + "loss": 0.613, + "step": 10250 + }, + { + "epoch": 1.8109610802223988, + "grad_norm": 0.8131887316703796, + "learning_rate": 3.1894086496028245e-05, + "loss": 0.6517, + "step": 10260 + }, + { + "epoch": 1.8127261495013678, + "grad_norm": 2.3605868816375732, + "learning_rate": 3.1876434245366287e-05, + "loss": 0.7417, + "step": 10270 + }, + { + "epoch": 1.8144912187803373, + "grad_norm": 0.7757390737533569, + "learning_rate": 3.185878199470433e-05, + "loss": 0.6353, + "step": 10280 + }, + { + "epoch": 1.8162562880593063, + "grad_norm": 0.8382796049118042, + "learning_rate": 3.184112974404237e-05, + "loss": 0.5868, + "step": 10290 + }, + { + "epoch": 1.8180213573382755, + "grad_norm": 2.488783836364746, + "learning_rate": 3.182347749338041e-05, + "loss": 0.6489, + "step": 10300 + }, + { + "epoch": 1.8197864266172448, + "grad_norm": 0.9477055668830872, + "learning_rate": 3.1805825242718444e-05, + "loss": 0.5931, + "step": 10310 + }, + { + "epoch": 1.8215514958962138, + "grad_norm": 0.9396250247955322, + "learning_rate": 3.1788172992056486e-05, + "loss": 0.5828, + "step": 10320 + }, + { + "epoch": 1.8233165651751833, + "grad_norm": 1.528105616569519, + "learning_rate": 3.177052074139453e-05, + "loss": 0.6485, + "step": 10330 + }, + { + "epoch": 1.8250816344541523, + "grad_norm": 1.0290082693099976, + "learning_rate": 3.175286849073257e-05, + "loss": 0.5571, + "step": 10340 + }, + { + "epoch": 1.8268467037331215, + "grad_norm": 2.224963665008545, + "learning_rate": 3.173521624007061e-05, + "loss": 0.6077, + "step": 10350 + }, + { + "epoch": 1.8286117730120908, + "grad_norm": 0.8264523148536682, + "learning_rate": 3.171756398940865e-05, + "loss": 0.6299, + "step": 10360 + }, + { + "epoch": 1.8303768422910598, + "grad_norm": 0.7780860066413879, + "learning_rate": 3.169991173874669e-05, + "loss": 0.6193, + "step": 10370 + }, + { + "epoch": 1.8321419115700293, + "grad_norm": 2.4344122409820557, + "learning_rate": 3.168225948808473e-05, + "loss": 0.6094, + "step": 10380 + }, + { + "epoch": 1.8339069808489983, + "grad_norm": 2.5928852558135986, + "learning_rate": 3.1664607237422774e-05, + "loss": 0.6777, + "step": 10390 + }, + { + "epoch": 1.8356720501279675, + "grad_norm": 4.44277811050415, + "learning_rate": 3.1646954986760815e-05, + "loss": 0.623, + "step": 10400 + }, + { + "epoch": 1.8374371194069368, + "grad_norm": 2.1471035480499268, + "learning_rate": 3.1629302736098856e-05, + "loss": 0.6382, + "step": 10410 + }, + { + "epoch": 1.8392021886859058, + "grad_norm": 2.2607192993164062, + "learning_rate": 3.16116504854369e-05, + "loss": 0.6338, + "step": 10420 + }, + { + "epoch": 1.8409672579648753, + "grad_norm": 0.7422958016395569, + "learning_rate": 3.159399823477494e-05, + "loss": 0.6132, + "step": 10430 + }, + { + "epoch": 1.8427323272438443, + "grad_norm": 0.8959227204322815, + "learning_rate": 3.157634598411298e-05, + "loss": 0.6039, + "step": 10440 + }, + { + "epoch": 1.8444973965228135, + "grad_norm": 1.0099540948867798, + "learning_rate": 3.1558693733451014e-05, + "loss": 0.5652, + "step": 10450 + }, + { + "epoch": 1.8462624658017828, + "grad_norm": 0.943722128868103, + "learning_rate": 3.1541041482789055e-05, + "loss": 0.6487, + "step": 10460 + }, + { + "epoch": 1.8480275350807518, + "grad_norm": 0.8334342241287231, + "learning_rate": 3.1523389232127096e-05, + "loss": 0.5692, + "step": 10470 + }, + { + "epoch": 1.8497926043597213, + "grad_norm": 2.3223907947540283, + "learning_rate": 3.150573698146514e-05, + "loss": 0.614, + "step": 10480 + }, + { + "epoch": 1.8515576736386903, + "grad_norm": 1.7104765176773071, + "learning_rate": 3.148808473080318e-05, + "loss": 0.591, + "step": 10490 + }, + { + "epoch": 1.8533227429176595, + "grad_norm": 2.331986665725708, + "learning_rate": 3.147043248014122e-05, + "loss": 0.6332, + "step": 10500 + }, + { + "epoch": 1.8550878121966288, + "grad_norm": 2.031975746154785, + "learning_rate": 3.145278022947926e-05, + "loss": 0.5918, + "step": 10510 + }, + { + "epoch": 1.8568528814755978, + "grad_norm": 0.9749746918678284, + "learning_rate": 3.14351279788173e-05, + "loss": 0.6063, + "step": 10520 + }, + { + "epoch": 1.8586179507545673, + "grad_norm": 1.1249350309371948, + "learning_rate": 3.141747572815534e-05, + "loss": 0.6134, + "step": 10530 + }, + { + "epoch": 1.8603830200335363, + "grad_norm": 2.1191256046295166, + "learning_rate": 3.1399823477493384e-05, + "loss": 0.6165, + "step": 10540 + }, + { + "epoch": 1.8621480893125055, + "grad_norm": 3.8866329193115234, + "learning_rate": 3.1382171226831425e-05, + "loss": 0.5966, + "step": 10550 + }, + { + "epoch": 1.8639131585914748, + "grad_norm": 0.8333554267883301, + "learning_rate": 3.1364518976169466e-05, + "loss": 0.5847, + "step": 10560 + }, + { + "epoch": 1.8656782278704438, + "grad_norm": 0.8728168606758118, + "learning_rate": 3.13468667255075e-05, + "loss": 0.5101, + "step": 10570 + }, + { + "epoch": 1.8674432971494133, + "grad_norm": 1.8896209001541138, + "learning_rate": 3.132921447484554e-05, + "loss": 0.5888, + "step": 10580 + }, + { + "epoch": 1.8692083664283823, + "grad_norm": 2.8303306102752686, + "learning_rate": 3.131156222418358e-05, + "loss": 0.5488, + "step": 10590 + }, + { + "epoch": 1.8709734357073515, + "grad_norm": 0.8313313722610474, + "learning_rate": 3.1293909973521624e-05, + "loss": 0.5155, + "step": 10600 + }, + { + "epoch": 1.8727385049863208, + "grad_norm": 0.8679807186126709, + "learning_rate": 3.1276257722859665e-05, + "loss": 0.6382, + "step": 10610 + }, + { + "epoch": 1.8745035742652898, + "grad_norm": 1.9634865522384644, + "learning_rate": 3.1258605472197706e-05, + "loss": 0.5434, + "step": 10620 + }, + { + "epoch": 1.8762686435442593, + "grad_norm": 1.5889586210250854, + "learning_rate": 3.124095322153575e-05, + "loss": 0.6297, + "step": 10630 + }, + { + "epoch": 1.8780337128232283, + "grad_norm": 3.3781940937042236, + "learning_rate": 3.122330097087379e-05, + "loss": 0.6748, + "step": 10640 + }, + { + "epoch": 1.8797987821021975, + "grad_norm": 1.1254082918167114, + "learning_rate": 3.120564872021183e-05, + "loss": 0.5963, + "step": 10650 + }, + { + "epoch": 1.8815638513811668, + "grad_norm": 0.8166277408599854, + "learning_rate": 3.118799646954987e-05, + "loss": 0.5659, + "step": 10660 + }, + { + "epoch": 1.8833289206601358, + "grad_norm": 0.9837700128555298, + "learning_rate": 3.117034421888791e-05, + "loss": 0.6685, + "step": 10670 + }, + { + "epoch": 1.8850939899391053, + "grad_norm": 0.8627223968505859, + "learning_rate": 3.115269196822595e-05, + "loss": 0.6403, + "step": 10680 + }, + { + "epoch": 1.8868590592180743, + "grad_norm": 0.9987977743148804, + "learning_rate": 3.113503971756399e-05, + "loss": 0.6842, + "step": 10690 + }, + { + "epoch": 1.8886241284970435, + "grad_norm": 3.286561965942383, + "learning_rate": 3.111738746690203e-05, + "loss": 0.7374, + "step": 10700 + }, + { + "epoch": 1.8903891977760128, + "grad_norm": 0.7906441688537598, + "learning_rate": 3.109973521624007e-05, + "loss": 0.5732, + "step": 10710 + }, + { + "epoch": 1.8921542670549818, + "grad_norm": 2.687250852584839, + "learning_rate": 3.108208296557811e-05, + "loss": 0.5192, + "step": 10720 + }, + { + "epoch": 1.8939193363339513, + "grad_norm": 1.3238317966461182, + "learning_rate": 3.106443071491615e-05, + "loss": 0.6694, + "step": 10730 + }, + { + "epoch": 1.8956844056129203, + "grad_norm": 0.8862921595573425, + "learning_rate": 3.104677846425419e-05, + "loss": 0.6259, + "step": 10740 + }, + { + "epoch": 1.8974494748918895, + "grad_norm": 1.2485682964324951, + "learning_rate": 3.1029126213592234e-05, + "loss": 0.6235, + "step": 10750 + }, + { + "epoch": 1.8992145441708588, + "grad_norm": 1.9461857080459595, + "learning_rate": 3.1011473962930276e-05, + "loss": 0.6214, + "step": 10760 + }, + { + "epoch": 1.9009796134498278, + "grad_norm": 3.330052137374878, + "learning_rate": 3.099382171226832e-05, + "loss": 0.6129, + "step": 10770 + }, + { + "epoch": 1.9027446827287973, + "grad_norm": 0.7479764819145203, + "learning_rate": 3.097616946160636e-05, + "loss": 0.6273, + "step": 10780 + }, + { + "epoch": 1.9045097520077663, + "grad_norm": 0.8017953634262085, + "learning_rate": 3.09585172109444e-05, + "loss": 0.5989, + "step": 10790 + }, + { + "epoch": 1.9062748212867355, + "grad_norm": 2.009951114654541, + "learning_rate": 3.094086496028244e-05, + "loss": 0.5804, + "step": 10800 + }, + { + "epoch": 1.9080398905657048, + "grad_norm": 0.820184588432312, + "learning_rate": 3.092321270962048e-05, + "loss": 0.6231, + "step": 10810 + }, + { + "epoch": 1.9098049598446738, + "grad_norm": 2.120673656463623, + "learning_rate": 3.090556045895852e-05, + "loss": 0.5958, + "step": 10820 + }, + { + "epoch": 1.9115700291236433, + "grad_norm": 1.511352300643921, + "learning_rate": 3.0887908208296564e-05, + "loss": 0.5967, + "step": 10830 + }, + { + "epoch": 1.9133350984026123, + "grad_norm": 2.4746806621551514, + "learning_rate": 3.08702559576346e-05, + "loss": 0.584, + "step": 10840 + }, + { + "epoch": 1.9151001676815815, + "grad_norm": 0.655605673789978, + "learning_rate": 3.085260370697264e-05, + "loss": 0.606, + "step": 10850 + }, + { + "epoch": 1.9168652369605508, + "grad_norm": 0.8358765244483948, + "learning_rate": 3.083495145631068e-05, + "loss": 0.6417, + "step": 10860 + }, + { + "epoch": 1.9186303062395198, + "grad_norm": 0.9366624355316162, + "learning_rate": 3.081729920564872e-05, + "loss": 0.6077, + "step": 10870 + }, + { + "epoch": 1.9203953755184893, + "grad_norm": 0.7896908521652222, + "learning_rate": 3.079964695498676e-05, + "loss": 0.5887, + "step": 10880 + }, + { + "epoch": 1.9221604447974583, + "grad_norm": 1.6127266883850098, + "learning_rate": 3.0781994704324804e-05, + "loss": 0.6263, + "step": 10890 + }, + { + "epoch": 1.9239255140764275, + "grad_norm": 2.684758424758911, + "learning_rate": 3.0764342453662845e-05, + "loss": 0.6099, + "step": 10900 + }, + { + "epoch": 1.9256905833553968, + "grad_norm": 3.7798666954040527, + "learning_rate": 3.0746690203000886e-05, + "loss": 0.67, + "step": 10910 + }, + { + "epoch": 1.9274556526343658, + "grad_norm": 1.0354872941970825, + "learning_rate": 3.072903795233893e-05, + "loss": 0.5863, + "step": 10920 + }, + { + "epoch": 1.929220721913335, + "grad_norm": 0.7059375047683716, + "learning_rate": 3.071138570167696e-05, + "loss": 0.53, + "step": 10930 + }, + { + "epoch": 1.9309857911923043, + "grad_norm": 0.7127173542976379, + "learning_rate": 3.0693733451015e-05, + "loss": 0.5633, + "step": 10940 + }, + { + "epoch": 1.9327508604712735, + "grad_norm": 0.8398742079734802, + "learning_rate": 3.0676081200353044e-05, + "loss": 0.581, + "step": 10950 + }, + { + "epoch": 1.9345159297502428, + "grad_norm": 2.5236151218414307, + "learning_rate": 3.0658428949691085e-05, + "loss": 0.6473, + "step": 10960 + }, + { + "epoch": 1.9362809990292118, + "grad_norm": 0.8333614468574524, + "learning_rate": 3.0640776699029126e-05, + "loss": 0.6541, + "step": 10970 + }, + { + "epoch": 1.938046068308181, + "grad_norm": 0.7542189359664917, + "learning_rate": 3.062312444836717e-05, + "loss": 0.5332, + "step": 10980 + }, + { + "epoch": 1.9398111375871503, + "grad_norm": 1.0335618257522583, + "learning_rate": 3.060547219770521e-05, + "loss": 0.6618, + "step": 10990 + }, + { + "epoch": 1.9415762068661195, + "grad_norm": 2.821669816970825, + "learning_rate": 3.058781994704325e-05, + "loss": 0.5796, + "step": 11000 + }, + { + "epoch": 1.9415762068661195, + "eval_loss": 0.6470552682876587, + "eval_runtime": 591.6835, + "eval_samples_per_second": 47.875, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0005082583868723192, + "step": 11000 + }, + { + "epoch": 1.9433412761450888, + "grad_norm": 0.9215902090072632, + "learning_rate": 3.057016769638129e-05, + "loss": 0.6368, + "step": 11010 + }, + { + "epoch": 1.9451063454240578, + "grad_norm": 0.8010556101799011, + "learning_rate": 3.055251544571933e-05, + "loss": 0.5164, + "step": 11020 + }, + { + "epoch": 1.946871414703027, + "grad_norm": 2.074794054031372, + "learning_rate": 3.053486319505737e-05, + "loss": 0.5884, + "step": 11030 + }, + { + "epoch": 1.9486364839819963, + "grad_norm": 1.1630234718322754, + "learning_rate": 3.0517210944395414e-05, + "loss": 0.534, + "step": 11040 + }, + { + "epoch": 1.9504015532609655, + "grad_norm": 1.5974197387695312, + "learning_rate": 3.0499558693733455e-05, + "loss": 0.6164, + "step": 11050 + }, + { + "epoch": 1.9521666225399348, + "grad_norm": 3.0076427459716797, + "learning_rate": 3.0481906443071496e-05, + "loss": 0.6304, + "step": 11060 + }, + { + "epoch": 1.9539316918189038, + "grad_norm": 2.112062454223633, + "learning_rate": 3.0464254192409538e-05, + "loss": 0.5802, + "step": 11070 + }, + { + "epoch": 1.955696761097873, + "grad_norm": 3.1546525955200195, + "learning_rate": 3.0446601941747575e-05, + "loss": 0.6581, + "step": 11080 + }, + { + "epoch": 1.9574618303768423, + "grad_norm": 1.4764760732650757, + "learning_rate": 3.0428949691085616e-05, + "loss": 0.614, + "step": 11090 + }, + { + "epoch": 1.9592268996558115, + "grad_norm": 2.507228136062622, + "learning_rate": 3.0411297440423654e-05, + "loss": 0.5967, + "step": 11100 + }, + { + "epoch": 1.9609919689347808, + "grad_norm": 0.6856989860534668, + "learning_rate": 3.0393645189761692e-05, + "loss": 0.6435, + "step": 11110 + }, + { + "epoch": 1.9627570382137498, + "grad_norm": 1.0282143354415894, + "learning_rate": 3.0375992939099733e-05, + "loss": 0.6121, + "step": 11120 + }, + { + "epoch": 1.964522107492719, + "grad_norm": 2.7744762897491455, + "learning_rate": 3.0358340688437774e-05, + "loss": 0.6126, + "step": 11130 + }, + { + "epoch": 1.9662871767716883, + "grad_norm": 1.6380807161331177, + "learning_rate": 3.0340688437775815e-05, + "loss": 0.5215, + "step": 11140 + }, + { + "epoch": 1.9680522460506575, + "grad_norm": 1.3833621740341187, + "learning_rate": 3.0323036187113857e-05, + "loss": 0.5779, + "step": 11150 + }, + { + "epoch": 1.9698173153296268, + "grad_norm": 1.4948760271072388, + "learning_rate": 3.0305383936451898e-05, + "loss": 0.6363, + "step": 11160 + }, + { + "epoch": 1.9715823846085958, + "grad_norm": 1.3232519626617432, + "learning_rate": 3.028773168578994e-05, + "loss": 0.5343, + "step": 11170 + }, + { + "epoch": 1.973347453887565, + "grad_norm": 1.552229404449463, + "learning_rate": 3.027007943512798e-05, + "loss": 0.5917, + "step": 11180 + }, + { + "epoch": 1.9751125231665343, + "grad_norm": 0.6373162865638733, + "learning_rate": 3.025242718446602e-05, + "loss": 0.634, + "step": 11190 + }, + { + "epoch": 1.9768775924455035, + "grad_norm": 1.2558025121688843, + "learning_rate": 3.0234774933804062e-05, + "loss": 0.5806, + "step": 11200 + }, + { + "epoch": 1.9786426617244728, + "grad_norm": 1.1361700296401978, + "learning_rate": 3.02171226831421e-05, + "loss": 0.6575, + "step": 11210 + }, + { + "epoch": 1.9804077310034418, + "grad_norm": 0.8260495662689209, + "learning_rate": 3.019947043248014e-05, + "loss": 0.6143, + "step": 11220 + }, + { + "epoch": 1.982172800282411, + "grad_norm": 0.9218713045120239, + "learning_rate": 3.0181818181818182e-05, + "loss": 0.616, + "step": 11230 + }, + { + "epoch": 1.9839378695613803, + "grad_norm": 0.8892648816108704, + "learning_rate": 3.0164165931156224e-05, + "loss": 0.5788, + "step": 11240 + }, + { + "epoch": 1.9857029388403493, + "grad_norm": 1.1470361948013306, + "learning_rate": 3.0146513680494265e-05, + "loss": 0.5623, + "step": 11250 + }, + { + "epoch": 1.9874680081193188, + "grad_norm": 2.0089900493621826, + "learning_rate": 3.0128861429832306e-05, + "loss": 0.665, + "step": 11260 + }, + { + "epoch": 1.9892330773982878, + "grad_norm": 2.222125768661499, + "learning_rate": 3.0111209179170347e-05, + "loss": 0.6518, + "step": 11270 + }, + { + "epoch": 1.990998146677257, + "grad_norm": 0.7392178177833557, + "learning_rate": 3.0093556928508388e-05, + "loss": 0.6416, + "step": 11280 + }, + { + "epoch": 1.9927632159562263, + "grad_norm": 0.8574917912483215, + "learning_rate": 3.007590467784643e-05, + "loss": 0.5742, + "step": 11290 + }, + { + "epoch": 1.9945282852351953, + "grad_norm": 3.2676074504852295, + "learning_rate": 3.005825242718447e-05, + "loss": 0.5671, + "step": 11300 + }, + { + "epoch": 1.9962933545141648, + "grad_norm": 2.6135575771331787, + "learning_rate": 3.0040600176522508e-05, + "loss": 0.6729, + "step": 11310 + }, + { + "epoch": 1.9980584237931338, + "grad_norm": 1.1514455080032349, + "learning_rate": 3.002294792586055e-05, + "loss": 0.5945, + "step": 11320 + }, + { + "epoch": 1.999823493072103, + "grad_norm": 0.957177996635437, + "learning_rate": 3.000529567519859e-05, + "loss": 0.6641, + "step": 11330 + }, + { + "epoch": 2.0015885623510723, + "grad_norm": 1.1928191184997559, + "learning_rate": 2.998764342453663e-05, + "loss": 0.5776, + "step": 11340 + }, + { + "epoch": 2.0033536316300413, + "grad_norm": 0.9931208491325378, + "learning_rate": 2.9969991173874673e-05, + "loss": 0.5331, + "step": 11350 + }, + { + "epoch": 2.005118700909011, + "grad_norm": 0.9708819389343262, + "learning_rate": 2.9952338923212707e-05, + "loss": 0.466, + "step": 11360 + }, + { + "epoch": 2.00688377018798, + "grad_norm": 0.8825761675834656, + "learning_rate": 2.9934686672550748e-05, + "loss": 0.48, + "step": 11370 + }, + { + "epoch": 2.0086488394669493, + "grad_norm": 0.8337153792381287, + "learning_rate": 2.991703442188879e-05, + "loss": 0.4293, + "step": 11380 + }, + { + "epoch": 2.0104139087459183, + "grad_norm": 4.886162281036377, + "learning_rate": 2.989938217122683e-05, + "loss": 0.5525, + "step": 11390 + }, + { + "epoch": 2.0121789780248873, + "grad_norm": 0.7176535129547119, + "learning_rate": 2.9881729920564872e-05, + "loss": 0.5155, + "step": 11400 + }, + { + "epoch": 2.013944047303857, + "grad_norm": 1.8255113363265991, + "learning_rate": 2.9864077669902913e-05, + "loss": 0.5057, + "step": 11410 + }, + { + "epoch": 2.015709116582826, + "grad_norm": 1.6702226400375366, + "learning_rate": 2.9846425419240954e-05, + "loss": 0.4671, + "step": 11420 + }, + { + "epoch": 2.0174741858617953, + "grad_norm": 0.8029167652130127, + "learning_rate": 2.9828773168578995e-05, + "loss": 0.4813, + "step": 11430 + }, + { + "epoch": 2.0192392551407643, + "grad_norm": 3.2598228454589844, + "learning_rate": 2.9811120917917036e-05, + "loss": 0.4942, + "step": 11440 + }, + { + "epoch": 2.0210043244197333, + "grad_norm": 2.657938003540039, + "learning_rate": 2.9793468667255074e-05, + "loss": 0.4563, + "step": 11450 + }, + { + "epoch": 2.022769393698703, + "grad_norm": 2.4557859897613525, + "learning_rate": 2.9775816416593115e-05, + "loss": 0.4351, + "step": 11460 + }, + { + "epoch": 2.024534462977672, + "grad_norm": 2.4432296752929688, + "learning_rate": 2.9758164165931156e-05, + "loss": 0.4639, + "step": 11470 + }, + { + "epoch": 2.0262995322566413, + "grad_norm": 1.0158432722091675, + "learning_rate": 2.9740511915269197e-05, + "loss": 0.561, + "step": 11480 + }, + { + "epoch": 2.0280646015356103, + "grad_norm": 2.512036085128784, + "learning_rate": 2.972285966460724e-05, + "loss": 0.5906, + "step": 11490 + }, + { + "epoch": 2.0298296708145793, + "grad_norm": 1.3112326860427856, + "learning_rate": 2.970520741394528e-05, + "loss": 0.4741, + "step": 11500 + }, + { + "epoch": 2.031594740093549, + "grad_norm": 1.4458718299865723, + "learning_rate": 2.968755516328332e-05, + "loss": 0.522, + "step": 11510 + }, + { + "epoch": 2.033359809372518, + "grad_norm": 0.7055060267448425, + "learning_rate": 2.9669902912621362e-05, + "loss": 0.4711, + "step": 11520 + }, + { + "epoch": 2.0351248786514873, + "grad_norm": 2.6412127017974854, + "learning_rate": 2.9652250661959403e-05, + "loss": 0.4922, + "step": 11530 + }, + { + "epoch": 2.0368899479304563, + "grad_norm": 2.383212089538574, + "learning_rate": 2.9634598411297444e-05, + "loss": 0.5204, + "step": 11540 + }, + { + "epoch": 2.0386550172094253, + "grad_norm": 0.963070809841156, + "learning_rate": 2.9616946160635482e-05, + "loss": 0.4838, + "step": 11550 + }, + { + "epoch": 2.040420086488395, + "grad_norm": 1.1218928098678589, + "learning_rate": 2.9599293909973523e-05, + "loss": 0.504, + "step": 11560 + }, + { + "epoch": 2.042185155767364, + "grad_norm": 1.0899280309677124, + "learning_rate": 2.9581641659311564e-05, + "loss": 0.5522, + "step": 11570 + }, + { + "epoch": 2.0439502250463333, + "grad_norm": 1.1442947387695312, + "learning_rate": 2.9563989408649606e-05, + "loss": 0.5598, + "step": 11580 + }, + { + "epoch": 2.0457152943253023, + "grad_norm": 1.0687470436096191, + "learning_rate": 2.9546337157987647e-05, + "loss": 0.4778, + "step": 11590 + }, + { + "epoch": 2.0474803636042713, + "grad_norm": 1.081070899963379, + "learning_rate": 2.9528684907325688e-05, + "loss": 0.5483, + "step": 11600 + }, + { + "epoch": 2.049245432883241, + "grad_norm": 0.8817394375801086, + "learning_rate": 2.951103265666373e-05, + "loss": 0.5219, + "step": 11610 + }, + { + "epoch": 2.05101050216221, + "grad_norm": 0.8029956817626953, + "learning_rate": 2.949338040600177e-05, + "loss": 0.5053, + "step": 11620 + }, + { + "epoch": 2.0527755714411793, + "grad_norm": 0.783224880695343, + "learning_rate": 2.9475728155339804e-05, + "loss": 0.5109, + "step": 11630 + }, + { + "epoch": 2.0545406407201483, + "grad_norm": 0.7042371034622192, + "learning_rate": 2.9458075904677846e-05, + "loss": 0.5278, + "step": 11640 + }, + { + "epoch": 2.0563057099991173, + "grad_norm": 4.542788028717041, + "learning_rate": 2.9440423654015887e-05, + "loss": 0.5746, + "step": 11650 + }, + { + "epoch": 2.058070779278087, + "grad_norm": 0.9003473520278931, + "learning_rate": 2.9422771403353928e-05, + "loss": 0.4479, + "step": 11660 + }, + { + "epoch": 2.059835848557056, + "grad_norm": 3.547114372253418, + "learning_rate": 2.940511915269197e-05, + "loss": 0.6115, + "step": 11670 + }, + { + "epoch": 2.0616009178360253, + "grad_norm": 1.4339213371276855, + "learning_rate": 2.938746690203001e-05, + "loss": 0.4837, + "step": 11680 + }, + { + "epoch": 2.0633659871149943, + "grad_norm": 2.337256908416748, + "learning_rate": 2.9369814651368048e-05, + "loss": 0.4576, + "step": 11690 + }, + { + "epoch": 2.0651310563939633, + "grad_norm": 2.767146587371826, + "learning_rate": 2.935216240070609e-05, + "loss": 0.526, + "step": 11700 + }, + { + "epoch": 2.066896125672933, + "grad_norm": 1.023708701133728, + "learning_rate": 2.933451015004413e-05, + "loss": 0.4683, + "step": 11710 + }, + { + "epoch": 2.068661194951902, + "grad_norm": 0.7664569020271301, + "learning_rate": 2.931685789938217e-05, + "loss": 0.4743, + "step": 11720 + }, + { + "epoch": 2.0704262642308713, + "grad_norm": 2.275400400161743, + "learning_rate": 2.9299205648720213e-05, + "loss": 0.4842, + "step": 11730 + }, + { + "epoch": 2.0721913335098403, + "grad_norm": 0.7984259724617004, + "learning_rate": 2.9281553398058254e-05, + "loss": 0.4877, + "step": 11740 + }, + { + "epoch": 2.0739564027888093, + "grad_norm": 0.8254725337028503, + "learning_rate": 2.9263901147396295e-05, + "loss": 0.5589, + "step": 11750 + }, + { + "epoch": 2.075721472067779, + "grad_norm": 1.0608854293823242, + "learning_rate": 2.9246248896734336e-05, + "loss": 0.5527, + "step": 11760 + }, + { + "epoch": 2.077486541346748, + "grad_norm": 1.1715601682662964, + "learning_rate": 2.9228596646072377e-05, + "loss": 0.4665, + "step": 11770 + }, + { + "epoch": 2.0792516106257173, + "grad_norm": 1.4764703512191772, + "learning_rate": 2.9210944395410418e-05, + "loss": 0.4224, + "step": 11780 + }, + { + "epoch": 2.0810166799046863, + "grad_norm": 1.0137091875076294, + "learning_rate": 2.9193292144748456e-05, + "loss": 0.5108, + "step": 11790 + }, + { + "epoch": 2.0827817491836553, + "grad_norm": 2.340433359146118, + "learning_rate": 2.9175639894086497e-05, + "loss": 0.4594, + "step": 11800 + }, + { + "epoch": 2.084546818462625, + "grad_norm": 2.333954334259033, + "learning_rate": 2.915798764342454e-05, + "loss": 0.6047, + "step": 11810 + }, + { + "epoch": 2.086311887741594, + "grad_norm": 0.9210914969444275, + "learning_rate": 2.914033539276258e-05, + "loss": 0.4875, + "step": 11820 + }, + { + "epoch": 2.0880769570205633, + "grad_norm": 1.9261822700500488, + "learning_rate": 2.912268314210062e-05, + "loss": 0.5139, + "step": 11830 + }, + { + "epoch": 2.0898420262995323, + "grad_norm": 1.805763840675354, + "learning_rate": 2.9105030891438662e-05, + "loss": 0.5626, + "step": 11840 + }, + { + "epoch": 2.0916070955785013, + "grad_norm": 2.940823554992676, + "learning_rate": 2.9087378640776703e-05, + "loss": 0.5426, + "step": 11850 + }, + { + "epoch": 2.093372164857471, + "grad_norm": 1.9291481971740723, + "learning_rate": 2.9069726390114744e-05, + "loss": 0.5193, + "step": 11860 + }, + { + "epoch": 2.09513723413644, + "grad_norm": 1.1445153951644897, + "learning_rate": 2.9052074139452785e-05, + "loss": 0.5539, + "step": 11870 + }, + { + "epoch": 2.096902303415409, + "grad_norm": 2.1614785194396973, + "learning_rate": 2.9034421888790826e-05, + "loss": 0.5235, + "step": 11880 + }, + { + "epoch": 2.0986673726943783, + "grad_norm": 1.3679187297821045, + "learning_rate": 2.901676963812886e-05, + "loss": 0.5553, + "step": 11890 + }, + { + "epoch": 2.1004324419733473, + "grad_norm": 1.0410507917404175, + "learning_rate": 2.8999117387466902e-05, + "loss": 0.4773, + "step": 11900 + }, + { + "epoch": 2.102197511252317, + "grad_norm": 0.9981822967529297, + "learning_rate": 2.8981465136804943e-05, + "loss": 0.5068, + "step": 11910 + }, + { + "epoch": 2.103962580531286, + "grad_norm": 1.834923505783081, + "learning_rate": 2.8963812886142984e-05, + "loss": 0.5367, + "step": 11920 + }, + { + "epoch": 2.105727649810255, + "grad_norm": 1.878109335899353, + "learning_rate": 2.8946160635481022e-05, + "loss": 0.4888, + "step": 11930 + }, + { + "epoch": 2.1074927190892243, + "grad_norm": 0.9439142346382141, + "learning_rate": 2.8928508384819063e-05, + "loss": 0.5235, + "step": 11940 + }, + { + "epoch": 2.1092577883681933, + "grad_norm": 1.915279507637024, + "learning_rate": 2.8910856134157104e-05, + "loss": 0.5273, + "step": 11950 + }, + { + "epoch": 2.111022857647163, + "grad_norm": 2.888288974761963, + "learning_rate": 2.8893203883495145e-05, + "loss": 0.5289, + "step": 11960 + }, + { + "epoch": 2.112787926926132, + "grad_norm": 2.066969871520996, + "learning_rate": 2.8875551632833187e-05, + "loss": 0.4869, + "step": 11970 + }, + { + "epoch": 2.114552996205101, + "grad_norm": 2.3141443729400635, + "learning_rate": 2.8857899382171228e-05, + "loss": 0.5212, + "step": 11980 + }, + { + "epoch": 2.1163180654840703, + "grad_norm": 2.632960081100464, + "learning_rate": 2.884024713150927e-05, + "loss": 0.5814, + "step": 11990 + }, + { + "epoch": 2.1180831347630393, + "grad_norm": 0.9722346663475037, + "learning_rate": 2.882259488084731e-05, + "loss": 0.4128, + "step": 12000 + }, + { + "epoch": 2.1180831347630393, + "eval_loss": 0.654511570930481, + "eval_runtime": 591.8488, + "eval_samples_per_second": 47.862, + "eval_steps_per_second": 2.394, + "eval_token_accuracy": 0.0004901932924507392, + "step": 12000 + }, + { + "epoch": 2.119848204042009, + "grad_norm": 0.8732126951217651, + "learning_rate": 2.880494263018535e-05, + "loss": 0.4396, + "step": 12010 + }, + { + "epoch": 2.121613273320978, + "grad_norm": 3.0739586353302, + "learning_rate": 2.8787290379523392e-05, + "loss": 0.5427, + "step": 12020 + }, + { + "epoch": 2.123378342599947, + "grad_norm": 2.652486801147461, + "learning_rate": 2.876963812886143e-05, + "loss": 0.4744, + "step": 12030 + }, + { + "epoch": 2.1251434118789163, + "grad_norm": 1.4533450603485107, + "learning_rate": 2.875198587819947e-05, + "loss": 0.5421, + "step": 12040 + }, + { + "epoch": 2.1269084811578853, + "grad_norm": 0.821504533290863, + "learning_rate": 2.8734333627537512e-05, + "loss": 0.5684, + "step": 12050 + }, + { + "epoch": 2.128673550436855, + "grad_norm": 0.6715283989906311, + "learning_rate": 2.8716681376875553e-05, + "loss": 0.4973, + "step": 12060 + }, + { + "epoch": 2.130438619715824, + "grad_norm": 3.1312203407287598, + "learning_rate": 2.8699029126213595e-05, + "loss": 0.4808, + "step": 12070 + }, + { + "epoch": 2.132203688994793, + "grad_norm": 0.6378737688064575, + "learning_rate": 2.8681376875551636e-05, + "loss": 0.4798, + "step": 12080 + }, + { + "epoch": 2.1339687582737623, + "grad_norm": 1.9007421731948853, + "learning_rate": 2.8665489849955873e-05, + "loss": 0.5055, + "step": 12090 + }, + { + "epoch": 2.1357338275527313, + "grad_norm": 0.5962501168251038, + "learning_rate": 2.8647837599293915e-05, + "loss": 0.4686, + "step": 12100 + }, + { + "epoch": 2.137498896831701, + "grad_norm": 1.4856303930282593, + "learning_rate": 2.863018534863195e-05, + "loss": 0.4889, + "step": 12110 + }, + { + "epoch": 2.13926396611067, + "grad_norm": 0.873531699180603, + "learning_rate": 2.861253309796999e-05, + "loss": 0.4911, + "step": 12120 + }, + { + "epoch": 2.141029035389639, + "grad_norm": 2.513932228088379, + "learning_rate": 2.859488084730803e-05, + "loss": 0.5479, + "step": 12130 + }, + { + "epoch": 2.1427941046686083, + "grad_norm": 2.6930124759674072, + "learning_rate": 2.8577228596646072e-05, + "loss": 0.5069, + "step": 12140 + }, + { + "epoch": 2.1445591739475773, + "grad_norm": 1.6831334829330444, + "learning_rate": 2.8559576345984114e-05, + "loss": 0.4757, + "step": 12150 + }, + { + "epoch": 2.146324243226547, + "grad_norm": 2.069110155105591, + "learning_rate": 2.8541924095322155e-05, + "loss": 0.5105, + "step": 12160 + }, + { + "epoch": 2.148089312505516, + "grad_norm": 1.4201005697250366, + "learning_rate": 2.8524271844660196e-05, + "loss": 0.5031, + "step": 12170 + }, + { + "epoch": 2.149854381784485, + "grad_norm": 3.460890293121338, + "learning_rate": 2.8506619593998234e-05, + "loss": 0.5576, + "step": 12180 + }, + { + "epoch": 2.1516194510634543, + "grad_norm": 1.0495176315307617, + "learning_rate": 2.8488967343336275e-05, + "loss": 0.5344, + "step": 12190 + }, + { + "epoch": 2.1533845203424233, + "grad_norm": 0.7707245349884033, + "learning_rate": 2.8471315092674316e-05, + "loss": 0.4722, + "step": 12200 + }, + { + "epoch": 2.155149589621393, + "grad_norm": 1.064219355583191, + "learning_rate": 2.8453662842012357e-05, + "loss": 0.5478, + "step": 12210 + }, + { + "epoch": 2.156914658900362, + "grad_norm": 2.337414264678955, + "learning_rate": 2.8436010591350398e-05, + "loss": 0.5004, + "step": 12220 + }, + { + "epoch": 2.158679728179331, + "grad_norm": 0.9675964713096619, + "learning_rate": 2.841835834068844e-05, + "loss": 0.4763, + "step": 12230 + }, + { + "epoch": 2.1604447974583003, + "grad_norm": 1.2413973808288574, + "learning_rate": 2.840070609002648e-05, + "loss": 0.495, + "step": 12240 + }, + { + "epoch": 2.1622098667372693, + "grad_norm": 1.6234698295593262, + "learning_rate": 2.838305383936452e-05, + "loss": 0.4424, + "step": 12250 + }, + { + "epoch": 2.163974936016239, + "grad_norm": 2.0990002155303955, + "learning_rate": 2.8365401588702563e-05, + "loss": 0.5045, + "step": 12260 + }, + { + "epoch": 2.165740005295208, + "grad_norm": 0.6744937896728516, + "learning_rate": 2.8347749338040604e-05, + "loss": 0.4955, + "step": 12270 + }, + { + "epoch": 2.167505074574177, + "grad_norm": 1.7633548974990845, + "learning_rate": 2.833009708737864e-05, + "loss": 0.476, + "step": 12280 + }, + { + "epoch": 2.1692701438531463, + "grad_norm": 1.0914983749389648, + "learning_rate": 2.8312444836716683e-05, + "loss": 0.4602, + "step": 12290 + }, + { + "epoch": 2.1710352131321153, + "grad_norm": 1.7474454641342163, + "learning_rate": 2.8294792586054724e-05, + "loss": 0.5249, + "step": 12300 + }, + { + "epoch": 2.172800282411085, + "grad_norm": 1.0030485391616821, + "learning_rate": 2.8277140335392765e-05, + "loss": 0.5095, + "step": 12310 + }, + { + "epoch": 2.174565351690054, + "grad_norm": 1.8654377460479736, + "learning_rate": 2.8259488084730806e-05, + "loss": 0.4896, + "step": 12320 + }, + { + "epoch": 2.176330420969023, + "grad_norm": 1.0025534629821777, + "learning_rate": 2.8241835834068847e-05, + "loss": 0.493, + "step": 12330 + }, + { + "epoch": 2.1780954902479923, + "grad_norm": 2.6646952629089355, + "learning_rate": 2.822418358340689e-05, + "loss": 0.5012, + "step": 12340 + }, + { + "epoch": 2.1798605595269613, + "grad_norm": 2.589738130569458, + "learning_rate": 2.820653133274493e-05, + "loss": 0.5174, + "step": 12350 + }, + { + "epoch": 2.181625628805931, + "grad_norm": 1.0582941770553589, + "learning_rate": 2.818887908208297e-05, + "loss": 0.479, + "step": 12360 + }, + { + "epoch": 2.1833906980849, + "grad_norm": 1.1127334833145142, + "learning_rate": 2.8171226831421005e-05, + "loss": 0.5474, + "step": 12370 + }, + { + "epoch": 2.185155767363869, + "grad_norm": 1.8785793781280518, + "learning_rate": 2.8153574580759046e-05, + "loss": 0.4606, + "step": 12380 + }, + { + "epoch": 2.1869208366428383, + "grad_norm": 0.9331899285316467, + "learning_rate": 2.8135922330097087e-05, + "loss": 0.5505, + "step": 12390 + }, + { + "epoch": 2.1886859059218073, + "grad_norm": 2.787630081176758, + "learning_rate": 2.811827007943513e-05, + "loss": 0.5077, + "step": 12400 + }, + { + "epoch": 2.190450975200777, + "grad_norm": 1.945959448814392, + "learning_rate": 2.8100617828773166e-05, + "loss": 0.4737, + "step": 12410 + }, + { + "epoch": 2.192216044479746, + "grad_norm": 2.3678271770477295, + "learning_rate": 2.8082965578111208e-05, + "loss": 0.549, + "step": 12420 + }, + { + "epoch": 2.193981113758715, + "grad_norm": 1.0043656826019287, + "learning_rate": 2.806531332744925e-05, + "loss": 0.4822, + "step": 12430 + }, + { + "epoch": 2.1957461830376843, + "grad_norm": 0.9238103628158569, + "learning_rate": 2.804766107678729e-05, + "loss": 0.55, + "step": 12440 + }, + { + "epoch": 2.1975112523166533, + "grad_norm": 1.7914990186691284, + "learning_rate": 2.803000882612533e-05, + "loss": 0.4653, + "step": 12450 + }, + { + "epoch": 2.199276321595623, + "grad_norm": 2.3372256755828857, + "learning_rate": 2.8012356575463372e-05, + "loss": 0.5211, + "step": 12460 + }, + { + "epoch": 2.201041390874592, + "grad_norm": 2.7075140476226807, + "learning_rate": 2.7994704324801413e-05, + "loss": 0.5067, + "step": 12470 + }, + { + "epoch": 2.202806460153561, + "grad_norm": 1.2606568336486816, + "learning_rate": 2.7977052074139454e-05, + "loss": 0.5076, + "step": 12480 + }, + { + "epoch": 2.2045715294325303, + "grad_norm": 2.36053466796875, + "learning_rate": 2.7959399823477496e-05, + "loss": 0.4988, + "step": 12490 + }, + { + "epoch": 2.2063365987114993, + "grad_norm": 1.0246137380599976, + "learning_rate": 2.7941747572815537e-05, + "loss": 0.4638, + "step": 12500 + }, + { + "epoch": 2.208101667990469, + "grad_norm": 0.9959467649459839, + "learning_rate": 2.7924095322153574e-05, + "loss": 0.4979, + "step": 12510 + }, + { + "epoch": 2.209866737269438, + "grad_norm": 0.8300411701202393, + "learning_rate": 2.7906443071491616e-05, + "loss": 0.5047, + "step": 12520 + }, + { + "epoch": 2.211631806548407, + "grad_norm": 1.7269713878631592, + "learning_rate": 2.7888790820829657e-05, + "loss": 0.524, + "step": 12530 + }, + { + "epoch": 2.2133968758273763, + "grad_norm": 1.104152798652649, + "learning_rate": 2.7871138570167698e-05, + "loss": 0.5376, + "step": 12540 + }, + { + "epoch": 2.2151619451063453, + "grad_norm": 3.106315851211548, + "learning_rate": 2.785348631950574e-05, + "loss": 0.5384, + "step": 12550 + }, + { + "epoch": 2.216927014385315, + "grad_norm": 2.4172351360321045, + "learning_rate": 2.783583406884378e-05, + "loss": 0.5317, + "step": 12560 + }, + { + "epoch": 2.218692083664284, + "grad_norm": 1.0057008266448975, + "learning_rate": 2.781818181818182e-05, + "loss": 0.5518, + "step": 12570 + }, + { + "epoch": 2.220457152943253, + "grad_norm": 1.2210044860839844, + "learning_rate": 2.7800529567519862e-05, + "loss": 0.4703, + "step": 12580 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.9134182333946228, + "learning_rate": 2.7782877316857904e-05, + "loss": 0.4795, + "step": 12590 + }, + { + "epoch": 2.2239872915011913, + "grad_norm": 2.1350576877593994, + "learning_rate": 2.7765225066195945e-05, + "loss": 0.4975, + "step": 12600 + }, + { + "epoch": 2.225752360780161, + "grad_norm": 0.8448552489280701, + "learning_rate": 2.7747572815533986e-05, + "loss": 0.5284, + "step": 12610 + }, + { + "epoch": 2.22751743005913, + "grad_norm": 2.1405866146087646, + "learning_rate": 2.7729920564872024e-05, + "loss": 0.5373, + "step": 12620 + }, + { + "epoch": 2.229282499338099, + "grad_norm": 2.148709774017334, + "learning_rate": 2.7712268314210065e-05, + "loss": 0.4989, + "step": 12630 + }, + { + "epoch": 2.2310475686170683, + "grad_norm": 1.166394591331482, + "learning_rate": 2.7694616063548103e-05, + "loss": 0.5028, + "step": 12640 + }, + { + "epoch": 2.2328126378960373, + "grad_norm": 0.9031407237052917, + "learning_rate": 2.767696381288614e-05, + "loss": 0.5754, + "step": 12650 + }, + { + "epoch": 2.234577707175007, + "grad_norm": 0.9934903979301453, + "learning_rate": 2.765931156222418e-05, + "loss": 0.5072, + "step": 12660 + }, + { + "epoch": 2.236342776453976, + "grad_norm": 2.5429446697235107, + "learning_rate": 2.7641659311562223e-05, + "loss": 0.515, + "step": 12670 + }, + { + "epoch": 2.238107845732945, + "grad_norm": 2.769002676010132, + "learning_rate": 2.7624007060900264e-05, + "loss": 0.4504, + "step": 12680 + }, + { + "epoch": 2.2398729150119143, + "grad_norm": 0.9032069444656372, + "learning_rate": 2.7606354810238305e-05, + "loss": 0.5617, + "step": 12690 + }, + { + "epoch": 2.2416379842908833, + "grad_norm": 0.7437412142753601, + "learning_rate": 2.7588702559576346e-05, + "loss": 0.4948, + "step": 12700 + }, + { + "epoch": 2.243403053569853, + "grad_norm": 3.5754473209381104, + "learning_rate": 2.7571050308914387e-05, + "loss": 0.4823, + "step": 12710 + }, + { + "epoch": 2.245168122848822, + "grad_norm": 1.0576409101486206, + "learning_rate": 2.755339805825243e-05, + "loss": 0.4672, + "step": 12720 + }, + { + "epoch": 2.246933192127791, + "grad_norm": 1.3094158172607422, + "learning_rate": 2.753574580759047e-05, + "loss": 0.5177, + "step": 12730 + }, + { + "epoch": 2.2486982614067603, + "grad_norm": 0.7089628577232361, + "learning_rate": 2.751809355692851e-05, + "loss": 0.5172, + "step": 12740 + }, + { + "epoch": 2.2504633306857293, + "grad_norm": 1.1376948356628418, + "learning_rate": 2.750044130626655e-05, + "loss": 0.5558, + "step": 12750 + }, + { + "epoch": 2.2522283999646984, + "grad_norm": 1.1415077447891235, + "learning_rate": 2.748278905560459e-05, + "loss": 0.5039, + "step": 12760 + }, + { + "epoch": 2.253993469243668, + "grad_norm": 1.022479772567749, + "learning_rate": 2.746513680494263e-05, + "loss": 0.5461, + "step": 12770 + }, + { + "epoch": 2.255758538522637, + "grad_norm": 1.0342376232147217, + "learning_rate": 2.7447484554280672e-05, + "loss": 0.4556, + "step": 12780 + }, + { + "epoch": 2.2575236078016063, + "grad_norm": 1.3401023149490356, + "learning_rate": 2.7429832303618713e-05, + "loss": 0.5366, + "step": 12790 + }, + { + "epoch": 2.2592886770805753, + "grad_norm": 0.9429671764373779, + "learning_rate": 2.7412180052956754e-05, + "loss": 0.487, + "step": 12800 + }, + { + "epoch": 2.2610537463595444, + "grad_norm": 1.254345178604126, + "learning_rate": 2.7394527802294795e-05, + "loss": 0.588, + "step": 12810 + }, + { + "epoch": 2.262818815638514, + "grad_norm": 1.041446328163147, + "learning_rate": 2.7376875551632836e-05, + "loss": 0.5193, + "step": 12820 + }, + { + "epoch": 2.264583884917483, + "grad_norm": 3.8499042987823486, + "learning_rate": 2.7359223300970878e-05, + "loss": 0.5235, + "step": 12830 + }, + { + "epoch": 2.2663489541964523, + "grad_norm": 3.177006721496582, + "learning_rate": 2.734157105030892e-05, + "loss": 0.5519, + "step": 12840 + }, + { + "epoch": 2.2681140234754213, + "grad_norm": 1.080491065979004, + "learning_rate": 2.7323918799646956e-05, + "loss": 0.4556, + "step": 12850 + }, + { + "epoch": 2.2698790927543904, + "grad_norm": 0.6767769455909729, + "learning_rate": 2.7306266548984998e-05, + "loss": 0.5929, + "step": 12860 + }, + { + "epoch": 2.27164416203336, + "grad_norm": 3.2237303256988525, + "learning_rate": 2.728861429832304e-05, + "loss": 0.5824, + "step": 12870 + }, + { + "epoch": 2.273409231312329, + "grad_norm": 0.7552109956741333, + "learning_rate": 2.727096204766108e-05, + "loss": 0.5254, + "step": 12880 + }, + { + "epoch": 2.2751743005912983, + "grad_norm": 1.4033763408660889, + "learning_rate": 2.725330979699912e-05, + "loss": 0.5456, + "step": 12890 + }, + { + "epoch": 2.2769393698702673, + "grad_norm": 0.9239394068717957, + "learning_rate": 2.7235657546337155e-05, + "loss": 0.5253, + "step": 12900 + }, + { + "epoch": 2.2787044391492364, + "grad_norm": 2.7689387798309326, + "learning_rate": 2.7218005295675197e-05, + "loss": 0.4943, + "step": 12910 + }, + { + "epoch": 2.280469508428206, + "grad_norm": 0.866808295249939, + "learning_rate": 2.7200353045013238e-05, + "loss": 0.5595, + "step": 12920 + }, + { + "epoch": 2.282234577707175, + "grad_norm": 1.0920748710632324, + "learning_rate": 2.718270079435128e-05, + "loss": 0.5383, + "step": 12930 + }, + { + "epoch": 2.2839996469861443, + "grad_norm": 0.7788814306259155, + "learning_rate": 2.716504854368932e-05, + "loss": 0.4792, + "step": 12940 + }, + { + "epoch": 2.2857647162651133, + "grad_norm": 0.875015377998352, + "learning_rate": 2.714739629302736e-05, + "loss": 0.499, + "step": 12950 + }, + { + "epoch": 2.2875297855440824, + "grad_norm": 0.830392599105835, + "learning_rate": 2.7129744042365402e-05, + "loss": 0.4868, + "step": 12960 + }, + { + "epoch": 2.289294854823052, + "grad_norm": 0.8815006613731384, + "learning_rate": 2.7112091791703443e-05, + "loss": 0.4666, + "step": 12970 + }, + { + "epoch": 2.291059924102021, + "grad_norm": 0.8470727205276489, + "learning_rate": 2.7094439541041485e-05, + "loss": 0.5324, + "step": 12980 + }, + { + "epoch": 2.2928249933809903, + "grad_norm": 1.8830323219299316, + "learning_rate": 2.7076787290379522e-05, + "loss": 0.5445, + "step": 12990 + }, + { + "epoch": 2.2945900626599594, + "grad_norm": 0.8701585531234741, + "learning_rate": 2.7059135039717564e-05, + "loss": 0.5036, + "step": 13000 + }, + { + "epoch": 2.2945900626599594, + "eval_loss": 0.6412675976753235, + "eval_runtime": 591.7448, + "eval_samples_per_second": 47.87, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004934409498748434, + "step": 13000 + }, + { + "epoch": 2.2963551319389284, + "grad_norm": 0.9522472023963928, + "learning_rate": 2.7041482789055605e-05, + "loss": 0.4838, + "step": 13010 + }, + { + "epoch": 2.298120201217898, + "grad_norm": 2.1521029472351074, + "learning_rate": 2.7023830538393646e-05, + "loss": 0.473, + "step": 13020 + }, + { + "epoch": 2.299885270496867, + "grad_norm": 1.0218665599822998, + "learning_rate": 2.7006178287731687e-05, + "loss": 0.4596, + "step": 13030 + }, + { + "epoch": 2.3016503397758363, + "grad_norm": 0.8671874403953552, + "learning_rate": 2.6988526037069728e-05, + "loss": 0.5186, + "step": 13040 + }, + { + "epoch": 2.3034154090548054, + "grad_norm": 0.9349090456962585, + "learning_rate": 2.697087378640777e-05, + "loss": 0.5075, + "step": 13050 + }, + { + "epoch": 2.3051804783337744, + "grad_norm": 1.8751778602600098, + "learning_rate": 2.695322153574581e-05, + "loss": 0.5692, + "step": 13060 + }, + { + "epoch": 2.306945547612744, + "grad_norm": 1.0619752407073975, + "learning_rate": 2.693556928508385e-05, + "loss": 0.5053, + "step": 13070 + }, + { + "epoch": 2.308710616891713, + "grad_norm": 1.0086804628372192, + "learning_rate": 2.6917917034421893e-05, + "loss": 0.5665, + "step": 13080 + }, + { + "epoch": 2.3104756861706823, + "grad_norm": 1.0648972988128662, + "learning_rate": 2.690026478375993e-05, + "loss": 0.4463, + "step": 13090 + }, + { + "epoch": 2.3122407554496514, + "grad_norm": 1.061785340309143, + "learning_rate": 2.688261253309797e-05, + "loss": 0.5137, + "step": 13100 + }, + { + "epoch": 2.3140058247286204, + "grad_norm": 2.9344685077667236, + "learning_rate": 2.6864960282436013e-05, + "loss": 0.4788, + "step": 13110 + }, + { + "epoch": 2.31577089400759, + "grad_norm": 1.2300915718078613, + "learning_rate": 2.6847308031774054e-05, + "loss": 0.5406, + "step": 13120 + }, + { + "epoch": 2.317535963286559, + "grad_norm": 1.0264790058135986, + "learning_rate": 2.6829655781112095e-05, + "loss": 0.4655, + "step": 13130 + }, + { + "epoch": 2.3193010325655283, + "grad_norm": 2.2145140171051025, + "learning_rate": 2.6812003530450136e-05, + "loss": 0.5669, + "step": 13140 + }, + { + "epoch": 2.3210661018444974, + "grad_norm": 3.4140677452087402, + "learning_rate": 2.6794351279788177e-05, + "loss": 0.5037, + "step": 13150 + }, + { + "epoch": 2.3228311711234664, + "grad_norm": 0.9157199859619141, + "learning_rate": 2.677669902912622e-05, + "loss": 0.5114, + "step": 13160 + }, + { + "epoch": 2.324596240402436, + "grad_norm": 2.4897093772888184, + "learning_rate": 2.6759046778464253e-05, + "loss": 0.5387, + "step": 13170 + }, + { + "epoch": 2.326361309681405, + "grad_norm": 0.6659016013145447, + "learning_rate": 2.6741394527802294e-05, + "loss": 0.5019, + "step": 13180 + }, + { + "epoch": 2.3281263789603743, + "grad_norm": 1.978591799736023, + "learning_rate": 2.6723742277140335e-05, + "loss": 0.5258, + "step": 13190 + }, + { + "epoch": 2.3298914482393434, + "grad_norm": 0.8332827687263489, + "learning_rate": 2.6706090026478376e-05, + "loss": 0.5617, + "step": 13200 + }, + { + "epoch": 2.3316565175183124, + "grad_norm": 1.862585425376892, + "learning_rate": 2.6688437775816417e-05, + "loss": 0.4242, + "step": 13210 + }, + { + "epoch": 2.333421586797282, + "grad_norm": 2.829312801361084, + "learning_rate": 2.667078552515446e-05, + "loss": 0.4638, + "step": 13220 + }, + { + "epoch": 2.335186656076251, + "grad_norm": 0.7040713429450989, + "learning_rate": 2.6653133274492496e-05, + "loss": 0.4604, + "step": 13230 + }, + { + "epoch": 2.3369517253552203, + "grad_norm": 2.7348663806915283, + "learning_rate": 2.6635481023830537e-05, + "loss": 0.5251, + "step": 13240 + }, + { + "epoch": 2.3387167946341894, + "grad_norm": 1.9702792167663574, + "learning_rate": 2.661782877316858e-05, + "loss": 0.507, + "step": 13250 + }, + { + "epoch": 2.3404818639131584, + "grad_norm": 0.856442928314209, + "learning_rate": 2.660017652250662e-05, + "loss": 0.5338, + "step": 13260 + }, + { + "epoch": 2.342246933192128, + "grad_norm": 2.1334874629974365, + "learning_rate": 2.658252427184466e-05, + "loss": 0.5276, + "step": 13270 + }, + { + "epoch": 2.344012002471097, + "grad_norm": 0.8047542572021484, + "learning_rate": 2.6564872021182702e-05, + "loss": 0.533, + "step": 13280 + }, + { + "epoch": 2.3457770717500663, + "grad_norm": 1.1873878240585327, + "learning_rate": 2.6547219770520743e-05, + "loss": 0.5194, + "step": 13290 + }, + { + "epoch": 2.3475421410290354, + "grad_norm": 1.130898356437683, + "learning_rate": 2.6529567519858784e-05, + "loss": 0.4911, + "step": 13300 + }, + { + "epoch": 2.3493072103080044, + "grad_norm": 0.7969009280204773, + "learning_rate": 2.6511915269196825e-05, + "loss": 0.4655, + "step": 13310 + }, + { + "epoch": 2.351072279586974, + "grad_norm": 0.8343009948730469, + "learning_rate": 2.6494263018534867e-05, + "loss": 0.4632, + "step": 13320 + }, + { + "epoch": 2.352837348865943, + "grad_norm": 2.038909673690796, + "learning_rate": 2.6476610767872904e-05, + "loss": 0.468, + "step": 13330 + }, + { + "epoch": 2.3546024181449123, + "grad_norm": 3.487717628479004, + "learning_rate": 2.6458958517210946e-05, + "loss": 0.4837, + "step": 13340 + }, + { + "epoch": 2.3563674874238814, + "grad_norm": 3.3392558097839355, + "learning_rate": 2.6441306266548987e-05, + "loss": 0.4344, + "step": 13350 + }, + { + "epoch": 2.3581325567028504, + "grad_norm": 2.897871971130371, + "learning_rate": 2.6423654015887028e-05, + "loss": 0.5185, + "step": 13360 + }, + { + "epoch": 2.35989762598182, + "grad_norm": 2.5428342819213867, + "learning_rate": 2.640600176522507e-05, + "loss": 0.5117, + "step": 13370 + }, + { + "epoch": 2.361662695260789, + "grad_norm": 1.0503923892974854, + "learning_rate": 2.638834951456311e-05, + "loss": 0.4753, + "step": 13380 + }, + { + "epoch": 2.3634277645397583, + "grad_norm": 2.5742833614349365, + "learning_rate": 2.637069726390115e-05, + "loss": 0.5126, + "step": 13390 + }, + { + "epoch": 2.3651928338187274, + "grad_norm": 3.079458475112915, + "learning_rate": 2.6353045013239192e-05, + "loss": 0.4613, + "step": 13400 + }, + { + "epoch": 2.3669579030976964, + "grad_norm": 0.7799136638641357, + "learning_rate": 2.6335392762577234e-05, + "loss": 0.4927, + "step": 13410 + }, + { + "epoch": 2.368722972376666, + "grad_norm": 2.216231346130371, + "learning_rate": 2.6317740511915275e-05, + "loss": 0.5037, + "step": 13420 + }, + { + "epoch": 2.370488041655635, + "grad_norm": 1.005723476409912, + "learning_rate": 2.630008826125331e-05, + "loss": 0.5267, + "step": 13430 + }, + { + "epoch": 2.3722531109346043, + "grad_norm": 0.884051501750946, + "learning_rate": 2.628243601059135e-05, + "loss": 0.4745, + "step": 13440 + }, + { + "epoch": 2.3740181802135734, + "grad_norm": 1.1464614868164062, + "learning_rate": 2.626478375992939e-05, + "loss": 0.4789, + "step": 13450 + }, + { + "epoch": 2.3757832494925424, + "grad_norm": 1.0192292928695679, + "learning_rate": 2.624713150926743e-05, + "loss": 0.4834, + "step": 13460 + }, + { + "epoch": 2.377548318771512, + "grad_norm": 3.1154890060424805, + "learning_rate": 2.622947925860547e-05, + "loss": 0.5096, + "step": 13470 + }, + { + "epoch": 2.379313388050481, + "grad_norm": 3.1223933696746826, + "learning_rate": 2.621182700794351e-05, + "loss": 0.4817, + "step": 13480 + }, + { + "epoch": 2.3810784573294503, + "grad_norm": 1.5079442262649536, + "learning_rate": 2.6194174757281553e-05, + "loss": 0.4582, + "step": 13490 + }, + { + "epoch": 2.3828435266084194, + "grad_norm": 2.5750479698181152, + "learning_rate": 2.6176522506619594e-05, + "loss": 0.5411, + "step": 13500 + }, + { + "epoch": 2.3846085958873884, + "grad_norm": 2.674363613128662, + "learning_rate": 2.6158870255957635e-05, + "loss": 0.489, + "step": 13510 + }, + { + "epoch": 2.386373665166358, + "grad_norm": 2.6622684001922607, + "learning_rate": 2.6141218005295676e-05, + "loss": 0.5215, + "step": 13520 + }, + { + "epoch": 2.388138734445327, + "grad_norm": 1.0257827043533325, + "learning_rate": 2.6123565754633717e-05, + "loss": 0.4981, + "step": 13530 + }, + { + "epoch": 2.3899038037242963, + "grad_norm": 1.8958207368850708, + "learning_rate": 2.6105913503971758e-05, + "loss": 0.4808, + "step": 13540 + }, + { + "epoch": 2.3916688730032654, + "grad_norm": 0.9230754375457764, + "learning_rate": 2.60882612533098e-05, + "loss": 0.6154, + "step": 13550 + }, + { + "epoch": 2.3934339422822344, + "grad_norm": 3.1750411987304688, + "learning_rate": 2.6070609002647837e-05, + "loss": 0.5281, + "step": 13560 + }, + { + "epoch": 2.395199011561204, + "grad_norm": 0.994310200214386, + "learning_rate": 2.605295675198588e-05, + "loss": 0.5862, + "step": 13570 + }, + { + "epoch": 2.396964080840173, + "grad_norm": 1.904697060585022, + "learning_rate": 2.603530450132392e-05, + "loss": 0.5261, + "step": 13580 + }, + { + "epoch": 2.3987291501191423, + "grad_norm": 0.8757647275924683, + "learning_rate": 2.601765225066196e-05, + "loss": 0.5154, + "step": 13590 + }, + { + "epoch": 2.4004942193981114, + "grad_norm": 2.1605241298675537, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.4573, + "step": 13600 + }, + { + "epoch": 2.4022592886770804, + "grad_norm": 1.7989857196807861, + "learning_rate": 2.5982347749338043e-05, + "loss": 0.4879, + "step": 13610 + }, + { + "epoch": 2.40402435795605, + "grad_norm": 2.11667799949646, + "learning_rate": 2.5964695498676084e-05, + "loss": 0.4869, + "step": 13620 + }, + { + "epoch": 2.405789427235019, + "grad_norm": 3.442117929458618, + "learning_rate": 2.5947043248014125e-05, + "loss": 0.5279, + "step": 13630 + }, + { + "epoch": 2.4075544965139883, + "grad_norm": 1.5708115100860596, + "learning_rate": 2.5929390997352166e-05, + "loss": 0.4816, + "step": 13640 + }, + { + "epoch": 2.4093195657929574, + "grad_norm": 1.3552768230438232, + "learning_rate": 2.5911738746690208e-05, + "loss": 0.564, + "step": 13650 + }, + { + "epoch": 2.4110846350719264, + "grad_norm": 0.8411927223205566, + "learning_rate": 2.589408649602825e-05, + "loss": 0.401, + "step": 13660 + }, + { + "epoch": 2.412849704350896, + "grad_norm": 3.806356906890869, + "learning_rate": 2.5876434245366286e-05, + "loss": 0.4354, + "step": 13670 + }, + { + "epoch": 2.414614773629865, + "grad_norm": 2.0317461490631104, + "learning_rate": 2.5858781994704328e-05, + "loss": 0.4978, + "step": 13680 + }, + { + "epoch": 2.4163798429088343, + "grad_norm": 2.2540907859802246, + "learning_rate": 2.584112974404237e-05, + "loss": 0.4289, + "step": 13690 + }, + { + "epoch": 2.4181449121878034, + "grad_norm": 0.8400912880897522, + "learning_rate": 2.5823477493380403e-05, + "loss": 0.4892, + "step": 13700 + }, + { + "epoch": 2.4199099814667724, + "grad_norm": 0.6846358180046082, + "learning_rate": 2.5805825242718444e-05, + "loss": 0.5721, + "step": 13710 + }, + { + "epoch": 2.421675050745742, + "grad_norm": 0.9299115538597107, + "learning_rate": 2.5788172992056485e-05, + "loss": 0.5353, + "step": 13720 + }, + { + "epoch": 2.423440120024711, + "grad_norm": 1.3329682350158691, + "learning_rate": 2.5770520741394527e-05, + "loss": 0.5371, + "step": 13730 + }, + { + "epoch": 2.4252051893036803, + "grad_norm": 1.0928109884262085, + "learning_rate": 2.5752868490732568e-05, + "loss": 0.5186, + "step": 13740 + }, + { + "epoch": 2.4269702585826494, + "grad_norm": 0.9928961396217346, + "learning_rate": 2.573521624007061e-05, + "loss": 0.5265, + "step": 13750 + }, + { + "epoch": 2.4287353278616184, + "grad_norm": 2.193777322769165, + "learning_rate": 2.571756398940865e-05, + "loss": 0.572, + "step": 13760 + }, + { + "epoch": 2.430500397140588, + "grad_norm": 1.8867632150650024, + "learning_rate": 2.569991173874669e-05, + "loss": 0.4702, + "step": 13770 + }, + { + "epoch": 2.432265466419557, + "grad_norm": 3.6797149181365967, + "learning_rate": 2.5682259488084732e-05, + "loss": 0.5486, + "step": 13780 + }, + { + "epoch": 2.4340305356985263, + "grad_norm": 0.9218922257423401, + "learning_rate": 2.5664607237422773e-05, + "loss": 0.5172, + "step": 13790 + }, + { + "epoch": 2.4357956049774954, + "grad_norm": 0.8286840319633484, + "learning_rate": 2.564695498676081e-05, + "loss": 0.5309, + "step": 13800 + }, + { + "epoch": 2.4375606742564644, + "grad_norm": 2.5113284587860107, + "learning_rate": 2.5629302736098852e-05, + "loss": 0.5122, + "step": 13810 + }, + { + "epoch": 2.439325743535434, + "grad_norm": 4.121739864349365, + "learning_rate": 2.5611650485436893e-05, + "loss": 0.5529, + "step": 13820 + }, + { + "epoch": 2.441090812814403, + "grad_norm": 0.8111094236373901, + "learning_rate": 2.5593998234774935e-05, + "loss": 0.5267, + "step": 13830 + }, + { + "epoch": 2.4428558820933723, + "grad_norm": 3.327519178390503, + "learning_rate": 2.5576345984112976e-05, + "loss": 0.4871, + "step": 13840 + }, + { + "epoch": 2.4446209513723414, + "grad_norm": 3.18033766746521, + "learning_rate": 2.5558693733451017e-05, + "loss": 0.4858, + "step": 13850 + }, + { + "epoch": 2.4463860206513104, + "grad_norm": 2.803899049758911, + "learning_rate": 2.5541041482789058e-05, + "loss": 0.5272, + "step": 13860 + }, + { + "epoch": 2.44815108993028, + "grad_norm": 0.9279343485832214, + "learning_rate": 2.55233892321271e-05, + "loss": 0.4904, + "step": 13870 + }, + { + "epoch": 2.449916159209249, + "grad_norm": 1.1356558799743652, + "learning_rate": 2.550573698146514e-05, + "loss": 0.4783, + "step": 13880 + }, + { + "epoch": 2.4516812284882183, + "grad_norm": 2.152723550796509, + "learning_rate": 2.548808473080318e-05, + "loss": 0.5241, + "step": 13890 + }, + { + "epoch": 2.4534462977671874, + "grad_norm": 0.8582736253738403, + "learning_rate": 2.547043248014122e-05, + "loss": 0.4888, + "step": 13900 + }, + { + "epoch": 2.4552113670461564, + "grad_norm": 0.7155855894088745, + "learning_rate": 2.545278022947926e-05, + "loss": 0.5235, + "step": 13910 + }, + { + "epoch": 2.456976436325126, + "grad_norm": 1.35896897315979, + "learning_rate": 2.54351279788173e-05, + "loss": 0.5206, + "step": 13920 + }, + { + "epoch": 2.458741505604095, + "grad_norm": 1.0848078727722168, + "learning_rate": 2.5417475728155343e-05, + "loss": 0.5047, + "step": 13930 + }, + { + "epoch": 2.4605065748830643, + "grad_norm": 2.7077338695526123, + "learning_rate": 2.5399823477493384e-05, + "loss": 0.5035, + "step": 13940 + }, + { + "epoch": 2.4622716441620334, + "grad_norm": 2.3435027599334717, + "learning_rate": 2.5382171226831425e-05, + "loss": 0.4596, + "step": 13950 + }, + { + "epoch": 2.4640367134410024, + "grad_norm": 1.2936254739761353, + "learning_rate": 2.536451897616946e-05, + "loss": 0.5803, + "step": 13960 + }, + { + "epoch": 2.465801782719972, + "grad_norm": 0.7503966093063354, + "learning_rate": 2.53468667255075e-05, + "loss": 0.4078, + "step": 13970 + }, + { + "epoch": 2.467566851998941, + "grad_norm": 3.8789782524108887, + "learning_rate": 2.532921447484554e-05, + "loss": 0.4721, + "step": 13980 + }, + { + "epoch": 2.4693319212779103, + "grad_norm": 3.136807441711426, + "learning_rate": 2.5311562224183583e-05, + "loss": 0.5074, + "step": 13990 + }, + { + "epoch": 2.4710969905568794, + "grad_norm": 1.8736839294433594, + "learning_rate": 2.5293909973521624e-05, + "loss": 0.5167, + "step": 14000 + }, + { + "epoch": 2.4710969905568794, + "eval_loss": 0.6444569826126099, + "eval_runtime": 591.67, + "eval_samples_per_second": 47.876, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.00049364392846385, + "step": 14000 + }, + { + "epoch": 2.4728620598358484, + "grad_norm": 2.7958600521087646, + "learning_rate": 2.5276257722859665e-05, + "loss": 0.4353, + "step": 14010 + }, + { + "epoch": 2.474627129114818, + "grad_norm": 1.9591412544250488, + "learning_rate": 2.5258605472197706e-05, + "loss": 0.4061, + "step": 14020 + }, + { + "epoch": 2.476392198393787, + "grad_norm": 0.845340371131897, + "learning_rate": 2.5240953221535747e-05, + "loss": 0.4764, + "step": 14030 + }, + { + "epoch": 2.4781572676727563, + "grad_norm": 2.308152198791504, + "learning_rate": 2.5223300970873785e-05, + "loss": 0.555, + "step": 14040 + }, + { + "epoch": 2.4799223369517254, + "grad_norm": 1.7259413003921509, + "learning_rate": 2.5205648720211826e-05, + "loss": 0.4994, + "step": 14050 + }, + { + "epoch": 2.4816874062306944, + "grad_norm": 0.7561100721359253, + "learning_rate": 2.5187996469549867e-05, + "loss": 0.4588, + "step": 14060 + }, + { + "epoch": 2.483452475509664, + "grad_norm": 0.8931876420974731, + "learning_rate": 2.517034421888791e-05, + "loss": 0.4509, + "step": 14070 + }, + { + "epoch": 2.485217544788633, + "grad_norm": 0.8635971546173096, + "learning_rate": 2.515269196822595e-05, + "loss": 0.5047, + "step": 14080 + }, + { + "epoch": 2.4869826140676023, + "grad_norm": 1.0457156896591187, + "learning_rate": 2.513503971756399e-05, + "loss": 0.5713, + "step": 14090 + }, + { + "epoch": 2.4887476833465714, + "grad_norm": 0.7360007762908936, + "learning_rate": 2.5117387466902032e-05, + "loss": 0.4962, + "step": 14100 + }, + { + "epoch": 2.4905127526255404, + "grad_norm": 1.2129534482955933, + "learning_rate": 2.5099735216240073e-05, + "loss": 0.5531, + "step": 14110 + }, + { + "epoch": 2.49227782190451, + "grad_norm": 1.0140886306762695, + "learning_rate": 2.5082082965578114e-05, + "loss": 0.5137, + "step": 14120 + }, + { + "epoch": 2.494042891183479, + "grad_norm": 3.158255100250244, + "learning_rate": 2.5064430714916155e-05, + "loss": 0.524, + "step": 14130 + }, + { + "epoch": 2.4958079604624483, + "grad_norm": 1.9043729305267334, + "learning_rate": 2.5048543689320393e-05, + "loss": 0.5126, + "step": 14140 + }, + { + "epoch": 2.4975730297414174, + "grad_norm": 0.8182691335678101, + "learning_rate": 2.503089143865843e-05, + "loss": 0.4468, + "step": 14150 + }, + { + "epoch": 2.4993380990203864, + "grad_norm": 2.1772260665893555, + "learning_rate": 2.5013239187996472e-05, + "loss": 0.502, + "step": 14160 + }, + { + "epoch": 2.501103168299356, + "grad_norm": 0.8185103535652161, + "learning_rate": 2.499558693733451e-05, + "loss": 0.5546, + "step": 14170 + }, + { + "epoch": 2.502868237578325, + "grad_norm": 1.6303755044937134, + "learning_rate": 2.497793468667255e-05, + "loss": 0.4519, + "step": 14180 + }, + { + "epoch": 2.5046333068572944, + "grad_norm": 2.8604867458343506, + "learning_rate": 2.4960282436010592e-05, + "loss": 0.5637, + "step": 14190 + }, + { + "epoch": 2.5063983761362634, + "grad_norm": 3.644176721572876, + "learning_rate": 2.4942630185348633e-05, + "loss": 0.5599, + "step": 14200 + }, + { + "epoch": 2.5081634454152324, + "grad_norm": 0.834730863571167, + "learning_rate": 2.4924977934686674e-05, + "loss": 0.4952, + "step": 14210 + }, + { + "epoch": 2.509928514694202, + "grad_norm": 1.0383777618408203, + "learning_rate": 2.4907325684024716e-05, + "loss": 0.5917, + "step": 14220 + }, + { + "epoch": 2.511693583973171, + "grad_norm": 0.9282052516937256, + "learning_rate": 2.4889673433362757e-05, + "loss": 0.4442, + "step": 14230 + }, + { + "epoch": 2.5134586532521404, + "grad_norm": 0.8705268502235413, + "learning_rate": 2.4872021182700794e-05, + "loss": 0.5424, + "step": 14240 + }, + { + "epoch": 2.5152237225311094, + "grad_norm": 0.9563567638397217, + "learning_rate": 2.4854368932038836e-05, + "loss": 0.4315, + "step": 14250 + }, + { + "epoch": 2.5169887918100784, + "grad_norm": 3.457864999771118, + "learning_rate": 2.4836716681376877e-05, + "loss": 0.4982, + "step": 14260 + }, + { + "epoch": 2.518753861089048, + "grad_norm": 0.9856398105621338, + "learning_rate": 2.4819064430714918e-05, + "loss": 0.4919, + "step": 14270 + }, + { + "epoch": 2.520518930368017, + "grad_norm": 0.7606726288795471, + "learning_rate": 2.480141218005296e-05, + "loss": 0.4871, + "step": 14280 + }, + { + "epoch": 2.5222839996469864, + "grad_norm": 0.8468009233474731, + "learning_rate": 2.4783759929390997e-05, + "loss": 0.5633, + "step": 14290 + }, + { + "epoch": 2.5240490689259554, + "grad_norm": 0.9710422158241272, + "learning_rate": 2.4766107678729038e-05, + "loss": 0.5282, + "step": 14300 + }, + { + "epoch": 2.5258141382049244, + "grad_norm": 3.3125197887420654, + "learning_rate": 2.474845542806708e-05, + "loss": 0.5101, + "step": 14310 + }, + { + "epoch": 2.527579207483894, + "grad_norm": 3.0758326053619385, + "learning_rate": 2.473080317740512e-05, + "loss": 0.5018, + "step": 14320 + }, + { + "epoch": 2.529344276762863, + "grad_norm": 1.0053824186325073, + "learning_rate": 2.471315092674316e-05, + "loss": 0.5137, + "step": 14330 + }, + { + "epoch": 2.5311093460418324, + "grad_norm": 4.004051685333252, + "learning_rate": 2.4695498676081202e-05, + "loss": 0.5151, + "step": 14340 + }, + { + "epoch": 2.5328744153208014, + "grad_norm": 0.7134153246879578, + "learning_rate": 2.4677846425419244e-05, + "loss": 0.5042, + "step": 14350 + }, + { + "epoch": 2.5346394845997704, + "grad_norm": 3.08558988571167, + "learning_rate": 2.4660194174757285e-05, + "loss": 0.5112, + "step": 14360 + }, + { + "epoch": 2.53640455387874, + "grad_norm": 0.9656035304069519, + "learning_rate": 2.4642541924095323e-05, + "loss": 0.5184, + "step": 14370 + }, + { + "epoch": 2.538169623157709, + "grad_norm": 0.7227429747581482, + "learning_rate": 2.4624889673433364e-05, + "loss": 0.4754, + "step": 14380 + }, + { + "epoch": 2.5399346924366784, + "grad_norm": 1.0796962976455688, + "learning_rate": 2.4607237422771405e-05, + "loss": 0.5395, + "step": 14390 + }, + { + "epoch": 2.5416997617156474, + "grad_norm": 1.065271019935608, + "learning_rate": 2.4589585172109446e-05, + "loss": 0.5096, + "step": 14400 + }, + { + "epoch": 2.5434648309946164, + "grad_norm": 1.0752772092819214, + "learning_rate": 2.4571932921447484e-05, + "loss": 0.554, + "step": 14410 + }, + { + "epoch": 2.545229900273586, + "grad_norm": 3.272254228591919, + "learning_rate": 2.4554280670785525e-05, + "loss": 0.4955, + "step": 14420 + }, + { + "epoch": 2.546994969552555, + "grad_norm": 2.887012481689453, + "learning_rate": 2.4536628420123566e-05, + "loss": 0.4783, + "step": 14430 + }, + { + "epoch": 2.5487600388315244, + "grad_norm": 1.104333758354187, + "learning_rate": 2.4518976169461607e-05, + "loss": 0.5279, + "step": 14440 + }, + { + "epoch": 2.5505251081104934, + "grad_norm": 3.4142651557922363, + "learning_rate": 2.450132391879965e-05, + "loss": 0.4985, + "step": 14450 + }, + { + "epoch": 2.5522901773894624, + "grad_norm": 1.1748323440551758, + "learning_rate": 2.448367166813769e-05, + "loss": 0.6331, + "step": 14460 + }, + { + "epoch": 2.554055246668432, + "grad_norm": 0.7295975089073181, + "learning_rate": 2.446601941747573e-05, + "loss": 0.5106, + "step": 14470 + }, + { + "epoch": 2.555820315947401, + "grad_norm": 3.932593584060669, + "learning_rate": 2.4448367166813772e-05, + "loss": 0.467, + "step": 14480 + }, + { + "epoch": 2.5575853852263704, + "grad_norm": 1.1309473514556885, + "learning_rate": 2.4430714916151813e-05, + "loss": 0.5545, + "step": 14490 + }, + { + "epoch": 2.5593504545053394, + "grad_norm": 2.504162073135376, + "learning_rate": 2.441306266548985e-05, + "loss": 0.4719, + "step": 14500 + }, + { + "epoch": 2.5611155237843084, + "grad_norm": 1.8151975870132446, + "learning_rate": 2.4395410414827892e-05, + "loss": 0.4756, + "step": 14510 + }, + { + "epoch": 2.562880593063278, + "grad_norm": 0.7189649343490601, + "learning_rate": 2.4377758164165933e-05, + "loss": 0.4718, + "step": 14520 + }, + { + "epoch": 2.564645662342247, + "grad_norm": 2.3369531631469727, + "learning_rate": 2.436010591350397e-05, + "loss": 0.4687, + "step": 14530 + }, + { + "epoch": 2.5664107316212164, + "grad_norm": 0.9797418117523193, + "learning_rate": 2.4342453662842012e-05, + "loss": 0.4821, + "step": 14540 + }, + { + "epoch": 2.5681758009001854, + "grad_norm": 1.5502965450286865, + "learning_rate": 2.4324801412180053e-05, + "loss": 0.5433, + "step": 14550 + }, + { + "epoch": 2.5699408701791544, + "grad_norm": 2.704186201095581, + "learning_rate": 2.4307149161518094e-05, + "loss": 0.5147, + "step": 14560 + }, + { + "epoch": 2.571705939458124, + "grad_norm": 0.8617478013038635, + "learning_rate": 2.4289496910856135e-05, + "loss": 0.491, + "step": 14570 + }, + { + "epoch": 2.573471008737093, + "grad_norm": 3.2564940452575684, + "learning_rate": 2.4271844660194176e-05, + "loss": 0.4789, + "step": 14580 + }, + { + "epoch": 2.5752360780160624, + "grad_norm": 3.6057090759277344, + "learning_rate": 2.4254192409532218e-05, + "loss": 0.5254, + "step": 14590 + }, + { + "epoch": 2.5770011472950314, + "grad_norm": 3.0366363525390625, + "learning_rate": 2.423654015887026e-05, + "loss": 0.4894, + "step": 14600 + }, + { + "epoch": 2.5787662165740004, + "grad_norm": 2.611250877380371, + "learning_rate": 2.42188879082083e-05, + "loss": 0.5071, + "step": 14610 + }, + { + "epoch": 2.58053128585297, + "grad_norm": 0.9398654103279114, + "learning_rate": 2.420123565754634e-05, + "loss": 0.4138, + "step": 14620 + }, + { + "epoch": 2.582296355131939, + "grad_norm": 1.245468258857727, + "learning_rate": 2.418358340688438e-05, + "loss": 0.6041, + "step": 14630 + }, + { + "epoch": 2.5840614244109084, + "grad_norm": 2.410236120223999, + "learning_rate": 2.416593115622242e-05, + "loss": 0.5736, + "step": 14640 + }, + { + "epoch": 2.5858264936898774, + "grad_norm": 2.7321484088897705, + "learning_rate": 2.4148278905560458e-05, + "loss": 0.4715, + "step": 14650 + }, + { + "epoch": 2.5875915629688464, + "grad_norm": 1.2167099714279175, + "learning_rate": 2.41306266548985e-05, + "loss": 0.5238, + "step": 14660 + }, + { + "epoch": 2.589356632247816, + "grad_norm": 0.7984771132469177, + "learning_rate": 2.411297440423654e-05, + "loss": 0.4759, + "step": 14670 + }, + { + "epoch": 2.591121701526785, + "grad_norm": 2.0809378623962402, + "learning_rate": 2.409532215357458e-05, + "loss": 0.4867, + "step": 14680 + }, + { + "epoch": 2.5928867708057544, + "grad_norm": 1.0306892395019531, + "learning_rate": 2.4077669902912622e-05, + "loss": 0.4754, + "step": 14690 + }, + { + "epoch": 2.5946518400847234, + "grad_norm": 4.413130760192871, + "learning_rate": 2.4060017652250663e-05, + "loss": 0.5013, + "step": 14700 + }, + { + "epoch": 2.5964169093636924, + "grad_norm": 0.9244773387908936, + "learning_rate": 2.4042365401588705e-05, + "loss": 0.5159, + "step": 14710 + }, + { + "epoch": 2.598181978642662, + "grad_norm": 2.5842370986938477, + "learning_rate": 2.4024713150926746e-05, + "loss": 0.6024, + "step": 14720 + }, + { + "epoch": 2.599947047921631, + "grad_norm": 2.524390697479248, + "learning_rate": 2.4007060900264787e-05, + "loss": 0.4822, + "step": 14730 + }, + { + "epoch": 2.6017121172006004, + "grad_norm": 2.7176120281219482, + "learning_rate": 2.3989408649602828e-05, + "loss": 0.5486, + "step": 14740 + }, + { + "epoch": 2.6034771864795694, + "grad_norm": 0.8411146402359009, + "learning_rate": 2.3971756398940866e-05, + "loss": 0.5113, + "step": 14750 + }, + { + "epoch": 2.6052422557585384, + "grad_norm": 1.2995619773864746, + "learning_rate": 2.3954104148278907e-05, + "loss": 0.4908, + "step": 14760 + }, + { + "epoch": 2.607007325037508, + "grad_norm": 0.9292383193969727, + "learning_rate": 2.3936451897616945e-05, + "loss": 0.5188, + "step": 14770 + }, + { + "epoch": 2.608772394316477, + "grad_norm": 0.9706587791442871, + "learning_rate": 2.3918799646954986e-05, + "loss": 0.4462, + "step": 14780 + }, + { + "epoch": 2.6105374635954464, + "grad_norm": 0.8439898490905762, + "learning_rate": 2.3901147396293027e-05, + "loss": 0.4806, + "step": 14790 + }, + { + "epoch": 2.6123025328744154, + "grad_norm": 2.3372583389282227, + "learning_rate": 2.3883495145631068e-05, + "loss": 0.4816, + "step": 14800 + }, + { + "epoch": 2.6140676021533844, + "grad_norm": 1.188154935836792, + "learning_rate": 2.3867608120035306e-05, + "loss": 0.5135, + "step": 14810 + }, + { + "epoch": 2.615832671432354, + "grad_norm": 1.5531268119812012, + "learning_rate": 2.3849955869373347e-05, + "loss": 0.4791, + "step": 14820 + }, + { + "epoch": 2.617597740711323, + "grad_norm": 1.0564147233963013, + "learning_rate": 2.3832303618711388e-05, + "loss": 0.4124, + "step": 14830 + }, + { + "epoch": 2.6193628099902924, + "grad_norm": 0.7218682169914246, + "learning_rate": 2.381465136804943e-05, + "loss": 0.4817, + "step": 14840 + }, + { + "epoch": 2.6211278792692614, + "grad_norm": 2.071608304977417, + "learning_rate": 2.3796999117387467e-05, + "loss": 0.4893, + "step": 14850 + }, + { + "epoch": 2.6228929485482304, + "grad_norm": 3.6240506172180176, + "learning_rate": 2.3779346866725508e-05, + "loss": 0.4753, + "step": 14860 + }, + { + "epoch": 2.6246580178272, + "grad_norm": 2.2952630519866943, + "learning_rate": 2.376169461606355e-05, + "loss": 0.449, + "step": 14870 + }, + { + "epoch": 2.626423087106169, + "grad_norm": 0.794567346572876, + "learning_rate": 2.374404236540159e-05, + "loss": 0.442, + "step": 14880 + }, + { + "epoch": 2.6281881563851384, + "grad_norm": 0.6543694138526917, + "learning_rate": 2.372639011473963e-05, + "loss": 0.4496, + "step": 14890 + }, + { + "epoch": 2.6299532256641074, + "grad_norm": 3.7700748443603516, + "learning_rate": 2.370873786407767e-05, + "loss": 0.4946, + "step": 14900 + }, + { + "epoch": 2.6317182949430764, + "grad_norm": 1.2146788835525513, + "learning_rate": 2.369108561341571e-05, + "loss": 0.4665, + "step": 14910 + }, + { + "epoch": 2.633483364222046, + "grad_norm": 2.967947006225586, + "learning_rate": 2.367343336275375e-05, + "loss": 0.5184, + "step": 14920 + }, + { + "epoch": 2.635248433501015, + "grad_norm": 0.9510634541511536, + "learning_rate": 2.3655781112091793e-05, + "loss": 0.4812, + "step": 14930 + }, + { + "epoch": 2.6370135027799844, + "grad_norm": 0.8221954107284546, + "learning_rate": 2.3638128861429834e-05, + "loss": 0.4662, + "step": 14940 + }, + { + "epoch": 2.6387785720589534, + "grad_norm": 0.9185769557952881, + "learning_rate": 2.3620476610767875e-05, + "loss": 0.4848, + "step": 14950 + }, + { + "epoch": 2.6405436413379224, + "grad_norm": 0.8108286261558533, + "learning_rate": 2.3602824360105916e-05, + "loss": 0.4619, + "step": 14960 + }, + { + "epoch": 2.642308710616892, + "grad_norm": 2.633342742919922, + "learning_rate": 2.3585172109443957e-05, + "loss": 0.5568, + "step": 14970 + }, + { + "epoch": 2.644073779895861, + "grad_norm": 0.9109094142913818, + "learning_rate": 2.3567519858781995e-05, + "loss": 0.5136, + "step": 14980 + }, + { + "epoch": 2.6458388491748304, + "grad_norm": 1.047798991203308, + "learning_rate": 2.3549867608120036e-05, + "loss": 0.5128, + "step": 14990 + }, + { + "epoch": 2.6476039184537994, + "grad_norm": 1.1590611934661865, + "learning_rate": 2.3532215357458077e-05, + "loss": 0.5211, + "step": 15000 + }, + { + "epoch": 2.6476039184537994, + "eval_loss": 0.62970370054245, + "eval_runtime": 591.6243, + "eval_samples_per_second": 47.88, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004849158491365697, + "step": 15000 + }, + { + "epoch": 2.6493689877327684, + "grad_norm": 2.34171199798584, + "learning_rate": 2.351456310679612e-05, + "loss": 0.5036, + "step": 15010 + }, + { + "epoch": 2.651134057011738, + "grad_norm": 1.2570987939834595, + "learning_rate": 2.3496910856134156e-05, + "loss": 0.545, + "step": 15020 + }, + { + "epoch": 2.652899126290707, + "grad_norm": 1.4143269062042236, + "learning_rate": 2.3479258605472197e-05, + "loss": 0.5092, + "step": 15030 + }, + { + "epoch": 2.6546641955696764, + "grad_norm": 1.1344726085662842, + "learning_rate": 2.346160635481024e-05, + "loss": 0.4663, + "step": 15040 + }, + { + "epoch": 2.6564292648486454, + "grad_norm": 2.5450966358184814, + "learning_rate": 2.344395410414828e-05, + "loss": 0.4743, + "step": 15050 + }, + { + "epoch": 2.6581943341276144, + "grad_norm": 4.249632835388184, + "learning_rate": 2.342630185348632e-05, + "loss": 0.5652, + "step": 15060 + }, + { + "epoch": 2.6599594034065834, + "grad_norm": 3.0133004188537598, + "learning_rate": 2.3408649602824362e-05, + "loss": 0.5316, + "step": 15070 + }, + { + "epoch": 2.661724472685553, + "grad_norm": 2.640355110168457, + "learning_rate": 2.3390997352162403e-05, + "loss": 0.4818, + "step": 15080 + }, + { + "epoch": 2.6634895419645224, + "grad_norm": 2.2960193157196045, + "learning_rate": 2.3373345101500444e-05, + "loss": 0.5672, + "step": 15090 + }, + { + "epoch": 2.6652546112434914, + "grad_norm": 0.8862185478210449, + "learning_rate": 2.3355692850838485e-05, + "loss": 0.4891, + "step": 15100 + }, + { + "epoch": 2.6670196805224604, + "grad_norm": 1.2502179145812988, + "learning_rate": 2.3338040600176523e-05, + "loss": 0.4982, + "step": 15110 + }, + { + "epoch": 2.6687847498014294, + "grad_norm": 2.1781082153320312, + "learning_rate": 2.3320388349514564e-05, + "loss": 0.5247, + "step": 15120 + }, + { + "epoch": 2.670549819080399, + "grad_norm": 0.93157559633255, + "learning_rate": 2.3302736098852606e-05, + "loss": 0.5113, + "step": 15130 + }, + { + "epoch": 2.6723148883593684, + "grad_norm": 2.9610769748687744, + "learning_rate": 2.3285083848190643e-05, + "loss": 0.5343, + "step": 15140 + }, + { + "epoch": 2.6740799576383374, + "grad_norm": 1.2315391302108765, + "learning_rate": 2.3267431597528684e-05, + "loss": 0.4567, + "step": 15150 + }, + { + "epoch": 2.6758450269173064, + "grad_norm": 2.971158266067505, + "learning_rate": 2.3249779346866726e-05, + "loss": 0.5709, + "step": 15160 + }, + { + "epoch": 2.6776100961962754, + "grad_norm": 0.7065374851226807, + "learning_rate": 2.3232127096204767e-05, + "loss": 0.494, + "step": 15170 + }, + { + "epoch": 2.679375165475245, + "grad_norm": 2.8088817596435547, + "learning_rate": 2.3214474845542808e-05, + "loss": 0.4709, + "step": 15180 + }, + { + "epoch": 2.6811402347542144, + "grad_norm": 1.246795654296875, + "learning_rate": 2.319682259488085e-05, + "loss": 0.4718, + "step": 15190 + }, + { + "epoch": 2.6829053040331834, + "grad_norm": 1.635353446006775, + "learning_rate": 2.317917034421889e-05, + "loss": 0.4402, + "step": 15200 + }, + { + "epoch": 2.6846703733121524, + "grad_norm": 0.8145955204963684, + "learning_rate": 2.316151809355693e-05, + "loss": 0.4706, + "step": 15210 + }, + { + "epoch": 2.6864354425911214, + "grad_norm": 1.0448297262191772, + "learning_rate": 2.3143865842894972e-05, + "loss": 0.4669, + "step": 15220 + }, + { + "epoch": 2.688200511870091, + "grad_norm": 0.9495267271995544, + "learning_rate": 2.3126213592233014e-05, + "loss": 0.4408, + "step": 15230 + }, + { + "epoch": 2.6899655811490604, + "grad_norm": 0.7679589986801147, + "learning_rate": 2.310856134157105e-05, + "loss": 0.5135, + "step": 15240 + }, + { + "epoch": 2.6917306504280294, + "grad_norm": 2.6463985443115234, + "learning_rate": 2.309090909090909e-05, + "loss": 0.3998, + "step": 15250 + }, + { + "epoch": 2.6934957197069984, + "grad_norm": 0.8487569093704224, + "learning_rate": 2.307325684024713e-05, + "loss": 0.563, + "step": 15260 + }, + { + "epoch": 2.6952607889859674, + "grad_norm": 0.8544154763221741, + "learning_rate": 2.305560458958517e-05, + "loss": 0.4757, + "step": 15270 + }, + { + "epoch": 2.697025858264937, + "grad_norm": 0.7770341634750366, + "learning_rate": 2.3037952338923213e-05, + "loss": 0.5124, + "step": 15280 + }, + { + "epoch": 2.6987909275439064, + "grad_norm": 2.1505966186523438, + "learning_rate": 2.3020300088261254e-05, + "loss": 0.5038, + "step": 15290 + }, + { + "epoch": 2.7005559968228754, + "grad_norm": 0.8879594802856445, + "learning_rate": 2.3002647837599295e-05, + "loss": 0.4735, + "step": 15300 + }, + { + "epoch": 2.7023210661018444, + "grad_norm": 1.0117619037628174, + "learning_rate": 2.2984995586937336e-05, + "loss": 0.5157, + "step": 15310 + }, + { + "epoch": 2.7040861353808134, + "grad_norm": 1.0806429386138916, + "learning_rate": 2.2967343336275377e-05, + "loss": 0.4627, + "step": 15320 + }, + { + "epoch": 2.705851204659783, + "grad_norm": 2.156562328338623, + "learning_rate": 2.2949691085613418e-05, + "loss": 0.5131, + "step": 15330 + }, + { + "epoch": 2.7076162739387524, + "grad_norm": 2.4664413928985596, + "learning_rate": 2.293203883495146e-05, + "loss": 0.4633, + "step": 15340 + }, + { + "epoch": 2.7093813432177214, + "grad_norm": 0.7217195630073547, + "learning_rate": 2.29143865842895e-05, + "loss": 0.4845, + "step": 15350 + }, + { + "epoch": 2.7111464124966904, + "grad_norm": 0.9697237610816956, + "learning_rate": 2.289673433362754e-05, + "loss": 0.4361, + "step": 15360 + }, + { + "epoch": 2.7129114817756594, + "grad_norm": 2.4647676944732666, + "learning_rate": 2.287908208296558e-05, + "loss": 0.428, + "step": 15370 + }, + { + "epoch": 2.714676551054629, + "grad_norm": 3.4656920433044434, + "learning_rate": 2.2861429832303617e-05, + "loss": 0.4057, + "step": 15380 + }, + { + "epoch": 2.716441620333598, + "grad_norm": 4.08831787109375, + "learning_rate": 2.284377758164166e-05, + "loss": 0.449, + "step": 15390 + }, + { + "epoch": 2.7182066896125674, + "grad_norm": 1.6453620195388794, + "learning_rate": 2.28261253309797e-05, + "loss": 0.5226, + "step": 15400 + }, + { + "epoch": 2.7199717588915364, + "grad_norm": 0.8757455348968506, + "learning_rate": 2.280847308031774e-05, + "loss": 0.4323, + "step": 15410 + }, + { + "epoch": 2.7217368281705054, + "grad_norm": 0.7374882102012634, + "learning_rate": 2.2790820829655782e-05, + "loss": 0.5089, + "step": 15420 + }, + { + "epoch": 2.723501897449475, + "grad_norm": 0.9058319926261902, + "learning_rate": 2.2773168578993823e-05, + "loss": 0.5055, + "step": 15430 + }, + { + "epoch": 2.725266966728444, + "grad_norm": 0.81839519739151, + "learning_rate": 2.2755516328331864e-05, + "loss": 0.4996, + "step": 15440 + }, + { + "epoch": 2.7270320360074134, + "grad_norm": 2.061976194381714, + "learning_rate": 2.2737864077669905e-05, + "loss": 0.4803, + "step": 15450 + }, + { + "epoch": 2.7287971052863824, + "grad_norm": 0.9650241732597351, + "learning_rate": 2.2720211827007946e-05, + "loss": 0.4967, + "step": 15460 + }, + { + "epoch": 2.7305621745653514, + "grad_norm": 3.2927639484405518, + "learning_rate": 2.2702559576345984e-05, + "loss": 0.4602, + "step": 15470 + }, + { + "epoch": 2.732327243844321, + "grad_norm": 0.7154979109764099, + "learning_rate": 2.2684907325684025e-05, + "loss": 0.4768, + "step": 15480 + }, + { + "epoch": 2.73409231312329, + "grad_norm": 1.0628308057785034, + "learning_rate": 2.2667255075022066e-05, + "loss": 0.4454, + "step": 15490 + }, + { + "epoch": 2.7358573824022594, + "grad_norm": 3.443286418914795, + "learning_rate": 2.2649602824360108e-05, + "loss": 0.5177, + "step": 15500 + }, + { + "epoch": 2.7376224516812284, + "grad_norm": 1.6210869550704956, + "learning_rate": 2.2631950573698145e-05, + "loss": 0.5108, + "step": 15510 + }, + { + "epoch": 2.7393875209601974, + "grad_norm": 4.365480422973633, + "learning_rate": 2.2614298323036187e-05, + "loss": 0.5139, + "step": 15520 + }, + { + "epoch": 2.741152590239167, + "grad_norm": 3.1389224529266357, + "learning_rate": 2.2596646072374228e-05, + "loss": 0.4778, + "step": 15530 + }, + { + "epoch": 2.742917659518136, + "grad_norm": 0.8731733560562134, + "learning_rate": 2.257899382171227e-05, + "loss": 0.4987, + "step": 15540 + }, + { + "epoch": 2.7446827287971054, + "grad_norm": 1.9244966506958008, + "learning_rate": 2.256134157105031e-05, + "loss": 0.5318, + "step": 15550 + }, + { + "epoch": 2.7464477980760744, + "grad_norm": 2.3997249603271484, + "learning_rate": 2.254368932038835e-05, + "loss": 0.5424, + "step": 15560 + }, + { + "epoch": 2.7482128673550434, + "grad_norm": 2.4747679233551025, + "learning_rate": 2.2526037069726392e-05, + "loss": 0.4674, + "step": 15570 + }, + { + "epoch": 2.749977936634013, + "grad_norm": 1.0028587579727173, + "learning_rate": 2.2508384819064433e-05, + "loss": 0.496, + "step": 15580 + }, + { + "epoch": 2.751743005912982, + "grad_norm": 0.7981588244438171, + "learning_rate": 2.249073256840247e-05, + "loss": 0.4122, + "step": 15590 + }, + { + "epoch": 2.7535080751919514, + "grad_norm": 1.8606398105621338, + "learning_rate": 2.2473080317740512e-05, + "loss": 0.5136, + "step": 15600 + }, + { + "epoch": 2.7552731444709204, + "grad_norm": 0.8973249793052673, + "learning_rate": 2.2455428067078553e-05, + "loss": 0.5144, + "step": 15610 + }, + { + "epoch": 2.7570382137498894, + "grad_norm": 1.283713459968567, + "learning_rate": 2.2437775816416595e-05, + "loss": 0.6096, + "step": 15620 + }, + { + "epoch": 2.758803283028859, + "grad_norm": 1.2897413969039917, + "learning_rate": 2.2420123565754636e-05, + "loss": 0.4907, + "step": 15630 + }, + { + "epoch": 2.760568352307828, + "grad_norm": 3.3617520332336426, + "learning_rate": 2.2402471315092673e-05, + "loss": 0.4862, + "step": 15640 + }, + { + "epoch": 2.7623334215867974, + "grad_norm": 0.9050626158714294, + "learning_rate": 2.2384819064430715e-05, + "loss": 0.504, + "step": 15650 + }, + { + "epoch": 2.7640984908657664, + "grad_norm": 2.170416831970215, + "learning_rate": 2.2367166813768756e-05, + "loss": 0.4483, + "step": 15660 + }, + { + "epoch": 2.7658635601447354, + "grad_norm": 1.0081535577774048, + "learning_rate": 2.2349514563106797e-05, + "loss": 0.6113, + "step": 15670 + }, + { + "epoch": 2.767628629423705, + "grad_norm": 3.635767936706543, + "learning_rate": 2.2331862312444838e-05, + "loss": 0.5047, + "step": 15680 + }, + { + "epoch": 2.769393698702674, + "grad_norm": 1.7012405395507812, + "learning_rate": 2.231421006178288e-05, + "loss": 0.4778, + "step": 15690 + }, + { + "epoch": 2.7711587679816434, + "grad_norm": 1.043668270111084, + "learning_rate": 2.229655781112092e-05, + "loss": 0.5586, + "step": 15700 + }, + { + "epoch": 2.7729238372606124, + "grad_norm": 2.9892914295196533, + "learning_rate": 2.2278905560458958e-05, + "loss": 0.4703, + "step": 15710 + }, + { + "epoch": 2.7746889065395814, + "grad_norm": 0.8967382311820984, + "learning_rate": 2.2261253309797e-05, + "loss": 0.4559, + "step": 15720 + }, + { + "epoch": 2.776453975818551, + "grad_norm": 0.8660327792167664, + "learning_rate": 2.224360105913504e-05, + "loss": 0.4824, + "step": 15730 + }, + { + "epoch": 2.77821904509752, + "grad_norm": 2.1754908561706543, + "learning_rate": 2.222594880847308e-05, + "loss": 0.4651, + "step": 15740 + }, + { + "epoch": 2.7799841143764894, + "grad_norm": 0.9087356328964233, + "learning_rate": 2.2208296557811123e-05, + "loss": 0.4744, + "step": 15750 + }, + { + "epoch": 2.7817491836554584, + "grad_norm": 1.9040993452072144, + "learning_rate": 2.2190644307149164e-05, + "loss": 0.562, + "step": 15760 + }, + { + "epoch": 2.7835142529344274, + "grad_norm": 0.6683452725410461, + "learning_rate": 2.2172992056487205e-05, + "loss": 0.4921, + "step": 15770 + }, + { + "epoch": 2.785279322213397, + "grad_norm": 2.6163249015808105, + "learning_rate": 2.2155339805825243e-05, + "loss": 0.4716, + "step": 15780 + }, + { + "epoch": 2.787044391492366, + "grad_norm": 2.3667445182800293, + "learning_rate": 2.2137687555163284e-05, + "loss": 0.5489, + "step": 15790 + }, + { + "epoch": 2.7888094607713354, + "grad_norm": 0.7131396532058716, + "learning_rate": 2.2120035304501325e-05, + "loss": 0.4945, + "step": 15800 + }, + { + "epoch": 2.7905745300503044, + "grad_norm": 2.674614667892456, + "learning_rate": 2.2102383053839366e-05, + "loss": 0.48, + "step": 15810 + }, + { + "epoch": 2.7923395993292734, + "grad_norm": 1.0316333770751953, + "learning_rate": 2.2084730803177407e-05, + "loss": 0.5085, + "step": 15820 + }, + { + "epoch": 2.794104668608243, + "grad_norm": 2.8404314517974854, + "learning_rate": 2.2067078552515445e-05, + "loss": 0.4339, + "step": 15830 + }, + { + "epoch": 2.795869737887212, + "grad_norm": 0.8546010851860046, + "learning_rate": 2.2049426301853486e-05, + "loss": 0.5151, + "step": 15840 + }, + { + "epoch": 2.7976348071661814, + "grad_norm": 3.1289405822753906, + "learning_rate": 2.2031774051191527e-05, + "loss": 0.4742, + "step": 15850 + }, + { + "epoch": 2.7993998764451504, + "grad_norm": 3.0443801879882812, + "learning_rate": 2.201412180052957e-05, + "loss": 0.5226, + "step": 15860 + }, + { + "epoch": 2.8011649457241194, + "grad_norm": 2.93648099899292, + "learning_rate": 2.199646954986761e-05, + "loss": 0.551, + "step": 15870 + }, + { + "epoch": 2.802930015003089, + "grad_norm": 0.9104002714157104, + "learning_rate": 2.197881729920565e-05, + "loss": 0.499, + "step": 15880 + }, + { + "epoch": 2.804695084282058, + "grad_norm": 1.0089343786239624, + "learning_rate": 2.1961165048543692e-05, + "loss": 0.5425, + "step": 15890 + }, + { + "epoch": 2.8064601535610274, + "grad_norm": 0.7696927785873413, + "learning_rate": 2.1943512797881733e-05, + "loss": 0.4238, + "step": 15900 + }, + { + "epoch": 2.8082252228399964, + "grad_norm": 3.6322391033172607, + "learning_rate": 2.192586054721977e-05, + "loss": 0.5536, + "step": 15910 + }, + { + "epoch": 2.8099902921189654, + "grad_norm": 0.7961804270744324, + "learning_rate": 2.1908208296557812e-05, + "loss": 0.4964, + "step": 15920 + }, + { + "epoch": 2.811755361397935, + "grad_norm": 1.4182825088500977, + "learning_rate": 2.1890556045895853e-05, + "loss": 0.5449, + "step": 15930 + }, + { + "epoch": 2.813520430676904, + "grad_norm": 0.6621285676956177, + "learning_rate": 2.1872903795233894e-05, + "loss": 0.5348, + "step": 15940 + }, + { + "epoch": 2.8152854999558734, + "grad_norm": 1.5774197578430176, + "learning_rate": 2.1855251544571932e-05, + "loss": 0.4791, + "step": 15950 + }, + { + "epoch": 2.8170505692348424, + "grad_norm": 0.7247095704078674, + "learning_rate": 2.1837599293909973e-05, + "loss": 0.4904, + "step": 15960 + }, + { + "epoch": 2.8188156385138115, + "grad_norm": 2.010996103286743, + "learning_rate": 2.1819947043248014e-05, + "loss": 0.4235, + "step": 15970 + }, + { + "epoch": 2.820580707792781, + "grad_norm": 1.0321121215820312, + "learning_rate": 2.1802294792586056e-05, + "loss": 0.4999, + "step": 15980 + }, + { + "epoch": 2.82234577707175, + "grad_norm": 1.1265519857406616, + "learning_rate": 2.1784642541924097e-05, + "loss": 0.4972, + "step": 15990 + }, + { + "epoch": 2.8241108463507194, + "grad_norm": 1.2448257207870483, + "learning_rate": 2.1766990291262138e-05, + "loss": 0.4863, + "step": 16000 + }, + { + "epoch": 2.8241108463507194, + "eval_loss": 0.6306177973747253, + "eval_runtime": 591.8945, + "eval_samples_per_second": 47.858, + "eval_steps_per_second": 2.394, + "eval_token_accuracy": 0.0005015600934351041, + "step": 16000 + }, + { + "epoch": 2.8258759156296884, + "grad_norm": 1.085065484046936, + "learning_rate": 2.174933804060018e-05, + "loss": 0.5037, + "step": 16010 + }, + { + "epoch": 2.8276409849086575, + "grad_norm": 1.0297991037368774, + "learning_rate": 2.173168578993822e-05, + "loss": 0.4709, + "step": 16020 + }, + { + "epoch": 2.829406054187627, + "grad_norm": 1.2882356643676758, + "learning_rate": 2.171403353927626e-05, + "loss": 0.4344, + "step": 16030 + }, + { + "epoch": 2.831171123466596, + "grad_norm": 3.9636170864105225, + "learning_rate": 2.16963812886143e-05, + "loss": 0.4831, + "step": 16040 + }, + { + "epoch": 2.8329361927455654, + "grad_norm": 1.1417694091796875, + "learning_rate": 2.167872903795234e-05, + "loss": 0.5312, + "step": 16050 + }, + { + "epoch": 2.8347012620245344, + "grad_norm": 1.081305742263794, + "learning_rate": 2.166107678729038e-05, + "loss": 0.5413, + "step": 16060 + }, + { + "epoch": 2.8364663313035035, + "grad_norm": 1.6925846338272095, + "learning_rate": 2.164342453662842e-05, + "loss": 0.4676, + "step": 16070 + }, + { + "epoch": 2.838231400582473, + "grad_norm": 0.9859732389450073, + "learning_rate": 2.162577228596646e-05, + "loss": 0.4722, + "step": 16080 + }, + { + "epoch": 2.839996469861442, + "grad_norm": 3.5599405765533447, + "learning_rate": 2.16081200353045e-05, + "loss": 0.4447, + "step": 16090 + }, + { + "epoch": 2.8417615391404114, + "grad_norm": 2.7686328887939453, + "learning_rate": 2.1590467784642542e-05, + "loss": 0.4336, + "step": 16100 + }, + { + "epoch": 2.8435266084193804, + "grad_norm": 1.155197262763977, + "learning_rate": 2.1572815533980584e-05, + "loss": 0.4605, + "step": 16110 + }, + { + "epoch": 2.8452916776983495, + "grad_norm": 1.153023362159729, + "learning_rate": 2.1555163283318625e-05, + "loss": 0.4993, + "step": 16120 + }, + { + "epoch": 2.847056746977319, + "grad_norm": 0.9941584467887878, + "learning_rate": 2.1537511032656666e-05, + "loss": 0.5547, + "step": 16130 + }, + { + "epoch": 2.848821816256288, + "grad_norm": 1.042536973953247, + "learning_rate": 2.1519858781994707e-05, + "loss": 0.4827, + "step": 16140 + }, + { + "epoch": 2.8505868855352574, + "grad_norm": 0.9316404461860657, + "learning_rate": 2.1502206531332748e-05, + "loss": 0.4378, + "step": 16150 + }, + { + "epoch": 2.8523519548142264, + "grad_norm": 0.9529580473899841, + "learning_rate": 2.148455428067079e-05, + "loss": 0.5597, + "step": 16160 + }, + { + "epoch": 2.8541170240931955, + "grad_norm": 3.498310089111328, + "learning_rate": 2.1466902030008827e-05, + "loss": 0.4683, + "step": 16170 + }, + { + "epoch": 2.855882093372165, + "grad_norm": 2.061889410018921, + "learning_rate": 2.1449249779346868e-05, + "loss": 0.5144, + "step": 16180 + }, + { + "epoch": 2.857647162651134, + "grad_norm": 3.0003912448883057, + "learning_rate": 2.1431597528684906e-05, + "loss": 0.5241, + "step": 16190 + }, + { + "epoch": 2.8594122319301034, + "grad_norm": 4.18428897857666, + "learning_rate": 2.1413945278022947e-05, + "loss": 0.4634, + "step": 16200 + }, + { + "epoch": 2.8611773012090724, + "grad_norm": 2.9743313789367676, + "learning_rate": 2.139629302736099e-05, + "loss": 0.5063, + "step": 16210 + }, + { + "epoch": 2.8629423704880415, + "grad_norm": 1.0419633388519287, + "learning_rate": 2.137864077669903e-05, + "loss": 0.4746, + "step": 16220 + }, + { + "epoch": 2.864707439767011, + "grad_norm": 1.1511107683181763, + "learning_rate": 2.136098852603707e-05, + "loss": 0.4511, + "step": 16230 + }, + { + "epoch": 2.86647250904598, + "grad_norm": 3.314408302307129, + "learning_rate": 2.1343336275375112e-05, + "loss": 0.5168, + "step": 16240 + }, + { + "epoch": 2.8682375783249494, + "grad_norm": 1.9849070310592651, + "learning_rate": 2.1325684024713153e-05, + "loss": 0.4397, + "step": 16250 + }, + { + "epoch": 2.8700026476039184, + "grad_norm": 4.134389877319336, + "learning_rate": 2.1308031774051194e-05, + "loss": 0.4684, + "step": 16260 + }, + { + "epoch": 2.8717677168828875, + "grad_norm": 1.888159990310669, + "learning_rate": 2.1290379523389235e-05, + "loss": 0.4666, + "step": 16270 + }, + { + "epoch": 2.873532786161857, + "grad_norm": 0.756315290927887, + "learning_rate": 2.1272727272727276e-05, + "loss": 0.5064, + "step": 16280 + }, + { + "epoch": 2.875297855440826, + "grad_norm": 1.1455638408660889, + "learning_rate": 2.1255075022065314e-05, + "loss": 0.4677, + "step": 16290 + }, + { + "epoch": 2.8770629247197954, + "grad_norm": 2.3531157970428467, + "learning_rate": 2.1237422771403355e-05, + "loss": 0.5371, + "step": 16300 + }, + { + "epoch": 2.8788279939987644, + "grad_norm": 2.039882183074951, + "learning_rate": 2.1219770520741393e-05, + "loss": 0.5056, + "step": 16310 + }, + { + "epoch": 2.8805930632777335, + "grad_norm": 0.7574838995933533, + "learning_rate": 2.1202118270079434e-05, + "loss": 0.4737, + "step": 16320 + }, + { + "epoch": 2.882358132556703, + "grad_norm": 2.321002960205078, + "learning_rate": 2.1184466019417475e-05, + "loss": 0.4875, + "step": 16330 + }, + { + "epoch": 2.884123201835672, + "grad_norm": 1.2612684965133667, + "learning_rate": 2.1166813768755516e-05, + "loss": 0.5249, + "step": 16340 + }, + { + "epoch": 2.8858882711146414, + "grad_norm": 0.8896175622940063, + "learning_rate": 2.1149161518093558e-05, + "loss": 0.4589, + "step": 16350 + }, + { + "epoch": 2.8876533403936104, + "grad_norm": 0.7815883755683899, + "learning_rate": 2.11315092674316e-05, + "loss": 0.5376, + "step": 16360 + }, + { + "epoch": 2.8894184096725795, + "grad_norm": 1.2477344274520874, + "learning_rate": 2.111385701676964e-05, + "loss": 0.495, + "step": 16370 + }, + { + "epoch": 2.891183478951549, + "grad_norm": 1.113911747932434, + "learning_rate": 2.109620476610768e-05, + "loss": 0.6071, + "step": 16380 + }, + { + "epoch": 2.892948548230518, + "grad_norm": 0.9196587204933167, + "learning_rate": 2.1078552515445722e-05, + "loss": 0.5024, + "step": 16390 + }, + { + "epoch": 2.8947136175094874, + "grad_norm": 0.9714921116828918, + "learning_rate": 2.1060900264783763e-05, + "loss": 0.447, + "step": 16400 + }, + { + "epoch": 2.8964786867884564, + "grad_norm": 1.1055673360824585, + "learning_rate": 2.10432480141218e-05, + "loss": 0.4452, + "step": 16410 + }, + { + "epoch": 2.8982437560674255, + "grad_norm": 1.2155486345291138, + "learning_rate": 2.1025595763459842e-05, + "loss": 0.565, + "step": 16420 + }, + { + "epoch": 2.900008825346395, + "grad_norm": 1.8765925168991089, + "learning_rate": 2.1007943512797883e-05, + "loss": 0.4882, + "step": 16430 + }, + { + "epoch": 2.901773894625364, + "grad_norm": 0.997562050819397, + "learning_rate": 2.099029126213592e-05, + "loss": 0.6414, + "step": 16440 + }, + { + "epoch": 2.9035389639043334, + "grad_norm": 1.0786616802215576, + "learning_rate": 2.0972639011473962e-05, + "loss": 0.5226, + "step": 16450 + }, + { + "epoch": 2.9053040331833024, + "grad_norm": 4.614233493804932, + "learning_rate": 2.0954986760812003e-05, + "loss": 0.5057, + "step": 16460 + }, + { + "epoch": 2.9070691024622715, + "grad_norm": 2.1534485816955566, + "learning_rate": 2.0937334510150045e-05, + "loss": 0.4424, + "step": 16470 + }, + { + "epoch": 2.908834171741241, + "grad_norm": 0.6817697286605835, + "learning_rate": 2.0919682259488086e-05, + "loss": 0.5091, + "step": 16480 + }, + { + "epoch": 2.91059924102021, + "grad_norm": 3.658924102783203, + "learning_rate": 2.0902030008826127e-05, + "loss": 0.4715, + "step": 16490 + }, + { + "epoch": 2.9123643102991794, + "grad_norm": 0.9275711178779602, + "learning_rate": 2.0884377758164168e-05, + "loss": 0.4714, + "step": 16500 + }, + { + "epoch": 2.9141293795781484, + "grad_norm": 3.1437737941741943, + "learning_rate": 2.086672550750221e-05, + "loss": 0.4552, + "step": 16510 + }, + { + "epoch": 2.9158944488571175, + "grad_norm": 1.1514102220535278, + "learning_rate": 2.0849073256840247e-05, + "loss": 0.5323, + "step": 16520 + }, + { + "epoch": 2.917659518136087, + "grad_norm": 2.7238919734954834, + "learning_rate": 2.0831421006178288e-05, + "loss": 0.4578, + "step": 16530 + }, + { + "epoch": 2.919424587415056, + "grad_norm": 1.4829517602920532, + "learning_rate": 2.081376875551633e-05, + "loss": 0.5226, + "step": 16540 + }, + { + "epoch": 2.9211896566940254, + "grad_norm": 2.4364633560180664, + "learning_rate": 2.079611650485437e-05, + "loss": 0.5122, + "step": 16550 + }, + { + "epoch": 2.9229547259729944, + "grad_norm": 1.0211104154586792, + "learning_rate": 2.077846425419241e-05, + "loss": 0.5789, + "step": 16560 + }, + { + "epoch": 2.9247197952519635, + "grad_norm": 2.342478036880493, + "learning_rate": 2.076081200353045e-05, + "loss": 0.5076, + "step": 16570 + }, + { + "epoch": 2.926484864530933, + "grad_norm": 1.1480425596237183, + "learning_rate": 2.074315975286849e-05, + "loss": 0.5127, + "step": 16580 + }, + { + "epoch": 2.928249933809902, + "grad_norm": 2.9418277740478516, + "learning_rate": 2.072550750220653e-05, + "loss": 0.4792, + "step": 16590 + }, + { + "epoch": 2.9300150030888714, + "grad_norm": 1.6210964918136597, + "learning_rate": 2.0707855251544573e-05, + "loss": 0.5162, + "step": 16600 + }, + { + "epoch": 2.9317800723678404, + "grad_norm": 1.0910576581954956, + "learning_rate": 2.0690203000882614e-05, + "loss": 0.4656, + "step": 16610 + }, + { + "epoch": 2.9335451416468095, + "grad_norm": 2.7877092361450195, + "learning_rate": 2.0672550750220655e-05, + "loss": 0.44, + "step": 16620 + }, + { + "epoch": 2.935310210925779, + "grad_norm": 1.0605442523956299, + "learning_rate": 2.0654898499558696e-05, + "loss": 0.5368, + "step": 16630 + }, + { + "epoch": 2.937075280204748, + "grad_norm": 1.2327733039855957, + "learning_rate": 2.0637246248896734e-05, + "loss": 0.469, + "step": 16640 + }, + { + "epoch": 2.9388403494837174, + "grad_norm": 0.9705036282539368, + "learning_rate": 2.0619593998234775e-05, + "loss": 0.4702, + "step": 16650 + }, + { + "epoch": 2.9406054187626864, + "grad_norm": 0.7878549695014954, + "learning_rate": 2.0601941747572816e-05, + "loss": 0.4524, + "step": 16660 + }, + { + "epoch": 2.9423704880416555, + "grad_norm": 1.2322081327438354, + "learning_rate": 2.0584289496910857e-05, + "loss": 0.4912, + "step": 16670 + }, + { + "epoch": 2.944135557320625, + "grad_norm": 0.9457627534866333, + "learning_rate": 2.05666372462489e-05, + "loss": 0.5209, + "step": 16680 + }, + { + "epoch": 2.945900626599594, + "grad_norm": 1.9645498991012573, + "learning_rate": 2.054898499558694e-05, + "loss": 0.4715, + "step": 16690 + }, + { + "epoch": 2.9476656958785634, + "grad_norm": 3.2327866554260254, + "learning_rate": 2.0531332744924977e-05, + "loss": 0.4796, + "step": 16700 + }, + { + "epoch": 2.9494307651575324, + "grad_norm": 1.7311758995056152, + "learning_rate": 2.051368049426302e-05, + "loss": 0.5579, + "step": 16710 + }, + { + "epoch": 2.9511958344365015, + "grad_norm": 0.8861629962921143, + "learning_rate": 2.049602824360106e-05, + "loss": 0.4753, + "step": 16720 + }, + { + "epoch": 2.952960903715471, + "grad_norm": 0.8525285124778748, + "learning_rate": 2.04783759929391e-05, + "loss": 0.4858, + "step": 16730 + }, + { + "epoch": 2.95472597299444, + "grad_norm": 3.608468532562256, + "learning_rate": 2.0460723742277142e-05, + "loss": 0.4523, + "step": 16740 + }, + { + "epoch": 2.9564910422734094, + "grad_norm": 0.978895902633667, + "learning_rate": 2.0443071491615183e-05, + "loss": 0.4952, + "step": 16750 + }, + { + "epoch": 2.9582561115523784, + "grad_norm": 2.2159929275512695, + "learning_rate": 2.042541924095322e-05, + "loss": 0.5031, + "step": 16760 + }, + { + "epoch": 2.9600211808313475, + "grad_norm": 2.59801983833313, + "learning_rate": 2.0407766990291262e-05, + "loss": 0.4754, + "step": 16770 + }, + { + "epoch": 2.961786250110317, + "grad_norm": 1.045888066291809, + "learning_rate": 2.0390114739629303e-05, + "loss": 0.5499, + "step": 16780 + }, + { + "epoch": 2.963551319389286, + "grad_norm": 0.7944245338439941, + "learning_rate": 2.0372462488967344e-05, + "loss": 0.4866, + "step": 16790 + }, + { + "epoch": 2.9653163886682554, + "grad_norm": 3.4561140537261963, + "learning_rate": 2.0354810238305385e-05, + "loss": 0.5503, + "step": 16800 + }, + { + "epoch": 2.9670814579472244, + "grad_norm": 1.817888855934143, + "learning_rate": 2.0337157987643427e-05, + "loss": 0.541, + "step": 16810 + }, + { + "epoch": 2.9688465272261935, + "grad_norm": 0.9524929523468018, + "learning_rate": 2.0319505736981468e-05, + "loss": 0.4855, + "step": 16820 + }, + { + "epoch": 2.970611596505163, + "grad_norm": 2.639288902282715, + "learning_rate": 2.0301853486319505e-05, + "loss": 0.4909, + "step": 16830 + }, + { + "epoch": 2.972376665784132, + "grad_norm": 0.8207223415374756, + "learning_rate": 2.0284201235657547e-05, + "loss": 0.5087, + "step": 16840 + }, + { + "epoch": 2.9741417350631014, + "grad_norm": 0.8275809288024902, + "learning_rate": 2.0266548984995588e-05, + "loss": 0.4533, + "step": 16850 + }, + { + "epoch": 2.9759068043420704, + "grad_norm": 0.7429968118667603, + "learning_rate": 2.024889673433363e-05, + "loss": 0.5182, + "step": 16860 + }, + { + "epoch": 2.9776718736210395, + "grad_norm": 1.4792590141296387, + "learning_rate": 2.023124448367167e-05, + "loss": 0.4303, + "step": 16870 + }, + { + "epoch": 2.979436942900009, + "grad_norm": 2.502073287963867, + "learning_rate": 2.0213592233009708e-05, + "loss": 0.5349, + "step": 16880 + }, + { + "epoch": 2.981202012178978, + "grad_norm": 2.577254295349121, + "learning_rate": 2.019593998234775e-05, + "loss": 0.5173, + "step": 16890 + }, + { + "epoch": 2.9829670814579474, + "grad_norm": 3.045180559158325, + "learning_rate": 2.017828773168579e-05, + "loss": 0.4923, + "step": 16900 + }, + { + "epoch": 2.9847321507369164, + "grad_norm": 0.8430206775665283, + "learning_rate": 2.016063548102383e-05, + "loss": 0.498, + "step": 16910 + }, + { + "epoch": 2.9864972200158855, + "grad_norm": 2.0831096172332764, + "learning_rate": 2.0142983230361872e-05, + "loss": 0.5765, + "step": 16920 + }, + { + "epoch": 2.988262289294855, + "grad_norm": 0.9307297468185425, + "learning_rate": 2.0125330979699914e-05, + "loss": 0.441, + "step": 16930 + }, + { + "epoch": 2.990027358573824, + "grad_norm": 1.0840940475463867, + "learning_rate": 2.0107678729037955e-05, + "loss": 0.5035, + "step": 16940 + }, + { + "epoch": 2.9917924278527934, + "grad_norm": 1.0643527507781982, + "learning_rate": 2.0090026478375996e-05, + "loss": 0.4537, + "step": 16950 + }, + { + "epoch": 2.9935574971317624, + "grad_norm": 0.859700620174408, + "learning_rate": 2.0072374227714037e-05, + "loss": 0.5502, + "step": 16960 + }, + { + "epoch": 2.9953225664107315, + "grad_norm": 1.9049547910690308, + "learning_rate": 2.0054721977052075e-05, + "loss": 0.4597, + "step": 16970 + }, + { + "epoch": 2.997087635689701, + "grad_norm": 1.4590740203857422, + "learning_rate": 2.0037069726390116e-05, + "loss": 0.4553, + "step": 16980 + }, + { + "epoch": 2.99885270496867, + "grad_norm": 2.080549955368042, + "learning_rate": 2.0019417475728157e-05, + "loss": 0.4728, + "step": 16990 + }, + { + "epoch": 3.0006177742476394, + "grad_norm": 1.6013654470443726, + "learning_rate": 2.0001765225066195e-05, + "loss": 0.4843, + "step": 17000 + }, + { + "epoch": 3.0006177742476394, + "eval_loss": 0.6180456280708313, + "eval_runtime": 591.8623, + "eval_samples_per_second": 47.861, + "eval_steps_per_second": 2.394, + "eval_token_accuracy": 0.0004966886072989477, + "step": 17000 + }, + { + "epoch": 3.0023828435266084, + "grad_norm": 1.9730802774429321, + "learning_rate": 1.9984112974404236e-05, + "loss": 0.3307, + "step": 17010 + }, + { + "epoch": 3.0041479128055775, + "grad_norm": 3.203862428665161, + "learning_rate": 1.9966460723742277e-05, + "loss": 0.4227, + "step": 17020 + }, + { + "epoch": 3.005912982084547, + "grad_norm": 0.9686912894248962, + "learning_rate": 1.9948808473080318e-05, + "loss": 0.3816, + "step": 17030 + }, + { + "epoch": 3.007678051363516, + "grad_norm": 0.8262260556221008, + "learning_rate": 1.993115622241836e-05, + "loss": 0.3952, + "step": 17040 + }, + { + "epoch": 3.0094431206424854, + "grad_norm": 1.0676703453063965, + "learning_rate": 1.99135039717564e-05, + "loss": 0.3561, + "step": 17050 + }, + { + "epoch": 3.0112081899214544, + "grad_norm": 1.036767840385437, + "learning_rate": 1.989585172109444e-05, + "loss": 0.3818, + "step": 17060 + }, + { + "epoch": 3.0129732592004235, + "grad_norm": 0.9834463000297546, + "learning_rate": 1.9878199470432483e-05, + "loss": 0.378, + "step": 17070 + }, + { + "epoch": 3.014738328479393, + "grad_norm": 0.8777109980583191, + "learning_rate": 1.9860547219770524e-05, + "loss": 0.4404, + "step": 17080 + }, + { + "epoch": 3.016503397758362, + "grad_norm": 0.7740920186042786, + "learning_rate": 1.9842894969108565e-05, + "loss": 0.4248, + "step": 17090 + }, + { + "epoch": 3.0182684670373314, + "grad_norm": 1.1949310302734375, + "learning_rate": 1.9825242718446603e-05, + "loss": 0.4148, + "step": 17100 + }, + { + "epoch": 3.0200335363163004, + "grad_norm": 0.8576988577842712, + "learning_rate": 1.9807590467784644e-05, + "loss": 0.384, + "step": 17110 + }, + { + "epoch": 3.0217986055952695, + "grad_norm": 1.0700007677078247, + "learning_rate": 1.9789938217122682e-05, + "loss": 0.3596, + "step": 17120 + }, + { + "epoch": 3.023563674874239, + "grad_norm": 3.1668031215667725, + "learning_rate": 1.9772285966460723e-05, + "loss": 0.3766, + "step": 17130 + }, + { + "epoch": 3.025328744153208, + "grad_norm": 0.8742389678955078, + "learning_rate": 1.9754633715798764e-05, + "loss": 0.3683, + "step": 17140 + }, + { + "epoch": 3.0270938134321774, + "grad_norm": 2.866408109664917, + "learning_rate": 1.9736981465136805e-05, + "loss": 0.3652, + "step": 17150 + }, + { + "epoch": 3.0288588827111464, + "grad_norm": 3.1418302059173584, + "learning_rate": 1.9719329214474846e-05, + "loss": 0.3758, + "step": 17160 + }, + { + "epoch": 3.0306239519901155, + "grad_norm": 3.4927444458007812, + "learning_rate": 1.9701676963812888e-05, + "loss": 0.4894, + "step": 17170 + }, + { + "epoch": 3.032389021269085, + "grad_norm": 1.428471565246582, + "learning_rate": 1.968402471315093e-05, + "loss": 0.3906, + "step": 17180 + }, + { + "epoch": 3.034154090548054, + "grad_norm": 2.707277297973633, + "learning_rate": 1.966637246248897e-05, + "loss": 0.4501, + "step": 17190 + }, + { + "epoch": 3.0359191598270234, + "grad_norm": 0.8563360571861267, + "learning_rate": 1.964872021182701e-05, + "loss": 0.3319, + "step": 17200 + }, + { + "epoch": 3.0376842291059925, + "grad_norm": 0.9764096140861511, + "learning_rate": 1.9631067961165052e-05, + "loss": 0.38, + "step": 17210 + }, + { + "epoch": 3.0394492983849615, + "grad_norm": 2.875133991241455, + "learning_rate": 1.961341571050309e-05, + "loss": 0.3718, + "step": 17220 + }, + { + "epoch": 3.041214367663931, + "grad_norm": 0.8888715505599976, + "learning_rate": 1.959576345984113e-05, + "loss": 0.3622, + "step": 17230 + }, + { + "epoch": 3.0429794369429, + "grad_norm": 3.4094061851501465, + "learning_rate": 1.957811120917917e-05, + "loss": 0.3764, + "step": 17240 + }, + { + "epoch": 3.0447445062218694, + "grad_norm": 1.1295435428619385, + "learning_rate": 1.956045895851721e-05, + "loss": 0.3731, + "step": 17250 + }, + { + "epoch": 3.0465095755008385, + "grad_norm": 2.553759813308716, + "learning_rate": 1.954280670785525e-05, + "loss": 0.3827, + "step": 17260 + }, + { + "epoch": 3.0482746447798075, + "grad_norm": 2.6989941596984863, + "learning_rate": 1.9525154457193292e-05, + "loss": 0.3205, + "step": 17270 + }, + { + "epoch": 3.050039714058777, + "grad_norm": 3.229684352874756, + "learning_rate": 1.9507502206531333e-05, + "loss": 0.4119, + "step": 17280 + }, + { + "epoch": 3.051804783337746, + "grad_norm": 2.388998508453369, + "learning_rate": 1.9489849955869374e-05, + "loss": 0.3727, + "step": 17290 + }, + { + "epoch": 3.0535698526167154, + "grad_norm": 0.7352622151374817, + "learning_rate": 1.9472197705207416e-05, + "loss": 0.4236, + "step": 17300 + }, + { + "epoch": 3.0553349218956845, + "grad_norm": 1.1700186729431152, + "learning_rate": 1.9454545454545457e-05, + "loss": 0.3871, + "step": 17310 + }, + { + "epoch": 3.0570999911746535, + "grad_norm": 0.6964054703712463, + "learning_rate": 1.9436893203883498e-05, + "loss": 0.3644, + "step": 17320 + }, + { + "epoch": 3.058865060453623, + "grad_norm": 2.9264075756073, + "learning_rate": 1.941924095322154e-05, + "loss": 0.3235, + "step": 17330 + }, + { + "epoch": 3.060630129732592, + "grad_norm": 2.7784578800201416, + "learning_rate": 1.9401588702559577e-05, + "loss": 0.4357, + "step": 17340 + }, + { + "epoch": 3.062395199011561, + "grad_norm": 3.4185690879821777, + "learning_rate": 1.9383936451897618e-05, + "loss": 0.4222, + "step": 17350 + }, + { + "epoch": 3.0641602682905305, + "grad_norm": 2.2247936725616455, + "learning_rate": 1.9366284201235656e-05, + "loss": 0.4036, + "step": 17360 + }, + { + "epoch": 3.0659253375694995, + "grad_norm": 3.4868199825286865, + "learning_rate": 1.9348631950573697e-05, + "loss": 0.3873, + "step": 17370 + }, + { + "epoch": 3.067690406848469, + "grad_norm": 3.106703758239746, + "learning_rate": 1.9330979699911738e-05, + "loss": 0.384, + "step": 17380 + }, + { + "epoch": 3.069455476127438, + "grad_norm": 0.9752678871154785, + "learning_rate": 1.931332744924978e-05, + "loss": 0.3587, + "step": 17390 + }, + { + "epoch": 3.071220545406407, + "grad_norm": 1.7790659666061401, + "learning_rate": 1.929567519858782e-05, + "loss": 0.3493, + "step": 17400 + }, + { + "epoch": 3.0729856146853765, + "grad_norm": 0.9488353133201599, + "learning_rate": 1.927802294792586e-05, + "loss": 0.3924, + "step": 17410 + }, + { + "epoch": 3.0747506839643455, + "grad_norm": 3.756638288497925, + "learning_rate": 1.9260370697263903e-05, + "loss": 0.4504, + "step": 17420 + }, + { + "epoch": 3.076515753243315, + "grad_norm": 2.0129315853118896, + "learning_rate": 1.9242718446601944e-05, + "loss": 0.4081, + "step": 17430 + }, + { + "epoch": 3.078280822522284, + "grad_norm": 0.9279769062995911, + "learning_rate": 1.9225066195939985e-05, + "loss": 0.3911, + "step": 17440 + }, + { + "epoch": 3.080045891801253, + "grad_norm": 1.094159483909607, + "learning_rate": 1.9207413945278026e-05, + "loss": 0.3615, + "step": 17450 + }, + { + "epoch": 3.0818109610802225, + "grad_norm": 1.5764673948287964, + "learning_rate": 1.9189761694616064e-05, + "loss": 0.427, + "step": 17460 + }, + { + "epoch": 3.0835760303591915, + "grad_norm": 2.997194528579712, + "learning_rate": 1.9172109443954105e-05, + "loss": 0.3908, + "step": 17470 + }, + { + "epoch": 3.085341099638161, + "grad_norm": 0.5910589694976807, + "learning_rate": 1.9154457193292146e-05, + "loss": 0.3617, + "step": 17480 + }, + { + "epoch": 3.08710616891713, + "grad_norm": 2.0380403995513916, + "learning_rate": 1.9136804942630187e-05, + "loss": 0.3828, + "step": 17490 + }, + { + "epoch": 3.088871238196099, + "grad_norm": 2.3677608966827393, + "learning_rate": 1.9119152691968225e-05, + "loss": 0.3954, + "step": 17500 + }, + { + "epoch": 3.0906363074750685, + "grad_norm": 1.9572163820266724, + "learning_rate": 1.9101500441306266e-05, + "loss": 0.3373, + "step": 17510 + }, + { + "epoch": 3.0924013767540375, + "grad_norm": 3.817796230316162, + "learning_rate": 1.9083848190644307e-05, + "loss": 0.3983, + "step": 17520 + }, + { + "epoch": 3.094166446033007, + "grad_norm": 0.9311608672142029, + "learning_rate": 1.906619593998235e-05, + "loss": 0.4515, + "step": 17530 + }, + { + "epoch": 3.095931515311976, + "grad_norm": 2.0781655311584473, + "learning_rate": 1.904854368932039e-05, + "loss": 0.3921, + "step": 17540 + }, + { + "epoch": 3.097696584590945, + "grad_norm": 1.2067509889602661, + "learning_rate": 1.903089143865843e-05, + "loss": 0.4393, + "step": 17550 + }, + { + "epoch": 3.0994616538699145, + "grad_norm": 0.8493309020996094, + "learning_rate": 1.9013239187996472e-05, + "loss": 0.4344, + "step": 17560 + }, + { + "epoch": 3.1012267231488835, + "grad_norm": 4.165531158447266, + "learning_rate": 1.8995586937334513e-05, + "loss": 0.3652, + "step": 17570 + }, + { + "epoch": 3.102991792427853, + "grad_norm": 0.9132296442985535, + "learning_rate": 1.897793468667255e-05, + "loss": 0.3535, + "step": 17580 + }, + { + "epoch": 3.104756861706822, + "grad_norm": 4.703742027282715, + "learning_rate": 1.8960282436010592e-05, + "loss": 0.412, + "step": 17590 + }, + { + "epoch": 3.106521930985791, + "grad_norm": 2.0259885787963867, + "learning_rate": 1.8942630185348633e-05, + "loss": 0.3206, + "step": 17600 + }, + { + "epoch": 3.1082870002647605, + "grad_norm": 0.9822014570236206, + "learning_rate": 1.8924977934686674e-05, + "loss": 0.3595, + "step": 17610 + }, + { + "epoch": 3.1100520695437295, + "grad_norm": 0.8171116709709167, + "learning_rate": 1.8907325684024715e-05, + "loss": 0.3018, + "step": 17620 + }, + { + "epoch": 3.111817138822699, + "grad_norm": 1.1940850019454956, + "learning_rate": 1.8889673433362753e-05, + "loss": 0.3922, + "step": 17630 + }, + { + "epoch": 3.113582208101668, + "grad_norm": 0.7778500914573669, + "learning_rate": 1.8872021182700794e-05, + "loss": 0.3292, + "step": 17640 + }, + { + "epoch": 3.115347277380637, + "grad_norm": 1.2759803533554077, + "learning_rate": 1.8854368932038835e-05, + "loss": 0.3997, + "step": 17650 + }, + { + "epoch": 3.1171123466596065, + "grad_norm": 1.983506441116333, + "learning_rate": 1.8836716681376877e-05, + "loss": 0.3327, + "step": 17660 + }, + { + "epoch": 3.1188774159385755, + "grad_norm": 1.1584614515304565, + "learning_rate": 1.8819064430714918e-05, + "loss": 0.345, + "step": 17670 + }, + { + "epoch": 3.120642485217545, + "grad_norm": 0.903225839138031, + "learning_rate": 1.880141218005296e-05, + "loss": 0.3868, + "step": 17680 + }, + { + "epoch": 3.122407554496514, + "grad_norm": 1.5125501155853271, + "learning_rate": 1.8783759929390997e-05, + "loss": 0.369, + "step": 17690 + }, + { + "epoch": 3.124172623775483, + "grad_norm": 0.9562115669250488, + "learning_rate": 1.8766107678729038e-05, + "loss": 0.3886, + "step": 17700 + }, + { + "epoch": 3.1259376930544525, + "grad_norm": 1.0078356266021729, + "learning_rate": 1.874845542806708e-05, + "loss": 0.4391, + "step": 17710 + }, + { + "epoch": 3.1277027623334215, + "grad_norm": 1.0851801633834839, + "learning_rate": 1.873080317740512e-05, + "loss": 0.3967, + "step": 17720 + }, + { + "epoch": 3.129467831612391, + "grad_norm": 2.5538651943206787, + "learning_rate": 1.871315092674316e-05, + "loss": 0.3671, + "step": 17730 + }, + { + "epoch": 3.13123290089136, + "grad_norm": 4.896754264831543, + "learning_rate": 1.8695498676081202e-05, + "loss": 0.3569, + "step": 17740 + }, + { + "epoch": 3.132997970170329, + "grad_norm": 1.1560003757476807, + "learning_rate": 1.8677846425419243e-05, + "loss": 0.3262, + "step": 17750 + }, + { + "epoch": 3.1347630394492985, + "grad_norm": 3.046627998352051, + "learning_rate": 1.866019417475728e-05, + "loss": 0.4524, + "step": 17760 + }, + { + "epoch": 3.1365281087282675, + "grad_norm": 3.14935040473938, + "learning_rate": 1.8642541924095322e-05, + "loss": 0.3734, + "step": 17770 + }, + { + "epoch": 3.138293178007237, + "grad_norm": 1.015893578529358, + "learning_rate": 1.8624889673433364e-05, + "loss": 0.522, + "step": 17780 + }, + { + "epoch": 3.140058247286206, + "grad_norm": 2.6474082469940186, + "learning_rate": 1.8607237422771405e-05, + "loss": 0.3783, + "step": 17790 + }, + { + "epoch": 3.141823316565175, + "grad_norm": 0.9868051409721375, + "learning_rate": 1.8589585172109446e-05, + "loss": 0.4548, + "step": 17800 + }, + { + "epoch": 3.1435883858441445, + "grad_norm": 2.009326696395874, + "learning_rate": 1.8571932921447484e-05, + "loss": 0.3878, + "step": 17810 + }, + { + "epoch": 3.1453534551231135, + "grad_norm": 2.1260197162628174, + "learning_rate": 1.8554280670785525e-05, + "loss": 0.365, + "step": 17820 + }, + { + "epoch": 3.147118524402083, + "grad_norm": 1.069753885269165, + "learning_rate": 1.8536628420123566e-05, + "loss": 0.4153, + "step": 17830 + }, + { + "epoch": 3.148883593681052, + "grad_norm": 1.5456312894821167, + "learning_rate": 1.8518976169461607e-05, + "loss": 0.4015, + "step": 17840 + }, + { + "epoch": 3.150648662960021, + "grad_norm": 1.6291440725326538, + "learning_rate": 1.8501323918799648e-05, + "loss": 0.3667, + "step": 17850 + }, + { + "epoch": 3.1524137322389905, + "grad_norm": 1.152271032333374, + "learning_rate": 1.848367166813769e-05, + "loss": 0.3785, + "step": 17860 + }, + { + "epoch": 3.1541788015179595, + "grad_norm": 1.0361758470535278, + "learning_rate": 1.846601941747573e-05, + "loss": 0.3797, + "step": 17870 + }, + { + "epoch": 3.155943870796929, + "grad_norm": 0.9247083067893982, + "learning_rate": 1.844836716681377e-05, + "loss": 0.4611, + "step": 17880 + }, + { + "epoch": 3.157708940075898, + "grad_norm": 1.0259307622909546, + "learning_rate": 1.843071491615181e-05, + "loss": 0.3774, + "step": 17890 + }, + { + "epoch": 3.159474009354867, + "grad_norm": 0.8783037662506104, + "learning_rate": 1.841306266548985e-05, + "loss": 0.3992, + "step": 17900 + }, + { + "epoch": 3.1612390786338365, + "grad_norm": 1.3424186706542969, + "learning_rate": 1.839541041482789e-05, + "loss": 0.4161, + "step": 17910 + }, + { + "epoch": 3.1630041479128055, + "grad_norm": 1.8806232213974, + "learning_rate": 1.8377758164165933e-05, + "loss": 0.4088, + "step": 17920 + }, + { + "epoch": 3.164769217191775, + "grad_norm": 4.50385046005249, + "learning_rate": 1.836010591350397e-05, + "loss": 0.4092, + "step": 17930 + }, + { + "epoch": 3.166534286470744, + "grad_norm": 1.1971051692962646, + "learning_rate": 1.8342453662842012e-05, + "loss": 0.4802, + "step": 17940 + }, + { + "epoch": 3.168299355749713, + "grad_norm": 2.5064802169799805, + "learning_rate": 1.8324801412180053e-05, + "loss": 0.4109, + "step": 17950 + }, + { + "epoch": 3.1700644250286825, + "grad_norm": 1.0116615295410156, + "learning_rate": 1.8307149161518094e-05, + "loss": 0.3885, + "step": 17960 + }, + { + "epoch": 3.1718294943076515, + "grad_norm": 2.087759256362915, + "learning_rate": 1.8289496910856135e-05, + "loss": 0.4146, + "step": 17970 + }, + { + "epoch": 3.173594563586621, + "grad_norm": 1.0581564903259277, + "learning_rate": 1.8271844660194176e-05, + "loss": 0.4173, + "step": 17980 + }, + { + "epoch": 3.17535963286559, + "grad_norm": 3.7093119621276855, + "learning_rate": 1.825595763459841e-05, + "loss": 0.4077, + "step": 17990 + }, + { + "epoch": 3.177124702144559, + "grad_norm": 0.9037616848945618, + "learning_rate": 1.8238305383936452e-05, + "loss": 0.4099, + "step": 18000 + }, + { + "epoch": 3.177124702144559, + "eval_loss": 0.635881781578064, + "eval_runtime": 591.7121, + "eval_samples_per_second": 47.873, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004861337206706088, + "step": 18000 + }, + { + "epoch": 3.1788897714235285, + "grad_norm": 1.0461044311523438, + "learning_rate": 1.8220653133274493e-05, + "loss": 0.3745, + "step": 18010 + }, + { + "epoch": 3.1806548407024975, + "grad_norm": 3.2152843475341797, + "learning_rate": 1.8203000882612534e-05, + "loss": 0.4088, + "step": 18020 + }, + { + "epoch": 3.182419909981467, + "grad_norm": 0.8267636895179749, + "learning_rate": 1.8185348631950575e-05, + "loss": 0.4058, + "step": 18030 + }, + { + "epoch": 3.184184979260436, + "grad_norm": 1.0266914367675781, + "learning_rate": 1.8167696381288616e-05, + "loss": 0.4244, + "step": 18040 + }, + { + "epoch": 3.185950048539405, + "grad_norm": 3.4762895107269287, + "learning_rate": 1.8150044130626657e-05, + "loss": 0.4025, + "step": 18050 + }, + { + "epoch": 3.1877151178183745, + "grad_norm": 2.3744864463806152, + "learning_rate": 1.8132391879964695e-05, + "loss": 0.3202, + "step": 18060 + }, + { + "epoch": 3.1894801870973435, + "grad_norm": 4.3991379737854, + "learning_rate": 1.8114739629302736e-05, + "loss": 0.4466, + "step": 18070 + }, + { + "epoch": 3.191245256376313, + "grad_norm": 1.149390697479248, + "learning_rate": 1.8097087378640778e-05, + "loss": 0.3886, + "step": 18080 + }, + { + "epoch": 3.193010325655282, + "grad_norm": 2.5938711166381836, + "learning_rate": 1.807943512797882e-05, + "loss": 0.3872, + "step": 18090 + }, + { + "epoch": 3.194775394934251, + "grad_norm": 2.244466781616211, + "learning_rate": 1.806178287731686e-05, + "loss": 0.424, + "step": 18100 + }, + { + "epoch": 3.1965404642132205, + "grad_norm": 1.0753962993621826, + "learning_rate": 1.8044130626654898e-05, + "loss": 0.4268, + "step": 18110 + }, + { + "epoch": 3.1983055334921895, + "grad_norm": 2.415478229522705, + "learning_rate": 1.802647837599294e-05, + "loss": 0.4399, + "step": 18120 + }, + { + "epoch": 3.200070602771159, + "grad_norm": 1.0524026155471802, + "learning_rate": 1.800882612533098e-05, + "loss": 0.4179, + "step": 18130 + }, + { + "epoch": 3.201835672050128, + "grad_norm": 0.7873696684837341, + "learning_rate": 1.799117387466902e-05, + "loss": 0.3989, + "step": 18140 + }, + { + "epoch": 3.203600741329097, + "grad_norm": 1.1974061727523804, + "learning_rate": 1.7973521624007062e-05, + "loss": 0.3389, + "step": 18150 + }, + { + "epoch": 3.2053658106080665, + "grad_norm": 2.7853527069091797, + "learning_rate": 1.7955869373345103e-05, + "loss": 0.4219, + "step": 18160 + }, + { + "epoch": 3.2071308798870355, + "grad_norm": 1.1441482305526733, + "learning_rate": 1.7938217122683144e-05, + "loss": 0.3902, + "step": 18170 + }, + { + "epoch": 3.208895949166005, + "grad_norm": 2.0141642093658447, + "learning_rate": 1.7920564872021182e-05, + "loss": 0.3761, + "step": 18180 + }, + { + "epoch": 3.210661018444974, + "grad_norm": 1.0025014877319336, + "learning_rate": 1.7902912621359223e-05, + "loss": 0.4484, + "step": 18190 + }, + { + "epoch": 3.212426087723943, + "grad_norm": 3.727806568145752, + "learning_rate": 1.7885260370697265e-05, + "loss": 0.3911, + "step": 18200 + }, + { + "epoch": 3.2141911570029125, + "grad_norm": 0.956070065498352, + "learning_rate": 1.7867608120035306e-05, + "loss": 0.4688, + "step": 18210 + }, + { + "epoch": 3.2159562262818815, + "grad_norm": 0.9003653526306152, + "learning_rate": 1.7849955869373347e-05, + "loss": 0.4221, + "step": 18220 + }, + { + "epoch": 3.217721295560851, + "grad_norm": 0.8405077457427979, + "learning_rate": 1.7832303618711388e-05, + "loss": 0.3842, + "step": 18230 + }, + { + "epoch": 3.21948636483982, + "grad_norm": 3.997713088989258, + "learning_rate": 1.7814651368049426e-05, + "loss": 0.3925, + "step": 18240 + }, + { + "epoch": 3.221251434118789, + "grad_norm": 2.2268900871276855, + "learning_rate": 1.7796999117387467e-05, + "loss": 0.4403, + "step": 18250 + }, + { + "epoch": 3.2230165033977585, + "grad_norm": 2.1332101821899414, + "learning_rate": 1.7779346866725508e-05, + "loss": 0.3982, + "step": 18260 + }, + { + "epoch": 3.2247815726767275, + "grad_norm": 0.9260847568511963, + "learning_rate": 1.776169461606355e-05, + "loss": 0.3877, + "step": 18270 + }, + { + "epoch": 3.226546641955697, + "grad_norm": 1.0425130128860474, + "learning_rate": 1.774404236540159e-05, + "loss": 0.3911, + "step": 18280 + }, + { + "epoch": 3.228311711234666, + "grad_norm": 0.8160964250564575, + "learning_rate": 1.772639011473963e-05, + "loss": 0.383, + "step": 18290 + }, + { + "epoch": 3.230076780513635, + "grad_norm": 0.8338566422462463, + "learning_rate": 1.770873786407767e-05, + "loss": 0.4303, + "step": 18300 + }, + { + "epoch": 3.2318418497926045, + "grad_norm": 1.5822185277938843, + "learning_rate": 1.769108561341571e-05, + "loss": 0.3649, + "step": 18310 + }, + { + "epoch": 3.2336069190715735, + "grad_norm": 2.6177778244018555, + "learning_rate": 1.767343336275375e-05, + "loss": 0.4294, + "step": 18320 + }, + { + "epoch": 3.235371988350543, + "grad_norm": 1.1790587902069092, + "learning_rate": 1.7655781112091793e-05, + "loss": 0.4309, + "step": 18330 + }, + { + "epoch": 3.237137057629512, + "grad_norm": 1.1360477209091187, + "learning_rate": 1.7638128861429834e-05, + "loss": 0.4458, + "step": 18340 + }, + { + "epoch": 3.238902126908481, + "grad_norm": 1.3596748113632202, + "learning_rate": 1.7620476610767875e-05, + "loss": 0.4219, + "step": 18350 + }, + { + "epoch": 3.2406671961874505, + "grad_norm": 3.5767550468444824, + "learning_rate": 1.7602824360105916e-05, + "loss": 0.387, + "step": 18360 + }, + { + "epoch": 3.2424322654664195, + "grad_norm": 1.0752480030059814, + "learning_rate": 1.7585172109443954e-05, + "loss": 0.4437, + "step": 18370 + }, + { + "epoch": 3.244197334745389, + "grad_norm": 0.7791188955307007, + "learning_rate": 1.7567519858781995e-05, + "loss": 0.3724, + "step": 18380 + }, + { + "epoch": 3.245962404024358, + "grad_norm": 2.880629062652588, + "learning_rate": 1.7549867608120036e-05, + "loss": 0.3876, + "step": 18390 + }, + { + "epoch": 3.247727473303327, + "grad_norm": 1.916669487953186, + "learning_rate": 1.7532215357458077e-05, + "loss": 0.3946, + "step": 18400 + }, + { + "epoch": 3.2494925425822965, + "grad_norm": 1.1796541213989258, + "learning_rate": 1.751456310679612e-05, + "loss": 0.3918, + "step": 18410 + }, + { + "epoch": 3.2512576118612655, + "grad_norm": 2.1787917613983154, + "learning_rate": 1.7496910856134156e-05, + "loss": 0.3771, + "step": 18420 + }, + { + "epoch": 3.2530226811402345, + "grad_norm": 2.3476030826568604, + "learning_rate": 1.7479258605472197e-05, + "loss": 0.4083, + "step": 18430 + }, + { + "epoch": 3.254787750419204, + "grad_norm": 2.371819496154785, + "learning_rate": 1.746160635481024e-05, + "loss": 0.3711, + "step": 18440 + }, + { + "epoch": 3.256552819698173, + "grad_norm": 0.7971315383911133, + "learning_rate": 1.744395410414828e-05, + "loss": 0.3566, + "step": 18450 + }, + { + "epoch": 3.2583178889771425, + "grad_norm": 1.1119778156280518, + "learning_rate": 1.742630185348632e-05, + "loss": 0.39, + "step": 18460 + }, + { + "epoch": 3.2600829582561115, + "grad_norm": 3.2914371490478516, + "learning_rate": 1.7408649602824362e-05, + "loss": 0.429, + "step": 18470 + }, + { + "epoch": 3.2618480275350805, + "grad_norm": 1.6263724565505981, + "learning_rate": 1.7390997352162403e-05, + "loss": 0.3881, + "step": 18480 + }, + { + "epoch": 3.26361309681405, + "grad_norm": 3.777236223220825, + "learning_rate": 1.7373345101500444e-05, + "loss": 0.3768, + "step": 18490 + }, + { + "epoch": 3.265378166093019, + "grad_norm": 0.7307336330413818, + "learning_rate": 1.7355692850838482e-05, + "loss": 0.3739, + "step": 18500 + }, + { + "epoch": 3.2671432353719885, + "grad_norm": 0.869637668132782, + "learning_rate": 1.7338040600176523e-05, + "loss": 0.5611, + "step": 18510 + }, + { + "epoch": 3.2689083046509575, + "grad_norm": 1.3609660863876343, + "learning_rate": 1.7320388349514564e-05, + "loss": 0.3561, + "step": 18520 + }, + { + "epoch": 3.2706733739299265, + "grad_norm": 1.3725719451904297, + "learning_rate": 1.7302736098852605e-05, + "loss": 0.4136, + "step": 18530 + }, + { + "epoch": 3.272438443208896, + "grad_norm": 1.636234998703003, + "learning_rate": 1.7285083848190643e-05, + "loss": 0.3679, + "step": 18540 + }, + { + "epoch": 3.274203512487865, + "grad_norm": 0.9934202432632446, + "learning_rate": 1.7267431597528684e-05, + "loss": 0.4268, + "step": 18550 + }, + { + "epoch": 3.2759685817668345, + "grad_norm": 2.107452869415283, + "learning_rate": 1.7249779346866725e-05, + "loss": 0.4559, + "step": 18560 + }, + { + "epoch": 3.2777336510458035, + "grad_norm": 1.0061404705047607, + "learning_rate": 1.7232127096204767e-05, + "loss": 0.3688, + "step": 18570 + }, + { + "epoch": 3.2794987203247725, + "grad_norm": 1.0886799097061157, + "learning_rate": 1.7214474845542808e-05, + "loss": 0.3709, + "step": 18580 + }, + { + "epoch": 3.281263789603742, + "grad_norm": 0.9672642946243286, + "learning_rate": 1.719682259488085e-05, + "loss": 0.4803, + "step": 18590 + }, + { + "epoch": 3.283028858882711, + "grad_norm": 2.108147382736206, + "learning_rate": 1.717917034421889e-05, + "loss": 0.3735, + "step": 18600 + }, + { + "epoch": 3.2847939281616805, + "grad_norm": 1.3874099254608154, + "learning_rate": 1.716151809355693e-05, + "loss": 0.3592, + "step": 18610 + }, + { + "epoch": 3.2865589974406495, + "grad_norm": 1.8656635284423828, + "learning_rate": 1.7143865842894972e-05, + "loss": 0.3573, + "step": 18620 + }, + { + "epoch": 3.2883240667196185, + "grad_norm": 2.979400873184204, + "learning_rate": 1.7126213592233013e-05, + "loss": 0.4112, + "step": 18630 + }, + { + "epoch": 3.290089135998588, + "grad_norm": 1.2417200803756714, + "learning_rate": 1.710856134157105e-05, + "loss": 0.3632, + "step": 18640 + }, + { + "epoch": 3.291854205277557, + "grad_norm": 1.0724737644195557, + "learning_rate": 1.7090909090909092e-05, + "loss": 0.3741, + "step": 18650 + }, + { + "epoch": 3.2936192745565265, + "grad_norm": 1.0055967569351196, + "learning_rate": 1.707325684024713e-05, + "loss": 0.4599, + "step": 18660 + }, + { + "epoch": 3.2953843438354955, + "grad_norm": 1.7651218175888062, + "learning_rate": 1.705560458958517e-05, + "loss": 0.3824, + "step": 18670 + }, + { + "epoch": 3.2971494131144645, + "grad_norm": 0.8168540000915527, + "learning_rate": 1.7037952338923212e-05, + "loss": 0.4056, + "step": 18680 + }, + { + "epoch": 3.298914482393434, + "grad_norm": 1.0433987379074097, + "learning_rate": 1.7020300088261254e-05, + "loss": 0.4021, + "step": 18690 + }, + { + "epoch": 3.300679551672403, + "grad_norm": 1.1858054399490356, + "learning_rate": 1.7002647837599295e-05, + "loss": 0.4621, + "step": 18700 + }, + { + "epoch": 3.3024446209513725, + "grad_norm": 2.612271785736084, + "learning_rate": 1.6984995586937336e-05, + "loss": 0.4165, + "step": 18710 + }, + { + "epoch": 3.3042096902303415, + "grad_norm": 2.5530431270599365, + "learning_rate": 1.6967343336275377e-05, + "loss": 0.3873, + "step": 18720 + }, + { + "epoch": 3.3059747595093105, + "grad_norm": 1.0394740104675293, + "learning_rate": 1.6949691085613418e-05, + "loss": 0.368, + "step": 18730 + }, + { + "epoch": 3.30773982878828, + "grad_norm": 2.9890246391296387, + "learning_rate": 1.693203883495146e-05, + "loss": 0.3394, + "step": 18740 + }, + { + "epoch": 3.309504898067249, + "grad_norm": 0.9640330672264099, + "learning_rate": 1.69143865842895e-05, + "loss": 0.365, + "step": 18750 + }, + { + "epoch": 3.3112699673462185, + "grad_norm": 2.4221484661102295, + "learning_rate": 1.6896734333627538e-05, + "loss": 0.4086, + "step": 18760 + }, + { + "epoch": 3.3130350366251875, + "grad_norm": 0.8883141875267029, + "learning_rate": 1.687908208296558e-05, + "loss": 0.5143, + "step": 18770 + }, + { + "epoch": 3.3148001059041565, + "grad_norm": 2.6699986457824707, + "learning_rate": 1.6861429832303617e-05, + "loss": 0.4004, + "step": 18780 + }, + { + "epoch": 3.316565175183126, + "grad_norm": 2.7353172302246094, + "learning_rate": 1.6843777581641658e-05, + "loss": 0.4226, + "step": 18790 + }, + { + "epoch": 3.318330244462095, + "grad_norm": 2.341336727142334, + "learning_rate": 1.68261253309797e-05, + "loss": 0.4535, + "step": 18800 + }, + { + "epoch": 3.3200953137410645, + "grad_norm": 0.9151778817176819, + "learning_rate": 1.680847308031774e-05, + "loss": 0.3625, + "step": 18810 + }, + { + "epoch": 3.3218603830200335, + "grad_norm": 0.7896936535835266, + "learning_rate": 1.679082082965578e-05, + "loss": 0.3674, + "step": 18820 + }, + { + "epoch": 3.3236254522990025, + "grad_norm": 1.1445233821868896, + "learning_rate": 1.6773168578993823e-05, + "loss": 0.3445, + "step": 18830 + }, + { + "epoch": 3.325390521577972, + "grad_norm": 1.9561645984649658, + "learning_rate": 1.6755516328331864e-05, + "loss": 0.4055, + "step": 18840 + }, + { + "epoch": 3.327155590856941, + "grad_norm": 3.36482834815979, + "learning_rate": 1.6737864077669905e-05, + "loss": 0.436, + "step": 18850 + }, + { + "epoch": 3.3289206601359105, + "grad_norm": 1.7744373083114624, + "learning_rate": 1.6720211827007946e-05, + "loss": 0.4467, + "step": 18860 + }, + { + "epoch": 3.3306857294148795, + "grad_norm": 1.379241704940796, + "learning_rate": 1.6702559576345987e-05, + "loss": 0.4124, + "step": 18870 + }, + { + "epoch": 3.3324507986938485, + "grad_norm": 0.9114649891853333, + "learning_rate": 1.6684907325684025e-05, + "loss": 0.3661, + "step": 18880 + }, + { + "epoch": 3.334215867972818, + "grad_norm": 2.910946846008301, + "learning_rate": 1.6667255075022066e-05, + "loss": 0.3866, + "step": 18890 + }, + { + "epoch": 3.335980937251787, + "grad_norm": 1.5581128597259521, + "learning_rate": 1.6649602824360104e-05, + "loss": 0.4196, + "step": 18900 + }, + { + "epoch": 3.3377460065307565, + "grad_norm": 1.8757463693618774, + "learning_rate": 1.6631950573698145e-05, + "loss": 0.3783, + "step": 18910 + }, + { + "epoch": 3.3395110758097255, + "grad_norm": 3.579040765762329, + "learning_rate": 1.6614298323036186e-05, + "loss": 0.3791, + "step": 18920 + }, + { + "epoch": 3.3412761450886945, + "grad_norm": 3.096893072128296, + "learning_rate": 1.6596646072374228e-05, + "loss": 0.4378, + "step": 18930 + }, + { + "epoch": 3.343041214367664, + "grad_norm": 3.1453256607055664, + "learning_rate": 1.657899382171227e-05, + "loss": 0.3997, + "step": 18940 + }, + { + "epoch": 3.344806283646633, + "grad_norm": 1.0607496500015259, + "learning_rate": 1.656134157105031e-05, + "loss": 0.4079, + "step": 18950 + }, + { + "epoch": 3.3465713529256025, + "grad_norm": 1.0377461910247803, + "learning_rate": 1.654368932038835e-05, + "loss": 0.4123, + "step": 18960 + }, + { + "epoch": 3.3483364222045715, + "grad_norm": 1.3114358186721802, + "learning_rate": 1.6526037069726392e-05, + "loss": 0.4246, + "step": 18970 + }, + { + "epoch": 3.3501014914835405, + "grad_norm": 4.857133388519287, + "learning_rate": 1.6508384819064433e-05, + "loss": 0.3943, + "step": 18980 + }, + { + "epoch": 3.35186656076251, + "grad_norm": 2.0322043895721436, + "learning_rate": 1.6490732568402474e-05, + "loss": 0.399, + "step": 18990 + }, + { + "epoch": 3.353631630041479, + "grad_norm": 0.7585875391960144, + "learning_rate": 1.6473080317740512e-05, + "loss": 0.3883, + "step": 19000 + }, + { + "epoch": 3.353631630041479, + "eval_loss": 0.6337409615516663, + "eval_runtime": 591.7269, + "eval_samples_per_second": 47.872, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004964856287099412, + "step": 19000 + }, + { + "epoch": 3.3553966993204485, + "grad_norm": 3.319875717163086, + "learning_rate": 1.6455428067078553e-05, + "loss": 0.4074, + "step": 19010 + }, + { + "epoch": 3.3571617685994175, + "grad_norm": 1.039914846420288, + "learning_rate": 1.6437775816416594e-05, + "loss": 0.3939, + "step": 19020 + }, + { + "epoch": 3.3589268378783865, + "grad_norm": 2.315622329711914, + "learning_rate": 1.6420123565754632e-05, + "loss": 0.4632, + "step": 19030 + }, + { + "epoch": 3.360691907157356, + "grad_norm": 1.0073977708816528, + "learning_rate": 1.6402471315092673e-05, + "loss": 0.3984, + "step": 19040 + }, + { + "epoch": 3.362456976436325, + "grad_norm": 2.9430811405181885, + "learning_rate": 1.6384819064430714e-05, + "loss": 0.3565, + "step": 19050 + }, + { + "epoch": 3.3642220457152945, + "grad_norm": 2.2811427116394043, + "learning_rate": 1.6367166813768756e-05, + "loss": 0.3919, + "step": 19060 + }, + { + "epoch": 3.3659871149942635, + "grad_norm": 0.8443194031715393, + "learning_rate": 1.6349514563106797e-05, + "loss": 0.413, + "step": 19070 + }, + { + "epoch": 3.3677521842732325, + "grad_norm": 1.146048665046692, + "learning_rate": 1.6331862312444838e-05, + "loss": 0.4198, + "step": 19080 + }, + { + "epoch": 3.369517253552202, + "grad_norm": 0.9706215262413025, + "learning_rate": 1.631421006178288e-05, + "loss": 0.3997, + "step": 19090 + }, + { + "epoch": 3.371282322831171, + "grad_norm": 0.9332062005996704, + "learning_rate": 1.629655781112092e-05, + "loss": 0.4265, + "step": 19100 + }, + { + "epoch": 3.3730473921101405, + "grad_norm": 2.7127761840820312, + "learning_rate": 1.6278905560458958e-05, + "loss": 0.4015, + "step": 19110 + }, + { + "epoch": 3.3748124613891095, + "grad_norm": 3.304807424545288, + "learning_rate": 1.6261253309797e-05, + "loss": 0.4094, + "step": 19120 + }, + { + "epoch": 3.3765775306680785, + "grad_norm": 2.3264896869659424, + "learning_rate": 1.624360105913504e-05, + "loss": 0.4057, + "step": 19130 + }, + { + "epoch": 3.378342599947048, + "grad_norm": 2.1947805881500244, + "learning_rate": 1.622594880847308e-05, + "loss": 0.4145, + "step": 19140 + }, + { + "epoch": 3.380107669226017, + "grad_norm": 1.283929705619812, + "learning_rate": 1.6208296557811123e-05, + "loss": 0.3286, + "step": 19150 + }, + { + "epoch": 3.3818727385049865, + "grad_norm": 2.9055802822113037, + "learning_rate": 1.619064430714916e-05, + "loss": 0.3332, + "step": 19160 + }, + { + "epoch": 3.3836378077839555, + "grad_norm": 2.830183267593384, + "learning_rate": 1.61729920564872e-05, + "loss": 0.399, + "step": 19170 + }, + { + "epoch": 3.3854028770629245, + "grad_norm": 1.0093673467636108, + "learning_rate": 1.6155339805825243e-05, + "loss": 0.3873, + "step": 19180 + }, + { + "epoch": 3.387167946341894, + "grad_norm": 2.602121591567993, + "learning_rate": 1.6137687555163284e-05, + "loss": 0.3895, + "step": 19190 + }, + { + "epoch": 3.388933015620863, + "grad_norm": 3.5917704105377197, + "learning_rate": 1.6120035304501325e-05, + "loss": 0.372, + "step": 19200 + }, + { + "epoch": 3.3906980848998325, + "grad_norm": 0.8379570245742798, + "learning_rate": 1.6102383053839366e-05, + "loss": 0.3456, + "step": 19210 + }, + { + "epoch": 3.3924631541788015, + "grad_norm": 1.011858582496643, + "learning_rate": 1.6084730803177407e-05, + "loss": 0.4192, + "step": 19220 + }, + { + "epoch": 3.3942282234577705, + "grad_norm": 2.373765707015991, + "learning_rate": 1.6067078552515445e-05, + "loss": 0.4006, + "step": 19230 + }, + { + "epoch": 3.39599329273674, + "grad_norm": 3.50282621383667, + "learning_rate": 1.6049426301853486e-05, + "loss": 0.49, + "step": 19240 + }, + { + "epoch": 3.397758362015709, + "grad_norm": 2.8352739810943604, + "learning_rate": 1.6031774051191527e-05, + "loss": 0.4188, + "step": 19250 + }, + { + "epoch": 3.3995234312946785, + "grad_norm": 2.508870840072632, + "learning_rate": 1.601412180052957e-05, + "loss": 0.389, + "step": 19260 + }, + { + "epoch": 3.4012885005736475, + "grad_norm": 1.0266497135162354, + "learning_rate": 1.599646954986761e-05, + "loss": 0.3711, + "step": 19270 + }, + { + "epoch": 3.4030535698526165, + "grad_norm": 1.2052311897277832, + "learning_rate": 1.597881729920565e-05, + "loss": 0.4537, + "step": 19280 + }, + { + "epoch": 3.404818639131586, + "grad_norm": 2.9178948402404785, + "learning_rate": 1.5961165048543692e-05, + "loss": 0.423, + "step": 19290 + }, + { + "epoch": 3.406583708410555, + "grad_norm": 0.8659635186195374, + "learning_rate": 1.594351279788173e-05, + "loss": 0.4, + "step": 19300 + }, + { + "epoch": 3.4083487776895245, + "grad_norm": 0.8659955859184265, + "learning_rate": 1.592586054721977e-05, + "loss": 0.3794, + "step": 19310 + }, + { + "epoch": 3.4101138469684935, + "grad_norm": 0.8710178732872009, + "learning_rate": 1.5908208296557812e-05, + "loss": 0.4134, + "step": 19320 + }, + { + "epoch": 3.4118789162474625, + "grad_norm": 0.758415162563324, + "learning_rate": 1.5890556045895853e-05, + "loss": 0.3794, + "step": 19330 + }, + { + "epoch": 3.413643985526432, + "grad_norm": 1.963364839553833, + "learning_rate": 1.5872903795233894e-05, + "loss": 0.3979, + "step": 19340 + }, + { + "epoch": 3.415409054805401, + "grad_norm": 1.1023908853530884, + "learning_rate": 1.5855251544571932e-05, + "loss": 0.394, + "step": 19350 + }, + { + "epoch": 3.4171741240843705, + "grad_norm": 2.183608055114746, + "learning_rate": 1.5837599293909973e-05, + "loss": 0.4181, + "step": 19360 + }, + { + "epoch": 3.4189391933633395, + "grad_norm": 1.6452324390411377, + "learning_rate": 1.5819947043248014e-05, + "loss": 0.378, + "step": 19370 + }, + { + "epoch": 3.4207042626423085, + "grad_norm": 1.1097979545593262, + "learning_rate": 1.5802294792586055e-05, + "loss": 0.3809, + "step": 19380 + }, + { + "epoch": 3.422469331921278, + "grad_norm": 0.9524782299995422, + "learning_rate": 1.5784642541924097e-05, + "loss": 0.4181, + "step": 19390 + }, + { + "epoch": 3.424234401200247, + "grad_norm": 2.443341016769409, + "learning_rate": 1.5766990291262138e-05, + "loss": 0.4372, + "step": 19400 + }, + { + "epoch": 3.4259994704792165, + "grad_norm": 1.8552148342132568, + "learning_rate": 1.574933804060018e-05, + "loss": 0.4068, + "step": 19410 + }, + { + "epoch": 3.4277645397581855, + "grad_norm": 2.9345669746398926, + "learning_rate": 1.573168578993822e-05, + "loss": 0.3257, + "step": 19420 + }, + { + "epoch": 3.4295296090371545, + "grad_norm": 2.683713674545288, + "learning_rate": 1.5714033539276258e-05, + "loss": 0.3966, + "step": 19430 + }, + { + "epoch": 3.431294678316124, + "grad_norm": 2.2262651920318604, + "learning_rate": 1.56963812886143e-05, + "loss": 0.3859, + "step": 19440 + }, + { + "epoch": 3.433059747595093, + "grad_norm": 3.83772611618042, + "learning_rate": 1.567872903795234e-05, + "loss": 0.3707, + "step": 19450 + }, + { + "epoch": 3.4348248168740625, + "grad_norm": 1.1714105606079102, + "learning_rate": 1.566107678729038e-05, + "loss": 0.4113, + "step": 19460 + }, + { + "epoch": 3.4365898861530315, + "grad_norm": 2.8277933597564697, + "learning_rate": 1.564342453662842e-05, + "loss": 0.388, + "step": 19470 + }, + { + "epoch": 3.4383549554320005, + "grad_norm": 2.253077268600464, + "learning_rate": 1.562577228596646e-05, + "loss": 0.3965, + "step": 19480 + }, + { + "epoch": 3.44012002471097, + "grad_norm": 2.4589033126831055, + "learning_rate": 1.56081200353045e-05, + "loss": 0.4136, + "step": 19490 + }, + { + "epoch": 3.441885093989939, + "grad_norm": 1.1025439500808716, + "learning_rate": 1.5590467784642542e-05, + "loss": 0.4406, + "step": 19500 + }, + { + "epoch": 3.4436501632689085, + "grad_norm": 4.201873302459717, + "learning_rate": 1.5572815533980583e-05, + "loss": 0.4258, + "step": 19510 + }, + { + "epoch": 3.4454152325478775, + "grad_norm": 2.1414599418640137, + "learning_rate": 1.5555163283318625e-05, + "loss": 0.3728, + "step": 19520 + }, + { + "epoch": 3.4471803018268465, + "grad_norm": 3.3690412044525146, + "learning_rate": 1.5537511032656666e-05, + "loss": 0.4301, + "step": 19530 + }, + { + "epoch": 3.448945371105816, + "grad_norm": 0.9869192838668823, + "learning_rate": 1.5519858781994707e-05, + "loss": 0.406, + "step": 19540 + }, + { + "epoch": 3.450710440384785, + "grad_norm": 0.9513691663742065, + "learning_rate": 1.5502206531332748e-05, + "loss": 0.3683, + "step": 19550 + }, + { + "epoch": 3.4524755096637545, + "grad_norm": 3.8297016620635986, + "learning_rate": 1.5484554280670786e-05, + "loss": 0.4542, + "step": 19560 + }, + { + "epoch": 3.4542405789427235, + "grad_norm": 0.9088470935821533, + "learning_rate": 1.5466902030008827e-05, + "loss": 0.3416, + "step": 19570 + }, + { + "epoch": 3.4560056482216925, + "grad_norm": 3.7486371994018555, + "learning_rate": 1.5449249779346868e-05, + "loss": 0.3973, + "step": 19580 + }, + { + "epoch": 3.457770717500662, + "grad_norm": 1.234134554862976, + "learning_rate": 1.5431597528684906e-05, + "loss": 0.4224, + "step": 19590 + }, + { + "epoch": 3.459535786779631, + "grad_norm": 2.473367691040039, + "learning_rate": 1.5413945278022947e-05, + "loss": 0.4131, + "step": 19600 + }, + { + "epoch": 3.4613008560586005, + "grad_norm": 0.9623596668243408, + "learning_rate": 1.5396293027360988e-05, + "loss": 0.4584, + "step": 19610 + }, + { + "epoch": 3.4630659253375695, + "grad_norm": 1.3496516942977905, + "learning_rate": 1.537864077669903e-05, + "loss": 0.3863, + "step": 19620 + }, + { + "epoch": 3.4648309946165385, + "grad_norm": 3.4756240844726562, + "learning_rate": 1.536098852603707e-05, + "loss": 0.4121, + "step": 19630 + }, + { + "epoch": 3.466596063895508, + "grad_norm": 0.9052891731262207, + "learning_rate": 1.534333627537511e-05, + "loss": 0.4241, + "step": 19640 + }, + { + "epoch": 3.468361133174477, + "grad_norm": 1.1238759756088257, + "learning_rate": 1.5325684024713153e-05, + "loss": 0.3724, + "step": 19650 + }, + { + "epoch": 3.4701262024534465, + "grad_norm": 0.7360913753509521, + "learning_rate": 1.5308031774051194e-05, + "loss": 0.374, + "step": 19660 + }, + { + "epoch": 3.4718912717324155, + "grad_norm": 2.606687068939209, + "learning_rate": 1.5290379523389235e-05, + "loss": 0.5032, + "step": 19670 + }, + { + "epoch": 3.4736563410113845, + "grad_norm": 3.076735496520996, + "learning_rate": 1.5272727272727276e-05, + "loss": 0.3582, + "step": 19680 + }, + { + "epoch": 3.475421410290354, + "grad_norm": 3.728522300720215, + "learning_rate": 1.5255075022065312e-05, + "loss": 0.4011, + "step": 19690 + }, + { + "epoch": 3.477186479569323, + "grad_norm": 1.052911639213562, + "learning_rate": 1.5237422771403353e-05, + "loss": 0.4207, + "step": 19700 + }, + { + "epoch": 3.4789515488482925, + "grad_norm": 1.7988530397415161, + "learning_rate": 1.5219770520741395e-05, + "loss": 0.4107, + "step": 19710 + }, + { + "epoch": 3.4807166181272615, + "grad_norm": 1.0843051671981812, + "learning_rate": 1.5202118270079436e-05, + "loss": 0.3656, + "step": 19720 + }, + { + "epoch": 3.4824816874062305, + "grad_norm": 2.6198229789733887, + "learning_rate": 1.5184466019417475e-05, + "loss": 0.37, + "step": 19730 + }, + { + "epoch": 3.4842467566852, + "grad_norm": 2.750885009765625, + "learning_rate": 1.5166813768755516e-05, + "loss": 0.3745, + "step": 19740 + }, + { + "epoch": 3.486011825964169, + "grad_norm": 3.1571855545043945, + "learning_rate": 1.5149161518093557e-05, + "loss": 0.3779, + "step": 19750 + }, + { + "epoch": 3.4877768952431385, + "grad_norm": 1.179776668548584, + "learning_rate": 1.5131509267431599e-05, + "loss": 0.3756, + "step": 19760 + }, + { + "epoch": 3.4895419645221075, + "grad_norm": 0.8757907748222351, + "learning_rate": 1.511385701676964e-05, + "loss": 0.4455, + "step": 19770 + }, + { + "epoch": 3.4913070338010765, + "grad_norm": 1.0547289848327637, + "learning_rate": 1.509620476610768e-05, + "loss": 0.3914, + "step": 19780 + }, + { + "epoch": 3.493072103080046, + "grad_norm": 0.7314989566802979, + "learning_rate": 1.507855251544572e-05, + "loss": 0.445, + "step": 19790 + }, + { + "epoch": 3.494837172359015, + "grad_norm": 3.2221102714538574, + "learning_rate": 1.5060900264783761e-05, + "loss": 0.374, + "step": 19800 + }, + { + "epoch": 3.4966022416379845, + "grad_norm": 0.7653711438179016, + "learning_rate": 1.5043248014121803e-05, + "loss": 0.3547, + "step": 19810 + }, + { + "epoch": 3.4983673109169535, + "grad_norm": 1.3949837684631348, + "learning_rate": 1.5025595763459844e-05, + "loss": 0.3801, + "step": 19820 + }, + { + "epoch": 3.5001323801959225, + "grad_norm": 2.3859100341796875, + "learning_rate": 1.5007943512797882e-05, + "loss": 0.4, + "step": 19830 + }, + { + "epoch": 3.501897449474892, + "grad_norm": 2.28014874458313, + "learning_rate": 1.4990291262135923e-05, + "loss": 0.4424, + "step": 19840 + }, + { + "epoch": 3.503662518753861, + "grad_norm": 0.9215405583381653, + "learning_rate": 1.4972639011473962e-05, + "loss": 0.3775, + "step": 19850 + }, + { + "epoch": 3.5054275880328305, + "grad_norm": 3.190624237060547, + "learning_rate": 1.4954986760812003e-05, + "loss": 0.3817, + "step": 19860 + }, + { + "epoch": 3.5071926573117995, + "grad_norm": 1.0653491020202637, + "learning_rate": 1.4937334510150044e-05, + "loss": 0.3981, + "step": 19870 + }, + { + "epoch": 3.5089577265907685, + "grad_norm": 0.8434626460075378, + "learning_rate": 1.4919682259488086e-05, + "loss": 0.3985, + "step": 19880 + }, + { + "epoch": 3.510722795869738, + "grad_norm": 1.124345064163208, + "learning_rate": 1.4902030008826127e-05, + "loss": 0.4017, + "step": 19890 + }, + { + "epoch": 3.512487865148707, + "grad_norm": 2.5151355266571045, + "learning_rate": 1.4884377758164166e-05, + "loss": 0.4992, + "step": 19900 + }, + { + "epoch": 3.5142529344276765, + "grad_norm": 1.2481873035430908, + "learning_rate": 1.4866725507502207e-05, + "loss": 0.3878, + "step": 19910 + }, + { + "epoch": 3.5160180037066455, + "grad_norm": 1.1398800611495972, + "learning_rate": 1.4849073256840248e-05, + "loss": 0.3707, + "step": 19920 + }, + { + "epoch": 3.5177830729856145, + "grad_norm": 2.3349595069885254, + "learning_rate": 1.483142100617829e-05, + "loss": 0.3675, + "step": 19930 + }, + { + "epoch": 3.519548142264584, + "grad_norm": 1.104368805885315, + "learning_rate": 1.481376875551633e-05, + "loss": 0.3351, + "step": 19940 + }, + { + "epoch": 3.521313211543553, + "grad_norm": 2.1135847568511963, + "learning_rate": 1.479611650485437e-05, + "loss": 0.326, + "step": 19950 + }, + { + "epoch": 3.5230782808225225, + "grad_norm": 0.7931279540061951, + "learning_rate": 1.477846425419241e-05, + "loss": 0.3917, + "step": 19960 + }, + { + "epoch": 3.5248433501014915, + "grad_norm": 3.3754491806030273, + "learning_rate": 1.4760812003530449e-05, + "loss": 0.3659, + "step": 19970 + }, + { + "epoch": 3.5266084193804605, + "grad_norm": 3.644896984100342, + "learning_rate": 1.474315975286849e-05, + "loss": 0.3987, + "step": 19980 + }, + { + "epoch": 3.52837348865943, + "grad_norm": 1.0437495708465576, + "learning_rate": 1.4725507502206531e-05, + "loss": 0.3862, + "step": 19990 + }, + { + "epoch": 3.530138557938399, + "grad_norm": 0.6430332660675049, + "learning_rate": 1.4707855251544573e-05, + "loss": 0.361, + "step": 20000 + }, + { + "epoch": 3.530138557938399, + "eval_loss": 0.627667248249054, + "eval_runtime": 591.668, + "eval_samples_per_second": 47.877, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004995303075450389, + "step": 20000 + }, + { + "epoch": 3.5319036272173685, + "grad_norm": 0.9894623756408691, + "learning_rate": 1.4690203000882614e-05, + "loss": 0.397, + "step": 20010 + }, + { + "epoch": 3.5336686964963375, + "grad_norm": 0.8043822646141052, + "learning_rate": 1.4672550750220653e-05, + "loss": 0.2927, + "step": 20020 + }, + { + "epoch": 3.5354337657753065, + "grad_norm": 1.4241021871566772, + "learning_rate": 1.4654898499558694e-05, + "loss": 0.3976, + "step": 20030 + }, + { + "epoch": 3.537198835054276, + "grad_norm": 2.9625587463378906, + "learning_rate": 1.4637246248896735e-05, + "loss": 0.4065, + "step": 20040 + }, + { + "epoch": 3.538963904333245, + "grad_norm": 1.040947675704956, + "learning_rate": 1.4619593998234777e-05, + "loss": 0.4393, + "step": 20050 + }, + { + "epoch": 3.5407289736122145, + "grad_norm": 2.4093751907348633, + "learning_rate": 1.4601941747572818e-05, + "loss": 0.4131, + "step": 20060 + }, + { + "epoch": 3.5424940428911835, + "grad_norm": 0.9673560857772827, + "learning_rate": 1.4584289496910857e-05, + "loss": 0.3504, + "step": 20070 + }, + { + "epoch": 3.5442591121701525, + "grad_norm": 1.1420180797576904, + "learning_rate": 1.4566637246248898e-05, + "loss": 0.3997, + "step": 20080 + }, + { + "epoch": 3.546024181449122, + "grad_norm": 2.3320648670196533, + "learning_rate": 1.4548984995586936e-05, + "loss": 0.4331, + "step": 20090 + }, + { + "epoch": 3.547789250728091, + "grad_norm": 2.1832408905029297, + "learning_rate": 1.4531332744924977e-05, + "loss": 0.4219, + "step": 20100 + }, + { + "epoch": 3.5495543200070605, + "grad_norm": 0.7137057781219482, + "learning_rate": 1.4513680494263018e-05, + "loss": 0.3839, + "step": 20110 + }, + { + "epoch": 3.5513193892860295, + "grad_norm": 2.7392122745513916, + "learning_rate": 1.449602824360106e-05, + "loss": 0.3617, + "step": 20120 + }, + { + "epoch": 3.5530844585649985, + "grad_norm": 2.4449989795684814, + "learning_rate": 1.44783759929391e-05, + "loss": 0.3957, + "step": 20130 + }, + { + "epoch": 3.554849527843968, + "grad_norm": 1.5275472402572632, + "learning_rate": 1.446072374227714e-05, + "loss": 0.3073, + "step": 20140 + }, + { + "epoch": 3.556614597122937, + "grad_norm": 0.8892549276351929, + "learning_rate": 1.4443071491615181e-05, + "loss": 0.4079, + "step": 20150 + }, + { + "epoch": 3.5583796664019065, + "grad_norm": 2.4779117107391357, + "learning_rate": 1.4425419240953222e-05, + "loss": 0.3696, + "step": 20160 + }, + { + "epoch": 3.5601447356808755, + "grad_norm": 2.004605770111084, + "learning_rate": 1.4407766990291264e-05, + "loss": 0.3899, + "step": 20170 + }, + { + "epoch": 3.5619098049598445, + "grad_norm": 1.1132246255874634, + "learning_rate": 1.4390114739629305e-05, + "loss": 0.3858, + "step": 20180 + }, + { + "epoch": 3.563674874238814, + "grad_norm": 3.4782907962799072, + "learning_rate": 1.4372462488967344e-05, + "loss": 0.3474, + "step": 20190 + }, + { + "epoch": 3.565439943517783, + "grad_norm": 0.9343898296356201, + "learning_rate": 1.4354810238305385e-05, + "loss": 0.3925, + "step": 20200 + }, + { + "epoch": 3.5672050127967525, + "grad_norm": 1.3837854862213135, + "learning_rate": 1.4337157987643426e-05, + "loss": 0.4391, + "step": 20210 + }, + { + "epoch": 3.5689700820757215, + "grad_norm": 4.418737888336182, + "learning_rate": 1.4319505736981464e-05, + "loss": 0.3682, + "step": 20220 + }, + { + "epoch": 3.5707351513546906, + "grad_norm": 2.9316399097442627, + "learning_rate": 1.4301853486319505e-05, + "loss": 0.3837, + "step": 20230 + }, + { + "epoch": 3.57250022063366, + "grad_norm": 1.3634068965911865, + "learning_rate": 1.4284201235657546e-05, + "loss": 0.4566, + "step": 20240 + }, + { + "epoch": 3.574265289912629, + "grad_norm": 1.034454107284546, + "learning_rate": 1.4266548984995588e-05, + "loss": 0.4131, + "step": 20250 + }, + { + "epoch": 3.5760303591915985, + "grad_norm": 0.7224887013435364, + "learning_rate": 1.4248896734333627e-05, + "loss": 0.3576, + "step": 20260 + }, + { + "epoch": 3.5777954284705675, + "grad_norm": 1.0184725522994995, + "learning_rate": 1.4231244483671668e-05, + "loss": 0.4344, + "step": 20270 + }, + { + "epoch": 3.5795604977495366, + "grad_norm": 0.8519681096076965, + "learning_rate": 1.421359223300971e-05, + "loss": 0.3649, + "step": 20280 + }, + { + "epoch": 3.581325567028506, + "grad_norm": 0.8168842196464539, + "learning_rate": 1.419593998234775e-05, + "loss": 0.3725, + "step": 20290 + }, + { + "epoch": 3.583090636307475, + "grad_norm": 4.341702938079834, + "learning_rate": 1.4178287731685792e-05, + "loss": 0.4082, + "step": 20300 + }, + { + "epoch": 3.5848557055864445, + "grad_norm": 0.7056460380554199, + "learning_rate": 1.4160635481023831e-05, + "loss": 0.379, + "step": 20310 + }, + { + "epoch": 3.5866207748654135, + "grad_norm": 3.9937493801116943, + "learning_rate": 1.4142983230361872e-05, + "loss": 0.3684, + "step": 20320 + }, + { + "epoch": 3.5883858441443826, + "grad_norm": 2.532386064529419, + "learning_rate": 1.4125330979699913e-05, + "loss": 0.3415, + "step": 20330 + }, + { + "epoch": 3.590150913423352, + "grad_norm": 2.4106085300445557, + "learning_rate": 1.4107678729037955e-05, + "loss": 0.3569, + "step": 20340 + }, + { + "epoch": 3.591915982702321, + "grad_norm": 1.1405956745147705, + "learning_rate": 1.4090026478375996e-05, + "loss": 0.4453, + "step": 20350 + }, + { + "epoch": 3.5936810519812905, + "grad_norm": 1.306481957435608, + "learning_rate": 1.4072374227714033e-05, + "loss": 0.3698, + "step": 20360 + }, + { + "epoch": 3.5954461212602595, + "grad_norm": 0.9987174272537231, + "learning_rate": 1.4054721977052075e-05, + "loss": 0.3728, + "step": 20370 + }, + { + "epoch": 3.5972111905392286, + "grad_norm": 4.221369743347168, + "learning_rate": 1.4037069726390114e-05, + "loss": 0.41, + "step": 20380 + }, + { + "epoch": 3.598976259818198, + "grad_norm": 1.0658732652664185, + "learning_rate": 1.4019417475728155e-05, + "loss": 0.3043, + "step": 20390 + }, + { + "epoch": 3.600741329097167, + "grad_norm": 1.355228066444397, + "learning_rate": 1.4001765225066196e-05, + "loss": 0.377, + "step": 20400 + }, + { + "epoch": 3.6025063983761365, + "grad_norm": 3.138543128967285, + "learning_rate": 1.3984112974404237e-05, + "loss": 0.3937, + "step": 20410 + }, + { + "epoch": 3.6042714676551055, + "grad_norm": 0.8033159375190735, + "learning_rate": 1.3966460723742279e-05, + "loss": 0.3792, + "step": 20420 + }, + { + "epoch": 3.6060365369340746, + "grad_norm": 0.810020923614502, + "learning_rate": 1.3948808473080318e-05, + "loss": 0.4466, + "step": 20430 + }, + { + "epoch": 3.607801606213044, + "grad_norm": 1.978060007095337, + "learning_rate": 1.393115622241836e-05, + "loss": 0.4336, + "step": 20440 + }, + { + "epoch": 3.609566675492013, + "grad_norm": 2.3692550659179688, + "learning_rate": 1.39135039717564e-05, + "loss": 0.3689, + "step": 20450 + }, + { + "epoch": 3.6113317447709825, + "grad_norm": 1.4536701440811157, + "learning_rate": 1.3895851721094442e-05, + "loss": 0.377, + "step": 20460 + }, + { + "epoch": 3.6130968140499515, + "grad_norm": 1.230667233467102, + "learning_rate": 1.3878199470432483e-05, + "loss": 0.3791, + "step": 20470 + }, + { + "epoch": 3.6148618833289206, + "grad_norm": 0.8049829006195068, + "learning_rate": 1.3860547219770522e-05, + "loss": 0.3594, + "step": 20480 + }, + { + "epoch": 3.61662695260789, + "grad_norm": 0.8730904459953308, + "learning_rate": 1.3842894969108562e-05, + "loss": 0.3676, + "step": 20490 + }, + { + "epoch": 3.618392021886859, + "grad_norm": 4.918626308441162, + "learning_rate": 1.3825242718446601e-05, + "loss": 0.4362, + "step": 20500 + }, + { + "epoch": 3.6201570911658285, + "grad_norm": 1.2398933172225952, + "learning_rate": 1.3807590467784642e-05, + "loss": 0.3963, + "step": 20510 + }, + { + "epoch": 3.6219221604447975, + "grad_norm": 1.5302728414535522, + "learning_rate": 1.3789938217122683e-05, + "loss": 0.3749, + "step": 20520 + }, + { + "epoch": 3.6236872297237666, + "grad_norm": 3.465019702911377, + "learning_rate": 1.3772285966460724e-05, + "loss": 0.4055, + "step": 20530 + }, + { + "epoch": 3.625452299002736, + "grad_norm": 2.143458604812622, + "learning_rate": 1.3754633715798766e-05, + "loss": 0.3957, + "step": 20540 + }, + { + "epoch": 3.627217368281705, + "grad_norm": 2.2639827728271484, + "learning_rate": 1.3736981465136805e-05, + "loss": 0.3577, + "step": 20550 + }, + { + "epoch": 3.6289824375606745, + "grad_norm": 1.8318665027618408, + "learning_rate": 1.3719329214474846e-05, + "loss": 0.3756, + "step": 20560 + }, + { + "epoch": 3.6307475068396435, + "grad_norm": 0.9574584364891052, + "learning_rate": 1.3701676963812887e-05, + "loss": 0.3594, + "step": 20570 + }, + { + "epoch": 3.6325125761186126, + "grad_norm": 1.0685011148452759, + "learning_rate": 1.3684024713150929e-05, + "loss": 0.3595, + "step": 20580 + }, + { + "epoch": 3.6342776453975816, + "grad_norm": 1.5938297510147095, + "learning_rate": 1.366637246248897e-05, + "loss": 0.3697, + "step": 20590 + }, + { + "epoch": 3.636042714676551, + "grad_norm": 1.924798846244812, + "learning_rate": 1.3648720211827009e-05, + "loss": 0.4691, + "step": 20600 + }, + { + "epoch": 3.6378077839555205, + "grad_norm": 1.6222426891326904, + "learning_rate": 1.363106796116505e-05, + "loss": 0.36, + "step": 20610 + }, + { + "epoch": 3.6395728532344895, + "grad_norm": 1.0212628841400146, + "learning_rate": 1.3613415710503088e-05, + "loss": 0.3561, + "step": 20620 + }, + { + "epoch": 3.6413379225134586, + "grad_norm": 1.080259084701538, + "learning_rate": 1.359576345984113e-05, + "loss": 0.3642, + "step": 20630 + }, + { + "epoch": 3.6431029917924276, + "grad_norm": 1.1045883893966675, + "learning_rate": 1.357811120917917e-05, + "loss": 0.4131, + "step": 20640 + }, + { + "epoch": 3.644868061071397, + "grad_norm": 2.390874147415161, + "learning_rate": 1.3560458958517211e-05, + "loss": 0.3447, + "step": 20650 + }, + { + "epoch": 3.6466331303503665, + "grad_norm": 1.1138941049575806, + "learning_rate": 1.3542806707855251e-05, + "loss": 0.3586, + "step": 20660 + }, + { + "epoch": 3.6483981996293355, + "grad_norm": 1.5280704498291016, + "learning_rate": 1.3525154457193292e-05, + "loss": 0.3944, + "step": 20670 + }, + { + "epoch": 3.6501632689083046, + "grad_norm": 1.3573005199432373, + "learning_rate": 1.3507502206531333e-05, + "loss": 0.3321, + "step": 20680 + }, + { + "epoch": 3.6519283381872736, + "grad_norm": 2.458770751953125, + "learning_rate": 1.3489849955869374e-05, + "loss": 0.4535, + "step": 20690 + }, + { + "epoch": 3.653693407466243, + "grad_norm": 1.4218165874481201, + "learning_rate": 1.3472197705207415e-05, + "loss": 0.3851, + "step": 20700 + }, + { + "epoch": 3.6554584767452125, + "grad_norm": 3.5979576110839844, + "learning_rate": 1.3454545454545457e-05, + "loss": 0.4107, + "step": 20710 + }, + { + "epoch": 3.6572235460241815, + "grad_norm": 0.8865223526954651, + "learning_rate": 1.3436893203883496e-05, + "loss": 0.3959, + "step": 20720 + }, + { + "epoch": 3.6589886153031506, + "grad_norm": 1.084224820137024, + "learning_rate": 1.3419240953221537e-05, + "loss": 0.3803, + "step": 20730 + }, + { + "epoch": 3.6607536845821196, + "grad_norm": 1.8160624504089355, + "learning_rate": 1.3401588702559578e-05, + "loss": 0.4098, + "step": 20740 + }, + { + "epoch": 3.662518753861089, + "grad_norm": 1.106541633605957, + "learning_rate": 1.3383936451897616e-05, + "loss": 0.4183, + "step": 20750 + }, + { + "epoch": 3.6642838231400585, + "grad_norm": 0.7795540690422058, + "learning_rate": 1.3366284201235657e-05, + "loss": 0.288, + "step": 20760 + }, + { + "epoch": 3.6660488924190275, + "grad_norm": 1.5033721923828125, + "learning_rate": 1.3348631950573698e-05, + "loss": 0.4001, + "step": 20770 + }, + { + "epoch": 3.6678139616979966, + "grad_norm": 0.7211555242538452, + "learning_rate": 1.3330979699911738e-05, + "loss": 0.297, + "step": 20780 + }, + { + "epoch": 3.6695790309769656, + "grad_norm": 4.685546875, + "learning_rate": 1.3313327449249779e-05, + "loss": 0.3903, + "step": 20790 + }, + { + "epoch": 3.671344100255935, + "grad_norm": 0.9873565435409546, + "learning_rate": 1.329567519858782e-05, + "loss": 0.3887, + "step": 20800 + }, + { + "epoch": 3.6731091695349045, + "grad_norm": 3.9295639991760254, + "learning_rate": 1.3278022947925861e-05, + "loss": 0.3855, + "step": 20810 + }, + { + "epoch": 3.6748742388138735, + "grad_norm": Infinity, + "learning_rate": 1.3262135922330099e-05, + "loss": 0.4774, + "step": 20820 + }, + { + "epoch": 3.6766393080928426, + "grad_norm": 1.827843427658081, + "learning_rate": 1.3244483671668137e-05, + "loss": 0.3703, + "step": 20830 + }, + { + "epoch": 3.6784043773718116, + "grad_norm": 3.078402042388916, + "learning_rate": 1.3226831421006178e-05, + "loss": 0.3907, + "step": 20840 + }, + { + "epoch": 3.680169446650781, + "grad_norm": 1.2314592599868774, + "learning_rate": 1.3209179170344219e-05, + "loss": 0.3843, + "step": 20850 + }, + { + "epoch": 3.6819345159297505, + "grad_norm": 1.0087709426879883, + "learning_rate": 1.319152691968226e-05, + "loss": 0.4249, + "step": 20860 + }, + { + "epoch": 3.6836995852087195, + "grad_norm": 0.9953848719596863, + "learning_rate": 1.31738746690203e-05, + "loss": 0.3404, + "step": 20870 + }, + { + "epoch": 3.6854646544876886, + "grad_norm": 2.3661205768585205, + "learning_rate": 1.315622241835834e-05, + "loss": 0.3602, + "step": 20880 + }, + { + "epoch": 3.6872297237666576, + "grad_norm": 2.3409297466278076, + "learning_rate": 1.3138570167696382e-05, + "loss": 0.3858, + "step": 20890 + }, + { + "epoch": 3.688994793045627, + "grad_norm": 1.0345180034637451, + "learning_rate": 1.3120917917034423e-05, + "loss": 0.4095, + "step": 20900 + }, + { + "epoch": 3.690759862324596, + "grad_norm": 0.862300455570221, + "learning_rate": 1.3103265666372464e-05, + "loss": 0.3509, + "step": 20910 + }, + { + "epoch": 3.6925249316035655, + "grad_norm": 4.365932941436768, + "learning_rate": 1.3085613415710504e-05, + "loss": 0.3304, + "step": 20920 + }, + { + "epoch": 3.6942900008825346, + "grad_norm": 1.525173544883728, + "learning_rate": 1.3067961165048545e-05, + "loss": 0.3922, + "step": 20930 + }, + { + "epoch": 3.6960550701615036, + "grad_norm": 1.042880654335022, + "learning_rate": 1.3050308914386586e-05, + "loss": 0.379, + "step": 20940 + }, + { + "epoch": 3.697820139440473, + "grad_norm": 5.789563179016113, + "learning_rate": 1.3032656663724627e-05, + "loss": 0.4081, + "step": 20950 + }, + { + "epoch": 3.699585208719442, + "grad_norm": 1.1197853088378906, + "learning_rate": 1.3015004413062668e-05, + "loss": 0.3498, + "step": 20960 + }, + { + "epoch": 3.7013502779984115, + "grad_norm": 3.8499810695648193, + "learning_rate": 1.2997352162400706e-05, + "loss": 0.4205, + "step": 20970 + }, + { + "epoch": 3.7031153472773806, + "grad_norm": 1.4235578775405884, + "learning_rate": 1.2979699911738746e-05, + "loss": 0.3928, + "step": 20980 + }, + { + "epoch": 3.7048804165563496, + "grad_norm": 1.1226133108139038, + "learning_rate": 1.2962047661076787e-05, + "loss": 0.4256, + "step": 20990 + }, + { + "epoch": 3.706645485835319, + "grad_norm": 1.0519105195999146, + "learning_rate": 1.2944395410414828e-05, + "loss": 0.3961, + "step": 21000 + }, + { + "epoch": 3.706645485835319, + "eval_loss": 0.624920129776001, + "eval_runtime": 591.415, + "eval_samples_per_second": 47.897, + "eval_steps_per_second": 2.396, + "eval_token_accuracy": 0.0005052137080372214, + "step": 21000 + }, + { + "epoch": 3.708410555114288, + "grad_norm": 1.1179771423339844, + "learning_rate": 1.2926743159752869e-05, + "loss": 0.4488, + "step": 21010 + }, + { + "epoch": 3.7101756243932575, + "grad_norm": 1.0017509460449219, + "learning_rate": 1.290909090909091e-05, + "loss": 0.3567, + "step": 21020 + }, + { + "epoch": 3.7119406936722266, + "grad_norm": 3.0131490230560303, + "learning_rate": 1.289143865842895e-05, + "loss": 0.3817, + "step": 21030 + }, + { + "epoch": 3.7137057629511956, + "grad_norm": 2.2893624305725098, + "learning_rate": 1.287378640776699e-05, + "loss": 0.3994, + "step": 21040 + }, + { + "epoch": 3.715470832230165, + "grad_norm": 3.590670347213745, + "learning_rate": 1.2856134157105032e-05, + "loss": 0.3815, + "step": 21050 + }, + { + "epoch": 3.717235901509134, + "grad_norm": 0.7297062873840332, + "learning_rate": 1.2838481906443073e-05, + "loss": 0.4081, + "step": 21060 + }, + { + "epoch": 3.7190009707881035, + "grad_norm": 2.541386127471924, + "learning_rate": 1.2820829655781114e-05, + "loss": 0.3312, + "step": 21070 + }, + { + "epoch": 3.7207660400670726, + "grad_norm": 2.1308650970458984, + "learning_rate": 1.2803177405119155e-05, + "loss": 0.4065, + "step": 21080 + }, + { + "epoch": 3.7225311093460416, + "grad_norm": 1.2172911167144775, + "learning_rate": 1.2785525154457195e-05, + "loss": 0.3209, + "step": 21090 + }, + { + "epoch": 3.724296178625011, + "grad_norm": 3.1587109565734863, + "learning_rate": 1.2767872903795232e-05, + "loss": 0.4147, + "step": 21100 + }, + { + "epoch": 3.72606124790398, + "grad_norm": 2.742392063140869, + "learning_rate": 1.2750220653133274e-05, + "loss": 0.3323, + "step": 21110 + }, + { + "epoch": 3.7278263171829495, + "grad_norm": 1.6285436153411865, + "learning_rate": 1.2732568402471315e-05, + "loss": 0.3819, + "step": 21120 + }, + { + "epoch": 3.7295913864619186, + "grad_norm": 0.7405969500541687, + "learning_rate": 1.2714916151809356e-05, + "loss": 0.4268, + "step": 21130 + }, + { + "epoch": 3.7313564557408876, + "grad_norm": 0.9725006818771362, + "learning_rate": 1.2697263901147397e-05, + "loss": 0.3536, + "step": 21140 + }, + { + "epoch": 3.733121525019857, + "grad_norm": 0.6396371126174927, + "learning_rate": 1.2679611650485437e-05, + "loss": 0.3708, + "step": 21150 + }, + { + "epoch": 3.734886594298826, + "grad_norm": 0.7963176369667053, + "learning_rate": 1.2661959399823478e-05, + "loss": 0.3832, + "step": 21160 + }, + { + "epoch": 3.7366516635777955, + "grad_norm": 1.2382067441940308, + "learning_rate": 1.2644307149161519e-05, + "loss": 0.3598, + "step": 21170 + }, + { + "epoch": 3.7384167328567646, + "grad_norm": 2.2310237884521484, + "learning_rate": 1.262665489849956e-05, + "loss": 0.4122, + "step": 21180 + }, + { + "epoch": 3.7401818021357336, + "grad_norm": 2.335681915283203, + "learning_rate": 1.2609002647837601e-05, + "loss": 0.4517, + "step": 21190 + }, + { + "epoch": 3.741946871414703, + "grad_norm": 1.0254428386688232, + "learning_rate": 1.259135039717564e-05, + "loss": 0.404, + "step": 21200 + }, + { + "epoch": 3.743711940693672, + "grad_norm": 2.330514907836914, + "learning_rate": 1.2573698146513682e-05, + "loss": 0.4372, + "step": 21210 + }, + { + "epoch": 3.7454770099726415, + "grad_norm": 1.1486183404922485, + "learning_rate": 1.2556045895851723e-05, + "loss": 0.4251, + "step": 21220 + }, + { + "epoch": 3.7472420792516106, + "grad_norm": 1.931495189666748, + "learning_rate": 1.253839364518976e-05, + "loss": 0.3881, + "step": 21230 + }, + { + "epoch": 3.7490071485305796, + "grad_norm": 1.995705246925354, + "learning_rate": 1.2520741394527802e-05, + "loss": 0.38, + "step": 21240 + }, + { + "epoch": 3.750772217809549, + "grad_norm": 3.3702809810638428, + "learning_rate": 1.2503089143865843e-05, + "loss": 0.4274, + "step": 21250 + }, + { + "epoch": 3.752537287088518, + "grad_norm": 0.8253573179244995, + "learning_rate": 1.2485436893203884e-05, + "loss": 0.4227, + "step": 21260 + }, + { + "epoch": 3.7543023563674875, + "grad_norm": 2.0807230472564697, + "learning_rate": 1.2467784642541923e-05, + "loss": 0.412, + "step": 21270 + }, + { + "epoch": 3.7560674256464566, + "grad_norm": 2.1444883346557617, + "learning_rate": 1.2450132391879965e-05, + "loss": 0.3729, + "step": 21280 + }, + { + "epoch": 3.7578324949254256, + "grad_norm": 0.9703782200813293, + "learning_rate": 1.2432480141218006e-05, + "loss": 0.4516, + "step": 21290 + }, + { + "epoch": 3.759597564204395, + "grad_norm": 1.991798758506775, + "learning_rate": 1.2414827890556047e-05, + "loss": 0.4525, + "step": 21300 + }, + { + "epoch": 3.761362633483364, + "grad_norm": 1.26823091506958, + "learning_rate": 1.2397175639894088e-05, + "loss": 0.3618, + "step": 21310 + }, + { + "epoch": 3.7631277027623335, + "grad_norm": 0.8779565691947937, + "learning_rate": 1.2379523389232128e-05, + "loss": 0.3955, + "step": 21320 + }, + { + "epoch": 3.7648927720413026, + "grad_norm": 1.0764127969741821, + "learning_rate": 1.2361871138570167e-05, + "loss": 0.3765, + "step": 21330 + }, + { + "epoch": 3.7666578413202716, + "grad_norm": 1.1968106031417847, + "learning_rate": 1.2344218887908208e-05, + "loss": 0.4095, + "step": 21340 + }, + { + "epoch": 3.768422910599241, + "grad_norm": 3.0292351245880127, + "learning_rate": 1.232656663724625e-05, + "loss": 0.4801, + "step": 21350 + }, + { + "epoch": 3.77018797987821, + "grad_norm": 1.730216145515442, + "learning_rate": 1.230891438658429e-05, + "loss": 0.3684, + "step": 21360 + }, + { + "epoch": 3.7719530491571795, + "grad_norm": 4.9076080322265625, + "learning_rate": 1.2291262135922332e-05, + "loss": 0.3868, + "step": 21370 + }, + { + "epoch": 3.7737181184361486, + "grad_norm": 1.4453113079071045, + "learning_rate": 1.2273609885260371e-05, + "loss": 0.3908, + "step": 21380 + }, + { + "epoch": 3.7754831877151176, + "grad_norm": 2.025132417678833, + "learning_rate": 1.2255957634598412e-05, + "loss": 0.3748, + "step": 21390 + }, + { + "epoch": 3.777248256994087, + "grad_norm": 0.9815580248832703, + "learning_rate": 1.2238305383936452e-05, + "loss": 0.3808, + "step": 21400 + }, + { + "epoch": 3.779013326273056, + "grad_norm": 4.732287883758545, + "learning_rate": 1.2220653133274493e-05, + "loss": 0.3854, + "step": 21410 + }, + { + "epoch": 3.7807783955520256, + "grad_norm": 2.854905605316162, + "learning_rate": 1.2203000882612534e-05, + "loss": 0.432, + "step": 21420 + }, + { + "epoch": 3.7825434648309946, + "grad_norm": 2.033327341079712, + "learning_rate": 1.2185348631950575e-05, + "loss": 0.3847, + "step": 21430 + }, + { + "epoch": 3.7843085341099636, + "grad_norm": 1.2547252178192139, + "learning_rate": 1.2167696381288614e-05, + "loss": 0.3925, + "step": 21440 + }, + { + "epoch": 3.786073603388933, + "grad_norm": 0.7951124310493469, + "learning_rate": 1.2150044130626656e-05, + "loss": 0.3882, + "step": 21450 + }, + { + "epoch": 3.787838672667902, + "grad_norm": 1.1173574924468994, + "learning_rate": 1.2132391879964695e-05, + "loss": 0.3262, + "step": 21460 + }, + { + "epoch": 3.7896037419468716, + "grad_norm": 1.071058988571167, + "learning_rate": 1.2114739629302736e-05, + "loss": 0.4359, + "step": 21470 + }, + { + "epoch": 3.7913688112258406, + "grad_norm": 1.3308414220809937, + "learning_rate": 1.2097087378640777e-05, + "loss": 0.4015, + "step": 21480 + }, + { + "epoch": 3.7931338805048096, + "grad_norm": 1.0162357091903687, + "learning_rate": 1.2079435127978819e-05, + "loss": 0.4003, + "step": 21490 + }, + { + "epoch": 3.794898949783779, + "grad_norm": 0.7248963713645935, + "learning_rate": 1.2061782877316858e-05, + "loss": 0.3947, + "step": 21500 + }, + { + "epoch": 3.796664019062748, + "grad_norm": 2.628209114074707, + "learning_rate": 1.2044130626654899e-05, + "loss": 0.3891, + "step": 21510 + }, + { + "epoch": 3.7984290883417176, + "grad_norm": 1.0009437799453735, + "learning_rate": 1.202647837599294e-05, + "loss": 0.4172, + "step": 21520 + }, + { + "epoch": 3.8001941576206866, + "grad_norm": 1.0610207319259644, + "learning_rate": 1.200882612533098e-05, + "loss": 0.3344, + "step": 21530 + }, + { + "epoch": 3.8019592268996556, + "grad_norm": 2.8646228313446045, + "learning_rate": 1.1991173874669021e-05, + "loss": 0.4041, + "step": 21540 + }, + { + "epoch": 3.803724296178625, + "grad_norm": 1.103757619857788, + "learning_rate": 1.1973521624007062e-05, + "loss": 0.4352, + "step": 21550 + }, + { + "epoch": 3.805489365457594, + "grad_norm": 1.9466677904129028, + "learning_rate": 1.1955869373345101e-05, + "loss": 0.3928, + "step": 21560 + }, + { + "epoch": 3.8072544347365636, + "grad_norm": 0.7169159650802612, + "learning_rate": 1.1938217122683143e-05, + "loss": 0.419, + "step": 21570 + }, + { + "epoch": 3.8090195040155326, + "grad_norm": 2.1274571418762207, + "learning_rate": 1.1920564872021184e-05, + "loss": 0.3842, + "step": 21580 + }, + { + "epoch": 3.8107845732945016, + "grad_norm": 2.4608614444732666, + "learning_rate": 1.1902912621359223e-05, + "loss": 0.4074, + "step": 21590 + }, + { + "epoch": 3.812549642573471, + "grad_norm": 0.9707876443862915, + "learning_rate": 1.1885260370697264e-05, + "loss": 0.3886, + "step": 21600 + }, + { + "epoch": 3.81431471185244, + "grad_norm": 1.1373486518859863, + "learning_rate": 1.1867608120035306e-05, + "loss": 0.3951, + "step": 21610 + }, + { + "epoch": 3.8160797811314096, + "grad_norm": 1.4261958599090576, + "learning_rate": 1.1849955869373345e-05, + "loss": 0.4412, + "step": 21620 + }, + { + "epoch": 3.8178448504103786, + "grad_norm": 1.2144572734832764, + "learning_rate": 1.1832303618711386e-05, + "loss": 0.4004, + "step": 21630 + }, + { + "epoch": 3.8196099196893476, + "grad_norm": 1.5304597616195679, + "learning_rate": 1.1814651368049427e-05, + "loss": 0.3798, + "step": 21640 + }, + { + "epoch": 3.821374988968317, + "grad_norm": 2.4164507389068604, + "learning_rate": 1.1796999117387468e-05, + "loss": 0.4313, + "step": 21650 + }, + { + "epoch": 3.823140058247286, + "grad_norm": 0.7153837084770203, + "learning_rate": 1.1779346866725508e-05, + "loss": 0.3471, + "step": 21660 + }, + { + "epoch": 3.8249051275262556, + "grad_norm": 0.9994945526123047, + "learning_rate": 1.1761694616063549e-05, + "loss": 0.4341, + "step": 21670 + }, + { + "epoch": 3.8266701968052246, + "grad_norm": 2.4096806049346924, + "learning_rate": 1.1744042365401588e-05, + "loss": 0.432, + "step": 21680 + }, + { + "epoch": 3.8284352660841936, + "grad_norm": 4.5101141929626465, + "learning_rate": 1.172639011473963e-05, + "loss": 0.4459, + "step": 21690 + }, + { + "epoch": 3.830200335363163, + "grad_norm": 2.3713507652282715, + "learning_rate": 1.170873786407767e-05, + "loss": 0.4236, + "step": 21700 + }, + { + "epoch": 3.831965404642132, + "grad_norm": 1.1655126810073853, + "learning_rate": 1.1691085613415712e-05, + "loss": 0.4113, + "step": 21710 + }, + { + "epoch": 3.8337304739211016, + "grad_norm": 1.9403069019317627, + "learning_rate": 1.1673433362753753e-05, + "loss": 0.3659, + "step": 21720 + }, + { + "epoch": 3.8354955432000706, + "grad_norm": 2.1725008487701416, + "learning_rate": 1.1655781112091792e-05, + "loss": 0.3584, + "step": 21730 + }, + { + "epoch": 3.8372606124790396, + "grad_norm": 1.0423593521118164, + "learning_rate": 1.1638128861429832e-05, + "loss": 0.3916, + "step": 21740 + }, + { + "epoch": 3.839025681758009, + "grad_norm": 0.7833170294761658, + "learning_rate": 1.1620476610767873e-05, + "loss": 0.3754, + "step": 21750 + }, + { + "epoch": 3.840790751036978, + "grad_norm": 2.643669843673706, + "learning_rate": 1.1602824360105914e-05, + "loss": 0.3785, + "step": 21760 + }, + { + "epoch": 3.8425558203159476, + "grad_norm": 1.3187824487686157, + "learning_rate": 1.1585172109443955e-05, + "loss": 0.3943, + "step": 21770 + }, + { + "epoch": 3.8443208895949166, + "grad_norm": 1.010647177696228, + "learning_rate": 1.1567519858781997e-05, + "loss": 0.4056, + "step": 21780 + }, + { + "epoch": 3.8460859588738856, + "grad_norm": 1.4921151399612427, + "learning_rate": 1.1549867608120036e-05, + "loss": 0.4014, + "step": 21790 + }, + { + "epoch": 3.847851028152855, + "grad_norm": 1.1991304159164429, + "learning_rate": 1.1532215357458075e-05, + "loss": 0.3655, + "step": 21800 + }, + { + "epoch": 3.849616097431824, + "grad_norm": 3.473905324935913, + "learning_rate": 1.1514563106796117e-05, + "loss": 0.3779, + "step": 21810 + }, + { + "epoch": 3.8513811667107936, + "grad_norm": 1.5528671741485596, + "learning_rate": 1.1496910856134158e-05, + "loss": 0.3933, + "step": 21820 + }, + { + "epoch": 3.8531462359897626, + "grad_norm": 2.279825210571289, + "learning_rate": 1.1479258605472199e-05, + "loss": 0.4505, + "step": 21830 + }, + { + "epoch": 3.8549113052687316, + "grad_norm": 1.464953899383545, + "learning_rate": 1.146160635481024e-05, + "loss": 0.4017, + "step": 21840 + }, + { + "epoch": 3.856676374547701, + "grad_norm": 3.4239885807037354, + "learning_rate": 1.144395410414828e-05, + "loss": 0.4011, + "step": 21850 + }, + { + "epoch": 3.85844144382667, + "grad_norm": 3.552250385284424, + "learning_rate": 1.1426301853486319e-05, + "loss": 0.3232, + "step": 21860 + }, + { + "epoch": 3.8602065131056396, + "grad_norm": 3.231083393096924, + "learning_rate": 1.140864960282436e-05, + "loss": 0.3967, + "step": 21870 + }, + { + "epoch": 3.8619715823846086, + "grad_norm": 1.8909341096878052, + "learning_rate": 1.1390997352162401e-05, + "loss": 0.3818, + "step": 21880 + }, + { + "epoch": 3.8637366516635776, + "grad_norm": 1.1951463222503662, + "learning_rate": 1.1373345101500442e-05, + "loss": 0.4373, + "step": 21890 + }, + { + "epoch": 3.865501720942547, + "grad_norm": 2.14921498298645, + "learning_rate": 1.1355692850838483e-05, + "loss": 0.4407, + "step": 21900 + }, + { + "epoch": 3.867266790221516, + "grad_norm": 1.4103434085845947, + "learning_rate": 1.1338040600176523e-05, + "loss": 0.3667, + "step": 21910 + }, + { + "epoch": 3.8690318595004856, + "grad_norm": 0.9304023385047913, + "learning_rate": 1.1320388349514564e-05, + "loss": 0.3761, + "step": 21920 + }, + { + "epoch": 3.8707969287794546, + "grad_norm": 0.8020132184028625, + "learning_rate": 1.1302736098852604e-05, + "loss": 0.3869, + "step": 21930 + }, + { + "epoch": 3.8725619980584236, + "grad_norm": 1.112574815750122, + "learning_rate": 1.1285083848190645e-05, + "loss": 0.3541, + "step": 21940 + }, + { + "epoch": 3.874327067337393, + "grad_norm": 1.3293650150299072, + "learning_rate": 1.1267431597528686e-05, + "loss": 0.5064, + "step": 21950 + }, + { + "epoch": 3.876092136616362, + "grad_norm": 1.3969178199768066, + "learning_rate": 1.1249779346866727e-05, + "loss": 0.3996, + "step": 21960 + }, + { + "epoch": 3.8778572058953316, + "grad_norm": 1.1569809913635254, + "learning_rate": 1.1232127096204766e-05, + "loss": 0.3674, + "step": 21970 + }, + { + "epoch": 3.8796222751743006, + "grad_norm": 1.0100200176239014, + "learning_rate": 1.1214474845542808e-05, + "loss": 0.3997, + "step": 21980 + }, + { + "epoch": 3.8813873444532696, + "grad_norm": 0.8106080293655396, + "learning_rate": 1.1196822594880847e-05, + "loss": 0.4123, + "step": 21990 + }, + { + "epoch": 3.883152413732239, + "grad_norm": 1.5351225137710571, + "learning_rate": 1.1179170344218888e-05, + "loss": 0.3907, + "step": 22000 + }, + { + "epoch": 3.883152413732239, + "eval_loss": 0.6229148507118225, + "eval_runtime": 591.6, + "eval_samples_per_second": 47.882, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0005044017936811953, + "step": 22000 + }, + { + "epoch": 3.884917483011208, + "grad_norm": 1.0947341918945312, + "learning_rate": 1.116151809355693e-05, + "loss": 0.3824, + "step": 22010 + }, + { + "epoch": 3.8866825522901776, + "grad_norm": 2.8309361934661865, + "learning_rate": 1.114386584289497e-05, + "loss": 0.357, + "step": 22020 + }, + { + "epoch": 3.8884476215691466, + "grad_norm": 3.102479934692383, + "learning_rate": 1.112621359223301e-05, + "loss": 0.3478, + "step": 22030 + }, + { + "epoch": 3.8902126908481156, + "grad_norm": 2.9752273559570312, + "learning_rate": 1.1108561341571051e-05, + "loss": 0.4227, + "step": 22040 + }, + { + "epoch": 3.891977760127085, + "grad_norm": 1.2526441812515259, + "learning_rate": 1.1090909090909092e-05, + "loss": 0.4003, + "step": 22050 + }, + { + "epoch": 3.893742829406054, + "grad_norm": 1.1294667720794678, + "learning_rate": 1.1073256840247132e-05, + "loss": 0.3236, + "step": 22060 + }, + { + "epoch": 3.8955078986850236, + "grad_norm": 0.9164405465126038, + "learning_rate": 1.1055604589585173e-05, + "loss": 0.3619, + "step": 22070 + }, + { + "epoch": 3.8972729679639926, + "grad_norm": 1.0946732759475708, + "learning_rate": 1.1037952338923214e-05, + "loss": 0.4133, + "step": 22080 + }, + { + "epoch": 3.8990380372429616, + "grad_norm": 1.1232048273086548, + "learning_rate": 1.1020300088261253e-05, + "loss": 0.3772, + "step": 22090 + }, + { + "epoch": 3.900803106521931, + "grad_norm": 3.4187843799591064, + "learning_rate": 1.1002647837599295e-05, + "loss": 0.4244, + "step": 22100 + }, + { + "epoch": 3.9025681758009, + "grad_norm": 2.3030498027801514, + "learning_rate": 1.0984995586937336e-05, + "loss": 0.3796, + "step": 22110 + }, + { + "epoch": 3.9043332450798696, + "grad_norm": 1.2743421792984009, + "learning_rate": 1.0967343336275375e-05, + "loss": 0.3715, + "step": 22120 + }, + { + "epoch": 3.9060983143588386, + "grad_norm": 1.7408547401428223, + "learning_rate": 1.0949691085613416e-05, + "loss": 0.3988, + "step": 22130 + }, + { + "epoch": 3.9078633836378076, + "grad_norm": 3.8301382064819336, + "learning_rate": 1.0932038834951456e-05, + "loss": 0.3595, + "step": 22140 + }, + { + "epoch": 3.909628452916777, + "grad_norm": 1.3640156984329224, + "learning_rate": 1.0914386584289497e-05, + "loss": 0.3977, + "step": 22150 + }, + { + "epoch": 3.911393522195746, + "grad_norm": 2.915240526199341, + "learning_rate": 1.0896734333627538e-05, + "loss": 0.3883, + "step": 22160 + }, + { + "epoch": 3.9131585914747156, + "grad_norm": 1.0516003370285034, + "learning_rate": 1.087908208296558e-05, + "loss": 0.4486, + "step": 22170 + }, + { + "epoch": 3.9149236607536846, + "grad_norm": 1.0945969820022583, + "learning_rate": 1.086142983230362e-05, + "loss": 0.4029, + "step": 22180 + }, + { + "epoch": 3.9166887300326536, + "grad_norm": 3.0902202129364014, + "learning_rate": 1.084377758164166e-05, + "loss": 0.448, + "step": 22190 + }, + { + "epoch": 3.918453799311623, + "grad_norm": 0.8262447118759155, + "learning_rate": 1.08261253309797e-05, + "loss": 0.3915, + "step": 22200 + }, + { + "epoch": 3.920218868590592, + "grad_norm": 0.8729456663131714, + "learning_rate": 1.080847308031774e-05, + "loss": 0.4089, + "step": 22210 + }, + { + "epoch": 3.9219839378695616, + "grad_norm": 1.9946736097335815, + "learning_rate": 1.0790820829655782e-05, + "loss": 0.3995, + "step": 22220 + }, + { + "epoch": 3.9237490071485306, + "grad_norm": 0.9540746808052063, + "learning_rate": 1.0773168578993823e-05, + "loss": 0.4238, + "step": 22230 + }, + { + "epoch": 3.9255140764274996, + "grad_norm": 0.8667685985565186, + "learning_rate": 1.0755516328331864e-05, + "loss": 0.3823, + "step": 22240 + }, + { + "epoch": 3.927279145706469, + "grad_norm": 1.721529483795166, + "learning_rate": 1.0737864077669903e-05, + "loss": 0.3752, + "step": 22250 + }, + { + "epoch": 3.929044214985438, + "grad_norm": 2.2760324478149414, + "learning_rate": 1.0720211827007943e-05, + "loss": 0.3983, + "step": 22260 + }, + { + "epoch": 3.9308092842644076, + "grad_norm": 3.106400489807129, + "learning_rate": 1.0702559576345984e-05, + "loss": 0.4276, + "step": 22270 + }, + { + "epoch": 3.9325743535433766, + "grad_norm": 1.0799741744995117, + "learning_rate": 1.0684907325684025e-05, + "loss": 0.3941, + "step": 22280 + }, + { + "epoch": 3.9343394228223456, + "grad_norm": 0.885012149810791, + "learning_rate": 1.0667255075022066e-05, + "loss": 0.437, + "step": 22290 + }, + { + "epoch": 3.936104492101315, + "grad_norm": 0.9948944449424744, + "learning_rate": 1.0649602824360107e-05, + "loss": 0.3712, + "step": 22300 + }, + { + "epoch": 3.937869561380284, + "grad_norm": 1.2208306789398193, + "learning_rate": 1.0631950573698147e-05, + "loss": 0.3858, + "step": 22310 + }, + { + "epoch": 3.9396346306592536, + "grad_norm": 1.3812928199768066, + "learning_rate": 1.0614298323036186e-05, + "loss": 0.3662, + "step": 22320 + }, + { + "epoch": 3.9413996999382226, + "grad_norm": 2.223764657974243, + "learning_rate": 1.0596646072374227e-05, + "loss": 0.3875, + "step": 22330 + }, + { + "epoch": 3.9431647692171916, + "grad_norm": 1.549903392791748, + "learning_rate": 1.0578993821712269e-05, + "loss": 0.3764, + "step": 22340 + }, + { + "epoch": 3.944929838496161, + "grad_norm": 1.3353488445281982, + "learning_rate": 1.056134157105031e-05, + "loss": 0.4389, + "step": 22350 + }, + { + "epoch": 3.94669490777513, + "grad_norm": 0.9973052144050598, + "learning_rate": 1.054368932038835e-05, + "loss": 0.4134, + "step": 22360 + }, + { + "epoch": 3.9484599770540996, + "grad_norm": 0.7656703591346741, + "learning_rate": 1.052603706972639e-05, + "loss": 0.3623, + "step": 22370 + }, + { + "epoch": 3.9502250463330686, + "grad_norm": 1.0713194608688354, + "learning_rate": 1.0508384819064431e-05, + "loss": 0.3699, + "step": 22380 + }, + { + "epoch": 3.9519901156120376, + "grad_norm": 0.9375460147857666, + "learning_rate": 1.0490732568402471e-05, + "loss": 0.3788, + "step": 22390 + }, + { + "epoch": 3.953755184891007, + "grad_norm": 2.809772253036499, + "learning_rate": 1.0473080317740512e-05, + "loss": 0.3766, + "step": 22400 + }, + { + "epoch": 3.955520254169976, + "grad_norm": 1.0046566724777222, + "learning_rate": 1.0455428067078553e-05, + "loss": 0.3173, + "step": 22410 + }, + { + "epoch": 3.9572853234489456, + "grad_norm": 3.1235032081604004, + "learning_rate": 1.0437775816416594e-05, + "loss": 0.3317, + "step": 22420 + }, + { + "epoch": 3.9590503927279146, + "grad_norm": 2.9782588481903076, + "learning_rate": 1.0420123565754634e-05, + "loss": 0.368, + "step": 22430 + }, + { + "epoch": 3.9608154620068836, + "grad_norm": 1.7899539470672607, + "learning_rate": 1.0402471315092675e-05, + "loss": 0.3335, + "step": 22440 + }, + { + "epoch": 3.962580531285853, + "grad_norm": 1.4096568822860718, + "learning_rate": 1.0384819064430714e-05, + "loss": 0.387, + "step": 22450 + }, + { + "epoch": 3.964345600564822, + "grad_norm": 2.6816868782043457, + "learning_rate": 1.0367166813768755e-05, + "loss": 0.4641, + "step": 22460 + }, + { + "epoch": 3.9661106698437916, + "grad_norm": 0.9058939814567566, + "learning_rate": 1.0349514563106797e-05, + "loss": 0.3344, + "step": 22470 + }, + { + "epoch": 3.9678757391227606, + "grad_norm": 1.312925934791565, + "learning_rate": 1.0331862312444838e-05, + "loss": 0.403, + "step": 22480 + }, + { + "epoch": 3.9696408084017296, + "grad_norm": 2.5122547149658203, + "learning_rate": 1.0314210061782877e-05, + "loss": 0.3507, + "step": 22490 + }, + { + "epoch": 3.971405877680699, + "grad_norm": 1.3816343545913696, + "learning_rate": 1.0296557811120918e-05, + "loss": 0.4329, + "step": 22500 + }, + { + "epoch": 3.973170946959668, + "grad_norm": 3.181731939315796, + "learning_rate": 1.027890556045896e-05, + "loss": 0.4151, + "step": 22510 + }, + { + "epoch": 3.9749360162386376, + "grad_norm": 2.3720247745513916, + "learning_rate": 1.0261253309796999e-05, + "loss": 0.354, + "step": 22520 + }, + { + "epoch": 3.9767010855176066, + "grad_norm": 2.9242501258850098, + "learning_rate": 1.024360105913504e-05, + "loss": 0.3514, + "step": 22530 + }, + { + "epoch": 3.9784661547965756, + "grad_norm": 2.076550245285034, + "learning_rate": 1.0225948808473081e-05, + "loss": 0.3575, + "step": 22540 + }, + { + "epoch": 3.980231224075545, + "grad_norm": 3.645087957382202, + "learning_rate": 1.020829655781112e-05, + "loss": 0.4174, + "step": 22550 + }, + { + "epoch": 3.981996293354514, + "grad_norm": 1.596049427986145, + "learning_rate": 1.0190644307149162e-05, + "loss": 0.393, + "step": 22560 + }, + { + "epoch": 3.9837613626334836, + "grad_norm": 1.1321353912353516, + "learning_rate": 1.0172992056487203e-05, + "loss": 0.4111, + "step": 22570 + }, + { + "epoch": 3.9855264319124526, + "grad_norm": 1.0473411083221436, + "learning_rate": 1.0155339805825244e-05, + "loss": 0.3882, + "step": 22580 + }, + { + "epoch": 3.9872915011914216, + "grad_norm": 0.7440062761306763, + "learning_rate": 1.0137687555163284e-05, + "loss": 0.3498, + "step": 22590 + }, + { + "epoch": 3.989056570470391, + "grad_norm": 2.2967731952667236, + "learning_rate": 1.0120035304501325e-05, + "loss": 0.3788, + "step": 22600 + }, + { + "epoch": 3.99082163974936, + "grad_norm": 2.151320219039917, + "learning_rate": 1.0102383053839364e-05, + "loss": 0.3777, + "step": 22610 + }, + { + "epoch": 3.9925867090283296, + "grad_norm": 1.2454546689987183, + "learning_rate": 1.0084730803177405e-05, + "loss": 0.332, + "step": 22620 + }, + { + "epoch": 3.9943517783072986, + "grad_norm": 1.2453608512878418, + "learning_rate": 1.0067078552515446e-05, + "loss": 0.4272, + "step": 22630 + }, + { + "epoch": 3.9961168475862676, + "grad_norm": 2.866697072982788, + "learning_rate": 1.0049426301853488e-05, + "loss": 0.4149, + "step": 22640 + }, + { + "epoch": 3.997881916865237, + "grad_norm": 0.9703179597854614, + "learning_rate": 1.0031774051191527e-05, + "loss": 0.4228, + "step": 22650 + }, + { + "epoch": 3.999646986144206, + "grad_norm": 2.9622578620910645, + "learning_rate": 1.0014121800529568e-05, + "loss": 0.4309, + "step": 22660 + }, + { + "epoch": 4.001412055423176, + "grad_norm": 0.811493992805481, + "learning_rate": 9.996469549867608e-06, + "loss": 0.3302, + "step": 22670 + }, + { + "epoch": 4.003177124702145, + "grad_norm": 2.814358949661255, + "learning_rate": 9.978817299205649e-06, + "loss": 0.3583, + "step": 22680 + }, + { + "epoch": 4.004942193981114, + "grad_norm": 1.2781128883361816, + "learning_rate": 9.96116504854369e-06, + "loss": 0.2715, + "step": 22690 + }, + { + "epoch": 4.006707263260083, + "grad_norm": 0.9884144067764282, + "learning_rate": 9.943512797881731e-06, + "loss": 0.3188, + "step": 22700 + }, + { + "epoch": 4.0084723325390526, + "grad_norm": 4.0260539054870605, + "learning_rate": 9.925860547219772e-06, + "loss": 0.3265, + "step": 22710 + }, + { + "epoch": 4.010237401818022, + "grad_norm": 2.8330905437469482, + "learning_rate": 9.908208296557812e-06, + "loss": 0.3375, + "step": 22720 + }, + { + "epoch": 4.012002471096991, + "grad_norm": 1.2538163661956787, + "learning_rate": 9.890556045895851e-06, + "loss": 0.2851, + "step": 22730 + }, + { + "epoch": 4.01376754037596, + "grad_norm": 1.3961235284805298, + "learning_rate": 9.872903795233892e-06, + "loss": 0.2863, + "step": 22740 + }, + { + "epoch": 4.015532609654929, + "grad_norm": 0.8649784922599792, + "learning_rate": 9.855251544571933e-06, + "loss": 0.3217, + "step": 22750 + }, + { + "epoch": 4.0172976789338986, + "grad_norm": 1.030613660812378, + "learning_rate": 9.837599293909975e-06, + "loss": 0.3168, + "step": 22760 + }, + { + "epoch": 4.019062748212868, + "grad_norm": 1.1077706813812256, + "learning_rate": 9.819947043248016e-06, + "loss": 0.3424, + "step": 22770 + }, + { + "epoch": 4.020827817491837, + "grad_norm": 1.1141341924667358, + "learning_rate": 9.802294792586055e-06, + "loss": 0.2827, + "step": 22780 + }, + { + "epoch": 4.022592886770806, + "grad_norm": 3.150998830795288, + "learning_rate": 9.784642541924095e-06, + "loss": 0.3143, + "step": 22790 + }, + { + "epoch": 4.024357956049775, + "grad_norm": 2.8929078578948975, + "learning_rate": 9.766990291262136e-06, + "loss": 0.3366, + "step": 22800 + }, + { + "epoch": 4.0261230253287446, + "grad_norm": 0.665348470211029, + "learning_rate": 9.749338040600177e-06, + "loss": 0.3051, + "step": 22810 + }, + { + "epoch": 4.027888094607714, + "grad_norm": 1.1569024324417114, + "learning_rate": 9.731685789938218e-06, + "loss": 0.3493, + "step": 22820 + }, + { + "epoch": 4.029653163886683, + "grad_norm": 1.1093579530715942, + "learning_rate": 9.71403353927626e-06, + "loss": 0.287, + "step": 22830 + }, + { + "epoch": 4.031418233165652, + "grad_norm": 2.9330356121063232, + "learning_rate": 9.696381288614299e-06, + "loss": 0.3206, + "step": 22840 + }, + { + "epoch": 4.033183302444621, + "grad_norm": 2.4523935317993164, + "learning_rate": 9.678729037952338e-06, + "loss": 0.3066, + "step": 22850 + }, + { + "epoch": 4.0349483717235906, + "grad_norm": 1.1754714250564575, + "learning_rate": 9.66107678729038e-06, + "loss": 0.2948, + "step": 22860 + }, + { + "epoch": 4.03671344100256, + "grad_norm": 1.2477717399597168, + "learning_rate": 9.64342453662842e-06, + "loss": 0.2733, + "step": 22870 + }, + { + "epoch": 4.038478510281529, + "grad_norm": 2.5531203746795654, + "learning_rate": 9.625772285966462e-06, + "loss": 0.2801, + "step": 22880 + }, + { + "epoch": 4.040243579560498, + "grad_norm": 1.2535523176193237, + "learning_rate": 9.608120035304503e-06, + "loss": 0.3116, + "step": 22890 + }, + { + "epoch": 4.042008648839467, + "grad_norm": 1.3747566938400269, + "learning_rate": 9.590467784642542e-06, + "loss": 0.3896, + "step": 22900 + }, + { + "epoch": 4.0437737181184366, + "grad_norm": 1.3517296314239502, + "learning_rate": 9.572815533980583e-06, + "loss": 0.3097, + "step": 22910 + }, + { + "epoch": 4.045538787397406, + "grad_norm": 1.2062978744506836, + "learning_rate": 9.555163283318623e-06, + "loss": 0.3538, + "step": 22920 + }, + { + "epoch": 4.047303856676375, + "grad_norm": 1.1064728498458862, + "learning_rate": 9.537511032656664e-06, + "loss": 0.2828, + "step": 22930 + }, + { + "epoch": 4.049068925955344, + "grad_norm": 3.050992488861084, + "learning_rate": 9.519858781994705e-06, + "loss": 0.2996, + "step": 22940 + }, + { + "epoch": 4.050833995234313, + "grad_norm": 3.6021342277526855, + "learning_rate": 9.503971756398941e-06, + "loss": 0.3003, + "step": 22950 + }, + { + "epoch": 4.052599064513283, + "grad_norm": 1.802708625793457, + "learning_rate": 9.486319505736982e-06, + "loss": 0.2647, + "step": 22960 + }, + { + "epoch": 4.054364133792252, + "grad_norm": 0.673669159412384, + "learning_rate": 9.468667255075023e-06, + "loss": 0.3215, + "step": 22970 + }, + { + "epoch": 4.056129203071221, + "grad_norm": 1.2035069465637207, + "learning_rate": 9.451015004413063e-06, + "loss": 0.2952, + "step": 22980 + }, + { + "epoch": 4.05789427235019, + "grad_norm": 2.897989273071289, + "learning_rate": 9.433362753751104e-06, + "loss": 0.3877, + "step": 22990 + }, + { + "epoch": 4.059659341629159, + "grad_norm": 1.7467148303985596, + "learning_rate": 9.415710503089143e-06, + "loss": 0.2597, + "step": 23000 + }, + { + "epoch": 4.059659341629159, + "eval_loss": 0.648861289024353, + "eval_runtime": 591.5301, + "eval_samples_per_second": 47.888, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004983124360109999, + "step": 23000 + }, + { + "epoch": 4.061424410908129, + "grad_norm": 2.3363940715789795, + "learning_rate": 9.398058252427185e-06, + "loss": 0.3482, + "step": 23010 + }, + { + "epoch": 4.063189480187098, + "grad_norm": 2.2884440422058105, + "learning_rate": 9.380406001765226e-06, + "loss": 0.3512, + "step": 23020 + }, + { + "epoch": 4.064954549466067, + "grad_norm": 1.645193338394165, + "learning_rate": 9.362753751103267e-06, + "loss": 0.2675, + "step": 23030 + }, + { + "epoch": 4.066719618745036, + "grad_norm": 1.3125427961349487, + "learning_rate": 9.345101500441306e-06, + "loss": 0.2646, + "step": 23040 + }, + { + "epoch": 4.068484688024005, + "grad_norm": 1.4610532522201538, + "learning_rate": 9.327449249779347e-06, + "loss": 0.2981, + "step": 23050 + }, + { + "epoch": 4.070249757302975, + "grad_norm": 1.4041917324066162, + "learning_rate": 9.309796999117387e-06, + "loss": 0.2779, + "step": 23060 + }, + { + "epoch": 4.072014826581944, + "grad_norm": 1.26522958278656, + "learning_rate": 9.292144748455428e-06, + "loss": 0.2745, + "step": 23070 + }, + { + "epoch": 4.073779895860913, + "grad_norm": 0.81779944896698, + "learning_rate": 9.27449249779347e-06, + "loss": 0.3133, + "step": 23080 + }, + { + "epoch": 4.075544965139882, + "grad_norm": 1.1725612878799438, + "learning_rate": 9.25684024713151e-06, + "loss": 0.3377, + "step": 23090 + }, + { + "epoch": 4.077310034418851, + "grad_norm": 1.5018218755722046, + "learning_rate": 9.23918799646955e-06, + "loss": 0.2722, + "step": 23100 + }, + { + "epoch": 4.079075103697821, + "grad_norm": 2.033559799194336, + "learning_rate": 9.221535745807591e-06, + "loss": 0.2613, + "step": 23110 + }, + { + "epoch": 4.08084017297679, + "grad_norm": 1.1398875713348389, + "learning_rate": 9.203883495145632e-06, + "loss": 0.3586, + "step": 23120 + }, + { + "epoch": 4.082605242255759, + "grad_norm": 1.7472472190856934, + "learning_rate": 9.186231244483672e-06, + "loss": 0.3397, + "step": 23130 + }, + { + "epoch": 4.084370311534728, + "grad_norm": 1.9572392702102661, + "learning_rate": 9.168578993821713e-06, + "loss": 0.3449, + "step": 23140 + }, + { + "epoch": 4.086135380813697, + "grad_norm": 2.6116466522216797, + "learning_rate": 9.150926743159754e-06, + "loss": 0.3446, + "step": 23150 + }, + { + "epoch": 4.087900450092667, + "grad_norm": 4.654027938842773, + "learning_rate": 9.133274492497793e-06, + "loss": 0.3033, + "step": 23160 + }, + { + "epoch": 4.089665519371636, + "grad_norm": 5.693676471710205, + "learning_rate": 9.115622241835834e-06, + "loss": 0.3421, + "step": 23170 + }, + { + "epoch": 4.091430588650605, + "grad_norm": 1.7942348718643188, + "learning_rate": 9.097969991173876e-06, + "loss": 0.319, + "step": 23180 + }, + { + "epoch": 4.093195657929574, + "grad_norm": 2.1697189807891846, + "learning_rate": 9.080317740511917e-06, + "loss": 0.3362, + "step": 23190 + }, + { + "epoch": 4.094960727208543, + "grad_norm": 0.8903745412826538, + "learning_rate": 9.062665489849956e-06, + "loss": 0.331, + "step": 23200 + }, + { + "epoch": 4.096725796487513, + "grad_norm": 3.2681972980499268, + "learning_rate": 9.045013239187997e-06, + "loss": 0.2898, + "step": 23210 + }, + { + "epoch": 4.098490865766482, + "grad_norm": 1.1167973279953003, + "learning_rate": 9.027360988526037e-06, + "loss": 0.2927, + "step": 23220 + }, + { + "epoch": 4.100255935045451, + "grad_norm": 2.5101799964904785, + "learning_rate": 9.009708737864078e-06, + "loss": 0.3068, + "step": 23230 + }, + { + "epoch": 4.10202100432442, + "grad_norm": 1.2922667264938354, + "learning_rate": 8.992056487202119e-06, + "loss": 0.3001, + "step": 23240 + }, + { + "epoch": 4.103786073603389, + "grad_norm": 5.074741840362549, + "learning_rate": 8.97440423654016e-06, + "loss": 0.2934, + "step": 23250 + }, + { + "epoch": 4.105551142882359, + "grad_norm": 2.8039944171905518, + "learning_rate": 8.9567519858782e-06, + "loss": 0.317, + "step": 23260 + }, + { + "epoch": 4.107316212161328, + "grad_norm": 1.5906391143798828, + "learning_rate": 8.93909973521624e-06, + "loss": 0.3094, + "step": 23270 + }, + { + "epoch": 4.109081281440297, + "grad_norm": 0.6800219416618347, + "learning_rate": 8.92144748455428e-06, + "loss": 0.2973, + "step": 23280 + }, + { + "epoch": 4.110846350719266, + "grad_norm": 1.584240198135376, + "learning_rate": 8.903795233892321e-06, + "loss": 0.3333, + "step": 23290 + }, + { + "epoch": 4.112611419998235, + "grad_norm": 1.1392163038253784, + "learning_rate": 8.886142983230363e-06, + "loss": 0.3363, + "step": 23300 + }, + { + "epoch": 4.114376489277205, + "grad_norm": 2.1709890365600586, + "learning_rate": 8.868490732568404e-06, + "loss": 0.3035, + "step": 23310 + }, + { + "epoch": 4.116141558556174, + "grad_norm": 2.6064887046813965, + "learning_rate": 8.850838481906445e-06, + "loss": 0.3309, + "step": 23320 + }, + { + "epoch": 4.117906627835143, + "grad_norm": 4.0854997634887695, + "learning_rate": 8.833186231244484e-06, + "loss": 0.2956, + "step": 23330 + }, + { + "epoch": 4.119671697114112, + "grad_norm": 2.8295369148254395, + "learning_rate": 8.815533980582524e-06, + "loss": 0.3026, + "step": 23340 + }, + { + "epoch": 4.121436766393081, + "grad_norm": 1.6397031545639038, + "learning_rate": 8.797881729920565e-06, + "loss": 0.3333, + "step": 23350 + }, + { + "epoch": 4.123201835672051, + "grad_norm": 2.4158713817596436, + "learning_rate": 8.780229479258606e-06, + "loss": 0.3445, + "step": 23360 + }, + { + "epoch": 4.12496690495102, + "grad_norm": 3.3127028942108154, + "learning_rate": 8.762577228596647e-06, + "loss": 0.2818, + "step": 23370 + }, + { + "epoch": 4.126731974229989, + "grad_norm": 1.6492713689804077, + "learning_rate": 8.744924977934688e-06, + "loss": 0.3062, + "step": 23380 + }, + { + "epoch": 4.128497043508958, + "grad_norm": 1.1045948266983032, + "learning_rate": 8.727272727272728e-06, + "loss": 0.3042, + "step": 23390 + }, + { + "epoch": 4.130262112787927, + "grad_norm": 0.809982419013977, + "learning_rate": 8.709620476610767e-06, + "loss": 0.304, + "step": 23400 + }, + { + "epoch": 4.132027182066896, + "grad_norm": 1.7027791738510132, + "learning_rate": 8.691968225948808e-06, + "loss": 0.3169, + "step": 23410 + }, + { + "epoch": 4.133792251345866, + "grad_norm": 0.8467365503311157, + "learning_rate": 8.67431597528685e-06, + "loss": 0.2853, + "step": 23420 + }, + { + "epoch": 4.135557320624835, + "grad_norm": 1.5310596227645874, + "learning_rate": 8.65666372462489e-06, + "loss": 0.3149, + "step": 23430 + }, + { + "epoch": 4.137322389903804, + "grad_norm": 0.9673229455947876, + "learning_rate": 8.639011473962932e-06, + "loss": 0.3051, + "step": 23440 + }, + { + "epoch": 4.139087459182773, + "grad_norm": 1.3377794027328491, + "learning_rate": 8.621359223300971e-06, + "loss": 0.3517, + "step": 23450 + }, + { + "epoch": 4.140852528461743, + "grad_norm": 1.3316737413406372, + "learning_rate": 8.60370697263901e-06, + "loss": 0.3095, + "step": 23460 + }, + { + "epoch": 4.142617597740712, + "grad_norm": 1.3464562892913818, + "learning_rate": 8.586054721977052e-06, + "loss": 0.3085, + "step": 23470 + }, + { + "epoch": 4.144382667019681, + "grad_norm": 3.453214645385742, + "learning_rate": 8.568402471315093e-06, + "loss": 0.3038, + "step": 23480 + }, + { + "epoch": 4.14614773629865, + "grad_norm": 1.2823230028152466, + "learning_rate": 8.550750220653134e-06, + "loss": 0.3958, + "step": 23490 + }, + { + "epoch": 4.147912805577619, + "grad_norm": 1.4263204336166382, + "learning_rate": 8.533097969991175e-06, + "loss": 0.2977, + "step": 23500 + }, + { + "epoch": 4.149677874856588, + "grad_norm": 0.9644317626953125, + "learning_rate": 8.515445719329215e-06, + "loss": 0.3711, + "step": 23510 + }, + { + "epoch": 4.151442944135558, + "grad_norm": 1.897396206855774, + "learning_rate": 8.497793468667256e-06, + "loss": 0.2964, + "step": 23520 + }, + { + "epoch": 4.153208013414527, + "grad_norm": 2.8374288082122803, + "learning_rate": 8.480141218005295e-06, + "loss": 0.3009, + "step": 23530 + }, + { + "epoch": 4.154973082693496, + "grad_norm": 0.8667998313903809, + "learning_rate": 8.462488967343337e-06, + "loss": 0.3245, + "step": 23540 + }, + { + "epoch": 4.156738151972465, + "grad_norm": 3.246046543121338, + "learning_rate": 8.444836716681378e-06, + "loss": 0.3408, + "step": 23550 + }, + { + "epoch": 4.158503221251435, + "grad_norm": 1.8256837129592896, + "learning_rate": 8.427184466019419e-06, + "loss": 0.3052, + "step": 23560 + }, + { + "epoch": 4.160268290530404, + "grad_norm": 0.8874765634536743, + "learning_rate": 8.409532215357458e-06, + "loss": 0.2911, + "step": 23570 + }, + { + "epoch": 4.162033359809373, + "grad_norm": 1.2588152885437012, + "learning_rate": 8.3918799646955e-06, + "loss": 0.2877, + "step": 23580 + }, + { + "epoch": 4.163798429088342, + "grad_norm": 2.0045018196105957, + "learning_rate": 8.374227714033539e-06, + "loss": 0.398, + "step": 23590 + }, + { + "epoch": 4.165563498367311, + "grad_norm": 3.729039430618286, + "learning_rate": 8.35657546337158e-06, + "loss": 0.305, + "step": 23600 + }, + { + "epoch": 4.16732856764628, + "grad_norm": 2.881028652191162, + "learning_rate": 8.338923212709621e-06, + "loss": 0.2958, + "step": 23610 + }, + { + "epoch": 4.16909363692525, + "grad_norm": 1.8063958883285522, + "learning_rate": 8.32127096204766e-06, + "loss": 0.3089, + "step": 23620 + }, + { + "epoch": 4.170858706204219, + "grad_norm": 1.1365619897842407, + "learning_rate": 8.303618711385702e-06, + "loss": 0.3338, + "step": 23630 + }, + { + "epoch": 4.172623775483188, + "grad_norm": 1.2051860094070435, + "learning_rate": 8.285966460723743e-06, + "loss": 0.2829, + "step": 23640 + }, + { + "epoch": 4.174388844762157, + "grad_norm": 1.248076319694519, + "learning_rate": 8.268314210061784e-06, + "loss": 0.3573, + "step": 23650 + }, + { + "epoch": 4.176153914041127, + "grad_norm": 2.788290500640869, + "learning_rate": 8.250661959399823e-06, + "loss": 0.2695, + "step": 23660 + }, + { + "epoch": 4.177918983320096, + "grad_norm": 1.5468093156814575, + "learning_rate": 8.233009708737865e-06, + "loss": 0.3258, + "step": 23670 + }, + { + "epoch": 4.179684052599065, + "grad_norm": 3.705085039138794, + "learning_rate": 8.215357458075904e-06, + "loss": 0.2781, + "step": 23680 + }, + { + "epoch": 4.181449121878034, + "grad_norm": 1.7882062196731567, + "learning_rate": 8.197705207413945e-06, + "loss": 0.2967, + "step": 23690 + }, + { + "epoch": 4.183214191157003, + "grad_norm": 3.2124860286712646, + "learning_rate": 8.180052956751986e-06, + "loss": 0.2557, + "step": 23700 + }, + { + "epoch": 4.184979260435972, + "grad_norm": 1.0433921813964844, + "learning_rate": 8.162400706090028e-06, + "loss": 0.3152, + "step": 23710 + }, + { + "epoch": 4.186744329714942, + "grad_norm": 1.109761357307434, + "learning_rate": 8.144748455428069e-06, + "loss": 0.3309, + "step": 23720 + }, + { + "epoch": 4.188509398993911, + "grad_norm": 1.2098308801651, + "learning_rate": 8.127096204766108e-06, + "loss": 0.2816, + "step": 23730 + }, + { + "epoch": 4.19027446827288, + "grad_norm": 1.9796377420425415, + "learning_rate": 8.109443954104148e-06, + "loss": 0.3377, + "step": 23740 + }, + { + "epoch": 4.192039537551849, + "grad_norm": 2.3625099658966064, + "learning_rate": 8.091791703442189e-06, + "loss": 0.3028, + "step": 23750 + }, + { + "epoch": 4.193804606830818, + "grad_norm": 0.8961766958236694, + "learning_rate": 8.07413945278023e-06, + "loss": 0.2704, + "step": 23760 + }, + { + "epoch": 4.195569676109788, + "grad_norm": 1.137475609779358, + "learning_rate": 8.056487202118271e-06, + "loss": 0.3296, + "step": 23770 + }, + { + "epoch": 4.197334745388757, + "grad_norm": 0.7209318280220032, + "learning_rate": 8.038834951456312e-06, + "loss": 0.2598, + "step": 23780 + }, + { + "epoch": 4.199099814667726, + "grad_norm": 1.4462792873382568, + "learning_rate": 8.021182700794352e-06, + "loss": 0.3049, + "step": 23790 + }, + { + "epoch": 4.200864883946695, + "grad_norm": 1.377173900604248, + "learning_rate": 8.003530450132391e-06, + "loss": 0.3041, + "step": 23800 + }, + { + "epoch": 4.202629953225664, + "grad_norm": 1.8368486166000366, + "learning_rate": 7.985878199470432e-06, + "loss": 0.2636, + "step": 23810 + }, + { + "epoch": 4.204395022504634, + "grad_norm": 1.310995101928711, + "learning_rate": 7.968225948808473e-06, + "loss": 0.2845, + "step": 23820 + }, + { + "epoch": 4.206160091783603, + "grad_norm": 1.4313207864761353, + "learning_rate": 7.950573698146515e-06, + "loss": 0.3365, + "step": 23830 + }, + { + "epoch": 4.207925161062572, + "grad_norm": 2.2771763801574707, + "learning_rate": 7.932921447484556e-06, + "loss": 0.3221, + "step": 23840 + }, + { + "epoch": 4.209690230341541, + "grad_norm": 0.9911360740661621, + "learning_rate": 7.915269196822595e-06, + "loss": 0.3255, + "step": 23850 + }, + { + "epoch": 4.21145529962051, + "grad_norm": 1.0318197011947632, + "learning_rate": 7.897616946160635e-06, + "loss": 0.2863, + "step": 23860 + }, + { + "epoch": 4.21322036889948, + "grad_norm": 1.4511429071426392, + "learning_rate": 7.879964695498676e-06, + "loss": 0.2939, + "step": 23870 + }, + { + "epoch": 4.214985438178449, + "grad_norm": 0.9572875499725342, + "learning_rate": 7.862312444836717e-06, + "loss": 0.3005, + "step": 23880 + }, + { + "epoch": 4.216750507457418, + "grad_norm": 2.9129538536071777, + "learning_rate": 7.844660194174758e-06, + "loss": 0.2933, + "step": 23890 + }, + { + "epoch": 4.218515576736387, + "grad_norm": 1.1331290006637573, + "learning_rate": 7.827007943512799e-06, + "loss": 0.2633, + "step": 23900 + }, + { + "epoch": 4.220280646015356, + "grad_norm": 0.7028851509094238, + "learning_rate": 7.809355692850839e-06, + "loss": 0.3477, + "step": 23910 + }, + { + "epoch": 4.222045715294326, + "grad_norm": 1.0875290632247925, + "learning_rate": 7.791703442188878e-06, + "loss": 0.3177, + "step": 23920 + }, + { + "epoch": 4.223810784573295, + "grad_norm": 2.365562677383423, + "learning_rate": 7.77405119152692e-06, + "loss": 0.3045, + "step": 23930 + }, + { + "epoch": 4.225575853852264, + "grad_norm": 1.9424879550933838, + "learning_rate": 7.75639894086496e-06, + "loss": 0.2623, + "step": 23940 + }, + { + "epoch": 4.227340923131233, + "grad_norm": 2.014070510864258, + "learning_rate": 7.738746690203001e-06, + "loss": 0.2916, + "step": 23950 + }, + { + "epoch": 4.229105992410202, + "grad_norm": 2.267733573913574, + "learning_rate": 7.721094439541043e-06, + "loss": 0.2807, + "step": 23960 + }, + { + "epoch": 4.230871061689172, + "grad_norm": 2.6851611137390137, + "learning_rate": 7.703442188879082e-06, + "loss": 0.3615, + "step": 23970 + }, + { + "epoch": 4.232636130968141, + "grad_norm": 0.8334643244743347, + "learning_rate": 7.685789938217123e-06, + "loss": 0.2835, + "step": 23980 + }, + { + "epoch": 4.23440120024711, + "grad_norm": 1.605870008468628, + "learning_rate": 7.668137687555163e-06, + "loss": 0.3236, + "step": 23990 + }, + { + "epoch": 4.236166269526079, + "grad_norm": 1.1965278387069702, + "learning_rate": 7.650485436893204e-06, + "loss": 0.3069, + "step": 24000 + }, + { + "epoch": 4.236166269526079, + "eval_loss": 0.6535650491714478, + "eval_runtime": 591.7187, + "eval_samples_per_second": 47.872, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.000500342221901065, + "step": 24000 + }, + { + "epoch": 4.237931338805048, + "grad_norm": 2.919116973876953, + "learning_rate": 7.632833186231245e-06, + "loss": 0.3474, + "step": 24010 + }, + { + "epoch": 4.239696408084018, + "grad_norm": 2.139883041381836, + "learning_rate": 7.615180935569285e-06, + "loss": 0.2733, + "step": 24020 + }, + { + "epoch": 4.241461477362987, + "grad_norm": 0.7100754380226135, + "learning_rate": 7.597528684907326e-06, + "loss": 0.2755, + "step": 24030 + }, + { + "epoch": 4.243226546641956, + "grad_norm": 1.0110312700271606, + "learning_rate": 7.579876434245367e-06, + "loss": 0.2658, + "step": 24040 + }, + { + "epoch": 4.244991615920925, + "grad_norm": 1.016863465309143, + "learning_rate": 7.562224183583408e-06, + "loss": 0.2966, + "step": 24050 + }, + { + "epoch": 4.246756685199894, + "grad_norm": 2.464367389678955, + "learning_rate": 7.544571932921447e-06, + "loss": 0.334, + "step": 24060 + }, + { + "epoch": 4.248521754478864, + "grad_norm": 4.494397163391113, + "learning_rate": 7.5269196822594885e-06, + "loss": 0.2397, + "step": 24070 + }, + { + "epoch": 4.250286823757833, + "grad_norm": 2.648230791091919, + "learning_rate": 7.509267431597529e-06, + "loss": 0.3144, + "step": 24080 + }, + { + "epoch": 4.252051893036802, + "grad_norm": 3.175685405731201, + "learning_rate": 7.49161518093557e-06, + "loss": 0.2716, + "step": 24090 + }, + { + "epoch": 4.253816962315771, + "grad_norm": 2.834336757659912, + "learning_rate": 7.47396293027361e-06, + "loss": 0.2755, + "step": 24100 + }, + { + "epoch": 4.25558203159474, + "grad_norm": 2.6625614166259766, + "learning_rate": 7.456310679611651e-06, + "loss": 0.367, + "step": 24110 + }, + { + "epoch": 4.25734710087371, + "grad_norm": 0.9785534143447876, + "learning_rate": 7.438658428949691e-06, + "loss": 0.292, + "step": 24120 + }, + { + "epoch": 4.259112170152679, + "grad_norm": 1.2775884866714478, + "learning_rate": 7.421006178287732e-06, + "loss": 0.3023, + "step": 24130 + }, + { + "epoch": 4.260877239431648, + "grad_norm": 2.735036611557007, + "learning_rate": 7.403353927625772e-06, + "loss": 0.3281, + "step": 24140 + }, + { + "epoch": 4.262642308710617, + "grad_norm": 1.1893835067749023, + "learning_rate": 7.385701676963813e-06, + "loss": 0.2853, + "step": 24150 + }, + { + "epoch": 4.264407377989586, + "grad_norm": 1.292029857635498, + "learning_rate": 7.368049426301854e-06, + "loss": 0.2648, + "step": 24160 + }, + { + "epoch": 4.266172447268556, + "grad_norm": 2.2875688076019287, + "learning_rate": 7.350397175639895e-06, + "loss": 0.3576, + "step": 24170 + }, + { + "epoch": 4.267937516547525, + "grad_norm": 0.7808769345283508, + "learning_rate": 7.332744924977936e-06, + "loss": 0.2908, + "step": 24180 + }, + { + "epoch": 4.269702585826494, + "grad_norm": 2.999920606613159, + "learning_rate": 7.3150926743159754e-06, + "loss": 0.327, + "step": 24190 + }, + { + "epoch": 4.271467655105463, + "grad_norm": 2.7963919639587402, + "learning_rate": 7.297440423654016e-06, + "loss": 0.3061, + "step": 24200 + }, + { + "epoch": 4.273232724384432, + "grad_norm": 1.0172919034957886, + "learning_rate": 7.279788172992057e-06, + "loss": 0.2894, + "step": 24210 + }, + { + "epoch": 4.274997793663402, + "grad_norm": 1.3990018367767334, + "learning_rate": 7.262135922330097e-06, + "loss": 0.2977, + "step": 24220 + }, + { + "epoch": 4.276762862942371, + "grad_norm": 1.0484473705291748, + "learning_rate": 7.244483671668138e-06, + "loss": 0.2686, + "step": 24230 + }, + { + "epoch": 4.27852793222134, + "grad_norm": 3.1869072914123535, + "learning_rate": 7.2268314210061795e-06, + "loss": 0.3518, + "step": 24240 + }, + { + "epoch": 4.280293001500309, + "grad_norm": 3.1976029872894287, + "learning_rate": 7.20917917034422e-06, + "loss": 0.346, + "step": 24250 + }, + { + "epoch": 4.282058070779278, + "grad_norm": 1.2420626878738403, + "learning_rate": 7.191526919682259e-06, + "loss": 0.3339, + "step": 24260 + }, + { + "epoch": 4.283823140058248, + "grad_norm": 1.1138620376586914, + "learning_rate": 7.1738746690203e-06, + "loss": 0.3505, + "step": 24270 + }, + { + "epoch": 4.285588209337217, + "grad_norm": 0.9700965285301208, + "learning_rate": 7.156222418358341e-06, + "loss": 0.3217, + "step": 24280 + }, + { + "epoch": 4.287353278616186, + "grad_norm": 0.8993445634841919, + "learning_rate": 7.138570167696382e-06, + "loss": 0.2542, + "step": 24290 + }, + { + "epoch": 4.289118347895155, + "grad_norm": 3.1148288249969482, + "learning_rate": 7.120917917034423e-06, + "loss": 0.3403, + "step": 24300 + }, + { + "epoch": 4.290883417174124, + "grad_norm": 1.1724573373794556, + "learning_rate": 7.103265666372463e-06, + "loss": 0.3562, + "step": 24310 + }, + { + "epoch": 4.292648486453094, + "grad_norm": 3.517512083053589, + "learning_rate": 7.085613415710503e-06, + "loss": 0.3412, + "step": 24320 + }, + { + "epoch": 4.294413555732063, + "grad_norm": 1.830450177192688, + "learning_rate": 7.067961165048544e-06, + "loss": 0.2899, + "step": 24330 + }, + { + "epoch": 4.296178625011032, + "grad_norm": 2.5125741958618164, + "learning_rate": 7.050308914386584e-06, + "loss": 0.3349, + "step": 24340 + }, + { + "epoch": 4.297943694290001, + "grad_norm": 3.087559223175049, + "learning_rate": 7.032656663724625e-06, + "loss": 0.3021, + "step": 24350 + }, + { + "epoch": 4.29970876356897, + "grad_norm": 1.0679352283477783, + "learning_rate": 7.0150044130626664e-06, + "loss": 0.3215, + "step": 24360 + }, + { + "epoch": 4.30147383284794, + "grad_norm": 0.7437068223953247, + "learning_rate": 6.997352162400707e-06, + "loss": 0.322, + "step": 24370 + }, + { + "epoch": 4.303238902126909, + "grad_norm": 1.2267742156982422, + "learning_rate": 6.979699911738748e-06, + "loss": 0.3966, + "step": 24380 + }, + { + "epoch": 4.305003971405878, + "grad_norm": 0.9550343155860901, + "learning_rate": 6.962047661076787e-06, + "loss": 0.3122, + "step": 24390 + }, + { + "epoch": 4.306769040684847, + "grad_norm": 1.301531195640564, + "learning_rate": 6.944395410414828e-06, + "loss": 0.2963, + "step": 24400 + }, + { + "epoch": 4.308534109963816, + "grad_norm": 1.4933806657791138, + "learning_rate": 6.926743159752869e-06, + "loss": 0.2739, + "step": 24410 + }, + { + "epoch": 4.310299179242786, + "grad_norm": 0.6646918654441833, + "learning_rate": 6.909090909090909e-06, + "loss": 0.2877, + "step": 24420 + }, + { + "epoch": 4.312064248521755, + "grad_norm": 1.2793468236923218, + "learning_rate": 6.89143865842895e-06, + "loss": 0.2802, + "step": 24430 + }, + { + "epoch": 4.313829317800724, + "grad_norm": 2.5267844200134277, + "learning_rate": 6.873786407766991e-06, + "loss": 0.2813, + "step": 24440 + }, + { + "epoch": 4.315594387079693, + "grad_norm": 0.9937548637390137, + "learning_rate": 6.856134157105031e-06, + "loss": 0.2761, + "step": 24450 + }, + { + "epoch": 4.317359456358662, + "grad_norm": 0.7776408791542053, + "learning_rate": 6.838481906443071e-06, + "loss": 0.3156, + "step": 24460 + }, + { + "epoch": 4.319124525637632, + "grad_norm": 3.936626672744751, + "learning_rate": 6.820829655781112e-06, + "loss": 0.29, + "step": 24470 + }, + { + "epoch": 4.320889594916601, + "grad_norm": 3.12445330619812, + "learning_rate": 6.8031774051191526e-06, + "loss": 0.3339, + "step": 24480 + }, + { + "epoch": 4.32265466419557, + "grad_norm": 2.77976131439209, + "learning_rate": 6.785525154457194e-06, + "loss": 0.2989, + "step": 24490 + }, + { + "epoch": 4.324419733474539, + "grad_norm": 1.1880944967269897, + "learning_rate": 6.767872903795235e-06, + "loss": 0.2955, + "step": 24500 + }, + { + "epoch": 4.326184802753508, + "grad_norm": 1.6699707508087158, + "learning_rate": 6.750220653133275e-06, + "loss": 0.2068, + "step": 24510 + }, + { + "epoch": 4.327949872032478, + "grad_norm": 4.329336166381836, + "learning_rate": 6.732568402471315e-06, + "loss": 0.3533, + "step": 24520 + }, + { + "epoch": 4.329714941311447, + "grad_norm": 0.8252127766609192, + "learning_rate": 6.714916151809356e-06, + "loss": 0.292, + "step": 24530 + }, + { + "epoch": 4.331480010590416, + "grad_norm": 4.663092613220215, + "learning_rate": 6.697263901147396e-06, + "loss": 0.3377, + "step": 24540 + }, + { + "epoch": 4.333245079869385, + "grad_norm": 1.191158413887024, + "learning_rate": 6.679611650485437e-06, + "loss": 0.2992, + "step": 24550 + }, + { + "epoch": 4.335010149148354, + "grad_norm": 0.9868506193161011, + "learning_rate": 6.661959399823478e-06, + "loss": 0.3129, + "step": 24560 + }, + { + "epoch": 4.336775218427324, + "grad_norm": 1.1885898113250732, + "learning_rate": 6.644307149161519e-06, + "loss": 0.3082, + "step": 24570 + }, + { + "epoch": 4.338540287706293, + "grad_norm": 1.1443339586257935, + "learning_rate": 6.62665489849956e-06, + "loss": 0.2772, + "step": 24580 + }, + { + "epoch": 4.340305356985262, + "grad_norm": 1.6731210947036743, + "learning_rate": 6.609002647837599e-06, + "loss": 0.2865, + "step": 24590 + }, + { + "epoch": 4.342070426264231, + "grad_norm": 2.7773597240448, + "learning_rate": 6.5913503971756395e-06, + "loss": 0.3296, + "step": 24600 + }, + { + "epoch": 4.3438354955432, + "grad_norm": 1.090955138206482, + "learning_rate": 6.573698146513681e-06, + "loss": 0.2642, + "step": 24610 + }, + { + "epoch": 4.34560056482217, + "grad_norm": 2.6028826236724854, + "learning_rate": 6.556045895851722e-06, + "loss": 0.2895, + "step": 24620 + }, + { + "epoch": 4.347365634101139, + "grad_norm": 0.9110934734344482, + "learning_rate": 6.538393645189762e-06, + "loss": 0.3021, + "step": 24630 + }, + { + "epoch": 4.349130703380108, + "grad_norm": 3.2843897342681885, + "learning_rate": 6.520741394527803e-06, + "loss": 0.2826, + "step": 24640 + }, + { + "epoch": 4.350895772659077, + "grad_norm": 3.9276788234710693, + "learning_rate": 6.503089143865843e-06, + "loss": 0.2596, + "step": 24650 + }, + { + "epoch": 4.352660841938046, + "grad_norm": 1.0411996841430664, + "learning_rate": 6.485436893203883e-06, + "loss": 0.2316, + "step": 24660 + }, + { + "epoch": 4.354425911217016, + "grad_norm": 1.1377794742584229, + "learning_rate": 6.467784642541924e-06, + "loss": 0.308, + "step": 24670 + }, + { + "epoch": 4.356190980495985, + "grad_norm": 1.1812376976013184, + "learning_rate": 6.450132391879965e-06, + "loss": 0.3091, + "step": 24680 + }, + { + "epoch": 4.357956049774954, + "grad_norm": 1.2442140579223633, + "learning_rate": 6.432480141218006e-06, + "loss": 0.3041, + "step": 24690 + }, + { + "epoch": 4.359721119053923, + "grad_norm": 1.8300071954727173, + "learning_rate": 6.414827890556047e-06, + "loss": 0.3012, + "step": 24700 + }, + { + "epoch": 4.361486188332892, + "grad_norm": 2.2776644229888916, + "learning_rate": 6.397175639894087e-06, + "loss": 0.3097, + "step": 24710 + }, + { + "epoch": 4.363251257611862, + "grad_norm": 1.6646955013275146, + "learning_rate": 6.3795233892321265e-06, + "loss": 0.2554, + "step": 24720 + }, + { + "epoch": 4.365016326890831, + "grad_norm": 0.6946931481361389, + "learning_rate": 6.361871138570168e-06, + "loss": 0.2638, + "step": 24730 + }, + { + "epoch": 4.3667813961698, + "grad_norm": 3.5868146419525146, + "learning_rate": 6.344218887908209e-06, + "loss": 0.2959, + "step": 24740 + }, + { + "epoch": 4.368546465448769, + "grad_norm": 0.9338198304176331, + "learning_rate": 6.326566637246249e-06, + "loss": 0.2671, + "step": 24750 + }, + { + "epoch": 4.370311534727738, + "grad_norm": 3.109726667404175, + "learning_rate": 6.30891438658429e-06, + "loss": 0.2825, + "step": 24760 + }, + { + "epoch": 4.372076604006708, + "grad_norm": 2.315507411956787, + "learning_rate": 6.2912621359223306e-06, + "loss": 0.3094, + "step": 24770 + }, + { + "epoch": 4.373841673285677, + "grad_norm": 3.1632182598114014, + "learning_rate": 6.273609885260372e-06, + "loss": 0.2725, + "step": 24780 + }, + { + "epoch": 4.375606742564646, + "grad_norm": 1.8601646423339844, + "learning_rate": 6.255957634598411e-06, + "loss": 0.289, + "step": 24790 + }, + { + "epoch": 4.377371811843615, + "grad_norm": 2.712733507156372, + "learning_rate": 6.238305383936452e-06, + "loss": 0.3093, + "step": 24800 + }, + { + "epoch": 4.379136881122584, + "grad_norm": 3.8398969173431396, + "learning_rate": 6.220653133274493e-06, + "loss": 0.3063, + "step": 24810 + }, + { + "epoch": 4.380901950401554, + "grad_norm": 0.9455320835113525, + "learning_rate": 6.203000882612534e-06, + "loss": 0.3086, + "step": 24820 + }, + { + "epoch": 4.382667019680523, + "grad_norm": 2.375248908996582, + "learning_rate": 6.185348631950574e-06, + "loss": 0.2885, + "step": 24830 + }, + { + "epoch": 4.384432088959492, + "grad_norm": 0.8944133520126343, + "learning_rate": 6.167696381288614e-06, + "loss": 0.2758, + "step": 24840 + }, + { + "epoch": 4.386197158238461, + "grad_norm": 1.490126609802246, + "learning_rate": 6.1500441306266555e-06, + "loss": 0.2911, + "step": 24850 + }, + { + "epoch": 4.38796222751743, + "grad_norm": 1.6396925449371338, + "learning_rate": 6.132391879964696e-06, + "loss": 0.2867, + "step": 24860 + }, + { + "epoch": 4.3897272967964, + "grad_norm": 2.53312087059021, + "learning_rate": 6.114739629302736e-06, + "loss": 0.2978, + "step": 24870 + }, + { + "epoch": 4.391492366075369, + "grad_norm": 2.5027709007263184, + "learning_rate": 6.097087378640777e-06, + "loss": 0.3359, + "step": 24880 + }, + { + "epoch": 4.393257435354338, + "grad_norm": 1.1508357524871826, + "learning_rate": 6.0794351279788175e-06, + "loss": 0.2817, + "step": 24890 + }, + { + "epoch": 4.395022504633307, + "grad_norm": 1.068000316619873, + "learning_rate": 6.061782877316858e-06, + "loss": 0.3096, + "step": 24900 + }, + { + "epoch": 4.396787573912276, + "grad_norm": 0.8342865109443665, + "learning_rate": 6.044130626654899e-06, + "loss": 0.2927, + "step": 24910 + }, + { + "epoch": 4.398552643191246, + "grad_norm": 1.2853881120681763, + "learning_rate": 6.026478375992939e-06, + "loss": 0.2963, + "step": 24920 + }, + { + "epoch": 4.400317712470215, + "grad_norm": 1.2073771953582764, + "learning_rate": 6.00882612533098e-06, + "loss": 0.2856, + "step": 24930 + }, + { + "epoch": 4.402082781749184, + "grad_norm": 3.3324711322784424, + "learning_rate": 5.991173874669021e-06, + "loss": 0.3403, + "step": 24940 + }, + { + "epoch": 4.403847851028153, + "grad_norm": 3.556697130203247, + "learning_rate": 5.973521624007061e-06, + "loss": 0.2825, + "step": 24950 + }, + { + "epoch": 4.405612920307122, + "grad_norm": 2.4177629947662354, + "learning_rate": 5.955869373345102e-06, + "loss": 0.3349, + "step": 24960 + }, + { + "epoch": 4.407377989586092, + "grad_norm": 2.2014477252960205, + "learning_rate": 5.9382171226831425e-06, + "loss": 0.2541, + "step": 24970 + }, + { + "epoch": 4.409143058865061, + "grad_norm": 1.251031517982483, + "learning_rate": 5.920564872021183e-06, + "loss": 0.3314, + "step": 24980 + }, + { + "epoch": 4.41090812814403, + "grad_norm": 2.201280117034912, + "learning_rate": 5.902912621359224e-06, + "loss": 0.2466, + "step": 24990 + }, + { + "epoch": 4.412673197422999, + "grad_norm": 1.493913173675537, + "learning_rate": 5.885260370697264e-06, + "loss": 0.3005, + "step": 25000 + }, + { + "epoch": 4.412673197422999, + "eval_loss": 0.655875027179718, + "eval_runtime": 591.548, + "eval_samples_per_second": 47.886, + "eval_steps_per_second": 2.395, + "eval_token_accuracy": 0.0004981094574219933, + "step": 25000 + }, + { + "epoch": 4.414438266701968, + "grad_norm": 1.3809815645217896, + "learning_rate": 5.8676081200353045e-06, + "loss": 0.2812, + "step": 25010 + }, + { + "epoch": 4.416203335980938, + "grad_norm": 2.053154468536377, + "learning_rate": 5.849955869373346e-06, + "loss": 0.3088, + "step": 25020 + }, + { + "epoch": 4.417968405259907, + "grad_norm": 1.3587795495986938, + "learning_rate": 5.832303618711386e-06, + "loss": 0.2717, + "step": 25030 + }, + { + "epoch": 4.419733474538876, + "grad_norm": 2.7400026321411133, + "learning_rate": 5.814651368049426e-06, + "loss": 0.3474, + "step": 25040 + }, + { + "epoch": 4.421498543817845, + "grad_norm": 0.9677148461341858, + "learning_rate": 5.796999117387467e-06, + "loss": 0.2745, + "step": 25050 + }, + { + "epoch": 4.423263613096814, + "grad_norm": 2.704630136489868, + "learning_rate": 5.779346866725508e-06, + "loss": 0.3524, + "step": 25060 + }, + { + "epoch": 4.425028682375784, + "grad_norm": 2.6748154163360596, + "learning_rate": 5.761694616063548e-06, + "loss": 0.2928, + "step": 25070 + }, + { + "epoch": 4.426793751654753, + "grad_norm": 1.1494990587234497, + "learning_rate": 5.744042365401589e-06, + "loss": 0.2947, + "step": 25080 + }, + { + "epoch": 4.428558820933722, + "grad_norm": 1.6972780227661133, + "learning_rate": 5.7263901147396294e-06, + "loss": 0.3203, + "step": 25090 + }, + { + "epoch": 4.430323890212691, + "grad_norm": 0.7590330243110657, + "learning_rate": 5.70873786407767e-06, + "loss": 0.3069, + "step": 25100 + }, + { + "epoch": 4.43208895949166, + "grad_norm": 2.139500617980957, + "learning_rate": 5.691085613415711e-06, + "loss": 0.3059, + "step": 25110 + }, + { + "epoch": 4.43385402877063, + "grad_norm": 2.8314902782440186, + "learning_rate": 5.673433362753751e-06, + "loss": 0.3206, + "step": 25120 + }, + { + "epoch": 4.435619098049599, + "grad_norm": 2.3796584606170654, + "learning_rate": 5.6557811120917915e-06, + "loss": 0.2859, + "step": 25130 + }, + { + "epoch": 4.437384167328568, + "grad_norm": 1.3357577323913574, + "learning_rate": 5.638128861429833e-06, + "loss": 0.291, + "step": 25140 + }, + { + "epoch": 4.439149236607537, + "grad_norm": 3.7719616889953613, + "learning_rate": 5.620476610767873e-06, + "loss": 0.328, + "step": 25150 + }, + { + "epoch": 4.440914305886506, + "grad_norm": 3.9590110778808594, + "learning_rate": 5.602824360105914e-06, + "loss": 0.3063, + "step": 25160 + }, + { + "epoch": 4.442679375165476, + "grad_norm": 0.9310747385025024, + "learning_rate": 5.585172109443954e-06, + "loss": 0.2763, + "step": 25170 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 1.384122610092163, + "learning_rate": 5.567519858781995e-06, + "loss": 0.3029, + "step": 25180 + }, + { + "epoch": 4.446209513723414, + "grad_norm": 1.3160362243652344, + "learning_rate": 5.549867608120036e-06, + "loss": 0.3112, + "step": 25190 + }, + { + "epoch": 4.447974583002383, + "grad_norm": 1.24907386302948, + "learning_rate": 5.532215357458076e-06, + "loss": 0.2652, + "step": 25200 + }, + { + "epoch": 4.449739652281352, + "grad_norm": 2.1747610569000244, + "learning_rate": 5.514563106796116e-06, + "loss": 0.3258, + "step": 25210 + }, + { + "epoch": 4.451504721560322, + "grad_norm": 0.5644562244415283, + "learning_rate": 5.4969108561341576e-06, + "loss": 0.2719, + "step": 25220 + }, + { + "epoch": 4.453269790839291, + "grad_norm": 0.9033040404319763, + "learning_rate": 5.479258605472198e-06, + "loss": 0.3326, + "step": 25230 + }, + { + "epoch": 4.45503486011826, + "grad_norm": 0.9956077933311462, + "learning_rate": 5.461606354810238e-06, + "loss": 0.3305, + "step": 25240 + }, + { + "epoch": 4.456799929397229, + "grad_norm": 1.1431922912597656, + "learning_rate": 5.443954104148279e-06, + "loss": 0.3509, + "step": 25250 + }, + { + "epoch": 4.458564998676198, + "grad_norm": 1.020296573638916, + "learning_rate": 5.42630185348632e-06, + "loss": 0.2813, + "step": 25260 + }, + { + "epoch": 4.460330067955168, + "grad_norm": 1.0820090770721436, + "learning_rate": 5.40864960282436e-06, + "loss": 0.2997, + "step": 25270 + }, + { + "epoch": 4.462095137234137, + "grad_norm": 1.5949653387069702, + "learning_rate": 5.390997352162401e-06, + "loss": 0.2982, + "step": 25280 + }, + { + "epoch": 4.463860206513106, + "grad_norm": 1.358872890472412, + "learning_rate": 5.373345101500442e-06, + "loss": 0.3245, + "step": 25290 + }, + { + "epoch": 4.465625275792075, + "grad_norm": 1.1218355894088745, + "learning_rate": 5.355692850838482e-06, + "loss": 0.3468, + "step": 25300 + }, + { + "epoch": 4.467390345071044, + "grad_norm": 3.1862800121307373, + "learning_rate": 5.338040600176523e-06, + "loss": 0.3242, + "step": 25310 + }, + { + "epoch": 4.469155414350014, + "grad_norm": 0.8951514363288879, + "learning_rate": 5.320388349514564e-06, + "loss": 0.2978, + "step": 25320 + }, + { + "epoch": 4.470920483628983, + "grad_norm": 2.006993293762207, + "learning_rate": 5.302736098852603e-06, + "loss": 0.2933, + "step": 25330 + }, + { + "epoch": 4.472685552907952, + "grad_norm": 4.0217719078063965, + "learning_rate": 5.2850838481906445e-06, + "loss": 0.3422, + "step": 25340 + }, + { + "epoch": 4.474450622186921, + "grad_norm": 1.2529191970825195, + "learning_rate": 5.267431597528686e-06, + "loss": 0.3059, + "step": 25350 + }, + { + "epoch": 4.47621569146589, + "grad_norm": 3.027233362197876, + "learning_rate": 5.249779346866725e-06, + "loss": 0.3498, + "step": 25360 + }, + { + "epoch": 4.47798076074486, + "grad_norm": 1.0020170211791992, + "learning_rate": 5.232127096204766e-06, + "loss": 0.273, + "step": 25370 + }, + { + "epoch": 4.479745830023829, + "grad_norm": 1.1415488719940186, + "learning_rate": 5.2144748455428074e-06, + "loss": 0.3093, + "step": 25380 + }, + { + "epoch": 4.481510899302798, + "grad_norm": 0.8501665592193604, + "learning_rate": 5.196822594880848e-06, + "loss": 0.2857, + "step": 25390 + }, + { + "epoch": 4.483275968581767, + "grad_norm": 0.9540072083473206, + "learning_rate": 5.179170344218888e-06, + "loss": 0.2998, + "step": 25400 + }, + { + "epoch": 4.485041037860736, + "grad_norm": 4.062297344207764, + "learning_rate": 5.161518093556929e-06, + "loss": 0.3466, + "step": 25410 + }, + { + "epoch": 4.486806107139706, + "grad_norm": 2.4073777198791504, + "learning_rate": 5.1438658428949695e-06, + "loss": 0.3052, + "step": 25420 + }, + { + "epoch": 4.488571176418675, + "grad_norm": 2.450510025024414, + "learning_rate": 5.12621359223301e-06, + "loss": 0.3023, + "step": 25430 + }, + { + "epoch": 4.490336245697644, + "grad_norm": 5.3954315185546875, + "learning_rate": 5.108561341571051e-06, + "loss": 0.3043, + "step": 25440 + }, + { + "epoch": 4.492101314976613, + "grad_norm": 1.8781121969223022, + "learning_rate": 5.090909090909091e-06, + "loss": 0.2563, + "step": 25450 + }, + { + "epoch": 4.493866384255582, + "grad_norm": 2.038400173187256, + "learning_rate": 5.0732568402471315e-06, + "loss": 0.3205, + "step": 25460 + }, + { + "epoch": 4.495631453534552, + "grad_norm": 2.6978981494903564, + "learning_rate": 5.055604589585173e-06, + "loss": 0.2934, + "step": 25470 + }, + { + "epoch": 4.497396522813521, + "grad_norm": 1.041235089302063, + "learning_rate": 5.037952338923213e-06, + "loss": 0.3096, + "step": 25480 + }, + { + "epoch": 4.49916159209249, + "grad_norm": 1.190179467201233, + "learning_rate": 5.020300088261254e-06, + "loss": 0.2895, + "step": 25490 + }, + { + "epoch": 4.500926661371459, + "grad_norm": 0.9067394733428955, + "learning_rate": 5.0026478375992936e-06, + "loss": 0.3435, + "step": 25500 + }, + { + "epoch": 4.502691730650428, + "grad_norm": 2.3193790912628174, + "learning_rate": 4.984995586937335e-06, + "loss": 0.3417, + "step": 25510 + }, + { + "epoch": 4.504456799929397, + "grad_norm": 0.9801955819129944, + "learning_rate": 4.967343336275376e-06, + "loss": 0.3055, + "step": 25520 + }, + { + "epoch": 4.506221869208367, + "grad_norm": 5.301249027252197, + "learning_rate": 4.949691085613415e-06, + "loss": 0.2871, + "step": 25530 + }, + { + "epoch": 4.507986938487336, + "grad_norm": 2.5554471015930176, + "learning_rate": 4.9320388349514564e-06, + "loss": 0.3287, + "step": 25540 + }, + { + "epoch": 4.509752007766305, + "grad_norm": 0.8739365935325623, + "learning_rate": 4.914386584289498e-06, + "loss": 0.2924, + "step": 25550 + }, + { + "epoch": 4.511517077045274, + "grad_norm": 2.3037023544311523, + "learning_rate": 4.896734333627537e-06, + "loss": 0.2868, + "step": 25560 + }, + { + "epoch": 4.513282146324244, + "grad_norm": 2.186363697052002, + "learning_rate": 4.879082082965578e-06, + "loss": 0.2893, + "step": 25570 + }, + { + "epoch": 4.515047215603213, + "grad_norm": 2.8541760444641113, + "learning_rate": 4.861429832303619e-06, + "loss": 0.3106, + "step": 25580 + }, + { + "epoch": 4.516812284882182, + "grad_norm": 4.5733160972595215, + "learning_rate": 4.84377758164166e-06, + "loss": 0.3346, + "step": 25590 + }, + { + "epoch": 4.518577354161151, + "grad_norm": 0.9147126078605652, + "learning_rate": 4.8261253309797e-06, + "loss": 0.2274, + "step": 25600 + }, + { + "epoch": 4.52034242344012, + "grad_norm": 1.2009501457214355, + "learning_rate": 4.808473080317741e-06, + "loss": 0.317, + "step": 25610 + }, + { + "epoch": 4.522107492719089, + "grad_norm": 1.2615430355072021, + "learning_rate": 4.790820829655781e-06, + "loss": 0.2832, + "step": 25620 + }, + { + "epoch": 4.523872561998059, + "grad_norm": 2.852447032928467, + "learning_rate": 4.773168578993822e-06, + "loss": 0.3147, + "step": 25630 + }, + { + "epoch": 4.525637631277028, + "grad_norm": 0.9824477434158325, + "learning_rate": 4.755516328331863e-06, + "loss": 0.2997, + "step": 25640 + }, + { + "epoch": 4.527402700555997, + "grad_norm": 0.7785859107971191, + "learning_rate": 4.737864077669903e-06, + "loss": 0.305, + "step": 25650 + }, + { + "epoch": 4.529167769834966, + "grad_norm": 0.7817701697349548, + "learning_rate": 4.7202118270079434e-06, + "loss": 0.3145, + "step": 25660 + }, + { + "epoch": 4.530932839113936, + "grad_norm": 1.9367749691009521, + "learning_rate": 4.7025595763459846e-06, + "loss": 0.2832, + "step": 25670 + }, + { + "epoch": 4.532697908392905, + "grad_norm": 3.135561943054199, + "learning_rate": 4.684907325684025e-06, + "loss": 0.2559, + "step": 25680 + }, + { + "epoch": 4.534462977671874, + "grad_norm": 0.9335846304893494, + "learning_rate": 4.667255075022066e-06, + "loss": 0.3028, + "step": 25690 + }, + { + "epoch": 4.536228046950843, + "grad_norm": 1.048282504081726, + "learning_rate": 4.649602824360106e-06, + "loss": 0.3062, + "step": 25700 + }, + { + "epoch": 4.537993116229812, + "grad_norm": 1.0533701181411743, + "learning_rate": 4.631950573698147e-06, + "loss": 0.3095, + "step": 25710 + }, + { + "epoch": 4.539758185508781, + "grad_norm": 1.0367389917373657, + "learning_rate": 4.614298323036188e-06, + "loss": 0.2964, + "step": 25720 + }, + { + "epoch": 4.541523254787751, + "grad_norm": 4.537725925445557, + "learning_rate": 4.596646072374228e-06, + "loss": 0.3129, + "step": 25730 + }, + { + "epoch": 4.54328832406672, + "grad_norm": 1.0677293539047241, + "learning_rate": 4.578993821712268e-06, + "loss": 0.2888, + "step": 25740 + }, + { + "epoch": 4.545053393345689, + "grad_norm": 2.7014944553375244, + "learning_rate": 4.5613415710503095e-06, + "loss": 0.3432, + "step": 25750 + }, + { + "epoch": 4.546818462624658, + "grad_norm": 3.0516560077667236, + "learning_rate": 4.54368932038835e-06, + "loss": 0.2645, + "step": 25760 + }, + { + "epoch": 4.548583531903628, + "grad_norm": 1.2660284042358398, + "learning_rate": 4.52603706972639e-06, + "loss": 0.3056, + "step": 25770 + }, + { + "epoch": 4.550348601182597, + "grad_norm": 1.121484398841858, + "learning_rate": 4.508384819064431e-06, + "loss": 0.2779, + "step": 25780 + }, + { + "epoch": 4.552113670461566, + "grad_norm": 0.7952722907066345, + "learning_rate": 4.4907325684024715e-06, + "loss": 0.2805, + "step": 25790 + }, + { + "epoch": 4.553878739740535, + "grad_norm": 0.9755123257637024, + "learning_rate": 4.473080317740512e-06, + "loss": 0.3174, + "step": 25800 + }, + { + "epoch": 4.555643809019504, + "grad_norm": 3.9789111614227295, + "learning_rate": 4.455428067078553e-06, + "loss": 0.3924, + "step": 25810 + }, + { + "epoch": 4.557408878298473, + "grad_norm": 0.7337201237678528, + "learning_rate": 4.437775816416593e-06, + "loss": 0.2729, + "step": 25820 + }, + { + "epoch": 4.559173947577443, + "grad_norm": 1.2232738733291626, + "learning_rate": 4.420123565754634e-06, + "loss": 0.275, + "step": 25830 + }, + { + "epoch": 4.560939016856412, + "grad_norm": 0.9414122700691223, + "learning_rate": 4.402471315092675e-06, + "loss": 0.3133, + "step": 25840 + }, + { + "epoch": 4.562704086135381, + "grad_norm": 2.035956621170044, + "learning_rate": 4.384819064430715e-06, + "loss": 0.306, + "step": 25850 + }, + { + "epoch": 4.56446915541435, + "grad_norm": 1.1455971002578735, + "learning_rate": 4.367166813768755e-06, + "loss": 0.2878, + "step": 25860 + }, + { + "epoch": 4.56623422469332, + "grad_norm": 1.037244439125061, + "learning_rate": 4.3495145631067965e-06, + "loss": 0.3086, + "step": 25870 + }, + { + "epoch": 4.567999293972289, + "grad_norm": 1.216088891029358, + "learning_rate": 4.331862312444837e-06, + "loss": 0.3132, + "step": 25880 + }, + { + "epoch": 4.569764363251258, + "grad_norm": 1.1905804872512817, + "learning_rate": 4.314210061782877e-06, + "loss": 0.3662, + "step": 25890 + }, + { + "epoch": 4.571529432530227, + "grad_norm": 1.2199269533157349, + "learning_rate": 4.296557811120918e-06, + "loss": 0.2483, + "step": 25900 + }, + { + "epoch": 4.573294501809196, + "grad_norm": 1.5603306293487549, + "learning_rate": 4.2789055604589585e-06, + "loss": 0.2351, + "step": 25910 + }, + { + "epoch": 4.575059571088165, + "grad_norm": 1.185232400894165, + "learning_rate": 4.261253309797e-06, + "loss": 0.3162, + "step": 25920 + }, + { + "epoch": 4.576824640367135, + "grad_norm": 1.825040340423584, + "learning_rate": 4.24360105913504e-06, + "loss": 0.2868, + "step": 25930 + }, + { + "epoch": 4.578589709646104, + "grad_norm": 2.968623399734497, + "learning_rate": 4.22594880847308e-06, + "loss": 0.3104, + "step": 25940 + }, + { + "epoch": 4.580354778925073, + "grad_norm": 1.5956579446792603, + "learning_rate": 4.208296557811121e-06, + "loss": 0.311, + "step": 25950 + }, + { + "epoch": 4.582119848204042, + "grad_norm": 1.1295750141143799, + "learning_rate": 4.190644307149162e-06, + "loss": 0.3386, + "step": 25960 + }, + { + "epoch": 4.583884917483012, + "grad_norm": 2.4485180377960205, + "learning_rate": 4.172992056487202e-06, + "loss": 0.2868, + "step": 25970 + }, + { + "epoch": 4.585649986761981, + "grad_norm": 1.623205542564392, + "learning_rate": 4.155339805825243e-06, + "loss": 0.3648, + "step": 25980 + }, + { + "epoch": 4.58741505604095, + "grad_norm": 1.0542960166931152, + "learning_rate": 4.1376875551632835e-06, + "loss": 0.3063, + "step": 25990 + }, + { + "epoch": 4.589180125319919, + "grad_norm": 2.7634682655334473, + "learning_rate": 4.120035304501324e-06, + "loss": 0.2696, + "step": 26000 + }, + { + "epoch": 4.589180125319919, + "eval_loss": 0.6520219445228577, + "eval_runtime": 591.3466, + "eval_samples_per_second": 47.903, + "eval_steps_per_second": 2.396, + "eval_token_accuracy": 0.0004987183931890128, + "step": 26000 + }, + { + "epoch": 4.590945194598888, + "grad_norm": 0.9608787298202515, + "learning_rate": 4.102383053839365e-06, + "loss": 0.3891, + "step": 26010 + }, + { + "epoch": 4.592710263877857, + "grad_norm": 1.4607149362564087, + "learning_rate": 4.084730803177406e-06, + "loss": 0.3429, + "step": 26020 + }, + { + "epoch": 4.594475333156827, + "grad_norm": 3.5650391578674316, + "learning_rate": 4.0670785525154455e-06, + "loss": 0.3296, + "step": 26030 + }, + { + "epoch": 4.596240402435796, + "grad_norm": 2.4345691204071045, + "learning_rate": 4.051191526919682e-06, + "loss": 0.2763, + "step": 26040 + }, + { + "epoch": 4.598005471714765, + "grad_norm": 3.1073009967803955, + "learning_rate": 4.033539276257723e-06, + "loss": 0.295, + "step": 26050 + }, + { + "epoch": 4.599770540993734, + "grad_norm": 1.295482873916626, + "learning_rate": 4.015887025595764e-06, + "loss": 0.2796, + "step": 26060 + }, + { + "epoch": 4.601535610272704, + "grad_norm": 2.9781415462493896, + "learning_rate": 3.998234774933804e-06, + "loss": 0.2426, + "step": 26070 + }, + { + "epoch": 4.603300679551673, + "grad_norm": 2.263942003250122, + "learning_rate": 3.980582524271844e-06, + "loss": 0.3208, + "step": 26080 + }, + { + "epoch": 4.605065748830642, + "grad_norm": 1.9064913988113403, + "learning_rate": 3.9629302736098855e-06, + "loss": 0.3221, + "step": 26090 + }, + { + "epoch": 4.606830818109611, + "grad_norm": 1.2600352764129639, + "learning_rate": 3.945278022947927e-06, + "loss": 0.2916, + "step": 26100 + }, + { + "epoch": 4.60859588738858, + "grad_norm": 1.0407512187957764, + "learning_rate": 3.927625772285966e-06, + "loss": 0.3659, + "step": 26110 + }, + { + "epoch": 4.610360956667549, + "grad_norm": 0.909622073173523, + "learning_rate": 3.909973521624007e-06, + "loss": 0.3047, + "step": 26120 + }, + { + "epoch": 4.612126025946519, + "grad_norm": 1.2238434553146362, + "learning_rate": 3.8923212709620484e-06, + "loss": 0.2643, + "step": 26130 + }, + { + "epoch": 4.613891095225488, + "grad_norm": 1.068109154701233, + "learning_rate": 3.874669020300088e-06, + "loss": 0.2787, + "step": 26140 + }, + { + "epoch": 4.615656164504457, + "grad_norm": 1.3901350498199463, + "learning_rate": 3.857016769638129e-06, + "loss": 0.3241, + "step": 26150 + }, + { + "epoch": 4.617421233783426, + "grad_norm": 1.4353913068771362, + "learning_rate": 3.83936451897617e-06, + "loss": 0.2735, + "step": 26160 + }, + { + "epoch": 4.619186303062396, + "grad_norm": 2.3656747341156006, + "learning_rate": 3.82171226831421e-06, + "loss": 0.3031, + "step": 26170 + }, + { + "epoch": 4.620951372341365, + "grad_norm": 5.758845806121826, + "learning_rate": 3.8040600176522508e-06, + "loss": 0.3019, + "step": 26180 + }, + { + "epoch": 4.622716441620334, + "grad_norm": 1.460636854171753, + "learning_rate": 3.7864077669902915e-06, + "loss": 0.3448, + "step": 26190 + }, + { + "epoch": 4.624481510899303, + "grad_norm": 0.8635392189025879, + "learning_rate": 3.7687555163283322e-06, + "loss": 0.3295, + "step": 26200 + }, + { + "epoch": 4.626246580178272, + "grad_norm": 1.209629774093628, + "learning_rate": 3.7511032656663725e-06, + "loss": 0.3744, + "step": 26210 + }, + { + "epoch": 4.628011649457241, + "grad_norm": 0.9914260506629944, + "learning_rate": 3.7334510150044132e-06, + "loss": 0.3099, + "step": 26220 + }, + { + "epoch": 4.629776718736211, + "grad_norm": 3.453619956970215, + "learning_rate": 3.715798764342454e-06, + "loss": 0.3271, + "step": 26230 + }, + { + "epoch": 4.63154178801518, + "grad_norm": 0.9711024165153503, + "learning_rate": 3.6981465136804943e-06, + "loss": 0.3152, + "step": 26240 + }, + { + "epoch": 4.633306857294149, + "grad_norm": 1.7442153692245483, + "learning_rate": 3.680494263018535e-06, + "loss": 0.2736, + "step": 26250 + }, + { + "epoch": 4.635071926573118, + "grad_norm": 1.5687910318374634, + "learning_rate": 3.6628420123565757e-06, + "loss": 0.2782, + "step": 26260 + }, + { + "epoch": 4.636836995852088, + "grad_norm": 0.8251959085464478, + "learning_rate": 3.645189761694616e-06, + "loss": 0.3215, + "step": 26270 + }, + { + "epoch": 4.638602065131057, + "grad_norm": 1.3255902528762817, + "learning_rate": 3.6275375110326567e-06, + "loss": 0.2467, + "step": 26280 + }, + { + "epoch": 4.640367134410026, + "grad_norm": 1.0360016822814941, + "learning_rate": 3.6098852603706975e-06, + "loss": 0.3133, + "step": 26290 + }, + { + "epoch": 4.642132203688995, + "grad_norm": 0.9186058640480042, + "learning_rate": 3.5922330097087378e-06, + "loss": 0.2761, + "step": 26300 + }, + { + "epoch": 4.643897272967964, + "grad_norm": 1.3283425569534302, + "learning_rate": 3.5745807590467785e-06, + "loss": 0.3016, + "step": 26310 + }, + { + "epoch": 4.645662342246933, + "grad_norm": 2.8542580604553223, + "learning_rate": 3.556928508384819e-06, + "loss": 0.3201, + "step": 26320 + }, + { + "epoch": 4.647427411525903, + "grad_norm": 0.8498136401176453, + "learning_rate": 3.53927625772286e-06, + "loss": 0.307, + "step": 26330 + }, + { + "epoch": 4.649192480804872, + "grad_norm": 1.2035598754882812, + "learning_rate": 3.5216240070609002e-06, + "loss": 0.2858, + "step": 26340 + }, + { + "epoch": 4.650957550083841, + "grad_norm": 1.4415249824523926, + "learning_rate": 3.503971756398941e-06, + "loss": 0.2998, + "step": 26350 + }, + { + "epoch": 4.65272261936281, + "grad_norm": 3.0700762271881104, + "learning_rate": 3.4863195057369817e-06, + "loss": 0.3217, + "step": 26360 + }, + { + "epoch": 4.65448768864178, + "grad_norm": 0.8596258163452148, + "learning_rate": 3.468667255075022e-06, + "loss": 0.2927, + "step": 26370 + }, + { + "epoch": 4.656252757920749, + "grad_norm": 2.478158950805664, + "learning_rate": 3.4510150044130627e-06, + "loss": 0.3098, + "step": 26380 + }, + { + "epoch": 4.658017827199718, + "grad_norm": 3.169443368911743, + "learning_rate": 3.4333627537511034e-06, + "loss": 0.3009, + "step": 26390 + }, + { + "epoch": 4.659782896478687, + "grad_norm": 1.1622791290283203, + "learning_rate": 3.4157105030891437e-06, + "loss": 0.2945, + "step": 26400 + }, + { + "epoch": 4.661547965757656, + "grad_norm": 0.9778208136558533, + "learning_rate": 3.3980582524271844e-06, + "loss": 0.3669, + "step": 26410 + }, + { + "epoch": 4.663313035036625, + "grad_norm": 1.3064228296279907, + "learning_rate": 3.380406001765225e-06, + "loss": 0.2849, + "step": 26420 + }, + { + "epoch": 4.665078104315595, + "grad_norm": 2.3085849285125732, + "learning_rate": 3.3627537511032663e-06, + "loss": 0.3482, + "step": 26430 + }, + { + "epoch": 4.666843173594564, + "grad_norm": 4.210567951202393, + "learning_rate": 3.345101500441306e-06, + "loss": 0.2852, + "step": 26440 + }, + { + "epoch": 4.668608242873533, + "grad_norm": 1.2309918403625488, + "learning_rate": 3.327449249779347e-06, + "loss": 0.2712, + "step": 26450 + }, + { + "epoch": 4.670373312152502, + "grad_norm": 0.7104286551475525, + "learning_rate": 3.309796999117388e-06, + "loss": 0.2868, + "step": 26460 + }, + { + "epoch": 4.672138381431472, + "grad_norm": 1.697925090789795, + "learning_rate": 3.292144748455428e-06, + "loss": 0.3129, + "step": 26470 + }, + { + "epoch": 4.673903450710441, + "grad_norm": 1.1698533296585083, + "learning_rate": 3.2744924977934686e-06, + "loss": 0.2597, + "step": 26480 + }, + { + "epoch": 4.67566851998941, + "grad_norm": 1.2262734174728394, + "learning_rate": 3.2568402471315098e-06, + "loss": 0.3117, + "step": 26490 + }, + { + "epoch": 4.677433589268379, + "grad_norm": 2.3895866870880127, + "learning_rate": 3.2391879964695497e-06, + "loss": 0.2886, + "step": 26500 + }, + { + "epoch": 4.679198658547348, + "grad_norm": 1.1458373069763184, + "learning_rate": 3.2215357458075904e-06, + "loss": 0.2731, + "step": 26510 + }, + { + "epoch": 4.680963727826317, + "grad_norm": 1.3387194871902466, + "learning_rate": 3.2038834951456315e-06, + "loss": 0.3467, + "step": 26520 + }, + { + "epoch": 4.682728797105287, + "grad_norm": 0.7710012197494507, + "learning_rate": 3.1862312444836723e-06, + "loss": 0.3574, + "step": 26530 + }, + { + "epoch": 4.684493866384256, + "grad_norm": 0.7596765160560608, + "learning_rate": 3.168578993821712e-06, + "loss": 0.2881, + "step": 26540 + }, + { + "epoch": 4.686258935663225, + "grad_norm": 2.8583171367645264, + "learning_rate": 3.150926743159753e-06, + "loss": 0.3057, + "step": 26550 + }, + { + "epoch": 4.688024004942194, + "grad_norm": 4.765782833099365, + "learning_rate": 3.133274492497794e-06, + "loss": 0.2745, + "step": 26560 + }, + { + "epoch": 4.689789074221164, + "grad_norm": 1.0168406963348389, + "learning_rate": 3.1156222418358343e-06, + "loss": 0.31, + "step": 26570 + }, + { + "epoch": 4.691554143500133, + "grad_norm": 1.3798327445983887, + "learning_rate": 3.0979699911738746e-06, + "loss": 0.3772, + "step": 26580 + }, + { + "epoch": 4.693319212779102, + "grad_norm": 1.2798494100570679, + "learning_rate": 3.0803177405119153e-06, + "loss": 0.3575, + "step": 26590 + }, + { + "epoch": 4.695084282058071, + "grad_norm": 0.6293618083000183, + "learning_rate": 3.062665489849956e-06, + "loss": 0.2763, + "step": 26600 + }, + { + "epoch": 4.69684935133704, + "grad_norm": 0.893796980381012, + "learning_rate": 3.0450132391879963e-06, + "loss": 0.2968, + "step": 26610 + }, + { + "epoch": 4.698614420616009, + "grad_norm": 2.7735981941223145, + "learning_rate": 3.0273609885260375e-06, + "loss": 0.3324, + "step": 26620 + }, + { + "epoch": 4.700379489894979, + "grad_norm": 3.296457290649414, + "learning_rate": 3.0097087378640778e-06, + "loss": 0.3665, + "step": 26630 + }, + { + "epoch": 4.702144559173948, + "grad_norm": 2.9078474044799805, + "learning_rate": 2.992056487202118e-06, + "loss": 0.3013, + "step": 26640 + }, + { + "epoch": 4.703909628452917, + "grad_norm": 0.9801350235939026, + "learning_rate": 2.9744042365401592e-06, + "loss": 0.3093, + "step": 26650 + }, + { + "epoch": 4.705674697731886, + "grad_norm": 3.157639980316162, + "learning_rate": 2.9567519858781995e-06, + "loss": 0.3188, + "step": 26660 + }, + { + "epoch": 4.707439767010856, + "grad_norm": 3.6760361194610596, + "learning_rate": 2.9390997352162403e-06, + "loss": 0.3403, + "step": 26670 + }, + { + "epoch": 4.709204836289825, + "grad_norm": 3.899162530899048, + "learning_rate": 2.921447484554281e-06, + "loss": 0.3169, + "step": 26680 + }, + { + "epoch": 4.710969905568794, + "grad_norm": 0.8276218175888062, + "learning_rate": 2.9037952338923213e-06, + "loss": 0.3051, + "step": 26690 + }, + { + "epoch": 4.712734974847763, + "grad_norm": 2.197704315185547, + "learning_rate": 2.886142983230362e-06, + "loss": 0.3056, + "step": 26700 + }, + { + "epoch": 4.714500044126732, + "grad_norm": 1.597664475440979, + "learning_rate": 2.8684907325684027e-06, + "loss": 0.2863, + "step": 26710 + }, + { + "epoch": 4.716265113405701, + "grad_norm": 2.530172348022461, + "learning_rate": 2.8508384819064434e-06, + "loss": 0.306, + "step": 26720 + }, + { + "epoch": 4.718030182684671, + "grad_norm": 3.7757041454315186, + "learning_rate": 2.8331862312444837e-06, + "loss": 0.3334, + "step": 26730 + }, + { + "epoch": 4.71979525196364, + "grad_norm": 1.145796775817871, + "learning_rate": 2.8155339805825245e-06, + "loss": 0.312, + "step": 26740 + }, + { + "epoch": 4.721560321242609, + "grad_norm": 2.3895106315612793, + "learning_rate": 2.797881729920565e-06, + "loss": 0.3311, + "step": 26750 + }, + { + "epoch": 4.723325390521578, + "grad_norm": 1.0439083576202393, + "learning_rate": 2.7802294792586055e-06, + "loss": 0.2596, + "step": 26760 + }, + { + "epoch": 4.725090459800548, + "grad_norm": 2.6261661052703857, + "learning_rate": 2.762577228596646e-06, + "loss": 0.3281, + "step": 26770 + }, + { + "epoch": 4.726855529079517, + "grad_norm": 4.33652925491333, + "learning_rate": 2.744924977934687e-06, + "loss": 0.3354, + "step": 26780 + }, + { + "epoch": 4.728620598358486, + "grad_norm": 1.4785486459732056, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.2876, + "step": 26790 + }, + { + "epoch": 4.730385667637455, + "grad_norm": 0.853244960308075, + "learning_rate": 2.709620476610768e-06, + "loss": 0.3128, + "step": 26800 + }, + { + "epoch": 4.732150736916424, + "grad_norm": 4.548651218414307, + "learning_rate": 2.6919682259488087e-06, + "loss": 0.3242, + "step": 26810 + }, + { + "epoch": 4.733915806195393, + "grad_norm": 1.5922703742980957, + "learning_rate": 2.6743159752868494e-06, + "loss": 0.3453, + "step": 26820 + }, + { + "epoch": 4.735680875474363, + "grad_norm": 1.2137644290924072, + "learning_rate": 2.6566637246248897e-06, + "loss": 0.3563, + "step": 26830 + }, + { + "epoch": 4.737445944753332, + "grad_norm": 3.0799238681793213, + "learning_rate": 2.6390114739629304e-06, + "loss": 0.2741, + "step": 26840 + }, + { + "epoch": 4.739211014032301, + "grad_norm": 1.2434720993041992, + "learning_rate": 2.621359223300971e-06, + "loss": 0.3753, + "step": 26850 + }, + { + "epoch": 4.74097608331127, + "grad_norm": 1.0864171981811523, + "learning_rate": 2.6037069726390114e-06, + "loss": 0.2686, + "step": 26860 + }, + { + "epoch": 4.74274115259024, + "grad_norm": 1.02934992313385, + "learning_rate": 2.586054721977052e-06, + "loss": 0.3207, + "step": 26870 + }, + { + "epoch": 4.744506221869209, + "grad_norm": 2.575124740600586, + "learning_rate": 2.568402471315093e-06, + "loss": 0.276, + "step": 26880 + }, + { + "epoch": 4.746271291148178, + "grad_norm": 2.9327895641326904, + "learning_rate": 2.550750220653133e-06, + "loss": 0.2728, + "step": 26890 + }, + { + "epoch": 4.748036360427147, + "grad_norm": 1.2971775531768799, + "learning_rate": 2.533097969991174e-06, + "loss": 0.3054, + "step": 26900 + }, + { + "epoch": 4.749801429706116, + "grad_norm": 2.2022430896759033, + "learning_rate": 2.5154457193292146e-06, + "loss": 0.2774, + "step": 26910 + }, + { + "epoch": 4.751566498985085, + "grad_norm": 1.1809123754501343, + "learning_rate": 2.4977934686672553e-06, + "loss": 0.3174, + "step": 26920 + }, + { + "epoch": 4.753331568264055, + "grad_norm": 1.0642763376235962, + "learning_rate": 2.4801412180052956e-06, + "loss": 0.2903, + "step": 26930 + }, + { + "epoch": 4.755096637543024, + "grad_norm": 1.3782179355621338, + "learning_rate": 2.4624889673433364e-06, + "loss": 0.3092, + "step": 26940 + }, + { + "epoch": 4.756861706821993, + "grad_norm": 0.7173749208450317, + "learning_rate": 2.444836716681377e-06, + "loss": 0.3167, + "step": 26950 + }, + { + "epoch": 4.758626776100962, + "grad_norm": 3.7624456882476807, + "learning_rate": 2.4271844660194174e-06, + "loss": 0.3088, + "step": 26960 + }, + { + "epoch": 4.760391845379932, + "grad_norm": 2.7406694889068604, + "learning_rate": 2.4095322153574585e-06, + "loss": 0.3334, + "step": 26970 + }, + { + "epoch": 4.762156914658901, + "grad_norm": 2.3339078426361084, + "learning_rate": 2.391879964695499e-06, + "loss": 0.3057, + "step": 26980 + }, + { + "epoch": 4.76392198393787, + "grad_norm": 2.0263259410858154, + "learning_rate": 2.374227714033539e-06, + "loss": 0.2526, + "step": 26990 + }, + { + "epoch": 4.765687053216839, + "grad_norm": 3.8648524284362793, + "learning_rate": 2.3565754633715803e-06, + "loss": 0.2975, + "step": 27000 + }, + { + "epoch": 4.765687053216839, + "eval_loss": 0.6528812646865845, + "eval_runtime": 592.735, + "eval_samples_per_second": 47.79, + "eval_steps_per_second": 2.391, + "eval_token_accuracy": 0.0004985154146000063, + "step": 27000 + }, + { + "epoch": 4.767452122495808, + "grad_norm": 0.8504769802093506, + "learning_rate": 2.3389232127096206e-06, + "loss": 0.2618, + "step": 27010 + }, + { + "epoch": 4.769217191774777, + "grad_norm": 4.1417436599731445, + "learning_rate": 2.321270962047661e-06, + "loss": 0.2816, + "step": 27020 + }, + { + "epoch": 4.770982261053747, + "grad_norm": 1.3575209379196167, + "learning_rate": 2.303618711385702e-06, + "loss": 0.2941, + "step": 27030 + }, + { + "epoch": 4.772747330332716, + "grad_norm": 3.2614238262176514, + "learning_rate": 2.2859664607237423e-06, + "loss": 0.339, + "step": 27040 + }, + { + "epoch": 4.774512399611685, + "grad_norm": 2.057924270629883, + "learning_rate": 2.268314210061783e-06, + "loss": 0.3645, + "step": 27050 + }, + { + "epoch": 4.776277468890654, + "grad_norm": 2.174006462097168, + "learning_rate": 2.2506619593998238e-06, + "loss": 0.3252, + "step": 27060 + }, + { + "epoch": 4.778042538169624, + "grad_norm": 1.5190887451171875, + "learning_rate": 2.233009708737864e-06, + "loss": 0.2854, + "step": 27070 + }, + { + "epoch": 4.779807607448593, + "grad_norm": 1.7354347705841064, + "learning_rate": 2.215357458075905e-06, + "loss": 0.2674, + "step": 27080 + }, + { + "epoch": 4.781572676727562, + "grad_norm": 0.9061219692230225, + "learning_rate": 2.197705207413945e-06, + "loss": 0.303, + "step": 27090 + }, + { + "epoch": 4.783337746006531, + "grad_norm": 0.9815694093704224, + "learning_rate": 2.1800529567519862e-06, + "loss": 0.3104, + "step": 27100 + }, + { + "epoch": 4.7851028152855, + "grad_norm": 1.0961108207702637, + "learning_rate": 2.1624007060900265e-06, + "loss": 0.2876, + "step": 27110 + }, + { + "epoch": 4.786867884564469, + "grad_norm": 0.8509754538536072, + "learning_rate": 2.144748455428067e-06, + "loss": 0.3311, + "step": 27120 + }, + { + "epoch": 4.788632953843439, + "grad_norm": 2.176682233810425, + "learning_rate": 2.127096204766108e-06, + "loss": 0.3585, + "step": 27130 + }, + { + "epoch": 4.790398023122408, + "grad_norm": 0.9792783260345459, + "learning_rate": 2.1094439541041483e-06, + "loss": 0.3132, + "step": 27140 + }, + { + "epoch": 4.792163092401377, + "grad_norm": 0.9529822468757629, + "learning_rate": 2.091791703442189e-06, + "loss": 0.3025, + "step": 27150 + }, + { + "epoch": 4.793928161680346, + "grad_norm": 1.79104745388031, + "learning_rate": 2.0741394527802297e-06, + "loss": 0.3481, + "step": 27160 + }, + { + "epoch": 4.795693230959315, + "grad_norm": 1.240500807762146, + "learning_rate": 2.05648720211827e-06, + "loss": 0.2879, + "step": 27170 + }, + { + "epoch": 4.797458300238285, + "grad_norm": 2.7122185230255127, + "learning_rate": 2.0388349514563107e-06, + "loss": 0.2797, + "step": 27180 + }, + { + "epoch": 4.799223369517254, + "grad_norm": 1.1486530303955078, + "learning_rate": 2.0211827007943515e-06, + "loss": 0.2731, + "step": 27190 + }, + { + "epoch": 4.800988438796223, + "grad_norm": 2.8081750869750977, + "learning_rate": 2.003530450132392e-06, + "loss": 0.2995, + "step": 27200 + }, + { + "epoch": 4.802753508075192, + "grad_norm": 1.1571046113967896, + "learning_rate": 1.9858781994704325e-06, + "loss": 0.2822, + "step": 27210 + }, + { + "epoch": 4.804518577354161, + "grad_norm": 1.4669501781463623, + "learning_rate": 1.968225948808473e-06, + "loss": 0.3253, + "step": 27220 + }, + { + "epoch": 4.806283646633131, + "grad_norm": 0.9071950316429138, + "learning_rate": 1.950573698146514e-06, + "loss": 0.3247, + "step": 27230 + }, + { + "epoch": 4.8080487159121, + "grad_norm": 1.4200551509857178, + "learning_rate": 1.9329214474845542e-06, + "loss": 0.3058, + "step": 27240 + }, + { + "epoch": 4.809813785191069, + "grad_norm": 1.8995357751846313, + "learning_rate": 1.915269196822595e-06, + "loss": 0.2876, + "step": 27250 + }, + { + "epoch": 4.811578854470038, + "grad_norm": 1.1847381591796875, + "learning_rate": 1.8976169461606357e-06, + "loss": 0.3068, + "step": 27260 + }, + { + "epoch": 4.813343923749007, + "grad_norm": 1.081455111503601, + "learning_rate": 1.879964695498676e-06, + "loss": 0.3231, + "step": 27270 + }, + { + "epoch": 4.815108993027977, + "grad_norm": 1.0959587097167969, + "learning_rate": 1.862312444836717e-06, + "loss": 0.3252, + "step": 27280 + }, + { + "epoch": 4.816874062306946, + "grad_norm": 1.3235975503921509, + "learning_rate": 1.8446601941747572e-06, + "loss": 0.3169, + "step": 27290 + }, + { + "epoch": 4.818639131585915, + "grad_norm": 2.8046417236328125, + "learning_rate": 1.8270079435127981e-06, + "loss": 0.3005, + "step": 27300 + }, + { + "epoch": 4.820404200864884, + "grad_norm": 3.0383732318878174, + "learning_rate": 1.8093556928508387e-06, + "loss": 0.3221, + "step": 27310 + }, + { + "epoch": 4.822169270143853, + "grad_norm": 2.7719833850860596, + "learning_rate": 1.791703442188879e-06, + "loss": 0.2759, + "step": 27320 + }, + { + "epoch": 4.823934339422823, + "grad_norm": 1.3720282316207886, + "learning_rate": 1.7740511915269199e-06, + "loss": 0.3227, + "step": 27330 + }, + { + "epoch": 4.825699408701792, + "grad_norm": 1.1153744459152222, + "learning_rate": 1.7563989408649604e-06, + "loss": 0.2728, + "step": 27340 + }, + { + "epoch": 4.827464477980761, + "grad_norm": 3.1630284786224365, + "learning_rate": 1.7387466902030011e-06, + "loss": 0.3009, + "step": 27350 + }, + { + "epoch": 4.82922954725973, + "grad_norm": 1.8349213600158691, + "learning_rate": 1.7210944395410416e-06, + "loss": 0.3477, + "step": 27360 + }, + { + "epoch": 4.830994616538699, + "grad_norm": 0.9961835741996765, + "learning_rate": 1.7034421888790821e-06, + "loss": 0.2948, + "step": 27370 + }, + { + "epoch": 4.832759685817669, + "grad_norm": 2.6590805053710938, + "learning_rate": 1.6857899382171229e-06, + "loss": 0.212, + "step": 27380 + }, + { + "epoch": 4.834524755096638, + "grad_norm": 1.2290103435516357, + "learning_rate": 1.6681376875551634e-06, + "loss": 0.2845, + "step": 27390 + }, + { + "epoch": 4.836289824375607, + "grad_norm": 3.4003355503082275, + "learning_rate": 1.650485436893204e-06, + "loss": 0.3422, + "step": 27400 + }, + { + "epoch": 4.838054893654576, + "grad_norm": 1.2254893779754639, + "learning_rate": 1.6328331862312446e-06, + "loss": 0.3088, + "step": 27410 + }, + { + "epoch": 4.839819962933545, + "grad_norm": 2.233959197998047, + "learning_rate": 1.6151809355692851e-06, + "loss": 0.291, + "step": 27420 + }, + { + "epoch": 4.841585032212515, + "grad_norm": 0.9931178092956543, + "learning_rate": 1.5975286849073258e-06, + "loss": 0.2904, + "step": 27430 + }, + { + "epoch": 4.843350101491484, + "grad_norm": 2.9077725410461426, + "learning_rate": 1.5798764342453664e-06, + "loss": 0.3081, + "step": 27440 + }, + { + "epoch": 4.845115170770453, + "grad_norm": 4.963522911071777, + "learning_rate": 1.5622241835834069e-06, + "loss": 0.2661, + "step": 27450 + }, + { + "epoch": 4.846880240049422, + "grad_norm": 2.2293930053710938, + "learning_rate": 1.5445719329214476e-06, + "loss": 0.3111, + "step": 27460 + }, + { + "epoch": 4.848645309328391, + "grad_norm": 1.4293302297592163, + "learning_rate": 1.5269196822594883e-06, + "loss": 0.311, + "step": 27470 + }, + { + "epoch": 4.850410378607361, + "grad_norm": 1.2167930603027344, + "learning_rate": 1.5092674315975286e-06, + "loss": 0.3095, + "step": 27480 + }, + { + "epoch": 4.85217544788633, + "grad_norm": 0.9582422375679016, + "learning_rate": 1.4916151809355693e-06, + "loss": 0.3293, + "step": 27490 + }, + { + "epoch": 4.853940517165299, + "grad_norm": 1.0664198398590088, + "learning_rate": 1.47396293027361e-06, + "loss": 0.2998, + "step": 27500 + }, + { + "epoch": 4.855705586444268, + "grad_norm": 2.815174102783203, + "learning_rate": 1.4563106796116506e-06, + "loss": 0.3487, + "step": 27510 + }, + { + "epoch": 4.857470655723237, + "grad_norm": 2.406156301498413, + "learning_rate": 1.4386584289496913e-06, + "loss": 0.2857, + "step": 27520 + }, + { + "epoch": 4.859235725002207, + "grad_norm": 4.202737808227539, + "learning_rate": 1.4210061782877318e-06, + "loss": 0.3014, + "step": 27530 + }, + { + "epoch": 4.861000794281176, + "grad_norm": 2.53226375579834, + "learning_rate": 1.4033539276257723e-06, + "loss": 0.2891, + "step": 27540 + }, + { + "epoch": 4.862765863560145, + "grad_norm": 1.1201049089431763, + "learning_rate": 1.385701676963813e-06, + "loss": 0.2834, + "step": 27550 + }, + { + "epoch": 4.864530932839114, + "grad_norm": 0.8204585909843445, + "learning_rate": 1.3680494263018535e-06, + "loss": 0.3099, + "step": 27560 + }, + { + "epoch": 4.866296002118083, + "grad_norm": 0.8954245448112488, + "learning_rate": 1.350397175639894e-06, + "loss": 0.2838, + "step": 27570 + }, + { + "epoch": 4.868061071397053, + "grad_norm": 1.3307112455368042, + "learning_rate": 1.3327449249779348e-06, + "loss": 0.2736, + "step": 27580 + }, + { + "epoch": 4.869826140676022, + "grad_norm": 2.3387129306793213, + "learning_rate": 1.3150926743159753e-06, + "loss": 0.2762, + "step": 27590 + }, + { + "epoch": 4.871591209954991, + "grad_norm": 0.8025361895561218, + "learning_rate": 1.297440423654016e-06, + "loss": 0.3007, + "step": 27600 + }, + { + "epoch": 4.87335627923396, + "grad_norm": 2.6926400661468506, + "learning_rate": 1.2797881729920565e-06, + "loss": 0.2714, + "step": 27610 + }, + { + "epoch": 4.875121348512929, + "grad_norm": 0.843543291091919, + "learning_rate": 1.262135922330097e-06, + "loss": 0.2508, + "step": 27620 + }, + { + "epoch": 4.876886417791899, + "grad_norm": 2.3910794258117676, + "learning_rate": 1.2444836716681377e-06, + "loss": 0.221, + "step": 27630 + }, + { + "epoch": 4.878651487070868, + "grad_norm": 1.1623315811157227, + "learning_rate": 1.2268314210061783e-06, + "loss": 0.3141, + "step": 27640 + }, + { + "epoch": 4.880416556349837, + "grad_norm": 1.3146125078201294, + "learning_rate": 1.209179170344219e-06, + "loss": 0.3237, + "step": 27650 + }, + { + "epoch": 4.882181625628806, + "grad_norm": 1.2800966501235962, + "learning_rate": 1.1915269196822597e-06, + "loss": 0.2762, + "step": 27660 + }, + { + "epoch": 4.883946694907775, + "grad_norm": 1.5484886169433594, + "learning_rate": 1.1738746690203e-06, + "loss": 0.3123, + "step": 27670 + }, + { + "epoch": 4.885711764186745, + "grad_norm": 3.3829500675201416, + "learning_rate": 1.1562224183583407e-06, + "loss": 0.2624, + "step": 27680 + }, + { + "epoch": 4.887476833465714, + "grad_norm": 1.0227965116500854, + "learning_rate": 1.1385701676963814e-06, + "loss": 0.3367, + "step": 27690 + }, + { + "epoch": 4.889241902744683, + "grad_norm": 0.977554440498352, + "learning_rate": 1.120917917034422e-06, + "loss": 0.3134, + "step": 27700 + }, + { + "epoch": 4.891006972023652, + "grad_norm": 1.9226443767547607, + "learning_rate": 1.1032656663724627e-06, + "loss": 0.2859, + "step": 27710 + }, + { + "epoch": 4.892772041302621, + "grad_norm": 0.9863432049751282, + "learning_rate": 1.085613415710503e-06, + "loss": 0.2962, + "step": 27720 + }, + { + "epoch": 4.894537110581591, + "grad_norm": 2.392817974090576, + "learning_rate": 1.0679611650485437e-06, + "loss": 0.3292, + "step": 27730 + }, + { + "epoch": 4.89630217986056, + "grad_norm": 2.1724038124084473, + "learning_rate": 1.0503089143865844e-06, + "loss": 0.2853, + "step": 27740 + }, + { + "epoch": 4.898067249139529, + "grad_norm": 1.4158661365509033, + "learning_rate": 1.032656663724625e-06, + "loss": 0.2829, + "step": 27750 + }, + { + "epoch": 4.899832318418498, + "grad_norm": 1.2781078815460205, + "learning_rate": 1.0150044130626657e-06, + "loss": 0.2682, + "step": 27760 + }, + { + "epoch": 4.901597387697467, + "grad_norm": 0.9909201860427856, + "learning_rate": 9.973521624007062e-07, + "loss": 0.2903, + "step": 27770 + }, + { + "epoch": 4.903362456976437, + "grad_norm": 1.9638125896453857, + "learning_rate": 9.796999117387467e-07, + "loss": 0.2802, + "step": 27780 + }, + { + "epoch": 4.905127526255406, + "grad_norm": 1.5546956062316895, + "learning_rate": 9.620476610767874e-07, + "loss": 0.3387, + "step": 27790 + }, + { + "epoch": 4.906892595534375, + "grad_norm": 2.3524317741394043, + "learning_rate": 9.44395410414828e-07, + "loss": 0.2755, + "step": 27800 + }, + { + "epoch": 4.908657664813344, + "grad_norm": 2.4190759658813477, + "learning_rate": 9.267431597528684e-07, + "loss": 0.3054, + "step": 27810 + }, + { + "epoch": 4.910422734092313, + "grad_norm": 0.7786517143249512, + "learning_rate": 9.09090909090909e-07, + "loss": 0.2976, + "step": 27820 + }, + { + "epoch": 4.912187803371283, + "grad_norm": 3.312124490737915, + "learning_rate": 8.914386584289498e-07, + "loss": 0.2602, + "step": 27830 + }, + { + "epoch": 4.913952872650252, + "grad_norm": 1.6175801753997803, + "learning_rate": 8.737864077669904e-07, + "loss": 0.3014, + "step": 27840 + }, + { + "epoch": 4.915717941929221, + "grad_norm": 1.6575732231140137, + "learning_rate": 8.56134157105031e-07, + "loss": 0.3052, + "step": 27850 + }, + { + "epoch": 4.91748301120819, + "grad_norm": 1.234856367111206, + "learning_rate": 8.384819064430715e-07, + "loss": 0.3095, + "step": 27860 + }, + { + "epoch": 4.919248080487159, + "grad_norm": 3.541182041168213, + "learning_rate": 8.208296557811121e-07, + "loss": 0.2807, + "step": 27870 + }, + { + "epoch": 4.921013149766129, + "grad_norm": 1.1563400030136108, + "learning_rate": 8.031774051191527e-07, + "loss": 0.3132, + "step": 27880 + }, + { + "epoch": 4.922778219045098, + "grad_norm": 0.9835913777351379, + "learning_rate": 7.855251544571934e-07, + "loss": 0.2582, + "step": 27890 + }, + { + "epoch": 4.924543288324067, + "grad_norm": 1.1049256324768066, + "learning_rate": 7.678729037952339e-07, + "loss": 0.2754, + "step": 27900 + }, + { + "epoch": 4.926308357603036, + "grad_norm": 1.1144856214523315, + "learning_rate": 7.502206531332746e-07, + "loss": 0.3033, + "step": 27910 + }, + { + "epoch": 4.928073426882005, + "grad_norm": 1.1903610229492188, + "learning_rate": 7.325684024713151e-07, + "loss": 0.3207, + "step": 27920 + }, + { + "epoch": 4.929838496160975, + "grad_norm": 1.6650744676589966, + "learning_rate": 7.149161518093557e-07, + "loss": 0.3029, + "step": 27930 + }, + { + "epoch": 4.931603565439944, + "grad_norm": 3.3469290733337402, + "learning_rate": 6.972639011473963e-07, + "loss": 0.2502, + "step": 27940 + }, + { + "epoch": 4.933368634718913, + "grad_norm": 0.8579782247543335, + "learning_rate": 6.79611650485437e-07, + "loss": 0.2735, + "step": 27950 + }, + { + "epoch": 4.935133703997882, + "grad_norm": 0.7689581513404846, + "learning_rate": 6.619593998234776e-07, + "loss": 0.3048, + "step": 27960 + }, + { + "epoch": 4.936898773276851, + "grad_norm": 1.6697453260421753, + "learning_rate": 6.443071491615181e-07, + "loss": 0.3127, + "step": 27970 + }, + { + "epoch": 4.938663842555821, + "grad_norm": 2.41105055809021, + "learning_rate": 6.266548984995587e-07, + "loss": 0.2993, + "step": 27980 + }, + { + "epoch": 4.94042891183479, + "grad_norm": 1.329819679260254, + "learning_rate": 6.090026478375993e-07, + "loss": 0.3397, + "step": 27990 + }, + { + "epoch": 4.942193981113759, + "grad_norm": 0.8519781231880188, + "learning_rate": 5.913503971756399e-07, + "loss": 0.3279, + "step": 28000 + }, + { + "epoch": 4.942193981113759, + "eval_loss": 0.6499401926994324, + "eval_runtime": 592.4285, + "eval_samples_per_second": 47.815, + "eval_steps_per_second": 2.392, + "eval_token_accuracy": 0.0004987183931890128, + "step": 28000 + }, + { + "epoch": 4.943959050392728, + "grad_norm": 1.1171196699142456, + "learning_rate": 5.736981465136805e-07, + "loss": 0.3191, + "step": 28010 + }, + { + "epoch": 4.945724119671697, + "grad_norm": 3.382251262664795, + "learning_rate": 5.560458958517212e-07, + "loss": 0.341, + "step": 28020 + }, + { + "epoch": 4.947489188950667, + "grad_norm": 1.3143904209136963, + "learning_rate": 5.383936451897618e-07, + "loss": 0.268, + "step": 28030 + }, + { + "epoch": 4.949254258229636, + "grad_norm": 1.3122084140777588, + "learning_rate": 5.207413945278023e-07, + "loss": 0.3608, + "step": 28040 + }, + { + "epoch": 4.951019327508605, + "grad_norm": 3.165703296661377, + "learning_rate": 5.030891438658429e-07, + "loss": 0.313, + "step": 28050 + }, + { + "epoch": 4.952784396787574, + "grad_norm": 1.2517273426055908, + "learning_rate": 4.854368932038835e-07, + "loss": 0.2962, + "step": 28060 + }, + { + "epoch": 4.954549466066543, + "grad_norm": 0.9983559250831604, + "learning_rate": 4.6778464254192414e-07, + "loss": 0.2787, + "step": 28070 + }, + { + "epoch": 4.956314535345513, + "grad_norm": 1.5988553762435913, + "learning_rate": 4.5013239187996475e-07, + "loss": 0.3001, + "step": 28080 + }, + { + "epoch": 4.958079604624482, + "grad_norm": 2.1683664321899414, + "learning_rate": 4.324801412180053e-07, + "loss": 0.2809, + "step": 28090 + }, + { + "epoch": 4.959844673903451, + "grad_norm": 1.0645288228988647, + "learning_rate": 4.1482789055604593e-07, + "loss": 0.3469, + "step": 28100 + }, + { + "epoch": 4.96160974318242, + "grad_norm": 1.2240803241729736, + "learning_rate": 3.971756398940865e-07, + "loss": 0.3191, + "step": 28110 + }, + { + "epoch": 4.963374812461389, + "grad_norm": 1.3843520879745483, + "learning_rate": 3.795233892321271e-07, + "loss": 0.3317, + "step": 28120 + }, + { + "epoch": 4.965139881740359, + "grad_norm": 1.5506443977355957, + "learning_rate": 3.6187113857016773e-07, + "loss": 0.3127, + "step": 28130 + }, + { + "epoch": 4.966904951019328, + "grad_norm": 0.9059777855873108, + "learning_rate": 3.442188879082083e-07, + "loss": 0.3205, + "step": 28140 + }, + { + "epoch": 4.968670020298297, + "grad_norm": 3.5530388355255127, + "learning_rate": 3.265666372462489e-07, + "loss": 0.291, + "step": 28150 + }, + { + "epoch": 4.970435089577266, + "grad_norm": 4.22071647644043, + "learning_rate": 3.0891438658428953e-07, + "loss": 0.2899, + "step": 28160 + }, + { + "epoch": 4.972200158856235, + "grad_norm": 2.5634772777557373, + "learning_rate": 2.9126213592233014e-07, + "loss": 0.28, + "step": 28170 + }, + { + "epoch": 4.973965228135205, + "grad_norm": 0.8161858916282654, + "learning_rate": 2.736098852603707e-07, + "loss": 0.253, + "step": 28180 + }, + { + "epoch": 4.975730297414174, + "grad_norm": 2.5096681118011475, + "learning_rate": 2.559576345984113e-07, + "loss": 0.2672, + "step": 28190 + }, + { + "epoch": 4.977495366693143, + "grad_norm": 1.2688097953796387, + "learning_rate": 2.383053839364519e-07, + "loss": 0.2874, + "step": 28200 + }, + { + "epoch": 4.979260435972112, + "grad_norm": 1.17268705368042, + "learning_rate": 2.2065313327449248e-07, + "loss": 0.3518, + "step": 28210 + }, + { + "epoch": 4.981025505251081, + "grad_norm": 1.4544918537139893, + "learning_rate": 2.0300088261253312e-07, + "loss": 0.422, + "step": 28220 + }, + { + "epoch": 4.98279057453005, + "grad_norm": 1.8409720659255981, + "learning_rate": 1.853486319505737e-07, + "loss": 0.2917, + "step": 28230 + }, + { + "epoch": 4.98455564380902, + "grad_norm": 1.1786491870880127, + "learning_rate": 1.676963812886143e-07, + "loss": 0.2828, + "step": 28240 + }, + { + "epoch": 4.986320713087989, + "grad_norm": 2.36482834815979, + "learning_rate": 1.500441306266549e-07, + "loss": 0.2697, + "step": 28250 + }, + { + "epoch": 4.988085782366958, + "grad_norm": 2.1647708415985107, + "learning_rate": 1.323918799646955e-07, + "loss": 0.2416, + "step": 28260 + }, + { + "epoch": 4.989850851645927, + "grad_norm": 2.631108522415161, + "learning_rate": 1.1473962930273611e-07, + "loss": 0.3057, + "step": 28270 + }, + { + "epoch": 4.991615920924897, + "grad_norm": 2.9568164348602295, + "learning_rate": 9.70873786407767e-08, + "loss": 0.3125, + "step": 28280 + }, + { + "epoch": 4.993380990203866, + "grad_norm": 1.0665050745010376, + "learning_rate": 7.94351279788173e-08, + "loss": 0.2802, + "step": 28290 + }, + { + "epoch": 4.995146059482835, + "grad_norm": 2.899059534072876, + "learning_rate": 6.178287731685791e-08, + "loss": 0.316, + "step": 28300 + }, + { + "epoch": 4.996911128761804, + "grad_norm": 1.2794692516326904, + "learning_rate": 4.4130626654898505e-08, + "loss": 0.3383, + "step": 28310 + }, + { + "epoch": 4.998676198040773, + "grad_norm": 1.1049646139144897, + "learning_rate": 2.64783759929391e-08, + "loss": 0.3078, + "step": 28320 + } + ], + "logging_steps": 10, + "max_steps": 28325, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0162875069185393e+18, + "train_batch_size": 10, + "trial_name": null, + "trial_params": null +}