diff --git "a/checkpoints/checkpoint-34495/trainer_state.json" "b/checkpoints/checkpoint-34495/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/checkpoint-34495/trainer_state.json" @@ -0,0 +1,24482 @@ +{ + "best_metric": 0.7098406553268433, + "best_model_checkpoint": "model/checkpoints/run1-java-codegen/checkpoint-20000", + "epoch": 4.999818820886328, + "eval_steps": 1000, + "global_step": 34495, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014494329093742073, + "grad_norm": 4.100641250610352, + "learning_rate": 4.998550514567329e-05, + "loss": 1.327, + "step": 10 + }, + { + "epoch": 0.0028988658187484146, + "grad_norm": 1.0939269065856934, + "learning_rate": 4.997101029134658e-05, + "loss": 1.1957, + "step": 20 + }, + { + "epoch": 0.004348298728122622, + "grad_norm": 1.4986281394958496, + "learning_rate": 4.995651543701986e-05, + "loss": 1.178, + "step": 30 + }, + { + "epoch": 0.005797731637496829, + "grad_norm": 1.2242189645767212, + "learning_rate": 4.9942020582693147e-05, + "loss": 1.2268, + "step": 40 + }, + { + "epoch": 0.007247164546871037, + "grad_norm": 1.2362180948257446, + "learning_rate": 4.992752572836643e-05, + "loss": 1.0823, + "step": 50 + }, + { + "epoch": 0.008696597456245243, + "grad_norm": 1.1525092124938965, + "learning_rate": 4.9913030874039715e-05, + "loss": 1.1256, + "step": 60 + }, + { + "epoch": 0.010146030365619452, + "grad_norm": 0.8720236420631409, + "learning_rate": 4.9898536019713006e-05, + "loss": 1.1147, + "step": 70 + }, + { + "epoch": 0.011595463274993658, + "grad_norm": 1.3506882190704346, + "learning_rate": 4.988404116538629e-05, + "loss": 1.2389, + "step": 80 + }, + { + "epoch": 0.013044896184367867, + "grad_norm": 1.2381508350372314, + "learning_rate": 4.9869546311059575e-05, + "loss": 1.0555, + "step": 90 + }, + { + "epoch": 0.014494329093742073, + "grad_norm": 3.484632730484009, + "learning_rate": 4.985505145673286e-05, + "loss": 1.1662, + "step": 100 + }, + { + "epoch": 0.015943762003116282, + "grad_norm": 3.0248196125030518, + "learning_rate": 4.9840556602406144e-05, + "loss": 1.1566, + "step": 110 + }, + { + "epoch": 0.017393194912490487, + "grad_norm": 1.1476988792419434, + "learning_rate": 4.9826061748079435e-05, + "loss": 1.0685, + "step": 120 + }, + { + "epoch": 0.018842627821864695, + "grad_norm": 3.148444652557373, + "learning_rate": 4.981156689375272e-05, + "loss": 1.2396, + "step": 130 + }, + { + "epoch": 0.020292060731238903, + "grad_norm": 1.3316197395324707, + "learning_rate": 4.9797072039426004e-05, + "loss": 1.1839, + "step": 140 + }, + { + "epoch": 0.021741493640613112, + "grad_norm": 1.0662773847579956, + "learning_rate": 4.9782577185099295e-05, + "loss": 1.1634, + "step": 150 + }, + { + "epoch": 0.023190926549987317, + "grad_norm": 1.1283774375915527, + "learning_rate": 4.976808233077258e-05, + "loss": 1.114, + "step": 160 + }, + { + "epoch": 0.024640359459361525, + "grad_norm": 3.6623597145080566, + "learning_rate": 4.9753587476445864e-05, + "loss": 1.0846, + "step": 170 + }, + { + "epoch": 0.026089792368735733, + "grad_norm": 1.061867356300354, + "learning_rate": 4.973909262211915e-05, + "loss": 1.0521, + "step": 180 + }, + { + "epoch": 0.02753922527810994, + "grad_norm": 3.880845308303833, + "learning_rate": 4.972459776779244e-05, + "loss": 1.0697, + "step": 190 + }, + { + "epoch": 0.028988658187484147, + "grad_norm": 1.021384835243225, + "learning_rate": 4.9710102913465723e-05, + "loss": 1.0681, + "step": 200 + }, + { + "epoch": 0.030438091096858355, + "grad_norm": 4.2998433113098145, + "learning_rate": 4.969560805913901e-05, + "loss": 1.0457, + "step": 210 + }, + { + "epoch": 0.031887524006232564, + "grad_norm": 0.8809024691581726, + "learning_rate": 4.96811132048123e-05, + "loss": 1.0517, + "step": 220 + }, + { + "epoch": 0.03333695691560677, + "grad_norm": 4.0385847091674805, + "learning_rate": 4.9666618350485577e-05, + "loss": 1.1193, + "step": 230 + }, + { + "epoch": 0.03478638982498097, + "grad_norm": 2.3646724224090576, + "learning_rate": 4.965212349615887e-05, + "loss": 1.0611, + "step": 240 + }, + { + "epoch": 0.03623582273435518, + "grad_norm": 1.0366450548171997, + "learning_rate": 4.963762864183215e-05, + "loss": 1.019, + "step": 250 + }, + { + "epoch": 0.03768525564372939, + "grad_norm": 1.8329641819000244, + "learning_rate": 4.9623133787505436e-05, + "loss": 1.0415, + "step": 260 + }, + { + "epoch": 0.0391346885531036, + "grad_norm": 0.9972900152206421, + "learning_rate": 4.960863893317873e-05, + "loss": 0.9843, + "step": 270 + }, + { + "epoch": 0.04058412146247781, + "grad_norm": 0.8438522815704346, + "learning_rate": 4.959414407885201e-05, + "loss": 1.0857, + "step": 280 + }, + { + "epoch": 0.042033554371852015, + "grad_norm": 2.5039472579956055, + "learning_rate": 4.9579649224525296e-05, + "loss": 1.0238, + "step": 290 + }, + { + "epoch": 0.043482987281226224, + "grad_norm": 1.199118733406067, + "learning_rate": 4.956515437019858e-05, + "loss": 1.0512, + "step": 300 + }, + { + "epoch": 0.044932420190600425, + "grad_norm": 1.1583256721496582, + "learning_rate": 4.9550659515871865e-05, + "loss": 1.1624, + "step": 310 + }, + { + "epoch": 0.04638185309997463, + "grad_norm": 1.0587997436523438, + "learning_rate": 4.9536164661545156e-05, + "loss": 1.0383, + "step": 320 + }, + { + "epoch": 0.04783128600934884, + "grad_norm": 3.125576972961426, + "learning_rate": 4.952166980721844e-05, + "loss": 0.9672, + "step": 330 + }, + { + "epoch": 0.04928071891872305, + "grad_norm": 1.0660935640335083, + "learning_rate": 4.9507174952891725e-05, + "loss": 1.0317, + "step": 340 + }, + { + "epoch": 0.05073015182809726, + "grad_norm": 1.2960877418518066, + "learning_rate": 4.9492680098565016e-05, + "loss": 1.0432, + "step": 350 + }, + { + "epoch": 0.05217958473747147, + "grad_norm": 4.807547092437744, + "learning_rate": 4.9478185244238294e-05, + "loss": 1.1153, + "step": 360 + }, + { + "epoch": 0.05362901764684567, + "grad_norm": 0.9906821250915527, + "learning_rate": 4.9463690389911585e-05, + "loss": 0.9973, + "step": 370 + }, + { + "epoch": 0.05507845055621988, + "grad_norm": 2.1681807041168213, + "learning_rate": 4.944919553558487e-05, + "loss": 1.0127, + "step": 380 + }, + { + "epoch": 0.056527883465594085, + "grad_norm": 1.2156059741973877, + "learning_rate": 4.9434700681258153e-05, + "loss": 1.0341, + "step": 390 + }, + { + "epoch": 0.057977316374968293, + "grad_norm": 2.7744154930114746, + "learning_rate": 4.9420205826931445e-05, + "loss": 0.9894, + "step": 400 + }, + { + "epoch": 0.0594267492843425, + "grad_norm": 0.9824851751327515, + "learning_rate": 4.940571097260473e-05, + "loss": 0.9473, + "step": 410 + }, + { + "epoch": 0.06087618219371671, + "grad_norm": 2.0799906253814697, + "learning_rate": 4.939121611827801e-05, + "loss": 1.0758, + "step": 420 + }, + { + "epoch": 0.06232561510309092, + "grad_norm": 1.282225251197815, + "learning_rate": 4.93767212639513e-05, + "loss": 1.0126, + "step": 430 + }, + { + "epoch": 0.06377504801246513, + "grad_norm": 3.6338846683502197, + "learning_rate": 4.936222640962458e-05, + "loss": 1.0053, + "step": 440 + }, + { + "epoch": 0.06522448092183933, + "grad_norm": 2.238694667816162, + "learning_rate": 4.934773155529787e-05, + "loss": 0.9902, + "step": 450 + }, + { + "epoch": 0.06667391383121354, + "grad_norm": 0.8595529794692993, + "learning_rate": 4.933323670097116e-05, + "loss": 0.9713, + "step": 460 + }, + { + "epoch": 0.06812334674058775, + "grad_norm": 1.2215890884399414, + "learning_rate": 4.931874184664444e-05, + "loss": 1.0144, + "step": 470 + }, + { + "epoch": 0.06957277964996195, + "grad_norm": 1.1760045289993286, + "learning_rate": 4.930424699231773e-05, + "loss": 1.1197, + "step": 480 + }, + { + "epoch": 0.07102221255933616, + "grad_norm": 2.8960888385772705, + "learning_rate": 4.928975213799101e-05, + "loss": 1.0625, + "step": 490 + }, + { + "epoch": 0.07247164546871036, + "grad_norm": 1.0947250127792358, + "learning_rate": 4.92752572836643e-05, + "loss": 1.0667, + "step": 500 + }, + { + "epoch": 0.07392107837808458, + "grad_norm": 0.8826558589935303, + "learning_rate": 4.9260762429337586e-05, + "loss": 1.081, + "step": 510 + }, + { + "epoch": 0.07537051128745878, + "grad_norm": 1.2271273136138916, + "learning_rate": 4.924626757501087e-05, + "loss": 0.9745, + "step": 520 + }, + { + "epoch": 0.076819944196833, + "grad_norm": 0.9710911512374878, + "learning_rate": 4.923177272068416e-05, + "loss": 0.9851, + "step": 530 + }, + { + "epoch": 0.0782693771062072, + "grad_norm": 0.9668734669685364, + "learning_rate": 4.9217277866357446e-05, + "loss": 1.0127, + "step": 540 + }, + { + "epoch": 0.0797188100155814, + "grad_norm": 0.9408063888549805, + "learning_rate": 4.920278301203073e-05, + "loss": 0.8172, + "step": 550 + }, + { + "epoch": 0.08116824292495561, + "grad_norm": 0.99382084608078, + "learning_rate": 4.9188288157704015e-05, + "loss": 0.9216, + "step": 560 + }, + { + "epoch": 0.08261767583432982, + "grad_norm": 1.1238948106765747, + "learning_rate": 4.9173793303377306e-05, + "loss": 1.0077, + "step": 570 + }, + { + "epoch": 0.08406710874370403, + "grad_norm": 1.1847305297851562, + "learning_rate": 4.915929844905059e-05, + "loss": 1.0102, + "step": 580 + }, + { + "epoch": 0.08551654165307823, + "grad_norm": 1.0952173471450806, + "learning_rate": 4.9144803594723874e-05, + "loss": 1.0433, + "step": 590 + }, + { + "epoch": 0.08696597456245245, + "grad_norm": 2.061677932739258, + "learning_rate": 4.9130308740397166e-05, + "loss": 0.9256, + "step": 600 + }, + { + "epoch": 0.08841540747182665, + "grad_norm": 2.83744478225708, + "learning_rate": 4.911581388607045e-05, + "loss": 1.0611, + "step": 610 + }, + { + "epoch": 0.08986484038120085, + "grad_norm": 2.6311850547790527, + "learning_rate": 4.9101319031743734e-05, + "loss": 1.0714, + "step": 620 + }, + { + "epoch": 0.09131427329057507, + "grad_norm": 3.2638437747955322, + "learning_rate": 4.908682417741702e-05, + "loss": 1.0245, + "step": 630 + }, + { + "epoch": 0.09276370619994927, + "grad_norm": 2.02360200881958, + "learning_rate": 4.90723293230903e-05, + "loss": 0.9304, + "step": 640 + }, + { + "epoch": 0.09421313910932348, + "grad_norm": 2.3670814037323, + "learning_rate": 4.9057834468763594e-05, + "loss": 1.0079, + "step": 650 + }, + { + "epoch": 0.09566257201869768, + "grad_norm": 0.9333920478820801, + "learning_rate": 4.904333961443688e-05, + "loss": 0.9725, + "step": 660 + }, + { + "epoch": 0.09711200492807189, + "grad_norm": 1.0716252326965332, + "learning_rate": 4.902884476011016e-05, + "loss": 1.028, + "step": 670 + }, + { + "epoch": 0.0985614378374461, + "grad_norm": 3.501131534576416, + "learning_rate": 4.901434990578345e-05, + "loss": 0.9943, + "step": 680 + }, + { + "epoch": 0.1000108707468203, + "grad_norm": 3.1148428916931152, + "learning_rate": 4.899985505145673e-05, + "loss": 0.9784, + "step": 690 + }, + { + "epoch": 0.10146030365619452, + "grad_norm": 2.3271002769470215, + "learning_rate": 4.898536019713002e-05, + "loss": 0.9682, + "step": 700 + }, + { + "epoch": 0.10290973656556872, + "grad_norm": 4.113440036773682, + "learning_rate": 4.897086534280331e-05, + "loss": 0.9687, + "step": 710 + }, + { + "epoch": 0.10435916947494293, + "grad_norm": 1.9874829053878784, + "learning_rate": 4.895637048847659e-05, + "loss": 1.0132, + "step": 720 + }, + { + "epoch": 0.10580860238431714, + "grad_norm": 1.2082017660140991, + "learning_rate": 4.894187563414988e-05, + "loss": 0.9529, + "step": 730 + }, + { + "epoch": 0.10725803529369134, + "grad_norm": 1.016812801361084, + "learning_rate": 4.892738077982317e-05, + "loss": 0.9425, + "step": 740 + }, + { + "epoch": 0.10870746820306555, + "grad_norm": 1.1642169952392578, + "learning_rate": 4.891288592549645e-05, + "loss": 1.0083, + "step": 750 + }, + { + "epoch": 0.11015690111243975, + "grad_norm": 1.112574815750122, + "learning_rate": 4.8898391071169736e-05, + "loss": 1.0205, + "step": 760 + }, + { + "epoch": 0.11160633402181397, + "grad_norm": 1.2382400035858154, + "learning_rate": 4.888389621684302e-05, + "loss": 1.0995, + "step": 770 + }, + { + "epoch": 0.11305576693118817, + "grad_norm": 1.1367257833480835, + "learning_rate": 4.886940136251631e-05, + "loss": 1.0161, + "step": 780 + }, + { + "epoch": 0.11450519984056239, + "grad_norm": 0.9074719548225403, + "learning_rate": 4.8854906508189596e-05, + "loss": 0.8662, + "step": 790 + }, + { + "epoch": 0.11595463274993659, + "grad_norm": 2.2612428665161133, + "learning_rate": 4.884041165386288e-05, + "loss": 1.0303, + "step": 800 + }, + { + "epoch": 0.11740406565931079, + "grad_norm": 0.9624481797218323, + "learning_rate": 4.8825916799536164e-05, + "loss": 1.0381, + "step": 810 + }, + { + "epoch": 0.118853498568685, + "grad_norm": 1.8881802558898926, + "learning_rate": 4.881142194520945e-05, + "loss": 0.9832, + "step": 820 + }, + { + "epoch": 0.1203029314780592, + "grad_norm": 1.0378659963607788, + "learning_rate": 4.879692709088274e-05, + "loss": 0.898, + "step": 830 + }, + { + "epoch": 0.12175236438743342, + "grad_norm": 2.750006675720215, + "learning_rate": 4.8782432236556024e-05, + "loss": 0.9369, + "step": 840 + }, + { + "epoch": 0.12320179729680762, + "grad_norm": 1.0392038822174072, + "learning_rate": 4.876793738222931e-05, + "loss": 0.949, + "step": 850 + }, + { + "epoch": 0.12465123020618184, + "grad_norm": 1.1465096473693848, + "learning_rate": 4.87534425279026e-05, + "loss": 0.9241, + "step": 860 + }, + { + "epoch": 0.12610066311555604, + "grad_norm": 1.13377845287323, + "learning_rate": 4.8738947673575884e-05, + "loss": 0.9059, + "step": 870 + }, + { + "epoch": 0.12755009602493025, + "grad_norm": 0.9846686720848083, + "learning_rate": 4.872445281924917e-05, + "loss": 0.9339, + "step": 880 + }, + { + "epoch": 0.12899952893430444, + "grad_norm": 1.0960410833358765, + "learning_rate": 4.870995796492245e-05, + "loss": 0.9495, + "step": 890 + }, + { + "epoch": 0.13044896184367866, + "grad_norm": 0.8573481440544128, + "learning_rate": 4.869546311059574e-05, + "loss": 1.0344, + "step": 900 + }, + { + "epoch": 0.13189839475305287, + "grad_norm": 1.264652967453003, + "learning_rate": 4.868096825626903e-05, + "loss": 0.9459, + "step": 910 + }, + { + "epoch": 0.1333478276624271, + "grad_norm": 2.9564871788024902, + "learning_rate": 4.866647340194231e-05, + "loss": 1.0021, + "step": 920 + }, + { + "epoch": 0.13479726057180128, + "grad_norm": 2.6998257637023926, + "learning_rate": 4.8651978547615604e-05, + "loss": 0.9497, + "step": 930 + }, + { + "epoch": 0.1362466934811755, + "grad_norm": 1.1182892322540283, + "learning_rate": 4.863748369328888e-05, + "loss": 0.9073, + "step": 940 + }, + { + "epoch": 0.1376961263905497, + "grad_norm": 3.7214620113372803, + "learning_rate": 4.862298883896217e-05, + "loss": 1.0357, + "step": 950 + }, + { + "epoch": 0.1391455592999239, + "grad_norm": 2.3611812591552734, + "learning_rate": 4.860849398463546e-05, + "loss": 0.9799, + "step": 960 + }, + { + "epoch": 0.1405949922092981, + "grad_norm": 2.17946195602417, + "learning_rate": 4.859399913030874e-05, + "loss": 0.9733, + "step": 970 + }, + { + "epoch": 0.14204442511867232, + "grad_norm": 2.5610668659210205, + "learning_rate": 4.857950427598203e-05, + "loss": 0.9406, + "step": 980 + }, + { + "epoch": 0.14349385802804654, + "grad_norm": 2.09188175201416, + "learning_rate": 4.8565009421655317e-05, + "loss": 0.9563, + "step": 990 + }, + { + "epoch": 0.14494329093742073, + "grad_norm": 2.210683584213257, + "learning_rate": 4.85505145673286e-05, + "loss": 0.9265, + "step": 1000 + }, + { + "epoch": 0.14494329093742073, + "eval_loss": 0.9329102039337158, + "eval_runtime": 670.2535, + "eval_samples_per_second": 51.466, + "eval_steps_per_second": 2.574, + "eval_token_accuracy": 0.00042011859825789107, + "step": 1000 + }, + { + "epoch": 0.14639272384679494, + "grad_norm": 1.2653387784957886, + "learning_rate": 4.8536019713001885e-05, + "loss": 0.9509, + "step": 1010 + }, + { + "epoch": 0.14784215675616916, + "grad_norm": 0.9553664922714233, + "learning_rate": 4.852152485867517e-05, + "loss": 0.982, + "step": 1020 + }, + { + "epoch": 0.14929158966554334, + "grad_norm": 0.8428664803504944, + "learning_rate": 4.850703000434846e-05, + "loss": 0.8896, + "step": 1030 + }, + { + "epoch": 0.15074102257491756, + "grad_norm": 2.9214038848876953, + "learning_rate": 4.8492535150021745e-05, + "loss": 0.8753, + "step": 1040 + }, + { + "epoch": 0.15219045548429178, + "grad_norm": 0.9408260583877563, + "learning_rate": 4.847804029569503e-05, + "loss": 1.047, + "step": 1050 + }, + { + "epoch": 0.153639888393666, + "grad_norm": 1.9852826595306396, + "learning_rate": 4.846354544136832e-05, + "loss": 0.9524, + "step": 1060 + }, + { + "epoch": 0.15508932130304018, + "grad_norm": 3.4830455780029297, + "learning_rate": 4.84490505870416e-05, + "loss": 0.9501, + "step": 1070 + }, + { + "epoch": 0.1565387542124144, + "grad_norm": 2.317713975906372, + "learning_rate": 4.843455573271489e-05, + "loss": 0.8766, + "step": 1080 + }, + { + "epoch": 0.1579881871217886, + "grad_norm": 3.330193281173706, + "learning_rate": 4.8420060878388174e-05, + "loss": 0.9837, + "step": 1090 + }, + { + "epoch": 0.1594376200311628, + "grad_norm": 2.919408082962036, + "learning_rate": 4.840556602406146e-05, + "loss": 0.9576, + "step": 1100 + }, + { + "epoch": 0.160887052940537, + "grad_norm": 2.2638754844665527, + "learning_rate": 4.839107116973475e-05, + "loss": 0.9298, + "step": 1110 + }, + { + "epoch": 0.16233648584991123, + "grad_norm": 1.0544981956481934, + "learning_rate": 4.8376576315408034e-05, + "loss": 0.9947, + "step": 1120 + }, + { + "epoch": 0.16378591875928544, + "grad_norm": 1.3924440145492554, + "learning_rate": 4.836208146108132e-05, + "loss": 0.8487, + "step": 1130 + }, + { + "epoch": 0.16523535166865963, + "grad_norm": 3.719411611557007, + "learning_rate": 4.83475866067546e-05, + "loss": 0.834, + "step": 1140 + }, + { + "epoch": 0.16668478457803385, + "grad_norm": 3.1443965435028076, + "learning_rate": 4.833309175242789e-05, + "loss": 1.0022, + "step": 1150 + }, + { + "epoch": 0.16813421748740806, + "grad_norm": 0.8373526334762573, + "learning_rate": 4.831859689810118e-05, + "loss": 0.9773, + "step": 1160 + }, + { + "epoch": 0.16958365039678225, + "grad_norm": 2.951531171798706, + "learning_rate": 4.830410204377446e-05, + "loss": 0.977, + "step": 1170 + }, + { + "epoch": 0.17103308330615646, + "grad_norm": 2.2842185497283936, + "learning_rate": 4.8289607189447747e-05, + "loss": 0.9501, + "step": 1180 + }, + { + "epoch": 0.17248251621553068, + "grad_norm": 2.8369762897491455, + "learning_rate": 4.827511233512104e-05, + "loss": 0.9134, + "step": 1190 + }, + { + "epoch": 0.1739319491249049, + "grad_norm": 2.4819037914276123, + "learning_rate": 4.8260617480794315e-05, + "loss": 0.9153, + "step": 1200 + }, + { + "epoch": 0.17538138203427908, + "grad_norm": 1.0048160552978516, + "learning_rate": 4.8246122626467606e-05, + "loss": 0.9296, + "step": 1210 + }, + { + "epoch": 0.1768308149436533, + "grad_norm": 3.0000860691070557, + "learning_rate": 4.823162777214089e-05, + "loss": 1.0214, + "step": 1220 + }, + { + "epoch": 0.1782802478530275, + "grad_norm": 2.2292511463165283, + "learning_rate": 4.8217132917814175e-05, + "loss": 0.9293, + "step": 1230 + }, + { + "epoch": 0.1797296807624017, + "grad_norm": 1.6593018770217896, + "learning_rate": 4.8202638063487466e-05, + "loss": 0.936, + "step": 1240 + }, + { + "epoch": 0.18117911367177592, + "grad_norm": 2.7024037837982178, + "learning_rate": 4.818814320916075e-05, + "loss": 0.905, + "step": 1250 + }, + { + "epoch": 0.18262854658115013, + "grad_norm": 0.9400494694709778, + "learning_rate": 4.817364835483404e-05, + "loss": 0.9595, + "step": 1260 + }, + { + "epoch": 0.18407797949052435, + "grad_norm": 1.0066282749176025, + "learning_rate": 4.815915350050732e-05, + "loss": 0.8789, + "step": 1270 + }, + { + "epoch": 0.18552741239989853, + "grad_norm": 1.9175103902816772, + "learning_rate": 4.814465864618061e-05, + "loss": 0.8903, + "step": 1280 + }, + { + "epoch": 0.18697684530927275, + "grad_norm": 1.1838349103927612, + "learning_rate": 4.8130163791853895e-05, + "loss": 0.9542, + "step": 1290 + }, + { + "epoch": 0.18842627821864696, + "grad_norm": 4.920762062072754, + "learning_rate": 4.811566893752718e-05, + "loss": 0.971, + "step": 1300 + }, + { + "epoch": 0.18987571112802115, + "grad_norm": 1.5060116052627563, + "learning_rate": 4.810117408320047e-05, + "loss": 0.8748, + "step": 1310 + }, + { + "epoch": 0.19132514403739537, + "grad_norm": 2.7574806213378906, + "learning_rate": 4.8086679228873755e-05, + "loss": 0.8522, + "step": 1320 + }, + { + "epoch": 0.19277457694676958, + "grad_norm": 3.3912112712860107, + "learning_rate": 4.807218437454704e-05, + "loss": 0.9123, + "step": 1330 + }, + { + "epoch": 0.19422400985614377, + "grad_norm": 0.9661535620689392, + "learning_rate": 4.805768952022032e-05, + "loss": 0.9062, + "step": 1340 + }, + { + "epoch": 0.19567344276551799, + "grad_norm": 2.8521437644958496, + "learning_rate": 4.804319466589361e-05, + "loss": 0.9922, + "step": 1350 + }, + { + "epoch": 0.1971228756748922, + "grad_norm": 2.469609498977661, + "learning_rate": 4.80286998115669e-05, + "loss": 1.0353, + "step": 1360 + }, + { + "epoch": 0.19857230858426642, + "grad_norm": 2.4669108390808105, + "learning_rate": 4.801420495724018e-05, + "loss": 0.9082, + "step": 1370 + }, + { + "epoch": 0.2000217414936406, + "grad_norm": 2.641343593597412, + "learning_rate": 4.799971010291347e-05, + "loss": 0.9272, + "step": 1380 + }, + { + "epoch": 0.20147117440301482, + "grad_norm": 1.0589393377304077, + "learning_rate": 4.798521524858676e-05, + "loss": 0.9126, + "step": 1390 + }, + { + "epoch": 0.20292060731238903, + "grad_norm": 4.2109880447387695, + "learning_rate": 4.7970720394260036e-05, + "loss": 1.0255, + "step": 1400 + }, + { + "epoch": 0.20437004022176322, + "grad_norm": 2.466830015182495, + "learning_rate": 4.795622553993333e-05, + "loss": 0.9984, + "step": 1410 + }, + { + "epoch": 0.20581947313113744, + "grad_norm": 0.8373948931694031, + "learning_rate": 4.794173068560661e-05, + "loss": 0.8425, + "step": 1420 + }, + { + "epoch": 0.20726890604051165, + "grad_norm": 1.0101238489151, + "learning_rate": 4.7927235831279896e-05, + "loss": 0.897, + "step": 1430 + }, + { + "epoch": 0.20871833894988587, + "grad_norm": 2.0855133533477783, + "learning_rate": 4.791274097695319e-05, + "loss": 0.9181, + "step": 1440 + }, + { + "epoch": 0.21016777185926006, + "grad_norm": 0.9278097152709961, + "learning_rate": 4.789824612262647e-05, + "loss": 0.9341, + "step": 1450 + }, + { + "epoch": 0.21161720476863427, + "grad_norm": 1.599377989768982, + "learning_rate": 4.7883751268299756e-05, + "loss": 0.8105, + "step": 1460 + }, + { + "epoch": 0.21306663767800849, + "grad_norm": 1.1618376970291138, + "learning_rate": 4.786925641397304e-05, + "loss": 0.913, + "step": 1470 + }, + { + "epoch": 0.21451607058738267, + "grad_norm": 1.2751816511154175, + "learning_rate": 4.7854761559646325e-05, + "loss": 0.8783, + "step": 1480 + }, + { + "epoch": 0.2159655034967569, + "grad_norm": 0.7834726572036743, + "learning_rate": 4.7840266705319616e-05, + "loss": 0.8906, + "step": 1490 + }, + { + "epoch": 0.2174149364061311, + "grad_norm": 0.9681240320205688, + "learning_rate": 4.78257718509929e-05, + "loss": 1.0063, + "step": 1500 + }, + { + "epoch": 0.21886436931550532, + "grad_norm": 2.6840054988861084, + "learning_rate": 4.7811276996666185e-05, + "loss": 0.9857, + "step": 1510 + }, + { + "epoch": 0.2203138022248795, + "grad_norm": 0.952912449836731, + "learning_rate": 4.779678214233947e-05, + "loss": 1.0314, + "step": 1520 + }, + { + "epoch": 0.22176323513425372, + "grad_norm": 2.4693140983581543, + "learning_rate": 4.778228728801275e-05, + "loss": 0.9979, + "step": 1530 + }, + { + "epoch": 0.22321266804362794, + "grad_norm": 2.79858136177063, + "learning_rate": 4.7767792433686044e-05, + "loss": 0.9388, + "step": 1540 + }, + { + "epoch": 0.22466210095300213, + "grad_norm": 2.951786994934082, + "learning_rate": 4.775329757935933e-05, + "loss": 0.9021, + "step": 1550 + }, + { + "epoch": 0.22611153386237634, + "grad_norm": 1.213524341583252, + "learning_rate": 4.773880272503261e-05, + "loss": 1.0483, + "step": 1560 + }, + { + "epoch": 0.22756096677175056, + "grad_norm": 3.919523000717163, + "learning_rate": 4.7724307870705904e-05, + "loss": 1.0111, + "step": 1570 + }, + { + "epoch": 0.22901039968112477, + "grad_norm": 1.069823145866394, + "learning_rate": 4.770981301637919e-05, + "loss": 0.8561, + "step": 1580 + }, + { + "epoch": 0.23045983259049896, + "grad_norm": 1.9359710216522217, + "learning_rate": 4.769531816205247e-05, + "loss": 0.9442, + "step": 1590 + }, + { + "epoch": 0.23190926549987317, + "grad_norm": 0.9056136608123779, + "learning_rate": 4.768082330772576e-05, + "loss": 0.8722, + "step": 1600 + }, + { + "epoch": 0.2333586984092474, + "grad_norm": 1.7711377143859863, + "learning_rate": 4.766632845339904e-05, + "loss": 0.8642, + "step": 1610 + }, + { + "epoch": 0.23480813131862158, + "grad_norm": 0.7478885650634766, + "learning_rate": 4.765183359907233e-05, + "loss": 0.9124, + "step": 1620 + }, + { + "epoch": 0.2362575642279958, + "grad_norm": 0.8888253569602966, + "learning_rate": 4.763733874474562e-05, + "loss": 0.9537, + "step": 1630 + }, + { + "epoch": 0.23770699713737, + "grad_norm": 0.7752460837364197, + "learning_rate": 4.762284389041891e-05, + "loss": 0.8851, + "step": 1640 + }, + { + "epoch": 0.23915643004674422, + "grad_norm": 1.0801783800125122, + "learning_rate": 4.7608349036092186e-05, + "loss": 0.8669, + "step": 1650 + }, + { + "epoch": 0.2406058629561184, + "grad_norm": 1.8434480428695679, + "learning_rate": 4.759385418176548e-05, + "loss": 1.0102, + "step": 1660 + }, + { + "epoch": 0.24205529586549263, + "grad_norm": 2.921001434326172, + "learning_rate": 4.757935932743876e-05, + "loss": 0.8252, + "step": 1670 + }, + { + "epoch": 0.24350472877486684, + "grad_norm": 1.2697440385818481, + "learning_rate": 4.7564864473112046e-05, + "loss": 0.851, + "step": 1680 + }, + { + "epoch": 0.24495416168424103, + "grad_norm": 2.131409168243408, + "learning_rate": 4.755036961878534e-05, + "loss": 0.9035, + "step": 1690 + }, + { + "epoch": 0.24640359459361524, + "grad_norm": 2.086275339126587, + "learning_rate": 4.753587476445862e-05, + "loss": 0.8721, + "step": 1700 + }, + { + "epoch": 0.24785302750298946, + "grad_norm": 1.0655896663665771, + "learning_rate": 4.7521379910131906e-05, + "loss": 0.9176, + "step": 1710 + }, + { + "epoch": 0.24930246041236367, + "grad_norm": 0.9161379933357239, + "learning_rate": 4.750688505580519e-05, + "loss": 0.8358, + "step": 1720 + }, + { + "epoch": 0.25075189332173786, + "grad_norm": 1.0103297233581543, + "learning_rate": 4.7492390201478474e-05, + "loss": 0.9097, + "step": 1730 + }, + { + "epoch": 0.2522013262311121, + "grad_norm": 0.9535181522369385, + "learning_rate": 4.7477895347151765e-05, + "loss": 0.9068, + "step": 1740 + }, + { + "epoch": 0.2536507591404863, + "grad_norm": 3.1974833011627197, + "learning_rate": 4.746340049282505e-05, + "loss": 0.9513, + "step": 1750 + }, + { + "epoch": 0.2551001920498605, + "grad_norm": 3.5244507789611816, + "learning_rate": 4.7448905638498334e-05, + "loss": 0.9102, + "step": 1760 + }, + { + "epoch": 0.2565496249592347, + "grad_norm": 0.7904402017593384, + "learning_rate": 4.7434410784171625e-05, + "loss": 0.9212, + "step": 1770 + }, + { + "epoch": 0.2579990578686089, + "grad_norm": 2.1184966564178467, + "learning_rate": 4.74199159298449e-05, + "loss": 0.8912, + "step": 1780 + }, + { + "epoch": 0.2594484907779831, + "grad_norm": 4.049259185791016, + "learning_rate": 4.7405421075518194e-05, + "loss": 1.0043, + "step": 1790 + }, + { + "epoch": 0.2608979236873573, + "grad_norm": 0.9909255504608154, + "learning_rate": 4.739092622119148e-05, + "loss": 0.967, + "step": 1800 + }, + { + "epoch": 0.26234735659673153, + "grad_norm": 0.9145278930664062, + "learning_rate": 4.737643136686476e-05, + "loss": 0.8499, + "step": 1810 + }, + { + "epoch": 0.26379678950610574, + "grad_norm": 4.189142227172852, + "learning_rate": 4.7361936512538054e-05, + "loss": 0.9368, + "step": 1820 + }, + { + "epoch": 0.26524622241547996, + "grad_norm": 3.4056777954101562, + "learning_rate": 4.734744165821134e-05, + "loss": 0.8985, + "step": 1830 + }, + { + "epoch": 0.2666956553248542, + "grad_norm": 0.9040047526359558, + "learning_rate": 4.733294680388462e-05, + "loss": 0.8604, + "step": 1840 + }, + { + "epoch": 0.26814508823422833, + "grad_norm": 2.2302513122558594, + "learning_rate": 4.731845194955791e-05, + "loss": 0.951, + "step": 1850 + }, + { + "epoch": 0.26959452114360255, + "grad_norm": 1.6290664672851562, + "learning_rate": 4.730395709523119e-05, + "loss": 0.8684, + "step": 1860 + }, + { + "epoch": 0.27104395405297677, + "grad_norm": 3.178881883621216, + "learning_rate": 4.728946224090448e-05, + "loss": 0.939, + "step": 1870 + }, + { + "epoch": 0.272493386962351, + "grad_norm": 0.8622704744338989, + "learning_rate": 4.727496738657777e-05, + "loss": 0.907, + "step": 1880 + }, + { + "epoch": 0.2739428198717252, + "grad_norm": 0.8100652694702148, + "learning_rate": 4.726047253225105e-05, + "loss": 0.9619, + "step": 1890 + }, + { + "epoch": 0.2753922527810994, + "grad_norm": 3.0906217098236084, + "learning_rate": 4.724597767792434e-05, + "loss": 0.9168, + "step": 1900 + }, + { + "epoch": 0.2768416856904736, + "grad_norm": 0.9899945259094238, + "learning_rate": 4.723148282359762e-05, + "loss": 0.9018, + "step": 1910 + }, + { + "epoch": 0.2782911185998478, + "grad_norm": 0.9530486464500427, + "learning_rate": 4.721698796927091e-05, + "loss": 0.9945, + "step": 1920 + }, + { + "epoch": 0.279740551509222, + "grad_norm": 0.9075252413749695, + "learning_rate": 4.7202493114944195e-05, + "loss": 0.8555, + "step": 1930 + }, + { + "epoch": 0.2811899844185962, + "grad_norm": 2.1643764972686768, + "learning_rate": 4.718799826061748e-05, + "loss": 0.9005, + "step": 1940 + }, + { + "epoch": 0.28263941732797043, + "grad_norm": 1.0276823043823242, + "learning_rate": 4.717350340629077e-05, + "loss": 1.0093, + "step": 1950 + }, + { + "epoch": 0.28408885023734465, + "grad_norm": 3.145967483520508, + "learning_rate": 4.7159008551964055e-05, + "loss": 0.8591, + "step": 1960 + }, + { + "epoch": 0.28553828314671886, + "grad_norm": 0.9500845074653625, + "learning_rate": 4.7144513697637346e-05, + "loss": 0.9213, + "step": 1970 + }, + { + "epoch": 0.2869877160560931, + "grad_norm": 0.709665834903717, + "learning_rate": 4.7130018843310624e-05, + "loss": 0.8349, + "step": 1980 + }, + { + "epoch": 0.28843714896546724, + "grad_norm": 1.674027442932129, + "learning_rate": 4.711552398898391e-05, + "loss": 0.9234, + "step": 1990 + }, + { + "epoch": 0.28988658187484145, + "grad_norm": 1.1925066709518433, + "learning_rate": 4.71010291346572e-05, + "loss": 0.9271, + "step": 2000 + }, + { + "epoch": 0.28988658187484145, + "eval_loss": 0.8844457864761353, + "eval_runtime": 669.6132, + "eval_samples_per_second": 51.515, + "eval_steps_per_second": 2.576, + "eval_token_accuracy": 0.0004413497065925165, + "step": 2000 + }, + { + "epoch": 0.29133601478421567, + "grad_norm": 1.0759925842285156, + "learning_rate": 4.7086534280330484e-05, + "loss": 0.9389, + "step": 2010 + }, + { + "epoch": 0.2927854476935899, + "grad_norm": 3.2743873596191406, + "learning_rate": 4.7072039426003775e-05, + "loss": 0.909, + "step": 2020 + }, + { + "epoch": 0.2942348806029641, + "grad_norm": 2.5346035957336426, + "learning_rate": 4.705754457167706e-05, + "loss": 0.9818, + "step": 2030 + }, + { + "epoch": 0.2956843135123383, + "grad_norm": 0.8546256422996521, + "learning_rate": 4.7043049717350344e-05, + "loss": 0.9858, + "step": 2040 + }, + { + "epoch": 0.29713374642171253, + "grad_norm": 1.074825644493103, + "learning_rate": 4.702855486302363e-05, + "loss": 0.8267, + "step": 2050 + }, + { + "epoch": 0.2985831793310867, + "grad_norm": 0.9701589941978455, + "learning_rate": 4.701406000869691e-05, + "loss": 0.8692, + "step": 2060 + }, + { + "epoch": 0.3000326122404609, + "grad_norm": 0.9195166826248169, + "learning_rate": 4.6999565154370204e-05, + "loss": 0.9654, + "step": 2070 + }, + { + "epoch": 0.3014820451498351, + "grad_norm": 0.9517616629600525, + "learning_rate": 4.698507030004349e-05, + "loss": 0.8702, + "step": 2080 + }, + { + "epoch": 0.30293147805920934, + "grad_norm": 0.7250176072120667, + "learning_rate": 4.697057544571677e-05, + "loss": 0.9022, + "step": 2090 + }, + { + "epoch": 0.30438091096858355, + "grad_norm": 0.8878209590911865, + "learning_rate": 4.6956080591390063e-05, + "loss": 0.961, + "step": 2100 + }, + { + "epoch": 0.30583034387795777, + "grad_norm": 0.8246135115623474, + "learning_rate": 4.694158573706334e-05, + "loss": 0.8893, + "step": 2110 + }, + { + "epoch": 0.307279776787332, + "grad_norm": 2.2007503509521484, + "learning_rate": 4.692709088273663e-05, + "loss": 0.9578, + "step": 2120 + }, + { + "epoch": 0.30872920969670614, + "grad_norm": 0.8744785189628601, + "learning_rate": 4.6912596028409916e-05, + "loss": 0.8567, + "step": 2130 + }, + { + "epoch": 0.31017864260608036, + "grad_norm": 0.8183798789978027, + "learning_rate": 4.68981011740832e-05, + "loss": 0.9111, + "step": 2140 + }, + { + "epoch": 0.3116280755154546, + "grad_norm": 3.1583478450775146, + "learning_rate": 4.688360631975649e-05, + "loss": 0.9575, + "step": 2150 + }, + { + "epoch": 0.3130775084248288, + "grad_norm": 2.7086269855499268, + "learning_rate": 4.6869111465429776e-05, + "loss": 0.9788, + "step": 2160 + }, + { + "epoch": 0.314526941334203, + "grad_norm": 0.8873903751373291, + "learning_rate": 4.685461661110306e-05, + "loss": 0.8857, + "step": 2170 + }, + { + "epoch": 0.3159763742435772, + "grad_norm": 1.2673507928848267, + "learning_rate": 4.6840121756776345e-05, + "loss": 0.8733, + "step": 2180 + }, + { + "epoch": 0.31742580715295143, + "grad_norm": 2.983605146408081, + "learning_rate": 4.682562690244963e-05, + "loss": 1.0298, + "step": 2190 + }, + { + "epoch": 0.3188752400623256, + "grad_norm": 0.7594058513641357, + "learning_rate": 4.681113204812292e-05, + "loss": 0.912, + "step": 2200 + }, + { + "epoch": 0.3203246729716998, + "grad_norm": 2.4812521934509277, + "learning_rate": 4.6796637193796205e-05, + "loss": 0.9988, + "step": 2210 + }, + { + "epoch": 0.321774105881074, + "grad_norm": 2.502842903137207, + "learning_rate": 4.678214233946949e-05, + "loss": 0.8589, + "step": 2220 + }, + { + "epoch": 0.32322353879044824, + "grad_norm": 1.1463567018508911, + "learning_rate": 4.676764748514278e-05, + "loss": 0.9046, + "step": 2230 + }, + { + "epoch": 0.32467297169982245, + "grad_norm": 1.1552772521972656, + "learning_rate": 4.675315263081606e-05, + "loss": 0.8777, + "step": 2240 + }, + { + "epoch": 0.32612240460919667, + "grad_norm": 0.7134237289428711, + "learning_rate": 4.673865777648935e-05, + "loss": 0.9582, + "step": 2250 + }, + { + "epoch": 0.3275718375185709, + "grad_norm": 1.2815346717834473, + "learning_rate": 4.6724162922162633e-05, + "loss": 0.9574, + "step": 2260 + }, + { + "epoch": 0.32902127042794505, + "grad_norm": 2.6346404552459717, + "learning_rate": 4.670966806783592e-05, + "loss": 0.8783, + "step": 2270 + }, + { + "epoch": 0.33047070333731926, + "grad_norm": 2.8299264907836914, + "learning_rate": 4.669517321350921e-05, + "loss": 0.8363, + "step": 2280 + }, + { + "epoch": 0.3319201362466935, + "grad_norm": 1.1581934690475464, + "learning_rate": 4.668067835918249e-05, + "loss": 0.8875, + "step": 2290 + }, + { + "epoch": 0.3333695691560677, + "grad_norm": 0.801040768623352, + "learning_rate": 4.666618350485578e-05, + "loss": 0.9725, + "step": 2300 + }, + { + "epoch": 0.3348190020654419, + "grad_norm": 4.302339553833008, + "learning_rate": 4.665168865052906e-05, + "loss": 0.8756, + "step": 2310 + }, + { + "epoch": 0.3362684349748161, + "grad_norm": 1.6098670959472656, + "learning_rate": 4.6637193796202346e-05, + "loss": 0.9409, + "step": 2320 + }, + { + "epoch": 0.33771786788419034, + "grad_norm": 0.9708408117294312, + "learning_rate": 4.662269894187564e-05, + "loss": 0.8269, + "step": 2330 + }, + { + "epoch": 0.3391673007935645, + "grad_norm": 2.4792208671569824, + "learning_rate": 4.660820408754892e-05, + "loss": 0.7968, + "step": 2340 + }, + { + "epoch": 0.3406167337029387, + "grad_norm": 1.9270927906036377, + "learning_rate": 4.659370923322221e-05, + "loss": 0.8857, + "step": 2350 + }, + { + "epoch": 0.3420661666123129, + "grad_norm": 0.823274552822113, + "learning_rate": 4.65792143788955e-05, + "loss": 0.9024, + "step": 2360 + }, + { + "epoch": 0.34351559952168714, + "grad_norm": 1.060691475868225, + "learning_rate": 4.656471952456878e-05, + "loss": 0.8604, + "step": 2370 + }, + { + "epoch": 0.34496503243106136, + "grad_norm": 0.875577986240387, + "learning_rate": 4.6550224670242066e-05, + "loss": 0.8587, + "step": 2380 + }, + { + "epoch": 0.3464144653404356, + "grad_norm": 1.1254234313964844, + "learning_rate": 4.653572981591535e-05, + "loss": 0.841, + "step": 2390 + }, + { + "epoch": 0.3478638982498098, + "grad_norm": 0.9796624779701233, + "learning_rate": 4.652123496158864e-05, + "loss": 0.8953, + "step": 2400 + }, + { + "epoch": 0.34931333115918395, + "grad_norm": 1.004135012626648, + "learning_rate": 4.6506740107261926e-05, + "loss": 0.9329, + "step": 2410 + }, + { + "epoch": 0.35076276406855816, + "grad_norm": 1.0762351751327515, + "learning_rate": 4.649224525293521e-05, + "loss": 0.8285, + "step": 2420 + }, + { + "epoch": 0.3522121969779324, + "grad_norm": 2.8664536476135254, + "learning_rate": 4.6477750398608495e-05, + "loss": 0.894, + "step": 2430 + }, + { + "epoch": 0.3536616298873066, + "grad_norm": 2.1905500888824463, + "learning_rate": 4.646325554428178e-05, + "loss": 1.0367, + "step": 2440 + }, + { + "epoch": 0.3551110627966808, + "grad_norm": 2.4113802909851074, + "learning_rate": 4.644876068995507e-05, + "loss": 0.915, + "step": 2450 + }, + { + "epoch": 0.356560495706055, + "grad_norm": 2.86832594871521, + "learning_rate": 4.6434265835628355e-05, + "loss": 0.9524, + "step": 2460 + }, + { + "epoch": 0.35800992861542924, + "grad_norm": 2.5679378509521484, + "learning_rate": 4.641977098130164e-05, + "loss": 0.8456, + "step": 2470 + }, + { + "epoch": 0.3594593615248034, + "grad_norm": 2.207540988922119, + "learning_rate": 4.640527612697493e-05, + "loss": 0.9101, + "step": 2480 + }, + { + "epoch": 0.3609087944341776, + "grad_norm": 0.9119837880134583, + "learning_rate": 4.639078127264821e-05, + "loss": 0.8385, + "step": 2490 + }, + { + "epoch": 0.36235822734355183, + "grad_norm": 2.577519655227661, + "learning_rate": 4.63762864183215e-05, + "loss": 0.9289, + "step": 2500 + }, + { + "epoch": 0.36380766025292605, + "grad_norm": 0.8448528051376343, + "learning_rate": 4.636179156399478e-05, + "loss": 0.8761, + "step": 2510 + }, + { + "epoch": 0.36525709316230026, + "grad_norm": 1.085668683052063, + "learning_rate": 4.634729670966807e-05, + "loss": 0.9201, + "step": 2520 + }, + { + "epoch": 0.3667065260716745, + "grad_norm": 3.674574851989746, + "learning_rate": 4.633280185534136e-05, + "loss": 0.8622, + "step": 2530 + }, + { + "epoch": 0.3681559589810487, + "grad_norm": 0.8500022292137146, + "learning_rate": 4.631830700101464e-05, + "loss": 0.8821, + "step": 2540 + }, + { + "epoch": 0.36960539189042285, + "grad_norm": 0.8118327856063843, + "learning_rate": 4.630381214668793e-05, + "loss": 0.8382, + "step": 2550 + }, + { + "epoch": 0.37105482479979707, + "grad_norm": 0.8166295289993286, + "learning_rate": 4.628931729236121e-05, + "loss": 0.8343, + "step": 2560 + }, + { + "epoch": 0.3725042577091713, + "grad_norm": 0.8547616004943848, + "learning_rate": 4.6274822438034496e-05, + "loss": 0.8319, + "step": 2570 + }, + { + "epoch": 0.3739536906185455, + "grad_norm": 2.3333303928375244, + "learning_rate": 4.626032758370779e-05, + "loss": 0.9359, + "step": 2580 + }, + { + "epoch": 0.3754031235279197, + "grad_norm": 2.314495086669922, + "learning_rate": 4.624583272938107e-05, + "loss": 0.9749, + "step": 2590 + }, + { + "epoch": 0.37685255643729393, + "grad_norm": 3.219776153564453, + "learning_rate": 4.6231337875054356e-05, + "loss": 0.8707, + "step": 2600 + }, + { + "epoch": 0.37830198934666814, + "grad_norm": 1.0046021938323975, + "learning_rate": 4.621684302072765e-05, + "loss": 0.963, + "step": 2610 + }, + { + "epoch": 0.3797514222560423, + "grad_norm": 3.0080223083496094, + "learning_rate": 4.6202348166400925e-05, + "loss": 0.7975, + "step": 2620 + }, + { + "epoch": 0.3812008551654165, + "grad_norm": 0.7294064164161682, + "learning_rate": 4.6187853312074216e-05, + "loss": 0.9354, + "step": 2630 + }, + { + "epoch": 0.38265028807479073, + "grad_norm": 2.269174575805664, + "learning_rate": 4.61733584577475e-05, + "loss": 0.952, + "step": 2640 + }, + { + "epoch": 0.38409972098416495, + "grad_norm": 2.4099864959716797, + "learning_rate": 4.6158863603420784e-05, + "loss": 0.9258, + "step": 2650 + }, + { + "epoch": 0.38554915389353916, + "grad_norm": 0.9502733945846558, + "learning_rate": 4.6144368749094076e-05, + "loss": 0.9672, + "step": 2660 + }, + { + "epoch": 0.3869985868029134, + "grad_norm": 2.0401251316070557, + "learning_rate": 4.612987389476736e-05, + "loss": 0.8961, + "step": 2670 + }, + { + "epoch": 0.38844801971228754, + "grad_norm": 0.6880651116371155, + "learning_rate": 4.611537904044065e-05, + "loss": 0.8395, + "step": 2680 + }, + { + "epoch": 0.38989745262166176, + "grad_norm": 2.322988986968994, + "learning_rate": 4.610088418611393e-05, + "loss": 0.9236, + "step": 2690 + }, + { + "epoch": 0.39134688553103597, + "grad_norm": 2.6459531784057617, + "learning_rate": 4.608638933178721e-05, + "loss": 0.9201, + "step": 2700 + }, + { + "epoch": 0.3927963184404102, + "grad_norm": 3.0082294940948486, + "learning_rate": 4.6071894477460504e-05, + "loss": 0.9289, + "step": 2710 + }, + { + "epoch": 0.3942457513497844, + "grad_norm": 2.3234775066375732, + "learning_rate": 4.605739962313379e-05, + "loss": 0.8907, + "step": 2720 + }, + { + "epoch": 0.3956951842591586, + "grad_norm": 2.548450469970703, + "learning_rate": 4.604290476880708e-05, + "loss": 0.8939, + "step": 2730 + }, + { + "epoch": 0.39714461716853283, + "grad_norm": 0.8990277051925659, + "learning_rate": 4.6028409914480364e-05, + "loss": 0.8258, + "step": 2740 + }, + { + "epoch": 0.398594050077907, + "grad_norm": 2.6693661212921143, + "learning_rate": 4.601391506015365e-05, + "loss": 0.8105, + "step": 2750 + }, + { + "epoch": 0.4000434829872812, + "grad_norm": 1.8534784317016602, + "learning_rate": 4.599942020582693e-05, + "loss": 0.8123, + "step": 2760 + }, + { + "epoch": 0.4014929158966554, + "grad_norm": 0.9559533596038818, + "learning_rate": 4.598492535150022e-05, + "loss": 0.8422, + "step": 2770 + }, + { + "epoch": 0.40294234880602964, + "grad_norm": 3.0623579025268555, + "learning_rate": 4.597043049717351e-05, + "loss": 0.9334, + "step": 2780 + }, + { + "epoch": 0.40439178171540385, + "grad_norm": 2.8755178451538086, + "learning_rate": 4.595593564284679e-05, + "loss": 0.8892, + "step": 2790 + }, + { + "epoch": 0.40584121462477807, + "grad_norm": 1.9824632406234741, + "learning_rate": 4.594144078852008e-05, + "loss": 0.8169, + "step": 2800 + }, + { + "epoch": 0.4072906475341523, + "grad_norm": 2.5154964923858643, + "learning_rate": 4.592694593419337e-05, + "loss": 0.8304, + "step": 2810 + }, + { + "epoch": 0.40874008044352644, + "grad_norm": 1.6365853548049927, + "learning_rate": 4.5912451079866646e-05, + "loss": 0.8779, + "step": 2820 + }, + { + "epoch": 0.41018951335290066, + "grad_norm": 0.7808021903038025, + "learning_rate": 4.589795622553994e-05, + "loss": 0.8335, + "step": 2830 + }, + { + "epoch": 0.4116389462622749, + "grad_norm": 0.9925370812416077, + "learning_rate": 4.588346137121322e-05, + "loss": 0.7861, + "step": 2840 + }, + { + "epoch": 0.4130883791716491, + "grad_norm": 2.8942930698394775, + "learning_rate": 4.5868966516886506e-05, + "loss": 0.9344, + "step": 2850 + }, + { + "epoch": 0.4145378120810233, + "grad_norm": 1.0374455451965332, + "learning_rate": 4.58544716625598e-05, + "loss": 0.8743, + "step": 2860 + }, + { + "epoch": 0.4159872449903975, + "grad_norm": 1.9524415731430054, + "learning_rate": 4.583997680823308e-05, + "loss": 0.8777, + "step": 2870 + }, + { + "epoch": 0.41743667789977174, + "grad_norm": 0.8949989080429077, + "learning_rate": 4.5825481953906365e-05, + "loss": 0.8909, + "step": 2880 + }, + { + "epoch": 0.4188861108091459, + "grad_norm": 2.0267937183380127, + "learning_rate": 4.581098709957965e-05, + "loss": 0.9716, + "step": 2890 + }, + { + "epoch": 0.4203355437185201, + "grad_norm": 0.943326473236084, + "learning_rate": 4.5796492245252934e-05, + "loss": 0.9009, + "step": 2900 + }, + { + "epoch": 0.4217849766278943, + "grad_norm": 0.7954492568969727, + "learning_rate": 4.5781997390926225e-05, + "loss": 0.9105, + "step": 2910 + }, + { + "epoch": 0.42323440953726854, + "grad_norm": 4.5275349617004395, + "learning_rate": 4.576750253659951e-05, + "loss": 0.8551, + "step": 2920 + }, + { + "epoch": 0.42468384244664276, + "grad_norm": 0.8888694047927856, + "learning_rate": 4.5753007682272794e-05, + "loss": 0.7458, + "step": 2930 + }, + { + "epoch": 0.42613327535601697, + "grad_norm": 2.544055938720703, + "learning_rate": 4.5738512827946085e-05, + "loss": 0.9631, + "step": 2940 + }, + { + "epoch": 0.4275827082653912, + "grad_norm": 0.7478300929069519, + "learning_rate": 4.572401797361936e-05, + "loss": 0.8596, + "step": 2950 + }, + { + "epoch": 0.42903214117476535, + "grad_norm": 3.5203044414520264, + "learning_rate": 4.5709523119292654e-05, + "loss": 0.8521, + "step": 2960 + }, + { + "epoch": 0.43048157408413956, + "grad_norm": 0.8133640289306641, + "learning_rate": 4.569502826496594e-05, + "loss": 0.8216, + "step": 2970 + }, + { + "epoch": 0.4319310069935138, + "grad_norm": 1.3965954780578613, + "learning_rate": 4.568053341063922e-05, + "loss": 0.8722, + "step": 2980 + }, + { + "epoch": 0.433380439902888, + "grad_norm": 0.6984473466873169, + "learning_rate": 4.5666038556312514e-05, + "loss": 0.9287, + "step": 2990 + }, + { + "epoch": 0.4348298728122622, + "grad_norm": 0.7165631055831909, + "learning_rate": 4.56515437019858e-05, + "loss": 0.881, + "step": 3000 + }, + { + "epoch": 0.4348298728122622, + "eval_loss": 0.8530751466751099, + "eval_runtime": 671.8627, + "eval_samples_per_second": 51.342, + "eval_steps_per_second": 2.567, + "eval_token_accuracy": 0.00042285290766462313, + "step": 3000 + }, + { + "epoch": 0.4362793057216364, + "grad_norm": 0.9275263547897339, + "learning_rate": 4.563704884765908e-05, + "loss": 0.9308, + "step": 3010 + }, + { + "epoch": 0.43772873863101064, + "grad_norm": 0.8296828269958496, + "learning_rate": 4.562255399333237e-05, + "loss": 0.9581, + "step": 3020 + }, + { + "epoch": 0.4391781715403848, + "grad_norm": 0.8571820855140686, + "learning_rate": 4.560805913900565e-05, + "loss": 0.8346, + "step": 3030 + }, + { + "epoch": 0.440627604449759, + "grad_norm": 0.910743772983551, + "learning_rate": 4.559356428467894e-05, + "loss": 0.9596, + "step": 3040 + }, + { + "epoch": 0.44207703735913323, + "grad_norm": 1.1356425285339355, + "learning_rate": 4.557906943035223e-05, + "loss": 0.8551, + "step": 3050 + }, + { + "epoch": 0.44352647026850744, + "grad_norm": 0.9998161792755127, + "learning_rate": 4.556457457602552e-05, + "loss": 0.8586, + "step": 3060 + }, + { + "epoch": 0.44497590317788166, + "grad_norm": 1.193593978881836, + "learning_rate": 4.55500797216988e-05, + "loss": 0.877, + "step": 3070 + }, + { + "epoch": 0.4464253360872559, + "grad_norm": 0.7462995052337646, + "learning_rate": 4.553558486737208e-05, + "loss": 0.8667, + "step": 3080 + }, + { + "epoch": 0.4478747689966301, + "grad_norm": 0.9373787045478821, + "learning_rate": 4.552109001304537e-05, + "loss": 0.8609, + "step": 3090 + }, + { + "epoch": 0.44932420190600425, + "grad_norm": 3.4775943756103516, + "learning_rate": 4.5506595158718655e-05, + "loss": 0.9172, + "step": 3100 + }, + { + "epoch": 0.45077363481537847, + "grad_norm": 1.0909534692764282, + "learning_rate": 4.5492100304391946e-05, + "loss": 0.9085, + "step": 3110 + }, + { + "epoch": 0.4522230677247527, + "grad_norm": 2.1074111461639404, + "learning_rate": 4.547760545006523e-05, + "loss": 0.8658, + "step": 3120 + }, + { + "epoch": 0.4536725006341269, + "grad_norm": 0.9462234377861023, + "learning_rate": 4.5463110595738515e-05, + "loss": 0.7572, + "step": 3130 + }, + { + "epoch": 0.4551219335435011, + "grad_norm": 1.8323560953140259, + "learning_rate": 4.54486157414118e-05, + "loss": 0.8097, + "step": 3140 + }, + { + "epoch": 0.4565713664528753, + "grad_norm": 0.9515965580940247, + "learning_rate": 4.5434120887085084e-05, + "loss": 0.8501, + "step": 3150 + }, + { + "epoch": 0.45802079936224954, + "grad_norm": 4.722071170806885, + "learning_rate": 4.5419626032758375e-05, + "loss": 0.9871, + "step": 3160 + }, + { + "epoch": 0.4594702322716237, + "grad_norm": 1.0577269792556763, + "learning_rate": 4.540513117843166e-05, + "loss": 0.7459, + "step": 3170 + }, + { + "epoch": 0.4609196651809979, + "grad_norm": 1.0412189960479736, + "learning_rate": 4.5390636324104944e-05, + "loss": 0.8492, + "step": 3180 + }, + { + "epoch": 0.46236909809037213, + "grad_norm": 2.957418441772461, + "learning_rate": 4.5376141469778235e-05, + "loss": 0.8368, + "step": 3190 + }, + { + "epoch": 0.46381853099974635, + "grad_norm": 1.0263798236846924, + "learning_rate": 4.536164661545152e-05, + "loss": 0.8358, + "step": 3200 + }, + { + "epoch": 0.46526796390912056, + "grad_norm": 0.8341432213783264, + "learning_rate": 4.5347151761124803e-05, + "loss": 0.8912, + "step": 3210 + }, + { + "epoch": 0.4667173968184948, + "grad_norm": 2.897122621536255, + "learning_rate": 4.533265690679809e-05, + "loss": 0.8843, + "step": 3220 + }, + { + "epoch": 0.468166829727869, + "grad_norm": 2.811288356781006, + "learning_rate": 4.531816205247137e-05, + "loss": 0.8503, + "step": 3230 + }, + { + "epoch": 0.46961626263724315, + "grad_norm": 2.7167201042175293, + "learning_rate": 4.530366719814466e-05, + "loss": 0.8325, + "step": 3240 + }, + { + "epoch": 0.47106569554661737, + "grad_norm": 2.2229905128479004, + "learning_rate": 4.528917234381795e-05, + "loss": 0.8333, + "step": 3250 + }, + { + "epoch": 0.4725151284559916, + "grad_norm": 0.8708949685096741, + "learning_rate": 4.527467748949123e-05, + "loss": 0.8631, + "step": 3260 + }, + { + "epoch": 0.4739645613653658, + "grad_norm": 2.3800301551818848, + "learning_rate": 4.5260182635164516e-05, + "loss": 0.8038, + "step": 3270 + }, + { + "epoch": 0.47541399427474, + "grad_norm": 3.5404298305511475, + "learning_rate": 4.52456877808378e-05, + "loss": 0.8799, + "step": 3280 + }, + { + "epoch": 0.47686342718411423, + "grad_norm": 3.8659508228302, + "learning_rate": 4.523119292651109e-05, + "loss": 0.8644, + "step": 3290 + }, + { + "epoch": 0.47831286009348845, + "grad_norm": 0.8231411576271057, + "learning_rate": 4.5216698072184376e-05, + "loss": 0.7729, + "step": 3300 + }, + { + "epoch": 0.4797622930028626, + "grad_norm": 0.8375280499458313, + "learning_rate": 4.520220321785766e-05, + "loss": 0.8589, + "step": 3310 + }, + { + "epoch": 0.4812117259122368, + "grad_norm": 1.1756319999694824, + "learning_rate": 4.518770836353095e-05, + "loss": 0.7879, + "step": 3320 + }, + { + "epoch": 0.48266115882161104, + "grad_norm": 1.7768633365631104, + "learning_rate": 4.517321350920423e-05, + "loss": 0.8617, + "step": 3330 + }, + { + "epoch": 0.48411059173098525, + "grad_norm": 2.3392810821533203, + "learning_rate": 4.515871865487752e-05, + "loss": 0.8479, + "step": 3340 + }, + { + "epoch": 0.48556002464035947, + "grad_norm": 0.9335256218910217, + "learning_rate": 4.5144223800550805e-05, + "loss": 0.7817, + "step": 3350 + }, + { + "epoch": 0.4870094575497337, + "grad_norm": 2.646064281463623, + "learning_rate": 4.512972894622409e-05, + "loss": 0.8844, + "step": 3360 + }, + { + "epoch": 0.4884588904591079, + "grad_norm": 1.0953303575515747, + "learning_rate": 4.511523409189738e-05, + "loss": 0.8304, + "step": 3370 + }, + { + "epoch": 0.48990832336848206, + "grad_norm": 2.2010998725891113, + "learning_rate": 4.5100739237570665e-05, + "loss": 0.7681, + "step": 3380 + }, + { + "epoch": 0.4913577562778563, + "grad_norm": 2.8716251850128174, + "learning_rate": 4.5086244383243956e-05, + "loss": 0.9036, + "step": 3390 + }, + { + "epoch": 0.4928071891872305, + "grad_norm": 0.7964257597923279, + "learning_rate": 4.5071749528917233e-05, + "loss": 0.9171, + "step": 3400 + }, + { + "epoch": 0.4942566220966047, + "grad_norm": 1.0154571533203125, + "learning_rate": 4.505725467459052e-05, + "loss": 0.8455, + "step": 3410 + }, + { + "epoch": 0.4957060550059789, + "grad_norm": 0.8615840673446655, + "learning_rate": 4.504275982026381e-05, + "loss": 0.8727, + "step": 3420 + }, + { + "epoch": 0.49715548791535313, + "grad_norm": 1.9749999046325684, + "learning_rate": 4.502826496593709e-05, + "loss": 0.8145, + "step": 3430 + }, + { + "epoch": 0.49860492082472735, + "grad_norm": 0.8778107762336731, + "learning_rate": 4.5013770111610384e-05, + "loss": 0.8998, + "step": 3440 + }, + { + "epoch": 0.5000543537341016, + "grad_norm": 2.565814733505249, + "learning_rate": 4.499927525728367e-05, + "loss": 0.8783, + "step": 3450 + }, + { + "epoch": 0.5015037866434757, + "grad_norm": 1.1715037822723389, + "learning_rate": 4.498478040295695e-05, + "loss": 0.8143, + "step": 3460 + }, + { + "epoch": 0.50295321955285, + "grad_norm": 2.346769094467163, + "learning_rate": 4.497028554863024e-05, + "loss": 0.8483, + "step": 3470 + }, + { + "epoch": 0.5044026524622242, + "grad_norm": 1.9950371980667114, + "learning_rate": 4.495579069430352e-05, + "loss": 0.762, + "step": 3480 + }, + { + "epoch": 0.5058520853715983, + "grad_norm": 2.0283892154693604, + "learning_rate": 4.494129583997681e-05, + "loss": 0.8049, + "step": 3490 + }, + { + "epoch": 0.5073015182809726, + "grad_norm": 0.9947206377983093, + "learning_rate": 4.49268009856501e-05, + "loss": 0.9395, + "step": 3500 + }, + { + "epoch": 0.5087509511903467, + "grad_norm": 0.7881497144699097, + "learning_rate": 4.491230613132338e-05, + "loss": 0.8563, + "step": 3510 + }, + { + "epoch": 0.510200384099721, + "grad_norm": 2.30930495262146, + "learning_rate": 4.489781127699667e-05, + "loss": 0.9173, + "step": 3520 + }, + { + "epoch": 0.5116498170090952, + "grad_norm": 3.175978899002075, + "learning_rate": 4.488331642266995e-05, + "loss": 0.8632, + "step": 3530 + }, + { + "epoch": 0.5130992499184694, + "grad_norm": 0.7730461955070496, + "learning_rate": 4.486882156834324e-05, + "loss": 0.7952, + "step": 3540 + }, + { + "epoch": 0.5145486828278436, + "grad_norm": 1.061704158782959, + "learning_rate": 4.4854326714016526e-05, + "loss": 0.7262, + "step": 3550 + }, + { + "epoch": 0.5159981157372178, + "grad_norm": 3.158407211303711, + "learning_rate": 4.483983185968981e-05, + "loss": 0.7429, + "step": 3560 + }, + { + "epoch": 0.517447548646592, + "grad_norm": 2.246033191680908, + "learning_rate": 4.48253370053631e-05, + "loss": 0.8934, + "step": 3570 + }, + { + "epoch": 0.5188969815559662, + "grad_norm": 2.165231466293335, + "learning_rate": 4.4810842151036386e-05, + "loss": 0.8407, + "step": 3580 + }, + { + "epoch": 0.5203464144653405, + "grad_norm": 3.290911912918091, + "learning_rate": 4.479634729670967e-05, + "loss": 0.8164, + "step": 3590 + }, + { + "epoch": 0.5217958473747146, + "grad_norm": 1.2274982929229736, + "learning_rate": 4.4781852442382954e-05, + "loss": 0.8467, + "step": 3600 + }, + { + "epoch": 0.5232452802840889, + "grad_norm": 0.940759539604187, + "learning_rate": 4.476735758805624e-05, + "loss": 0.7047, + "step": 3610 + }, + { + "epoch": 0.5246947131934631, + "grad_norm": 2.014238119125366, + "learning_rate": 4.475286273372953e-05, + "loss": 0.8128, + "step": 3620 + }, + { + "epoch": 0.5261441461028372, + "grad_norm": 0.9501071572303772, + "learning_rate": 4.4738367879402814e-05, + "loss": 0.9025, + "step": 3630 + }, + { + "epoch": 0.5275935790122115, + "grad_norm": 0.9888691902160645, + "learning_rate": 4.47238730250761e-05, + "loss": 0.7364, + "step": 3640 + }, + { + "epoch": 0.5290430119215856, + "grad_norm": 3.5915961265563965, + "learning_rate": 4.470937817074939e-05, + "loss": 0.8577, + "step": 3650 + }, + { + "epoch": 0.5304924448309599, + "grad_norm": 0.958191990852356, + "learning_rate": 4.469488331642267e-05, + "loss": 0.9072, + "step": 3660 + }, + { + "epoch": 0.5319418777403341, + "grad_norm": 4.15883207321167, + "learning_rate": 4.468038846209596e-05, + "loss": 0.9426, + "step": 3670 + }, + { + "epoch": 0.5333913106497084, + "grad_norm": 1.091619849205017, + "learning_rate": 4.466589360776924e-05, + "loss": 0.7791, + "step": 3680 + }, + { + "epoch": 0.5348407435590825, + "grad_norm": 3.2313733100891113, + "learning_rate": 4.465139875344253e-05, + "loss": 0.7986, + "step": 3690 + }, + { + "epoch": 0.5362901764684567, + "grad_norm": 0.9143542051315308, + "learning_rate": 4.463690389911582e-05, + "loss": 0.7882, + "step": 3700 + }, + { + "epoch": 0.5377396093778309, + "grad_norm": 1.09394109249115, + "learning_rate": 4.46224090447891e-05, + "loss": 0.7717, + "step": 3710 + }, + { + "epoch": 0.5391890422872051, + "grad_norm": 1.5948755741119385, + "learning_rate": 4.460791419046239e-05, + "loss": 0.8436, + "step": 3720 + }, + { + "epoch": 0.5406384751965794, + "grad_norm": 2.321056604385376, + "learning_rate": 4.459341933613567e-05, + "loss": 0.8353, + "step": 3730 + }, + { + "epoch": 0.5420879081059535, + "grad_norm": 2.6569488048553467, + "learning_rate": 4.4578924481808956e-05, + "loss": 0.834, + "step": 3740 + }, + { + "epoch": 0.5435373410153278, + "grad_norm": 3.0700886249542236, + "learning_rate": 4.456442962748225e-05, + "loss": 0.8203, + "step": 3750 + }, + { + "epoch": 0.544986773924702, + "grad_norm": 2.120776414871216, + "learning_rate": 4.454993477315553e-05, + "loss": 0.8139, + "step": 3760 + }, + { + "epoch": 0.5464362068340761, + "grad_norm": 2.4611003398895264, + "learning_rate": 4.453543991882882e-05, + "loss": 0.8524, + "step": 3770 + }, + { + "epoch": 0.5478856397434504, + "grad_norm": 1.0382717847824097, + "learning_rate": 4.452094506450211e-05, + "loss": 0.8474, + "step": 3780 + }, + { + "epoch": 0.5493350726528246, + "grad_norm": 0.7796250581741333, + "learning_rate": 4.4506450210175384e-05, + "loss": 0.8561, + "step": 3790 + }, + { + "epoch": 0.5507845055621988, + "grad_norm": 2.735966920852661, + "learning_rate": 4.4491955355848676e-05, + "loss": 0.7416, + "step": 3800 + }, + { + "epoch": 0.552233938471573, + "grad_norm": 0.8775273561477661, + "learning_rate": 4.447746050152196e-05, + "loss": 0.868, + "step": 3810 + }, + { + "epoch": 0.5536833713809473, + "grad_norm": 0.9166693687438965, + "learning_rate": 4.446296564719525e-05, + "loss": 0.895, + "step": 3820 + }, + { + "epoch": 0.5551328042903214, + "grad_norm": 0.9439873099327087, + "learning_rate": 4.4448470792868535e-05, + "loss": 0.8011, + "step": 3830 + }, + { + "epoch": 0.5565822371996956, + "grad_norm": 0.7854294776916504, + "learning_rate": 4.443397593854182e-05, + "loss": 0.8141, + "step": 3840 + }, + { + "epoch": 0.5580316701090698, + "grad_norm": 0.9581212401390076, + "learning_rate": 4.4419481084215104e-05, + "loss": 0.7875, + "step": 3850 + }, + { + "epoch": 0.559481103018444, + "grad_norm": 1.0400795936584473, + "learning_rate": 4.440498622988839e-05, + "loss": 0.8467, + "step": 3860 + }, + { + "epoch": 0.5609305359278183, + "grad_norm": 0.912345826625824, + "learning_rate": 4.439049137556168e-05, + "loss": 0.8752, + "step": 3870 + }, + { + "epoch": 0.5623799688371924, + "grad_norm": 0.8866713047027588, + "learning_rate": 4.4375996521234964e-05, + "loss": 0.8622, + "step": 3880 + }, + { + "epoch": 0.5638294017465667, + "grad_norm": 1.6618248224258423, + "learning_rate": 4.436150166690825e-05, + "loss": 0.8189, + "step": 3890 + }, + { + "epoch": 0.5652788346559409, + "grad_norm": 0.9341676235198975, + "learning_rate": 4.434700681258154e-05, + "loss": 0.7565, + "step": 3900 + }, + { + "epoch": 0.566728267565315, + "grad_norm": 2.9719181060791016, + "learning_rate": 4.4332511958254824e-05, + "loss": 0.7841, + "step": 3910 + }, + { + "epoch": 0.5681777004746893, + "grad_norm": 1.0437769889831543, + "learning_rate": 4.431801710392811e-05, + "loss": 0.8613, + "step": 3920 + }, + { + "epoch": 0.5696271333840635, + "grad_norm": 1.6077215671539307, + "learning_rate": 4.430352224960139e-05, + "loss": 0.8608, + "step": 3930 + }, + { + "epoch": 0.5710765662934377, + "grad_norm": 2.5882461071014404, + "learning_rate": 4.428902739527468e-05, + "loss": 0.8888, + "step": 3940 + }, + { + "epoch": 0.5725259992028119, + "grad_norm": 2.8269803524017334, + "learning_rate": 4.427453254094797e-05, + "loss": 0.9575, + "step": 3950 + }, + { + "epoch": 0.5739754321121862, + "grad_norm": 1.8316258192062378, + "learning_rate": 4.426003768662125e-05, + "loss": 0.9177, + "step": 3960 + }, + { + "epoch": 0.5754248650215603, + "grad_norm": 2.2243731021881104, + "learning_rate": 4.424554283229454e-05, + "loss": 0.9245, + "step": 3970 + }, + { + "epoch": 0.5768742979309345, + "grad_norm": 2.549422264099121, + "learning_rate": 4.423104797796782e-05, + "loss": 0.8799, + "step": 3980 + }, + { + "epoch": 0.5783237308403087, + "grad_norm": 0.6303164958953857, + "learning_rate": 4.4216553123641105e-05, + "loss": 0.8572, + "step": 3990 + }, + { + "epoch": 0.5797731637496829, + "grad_norm": 1.8965942859649658, + "learning_rate": 4.4202058269314397e-05, + "loss": 0.7797, + "step": 4000 + }, + { + "epoch": 0.5797731637496829, + "eval_loss": 0.8338169455528259, + "eval_runtime": 672.2843, + "eval_samples_per_second": 51.31, + "eval_steps_per_second": 2.566, + "eval_token_accuracy": 0.00039567065532711027, + "step": 4000 + }, + { + "epoch": 0.5812225966590572, + "grad_norm": 2.4498796463012695, + "learning_rate": 4.418756341498768e-05, + "loss": 0.8155, + "step": 4010 + }, + { + "epoch": 0.5826720295684313, + "grad_norm": 0.7214322686195374, + "learning_rate": 4.4173068560660965e-05, + "loss": 0.902, + "step": 4020 + }, + { + "epoch": 0.5841214624778056, + "grad_norm": 1.9820255041122437, + "learning_rate": 4.4158573706334256e-05, + "loss": 0.8595, + "step": 4030 + }, + { + "epoch": 0.5855708953871798, + "grad_norm": 2.444005250930786, + "learning_rate": 4.414407885200754e-05, + "loss": 0.7991, + "step": 4040 + }, + { + "epoch": 0.5870203282965539, + "grad_norm": 1.8259989023208618, + "learning_rate": 4.4129583997680825e-05, + "loss": 0.8107, + "step": 4050 + }, + { + "epoch": 0.5884697612059282, + "grad_norm": 1.1818252801895142, + "learning_rate": 4.411508914335411e-05, + "loss": 0.916, + "step": 4060 + }, + { + "epoch": 0.5899191941153024, + "grad_norm": 1.647369146347046, + "learning_rate": 4.4100594289027394e-05, + "loss": 0.7957, + "step": 4070 + }, + { + "epoch": 0.5913686270246766, + "grad_norm": 3.159400463104248, + "learning_rate": 4.4086099434700685e-05, + "loss": 0.9447, + "step": 4080 + }, + { + "epoch": 0.5928180599340508, + "grad_norm": 1.5535900592803955, + "learning_rate": 4.407160458037397e-05, + "loss": 0.8162, + "step": 4090 + }, + { + "epoch": 0.5942674928434251, + "grad_norm": 3.949704647064209, + "learning_rate": 4.405710972604726e-05, + "loss": 0.832, + "step": 4100 + }, + { + "epoch": 0.5957169257527992, + "grad_norm": 0.761550784111023, + "learning_rate": 4.404261487172054e-05, + "loss": 0.8827, + "step": 4110 + }, + { + "epoch": 0.5971663586621734, + "grad_norm": 1.7703503370285034, + "learning_rate": 4.402812001739382e-05, + "loss": 0.8713, + "step": 4120 + }, + { + "epoch": 0.5986157915715477, + "grad_norm": 0.7429376840591431, + "learning_rate": 4.4013625163067114e-05, + "loss": 0.7604, + "step": 4130 + }, + { + "epoch": 0.6000652244809218, + "grad_norm": 0.8260897397994995, + "learning_rate": 4.39991303087404e-05, + "loss": 0.8617, + "step": 4140 + }, + { + "epoch": 0.6015146573902961, + "grad_norm": 2.519921064376831, + "learning_rate": 4.398463545441369e-05, + "loss": 0.88, + "step": 4150 + }, + { + "epoch": 0.6029640902996702, + "grad_norm": 0.6744734048843384, + "learning_rate": 4.3970140600086973e-05, + "loss": 0.7893, + "step": 4160 + }, + { + "epoch": 0.6044135232090445, + "grad_norm": 0.9345059394836426, + "learning_rate": 4.395564574576026e-05, + "loss": 0.8327, + "step": 4170 + }, + { + "epoch": 0.6058629561184187, + "grad_norm": 3.128801107406616, + "learning_rate": 4.394115089143354e-05, + "loss": 0.8599, + "step": 4180 + }, + { + "epoch": 0.6073123890277928, + "grad_norm": 0.9045581817626953, + "learning_rate": 4.3926656037106827e-05, + "loss": 0.8531, + "step": 4190 + }, + { + "epoch": 0.6087618219371671, + "grad_norm": 1.3482164144515991, + "learning_rate": 4.391216118278012e-05, + "loss": 0.8507, + "step": 4200 + }, + { + "epoch": 0.6102112548465413, + "grad_norm": 2.7452120780944824, + "learning_rate": 4.38976663284534e-05, + "loss": 0.7958, + "step": 4210 + }, + { + "epoch": 0.6116606877559155, + "grad_norm": 0.7711092829704285, + "learning_rate": 4.3883171474126686e-05, + "loss": 0.9046, + "step": 4220 + }, + { + "epoch": 0.6131101206652897, + "grad_norm": 1.0835328102111816, + "learning_rate": 4.386867661979998e-05, + "loss": 0.7562, + "step": 4230 + }, + { + "epoch": 0.614559553574664, + "grad_norm": 1.0015699863433838, + "learning_rate": 4.3854181765473255e-05, + "loss": 0.7945, + "step": 4240 + }, + { + "epoch": 0.6160089864840381, + "grad_norm": 0.9216394424438477, + "learning_rate": 4.3839686911146546e-05, + "loss": 0.896, + "step": 4250 + }, + { + "epoch": 0.6174584193934123, + "grad_norm": 2.684931755065918, + "learning_rate": 4.382519205681983e-05, + "loss": 0.8881, + "step": 4260 + }, + { + "epoch": 0.6189078523027866, + "grad_norm": 2.537541627883911, + "learning_rate": 4.3810697202493115e-05, + "loss": 0.9242, + "step": 4270 + }, + { + "epoch": 0.6203572852121607, + "grad_norm": 1.0086060762405396, + "learning_rate": 4.3796202348166406e-05, + "loss": 0.8116, + "step": 4280 + }, + { + "epoch": 0.621806718121535, + "grad_norm": 1.0704823732376099, + "learning_rate": 4.378170749383969e-05, + "loss": 0.8074, + "step": 4290 + }, + { + "epoch": 0.6232561510309091, + "grad_norm": 0.9033405184745789, + "learning_rate": 4.3767212639512975e-05, + "loss": 0.7903, + "step": 4300 + }, + { + "epoch": 0.6247055839402834, + "grad_norm": 0.925373375415802, + "learning_rate": 4.375271778518626e-05, + "loss": 0.7961, + "step": 4310 + }, + { + "epoch": 0.6261550168496576, + "grad_norm": 2.409839391708374, + "learning_rate": 4.3738222930859544e-05, + "loss": 0.9425, + "step": 4320 + }, + { + "epoch": 0.6276044497590317, + "grad_norm": 0.8671131134033203, + "learning_rate": 4.3723728076532835e-05, + "loss": 0.9119, + "step": 4330 + }, + { + "epoch": 0.629053882668406, + "grad_norm": 1.9147675037384033, + "learning_rate": 4.370923322220612e-05, + "loss": 0.7476, + "step": 4340 + }, + { + "epoch": 0.6305033155777802, + "grad_norm": 2.5024831295013428, + "learning_rate": 4.36947383678794e-05, + "loss": 0.8095, + "step": 4350 + }, + { + "epoch": 0.6319527484871544, + "grad_norm": 1.1699477434158325, + "learning_rate": 4.3680243513552695e-05, + "loss": 0.7649, + "step": 4360 + }, + { + "epoch": 0.6334021813965286, + "grad_norm": 1.053463101387024, + "learning_rate": 4.366574865922597e-05, + "loss": 0.7738, + "step": 4370 + }, + { + "epoch": 0.6348516143059029, + "grad_norm": 0.9840993285179138, + "learning_rate": 4.365125380489926e-05, + "loss": 0.8444, + "step": 4380 + }, + { + "epoch": 0.636301047215277, + "grad_norm": 1.8738470077514648, + "learning_rate": 4.363675895057255e-05, + "loss": 0.8309, + "step": 4390 + }, + { + "epoch": 0.6377504801246512, + "grad_norm": 0.9873073697090149, + "learning_rate": 4.362226409624583e-05, + "loss": 0.9108, + "step": 4400 + }, + { + "epoch": 0.6391999130340255, + "grad_norm": 1.9426926374435425, + "learning_rate": 4.360776924191912e-05, + "loss": 0.8223, + "step": 4410 + }, + { + "epoch": 0.6406493459433996, + "grad_norm": 0.967570424079895, + "learning_rate": 4.359327438759241e-05, + "loss": 0.8562, + "step": 4420 + }, + { + "epoch": 0.6420987788527739, + "grad_norm": 1.1149927377700806, + "learning_rate": 4.357877953326569e-05, + "loss": 0.962, + "step": 4430 + }, + { + "epoch": 0.643548211762148, + "grad_norm": 0.8680489659309387, + "learning_rate": 4.3564284678938976e-05, + "loss": 0.8571, + "step": 4440 + }, + { + "epoch": 0.6449976446715223, + "grad_norm": 1.7880040407180786, + "learning_rate": 4.354978982461226e-05, + "loss": 0.82, + "step": 4450 + }, + { + "epoch": 0.6464470775808965, + "grad_norm": 1.0027462244033813, + "learning_rate": 4.353529497028555e-05, + "loss": 0.7613, + "step": 4460 + }, + { + "epoch": 0.6478965104902706, + "grad_norm": 1.6337950229644775, + "learning_rate": 4.3520800115958836e-05, + "loss": 0.7093, + "step": 4470 + }, + { + "epoch": 0.6493459433996449, + "grad_norm": 2.350299596786499, + "learning_rate": 4.350630526163213e-05, + "loss": 0.849, + "step": 4480 + }, + { + "epoch": 0.6507953763090191, + "grad_norm": 1.187408685684204, + "learning_rate": 4.349181040730541e-05, + "loss": 0.8921, + "step": 4490 + }, + { + "epoch": 0.6522448092183933, + "grad_norm": 1.5397257804870605, + "learning_rate": 4.347731555297869e-05, + "loss": 0.8206, + "step": 4500 + }, + { + "epoch": 0.6536942421277675, + "grad_norm": 1.072008728981018, + "learning_rate": 4.346282069865198e-05, + "loss": 0.8675, + "step": 4510 + }, + { + "epoch": 0.6551436750371418, + "grad_norm": 0.9316320419311523, + "learning_rate": 4.3448325844325265e-05, + "loss": 0.752, + "step": 4520 + }, + { + "epoch": 0.6565931079465159, + "grad_norm": 0.7438766360282898, + "learning_rate": 4.3433830989998556e-05, + "loss": 0.7891, + "step": 4530 + }, + { + "epoch": 0.6580425408558901, + "grad_norm": 0.84394371509552, + "learning_rate": 4.341933613567184e-05, + "loss": 0.8332, + "step": 4540 + }, + { + "epoch": 0.6594919737652644, + "grad_norm": 0.9269441366195679, + "learning_rate": 4.3404841281345124e-05, + "loss": 0.9001, + "step": 4550 + }, + { + "epoch": 0.6609414066746385, + "grad_norm": 0.8739952445030212, + "learning_rate": 4.339034642701841e-05, + "loss": 0.8035, + "step": 4560 + }, + { + "epoch": 0.6623908395840128, + "grad_norm": 1.9341847896575928, + "learning_rate": 4.337585157269169e-05, + "loss": 0.8768, + "step": 4570 + }, + { + "epoch": 0.663840272493387, + "grad_norm": 0.8902730345726013, + "learning_rate": 4.3361356718364984e-05, + "loss": 0.8206, + "step": 4580 + }, + { + "epoch": 0.6652897054027612, + "grad_norm": 0.8367321491241455, + "learning_rate": 4.334686186403827e-05, + "loss": 0.8538, + "step": 4590 + }, + { + "epoch": 0.6667391383121354, + "grad_norm": 2.862443685531616, + "learning_rate": 4.333236700971155e-05, + "loss": 0.8864, + "step": 4600 + }, + { + "epoch": 0.6681885712215095, + "grad_norm": 0.8810304999351501, + "learning_rate": 4.3317872155384844e-05, + "loss": 0.961, + "step": 4610 + }, + { + "epoch": 0.6696380041308838, + "grad_norm": 0.9625788927078247, + "learning_rate": 4.330337730105813e-05, + "loss": 0.8163, + "step": 4620 + }, + { + "epoch": 0.671087437040258, + "grad_norm": 0.772083580493927, + "learning_rate": 4.328888244673141e-05, + "loss": 0.8537, + "step": 4630 + }, + { + "epoch": 0.6725368699496322, + "grad_norm": 1.8077799081802368, + "learning_rate": 4.32743875924047e-05, + "loss": 0.9016, + "step": 4640 + }, + { + "epoch": 0.6739863028590064, + "grad_norm": 0.7941009998321533, + "learning_rate": 4.325989273807798e-05, + "loss": 0.8011, + "step": 4650 + }, + { + "epoch": 0.6754357357683807, + "grad_norm": 2.0097315311431885, + "learning_rate": 4.324539788375127e-05, + "loss": 0.862, + "step": 4660 + }, + { + "epoch": 0.6768851686777548, + "grad_norm": 2.0535378456115723, + "learning_rate": 4.323090302942456e-05, + "loss": 0.7873, + "step": 4670 + }, + { + "epoch": 0.678334601587129, + "grad_norm": 1.0298711061477661, + "learning_rate": 4.321640817509784e-05, + "loss": 0.8405, + "step": 4680 + }, + { + "epoch": 0.6797840344965033, + "grad_norm": 0.7229492664337158, + "learning_rate": 4.3201913320771126e-05, + "loss": 0.8394, + "step": 4690 + }, + { + "epoch": 0.6812334674058774, + "grad_norm": 0.8889229893684387, + "learning_rate": 4.318741846644441e-05, + "loss": 0.8787, + "step": 4700 + }, + { + "epoch": 0.6826829003152517, + "grad_norm": 2.534590482711792, + "learning_rate": 4.31729236121177e-05, + "loss": 0.7816, + "step": 4710 + }, + { + "epoch": 0.6841323332246259, + "grad_norm": 0.8015549182891846, + "learning_rate": 4.3158428757790986e-05, + "loss": 0.7606, + "step": 4720 + }, + { + "epoch": 0.6855817661340001, + "grad_norm": 1.200669288635254, + "learning_rate": 4.314393390346427e-05, + "loss": 0.9176, + "step": 4730 + }, + { + "epoch": 0.6870311990433743, + "grad_norm": 1.7741879224777222, + "learning_rate": 4.312943904913756e-05, + "loss": 0.8293, + "step": 4740 + }, + { + "epoch": 0.6884806319527484, + "grad_norm": 2.136448621749878, + "learning_rate": 4.3114944194810846e-05, + "loss": 0.877, + "step": 4750 + }, + { + "epoch": 0.6899300648621227, + "grad_norm": 1.725224256515503, + "learning_rate": 4.310044934048413e-05, + "loss": 0.7985, + "step": 4760 + }, + { + "epoch": 0.6913794977714969, + "grad_norm": 0.8180555701255798, + "learning_rate": 4.3085954486157414e-05, + "loss": 0.8777, + "step": 4770 + }, + { + "epoch": 0.6928289306808711, + "grad_norm": 0.5975582003593445, + "learning_rate": 4.30714596318307e-05, + "loss": 0.7879, + "step": 4780 + }, + { + "epoch": 0.6942783635902453, + "grad_norm": 0.9511466026306152, + "learning_rate": 4.305696477750399e-05, + "loss": 0.8936, + "step": 4790 + }, + { + "epoch": 0.6957277964996196, + "grad_norm": 1.0259195566177368, + "learning_rate": 4.3042469923177274e-05, + "loss": 0.761, + "step": 4800 + }, + { + "epoch": 0.6971772294089937, + "grad_norm": 2.490692615509033, + "learning_rate": 4.3027975068850565e-05, + "loss": 0.8069, + "step": 4810 + }, + { + "epoch": 0.6986266623183679, + "grad_norm": 1.8551379442214966, + "learning_rate": 4.301348021452384e-05, + "loss": 0.7879, + "step": 4820 + }, + { + "epoch": 0.7000760952277422, + "grad_norm": 2.0735645294189453, + "learning_rate": 4.299898536019713e-05, + "loss": 0.8176, + "step": 4830 + }, + { + "epoch": 0.7015255281371163, + "grad_norm": 0.8412244915962219, + "learning_rate": 4.298449050587042e-05, + "loss": 0.8356, + "step": 4840 + }, + { + "epoch": 0.7029749610464906, + "grad_norm": 3.692345142364502, + "learning_rate": 4.29699956515437e-05, + "loss": 0.7893, + "step": 4850 + }, + { + "epoch": 0.7044243939558648, + "grad_norm": 3.127885103225708, + "learning_rate": 4.2955500797216994e-05, + "loss": 0.8115, + "step": 4860 + }, + { + "epoch": 0.705873826865239, + "grad_norm": 3.361967086791992, + "learning_rate": 4.294100594289028e-05, + "loss": 0.8344, + "step": 4870 + }, + { + "epoch": 0.7073232597746132, + "grad_norm": 0.7663640975952148, + "learning_rate": 4.292651108856356e-05, + "loss": 0.7669, + "step": 4880 + }, + { + "epoch": 0.7087726926839873, + "grad_norm": 3.2423362731933594, + "learning_rate": 4.291201623423685e-05, + "loss": 0.9231, + "step": 4890 + }, + { + "epoch": 0.7102221255933616, + "grad_norm": 2.1033742427825928, + "learning_rate": 4.289752137991013e-05, + "loss": 0.8546, + "step": 4900 + }, + { + "epoch": 0.7116715585027358, + "grad_norm": 1.6948819160461426, + "learning_rate": 4.288302652558342e-05, + "loss": 0.8038, + "step": 4910 + }, + { + "epoch": 0.71312099141211, + "grad_norm": 1.5998060703277588, + "learning_rate": 4.286853167125671e-05, + "loss": 0.7877, + "step": 4920 + }, + { + "epoch": 0.7145704243214842, + "grad_norm": 1.865884780883789, + "learning_rate": 4.285403681692999e-05, + "loss": 0.8306, + "step": 4930 + }, + { + "epoch": 0.7160198572308585, + "grad_norm": 0.7841973304748535, + "learning_rate": 4.283954196260328e-05, + "loss": 0.737, + "step": 4940 + }, + { + "epoch": 0.7174692901402326, + "grad_norm": 0.9612240195274353, + "learning_rate": 4.282504710827656e-05, + "loss": 0.8115, + "step": 4950 + }, + { + "epoch": 0.7189187230496068, + "grad_norm": 0.8772614002227783, + "learning_rate": 4.281055225394985e-05, + "loss": 0.8226, + "step": 4960 + }, + { + "epoch": 0.7203681559589811, + "grad_norm": 0.9083195328712463, + "learning_rate": 4.2796057399623135e-05, + "loss": 0.8494, + "step": 4970 + }, + { + "epoch": 0.7218175888683552, + "grad_norm": 2.2124178409576416, + "learning_rate": 4.278156254529642e-05, + "loss": 0.8175, + "step": 4980 + }, + { + "epoch": 0.7232670217777295, + "grad_norm": 3.139594793319702, + "learning_rate": 4.276706769096971e-05, + "loss": 0.8368, + "step": 4990 + }, + { + "epoch": 0.7247164546871037, + "grad_norm": 0.9544805884361267, + "learning_rate": 4.2752572836642995e-05, + "loss": 0.7776, + "step": 5000 + }, + { + "epoch": 0.7247164546871037, + "eval_loss": 0.8146118521690369, + "eval_runtime": 671.7402, + "eval_samples_per_second": 51.352, + "eval_steps_per_second": 2.568, + "eval_token_accuracy": 0.00041561502982327355, + "step": 5000 + }, + { + "epoch": 0.7261658875964779, + "grad_norm": 0.878653883934021, + "learning_rate": 4.273807798231628e-05, + "loss": 0.8326, + "step": 5010 + }, + { + "epoch": 0.7276153205058521, + "grad_norm": 0.8273966312408447, + "learning_rate": 4.2723583127989564e-05, + "loss": 0.7153, + "step": 5020 + }, + { + "epoch": 0.7290647534152263, + "grad_norm": 1.2781431674957275, + "learning_rate": 4.270908827366285e-05, + "loss": 0.7926, + "step": 5030 + }, + { + "epoch": 0.7305141863246005, + "grad_norm": 1.093881607055664, + "learning_rate": 4.269459341933614e-05, + "loss": 0.8349, + "step": 5040 + }, + { + "epoch": 0.7319636192339747, + "grad_norm": 2.6112184524536133, + "learning_rate": 4.2680098565009424e-05, + "loss": 0.8498, + "step": 5050 + }, + { + "epoch": 0.733413052143349, + "grad_norm": 2.0127315521240234, + "learning_rate": 4.266560371068271e-05, + "loss": 0.8129, + "step": 5060 + }, + { + "epoch": 0.7348624850527231, + "grad_norm": 0.8400161266326904, + "learning_rate": 4.2651108856356e-05, + "loss": 0.8914, + "step": 5070 + }, + { + "epoch": 0.7363119179620974, + "grad_norm": 2.1788601875305176, + "learning_rate": 4.263661400202928e-05, + "loss": 0.9231, + "step": 5080 + }, + { + "epoch": 0.7377613508714715, + "grad_norm": 0.8961378335952759, + "learning_rate": 4.262211914770257e-05, + "loss": 0.833, + "step": 5090 + }, + { + "epoch": 0.7392107837808457, + "grad_norm": 1.742361068725586, + "learning_rate": 4.260762429337585e-05, + "loss": 0.7927, + "step": 5100 + }, + { + "epoch": 0.74066021669022, + "grad_norm": 3.132824420928955, + "learning_rate": 4.259312943904914e-05, + "loss": 0.7623, + "step": 5110 + }, + { + "epoch": 0.7421096495995941, + "grad_norm": 1.33770751953125, + "learning_rate": 4.257863458472243e-05, + "loss": 0.8726, + "step": 5120 + }, + { + "epoch": 0.7435590825089684, + "grad_norm": 2.396254301071167, + "learning_rate": 4.256413973039571e-05, + "loss": 0.8686, + "step": 5130 + }, + { + "epoch": 0.7450085154183426, + "grad_norm": 0.8666991591453552, + "learning_rate": 4.2549644876069e-05, + "loss": 0.728, + "step": 5140 + }, + { + "epoch": 0.7464579483277168, + "grad_norm": 1.0336198806762695, + "learning_rate": 4.253515002174228e-05, + "loss": 0.7384, + "step": 5150 + }, + { + "epoch": 0.747907381237091, + "grad_norm": 3.091517448425293, + "learning_rate": 4.2520655167415565e-05, + "loss": 0.8342, + "step": 5160 + }, + { + "epoch": 0.7493568141464652, + "grad_norm": 1.0534110069274902, + "learning_rate": 4.2506160313088856e-05, + "loss": 0.8453, + "step": 5170 + }, + { + "epoch": 0.7508062470558394, + "grad_norm": 0.7332333326339722, + "learning_rate": 4.249166545876214e-05, + "loss": 0.9213, + "step": 5180 + }, + { + "epoch": 0.7522556799652136, + "grad_norm": 0.9510803818702698, + "learning_rate": 4.247717060443543e-05, + "loss": 0.7008, + "step": 5190 + }, + { + "epoch": 0.7537051128745879, + "grad_norm": 0.9351806044578552, + "learning_rate": 4.2462675750108716e-05, + "loss": 0.8127, + "step": 5200 + }, + { + "epoch": 0.755154545783962, + "grad_norm": 0.739949107170105, + "learning_rate": 4.2448180895781994e-05, + "loss": 0.7393, + "step": 5210 + }, + { + "epoch": 0.7566039786933363, + "grad_norm": 0.7901501655578613, + "learning_rate": 4.2433686041455285e-05, + "loss": 0.7996, + "step": 5220 + }, + { + "epoch": 0.7580534116027104, + "grad_norm": 1.0017391443252563, + "learning_rate": 4.241919118712857e-05, + "loss": 0.7722, + "step": 5230 + }, + { + "epoch": 0.7595028445120846, + "grad_norm": 2.1011905670166016, + "learning_rate": 4.240469633280186e-05, + "loss": 0.7743, + "step": 5240 + }, + { + "epoch": 0.7609522774214589, + "grad_norm": 0.9775656461715698, + "learning_rate": 4.2390201478475145e-05, + "loss": 0.7604, + "step": 5250 + }, + { + "epoch": 0.762401710330833, + "grad_norm": 2.5780832767486572, + "learning_rate": 4.237570662414843e-05, + "loss": 0.8864, + "step": 5260 + }, + { + "epoch": 0.7638511432402073, + "grad_norm": 2.137580394744873, + "learning_rate": 4.2361211769821713e-05, + "loss": 0.7924, + "step": 5270 + }, + { + "epoch": 0.7653005761495815, + "grad_norm": 1.0586779117584229, + "learning_rate": 4.2346716915495e-05, + "loss": 0.7467, + "step": 5280 + }, + { + "epoch": 0.7667500090589556, + "grad_norm": 1.0569285154342651, + "learning_rate": 4.233222206116829e-05, + "loss": 0.8568, + "step": 5290 + }, + { + "epoch": 0.7681994419683299, + "grad_norm": 2.8792178630828857, + "learning_rate": 4.231772720684157e-05, + "loss": 0.8079, + "step": 5300 + }, + { + "epoch": 0.7696488748777041, + "grad_norm": 2.2021849155426025, + "learning_rate": 4.230323235251486e-05, + "loss": 0.8115, + "step": 5310 + }, + { + "epoch": 0.7710983077870783, + "grad_norm": 0.730385422706604, + "learning_rate": 4.228873749818815e-05, + "loss": 0.8685, + "step": 5320 + }, + { + "epoch": 0.7725477406964525, + "grad_norm": 1.3898003101348877, + "learning_rate": 4.227424264386143e-05, + "loss": 0.7707, + "step": 5330 + }, + { + "epoch": 0.7739971736058268, + "grad_norm": 0.990164041519165, + "learning_rate": 4.225974778953472e-05, + "loss": 0.8342, + "step": 5340 + }, + { + "epoch": 0.7754466065152009, + "grad_norm": 0.7976711392402649, + "learning_rate": 4.2245252935208e-05, + "loss": 0.7386, + "step": 5350 + }, + { + "epoch": 0.7768960394245751, + "grad_norm": 1.8743420839309692, + "learning_rate": 4.2230758080881286e-05, + "loss": 0.7369, + "step": 5360 + }, + { + "epoch": 0.7783454723339494, + "grad_norm": 0.827246367931366, + "learning_rate": 4.221626322655458e-05, + "loss": 0.7587, + "step": 5370 + }, + { + "epoch": 0.7797949052433235, + "grad_norm": 2.4631876945495605, + "learning_rate": 4.220176837222786e-05, + "loss": 0.7761, + "step": 5380 + }, + { + "epoch": 0.7812443381526978, + "grad_norm": 0.9340019226074219, + "learning_rate": 4.2187273517901146e-05, + "loss": 0.8844, + "step": 5390 + }, + { + "epoch": 0.7826937710620719, + "grad_norm": 0.8450477123260498, + "learning_rate": 4.217277866357443e-05, + "loss": 0.7458, + "step": 5400 + }, + { + "epoch": 0.7841432039714462, + "grad_norm": 0.7873104214668274, + "learning_rate": 4.2158283809247715e-05, + "loss": 0.7478, + "step": 5410 + }, + { + "epoch": 0.7855926368808204, + "grad_norm": 0.9672576785087585, + "learning_rate": 4.2143788954921006e-05, + "loss": 0.8117, + "step": 5420 + }, + { + "epoch": 0.7870420697901945, + "grad_norm": 0.8910959959030151, + "learning_rate": 4.212929410059429e-05, + "loss": 0.8051, + "step": 5430 + }, + { + "epoch": 0.7884915026995688, + "grad_norm": 2.4677529335021973, + "learning_rate": 4.2114799246267575e-05, + "loss": 0.8244, + "step": 5440 + }, + { + "epoch": 0.789940935608943, + "grad_norm": 0.9692900776863098, + "learning_rate": 4.2100304391940866e-05, + "loss": 0.7718, + "step": 5450 + }, + { + "epoch": 0.7913903685183172, + "grad_norm": 1.1083464622497559, + "learning_rate": 4.208580953761415e-05, + "loss": 0.8499, + "step": 5460 + }, + { + "epoch": 0.7928398014276914, + "grad_norm": 0.7506137490272522, + "learning_rate": 4.2071314683287435e-05, + "loss": 0.7642, + "step": 5470 + }, + { + "epoch": 0.7942892343370657, + "grad_norm": 0.9144595265388489, + "learning_rate": 4.205681982896072e-05, + "loss": 0.8737, + "step": 5480 + }, + { + "epoch": 0.7957386672464398, + "grad_norm": 2.1372525691986084, + "learning_rate": 4.2042324974634e-05, + "loss": 0.8602, + "step": 5490 + }, + { + "epoch": 0.797188100155814, + "grad_norm": 0.9848815202713013, + "learning_rate": 4.2027830120307294e-05, + "loss": 0.7839, + "step": 5500 + }, + { + "epoch": 0.7986375330651883, + "grad_norm": 0.9968371391296387, + "learning_rate": 4.201333526598058e-05, + "loss": 0.8499, + "step": 5510 + }, + { + "epoch": 0.8000869659745624, + "grad_norm": 0.8149365186691284, + "learning_rate": 4.199884041165387e-05, + "loss": 0.7291, + "step": 5520 + }, + { + "epoch": 0.8015363988839367, + "grad_norm": 3.4558465480804443, + "learning_rate": 4.198434555732715e-05, + "loss": 0.8413, + "step": 5530 + }, + { + "epoch": 0.8029858317933108, + "grad_norm": 1.0479544401168823, + "learning_rate": 4.196985070300043e-05, + "loss": 0.8553, + "step": 5540 + }, + { + "epoch": 0.8044352647026851, + "grad_norm": 2.724478006362915, + "learning_rate": 4.195535584867372e-05, + "loss": 0.8351, + "step": 5550 + }, + { + "epoch": 0.8058846976120593, + "grad_norm": 1.0499247312545776, + "learning_rate": 4.194086099434701e-05, + "loss": 0.83, + "step": 5560 + }, + { + "epoch": 0.8073341305214334, + "grad_norm": 1.0782171487808228, + "learning_rate": 4.19263661400203e-05, + "loss": 0.7593, + "step": 5570 + }, + { + "epoch": 0.8087835634308077, + "grad_norm": 0.8276395201683044, + "learning_rate": 4.191187128569358e-05, + "loss": 0.793, + "step": 5580 + }, + { + "epoch": 0.8102329963401819, + "grad_norm": 0.7851141691207886, + "learning_rate": 4.189737643136687e-05, + "loss": 0.8528, + "step": 5590 + }, + { + "epoch": 0.8116824292495561, + "grad_norm": 0.865790605545044, + "learning_rate": 4.188288157704015e-05, + "loss": 0.855, + "step": 5600 + }, + { + "epoch": 0.8131318621589303, + "grad_norm": 0.9776528477668762, + "learning_rate": 4.1868386722713436e-05, + "loss": 0.7644, + "step": 5610 + }, + { + "epoch": 0.8145812950683046, + "grad_norm": 0.8355238437652588, + "learning_rate": 4.185389186838673e-05, + "loss": 0.8027, + "step": 5620 + }, + { + "epoch": 0.8160307279776787, + "grad_norm": 2.3449933528900146, + "learning_rate": 4.183939701406001e-05, + "loss": 0.8623, + "step": 5630 + }, + { + "epoch": 0.8174801608870529, + "grad_norm": 0.879385769367218, + "learning_rate": 4.1824902159733296e-05, + "loss": 0.8522, + "step": 5640 + }, + { + "epoch": 0.8189295937964272, + "grad_norm": 0.9040646553039551, + "learning_rate": 4.181040730540659e-05, + "loss": 0.7904, + "step": 5650 + }, + { + "epoch": 0.8203790267058013, + "grad_norm": 1.8557476997375488, + "learning_rate": 4.1795912451079864e-05, + "loss": 0.9005, + "step": 5660 + }, + { + "epoch": 0.8218284596151756, + "grad_norm": 2.7867159843444824, + "learning_rate": 4.1781417596753156e-05, + "loss": 0.8912, + "step": 5670 + }, + { + "epoch": 0.8232778925245497, + "grad_norm": 1.0696690082550049, + "learning_rate": 4.176692274242644e-05, + "loss": 0.8365, + "step": 5680 + }, + { + "epoch": 0.824727325433924, + "grad_norm": 2.0290820598602295, + "learning_rate": 4.1752427888099724e-05, + "loss": 0.7567, + "step": 5690 + }, + { + "epoch": 0.8261767583432982, + "grad_norm": 1.7348473072052002, + "learning_rate": 4.1737933033773015e-05, + "loss": 0.8233, + "step": 5700 + }, + { + "epoch": 0.8276261912526723, + "grad_norm": 1.0425660610198975, + "learning_rate": 4.17234381794463e-05, + "loss": 0.9019, + "step": 5710 + }, + { + "epoch": 0.8290756241620466, + "grad_norm": 1.9040955305099487, + "learning_rate": 4.1708943325119584e-05, + "loss": 0.8035, + "step": 5720 + }, + { + "epoch": 0.8305250570714208, + "grad_norm": 0.7795917391777039, + "learning_rate": 4.169444847079287e-05, + "loss": 0.8519, + "step": 5730 + }, + { + "epoch": 0.831974489980795, + "grad_norm": 1.099768042564392, + "learning_rate": 4.167995361646615e-05, + "loss": 0.7934, + "step": 5740 + }, + { + "epoch": 0.8334239228901692, + "grad_norm": 1.0211305618286133, + "learning_rate": 4.1665458762139444e-05, + "loss": 0.8082, + "step": 5750 + }, + { + "epoch": 0.8348733557995435, + "grad_norm": 0.8864409327507019, + "learning_rate": 4.165096390781273e-05, + "loss": 0.7796, + "step": 5760 + }, + { + "epoch": 0.8363227887089176, + "grad_norm": 0.8594615459442139, + "learning_rate": 4.163646905348601e-05, + "loss": 0.7961, + "step": 5770 + }, + { + "epoch": 0.8377722216182918, + "grad_norm": 0.9918568134307861, + "learning_rate": 4.1621974199159304e-05, + "loss": 0.7387, + "step": 5780 + }, + { + "epoch": 0.8392216545276661, + "grad_norm": 0.6663451790809631, + "learning_rate": 4.160747934483258e-05, + "loss": 0.8327, + "step": 5790 + }, + { + "epoch": 0.8406710874370402, + "grad_norm": 0.8948430418968201, + "learning_rate": 4.159298449050587e-05, + "loss": 0.8362, + "step": 5800 + }, + { + "epoch": 0.8421205203464145, + "grad_norm": 1.9746590852737427, + "learning_rate": 4.157848963617916e-05, + "loss": 0.8044, + "step": 5810 + }, + { + "epoch": 0.8435699532557887, + "grad_norm": 0.7772048115730286, + "learning_rate": 4.156399478185244e-05, + "loss": 0.7423, + "step": 5820 + }, + { + "epoch": 0.8450193861651629, + "grad_norm": 1.7775204181671143, + "learning_rate": 4.154949992752573e-05, + "loss": 0.8133, + "step": 5830 + }, + { + "epoch": 0.8464688190745371, + "grad_norm": 1.7171192169189453, + "learning_rate": 4.153500507319902e-05, + "loss": 0.7657, + "step": 5840 + }, + { + "epoch": 0.8479182519839112, + "grad_norm": 0.6374487280845642, + "learning_rate": 4.152051021887231e-05, + "loss": 0.786, + "step": 5850 + }, + { + "epoch": 0.8493676848932855, + "grad_norm": 0.764743447303772, + "learning_rate": 4.1506015364545586e-05, + "loss": 0.8204, + "step": 5860 + }, + { + "epoch": 0.8508171178026597, + "grad_norm": 3.3664588928222656, + "learning_rate": 4.149152051021887e-05, + "loss": 0.8363, + "step": 5870 + }, + { + "epoch": 0.8522665507120339, + "grad_norm": 1.1142141819000244, + "learning_rate": 4.147702565589216e-05, + "loss": 0.8146, + "step": 5880 + }, + { + "epoch": 0.8537159836214081, + "grad_norm": 0.7387230396270752, + "learning_rate": 4.1462530801565445e-05, + "loss": 0.7297, + "step": 5890 + }, + { + "epoch": 0.8551654165307824, + "grad_norm": 0.9522792100906372, + "learning_rate": 4.1448035947238737e-05, + "loss": 0.794, + "step": 5900 + }, + { + "epoch": 0.8566148494401565, + "grad_norm": 1.995259165763855, + "learning_rate": 4.143354109291202e-05, + "loss": 0.8431, + "step": 5910 + }, + { + "epoch": 0.8580642823495307, + "grad_norm": 1.9007961750030518, + "learning_rate": 4.14190462385853e-05, + "loss": 0.8472, + "step": 5920 + }, + { + "epoch": 0.859513715258905, + "grad_norm": 2.864264488220215, + "learning_rate": 4.140455138425859e-05, + "loss": 0.7948, + "step": 5930 + }, + { + "epoch": 0.8609631481682791, + "grad_norm": 0.8150015473365784, + "learning_rate": 4.1390056529931874e-05, + "loss": 0.8264, + "step": 5940 + }, + { + "epoch": 0.8624125810776534, + "grad_norm": 1.1385763883590698, + "learning_rate": 4.1375561675605165e-05, + "loss": 0.8622, + "step": 5950 + }, + { + "epoch": 0.8638620139870276, + "grad_norm": 0.8771058320999146, + "learning_rate": 4.136106682127845e-05, + "loss": 0.7388, + "step": 5960 + }, + { + "epoch": 0.8653114468964018, + "grad_norm": 0.7156450748443604, + "learning_rate": 4.1346571966951734e-05, + "loss": 0.7222, + "step": 5970 + }, + { + "epoch": 0.866760879805776, + "grad_norm": 2.9387946128845215, + "learning_rate": 4.1332077112625025e-05, + "loss": 0.8499, + "step": 5980 + }, + { + "epoch": 0.8682103127151501, + "grad_norm": 0.827257513999939, + "learning_rate": 4.13175822582983e-05, + "loss": 0.7695, + "step": 5990 + }, + { + "epoch": 0.8696597456245244, + "grad_norm": 1.7753419876098633, + "learning_rate": 4.1303087403971594e-05, + "loss": 0.8177, + "step": 6000 + }, + { + "epoch": 0.8696597456245244, + "eval_loss": 0.8002915382385254, + "eval_runtime": 670.9604, + "eval_samples_per_second": 51.411, + "eval_steps_per_second": 2.571, + "eval_token_accuracy": 0.00040709041814346184, + "step": 6000 + }, + { + "epoch": 0.8711091785338986, + "grad_norm": 0.7696251273155212, + "learning_rate": 4.128859254964488e-05, + "loss": 0.741, + "step": 6010 + }, + { + "epoch": 0.8725586114432728, + "grad_norm": Infinity, + "learning_rate": 4.127554718075084e-05, + "loss": 0.8342, + "step": 6020 + }, + { + "epoch": 0.874008044352647, + "grad_norm": 0.7678489089012146, + "learning_rate": 4.126105232642412e-05, + "loss": 0.7815, + "step": 6030 + }, + { + "epoch": 0.8754574772620213, + "grad_norm": 0.6283605694770813, + "learning_rate": 4.1246557472097406e-05, + "loss": 0.8367, + "step": 6040 + }, + { + "epoch": 0.8769069101713954, + "grad_norm": 0.9361746907234192, + "learning_rate": 4.12320626177707e-05, + "loss": 0.8052, + "step": 6050 + }, + { + "epoch": 0.8783563430807696, + "grad_norm": 2.359816074371338, + "learning_rate": 4.121756776344398e-05, + "loss": 0.8143, + "step": 6060 + }, + { + "epoch": 0.8798057759901439, + "grad_norm": 2.2239186763763428, + "learning_rate": 4.1203072909117266e-05, + "loss": 0.9177, + "step": 6070 + }, + { + "epoch": 0.881255208899518, + "grad_norm": 2.528871536254883, + "learning_rate": 4.118857805479055e-05, + "loss": 0.7642, + "step": 6080 + }, + { + "epoch": 0.8827046418088923, + "grad_norm": 2.7372164726257324, + "learning_rate": 4.1174083200463835e-05, + "loss": 0.8501, + "step": 6090 + }, + { + "epoch": 0.8841540747182665, + "grad_norm": 0.9686703681945801, + "learning_rate": 4.1159588346137126e-05, + "loss": 0.9077, + "step": 6100 + }, + { + "epoch": 0.8856035076276407, + "grad_norm": 0.7706042528152466, + "learning_rate": 4.114509349181041e-05, + "loss": 0.7017, + "step": 6110 + }, + { + "epoch": 0.8870529405370149, + "grad_norm": 0.7924395203590393, + "learning_rate": 4.1130598637483695e-05, + "loss": 0.7459, + "step": 6120 + }, + { + "epoch": 0.888502373446389, + "grad_norm": 1.8335968255996704, + "learning_rate": 4.1116103783156986e-05, + "loss": 0.8256, + "step": 6130 + }, + { + "epoch": 0.8899518063557633, + "grad_norm": 1.7597600221633911, + "learning_rate": 4.110160892883026e-05, + "loss": 0.782, + "step": 6140 + }, + { + "epoch": 0.8914012392651375, + "grad_norm": 1.0810132026672363, + "learning_rate": 4.1087114074503554e-05, + "loss": 0.8474, + "step": 6150 + }, + { + "epoch": 0.8928506721745118, + "grad_norm": 1.7811000347137451, + "learning_rate": 4.107261922017684e-05, + "loss": 0.8355, + "step": 6160 + }, + { + "epoch": 0.8943001050838859, + "grad_norm": 2.644036054611206, + "learning_rate": 4.105812436585012e-05, + "loss": 0.7535, + "step": 6170 + }, + { + "epoch": 0.8957495379932602, + "grad_norm": 1.6075462102890015, + "learning_rate": 4.1043629511523414e-05, + "loss": 0.7093, + "step": 6180 + }, + { + "epoch": 0.8971989709026343, + "grad_norm": 1.9953532218933105, + "learning_rate": 4.10291346571967e-05, + "loss": 0.8419, + "step": 6190 + }, + { + "epoch": 0.8986484038120085, + "grad_norm": 2.267934799194336, + "learning_rate": 4.101463980286998e-05, + "loss": 0.8523, + "step": 6200 + }, + { + "epoch": 0.9000978367213828, + "grad_norm": 1.9035049676895142, + "learning_rate": 4.100014494854327e-05, + "loss": 0.7483, + "step": 6210 + }, + { + "epoch": 0.9015472696307569, + "grad_norm": 0.8251744508743286, + "learning_rate": 4.098565009421655e-05, + "loss": 0.8831, + "step": 6220 + }, + { + "epoch": 0.9029967025401312, + "grad_norm": 2.1203315258026123, + "learning_rate": 4.097115523988984e-05, + "loss": 0.7755, + "step": 6230 + }, + { + "epoch": 0.9044461354495054, + "grad_norm": 0.8929731249809265, + "learning_rate": 4.095666038556313e-05, + "loss": 0.8435, + "step": 6240 + }, + { + "epoch": 0.9058955683588796, + "grad_norm": 2.2965400218963623, + "learning_rate": 4.094216553123642e-05, + "loss": 0.8037, + "step": 6250 + }, + { + "epoch": 0.9073450012682538, + "grad_norm": 0.6761788725852966, + "learning_rate": 4.0927670676909696e-05, + "loss": 0.8395, + "step": 6260 + }, + { + "epoch": 0.908794434177628, + "grad_norm": 0.7339790463447571, + "learning_rate": 4.091317582258298e-05, + "loss": 0.8268, + "step": 6270 + }, + { + "epoch": 0.9102438670870022, + "grad_norm": 0.9395334720611572, + "learning_rate": 4.089868096825627e-05, + "loss": 0.7619, + "step": 6280 + }, + { + "epoch": 0.9116932999963764, + "grad_norm": 1.431176781654358, + "learning_rate": 4.0884186113929556e-05, + "loss": 0.7839, + "step": 6290 + }, + { + "epoch": 0.9131427329057507, + "grad_norm": 0.9616233706474304, + "learning_rate": 4.086969125960285e-05, + "loss": 0.8028, + "step": 6300 + }, + { + "epoch": 0.9145921658151248, + "grad_norm": 1.2412314414978027, + "learning_rate": 4.085519640527613e-05, + "loss": 0.9369, + "step": 6310 + }, + { + "epoch": 0.9160415987244991, + "grad_norm": 0.7780980467796326, + "learning_rate": 4.0840701550949416e-05, + "loss": 0.8283, + "step": 6320 + }, + { + "epoch": 0.9174910316338732, + "grad_norm": 2.292849540710449, + "learning_rate": 4.08262066966227e-05, + "loss": 0.8124, + "step": 6330 + }, + { + "epoch": 0.9189404645432474, + "grad_norm": 2.3807780742645264, + "learning_rate": 4.0811711842295984e-05, + "loss": 0.7707, + "step": 6340 + }, + { + "epoch": 0.9203898974526217, + "grad_norm": 0.8536470532417297, + "learning_rate": 4.0797216987969275e-05, + "loss": 0.868, + "step": 6350 + }, + { + "epoch": 0.9218393303619958, + "grad_norm": 3.343093156814575, + "learning_rate": 4.078272213364256e-05, + "loss": 0.8104, + "step": 6360 + }, + { + "epoch": 0.9232887632713701, + "grad_norm": 0.824166476726532, + "learning_rate": 4.0768227279315844e-05, + "loss": 0.7457, + "step": 6370 + }, + { + "epoch": 0.9247381961807443, + "grad_norm": 0.8960769772529602, + "learning_rate": 4.0753732424989135e-05, + "loss": 0.7728, + "step": 6380 + }, + { + "epoch": 0.9261876290901185, + "grad_norm": 0.7696554064750671, + "learning_rate": 4.073923757066241e-05, + "loss": 0.8353, + "step": 6390 + }, + { + "epoch": 0.9276370619994927, + "grad_norm": 3.4166648387908936, + "learning_rate": 4.0724742716335704e-05, + "loss": 0.8143, + "step": 6400 + }, + { + "epoch": 0.9290864949088669, + "grad_norm": 2.698413133621216, + "learning_rate": 4.071024786200899e-05, + "loss": 0.7716, + "step": 6410 + }, + { + "epoch": 0.9305359278182411, + "grad_norm": 1.7810639142990112, + "learning_rate": 4.069575300768227e-05, + "loss": 0.8451, + "step": 6420 + }, + { + "epoch": 0.9319853607276153, + "grad_norm": 2.0858755111694336, + "learning_rate": 4.0681258153355564e-05, + "loss": 0.8048, + "step": 6430 + }, + { + "epoch": 0.9334347936369896, + "grad_norm": 2.5020108222961426, + "learning_rate": 4.066676329902885e-05, + "loss": 0.8316, + "step": 6440 + }, + { + "epoch": 0.9348842265463637, + "grad_norm": 0.8029810190200806, + "learning_rate": 4.065226844470213e-05, + "loss": 0.8335, + "step": 6450 + }, + { + "epoch": 0.936333659455738, + "grad_norm": 0.7290728688240051, + "learning_rate": 4.063777359037542e-05, + "loss": 0.8842, + "step": 6460 + }, + { + "epoch": 0.9377830923651121, + "grad_norm": 0.8825303316116333, + "learning_rate": 4.06232787360487e-05, + "loss": 0.7534, + "step": 6470 + }, + { + "epoch": 0.9392325252744863, + "grad_norm": 2.034435510635376, + "learning_rate": 4.060878388172199e-05, + "loss": 0.7776, + "step": 6480 + }, + { + "epoch": 0.9406819581838606, + "grad_norm": 1.9346160888671875, + "learning_rate": 4.059428902739528e-05, + "loss": 0.8008, + "step": 6490 + }, + { + "epoch": 0.9421313910932347, + "grad_norm": 0.8853064179420471, + "learning_rate": 4.057979417306856e-05, + "loss": 0.8289, + "step": 6500 + }, + { + "epoch": 0.943580824002609, + "grad_norm": 2.0305445194244385, + "learning_rate": 4.056529931874185e-05, + "loss": 0.8135, + "step": 6510 + }, + { + "epoch": 0.9450302569119832, + "grad_norm": 2.7980148792266846, + "learning_rate": 4.055080446441513e-05, + "loss": 0.8089, + "step": 6520 + }, + { + "epoch": 0.9464796898213574, + "grad_norm": 1.0745283365249634, + "learning_rate": 4.053630961008842e-05, + "loss": 0.9252, + "step": 6530 + }, + { + "epoch": 0.9479291227307316, + "grad_norm": 0.9550811648368835, + "learning_rate": 4.0521814755761705e-05, + "loss": 0.7947, + "step": 6540 + }, + { + "epoch": 0.9493785556401058, + "grad_norm": 2.408703088760376, + "learning_rate": 4.050731990143499e-05, + "loss": 0.7917, + "step": 6550 + }, + { + "epoch": 0.95082798854948, + "grad_norm": 1.9390877485275269, + "learning_rate": 4.049282504710828e-05, + "loss": 0.8671, + "step": 6560 + }, + { + "epoch": 0.9522774214588542, + "grad_norm": 1.9944474697113037, + "learning_rate": 4.0478330192781565e-05, + "loss": 0.7685, + "step": 6570 + }, + { + "epoch": 0.9537268543682285, + "grad_norm": 2.1434199810028076, + "learning_rate": 4.0463835338454856e-05, + "loss": 0.7979, + "step": 6580 + }, + { + "epoch": 0.9551762872776026, + "grad_norm": 2.005647897720337, + "learning_rate": 4.0449340484128134e-05, + "loss": 0.7932, + "step": 6590 + }, + { + "epoch": 0.9566257201869769, + "grad_norm": 1.3373275995254517, + "learning_rate": 4.043484562980142e-05, + "loss": 0.7332, + "step": 6600 + }, + { + "epoch": 0.958075153096351, + "grad_norm": 3.3556673526763916, + "learning_rate": 4.042035077547471e-05, + "loss": 0.7623, + "step": 6610 + }, + { + "epoch": 0.9595245860057252, + "grad_norm": 0.8822040557861328, + "learning_rate": 4.0405855921147994e-05, + "loss": 0.8023, + "step": 6620 + }, + { + "epoch": 0.9609740189150995, + "grad_norm": 1.7681927680969238, + "learning_rate": 4.0391361066821285e-05, + "loss": 0.8174, + "step": 6630 + }, + { + "epoch": 0.9624234518244736, + "grad_norm": 0.8971022963523865, + "learning_rate": 4.037686621249457e-05, + "loss": 0.7967, + "step": 6640 + }, + { + "epoch": 0.9638728847338479, + "grad_norm": 2.181123733520508, + "learning_rate": 4.036237135816785e-05, + "loss": 0.7561, + "step": 6650 + }, + { + "epoch": 0.9653223176432221, + "grad_norm": 2.5860283374786377, + "learning_rate": 4.034787650384114e-05, + "loss": 0.741, + "step": 6660 + }, + { + "epoch": 0.9667717505525963, + "grad_norm": 2.3298215866088867, + "learning_rate": 4.033338164951442e-05, + "loss": 0.8468, + "step": 6670 + }, + { + "epoch": 0.9682211834619705, + "grad_norm": 2.238698959350586, + "learning_rate": 4.0318886795187713e-05, + "loss": 0.7435, + "step": 6680 + }, + { + "epoch": 0.9696706163713447, + "grad_norm": 2.973845958709717, + "learning_rate": 4.0304391940861e-05, + "loss": 0.8298, + "step": 6690 + }, + { + "epoch": 0.9711200492807189, + "grad_norm": 0.7994486689567566, + "learning_rate": 4.028989708653428e-05, + "loss": 0.7821, + "step": 6700 + }, + { + "epoch": 0.9725694821900931, + "grad_norm": 2.1712687015533447, + "learning_rate": 4.027540223220757e-05, + "loss": 0.7828, + "step": 6710 + }, + { + "epoch": 0.9740189150994674, + "grad_norm": 1.6569911241531372, + "learning_rate": 4.026090737788085e-05, + "loss": 0.7252, + "step": 6720 + }, + { + "epoch": 0.9754683480088415, + "grad_norm": 2.9206511974334717, + "learning_rate": 4.024641252355414e-05, + "loss": 0.904, + "step": 6730 + }, + { + "epoch": 0.9769177809182158, + "grad_norm": 0.7334924340248108, + "learning_rate": 4.0231917669227426e-05, + "loss": 0.8184, + "step": 6740 + }, + { + "epoch": 0.97836721382759, + "grad_norm": 0.9845920205116272, + "learning_rate": 4.021742281490071e-05, + "loss": 0.7562, + "step": 6750 + }, + { + "epoch": 0.9798166467369641, + "grad_norm": 0.7724540829658508, + "learning_rate": 4.0202927960574e-05, + "loss": 0.8182, + "step": 6760 + }, + { + "epoch": 0.9812660796463384, + "grad_norm": 2.2663064002990723, + "learning_rate": 4.0188433106247286e-05, + "loss": 0.7824, + "step": 6770 + }, + { + "epoch": 0.9827155125557125, + "grad_norm": 1.2169333696365356, + "learning_rate": 4.017393825192057e-05, + "loss": 0.759, + "step": 6780 + }, + { + "epoch": 0.9841649454650868, + "grad_norm": 0.6806200742721558, + "learning_rate": 4.0159443397593855e-05, + "loss": 0.8752, + "step": 6790 + }, + { + "epoch": 0.985614378374461, + "grad_norm": 1.1035833358764648, + "learning_rate": 4.014494854326714e-05, + "loss": 0.8627, + "step": 6800 + }, + { + "epoch": 0.9870638112838352, + "grad_norm": 0.8531244397163391, + "learning_rate": 4.013045368894043e-05, + "loss": 0.7522, + "step": 6810 + }, + { + "epoch": 0.9885132441932094, + "grad_norm": 2.015822410583496, + "learning_rate": 4.0115958834613715e-05, + "loss": 0.8365, + "step": 6820 + }, + { + "epoch": 0.9899626771025836, + "grad_norm": 2.455010414123535, + "learning_rate": 4.0101463980287e-05, + "loss": 0.8219, + "step": 6830 + }, + { + "epoch": 0.9914121100119578, + "grad_norm": 3.0342283248901367, + "learning_rate": 4.008696912596029e-05, + "loss": 0.8452, + "step": 6840 + }, + { + "epoch": 0.992861542921332, + "grad_norm": 2.531339406967163, + "learning_rate": 4.007247427163357e-05, + "loss": 0.8152, + "step": 6850 + }, + { + "epoch": 0.9943109758307063, + "grad_norm": 0.8472199440002441, + "learning_rate": 4.005797941730686e-05, + "loss": 0.7319, + "step": 6860 + }, + { + "epoch": 0.9957604087400804, + "grad_norm": 2.572490930557251, + "learning_rate": 4.0043484562980143e-05, + "loss": 0.8502, + "step": 6870 + }, + { + "epoch": 0.9972098416494547, + "grad_norm": 3.1185014247894287, + "learning_rate": 4.002898970865343e-05, + "loss": 0.776, + "step": 6880 + }, + { + "epoch": 0.9986592745588289, + "grad_norm": 2.1352272033691406, + "learning_rate": 4.001449485432672e-05, + "loss": 0.7082, + "step": 6890 + }, + { + "epoch": 1.0001087074682031, + "grad_norm": Infinity, + "learning_rate": 4.000144948543267e-05, + "loss": 0.8426, + "step": 6900 + }, + { + "epoch": 1.0015581403775773, + "grad_norm": 1.6490182876586914, + "learning_rate": 3.998695463110596e-05, + "loss": 0.6672, + "step": 6910 + }, + { + "epoch": 1.0030075732869514, + "grad_norm": 1.2665408849716187, + "learning_rate": 3.997245977677925e-05, + "loss": 0.6411, + "step": 6920 + }, + { + "epoch": 1.0044570061963256, + "grad_norm": 1.9385483264923096, + "learning_rate": 3.995796492245253e-05, + "loss": 0.6512, + "step": 6930 + }, + { + "epoch": 1.0059064391057, + "grad_norm": 1.9456660747528076, + "learning_rate": 3.9943470068125816e-05, + "loss": 0.7186, + "step": 6940 + }, + { + "epoch": 1.0073558720150741, + "grad_norm": 0.8936344981193542, + "learning_rate": 3.99289752137991e-05, + "loss": 0.7411, + "step": 6950 + }, + { + "epoch": 1.0088053049244483, + "grad_norm": 0.8895001411437988, + "learning_rate": 3.991448035947239e-05, + "loss": 0.732, + "step": 6960 + }, + { + "epoch": 1.0102547378338225, + "grad_norm": 2.4484055042266846, + "learning_rate": 3.9899985505145676e-05, + "loss": 0.6834, + "step": 6970 + }, + { + "epoch": 1.0117041707431966, + "grad_norm": 1.6009628772735596, + "learning_rate": 3.988549065081896e-05, + "loss": 0.7056, + "step": 6980 + }, + { + "epoch": 1.013153603652571, + "grad_norm": 0.817646324634552, + "learning_rate": 3.987099579649225e-05, + "loss": 0.7023, + "step": 6990 + }, + { + "epoch": 1.0146030365619452, + "grad_norm": 2.423677444458008, + "learning_rate": 3.985650094216553e-05, + "loss": 0.6912, + "step": 7000 + }, + { + "epoch": 1.0146030365619452, + "eval_loss": 0.792147159576416, + "eval_runtime": 670.8402, + "eval_samples_per_second": 51.421, + "eval_steps_per_second": 2.571, + "eval_token_accuracy": 0.0003976007560848035, + "step": 7000 + }, + { + "epoch": 1.0160524694713193, + "grad_norm": 0.8038765788078308, + "learning_rate": 3.984200608783882e-05, + "loss": 0.7457, + "step": 7010 + }, + { + "epoch": 1.0175019023806935, + "grad_norm": 2.2214059829711914, + "learning_rate": 3.9827511233512104e-05, + "loss": 0.6803, + "step": 7020 + }, + { + "epoch": 1.0189513352900677, + "grad_norm": 0.8514269590377808, + "learning_rate": 3.981301637918539e-05, + "loss": 0.7552, + "step": 7030 + }, + { + "epoch": 1.020400768199442, + "grad_norm": 0.9722759127616882, + "learning_rate": 3.979852152485868e-05, + "loss": 0.6756, + "step": 7040 + }, + { + "epoch": 1.0218502011088162, + "grad_norm": 2.627790689468384, + "learning_rate": 3.9784026670531964e-05, + "loss": 0.7058, + "step": 7050 + }, + { + "epoch": 1.0232996340181904, + "grad_norm": 0.8177839517593384, + "learning_rate": 3.976953181620525e-05, + "loss": 0.6768, + "step": 7060 + }, + { + "epoch": 1.0247490669275645, + "grad_norm": 0.7263636589050293, + "learning_rate": 3.975503696187853e-05, + "loss": 0.6945, + "step": 7070 + }, + { + "epoch": 1.026198499836939, + "grad_norm": 1.9415383338928223, + "learning_rate": 3.974054210755182e-05, + "loss": 0.6608, + "step": 7080 + }, + { + "epoch": 1.027647932746313, + "grad_norm": 0.9528020024299622, + "learning_rate": 3.972604725322511e-05, + "loss": 0.6775, + "step": 7090 + }, + { + "epoch": 1.0290973656556872, + "grad_norm": 1.0678958892822266, + "learning_rate": 3.971155239889839e-05, + "loss": 0.7564, + "step": 7100 + }, + { + "epoch": 1.0305467985650614, + "grad_norm": 1.6998884677886963, + "learning_rate": 3.9697057544571684e-05, + "loss": 0.8101, + "step": 7110 + }, + { + "epoch": 1.0319962314744355, + "grad_norm": 0.9464067816734314, + "learning_rate": 3.968256269024497e-05, + "loss": 0.6839, + "step": 7120 + }, + { + "epoch": 1.03344566438381, + "grad_norm": 0.9953280091285706, + "learning_rate": 3.9668067835918246e-05, + "loss": 0.6776, + "step": 7130 + }, + { + "epoch": 1.034895097293184, + "grad_norm": 2.3919310569763184, + "learning_rate": 3.965357298159154e-05, + "loss": 0.7536, + "step": 7140 + }, + { + "epoch": 1.0363445302025582, + "grad_norm": 2.497450113296509, + "learning_rate": 3.963907812726482e-05, + "loss": 0.7147, + "step": 7150 + }, + { + "epoch": 1.0377939631119324, + "grad_norm": 0.7969416379928589, + "learning_rate": 3.962458327293811e-05, + "loss": 0.7205, + "step": 7160 + }, + { + "epoch": 1.0392433960213068, + "grad_norm": 2.086747884750366, + "learning_rate": 3.9610088418611397e-05, + "loss": 0.6671, + "step": 7170 + }, + { + "epoch": 1.040692828930681, + "grad_norm": 0.8285253643989563, + "learning_rate": 3.959559356428468e-05, + "loss": 0.6685, + "step": 7180 + }, + { + "epoch": 1.042142261840055, + "grad_norm": 0.897041380405426, + "learning_rate": 3.9581098709957965e-05, + "loss": 0.6681, + "step": 7190 + }, + { + "epoch": 1.0435916947494293, + "grad_norm": 0.9613838195800781, + "learning_rate": 3.956660385563125e-05, + "loss": 0.5794, + "step": 7200 + }, + { + "epoch": 1.0450411276588034, + "grad_norm": 0.9189807176589966, + "learning_rate": 3.955210900130454e-05, + "loss": 0.6748, + "step": 7210 + }, + { + "epoch": 1.0464905605681778, + "grad_norm": 2.2880289554595947, + "learning_rate": 3.9537614146977825e-05, + "loss": 0.6471, + "step": 7220 + }, + { + "epoch": 1.047939993477552, + "grad_norm": 2.2064247131347656, + "learning_rate": 3.952311929265111e-05, + "loss": 0.7417, + "step": 7230 + }, + { + "epoch": 1.0493894263869261, + "grad_norm": 1.0736846923828125, + "learning_rate": 3.95086244383244e-05, + "loss": 0.6161, + "step": 7240 + }, + { + "epoch": 1.0508388592963003, + "grad_norm": 0.9019289612770081, + "learning_rate": 3.9494129583997685e-05, + "loss": 0.7279, + "step": 7250 + }, + { + "epoch": 1.0522882922056744, + "grad_norm": 0.6698160767555237, + "learning_rate": 3.947963472967097e-05, + "loss": 0.6323, + "step": 7260 + }, + { + "epoch": 1.0537377251150488, + "grad_norm": 2.861865282058716, + "learning_rate": 3.9465139875344254e-05, + "loss": 0.7293, + "step": 7270 + }, + { + "epoch": 1.055187158024423, + "grad_norm": 0.9664188027381897, + "learning_rate": 3.945064502101754e-05, + "loss": 0.7297, + "step": 7280 + }, + { + "epoch": 1.0566365909337971, + "grad_norm": 3.0016567707061768, + "learning_rate": 3.943615016669083e-05, + "loss": 0.7518, + "step": 7290 + }, + { + "epoch": 1.0580860238431713, + "grad_norm": 0.9263351559638977, + "learning_rate": 3.9421655312364114e-05, + "loss": 0.7422, + "step": 7300 + }, + { + "epoch": 1.0595354567525455, + "grad_norm": 1.9621021747589111, + "learning_rate": 3.94071604580374e-05, + "loss": 0.6925, + "step": 7310 + }, + { + "epoch": 1.0609848896619198, + "grad_norm": 0.8665259480476379, + "learning_rate": 3.939266560371068e-05, + "loss": 0.7076, + "step": 7320 + }, + { + "epoch": 1.062434322571294, + "grad_norm": 0.8874248266220093, + "learning_rate": 3.937817074938397e-05, + "loss": 0.7928, + "step": 7330 + }, + { + "epoch": 1.0638837554806682, + "grad_norm": 0.9791090488433838, + "learning_rate": 3.936367589505726e-05, + "loss": 0.5994, + "step": 7340 + }, + { + "epoch": 1.0653331883900423, + "grad_norm": 2.642836809158325, + "learning_rate": 3.934918104073054e-05, + "loss": 0.7622, + "step": 7350 + }, + { + "epoch": 1.0667826212994167, + "grad_norm": 2.5333988666534424, + "learning_rate": 3.9334686186403827e-05, + "loss": 0.6799, + "step": 7360 + }, + { + "epoch": 1.0682320542087909, + "grad_norm": 1.0640588998794556, + "learning_rate": 3.932019133207712e-05, + "loss": 0.7797, + "step": 7370 + }, + { + "epoch": 1.069681487118165, + "grad_norm": 0.8988319039344788, + "learning_rate": 3.9305696477750395e-05, + "loss": 0.7214, + "step": 7380 + }, + { + "epoch": 1.0711309200275392, + "grad_norm": 0.9383190870285034, + "learning_rate": 3.9291201623423686e-05, + "loss": 0.758, + "step": 7390 + }, + { + "epoch": 1.0725803529369133, + "grad_norm": 0.8509389758110046, + "learning_rate": 3.927670676909697e-05, + "loss": 0.6936, + "step": 7400 + }, + { + "epoch": 1.0740297858462877, + "grad_norm": 0.8831974864006042, + "learning_rate": 3.9262211914770255e-05, + "loss": 0.7335, + "step": 7410 + }, + { + "epoch": 1.0754792187556619, + "grad_norm": 2.2875640392303467, + "learning_rate": 3.9247717060443546e-05, + "loss": 0.6867, + "step": 7420 + }, + { + "epoch": 1.076928651665036, + "grad_norm": 2.576653242111206, + "learning_rate": 3.923322220611683e-05, + "loss": 0.7765, + "step": 7430 + }, + { + "epoch": 1.0783780845744102, + "grad_norm": 3.6591567993164062, + "learning_rate": 3.921872735179012e-05, + "loss": 0.6902, + "step": 7440 + }, + { + "epoch": 1.0798275174837846, + "grad_norm": 1.969195008277893, + "learning_rate": 3.92042324974634e-05, + "loss": 0.6604, + "step": 7450 + }, + { + "epoch": 1.0812769503931587, + "grad_norm": 1.8259871006011963, + "learning_rate": 3.9189737643136684e-05, + "loss": 0.7414, + "step": 7460 + }, + { + "epoch": 1.082726383302533, + "grad_norm": 1.0266244411468506, + "learning_rate": 3.9175242788809975e-05, + "loss": 0.6822, + "step": 7470 + }, + { + "epoch": 1.084175816211907, + "grad_norm": 0.9187620878219604, + "learning_rate": 3.916074793448326e-05, + "loss": 0.7962, + "step": 7480 + }, + { + "epoch": 1.0856252491212812, + "grad_norm": 2.446805000305176, + "learning_rate": 3.914625308015655e-05, + "loss": 0.6669, + "step": 7490 + }, + { + "epoch": 1.0870746820306556, + "grad_norm": 1.7605246305465698, + "learning_rate": 3.9131758225829835e-05, + "loss": 0.6893, + "step": 7500 + }, + { + "epoch": 1.0885241149400298, + "grad_norm": 1.388551950454712, + "learning_rate": 3.911726337150311e-05, + "loss": 0.7815, + "step": 7510 + }, + { + "epoch": 1.089973547849404, + "grad_norm": 0.932859480381012, + "learning_rate": 3.91027685171764e-05, + "loss": 0.7375, + "step": 7520 + }, + { + "epoch": 1.091422980758778, + "grad_norm": 0.7429149150848389, + "learning_rate": 3.908827366284969e-05, + "loss": 0.6903, + "step": 7530 + }, + { + "epoch": 1.0928724136681522, + "grad_norm": 1.107605218887329, + "learning_rate": 3.907377880852298e-05, + "loss": 0.8292, + "step": 7540 + }, + { + "epoch": 1.0943218465775266, + "grad_norm": 1.9951454401016235, + "learning_rate": 3.905928395419626e-05, + "loss": 0.639, + "step": 7550 + }, + { + "epoch": 1.0957712794869008, + "grad_norm": 3.669023275375366, + "learning_rate": 3.904478909986955e-05, + "loss": 0.7677, + "step": 7560 + }, + { + "epoch": 1.097220712396275, + "grad_norm": 0.930507242679596, + "learning_rate": 3.903029424554284e-05, + "loss": 0.6948, + "step": 7570 + }, + { + "epoch": 1.098670145305649, + "grad_norm": 2.206545829772949, + "learning_rate": 3.9015799391216116e-05, + "loss": 0.746, + "step": 7580 + }, + { + "epoch": 1.1001195782150233, + "grad_norm": 2.361729860305786, + "learning_rate": 3.900130453688941e-05, + "loss": 0.6641, + "step": 7590 + }, + { + "epoch": 1.1015690111243976, + "grad_norm": 0.863552451133728, + "learning_rate": 3.898680968256269e-05, + "loss": 0.7508, + "step": 7600 + }, + { + "epoch": 1.1030184440337718, + "grad_norm": 0.9006494283676147, + "learning_rate": 3.8972314828235976e-05, + "loss": 0.6622, + "step": 7610 + }, + { + "epoch": 1.104467876943146, + "grad_norm": 1.0481303930282593, + "learning_rate": 3.895781997390927e-05, + "loss": 0.6852, + "step": 7620 + }, + { + "epoch": 1.1059173098525201, + "grad_norm": 2.5158543586730957, + "learning_rate": 3.894332511958255e-05, + "loss": 0.7763, + "step": 7630 + }, + { + "epoch": 1.1073667427618945, + "grad_norm": 2.5790767669677734, + "learning_rate": 3.8928830265255836e-05, + "loss": 0.6905, + "step": 7640 + }, + { + "epoch": 1.1088161756712687, + "grad_norm": 0.8664813041687012, + "learning_rate": 3.891433541092912e-05, + "loss": 0.6791, + "step": 7650 + }, + { + "epoch": 1.1102656085806428, + "grad_norm": 0.7020731568336487, + "learning_rate": 3.8899840556602405e-05, + "loss": 0.6716, + "step": 7660 + }, + { + "epoch": 1.111715041490017, + "grad_norm": 0.7595298886299133, + "learning_rate": 3.8885345702275696e-05, + "loss": 0.7458, + "step": 7670 + }, + { + "epoch": 1.1131644743993911, + "grad_norm": 1.0612444877624512, + "learning_rate": 3.887085084794898e-05, + "loss": 0.6897, + "step": 7680 + }, + { + "epoch": 1.1146139073087655, + "grad_norm": 1.821798324584961, + "learning_rate": 3.8856355993622265e-05, + "loss": 0.6691, + "step": 7690 + }, + { + "epoch": 1.1160633402181397, + "grad_norm": 2.0372657775878906, + "learning_rate": 3.8841861139295556e-05, + "loss": 0.7072, + "step": 7700 + }, + { + "epoch": 1.1175127731275138, + "grad_norm": 0.9012567400932312, + "learning_rate": 3.882736628496883e-05, + "loss": 0.6702, + "step": 7710 + }, + { + "epoch": 1.118962206036888, + "grad_norm": 0.9007511734962463, + "learning_rate": 3.8812871430642124e-05, + "loss": 0.6839, + "step": 7720 + }, + { + "epoch": 1.1204116389462624, + "grad_norm": 1.0055979490280151, + "learning_rate": 3.879837657631541e-05, + "loss": 0.7102, + "step": 7730 + }, + { + "epoch": 1.1218610718556365, + "grad_norm": 3.4122612476348877, + "learning_rate": 3.878388172198869e-05, + "loss": 0.7228, + "step": 7740 + }, + { + "epoch": 1.1233105047650107, + "grad_norm": 0.8962409496307373, + "learning_rate": 3.8769386867661984e-05, + "loss": 0.6315, + "step": 7750 + }, + { + "epoch": 1.1247599376743849, + "grad_norm": 1.3250775337219238, + "learning_rate": 3.875489201333527e-05, + "loss": 0.7067, + "step": 7760 + }, + { + "epoch": 1.126209370583759, + "grad_norm": 0.8613935112953186, + "learning_rate": 3.874039715900855e-05, + "loss": 0.6161, + "step": 7770 + }, + { + "epoch": 1.1276588034931332, + "grad_norm": 2.337137222290039, + "learning_rate": 3.872590230468184e-05, + "loss": 0.7833, + "step": 7780 + }, + { + "epoch": 1.1291082364025076, + "grad_norm": 1.5880218744277954, + "learning_rate": 3.871140745035512e-05, + "loss": 0.7372, + "step": 7790 + }, + { + "epoch": 1.1305576693118817, + "grad_norm": 0.8534262776374817, + "learning_rate": 3.869691259602841e-05, + "loss": 0.6467, + "step": 7800 + }, + { + "epoch": 1.132007102221256, + "grad_norm": 0.7763128280639648, + "learning_rate": 3.86824177417017e-05, + "loss": 0.7475, + "step": 7810 + }, + { + "epoch": 1.13345653513063, + "grad_norm": 1.5850907564163208, + "learning_rate": 3.866792288737499e-05, + "loss": 0.6969, + "step": 7820 + }, + { + "epoch": 1.1349059680400044, + "grad_norm": 1.0109902620315552, + "learning_rate": 3.865342803304827e-05, + "loss": 0.6547, + "step": 7830 + }, + { + "epoch": 1.1363554009493786, + "grad_norm": 2.639951705932617, + "learning_rate": 3.863893317872155e-05, + "loss": 0.6913, + "step": 7840 + }, + { + "epoch": 1.1378048338587528, + "grad_norm": 2.2629740238189697, + "learning_rate": 3.862443832439484e-05, + "loss": 0.7458, + "step": 7850 + }, + { + "epoch": 1.139254266768127, + "grad_norm": 3.484891176223755, + "learning_rate": 3.8609943470068126e-05, + "loss": 0.7227, + "step": 7860 + }, + { + "epoch": 1.140703699677501, + "grad_norm": 3.3114774227142334, + "learning_rate": 3.859544861574142e-05, + "loss": 0.6682, + "step": 7870 + }, + { + "epoch": 1.1421531325868755, + "grad_norm": 0.9345071911811829, + "learning_rate": 3.85809537614147e-05, + "loss": 0.6499, + "step": 7880 + }, + { + "epoch": 1.1436025654962496, + "grad_norm": 2.3660013675689697, + "learning_rate": 3.8566458907087986e-05, + "loss": 0.65, + "step": 7890 + }, + { + "epoch": 1.1450519984056238, + "grad_norm": 0.6031567454338074, + "learning_rate": 3.855196405276127e-05, + "loss": 0.6494, + "step": 7900 + }, + { + "epoch": 1.146501431314998, + "grad_norm": 1.2019445896148682, + "learning_rate": 3.8537469198434554e-05, + "loss": 0.7211, + "step": 7910 + }, + { + "epoch": 1.1479508642243723, + "grad_norm": 1.701231598854065, + "learning_rate": 3.8522974344107845e-05, + "loss": 0.7197, + "step": 7920 + }, + { + "epoch": 1.1494002971337465, + "grad_norm": 2.265749931335449, + "learning_rate": 3.850847948978113e-05, + "loss": 0.7113, + "step": 7930 + }, + { + "epoch": 1.1508497300431206, + "grad_norm": 0.8541210293769836, + "learning_rate": 3.8493984635454414e-05, + "loss": 0.7063, + "step": 7940 + }, + { + "epoch": 1.1522991629524948, + "grad_norm": 1.4700528383255005, + "learning_rate": 3.8479489781127705e-05, + "loss": 0.6937, + "step": 7950 + }, + { + "epoch": 1.153748595861869, + "grad_norm": 1.0620396137237549, + "learning_rate": 3.846499492680099e-05, + "loss": 0.7409, + "step": 7960 + }, + { + "epoch": 1.1551980287712433, + "grad_norm": 1.703613519668579, + "learning_rate": 3.8450500072474274e-05, + "loss": 0.6171, + "step": 7970 + }, + { + "epoch": 1.1566474616806175, + "grad_norm": 3.0533642768859863, + "learning_rate": 3.843600521814756e-05, + "loss": 0.78, + "step": 7980 + }, + { + "epoch": 1.1580968945899917, + "grad_norm": 2.968996286392212, + "learning_rate": 3.842151036382084e-05, + "loss": 0.665, + "step": 7990 + }, + { + "epoch": 1.1595463274993658, + "grad_norm": 1.5719397068023682, + "learning_rate": 3.8407015509494134e-05, + "loss": 0.6968, + "step": 8000 + }, + { + "epoch": 1.1595463274993658, + "eval_loss": 0.7815335988998413, + "eval_runtime": 669.2359, + "eval_samples_per_second": 51.544, + "eval_steps_per_second": 2.578, + "eval_token_accuracy": 0.000397922439544419, + "step": 8000 + }, + { + "epoch": 1.1609957604087402, + "grad_norm": 1.8867238759994507, + "learning_rate": 3.839252065516742e-05, + "loss": 0.7018, + "step": 8010 + }, + { + "epoch": 1.1624451933181144, + "grad_norm": 0.8305041193962097, + "learning_rate": 3.83780258008407e-05, + "loss": 0.7591, + "step": 8020 + }, + { + "epoch": 1.1638946262274885, + "grad_norm": 0.9616385102272034, + "learning_rate": 3.836353094651399e-05, + "loss": 0.7355, + "step": 8030 + }, + { + "epoch": 1.1653440591368627, + "grad_norm": 0.797763466835022, + "learning_rate": 3.834903609218727e-05, + "loss": 0.6915, + "step": 8040 + }, + { + "epoch": 1.1667934920462368, + "grad_norm": 0.9838529825210571, + "learning_rate": 3.833454123786056e-05, + "loss": 0.7352, + "step": 8050 + }, + { + "epoch": 1.168242924955611, + "grad_norm": 1.1470749378204346, + "learning_rate": 3.832004638353385e-05, + "loss": 0.7103, + "step": 8060 + }, + { + "epoch": 1.1696923578649854, + "grad_norm": 1.6150496006011963, + "learning_rate": 3.830555152920713e-05, + "loss": 0.713, + "step": 8070 + }, + { + "epoch": 1.1711417907743595, + "grad_norm": 2.4331045150756836, + "learning_rate": 3.829105667488042e-05, + "loss": 0.6846, + "step": 8080 + }, + { + "epoch": 1.1725912236837337, + "grad_norm": 0.7783316969871521, + "learning_rate": 3.827656182055371e-05, + "loss": 0.6121, + "step": 8090 + }, + { + "epoch": 1.1740406565931079, + "grad_norm": 0.7985559701919556, + "learning_rate": 3.826206696622699e-05, + "loss": 0.7786, + "step": 8100 + }, + { + "epoch": 1.1754900895024822, + "grad_norm": 2.05668568611145, + "learning_rate": 3.8247572111900275e-05, + "loss": 0.6657, + "step": 8110 + }, + { + "epoch": 1.1769395224118564, + "grad_norm": 1.422728419303894, + "learning_rate": 3.823307725757356e-05, + "loss": 0.6807, + "step": 8120 + }, + { + "epoch": 1.1783889553212306, + "grad_norm": 0.692974865436554, + "learning_rate": 3.821858240324685e-05, + "loss": 0.8165, + "step": 8130 + }, + { + "epoch": 1.1798383882306047, + "grad_norm": 0.6786725521087646, + "learning_rate": 3.8204087548920135e-05, + "loss": 0.6539, + "step": 8140 + }, + { + "epoch": 1.1812878211399789, + "grad_norm": 1.600859522819519, + "learning_rate": 3.8189592694593426e-05, + "loss": 0.6922, + "step": 8150 + }, + { + "epoch": 1.1827372540493533, + "grad_norm": 1.9323725700378418, + "learning_rate": 3.8175097840266704e-05, + "loss": 0.6823, + "step": 8160 + }, + { + "epoch": 1.1841866869587274, + "grad_norm": 0.9999890327453613, + "learning_rate": 3.816060298593999e-05, + "loss": 0.7203, + "step": 8170 + }, + { + "epoch": 1.1856361198681016, + "grad_norm": 0.8995366096496582, + "learning_rate": 3.814610813161328e-05, + "loss": 0.6565, + "step": 8180 + }, + { + "epoch": 1.1870855527774757, + "grad_norm": 2.447030544281006, + "learning_rate": 3.8131613277286564e-05, + "loss": 0.7161, + "step": 8190 + }, + { + "epoch": 1.1885349856868501, + "grad_norm": 3.476491689682007, + "learning_rate": 3.8117118422959855e-05, + "loss": 0.7355, + "step": 8200 + }, + { + "epoch": 1.1899844185962243, + "grad_norm": 0.8819745779037476, + "learning_rate": 3.810262356863314e-05, + "loss": 0.6723, + "step": 8210 + }, + { + "epoch": 1.1914338515055984, + "grad_norm": 0.8547608256340027, + "learning_rate": 3.8088128714306424e-05, + "loss": 0.739, + "step": 8220 + }, + { + "epoch": 1.1928832844149726, + "grad_norm": 2.171769142150879, + "learning_rate": 3.807363385997971e-05, + "loss": 0.7394, + "step": 8230 + }, + { + "epoch": 1.1943327173243468, + "grad_norm": 0.7322458624839783, + "learning_rate": 3.805913900565299e-05, + "loss": 0.7139, + "step": 8240 + }, + { + "epoch": 1.1957821502337211, + "grad_norm": 2.5103094577789307, + "learning_rate": 3.8044644151326284e-05, + "loss": 0.6333, + "step": 8250 + }, + { + "epoch": 1.1972315831430953, + "grad_norm": 2.6011738777160645, + "learning_rate": 3.803014929699957e-05, + "loss": 0.6952, + "step": 8260 + }, + { + "epoch": 1.1986810160524695, + "grad_norm": 0.9356118440628052, + "learning_rate": 3.801565444267285e-05, + "loss": 0.7426, + "step": 8270 + }, + { + "epoch": 1.2001304489618436, + "grad_norm": 0.8487895131111145, + "learning_rate": 3.8001159588346143e-05, + "loss": 0.6723, + "step": 8280 + }, + { + "epoch": 1.201579881871218, + "grad_norm": 0.6286924481391907, + "learning_rate": 3.798666473401942e-05, + "loss": 0.6607, + "step": 8290 + }, + { + "epoch": 1.2030293147805922, + "grad_norm": 0.8928942680358887, + "learning_rate": 3.797216987969271e-05, + "loss": 0.6233, + "step": 8300 + }, + { + "epoch": 1.2044787476899663, + "grad_norm": 2.156315326690674, + "learning_rate": 3.7957675025365996e-05, + "loss": 0.6939, + "step": 8310 + }, + { + "epoch": 1.2059281805993405, + "grad_norm": 1.7508461475372314, + "learning_rate": 3.794318017103928e-05, + "loss": 0.6814, + "step": 8320 + }, + { + "epoch": 1.2073776135087146, + "grad_norm": 2.0084280967712402, + "learning_rate": 3.792868531671257e-05, + "loss": 0.7559, + "step": 8330 + }, + { + "epoch": 1.2088270464180888, + "grad_norm": 0.8208935260772705, + "learning_rate": 3.7914190462385856e-05, + "loss": 0.6455, + "step": 8340 + }, + { + "epoch": 1.2102764793274632, + "grad_norm": 0.764270007610321, + "learning_rate": 3.789969560805914e-05, + "loss": 0.7392, + "step": 8350 + }, + { + "epoch": 1.2117259122368373, + "grad_norm": 1.3177683353424072, + "learning_rate": 3.7885200753732425e-05, + "loss": 0.7061, + "step": 8360 + }, + { + "epoch": 1.2131753451462115, + "grad_norm": 1.047242283821106, + "learning_rate": 3.787070589940571e-05, + "loss": 0.6914, + "step": 8370 + }, + { + "epoch": 1.2146247780555857, + "grad_norm": 0.938040018081665, + "learning_rate": 3.7856211045079e-05, + "loss": 0.7243, + "step": 8380 + }, + { + "epoch": 1.21607421096496, + "grad_norm": 3.838555335998535, + "learning_rate": 3.7841716190752285e-05, + "loss": 0.7684, + "step": 8390 + }, + { + "epoch": 1.2175236438743342, + "grad_norm": 0.834057629108429, + "learning_rate": 3.782722133642557e-05, + "loss": 0.6682, + "step": 8400 + }, + { + "epoch": 1.2189730767837084, + "grad_norm": 1.0617311000823975, + "learning_rate": 3.781272648209886e-05, + "loss": 0.7431, + "step": 8410 + }, + { + "epoch": 1.2204225096930825, + "grad_norm": 1.9687778949737549, + "learning_rate": 3.779823162777214e-05, + "loss": 0.6855, + "step": 8420 + }, + { + "epoch": 1.2218719426024567, + "grad_norm": 1.4709055423736572, + "learning_rate": 3.778373677344543e-05, + "loss": 0.7468, + "step": 8430 + }, + { + "epoch": 1.223321375511831, + "grad_norm": 1.0255465507507324, + "learning_rate": 3.7769241919118713e-05, + "loss": 0.6915, + "step": 8440 + }, + { + "epoch": 1.2247708084212052, + "grad_norm": 0.910531759262085, + "learning_rate": 3.7754747064792e-05, + "loss": 0.7406, + "step": 8450 + }, + { + "epoch": 1.2262202413305794, + "grad_norm": 1.3547700643539429, + "learning_rate": 3.774025221046529e-05, + "loss": 0.7018, + "step": 8460 + }, + { + "epoch": 1.2276696742399535, + "grad_norm": 1.63336181640625, + "learning_rate": 3.772575735613857e-05, + "loss": 0.6442, + "step": 8470 + }, + { + "epoch": 1.229119107149328, + "grad_norm": 2.7954370975494385, + "learning_rate": 3.771126250181186e-05, + "loss": 0.6536, + "step": 8480 + }, + { + "epoch": 1.230568540058702, + "grad_norm": 1.6382683515548706, + "learning_rate": 3.769676764748514e-05, + "loss": 0.6524, + "step": 8490 + }, + { + "epoch": 1.2320179729680762, + "grad_norm": 1.616657018661499, + "learning_rate": 3.7682272793158426e-05, + "loss": 0.7309, + "step": 8500 + }, + { + "epoch": 1.2334674058774504, + "grad_norm": 0.8286413550376892, + "learning_rate": 3.766777793883172e-05, + "loss": 0.7236, + "step": 8510 + }, + { + "epoch": 1.2349168387868246, + "grad_norm": 2.312886953353882, + "learning_rate": 3.7653283084505e-05, + "loss": 0.6688, + "step": 8520 + }, + { + "epoch": 1.236366271696199, + "grad_norm": 2.2434260845184326, + "learning_rate": 3.763878823017829e-05, + "loss": 0.6509, + "step": 8530 + }, + { + "epoch": 1.237815704605573, + "grad_norm": 0.8880380988121033, + "learning_rate": 3.762429337585158e-05, + "loss": 0.8008, + "step": 8540 + }, + { + "epoch": 1.2392651375149473, + "grad_norm": 3.9391345977783203, + "learning_rate": 3.7609798521524855e-05, + "loss": 0.6917, + "step": 8550 + }, + { + "epoch": 1.2407145704243214, + "grad_norm": 1.7457298040390015, + "learning_rate": 3.7595303667198146e-05, + "loss": 0.7069, + "step": 8560 + }, + { + "epoch": 1.2421640033336958, + "grad_norm": 0.8259201645851135, + "learning_rate": 3.758080881287143e-05, + "loss": 0.7088, + "step": 8570 + }, + { + "epoch": 1.24361343624307, + "grad_norm": 0.7493927478790283, + "learning_rate": 3.756631395854472e-05, + "loss": 0.6286, + "step": 8580 + }, + { + "epoch": 1.2450628691524441, + "grad_norm": 3.048274040222168, + "learning_rate": 3.7551819104218006e-05, + "loss": 0.6848, + "step": 8590 + }, + { + "epoch": 1.2465123020618183, + "grad_norm": 1.0291824340820312, + "learning_rate": 3.753732424989129e-05, + "loss": 0.6626, + "step": 8600 + }, + { + "epoch": 1.2479617349711924, + "grad_norm": 1.0793825387954712, + "learning_rate": 3.7522829395564575e-05, + "loss": 0.6966, + "step": 8610 + }, + { + "epoch": 1.2494111678805666, + "grad_norm": 1.4464988708496094, + "learning_rate": 3.750833454123786e-05, + "loss": 0.6607, + "step": 8620 + }, + { + "epoch": 1.250860600789941, + "grad_norm": 0.8551133871078491, + "learning_rate": 3.749383968691115e-05, + "loss": 0.638, + "step": 8630 + }, + { + "epoch": 1.2523100336993152, + "grad_norm": 2.6740128993988037, + "learning_rate": 3.7479344832584435e-05, + "loss": 0.742, + "step": 8640 + }, + { + "epoch": 1.2537594666086893, + "grad_norm": 1.0272529125213623, + "learning_rate": 3.746484997825772e-05, + "loss": 0.6917, + "step": 8650 + }, + { + "epoch": 1.2552088995180637, + "grad_norm": 0.8823620080947876, + "learning_rate": 3.745035512393101e-05, + "loss": 0.7034, + "step": 8660 + }, + { + "epoch": 1.2566583324274379, + "grad_norm": 0.8183333873748779, + "learning_rate": 3.7435860269604294e-05, + "loss": 0.6761, + "step": 8670 + }, + { + "epoch": 1.258107765336812, + "grad_norm": 2.188427209854126, + "learning_rate": 3.742136541527758e-05, + "loss": 0.6813, + "step": 8680 + }, + { + "epoch": 1.2595571982461862, + "grad_norm": 0.7740078568458557, + "learning_rate": 3.740687056095086e-05, + "loss": 0.7348, + "step": 8690 + }, + { + "epoch": 1.2610066311555603, + "grad_norm": 3.2154736518859863, + "learning_rate": 3.739237570662415e-05, + "loss": 0.7179, + "step": 8700 + }, + { + "epoch": 1.2624560640649345, + "grad_norm": 2.6355783939361572, + "learning_rate": 3.737788085229744e-05, + "loss": 0.6656, + "step": 8710 + }, + { + "epoch": 1.2639054969743089, + "grad_norm": 0.9518022537231445, + "learning_rate": 3.736338599797072e-05, + "loss": 0.66, + "step": 8720 + }, + { + "epoch": 1.265354929883683, + "grad_norm": 0.9941968321800232, + "learning_rate": 3.734889114364401e-05, + "loss": 0.701, + "step": 8730 + }, + { + "epoch": 1.2668043627930572, + "grad_norm": 2.278553009033203, + "learning_rate": 3.733439628931729e-05, + "loss": 0.6655, + "step": 8740 + }, + { + "epoch": 1.2682537957024314, + "grad_norm": 2.2199084758758545, + "learning_rate": 3.7319901434990576e-05, + "loss": 0.642, + "step": 8750 + }, + { + "epoch": 1.2697032286118057, + "grad_norm": 0.855959415435791, + "learning_rate": 3.730540658066387e-05, + "loss": 0.6768, + "step": 8760 + }, + { + "epoch": 1.27115266152118, + "grad_norm": 0.7713690996170044, + "learning_rate": 3.729091172633715e-05, + "loss": 0.7202, + "step": 8770 + }, + { + "epoch": 1.272602094430554, + "grad_norm": 3.6193957328796387, + "learning_rate": 3.7276416872010436e-05, + "loss": 0.8522, + "step": 8780 + }, + { + "epoch": 1.2740515273399282, + "grad_norm": 0.8761569261550903, + "learning_rate": 3.726192201768373e-05, + "loss": 0.6998, + "step": 8790 + }, + { + "epoch": 1.2755009602493024, + "grad_norm": 0.8228587508201599, + "learning_rate": 3.724742716335701e-05, + "loss": 0.6271, + "step": 8800 + }, + { + "epoch": 1.2769503931586765, + "grad_norm": 0.967322826385498, + "learning_rate": 3.7232932309030296e-05, + "loss": 0.677, + "step": 8810 + }, + { + "epoch": 1.278399826068051, + "grad_norm": 0.7491191029548645, + "learning_rate": 3.721843745470358e-05, + "loss": 0.6275, + "step": 8820 + }, + { + "epoch": 1.279849258977425, + "grad_norm": 0.7908267974853516, + "learning_rate": 3.7203942600376864e-05, + "loss": 0.758, + "step": 8830 + }, + { + "epoch": 1.2812986918867992, + "grad_norm": 2.2761118412017822, + "learning_rate": 3.7189447746050156e-05, + "loss": 0.6876, + "step": 8840 + }, + { + "epoch": 1.2827481247961736, + "grad_norm": 1.9194432497024536, + "learning_rate": 3.717495289172344e-05, + "loss": 0.6291, + "step": 8850 + }, + { + "epoch": 1.2841975577055478, + "grad_norm": 0.9832128882408142, + "learning_rate": 3.716045803739673e-05, + "loss": 0.738, + "step": 8860 + }, + { + "epoch": 1.285646990614922, + "grad_norm": 1.0362465381622314, + "learning_rate": 3.714596318307001e-05, + "loss": 0.653, + "step": 8870 + }, + { + "epoch": 1.287096423524296, + "grad_norm": 0.9563276171684265, + "learning_rate": 3.713146832874329e-05, + "loss": 0.712, + "step": 8880 + }, + { + "epoch": 1.2885458564336703, + "grad_norm": 1.062441349029541, + "learning_rate": 3.7116973474416584e-05, + "loss": 0.6401, + "step": 8890 + }, + { + "epoch": 1.2899952893430444, + "grad_norm": 1.0190941095352173, + "learning_rate": 3.710247862008987e-05, + "loss": 0.6558, + "step": 8900 + }, + { + "epoch": 1.2914447222524188, + "grad_norm": 2.2705626487731934, + "learning_rate": 3.708798376576316e-05, + "loss": 0.6885, + "step": 8910 + }, + { + "epoch": 1.292894155161793, + "grad_norm": 2.16426157951355, + "learning_rate": 3.7073488911436444e-05, + "loss": 0.7323, + "step": 8920 + }, + { + "epoch": 1.2943435880711671, + "grad_norm": 0.7907456159591675, + "learning_rate": 3.705899405710973e-05, + "loss": 0.6317, + "step": 8930 + }, + { + "epoch": 1.2957930209805415, + "grad_norm": 0.942328691482544, + "learning_rate": 3.704449920278301e-05, + "loss": 0.7002, + "step": 8940 + }, + { + "epoch": 1.2972424538899157, + "grad_norm": 1.5620898008346558, + "learning_rate": 3.70300043484563e-05, + "loss": 0.6701, + "step": 8950 + }, + { + "epoch": 1.2986918867992898, + "grad_norm": 2.1964192390441895, + "learning_rate": 3.701550949412959e-05, + "loss": 0.6785, + "step": 8960 + }, + { + "epoch": 1.300141319708664, + "grad_norm": 1.622687816619873, + "learning_rate": 3.700101463980287e-05, + "loss": 0.7324, + "step": 8970 + }, + { + "epoch": 1.3015907526180381, + "grad_norm": 0.9879828095436096, + "learning_rate": 3.698651978547616e-05, + "loss": 0.6718, + "step": 8980 + }, + { + "epoch": 1.3030401855274123, + "grad_norm": 0.7968335747718811, + "learning_rate": 3.697202493114945e-05, + "loss": 0.7395, + "step": 8990 + }, + { + "epoch": 1.3044896184367867, + "grad_norm": 1.0809406042099, + "learning_rate": 3.6957530076822726e-05, + "loss": 0.6902, + "step": 9000 + }, + { + "epoch": 1.3044896184367867, + "eval_loss": 0.7758866548538208, + "eval_runtime": 672.1803, + "eval_samples_per_second": 51.318, + "eval_steps_per_second": 2.566, + "eval_token_accuracy": 0.000404195267006922, + "step": 9000 + }, + { + "epoch": 1.3059390513461608, + "grad_norm": 1.9905250072479248, + "learning_rate": 3.694303522249602e-05, + "loss": 0.7069, + "step": 9010 + }, + { + "epoch": 1.307388484255535, + "grad_norm": 3.8968214988708496, + "learning_rate": 3.69285403681693e-05, + "loss": 0.679, + "step": 9020 + }, + { + "epoch": 1.3088379171649092, + "grad_norm": 0.9726070761680603, + "learning_rate": 3.6914045513842586e-05, + "loss": 0.7227, + "step": 9030 + }, + { + "epoch": 1.3102873500742835, + "grad_norm": 2.276339054107666, + "learning_rate": 3.689955065951588e-05, + "loss": 0.7343, + "step": 9040 + }, + { + "epoch": 1.3117367829836577, + "grad_norm": 2.7617475986480713, + "learning_rate": 3.688505580518916e-05, + "loss": 0.7368, + "step": 9050 + }, + { + "epoch": 1.3131862158930319, + "grad_norm": 2.35437273979187, + "learning_rate": 3.6870560950862445e-05, + "loss": 0.7382, + "step": 9060 + }, + { + "epoch": 1.314635648802406, + "grad_norm": 1.8291373252868652, + "learning_rate": 3.685606609653573e-05, + "loss": 0.6743, + "step": 9070 + }, + { + "epoch": 1.3160850817117802, + "grad_norm": 0.8860570788383484, + "learning_rate": 3.6841571242209014e-05, + "loss": 0.7293, + "step": 9080 + }, + { + "epoch": 1.3175345146211543, + "grad_norm": 2.3467090129852295, + "learning_rate": 3.6827076387882305e-05, + "loss": 0.7022, + "step": 9090 + }, + { + "epoch": 1.3189839475305287, + "grad_norm": 0.8134902715682983, + "learning_rate": 3.681258153355559e-05, + "loss": 0.6055, + "step": 9100 + }, + { + "epoch": 1.3204333804399029, + "grad_norm": 1.741827130317688, + "learning_rate": 3.6798086679228874e-05, + "loss": 0.6536, + "step": 9110 + }, + { + "epoch": 1.321882813349277, + "grad_norm": 0.9208307266235352, + "learning_rate": 3.6783591824902165e-05, + "loss": 0.6733, + "step": 9120 + }, + { + "epoch": 1.3233322462586514, + "grad_norm": 0.9000769257545471, + "learning_rate": 3.676909697057544e-05, + "loss": 0.7081, + "step": 9130 + }, + { + "epoch": 1.3247816791680256, + "grad_norm": 0.9797672033309937, + "learning_rate": 3.6754602116248734e-05, + "loss": 0.6804, + "step": 9140 + }, + { + "epoch": 1.3262311120773997, + "grad_norm": 3.0780463218688965, + "learning_rate": 3.674010726192202e-05, + "loss": 0.641, + "step": 9150 + }, + { + "epoch": 1.327680544986774, + "grad_norm": 0.9936016201972961, + "learning_rate": 3.67256124075953e-05, + "loss": 0.6929, + "step": 9160 + }, + { + "epoch": 1.329129977896148, + "grad_norm": 2.4128029346466064, + "learning_rate": 3.6711117553268594e-05, + "loss": 0.6985, + "step": 9170 + }, + { + "epoch": 1.3305794108055222, + "grad_norm": 0.8239456415176392, + "learning_rate": 3.669662269894188e-05, + "loss": 0.6691, + "step": 9180 + }, + { + "epoch": 1.3320288437148966, + "grad_norm": 2.196967124938965, + "learning_rate": 3.668212784461516e-05, + "loss": 0.7352, + "step": 9190 + }, + { + "epoch": 1.3334782766242708, + "grad_norm": 1.011257529258728, + "learning_rate": 3.666763299028845e-05, + "loss": 0.6596, + "step": 9200 + }, + { + "epoch": 1.334927709533645, + "grad_norm": 1.0266071557998657, + "learning_rate": 3.665313813596173e-05, + "loss": 0.7006, + "step": 9210 + }, + { + "epoch": 1.3363771424430193, + "grad_norm": 0.8049505949020386, + "learning_rate": 3.663864328163502e-05, + "loss": 0.654, + "step": 9220 + }, + { + "epoch": 1.3378265753523935, + "grad_norm": 1.4919633865356445, + "learning_rate": 3.6624148427308307e-05, + "loss": 0.6717, + "step": 9230 + }, + { + "epoch": 1.3392760082617676, + "grad_norm": 1.3777735233306885, + "learning_rate": 3.66096535729816e-05, + "loss": 0.7646, + "step": 9240 + }, + { + "epoch": 1.3407254411711418, + "grad_norm": 1.0090413093566895, + "learning_rate": 3.659515871865488e-05, + "loss": 0.661, + "step": 9250 + }, + { + "epoch": 1.342174874080516, + "grad_norm": 0.9088075757026672, + "learning_rate": 3.658066386432816e-05, + "loss": 0.7843, + "step": 9260 + }, + { + "epoch": 1.34362430698989, + "grad_norm": 2.6569576263427734, + "learning_rate": 3.656616901000145e-05, + "loss": 0.6978, + "step": 9270 + }, + { + "epoch": 1.3450737398992645, + "grad_norm": 2.193643808364868, + "learning_rate": 3.6551674155674735e-05, + "loss": 0.6263, + "step": 9280 + }, + { + "epoch": 1.3465231728086386, + "grad_norm": 1.4514281749725342, + "learning_rate": 3.6537179301348026e-05, + "loss": 0.6815, + "step": 9290 + }, + { + "epoch": 1.3479726057180128, + "grad_norm": 1.3338249921798706, + "learning_rate": 3.652268444702131e-05, + "loss": 0.7358, + "step": 9300 + }, + { + "epoch": 1.349422038627387, + "grad_norm": 0.815692663192749, + "learning_rate": 3.6508189592694595e-05, + "loss": 0.6351, + "step": 9310 + }, + { + "epoch": 1.3508714715367613, + "grad_norm": 1.9944225549697876, + "learning_rate": 3.649369473836788e-05, + "loss": 0.7492, + "step": 9320 + }, + { + "epoch": 1.3523209044461355, + "grad_norm": 0.8797019720077515, + "learning_rate": 3.6479199884041164e-05, + "loss": 0.7493, + "step": 9330 + }, + { + "epoch": 1.3537703373555097, + "grad_norm": 0.9795637726783752, + "learning_rate": 3.6464705029714455e-05, + "loss": 0.6286, + "step": 9340 + }, + { + "epoch": 1.3552197702648838, + "grad_norm": 1.6963341236114502, + "learning_rate": 3.645021017538774e-05, + "loss": 0.6751, + "step": 9350 + }, + { + "epoch": 1.356669203174258, + "grad_norm": 2.1599597930908203, + "learning_rate": 3.6435715321061024e-05, + "loss": 0.7438, + "step": 9360 + }, + { + "epoch": 1.3581186360836321, + "grad_norm": 2.2890264987945557, + "learning_rate": 3.6421220466734315e-05, + "loss": 0.666, + "step": 9370 + }, + { + "epoch": 1.3595680689930065, + "grad_norm": 0.921067476272583, + "learning_rate": 3.64067256124076e-05, + "loss": 0.6272, + "step": 9380 + }, + { + "epoch": 1.3610175019023807, + "grad_norm": 2.4551358222961426, + "learning_rate": 3.6392230758080883e-05, + "loss": 0.7137, + "step": 9390 + }, + { + "epoch": 1.3624669348117548, + "grad_norm": 2.1481635570526123, + "learning_rate": 3.637773590375417e-05, + "loss": 0.6762, + "step": 9400 + }, + { + "epoch": 1.3639163677211292, + "grad_norm": 2.6500132083892822, + "learning_rate": 3.636324104942745e-05, + "loss": 0.7908, + "step": 9410 + }, + { + "epoch": 1.3653658006305034, + "grad_norm": 1.9957408905029297, + "learning_rate": 3.634874619510074e-05, + "loss": 0.6225, + "step": 9420 + }, + { + "epoch": 1.3668152335398775, + "grad_norm": 1.0306966304779053, + "learning_rate": 3.633425134077403e-05, + "loss": 0.7203, + "step": 9430 + }, + { + "epoch": 1.3682646664492517, + "grad_norm": 2.174342393875122, + "learning_rate": 3.631975648644731e-05, + "loss": 0.7314, + "step": 9440 + }, + { + "epoch": 1.3697140993586259, + "grad_norm": 0.7824757099151611, + "learning_rate": 3.6305261632120596e-05, + "loss": 0.7611, + "step": 9450 + }, + { + "epoch": 1.371163532268, + "grad_norm": 2.024747371673584, + "learning_rate": 3.629076677779388e-05, + "loss": 0.7054, + "step": 9460 + }, + { + "epoch": 1.3726129651773744, + "grad_norm": 0.7712695598602295, + "learning_rate": 3.627627192346717e-05, + "loss": 0.7164, + "step": 9470 + }, + { + "epoch": 1.3740623980867486, + "grad_norm": 2.13800311088562, + "learning_rate": 3.6261777069140456e-05, + "loss": 0.6677, + "step": 9480 + }, + { + "epoch": 1.3755118309961227, + "grad_norm": 3.425255060195923, + "learning_rate": 3.624728221481374e-05, + "loss": 0.7723, + "step": 9490 + }, + { + "epoch": 1.3769612639054971, + "grad_norm": 0.7990706562995911, + "learning_rate": 3.623278736048703e-05, + "loss": 0.7634, + "step": 9500 + }, + { + "epoch": 1.3784106968148713, + "grad_norm": 1.1414493322372437, + "learning_rate": 3.6218292506160316e-05, + "loss": 0.7022, + "step": 9510 + }, + { + "epoch": 1.3798601297242454, + "grad_norm": 0.868614137172699, + "learning_rate": 3.62037976518336e-05, + "loss": 0.6991, + "step": 9520 + }, + { + "epoch": 1.3813095626336196, + "grad_norm": 1.5563892126083374, + "learning_rate": 3.6189302797506885e-05, + "loss": 0.7179, + "step": 9530 + }, + { + "epoch": 1.3827589955429938, + "grad_norm": 2.05755615234375, + "learning_rate": 3.617480794318017e-05, + "loss": 0.6415, + "step": 9540 + }, + { + "epoch": 1.384208428452368, + "grad_norm": 0.7626024484634399, + "learning_rate": 3.616031308885346e-05, + "loss": 0.6597, + "step": 9550 + }, + { + "epoch": 1.3856578613617423, + "grad_norm": 1.085626244544983, + "learning_rate": 3.6145818234526745e-05, + "loss": 0.6923, + "step": 9560 + }, + { + "epoch": 1.3871072942711165, + "grad_norm": 1.0802031755447388, + "learning_rate": 3.6131323380200036e-05, + "loss": 0.7292, + "step": 9570 + }, + { + "epoch": 1.3885567271804906, + "grad_norm": 1.0880290269851685, + "learning_rate": 3.6116828525873313e-05, + "loss": 0.6502, + "step": 9580 + }, + { + "epoch": 1.3900061600898648, + "grad_norm": 0.8710733652114868, + "learning_rate": 3.61023336715466e-05, + "loss": 0.7041, + "step": 9590 + }, + { + "epoch": 1.3914555929992392, + "grad_norm": 2.254011631011963, + "learning_rate": 3.608783881721989e-05, + "loss": 0.7389, + "step": 9600 + }, + { + "epoch": 1.3929050259086133, + "grad_norm": 0.849483847618103, + "learning_rate": 3.607334396289317e-05, + "loss": 0.7202, + "step": 9610 + }, + { + "epoch": 1.3943544588179875, + "grad_norm": 1.5035589933395386, + "learning_rate": 3.6058849108566464e-05, + "loss": 0.6131, + "step": 9620 + }, + { + "epoch": 1.3958038917273616, + "grad_norm": 2.1819663047790527, + "learning_rate": 3.604435425423975e-05, + "loss": 0.6658, + "step": 9630 + }, + { + "epoch": 1.3972533246367358, + "grad_norm": 0.8759439587593079, + "learning_rate": 3.602985939991303e-05, + "loss": 0.5972, + "step": 9640 + }, + { + "epoch": 1.39870275754611, + "grad_norm": 1.233659267425537, + "learning_rate": 3.601536454558632e-05, + "loss": 0.7624, + "step": 9650 + }, + { + "epoch": 1.4001521904554843, + "grad_norm": 1.6374999284744263, + "learning_rate": 3.60008696912596e-05, + "loss": 0.6939, + "step": 9660 + }, + { + "epoch": 1.4016016233648585, + "grad_norm": 0.7916937470436096, + "learning_rate": 3.598637483693289e-05, + "loss": 0.7305, + "step": 9670 + }, + { + "epoch": 1.4030510562742327, + "grad_norm": 0.9866163730621338, + "learning_rate": 3.597187998260618e-05, + "loss": 0.6483, + "step": 9680 + }, + { + "epoch": 1.404500489183607, + "grad_norm": 0.7897905707359314, + "learning_rate": 3.595738512827946e-05, + "loss": 0.7158, + "step": 9690 + }, + { + "epoch": 1.4059499220929812, + "grad_norm": 0.9635679125785828, + "learning_rate": 3.594289027395275e-05, + "loss": 0.7239, + "step": 9700 + }, + { + "epoch": 1.4073993550023554, + "grad_norm": 0.8831052780151367, + "learning_rate": 3.592839541962603e-05, + "loss": 0.5833, + "step": 9710 + }, + { + "epoch": 1.4088487879117295, + "grad_norm": 0.8457877039909363, + "learning_rate": 3.591390056529932e-05, + "loss": 0.6611, + "step": 9720 + }, + { + "epoch": 1.4102982208211037, + "grad_norm": 1.9585788249969482, + "learning_rate": 3.5899405710972606e-05, + "loss": 0.742, + "step": 9730 + }, + { + "epoch": 1.4117476537304778, + "grad_norm": 0.816891610622406, + "learning_rate": 3.588491085664589e-05, + "loss": 0.6449, + "step": 9740 + }, + { + "epoch": 1.4131970866398522, + "grad_norm": 1.15848970413208, + "learning_rate": 3.587041600231918e-05, + "loss": 0.6755, + "step": 9750 + }, + { + "epoch": 1.4146465195492264, + "grad_norm": 1.4050579071044922, + "learning_rate": 3.5855921147992466e-05, + "loss": 0.7529, + "step": 9760 + }, + { + "epoch": 1.4160959524586005, + "grad_norm": 0.9132912755012512, + "learning_rate": 3.584142629366575e-05, + "loss": 0.7623, + "step": 9770 + }, + { + "epoch": 1.4175453853679747, + "grad_norm": 2.101269006729126, + "learning_rate": 3.5826931439339034e-05, + "loss": 0.6348, + "step": 9780 + }, + { + "epoch": 1.418994818277349, + "grad_norm": 1.0391186475753784, + "learning_rate": 3.581243658501232e-05, + "loss": 0.6816, + "step": 9790 + }, + { + "epoch": 1.4204442511867232, + "grad_norm": 0.8907914757728577, + "learning_rate": 3.579794173068561e-05, + "loss": 0.6829, + "step": 9800 + }, + { + "epoch": 1.4218936840960974, + "grad_norm": 1.922855257987976, + "learning_rate": 3.5783446876358894e-05, + "loss": 0.6947, + "step": 9810 + }, + { + "epoch": 1.4233431170054716, + "grad_norm": 2.7909116744995117, + "learning_rate": 3.576895202203218e-05, + "loss": 0.7583, + "step": 9820 + }, + { + "epoch": 1.4247925499148457, + "grad_norm": 0.9503197073936462, + "learning_rate": 3.575445716770547e-05, + "loss": 0.6714, + "step": 9830 + }, + { + "epoch": 1.42624198282422, + "grad_norm": 1.3675655126571655, + "learning_rate": 3.573996231337875e-05, + "loss": 0.6434, + "step": 9840 + }, + { + "epoch": 1.4276914157335943, + "grad_norm": 0.931876003742218, + "learning_rate": 3.572546745905204e-05, + "loss": 0.6797, + "step": 9850 + }, + { + "epoch": 1.4291408486429684, + "grad_norm": 3.2150590419769287, + "learning_rate": 3.571097260472532e-05, + "loss": 0.6415, + "step": 9860 + }, + { + "epoch": 1.4305902815523426, + "grad_norm": 0.887059211730957, + "learning_rate": 3.569647775039861e-05, + "loss": 0.7293, + "step": 9870 + }, + { + "epoch": 1.432039714461717, + "grad_norm": 0.790198028087616, + "learning_rate": 3.56819828960719e-05, + "loss": 0.7376, + "step": 9880 + }, + { + "epoch": 1.4334891473710911, + "grad_norm": 2.035407781600952, + "learning_rate": 3.566748804174518e-05, + "loss": 0.7065, + "step": 9890 + }, + { + "epoch": 1.4349385802804653, + "grad_norm": 4.058956623077393, + "learning_rate": 3.5652993187418474e-05, + "loss": 0.6915, + "step": 9900 + }, + { + "epoch": 1.4363880131898394, + "grad_norm": 1.8480651378631592, + "learning_rate": 3.563849833309175e-05, + "loss": 0.6387, + "step": 9910 + }, + { + "epoch": 1.4378374460992136, + "grad_norm": 2.976912021636963, + "learning_rate": 3.5624003478765036e-05, + "loss": 0.734, + "step": 9920 + }, + { + "epoch": 1.4392868790085878, + "grad_norm": 0.7814016342163086, + "learning_rate": 3.560950862443833e-05, + "loss": 0.6622, + "step": 9930 + }, + { + "epoch": 1.4407363119179621, + "grad_norm": 2.2150237560272217, + "learning_rate": 3.559501377011161e-05, + "loss": 0.7197, + "step": 9940 + }, + { + "epoch": 1.4421857448273363, + "grad_norm": 0.8639504909515381, + "learning_rate": 3.55805189157849e-05, + "loss": 0.6108, + "step": 9950 + }, + { + "epoch": 1.4436351777367105, + "grad_norm": 1.8026032447814941, + "learning_rate": 3.556602406145819e-05, + "loss": 0.7659, + "step": 9960 + }, + { + "epoch": 1.4450846106460848, + "grad_norm": 2.1232337951660156, + "learning_rate": 3.5551529207131464e-05, + "loss": 0.7001, + "step": 9970 + }, + { + "epoch": 1.446534043555459, + "grad_norm": 0.9476084113121033, + "learning_rate": 3.5537034352804756e-05, + "loss": 0.672, + "step": 9980 + }, + { + "epoch": 1.4479834764648332, + "grad_norm": 1.823028802871704, + "learning_rate": 3.552253949847804e-05, + "loss": 0.7238, + "step": 9990 + }, + { + "epoch": 1.4494329093742073, + "grad_norm": 1.8687421083450317, + "learning_rate": 3.550804464415133e-05, + "loss": 0.6683, + "step": 10000 + }, + { + "epoch": 1.4494329093742073, + "eval_loss": 0.7658461332321167, + "eval_runtime": 671.2876, + "eval_samples_per_second": 51.386, + "eval_steps_per_second": 2.57, + "eval_token_accuracy": 0.00042253122420500756, + "step": 10000 + }, + { + "epoch": 1.4508823422835815, + "grad_norm": 0.9342031478881836, + "learning_rate": 3.5493549789824615e-05, + "loss": 0.7305, + "step": 10010 + }, + { + "epoch": 1.4523317751929556, + "grad_norm": 0.755462110042572, + "learning_rate": 3.54790549354979e-05, + "loss": 0.6838, + "step": 10020 + }, + { + "epoch": 1.45378120810233, + "grad_norm": 1.6950440406799316, + "learning_rate": 3.546456008117119e-05, + "loss": 0.6614, + "step": 10030 + }, + { + "epoch": 1.4552306410117042, + "grad_norm": 0.9943178296089172, + "learning_rate": 3.545006522684447e-05, + "loss": 0.7104, + "step": 10040 + }, + { + "epoch": 1.4566800739210783, + "grad_norm": 1.9179010391235352, + "learning_rate": 3.543557037251776e-05, + "loss": 0.6876, + "step": 10050 + }, + { + "epoch": 1.4581295068304525, + "grad_norm": 2.0479156970977783, + "learning_rate": 3.5421075518191044e-05, + "loss": 0.6735, + "step": 10060 + }, + { + "epoch": 1.4595789397398269, + "grad_norm": 2.0140061378479004, + "learning_rate": 3.540658066386433e-05, + "loss": 0.7058, + "step": 10070 + }, + { + "epoch": 1.461028372649201, + "grad_norm": 0.9706962704658508, + "learning_rate": 3.539208580953762e-05, + "loss": 0.6184, + "step": 10080 + }, + { + "epoch": 1.4624778055585752, + "grad_norm": 0.7795096039772034, + "learning_rate": 3.5377590955210904e-05, + "loss": 0.7585, + "step": 10090 + }, + { + "epoch": 1.4639272384679494, + "grad_norm": 0.8623164296150208, + "learning_rate": 3.536309610088419e-05, + "loss": 0.6902, + "step": 10100 + }, + { + "epoch": 1.4653766713773235, + "grad_norm": 1.4100171327590942, + "learning_rate": 3.534860124655747e-05, + "loss": 0.7216, + "step": 10110 + }, + { + "epoch": 1.466826104286698, + "grad_norm": 0.8636854887008667, + "learning_rate": 3.533410639223076e-05, + "loss": 0.6922, + "step": 10120 + }, + { + "epoch": 1.468275537196072, + "grad_norm": 0.8046514987945557, + "learning_rate": 3.531961153790405e-05, + "loss": 0.7511, + "step": 10130 + }, + { + "epoch": 1.4697249701054462, + "grad_norm": 1.5246775150299072, + "learning_rate": 3.530511668357733e-05, + "loss": 0.6348, + "step": 10140 + }, + { + "epoch": 1.4711744030148204, + "grad_norm": 0.7951986789703369, + "learning_rate": 3.529062182925062e-05, + "loss": 0.6492, + "step": 10150 + }, + { + "epoch": 1.4726238359241948, + "grad_norm": 1.9236490726470947, + "learning_rate": 3.52761269749239e-05, + "loss": 0.7468, + "step": 10160 + }, + { + "epoch": 1.474073268833569, + "grad_norm": 3.2273659706115723, + "learning_rate": 3.5261632120597185e-05, + "loss": 0.6982, + "step": 10170 + }, + { + "epoch": 1.475522701742943, + "grad_norm": 2.427692413330078, + "learning_rate": 3.5247137266270477e-05, + "loss": 0.8, + "step": 10180 + }, + { + "epoch": 1.4769721346523172, + "grad_norm": 1.7644684314727783, + "learning_rate": 3.523264241194376e-05, + "loss": 0.7468, + "step": 10190 + }, + { + "epoch": 1.4784215675616914, + "grad_norm": 0.9527018666267395, + "learning_rate": 3.5218147557617045e-05, + "loss": 0.6499, + "step": 10200 + }, + { + "epoch": 1.4798710004710656, + "grad_norm": 0.6727984547615051, + "learning_rate": 3.5203652703290336e-05, + "loss": 0.6432, + "step": 10210 + }, + { + "epoch": 1.48132043338044, + "grad_norm": 1.005982518196106, + "learning_rate": 3.518915784896362e-05, + "loss": 0.7578, + "step": 10220 + }, + { + "epoch": 1.482769866289814, + "grad_norm": 0.9719845056533813, + "learning_rate": 3.5174662994636905e-05, + "loss": 0.6569, + "step": 10230 + }, + { + "epoch": 1.4842192991991883, + "grad_norm": 0.7539176344871521, + "learning_rate": 3.516016814031019e-05, + "loss": 0.6088, + "step": 10240 + }, + { + "epoch": 1.4856687321085627, + "grad_norm": 2.297215700149536, + "learning_rate": 3.5145673285983474e-05, + "loss": 0.572, + "step": 10250 + }, + { + "epoch": 1.4871181650179368, + "grad_norm": 0.7237049341201782, + "learning_rate": 3.5131178431656765e-05, + "loss": 0.679, + "step": 10260 + }, + { + "epoch": 1.488567597927311, + "grad_norm": 0.7301941514015198, + "learning_rate": 3.511668357733005e-05, + "loss": 0.7794, + "step": 10270 + }, + { + "epoch": 1.4900170308366851, + "grad_norm": 2.2130892276763916, + "learning_rate": 3.510218872300334e-05, + "loss": 0.6488, + "step": 10280 + }, + { + "epoch": 1.4914664637460593, + "grad_norm": 1.0272775888442993, + "learning_rate": 3.508769386867662e-05, + "loss": 0.7033, + "step": 10290 + }, + { + "epoch": 1.4929158966554334, + "grad_norm": 1.5348803997039795, + "learning_rate": 3.50731990143499e-05, + "loss": 0.7062, + "step": 10300 + }, + { + "epoch": 1.4943653295648078, + "grad_norm": 2.1071600914001465, + "learning_rate": 3.5058704160023194e-05, + "loss": 0.6284, + "step": 10310 + }, + { + "epoch": 1.495814762474182, + "grad_norm": 1.7620905637741089, + "learning_rate": 3.504420930569648e-05, + "loss": 0.6207, + "step": 10320 + }, + { + "epoch": 1.4972641953835562, + "grad_norm": 1.4796020984649658, + "learning_rate": 3.502971445136977e-05, + "loss": 0.7086, + "step": 10330 + }, + { + "epoch": 1.4987136282929303, + "grad_norm": 0.8734525442123413, + "learning_rate": 3.5015219597043053e-05, + "loss": 0.6757, + "step": 10340 + }, + { + "epoch": 1.5001630612023047, + "grad_norm": 0.849749743938446, + "learning_rate": 3.500072474271634e-05, + "loss": 0.7431, + "step": 10350 + }, + { + "epoch": 1.5016124941116789, + "grad_norm": 1.0296956300735474, + "learning_rate": 3.498622988838962e-05, + "loss": 0.6848, + "step": 10360 + }, + { + "epoch": 1.503061927021053, + "grad_norm": 1.796150803565979, + "learning_rate": 3.4971735034062907e-05, + "loss": 0.635, + "step": 10370 + }, + { + "epoch": 1.5045113599304272, + "grad_norm": 3.076692581176758, + "learning_rate": 3.49572401797362e-05, + "loss": 0.7448, + "step": 10380 + }, + { + "epoch": 1.5059607928398013, + "grad_norm": 0.9386917352676392, + "learning_rate": 3.494274532540948e-05, + "loss": 0.7112, + "step": 10390 + }, + { + "epoch": 1.5074102257491755, + "grad_norm": 2.2855920791625977, + "learning_rate": 3.4928250471082766e-05, + "loss": 0.7775, + "step": 10400 + }, + { + "epoch": 1.5088596586585499, + "grad_norm": 0.7951090931892395, + "learning_rate": 3.491375561675606e-05, + "loss": 0.7277, + "step": 10410 + }, + { + "epoch": 1.510309091567924, + "grad_norm": 2.5202653408050537, + "learning_rate": 3.4899260762429335e-05, + "loss": 0.6719, + "step": 10420 + }, + { + "epoch": 1.5117585244772984, + "grad_norm": 1.999173879623413, + "learning_rate": 3.4884765908102626e-05, + "loss": 0.7104, + "step": 10430 + }, + { + "epoch": 1.5132079573866726, + "grad_norm": 0.9255297183990479, + "learning_rate": 3.487027105377591e-05, + "loss": 0.6208, + "step": 10440 + }, + { + "epoch": 1.5146573902960467, + "grad_norm": 1.7983359098434448, + "learning_rate": 3.4855776199449195e-05, + "loss": 0.7865, + "step": 10450 + }, + { + "epoch": 1.516106823205421, + "grad_norm": 0.9478282332420349, + "learning_rate": 3.4841281345122486e-05, + "loss": 0.7451, + "step": 10460 + }, + { + "epoch": 1.517556256114795, + "grad_norm": 1.0491830110549927, + "learning_rate": 3.482678649079577e-05, + "loss": 0.7612, + "step": 10470 + }, + { + "epoch": 1.5190056890241692, + "grad_norm": 0.9677424430847168, + "learning_rate": 3.4812291636469055e-05, + "loss": 0.6674, + "step": 10480 + }, + { + "epoch": 1.5204551219335434, + "grad_norm": 1.719191312789917, + "learning_rate": 3.479779678214234e-05, + "loss": 0.6897, + "step": 10490 + }, + { + "epoch": 1.5219045548429178, + "grad_norm": 0.9445870518684387, + "learning_rate": 3.4783301927815624e-05, + "loss": 0.6801, + "step": 10500 + }, + { + "epoch": 1.523353987752292, + "grad_norm": 2.001390218734741, + "learning_rate": 3.4768807073488915e-05, + "loss": 0.6875, + "step": 10510 + }, + { + "epoch": 1.524803420661666, + "grad_norm": 2.875307321548462, + "learning_rate": 3.47543122191622e-05, + "loss": 0.6728, + "step": 10520 + }, + { + "epoch": 1.5262528535710405, + "grad_norm": 2.2775304317474365, + "learning_rate": 3.473981736483548e-05, + "loss": 0.6632, + "step": 10530 + }, + { + "epoch": 1.5277022864804146, + "grad_norm": 0.9073929190635681, + "learning_rate": 3.4725322510508775e-05, + "loss": 0.7309, + "step": 10540 + }, + { + "epoch": 1.5291517193897888, + "grad_norm": 1.4111050367355347, + "learning_rate": 3.471082765618205e-05, + "loss": 0.6356, + "step": 10550 + }, + { + "epoch": 1.530601152299163, + "grad_norm": 1.236938238143921, + "learning_rate": 3.469633280185534e-05, + "loss": 0.697, + "step": 10560 + }, + { + "epoch": 1.532050585208537, + "grad_norm": 0.9304881691932678, + "learning_rate": 3.468183794752863e-05, + "loss": 0.66, + "step": 10570 + }, + { + "epoch": 1.5335000181179113, + "grad_norm": 0.6954860091209412, + "learning_rate": 3.466734309320191e-05, + "loss": 0.6499, + "step": 10580 + }, + { + "epoch": 1.5349494510272854, + "grad_norm": 1.1868486404418945, + "learning_rate": 3.46528482388752e-05, + "loss": 0.709, + "step": 10590 + }, + { + "epoch": 1.5363988839366598, + "grad_norm": 2.335561513900757, + "learning_rate": 3.463835338454849e-05, + "loss": 0.6909, + "step": 10600 + }, + { + "epoch": 1.537848316846034, + "grad_norm": 0.7953568696975708, + "learning_rate": 3.462385853022178e-05, + "loss": 0.6548, + "step": 10610 + }, + { + "epoch": 1.5392977497554083, + "grad_norm": 0.893295168876648, + "learning_rate": 3.4609363675895056e-05, + "loss": 0.7508, + "step": 10620 + }, + { + "epoch": 1.5407471826647825, + "grad_norm": 3.305266857147217, + "learning_rate": 3.459486882156834e-05, + "loss": 0.6312, + "step": 10630 + }, + { + "epoch": 1.5421966155741567, + "grad_norm": 1.2510921955108643, + "learning_rate": 3.458037396724163e-05, + "loss": 0.6669, + "step": 10640 + }, + { + "epoch": 1.5436460484835308, + "grad_norm": 0.8094971776008606, + "learning_rate": 3.4565879112914916e-05, + "loss": 0.7096, + "step": 10650 + }, + { + "epoch": 1.545095481392905, + "grad_norm": 0.8499931693077087, + "learning_rate": 3.455138425858821e-05, + "loss": 0.6208, + "step": 10660 + }, + { + "epoch": 1.5465449143022791, + "grad_norm": 2.0002918243408203, + "learning_rate": 3.453688940426149e-05, + "loss": 0.7067, + "step": 10670 + }, + { + "epoch": 1.5479943472116533, + "grad_norm": 3.303682565689087, + "learning_rate": 3.452239454993477e-05, + "loss": 0.6802, + "step": 10680 + }, + { + "epoch": 1.5494437801210277, + "grad_norm": 0.8864135146141052, + "learning_rate": 3.450789969560806e-05, + "loss": 0.7016, + "step": 10690 + }, + { + "epoch": 1.5508932130304018, + "grad_norm": 1.1588127613067627, + "learning_rate": 3.4493404841281345e-05, + "loss": 0.6362, + "step": 10700 + }, + { + "epoch": 1.5523426459397762, + "grad_norm": 1.947551965713501, + "learning_rate": 3.4478909986954636e-05, + "loss": 0.7554, + "step": 10710 + }, + { + "epoch": 1.5537920788491504, + "grad_norm": 2.90364933013916, + "learning_rate": 3.446441513262792e-05, + "loss": 0.724, + "step": 10720 + }, + { + "epoch": 1.5552415117585245, + "grad_norm": 0.898250937461853, + "learning_rate": 3.4449920278301204e-05, + "loss": 0.6948, + "step": 10730 + }, + { + "epoch": 1.5566909446678987, + "grad_norm": 2.2128922939300537, + "learning_rate": 3.4435425423974496e-05, + "loss": 0.6681, + "step": 10740 + }, + { + "epoch": 1.5581403775772729, + "grad_norm": 0.7464655041694641, + "learning_rate": 3.442093056964777e-05, + "loss": 0.6382, + "step": 10750 + }, + { + "epoch": 1.559589810486647, + "grad_norm": 0.7855749726295471, + "learning_rate": 3.4406435715321064e-05, + "loss": 0.7867, + "step": 10760 + }, + { + "epoch": 1.5610392433960212, + "grad_norm": 0.9440972805023193, + "learning_rate": 3.439194086099435e-05, + "loss": 0.7226, + "step": 10770 + }, + { + "epoch": 1.5624886763053956, + "grad_norm": 1.0518885850906372, + "learning_rate": 3.437744600666763e-05, + "loss": 0.7334, + "step": 10780 + }, + { + "epoch": 1.5639381092147697, + "grad_norm": 0.8372756242752075, + "learning_rate": 3.4362951152340924e-05, + "loss": 0.7222, + "step": 10790 + }, + { + "epoch": 1.5653875421241439, + "grad_norm": 0.901339054107666, + "learning_rate": 3.434845629801421e-05, + "loss": 0.7014, + "step": 10800 + }, + { + "epoch": 1.5668369750335183, + "grad_norm": 3.179217576980591, + "learning_rate": 3.433396144368749e-05, + "loss": 0.6731, + "step": 10810 + }, + { + "epoch": 1.5682864079428924, + "grad_norm": 0.967142641544342, + "learning_rate": 3.431946658936078e-05, + "loss": 0.7114, + "step": 10820 + }, + { + "epoch": 1.5697358408522666, + "grad_norm": 3.2705516815185547, + "learning_rate": 3.430497173503406e-05, + "loss": 0.6808, + "step": 10830 + }, + { + "epoch": 1.5711852737616407, + "grad_norm": 1.0143731832504272, + "learning_rate": 3.429047688070735e-05, + "loss": 0.6969, + "step": 10840 + }, + { + "epoch": 1.572634706671015, + "grad_norm": 0.7936012744903564, + "learning_rate": 3.427598202638064e-05, + "loss": 0.7324, + "step": 10850 + }, + { + "epoch": 1.574084139580389, + "grad_norm": 2.3230628967285156, + "learning_rate": 3.426148717205392e-05, + "loss": 0.666, + "step": 10860 + }, + { + "epoch": 1.5755335724897632, + "grad_norm": 2.224515199661255, + "learning_rate": 3.424699231772721e-05, + "loss": 0.6755, + "step": 10870 + }, + { + "epoch": 1.5769830053991376, + "grad_norm": 1.1359072923660278, + "learning_rate": 3.423249746340049e-05, + "loss": 0.7035, + "step": 10880 + }, + { + "epoch": 1.5784324383085118, + "grad_norm": 0.8372917175292969, + "learning_rate": 3.421800260907378e-05, + "loss": 0.5668, + "step": 10890 + }, + { + "epoch": 1.5798818712178861, + "grad_norm": 0.7077991962432861, + "learning_rate": 3.4203507754747066e-05, + "loss": 0.6532, + "step": 10900 + }, + { + "epoch": 1.5813313041272603, + "grad_norm": 1.7431057691574097, + "learning_rate": 3.418901290042035e-05, + "loss": 0.6941, + "step": 10910 + }, + { + "epoch": 1.5827807370366345, + "grad_norm": 0.7109362483024597, + "learning_rate": 3.417596753152631e-05, + "loss": 0.6996, + "step": 10920 + }, + { + "epoch": 1.5842301699460086, + "grad_norm": 0.884335994720459, + "learning_rate": 3.4161472677199594e-05, + "loss": 0.7177, + "step": 10930 + }, + { + "epoch": 1.5856796028553828, + "grad_norm": 2.881589651107788, + "learning_rate": 3.4146977822872885e-05, + "loss": 0.6996, + "step": 10940 + }, + { + "epoch": 1.587129035764757, + "grad_norm": 1.1488120555877686, + "learning_rate": 3.413248296854617e-05, + "loss": 0.684, + "step": 10950 + }, + { + "epoch": 1.588578468674131, + "grad_norm": 2.397216320037842, + "learning_rate": 3.4117988114219454e-05, + "loss": 0.677, + "step": 10960 + }, + { + "epoch": 1.5900279015835055, + "grad_norm": 1.1147899627685547, + "learning_rate": 3.410349325989274e-05, + "loss": 0.6599, + "step": 10970 + }, + { + "epoch": 1.5914773344928796, + "grad_norm": 2.646649122238159, + "learning_rate": 3.408899840556602e-05, + "loss": 0.6607, + "step": 10980 + }, + { + "epoch": 1.592926767402254, + "grad_norm": 1.1824229955673218, + "learning_rate": 3.407450355123931e-05, + "loss": 0.6214, + "step": 10990 + }, + { + "epoch": 1.5943762003116282, + "grad_norm": 1.8018920421600342, + "learning_rate": 3.40600086969126e-05, + "loss": 0.6321, + "step": 11000 + }, + { + "epoch": 1.5943762003116282, + "eval_loss": 0.7519996166229248, + "eval_runtime": 672.5842, + "eval_samples_per_second": 51.287, + "eval_steps_per_second": 2.565, + "eval_token_accuracy": 0.00041191567003769487, + "step": 11000 + }, + { + "epoch": 1.5958256332210023, + "grad_norm": 1.9211559295654297, + "learning_rate": 3.404551384258589e-05, + "loss": 0.681, + "step": 11010 + }, + { + "epoch": 1.5972750661303765, + "grad_norm": 0.8976821303367615, + "learning_rate": 3.403101898825917e-05, + "loss": 0.5972, + "step": 11020 + }, + { + "epoch": 1.5987244990397507, + "grad_norm": 0.7794384956359863, + "learning_rate": 3.401652413393245e-05, + "loss": 0.7753, + "step": 11030 + }, + { + "epoch": 1.6001739319491248, + "grad_norm": 1.1415067911148071, + "learning_rate": 3.400202927960574e-05, + "loss": 0.7083, + "step": 11040 + }, + { + "epoch": 1.601623364858499, + "grad_norm": 1.0612363815307617, + "learning_rate": 3.3987534425279026e-05, + "loss": 0.7217, + "step": 11050 + }, + { + "epoch": 1.6030727977678734, + "grad_norm": 2.422513246536255, + "learning_rate": 3.397303957095232e-05, + "loss": 0.7079, + "step": 11060 + }, + { + "epoch": 1.6045222306772475, + "grad_norm": 0.8872206211090088, + "learning_rate": 3.39585447166256e-05, + "loss": 0.7362, + "step": 11070 + }, + { + "epoch": 1.6059716635866217, + "grad_norm": 0.8796588778495789, + "learning_rate": 3.3944049862298886e-05, + "loss": 0.7146, + "step": 11080 + }, + { + "epoch": 1.607421096495996, + "grad_norm": 1.5327075719833374, + "learning_rate": 3.392955500797217e-05, + "loss": 0.6852, + "step": 11090 + }, + { + "epoch": 1.6088705294053702, + "grad_norm": 0.8125131130218506, + "learning_rate": 3.3915060153645455e-05, + "loss": 0.7014, + "step": 11100 + }, + { + "epoch": 1.6103199623147444, + "grad_norm": 0.8616804480552673, + "learning_rate": 3.3900565299318746e-05, + "loss": 0.6572, + "step": 11110 + }, + { + "epoch": 1.6117693952241186, + "grad_norm": 2.233217477798462, + "learning_rate": 3.388607044499203e-05, + "loss": 0.6935, + "step": 11120 + }, + { + "epoch": 1.6132188281334927, + "grad_norm": 2.3844566345214844, + "learning_rate": 3.3871575590665315e-05, + "loss": 0.7073, + "step": 11130 + }, + { + "epoch": 1.6146682610428669, + "grad_norm": 2.445873975753784, + "learning_rate": 3.3857080736338606e-05, + "loss": 0.7148, + "step": 11140 + }, + { + "epoch": 1.616117693952241, + "grad_norm": 1.7319201231002808, + "learning_rate": 3.384258588201189e-05, + "loss": 0.6878, + "step": 11150 + }, + { + "epoch": 1.6175671268616154, + "grad_norm": 2.917090654373169, + "learning_rate": 3.3828091027685175e-05, + "loss": 0.6819, + "step": 11160 + }, + { + "epoch": 1.6190165597709896, + "grad_norm": 2.5769288539886475, + "learning_rate": 3.381359617335846e-05, + "loss": 0.7332, + "step": 11170 + }, + { + "epoch": 1.620465992680364, + "grad_norm": 0.9746682047843933, + "learning_rate": 3.379910131903174e-05, + "loss": 0.7014, + "step": 11180 + }, + { + "epoch": 1.6219154255897381, + "grad_norm": 1.9940383434295654, + "learning_rate": 3.3784606464705034e-05, + "loss": 0.6732, + "step": 11190 + }, + { + "epoch": 1.6233648584991123, + "grad_norm": 1.0099761486053467, + "learning_rate": 3.377011161037832e-05, + "loss": 0.5949, + "step": 11200 + }, + { + "epoch": 1.6248142914084864, + "grad_norm": 2.6662089824676514, + "learning_rate": 3.37556167560516e-05, + "loss": 0.6592, + "step": 11210 + }, + { + "epoch": 1.6262637243178606, + "grad_norm": 0.6609460115432739, + "learning_rate": 3.374112190172489e-05, + "loss": 0.6284, + "step": 11220 + }, + { + "epoch": 1.6277131572272348, + "grad_norm": 2.3695130348205566, + "learning_rate": 3.372662704739817e-05, + "loss": 0.6975, + "step": 11230 + }, + { + "epoch": 1.629162590136609, + "grad_norm": 0.8688599467277527, + "learning_rate": 3.371213219307146e-05, + "loss": 0.6692, + "step": 11240 + }, + { + "epoch": 1.6306120230459833, + "grad_norm": 2.008312702178955, + "learning_rate": 3.369763733874475e-05, + "loss": 0.6637, + "step": 11250 + }, + { + "epoch": 1.6320614559553575, + "grad_norm": 0.8913775682449341, + "learning_rate": 3.368314248441803e-05, + "loss": 0.6861, + "step": 11260 + }, + { + "epoch": 1.6335108888647318, + "grad_norm": 1.5506291389465332, + "learning_rate": 3.366864763009132e-05, + "loss": 0.6753, + "step": 11270 + }, + { + "epoch": 1.634960321774106, + "grad_norm": 2.029127359390259, + "learning_rate": 3.36541527757646e-05, + "loss": 0.7136, + "step": 11280 + }, + { + "epoch": 1.6364097546834802, + "grad_norm": 1.9097111225128174, + "learning_rate": 3.363965792143789e-05, + "loss": 0.5921, + "step": 11290 + }, + { + "epoch": 1.6378591875928543, + "grad_norm": 2.0805654525756836, + "learning_rate": 3.3625163067111176e-05, + "loss": 0.6608, + "step": 11300 + }, + { + "epoch": 1.6393086205022285, + "grad_norm": 2.208421230316162, + "learning_rate": 3.361066821278446e-05, + "loss": 0.6562, + "step": 11310 + }, + { + "epoch": 1.6407580534116026, + "grad_norm": 1.505969762802124, + "learning_rate": 3.359617335845775e-05, + "loss": 0.693, + "step": 11320 + }, + { + "epoch": 1.6422074863209768, + "grad_norm": 2.6280758380889893, + "learning_rate": 3.3581678504131036e-05, + "loss": 0.673, + "step": 11330 + }, + { + "epoch": 1.6436569192303512, + "grad_norm": 0.8961797952651978, + "learning_rate": 3.356718364980432e-05, + "loss": 0.6021, + "step": 11340 + }, + { + "epoch": 1.6451063521397253, + "grad_norm": 0.6946638822555542, + "learning_rate": 3.3552688795477605e-05, + "loss": 0.6796, + "step": 11350 + }, + { + "epoch": 1.6465557850490995, + "grad_norm": 2.8928956985473633, + "learning_rate": 3.353819394115089e-05, + "loss": 0.7382, + "step": 11360 + }, + { + "epoch": 1.6480052179584739, + "grad_norm": 0.8265762329101562, + "learning_rate": 3.352369908682418e-05, + "loss": 0.7026, + "step": 11370 + }, + { + "epoch": 1.649454650867848, + "grad_norm": 4.2738938331604, + "learning_rate": 3.3509204232497464e-05, + "loss": 0.6302, + "step": 11380 + }, + { + "epoch": 1.6509040837772222, + "grad_norm": 2.031883955001831, + "learning_rate": 3.3494709378170756e-05, + "loss": 0.7206, + "step": 11390 + }, + { + "epoch": 1.6523535166865964, + "grad_norm": 0.7447781562805176, + "learning_rate": 3.348021452384404e-05, + "loss": 0.6582, + "step": 11400 + }, + { + "epoch": 1.6538029495959705, + "grad_norm": 0.9610055685043335, + "learning_rate": 3.346571966951732e-05, + "loss": 0.6741, + "step": 11410 + }, + { + "epoch": 1.6552523825053447, + "grad_norm": 2.5023865699768066, + "learning_rate": 3.345122481519061e-05, + "loss": 0.6875, + "step": 11420 + }, + { + "epoch": 1.6567018154147188, + "grad_norm": 1.0197991132736206, + "learning_rate": 3.343817944629656e-05, + "loss": 0.7759, + "step": 11430 + }, + { + "epoch": 1.6581512483240932, + "grad_norm": 0.8140033483505249, + "learning_rate": 3.342368459196985e-05, + "loss": 0.6356, + "step": 11440 + }, + { + "epoch": 1.6596006812334674, + "grad_norm": 1.772228717803955, + "learning_rate": 3.340918973764314e-05, + "loss": 0.7157, + "step": 11450 + }, + { + "epoch": 1.6610501141428418, + "grad_norm": 1.5189317464828491, + "learning_rate": 3.339469488331642e-05, + "loss": 0.6381, + "step": 11460 + }, + { + "epoch": 1.662499547052216, + "grad_norm": 2.599026679992676, + "learning_rate": 3.338020002898971e-05, + "loss": 0.7256, + "step": 11470 + }, + { + "epoch": 1.66394897996159, + "grad_norm": 0.9813500642776489, + "learning_rate": 3.3365705174662996e-05, + "loss": 0.7397, + "step": 11480 + }, + { + "epoch": 1.6653984128709642, + "grad_norm": 1.4737082719802856, + "learning_rate": 3.335121032033629e-05, + "loss": 0.7078, + "step": 11490 + }, + { + "epoch": 1.6668478457803384, + "grad_norm": 1.0761265754699707, + "learning_rate": 3.3336715466009565e-05, + "loss": 0.7165, + "step": 11500 + }, + { + "epoch": 1.6682972786897126, + "grad_norm": 4.028006553649902, + "learning_rate": 3.332222061168285e-05, + "loss": 0.7517, + "step": 11510 + }, + { + "epoch": 1.6697467115990867, + "grad_norm": 2.7268528938293457, + "learning_rate": 3.330772575735614e-05, + "loss": 0.6512, + "step": 11520 + }, + { + "epoch": 1.671196144508461, + "grad_norm": 1.2567470073699951, + "learning_rate": 3.3293230903029425e-05, + "loss": 0.6798, + "step": 11530 + }, + { + "epoch": 1.6726455774178353, + "grad_norm": 0.9180237054824829, + "learning_rate": 3.3278736048702716e-05, + "loss": 0.8243, + "step": 11540 + }, + { + "epoch": 1.6740950103272096, + "grad_norm": 2.680816650390625, + "learning_rate": 3.3264241194376e-05, + "loss": 0.6361, + "step": 11550 + }, + { + "epoch": 1.6755444432365838, + "grad_norm": 2.539856433868408, + "learning_rate": 3.324974634004928e-05, + "loss": 0.6767, + "step": 11560 + }, + { + "epoch": 1.676993876145958, + "grad_norm": 0.8751583695411682, + "learning_rate": 3.323525148572257e-05, + "loss": 0.7148, + "step": 11570 + }, + { + "epoch": 1.6784433090553321, + "grad_norm": 0.7487082481384277, + "learning_rate": 3.3220756631395854e-05, + "loss": 0.6886, + "step": 11580 + }, + { + "epoch": 1.6798927419647063, + "grad_norm": 2.2744905948638916, + "learning_rate": 3.3206261777069145e-05, + "loss": 0.6029, + "step": 11590 + }, + { + "epoch": 1.6813421748740804, + "grad_norm": 2.1104652881622314, + "learning_rate": 3.319176692274243e-05, + "loss": 0.6817, + "step": 11600 + }, + { + "epoch": 1.6827916077834546, + "grad_norm": 1.006417155265808, + "learning_rate": 3.3177272068415713e-05, + "loss": 0.7137, + "step": 11610 + }, + { + "epoch": 1.684241040692829, + "grad_norm": 2.490140914916992, + "learning_rate": 3.3162777214089005e-05, + "loss": 0.6737, + "step": 11620 + }, + { + "epoch": 1.6856904736022031, + "grad_norm": 1.7343158721923828, + "learning_rate": 3.314828235976228e-05, + "loss": 0.6307, + "step": 11630 + }, + { + "epoch": 1.6871399065115773, + "grad_norm": 0.7881498336791992, + "learning_rate": 3.313378750543557e-05, + "loss": 0.703, + "step": 11640 + }, + { + "epoch": 1.6885893394209517, + "grad_norm": 1.097777009010315, + "learning_rate": 3.311929265110886e-05, + "loss": 0.7322, + "step": 11650 + }, + { + "epoch": 1.6900387723303258, + "grad_norm": 1.9791345596313477, + "learning_rate": 3.310479779678214e-05, + "loss": 0.6046, + "step": 11660 + }, + { + "epoch": 1.6914882052397, + "grad_norm": 0.9018293619155884, + "learning_rate": 3.309030294245543e-05, + "loss": 0.6562, + "step": 11670 + }, + { + "epoch": 1.6929376381490742, + "grad_norm": 3.6132423877716064, + "learning_rate": 3.307580808812872e-05, + "loss": 0.7399, + "step": 11680 + }, + { + "epoch": 1.6943870710584483, + "grad_norm": 1.8991214036941528, + "learning_rate": 3.3061313233802e-05, + "loss": 0.7522, + "step": 11690 + }, + { + "epoch": 1.6958365039678225, + "grad_norm": 0.8896854519844055, + "learning_rate": 3.3046818379475286e-05, + "loss": 0.6909, + "step": 11700 + }, + { + "epoch": 1.6972859368771966, + "grad_norm": 1.0252200365066528, + "learning_rate": 3.303232352514857e-05, + "loss": 0.7392, + "step": 11710 + }, + { + "epoch": 1.698735369786571, + "grad_norm": 1.8175885677337646, + "learning_rate": 3.301782867082186e-05, + "loss": 0.7272, + "step": 11720 + }, + { + "epoch": 1.7001848026959452, + "grad_norm": 1.0172370672225952, + "learning_rate": 3.3003333816495146e-05, + "loss": 0.6532, + "step": 11730 + }, + { + "epoch": 1.7016342356053196, + "grad_norm": 2.1411426067352295, + "learning_rate": 3.298883896216843e-05, + "loss": 0.6795, + "step": 11740 + }, + { + "epoch": 1.7030836685146937, + "grad_norm": 1.9220808744430542, + "learning_rate": 3.297434410784172e-05, + "loss": 0.6655, + "step": 11750 + }, + { + "epoch": 1.7045331014240679, + "grad_norm": 3.3900511264801025, + "learning_rate": 3.2959849253515e-05, + "loss": 0.6648, + "step": 11760 + }, + { + "epoch": 1.705982534333442, + "grad_norm": 3.2443623542785645, + "learning_rate": 3.294535439918829e-05, + "loss": 0.6851, + "step": 11770 + }, + { + "epoch": 1.7074319672428162, + "grad_norm": 1.7702313661575317, + "learning_rate": 3.2930859544861575e-05, + "loss": 0.6403, + "step": 11780 + }, + { + "epoch": 1.7088814001521904, + "grad_norm": 2.2568228244781494, + "learning_rate": 3.291636469053486e-05, + "loss": 0.6541, + "step": 11790 + }, + { + "epoch": 1.7103308330615645, + "grad_norm": 3.1511707305908203, + "learning_rate": 3.290186983620815e-05, + "loss": 0.691, + "step": 11800 + }, + { + "epoch": 1.711780265970939, + "grad_norm": 1.8583332300186157, + "learning_rate": 3.2887374981881435e-05, + "loss": 0.6347, + "step": 11810 + }, + { + "epoch": 1.713229698880313, + "grad_norm": 1.8265938758850098, + "learning_rate": 3.287288012755472e-05, + "loss": 0.6762, + "step": 11820 + }, + { + "epoch": 1.7146791317896874, + "grad_norm": 2.006502866744995, + "learning_rate": 3.2858385273228e-05, + "loss": 0.6834, + "step": 11830 + }, + { + "epoch": 1.7161285646990616, + "grad_norm": 0.8520517945289612, + "learning_rate": 3.284389041890129e-05, + "loss": 0.6735, + "step": 11840 + }, + { + "epoch": 1.7175779976084358, + "grad_norm": 0.9941295385360718, + "learning_rate": 3.282939556457458e-05, + "loss": 0.6592, + "step": 11850 + }, + { + "epoch": 1.71902743051781, + "grad_norm": 0.8577190637588501, + "learning_rate": 3.281490071024786e-05, + "loss": 0.6025, + "step": 11860 + }, + { + "epoch": 1.720476863427184, + "grad_norm": 3.2018096446990967, + "learning_rate": 3.2800405855921154e-05, + "loss": 0.7097, + "step": 11870 + }, + { + "epoch": 1.7219262963365582, + "grad_norm": 0.7818105816841125, + "learning_rate": 3.278591100159444e-05, + "loss": 0.628, + "step": 11880 + }, + { + "epoch": 1.7233757292459324, + "grad_norm": 3.172701358795166, + "learning_rate": 3.2771416147267716e-05, + "loss": 0.6854, + "step": 11890 + }, + { + "epoch": 1.7248251621553068, + "grad_norm": 0.8485254049301147, + "learning_rate": 3.275692129294101e-05, + "loss": 0.6506, + "step": 11900 + }, + { + "epoch": 1.726274595064681, + "grad_norm": 1.6423099040985107, + "learning_rate": 3.274242643861429e-05, + "loss": 0.6422, + "step": 11910 + }, + { + "epoch": 1.727724027974055, + "grad_norm": 0.7199040055274963, + "learning_rate": 3.272793158428758e-05, + "loss": 0.6863, + "step": 11920 + }, + { + "epoch": 1.7291734608834295, + "grad_norm": 0.9496470093727112, + "learning_rate": 3.271343672996087e-05, + "loss": 0.6509, + "step": 11930 + }, + { + "epoch": 1.7306228937928037, + "grad_norm": 0.8391323685646057, + "learning_rate": 3.269894187563415e-05, + "loss": 0.7397, + "step": 11940 + }, + { + "epoch": 1.7320723267021778, + "grad_norm": 0.752637505531311, + "learning_rate": 3.2684447021307436e-05, + "loss": 0.6644, + "step": 11950 + }, + { + "epoch": 1.733521759611552, + "grad_norm": 0.8174326419830322, + "learning_rate": 3.266995216698072e-05, + "loss": 0.7158, + "step": 11960 + }, + { + "epoch": 1.7349711925209261, + "grad_norm": 1.8735527992248535, + "learning_rate": 3.265545731265401e-05, + "loss": 0.6833, + "step": 11970 + }, + { + "epoch": 1.7364206254303003, + "grad_norm": 0.8878263235092163, + "learning_rate": 3.2640962458327296e-05, + "loss": 0.6542, + "step": 11980 + }, + { + "epoch": 1.7378700583396745, + "grad_norm": 1.0730360746383667, + "learning_rate": 3.262646760400058e-05, + "loss": 0.6864, + "step": 11990 + }, + { + "epoch": 1.7393194912490488, + "grad_norm": 2.2720248699188232, + "learning_rate": 3.261197274967387e-05, + "loss": 0.7018, + "step": 12000 + }, + { + "epoch": 1.7393194912490488, + "eval_loss": 0.7453984022140503, + "eval_runtime": 671.5911, + "eval_samples_per_second": 51.363, + "eval_steps_per_second": 2.569, + "eval_token_accuracy": 0.0004127198786867337, + "step": 12000 + }, + { + "epoch": 1.740768924158423, + "grad_norm": 0.742331862449646, + "learning_rate": 3.2597477895347156e-05, + "loss": 0.6351, + "step": 12010 + }, + { + "epoch": 1.7422183570677974, + "grad_norm": 1.6918139457702637, + "learning_rate": 3.258298304102044e-05, + "loss": 0.6707, + "step": 12020 + }, + { + "epoch": 1.7436677899771715, + "grad_norm": 2.084019422531128, + "learning_rate": 3.2568488186693724e-05, + "loss": 0.6947, + "step": 12030 + }, + { + "epoch": 1.7451172228865457, + "grad_norm": 1.5365352630615234, + "learning_rate": 3.255399333236701e-05, + "loss": 0.6949, + "step": 12040 + }, + { + "epoch": 1.7465666557959199, + "grad_norm": 0.9291880130767822, + "learning_rate": 3.25394984780403e-05, + "loss": 0.7178, + "step": 12050 + }, + { + "epoch": 1.748016088705294, + "grad_norm": 2.788231134414673, + "learning_rate": 3.2525003623713584e-05, + "loss": 0.6189, + "step": 12060 + }, + { + "epoch": 1.7494655216146682, + "grad_norm": 0.8338028788566589, + "learning_rate": 3.251050876938687e-05, + "loss": 0.6596, + "step": 12070 + }, + { + "epoch": 1.7509149545240423, + "grad_norm": 1.9956914186477661, + "learning_rate": 3.249601391506015e-05, + "loss": 0.686, + "step": 12080 + }, + { + "epoch": 1.7523643874334167, + "grad_norm": 2.4082913398742676, + "learning_rate": 3.248151906073344e-05, + "loss": 0.6606, + "step": 12090 + }, + { + "epoch": 1.7538138203427909, + "grad_norm": 0.8920974135398865, + "learning_rate": 3.246702420640673e-05, + "loss": 0.6806, + "step": 12100 + }, + { + "epoch": 1.7552632532521653, + "grad_norm": 0.6411871314048767, + "learning_rate": 3.245252935208001e-05, + "loss": 0.6898, + "step": 12110 + }, + { + "epoch": 1.7567126861615394, + "grad_norm": 1.895133376121521, + "learning_rate": 3.24380344977533e-05, + "loss": 0.6771, + "step": 12120 + }, + { + "epoch": 1.7581621190709136, + "grad_norm": 0.8246234059333801, + "learning_rate": 3.242353964342659e-05, + "loss": 0.7074, + "step": 12130 + }, + { + "epoch": 1.7596115519802877, + "grad_norm": 0.9914174675941467, + "learning_rate": 3.240904478909987e-05, + "loss": 0.7025, + "step": 12140 + }, + { + "epoch": 1.761060984889662, + "grad_norm": 1.5609525442123413, + "learning_rate": 3.239454993477316e-05, + "loss": 0.7268, + "step": 12150 + }, + { + "epoch": 1.762510417799036, + "grad_norm": 0.7251628637313843, + "learning_rate": 3.238005508044644e-05, + "loss": 0.7015, + "step": 12160 + }, + { + "epoch": 1.7639598507084102, + "grad_norm": 2.229095697402954, + "learning_rate": 3.2365560226119726e-05, + "loss": 0.5969, + "step": 12170 + }, + { + "epoch": 1.7654092836177846, + "grad_norm": 0.9089709520339966, + "learning_rate": 3.235106537179302e-05, + "loss": 0.6905, + "step": 12180 + }, + { + "epoch": 1.7668587165271588, + "grad_norm": 0.8589386343955994, + "learning_rate": 3.23365705174663e-05, + "loss": 0.6675, + "step": 12190 + }, + { + "epoch": 1.768308149436533, + "grad_norm": 0.9688293933868408, + "learning_rate": 3.232207566313959e-05, + "loss": 0.5953, + "step": 12200 + }, + { + "epoch": 1.7697575823459073, + "grad_norm": 0.7265881896018982, + "learning_rate": 3.230758080881287e-05, + "loss": 0.6947, + "step": 12210 + }, + { + "epoch": 1.7712070152552815, + "grad_norm": 0.90655517578125, + "learning_rate": 3.2293085954486154e-05, + "loss": 0.661, + "step": 12220 + }, + { + "epoch": 1.7726564481646556, + "grad_norm": 0.8843987584114075, + "learning_rate": 3.2278591100159445e-05, + "loss": 0.6676, + "step": 12230 + }, + { + "epoch": 1.7741058810740298, + "grad_norm": 0.9245916604995728, + "learning_rate": 3.226409624583273e-05, + "loss": 0.6631, + "step": 12240 + }, + { + "epoch": 1.775555313983404, + "grad_norm": 1.0180144309997559, + "learning_rate": 3.224960139150602e-05, + "loss": 0.6716, + "step": 12250 + }, + { + "epoch": 1.777004746892778, + "grad_norm": 4.001671314239502, + "learning_rate": 3.2235106537179305e-05, + "loss": 0.8316, + "step": 12260 + }, + { + "epoch": 1.7784541798021523, + "grad_norm": 2.808419942855835, + "learning_rate": 3.222061168285258e-05, + "loss": 0.7557, + "step": 12270 + }, + { + "epoch": 1.7799036127115266, + "grad_norm": 1.6131603717803955, + "learning_rate": 3.2206116828525874e-05, + "loss": 0.6285, + "step": 12280 + }, + { + "epoch": 1.7813530456209008, + "grad_norm": 0.8948814272880554, + "learning_rate": 3.219162197419916e-05, + "loss": 0.7099, + "step": 12290 + }, + { + "epoch": 1.7828024785302752, + "grad_norm": 3.0544824600219727, + "learning_rate": 3.217712711987245e-05, + "loss": 0.6793, + "step": 12300 + }, + { + "epoch": 1.7842519114396493, + "grad_norm": 1.1489214897155762, + "learning_rate": 3.2162632265545734e-05, + "loss": 0.6682, + "step": 12310 + }, + { + "epoch": 1.7857013443490235, + "grad_norm": 1.5618456602096558, + "learning_rate": 3.214813741121902e-05, + "loss": 0.7276, + "step": 12320 + }, + { + "epoch": 1.7871507772583977, + "grad_norm": 2.5483455657958984, + "learning_rate": 3.213364255689231e-05, + "loss": 0.6497, + "step": 12330 + }, + { + "epoch": 1.7886002101677718, + "grad_norm": 2.1112256050109863, + "learning_rate": 3.211914770256559e-05, + "loss": 0.7096, + "step": 12340 + }, + { + "epoch": 1.790049643077146, + "grad_norm": 1.7844610214233398, + "learning_rate": 3.210465284823888e-05, + "loss": 0.5964, + "step": 12350 + }, + { + "epoch": 1.7914990759865201, + "grad_norm": 0.7652512192726135, + "learning_rate": 3.209015799391216e-05, + "loss": 0.7223, + "step": 12360 + }, + { + "epoch": 1.7929485088958945, + "grad_norm": 1.2720307111740112, + "learning_rate": 3.207566313958545e-05, + "loss": 0.7185, + "step": 12370 + }, + { + "epoch": 1.7943979418052687, + "grad_norm": 0.8202764391899109, + "learning_rate": 3.206116828525874e-05, + "loss": 0.6452, + "step": 12380 + }, + { + "epoch": 1.7958473747146428, + "grad_norm": 1.2758756875991821, + "learning_rate": 3.204667343093202e-05, + "loss": 0.65, + "step": 12390 + }, + { + "epoch": 1.7972968076240172, + "grad_norm": 2.4472177028656006, + "learning_rate": 3.2032178576605307e-05, + "loss": 0.6755, + "step": 12400 + }, + { + "epoch": 1.7987462405333914, + "grad_norm": 3.1003291606903076, + "learning_rate": 3.201768372227859e-05, + "loss": 0.6934, + "step": 12410 + }, + { + "epoch": 1.8001956734427655, + "grad_norm": 3.7856054306030273, + "learning_rate": 3.2003188867951875e-05, + "loss": 0.7038, + "step": 12420 + }, + { + "epoch": 1.8016451063521397, + "grad_norm": 0.8813289403915405, + "learning_rate": 3.1988694013625166e-05, + "loss": 0.651, + "step": 12430 + }, + { + "epoch": 1.8030945392615139, + "grad_norm": 2.7432374954223633, + "learning_rate": 3.197419915929845e-05, + "loss": 0.6651, + "step": 12440 + }, + { + "epoch": 1.804543972170888, + "grad_norm": 0.8676012754440308, + "learning_rate": 3.1959704304971735e-05, + "loss": 0.7189, + "step": 12450 + }, + { + "epoch": 1.8059934050802622, + "grad_norm": 1.9796631336212158, + "learning_rate": 3.1945209450645026e-05, + "loss": 0.6967, + "step": 12460 + }, + { + "epoch": 1.8074428379896366, + "grad_norm": 0.8426458239555359, + "learning_rate": 3.1930714596318304e-05, + "loss": 0.6947, + "step": 12470 + }, + { + "epoch": 1.8088922708990107, + "grad_norm": 0.9686286449432373, + "learning_rate": 3.1916219741991595e-05, + "loss": 0.6641, + "step": 12480 + }, + { + "epoch": 1.810341703808385, + "grad_norm": 0.8845982551574707, + "learning_rate": 3.190172488766488e-05, + "loss": 0.6704, + "step": 12490 + }, + { + "epoch": 1.8117911367177593, + "grad_norm": 1.16560959815979, + "learning_rate": 3.1887230033338164e-05, + "loss": 0.6233, + "step": 12500 + }, + { + "epoch": 1.8132405696271334, + "grad_norm": 0.8579843044281006, + "learning_rate": 3.1872735179011455e-05, + "loss": 0.6219, + "step": 12510 + }, + { + "epoch": 1.8146900025365076, + "grad_norm": 2.8450324535369873, + "learning_rate": 3.185824032468474e-05, + "loss": 0.6458, + "step": 12520 + }, + { + "epoch": 1.8161394354458817, + "grad_norm": 2.29085111618042, + "learning_rate": 3.1843745470358024e-05, + "loss": 0.7369, + "step": 12530 + }, + { + "epoch": 1.817588868355256, + "grad_norm": 2.1326372623443604, + "learning_rate": 3.182925061603131e-05, + "loss": 0.7111, + "step": 12540 + }, + { + "epoch": 1.81903830126463, + "grad_norm": 1.4014294147491455, + "learning_rate": 3.181475576170459e-05, + "loss": 0.6926, + "step": 12550 + }, + { + "epoch": 1.8204877341740044, + "grad_norm": 2.965717315673828, + "learning_rate": 3.1800260907377883e-05, + "loss": 0.6309, + "step": 12560 + }, + { + "epoch": 1.8219371670833786, + "grad_norm": 1.889169454574585, + "learning_rate": 3.178576605305117e-05, + "loss": 0.6605, + "step": 12570 + }, + { + "epoch": 1.823386599992753, + "grad_norm": 1.740950584411621, + "learning_rate": 3.177127119872446e-05, + "loss": 0.6657, + "step": 12580 + }, + { + "epoch": 1.8248360329021271, + "grad_norm": 0.9073018431663513, + "learning_rate": 3.175677634439774e-05, + "loss": 0.5996, + "step": 12590 + }, + { + "epoch": 1.8262854658115013, + "grad_norm": 2.453105926513672, + "learning_rate": 3.174228149007102e-05, + "loss": 0.6781, + "step": 12600 + }, + { + "epoch": 1.8277348987208755, + "grad_norm": 2.1584982872009277, + "learning_rate": 3.172778663574431e-05, + "loss": 0.6798, + "step": 12610 + }, + { + "epoch": 1.8291843316302496, + "grad_norm": 1.8437451124191284, + "learning_rate": 3.1713291781417596e-05, + "loss": 0.6918, + "step": 12620 + }, + { + "epoch": 1.8306337645396238, + "grad_norm": 0.9406701326370239, + "learning_rate": 3.169879692709089e-05, + "loss": 0.6399, + "step": 12630 + }, + { + "epoch": 1.832083197448998, + "grad_norm": 3.034640312194824, + "learning_rate": 3.168430207276417e-05, + "loss": 0.7079, + "step": 12640 + }, + { + "epoch": 1.8335326303583723, + "grad_norm": 1.1881543397903442, + "learning_rate": 3.1669807218437456e-05, + "loss": 0.6354, + "step": 12650 + }, + { + "epoch": 1.8349820632677465, + "grad_norm": 0.7651578783988953, + "learning_rate": 3.165531236411074e-05, + "loss": 0.6374, + "step": 12660 + }, + { + "epoch": 1.8364314961771206, + "grad_norm": 3.246687650680542, + "learning_rate": 3.1640817509784025e-05, + "loss": 0.6265, + "step": 12670 + }, + { + "epoch": 1.837880929086495, + "grad_norm": 2.519076108932495, + "learning_rate": 3.1626322655457316e-05, + "loss": 0.6974, + "step": 12680 + }, + { + "epoch": 1.8393303619958692, + "grad_norm": 1.0031216144561768, + "learning_rate": 3.16118278011306e-05, + "loss": 0.6395, + "step": 12690 + }, + { + "epoch": 1.8407797949052433, + "grad_norm": 1.622070550918579, + "learning_rate": 3.1597332946803885e-05, + "loss": 0.6563, + "step": 12700 + }, + { + "epoch": 1.8422292278146175, + "grad_norm": 0.7034159302711487, + "learning_rate": 3.1582838092477176e-05, + "loss": 0.6605, + "step": 12710 + }, + { + "epoch": 1.8436786607239917, + "grad_norm": 1.7830655574798584, + "learning_rate": 3.156834323815046e-05, + "loss": 0.6996, + "step": 12720 + }, + { + "epoch": 1.8451280936333658, + "grad_norm": 2.7417211532592773, + "learning_rate": 3.1553848383823745e-05, + "loss": 0.6474, + "step": 12730 + }, + { + "epoch": 1.84657752654274, + "grad_norm": 2.5987696647644043, + "learning_rate": 3.153935352949703e-05, + "loss": 0.6479, + "step": 12740 + }, + { + "epoch": 1.8480269594521144, + "grad_norm": 0.8672921657562256, + "learning_rate": 3.152485867517031e-05, + "loss": 0.7378, + "step": 12750 + }, + { + "epoch": 1.8494763923614885, + "grad_norm": 1.1308648586273193, + "learning_rate": 3.1510363820843605e-05, + "loss": 0.6805, + "step": 12760 + }, + { + "epoch": 1.850925825270863, + "grad_norm": 1.4739187955856323, + "learning_rate": 3.149586896651689e-05, + "loss": 0.6196, + "step": 12770 + }, + { + "epoch": 1.852375258180237, + "grad_norm": 0.7275584936141968, + "learning_rate": 3.148137411219017e-05, + "loss": 0.7302, + "step": 12780 + }, + { + "epoch": 1.8538246910896112, + "grad_norm": 1.3452956676483154, + "learning_rate": 3.146687925786346e-05, + "loss": 0.695, + "step": 12790 + }, + { + "epoch": 1.8552741239989854, + "grad_norm": 2.195021152496338, + "learning_rate": 3.145238440353674e-05, + "loss": 0.6259, + "step": 12800 + }, + { + "epoch": 1.8567235569083596, + "grad_norm": 2.028768539428711, + "learning_rate": 3.143788954921003e-05, + "loss": 0.6325, + "step": 12810 + }, + { + "epoch": 1.8581729898177337, + "grad_norm": 1.9663366079330444, + "learning_rate": 3.142339469488332e-05, + "loss": 0.5774, + "step": 12820 + }, + { + "epoch": 1.8596224227271079, + "grad_norm": 0.9267606139183044, + "learning_rate": 3.14088998405566e-05, + "loss": 0.7084, + "step": 12830 + }, + { + "epoch": 1.8610718556364823, + "grad_norm": 0.6876948475837708, + "learning_rate": 3.139440498622989e-05, + "loss": 0.7126, + "step": 12840 + }, + { + "epoch": 1.8625212885458564, + "grad_norm": 1.1228951215744019, + "learning_rate": 3.137991013190318e-05, + "loss": 0.6687, + "step": 12850 + }, + { + "epoch": 1.8639707214552308, + "grad_norm": 3.0569639205932617, + "learning_rate": 3.136541527757646e-05, + "loss": 0.7034, + "step": 12860 + }, + { + "epoch": 1.865420154364605, + "grad_norm": 2.0416345596313477, + "learning_rate": 3.1350920423249746e-05, + "loss": 0.7078, + "step": 12870 + }, + { + "epoch": 1.8668695872739791, + "grad_norm": 2.680041551589966, + "learning_rate": 3.133642556892303e-05, + "loss": 0.7417, + "step": 12880 + }, + { + "epoch": 1.8683190201833533, + "grad_norm": 1.1946463584899902, + "learning_rate": 3.132193071459632e-05, + "loss": 0.7307, + "step": 12890 + }, + { + "epoch": 1.8697684530927274, + "grad_norm": 0.8287043571472168, + "learning_rate": 3.1307435860269606e-05, + "loss": 0.6699, + "step": 12900 + }, + { + "epoch": 1.8712178860021016, + "grad_norm": 1.7087205648422241, + "learning_rate": 3.12929410059429e-05, + "loss": 0.6111, + "step": 12910 + }, + { + "epoch": 1.8726673189114758, + "grad_norm": 1.4836465120315552, + "learning_rate": 3.1278446151616175e-05, + "loss": 0.7251, + "step": 12920 + }, + { + "epoch": 1.8741167518208501, + "grad_norm": 0.8842166066169739, + "learning_rate": 3.126395129728946e-05, + "loss": 0.7151, + "step": 12930 + }, + { + "epoch": 1.8755661847302243, + "grad_norm": 3.3071022033691406, + "learning_rate": 3.124945644296275e-05, + "loss": 0.6943, + "step": 12940 + }, + { + "epoch": 1.8770156176395985, + "grad_norm": 1.8857395648956299, + "learning_rate": 3.1234961588636034e-05, + "loss": 0.6141, + "step": 12950 + }, + { + "epoch": 1.8784650505489728, + "grad_norm": 1.0635484457015991, + "learning_rate": 3.1220466734309326e-05, + "loss": 0.6381, + "step": 12960 + }, + { + "epoch": 1.879914483458347, + "grad_norm": 1.4398080110549927, + "learning_rate": 3.120597187998261e-05, + "loss": 0.633, + "step": 12970 + }, + { + "epoch": 1.8813639163677212, + "grad_norm": 1.040791392326355, + "learning_rate": 3.1191477025655894e-05, + "loss": 0.6147, + "step": 12980 + }, + { + "epoch": 1.8828133492770953, + "grad_norm": 0.9879858493804932, + "learning_rate": 3.117698217132918e-05, + "loss": 0.646, + "step": 12990 + }, + { + "epoch": 1.8842627821864695, + "grad_norm": 0.8812539577484131, + "learning_rate": 3.116248731700246e-05, + "loss": 0.7175, + "step": 13000 + }, + { + "epoch": 1.8842627821864695, + "eval_loss": 0.7349093556404114, + "eval_runtime": 669.3439, + "eval_samples_per_second": 51.536, + "eval_steps_per_second": 2.577, + "eval_token_accuracy": 0.00039856580646365007, + "step": 13000 + }, + { + "epoch": 1.8857122150958436, + "grad_norm": 0.8519927263259888, + "learning_rate": 3.1147992462675754e-05, + "loss": 0.6899, + "step": 13010 + }, + { + "epoch": 1.8871616480052178, + "grad_norm": 0.9121435880661011, + "learning_rate": 3.113349760834904e-05, + "loss": 0.6665, + "step": 13020 + }, + { + "epoch": 1.8886110809145922, + "grad_norm": 0.9590705037117004, + "learning_rate": 3.111900275402232e-05, + "loss": 0.5601, + "step": 13030 + }, + { + "epoch": 1.8900605138239663, + "grad_norm": 0.8972257375717163, + "learning_rate": 3.1104507899695614e-05, + "loss": 0.6744, + "step": 13040 + }, + { + "epoch": 1.8915099467333407, + "grad_norm": 0.8012029528617859, + "learning_rate": 3.109001304536889e-05, + "loss": 0.6178, + "step": 13050 + }, + { + "epoch": 1.8929593796427149, + "grad_norm": 0.9889801740646362, + "learning_rate": 3.107551819104218e-05, + "loss": 0.729, + "step": 13060 + }, + { + "epoch": 1.894408812552089, + "grad_norm": 1.8423585891723633, + "learning_rate": 3.106102333671547e-05, + "loss": 0.6544, + "step": 13070 + }, + { + "epoch": 1.8958582454614632, + "grad_norm": 2.015009880065918, + "learning_rate": 3.104652848238875e-05, + "loss": 0.6447, + "step": 13080 + }, + { + "epoch": 1.8973076783708374, + "grad_norm": 2.1070494651794434, + "learning_rate": 3.103203362806204e-05, + "loss": 0.575, + "step": 13090 + }, + { + "epoch": 1.8987571112802115, + "grad_norm": 0.9845415353775024, + "learning_rate": 3.101753877373533e-05, + "loss": 0.7011, + "step": 13100 + }, + { + "epoch": 1.9002065441895857, + "grad_norm": 1.782501220703125, + "learning_rate": 3.100304391940861e-05, + "loss": 0.6614, + "step": 13110 + }, + { + "epoch": 1.90165597709896, + "grad_norm": 1.965434193611145, + "learning_rate": 3.0988549065081896e-05, + "loss": 0.7378, + "step": 13120 + }, + { + "epoch": 1.9031054100083342, + "grad_norm": 0.8427996635437012, + "learning_rate": 3.097405421075518e-05, + "loss": 0.6494, + "step": 13130 + }, + { + "epoch": 1.9045548429177086, + "grad_norm": 0.9848141670227051, + "learning_rate": 3.095955935642847e-05, + "loss": 0.7511, + "step": 13140 + }, + { + "epoch": 1.9060042758270828, + "grad_norm": 1.678484559059143, + "learning_rate": 3.0945064502101756e-05, + "loss": 0.6437, + "step": 13150 + }, + { + "epoch": 1.907453708736457, + "grad_norm": 0.7144997715950012, + "learning_rate": 3.093056964777504e-05, + "loss": 0.6478, + "step": 13160 + }, + { + "epoch": 1.908903141645831, + "grad_norm": 0.9617393612861633, + "learning_rate": 3.091607479344833e-05, + "loss": 0.6981, + "step": 13170 + }, + { + "epoch": 1.9103525745552052, + "grad_norm": 0.8014298677444458, + "learning_rate": 3.090157993912161e-05, + "loss": 0.6234, + "step": 13180 + }, + { + "epoch": 1.9118020074645794, + "grad_norm": 1.6113104820251465, + "learning_rate": 3.08870850847949e-05, + "loss": 0.6508, + "step": 13190 + }, + { + "epoch": 1.9132514403739536, + "grad_norm": 0.8225398659706116, + "learning_rate": 3.0872590230468184e-05, + "loss": 0.637, + "step": 13200 + }, + { + "epoch": 1.914700873283328, + "grad_norm": 0.7952432036399841, + "learning_rate": 3.085809537614147e-05, + "loss": 0.6801, + "step": 13210 + }, + { + "epoch": 1.916150306192702, + "grad_norm": 2.7938709259033203, + "learning_rate": 3.084360052181476e-05, + "loss": 0.7336, + "step": 13220 + }, + { + "epoch": 1.9175997391020763, + "grad_norm": 0.8850911259651184, + "learning_rate": 3.0829105667488044e-05, + "loss": 0.6048, + "step": 13230 + }, + { + "epoch": 1.9190491720114506, + "grad_norm": 1.023543357849121, + "learning_rate": 3.081461081316133e-05, + "loss": 0.6227, + "step": 13240 + }, + { + "epoch": 1.9204986049208248, + "grad_norm": 0.89097660779953, + "learning_rate": 3.080011595883461e-05, + "loss": 0.6683, + "step": 13250 + }, + { + "epoch": 1.921948037830199, + "grad_norm": 0.8899220824241638, + "learning_rate": 3.07856211045079e-05, + "loss": 0.6979, + "step": 13260 + }, + { + "epoch": 1.9233974707395731, + "grad_norm": 1.9883252382278442, + "learning_rate": 3.077112625018119e-05, + "loss": 0.5952, + "step": 13270 + }, + { + "epoch": 1.9248469036489473, + "grad_norm": 2.1873536109924316, + "learning_rate": 3.075663139585447e-05, + "loss": 0.665, + "step": 13280 + }, + { + "epoch": 1.9262963365583214, + "grad_norm": 0.7692016363143921, + "learning_rate": 3.0742136541527764e-05, + "loss": 0.6827, + "step": 13290 + }, + { + "epoch": 1.9277457694676956, + "grad_norm": 3.5339086055755615, + "learning_rate": 3.072764168720105e-05, + "loss": 0.6847, + "step": 13300 + }, + { + "epoch": 1.92919520237707, + "grad_norm": 2.340005397796631, + "learning_rate": 3.0713146832874326e-05, + "loss": 0.7517, + "step": 13310 + }, + { + "epoch": 1.9306446352864441, + "grad_norm": 2.5109386444091797, + "learning_rate": 3.069865197854762e-05, + "loss": 0.604, + "step": 13320 + }, + { + "epoch": 1.9320940681958185, + "grad_norm": 0.8669785261154175, + "learning_rate": 3.06841571242209e-05, + "loss": 0.7108, + "step": 13330 + }, + { + "epoch": 1.9335435011051927, + "grad_norm": 1.7247940301895142, + "learning_rate": 3.066966226989419e-05, + "loss": 0.6894, + "step": 13340 + }, + { + "epoch": 1.9349929340145668, + "grad_norm": 2.6877431869506836, + "learning_rate": 3.0655167415567477e-05, + "loss": 0.6121, + "step": 13350 + }, + { + "epoch": 1.936442366923941, + "grad_norm": 0.8619291186332703, + "learning_rate": 3.064067256124076e-05, + "loss": 0.6968, + "step": 13360 + }, + { + "epoch": 1.9378917998333152, + "grad_norm": 0.9239633679389954, + "learning_rate": 3.0626177706914045e-05, + "loss": 0.6755, + "step": 13370 + }, + { + "epoch": 1.9393412327426893, + "grad_norm": 0.8451610207557678, + "learning_rate": 3.061168285258733e-05, + "loss": 0.7067, + "step": 13380 + }, + { + "epoch": 1.9407906656520635, + "grad_norm": 2.7170891761779785, + "learning_rate": 3.059718799826062e-05, + "loss": 0.5995, + "step": 13390 + }, + { + "epoch": 1.9422400985614379, + "grad_norm": 0.9185802340507507, + "learning_rate": 3.0582693143933905e-05, + "loss": 0.6857, + "step": 13400 + }, + { + "epoch": 1.943689531470812, + "grad_norm": 0.7790818214416504, + "learning_rate": 3.056819828960719e-05, + "loss": 0.6072, + "step": 13410 + }, + { + "epoch": 1.9451389643801864, + "grad_norm": 0.8623969554901123, + "learning_rate": 3.055370343528048e-05, + "loss": 0.7032, + "step": 13420 + }, + { + "epoch": 1.9465883972895606, + "grad_norm": 1.7718230485916138, + "learning_rate": 3.0539208580953765e-05, + "loss": 0.6544, + "step": 13430 + }, + { + "epoch": 1.9480378301989347, + "grad_norm": 1.7005128860473633, + "learning_rate": 3.052471372662705e-05, + "loss": 0.5819, + "step": 13440 + }, + { + "epoch": 1.9494872631083089, + "grad_norm": 1.0943654775619507, + "learning_rate": 3.0510218872300334e-05, + "loss": 0.6557, + "step": 13450 + }, + { + "epoch": 1.950936696017683, + "grad_norm": 1.9594976902008057, + "learning_rate": 3.049572401797362e-05, + "loss": 0.5739, + "step": 13460 + }, + { + "epoch": 1.9523861289270572, + "grad_norm": 2.1289260387420654, + "learning_rate": 3.0481229163646906e-05, + "loss": 0.6473, + "step": 13470 + }, + { + "epoch": 1.9538355618364314, + "grad_norm": 0.8309232592582703, + "learning_rate": 3.0466734309320194e-05, + "loss": 0.6649, + "step": 13480 + }, + { + "epoch": 1.9552849947458057, + "grad_norm": 0.8766238689422607, + "learning_rate": 3.045223945499348e-05, + "loss": 0.6607, + "step": 13490 + }, + { + "epoch": 1.95673442765518, + "grad_norm": 1.4982854127883911, + "learning_rate": 3.0437744600666762e-05, + "loss": 0.7224, + "step": 13500 + }, + { + "epoch": 1.958183860564554, + "grad_norm": 1.3108932971954346, + "learning_rate": 3.042324974634005e-05, + "loss": 0.5613, + "step": 13510 + }, + { + "epoch": 1.9596332934739285, + "grad_norm": 2.0841736793518066, + "learning_rate": 3.0408754892013334e-05, + "loss": 0.6751, + "step": 13520 + }, + { + "epoch": 1.9610827263833026, + "grad_norm": 0.7184624671936035, + "learning_rate": 3.0394260037686622e-05, + "loss": 0.6308, + "step": 13530 + }, + { + "epoch": 1.9625321592926768, + "grad_norm": 1.8600022792816162, + "learning_rate": 3.037976518335991e-05, + "loss": 0.5928, + "step": 13540 + }, + { + "epoch": 1.963981592202051, + "grad_norm": 0.7624354362487793, + "learning_rate": 3.0365270329033198e-05, + "loss": 0.685, + "step": 13550 + }, + { + "epoch": 1.965431025111425, + "grad_norm": 0.7610770463943481, + "learning_rate": 3.0350775474706482e-05, + "loss": 0.6679, + "step": 13560 + }, + { + "epoch": 1.9668804580207992, + "grad_norm": 2.7660579681396484, + "learning_rate": 3.0336280620379763e-05, + "loss": 0.6375, + "step": 13570 + }, + { + "epoch": 1.9683298909301734, + "grad_norm": 2.7836456298828125, + "learning_rate": 3.032178576605305e-05, + "loss": 0.673, + "step": 13580 + }, + { + "epoch": 1.9697793238395478, + "grad_norm": 2.4783413410186768, + "learning_rate": 3.030729091172634e-05, + "loss": 0.6507, + "step": 13590 + }, + { + "epoch": 1.971228756748922, + "grad_norm": 2.4852845668792725, + "learning_rate": 3.0292796057399626e-05, + "loss": 0.6564, + "step": 13600 + }, + { + "epoch": 1.9726781896582963, + "grad_norm": 1.1195975542068481, + "learning_rate": 3.027830120307291e-05, + "loss": 0.6899, + "step": 13610 + }, + { + "epoch": 1.9741276225676705, + "grad_norm": 0.7854732275009155, + "learning_rate": 3.02638063487462e-05, + "loss": 0.6372, + "step": 13620 + }, + { + "epoch": 1.9755770554770447, + "grad_norm": 0.8562934994697571, + "learning_rate": 3.024931149441948e-05, + "loss": 0.6611, + "step": 13630 + }, + { + "epoch": 1.9770264883864188, + "grad_norm": 2.820359468460083, + "learning_rate": 3.0234816640092767e-05, + "loss": 0.6989, + "step": 13640 + }, + { + "epoch": 1.978475921295793, + "grad_norm": 0.9774487018585205, + "learning_rate": 3.0220321785766055e-05, + "loss": 0.6408, + "step": 13650 + }, + { + "epoch": 1.9799253542051671, + "grad_norm": 1.5797113180160522, + "learning_rate": 3.020582693143934e-05, + "loss": 0.6087, + "step": 13660 + }, + { + "epoch": 1.9813747871145413, + "grad_norm": 1.3290632963180542, + "learning_rate": 3.0191332077112627e-05, + "loss": 0.6909, + "step": 13670 + }, + { + "epoch": 1.9828242200239157, + "grad_norm": 0.8466750979423523, + "learning_rate": 3.0176837222785915e-05, + "loss": 0.7907, + "step": 13680 + }, + { + "epoch": 1.9842736529332898, + "grad_norm": 1.6860663890838623, + "learning_rate": 3.0162342368459202e-05, + "loss": 0.6878, + "step": 13690 + }, + { + "epoch": 1.9857230858426642, + "grad_norm": 2.6741063594818115, + "learning_rate": 3.0147847514132483e-05, + "loss": 0.6498, + "step": 13700 + }, + { + "epoch": 1.9871725187520384, + "grad_norm": 0.9091131091117859, + "learning_rate": 3.0133352659805768e-05, + "loss": 0.751, + "step": 13710 + }, + { + "epoch": 1.9886219516614125, + "grad_norm": 0.8405702710151672, + "learning_rate": 3.0118857805479055e-05, + "loss": 0.6445, + "step": 13720 + }, + { + "epoch": 1.9900713845707867, + "grad_norm": 0.7596121430397034, + "learning_rate": 3.0104362951152343e-05, + "loss": 0.5749, + "step": 13730 + }, + { + "epoch": 1.9915208174801609, + "grad_norm": 1.1137197017669678, + "learning_rate": 3.008986809682563e-05, + "loss": 0.6756, + "step": 13740 + }, + { + "epoch": 1.992970250389535, + "grad_norm": 0.8212170004844666, + "learning_rate": 3.0075373242498915e-05, + "loss": 0.7122, + "step": 13750 + }, + { + "epoch": 1.9944196832989092, + "grad_norm": 2.4602837562561035, + "learning_rate": 3.0060878388172196e-05, + "loss": 0.7309, + "step": 13760 + }, + { + "epoch": 1.9958691162082836, + "grad_norm": 2.3783586025238037, + "learning_rate": 3.0046383533845484e-05, + "loss": 0.7033, + "step": 13770 + }, + { + "epoch": 1.9973185491176577, + "grad_norm": 1.690285086631775, + "learning_rate": 3.0031888679518772e-05, + "loss": 0.6742, + "step": 13780 + }, + { + "epoch": 1.9987679820270319, + "grad_norm": 0.956028401851654, + "learning_rate": 3.001739382519206e-05, + "loss": 0.6831, + "step": 13790 + }, + { + "epoch": 2.0002174149364063, + "grad_norm": 2.1426761150360107, + "learning_rate": 3.0002898970865344e-05, + "loss": 0.641, + "step": 13800 + }, + { + "epoch": 2.0016668478457804, + "grad_norm": 2.594532012939453, + "learning_rate": 2.998840411653863e-05, + "loss": 0.5254, + "step": 13810 + }, + { + "epoch": 2.0031162807551546, + "grad_norm": 3.246509075164795, + "learning_rate": 2.997390926221192e-05, + "loss": 0.6204, + "step": 13820 + }, + { + "epoch": 2.0045657136645287, + "grad_norm": 2.1272997856140137, + "learning_rate": 2.99594144078852e-05, + "loss": 0.616, + "step": 13830 + }, + { + "epoch": 2.006015146573903, + "grad_norm": 0.8702901005744934, + "learning_rate": 2.9944919553558488e-05, + "loss": 0.5416, + "step": 13840 + }, + { + "epoch": 2.007464579483277, + "grad_norm": 0.8716585040092468, + "learning_rate": 2.9930424699231772e-05, + "loss": 0.545, + "step": 13850 + }, + { + "epoch": 2.008914012392651, + "grad_norm": 1.0876643657684326, + "learning_rate": 2.991592984490506e-05, + "loss": 0.5743, + "step": 13860 + }, + { + "epoch": 2.0103634453020254, + "grad_norm": 0.886386513710022, + "learning_rate": 2.9901434990578348e-05, + "loss": 0.5763, + "step": 13870 + }, + { + "epoch": 2.0118128782114, + "grad_norm": 1.9063818454742432, + "learning_rate": 2.9886940136251636e-05, + "loss": 0.5437, + "step": 13880 + }, + { + "epoch": 2.013262311120774, + "grad_norm": 3.7688114643096924, + "learning_rate": 2.9872445281924917e-05, + "loss": 0.7232, + "step": 13890 + }, + { + "epoch": 2.0147117440301483, + "grad_norm": 1.1015434265136719, + "learning_rate": 2.98579504275982e-05, + "loss": 0.6314, + "step": 13900 + }, + { + "epoch": 2.0161611769395225, + "grad_norm": 1.0932585000991821, + "learning_rate": 2.984345557327149e-05, + "loss": 0.6195, + "step": 13910 + }, + { + "epoch": 2.0176106098488966, + "grad_norm": 1.782259225845337, + "learning_rate": 2.9828960718944777e-05, + "loss": 0.5268, + "step": 13920 + }, + { + "epoch": 2.019060042758271, + "grad_norm": 0.9539967775344849, + "learning_rate": 2.9814465864618064e-05, + "loss": 0.6348, + "step": 13930 + }, + { + "epoch": 2.020509475667645, + "grad_norm": 0.9660699367523193, + "learning_rate": 2.979997101029135e-05, + "loss": 0.6033, + "step": 13940 + }, + { + "epoch": 2.021958908577019, + "grad_norm": 1.874058723449707, + "learning_rate": 2.9785476155964636e-05, + "loss": 0.5756, + "step": 13950 + }, + { + "epoch": 2.0234083414863933, + "grad_norm": 2.6562726497650146, + "learning_rate": 2.9770981301637917e-05, + "loss": 0.543, + "step": 13960 + }, + { + "epoch": 2.024857774395768, + "grad_norm": 2.959782600402832, + "learning_rate": 2.9756486447311205e-05, + "loss": 0.494, + "step": 13970 + }, + { + "epoch": 2.026307207305142, + "grad_norm": 1.5290549993515015, + "learning_rate": 2.9741991592984493e-05, + "loss": 0.5822, + "step": 13980 + }, + { + "epoch": 2.027756640214516, + "grad_norm": 1.205371379852295, + "learning_rate": 2.9727496738657777e-05, + "loss": 0.5649, + "step": 13990 + }, + { + "epoch": 2.0292060731238903, + "grad_norm": 0.8969298601150513, + "learning_rate": 2.9713001884331065e-05, + "loss": 0.5747, + "step": 14000 + }, + { + "epoch": 2.0292060731238903, + "eval_loss": 0.7397032380104065, + "eval_runtime": 669.3889, + "eval_samples_per_second": 51.532, + "eval_steps_per_second": 2.577, + "eval_token_accuracy": 0.00038907614440499173, + "step": 14000 + }, + { + "epoch": 2.0306555060332645, + "grad_norm": 2.183821439743042, + "learning_rate": 2.9698507030004353e-05, + "loss": 0.6279, + "step": 14010 + }, + { + "epoch": 2.0321049389426387, + "grad_norm": 1.117087960243225, + "learning_rate": 2.9684012175677634e-05, + "loss": 0.5683, + "step": 14020 + }, + { + "epoch": 2.033554371852013, + "grad_norm": 1.8336340188980103, + "learning_rate": 2.966951732135092e-05, + "loss": 0.6125, + "step": 14030 + }, + { + "epoch": 2.035003804761387, + "grad_norm": 2.2379143238067627, + "learning_rate": 2.9655022467024206e-05, + "loss": 0.5941, + "step": 14040 + }, + { + "epoch": 2.036453237670761, + "grad_norm": 0.7575920224189758, + "learning_rate": 2.9640527612697494e-05, + "loss": 0.5903, + "step": 14050 + }, + { + "epoch": 2.0379026705801353, + "grad_norm": 3.254852294921875, + "learning_rate": 2.962603275837078e-05, + "loss": 0.562, + "step": 14060 + }, + { + "epoch": 2.03935210348951, + "grad_norm": 1.0880857706069946, + "learning_rate": 2.961153790404407e-05, + "loss": 0.5554, + "step": 14070 + }, + { + "epoch": 2.040801536398884, + "grad_norm": 0.8083730340003967, + "learning_rate": 2.9597043049717353e-05, + "loss": 0.5702, + "step": 14080 + }, + { + "epoch": 2.0422509693082582, + "grad_norm": 0.9211301207542419, + "learning_rate": 2.9582548195390634e-05, + "loss": 0.5294, + "step": 14090 + }, + { + "epoch": 2.0437004022176324, + "grad_norm": 1.0826817750930786, + "learning_rate": 2.9568053341063922e-05, + "loss": 0.6142, + "step": 14100 + }, + { + "epoch": 2.0451498351270065, + "grad_norm": 1.09013032913208, + "learning_rate": 2.955355848673721e-05, + "loss": 0.5591, + "step": 14110 + }, + { + "epoch": 2.0465992680363807, + "grad_norm": 0.9100773334503174, + "learning_rate": 2.9539063632410498e-05, + "loss": 0.5169, + "step": 14120 + }, + { + "epoch": 2.048048700945755, + "grad_norm": 2.1786623001098633, + "learning_rate": 2.9524568778083782e-05, + "loss": 0.6315, + "step": 14130 + }, + { + "epoch": 2.049498133855129, + "grad_norm": 3.048058271408081, + "learning_rate": 2.951007392375707e-05, + "loss": 0.557, + "step": 14140 + }, + { + "epoch": 2.050947566764503, + "grad_norm": 0.8104945421218872, + "learning_rate": 2.949557906943035e-05, + "loss": 0.5159, + "step": 14150 + }, + { + "epoch": 2.052396999673878, + "grad_norm": 1.006663203239441, + "learning_rate": 2.948108421510364e-05, + "loss": 0.5531, + "step": 14160 + }, + { + "epoch": 2.053846432583252, + "grad_norm": 0.8596798181533813, + "learning_rate": 2.9466589360776926e-05, + "loss": 0.6046, + "step": 14170 + }, + { + "epoch": 2.055295865492626, + "grad_norm": 0.9660754203796387, + "learning_rate": 2.945209450645021e-05, + "loss": 0.5892, + "step": 14180 + }, + { + "epoch": 2.0567452984020003, + "grad_norm": 0.7551504969596863, + "learning_rate": 2.9437599652123498e-05, + "loss": 0.5422, + "step": 14190 + }, + { + "epoch": 2.0581947313113744, + "grad_norm": 1.2956832647323608, + "learning_rate": 2.9423104797796786e-05, + "loss": 0.5931, + "step": 14200 + }, + { + "epoch": 2.0596441642207486, + "grad_norm": 2.2845637798309326, + "learning_rate": 2.9408609943470067e-05, + "loss": 0.5701, + "step": 14210 + }, + { + "epoch": 2.0610935971301227, + "grad_norm": 0.7346026301383972, + "learning_rate": 2.9394115089143355e-05, + "loss": 0.5839, + "step": 14220 + }, + { + "epoch": 2.062543030039497, + "grad_norm": 2.995973825454712, + "learning_rate": 2.938106972024931e-05, + "loss": 0.6129, + "step": 14230 + }, + { + "epoch": 2.063992462948871, + "grad_norm": 0.8397359251976013, + "learning_rate": 2.93665748659226e-05, + "loss": 0.6201, + "step": 14240 + }, + { + "epoch": 2.0654418958582457, + "grad_norm": 2.088355302810669, + "learning_rate": 2.9352080011595883e-05, + "loss": 0.6016, + "step": 14250 + }, + { + "epoch": 2.06689132876762, + "grad_norm": 1.5870708227157593, + "learning_rate": 2.933758515726917e-05, + "loss": 0.6194, + "step": 14260 + }, + { + "epoch": 2.068340761676994, + "grad_norm": 3.212609052658081, + "learning_rate": 2.9323090302942458e-05, + "loss": 0.6338, + "step": 14270 + }, + { + "epoch": 2.069790194586368, + "grad_norm": 1.227840542793274, + "learning_rate": 2.9308595448615746e-05, + "loss": 0.5664, + "step": 14280 + }, + { + "epoch": 2.0712396274957423, + "grad_norm": 1.0554555654525757, + "learning_rate": 2.929410059428903e-05, + "loss": 0.5276, + "step": 14290 + }, + { + "epoch": 2.0726890604051165, + "grad_norm": 1.9810999631881714, + "learning_rate": 2.927960573996231e-05, + "loss": 0.5682, + "step": 14300 + }, + { + "epoch": 2.0741384933144906, + "grad_norm": 1.1422438621520996, + "learning_rate": 2.92651108856356e-05, + "loss": 0.5827, + "step": 14310 + }, + { + "epoch": 2.075587926223865, + "grad_norm": 0.8738481402397156, + "learning_rate": 2.9250616031308887e-05, + "loss": 0.6125, + "step": 14320 + }, + { + "epoch": 2.077037359133239, + "grad_norm": 0.8904774785041809, + "learning_rate": 2.9236121176982175e-05, + "loss": 0.595, + "step": 14330 + }, + { + "epoch": 2.0784867920426136, + "grad_norm": 0.7666566967964172, + "learning_rate": 2.922162632265546e-05, + "loss": 0.5833, + "step": 14340 + }, + { + "epoch": 2.0799362249519877, + "grad_norm": 2.4014010429382324, + "learning_rate": 2.9207131468328747e-05, + "loss": 0.5472, + "step": 14350 + }, + { + "epoch": 2.081385657861362, + "grad_norm": 2.999542474746704, + "learning_rate": 2.9192636614002028e-05, + "loss": 0.5587, + "step": 14360 + }, + { + "epoch": 2.082835090770736, + "grad_norm": 1.0062235593795776, + "learning_rate": 2.9178141759675315e-05, + "loss": 0.4953, + "step": 14370 + }, + { + "epoch": 2.08428452368011, + "grad_norm": 2.3883652687072754, + "learning_rate": 2.9163646905348603e-05, + "loss": 0.5312, + "step": 14380 + }, + { + "epoch": 2.0857339565894844, + "grad_norm": 3.436647653579712, + "learning_rate": 2.9149152051021888e-05, + "loss": 0.6025, + "step": 14390 + }, + { + "epoch": 2.0871833894988585, + "grad_norm": 2.230759620666504, + "learning_rate": 2.9134657196695175e-05, + "loss": 0.5443, + "step": 14400 + }, + { + "epoch": 2.0886328224082327, + "grad_norm": 1.5165539979934692, + "learning_rate": 2.9120162342368463e-05, + "loss": 0.5124, + "step": 14410 + }, + { + "epoch": 2.090082255317607, + "grad_norm": 1.5038182735443115, + "learning_rate": 2.9105667488041747e-05, + "loss": 0.5254, + "step": 14420 + }, + { + "epoch": 2.091531688226981, + "grad_norm": 1.835642695426941, + "learning_rate": 2.9091172633715032e-05, + "loss": 0.5832, + "step": 14430 + }, + { + "epoch": 2.0929811211363556, + "grad_norm": 2.8495612144470215, + "learning_rate": 2.9076677779388316e-05, + "loss": 0.5343, + "step": 14440 + }, + { + "epoch": 2.0944305540457298, + "grad_norm": 2.3448574542999268, + "learning_rate": 2.9062182925061604e-05, + "loss": 0.5667, + "step": 14450 + }, + { + "epoch": 2.095879986955104, + "grad_norm": 1.0450005531311035, + "learning_rate": 2.904768807073489e-05, + "loss": 0.5853, + "step": 14460 + }, + { + "epoch": 2.097329419864478, + "grad_norm": 2.5745956897735596, + "learning_rate": 2.903319321640818e-05, + "loss": 0.591, + "step": 14470 + }, + { + "epoch": 2.0987788527738522, + "grad_norm": 1.0149450302124023, + "learning_rate": 2.9018698362081464e-05, + "loss": 0.6152, + "step": 14480 + }, + { + "epoch": 2.1002282856832264, + "grad_norm": 1.0218358039855957, + "learning_rate": 2.9004203507754745e-05, + "loss": 0.537, + "step": 14490 + }, + { + "epoch": 2.1016777185926006, + "grad_norm": 1.4043501615524292, + "learning_rate": 2.8989708653428032e-05, + "loss": 0.5272, + "step": 14500 + }, + { + "epoch": 2.1031271515019747, + "grad_norm": 1.0450868606567383, + "learning_rate": 2.897521379910132e-05, + "loss": 0.5758, + "step": 14510 + }, + { + "epoch": 2.104576584411349, + "grad_norm": 2.720461368560791, + "learning_rate": 2.8960718944774608e-05, + "loss": 0.5151, + "step": 14520 + }, + { + "epoch": 2.1060260173207235, + "grad_norm": 2.295283317565918, + "learning_rate": 2.8946224090447892e-05, + "loss": 0.5786, + "step": 14530 + }, + { + "epoch": 2.1074754502300976, + "grad_norm": 0.9478172063827515, + "learning_rate": 2.893172923612118e-05, + "loss": 0.5736, + "step": 14540 + }, + { + "epoch": 2.108924883139472, + "grad_norm": 2.118621826171875, + "learning_rate": 2.8917234381794468e-05, + "loss": 0.5304, + "step": 14550 + }, + { + "epoch": 2.110374316048846, + "grad_norm": 1.0907704830169678, + "learning_rate": 2.890273952746775e-05, + "loss": 0.5746, + "step": 14560 + }, + { + "epoch": 2.11182374895822, + "grad_norm": 2.393841505050659, + "learning_rate": 2.8888244673141036e-05, + "loss": 0.5995, + "step": 14570 + }, + { + "epoch": 2.1132731818675943, + "grad_norm": 2.9934537410736084, + "learning_rate": 2.887374981881432e-05, + "loss": 0.5894, + "step": 14580 + }, + { + "epoch": 2.1147226147769684, + "grad_norm": 3.190861940383911, + "learning_rate": 2.885925496448761e-05, + "loss": 0.6324, + "step": 14590 + }, + { + "epoch": 2.1161720476863426, + "grad_norm": 0.9476354122161865, + "learning_rate": 2.8844760110160896e-05, + "loss": 0.5397, + "step": 14600 + }, + { + "epoch": 2.1176214805957168, + "grad_norm": 0.9767410755157471, + "learning_rate": 2.883026525583418e-05, + "loss": 0.5659, + "step": 14610 + }, + { + "epoch": 2.119070913505091, + "grad_norm": 0.9902337193489075, + "learning_rate": 2.8815770401507465e-05, + "loss": 0.5553, + "step": 14620 + }, + { + "epoch": 2.1205203464144655, + "grad_norm": 0.8071011900901794, + "learning_rate": 2.880127554718075e-05, + "loss": 0.5573, + "step": 14630 + }, + { + "epoch": 2.1219697793238397, + "grad_norm": 1.1467477083206177, + "learning_rate": 2.8786780692854037e-05, + "loss": 0.6328, + "step": 14640 + }, + { + "epoch": 2.123419212233214, + "grad_norm": 1.8961005210876465, + "learning_rate": 2.8772285838527325e-05, + "loss": 0.6178, + "step": 14650 + }, + { + "epoch": 2.124868645142588, + "grad_norm": 0.9489744305610657, + "learning_rate": 2.8757790984200613e-05, + "loss": 0.5989, + "step": 14660 + }, + { + "epoch": 2.126318078051962, + "grad_norm": 0.7456279397010803, + "learning_rate": 2.8743296129873897e-05, + "loss": 0.5063, + "step": 14670 + }, + { + "epoch": 2.1277675109613363, + "grad_norm": 0.9865498542785645, + "learning_rate": 2.8728801275547185e-05, + "loss": 0.5289, + "step": 14680 + }, + { + "epoch": 2.1292169438707105, + "grad_norm": 1.7938251495361328, + "learning_rate": 2.8714306421220466e-05, + "loss": 0.5128, + "step": 14690 + }, + { + "epoch": 2.1306663767800846, + "grad_norm": 1.2648547887802124, + "learning_rate": 2.8699811566893753e-05, + "loss": 0.6091, + "step": 14700 + }, + { + "epoch": 2.132115809689459, + "grad_norm": 1.101527214050293, + "learning_rate": 2.868531671256704e-05, + "loss": 0.5869, + "step": 14710 + }, + { + "epoch": 2.1335652425988334, + "grad_norm": 1.024398922920227, + "learning_rate": 2.8670821858240326e-05, + "loss": 0.535, + "step": 14720 + }, + { + "epoch": 2.1350146755082076, + "grad_norm": 0.8312330842018127, + "learning_rate": 2.8656327003913613e-05, + "loss": 0.571, + "step": 14730 + }, + { + "epoch": 2.1364641084175817, + "grad_norm": 0.7251666784286499, + "learning_rate": 2.86418321495869e-05, + "loss": 0.5798, + "step": 14740 + }, + { + "epoch": 2.137913541326956, + "grad_norm": 4.033049583435059, + "learning_rate": 2.8627337295260182e-05, + "loss": 0.5914, + "step": 14750 + }, + { + "epoch": 2.13936297423633, + "grad_norm": 0.8272391557693481, + "learning_rate": 2.861284244093347e-05, + "loss": 0.553, + "step": 14760 + }, + { + "epoch": 2.140812407145704, + "grad_norm": 2.1350879669189453, + "learning_rate": 2.8598347586606754e-05, + "loss": 0.5949, + "step": 14770 + }, + { + "epoch": 2.1422618400550784, + "grad_norm": 1.17877197265625, + "learning_rate": 2.8583852732280042e-05, + "loss": 0.6239, + "step": 14780 + }, + { + "epoch": 2.1437112729644525, + "grad_norm": 3.4440340995788574, + "learning_rate": 2.856935787795333e-05, + "loss": 0.5632, + "step": 14790 + }, + { + "epoch": 2.1451607058738267, + "grad_norm": 2.5987493991851807, + "learning_rate": 2.8554863023626614e-05, + "loss": 0.609, + "step": 14800 + }, + { + "epoch": 2.146610138783201, + "grad_norm": 1.0455831289291382, + "learning_rate": 2.8540368169299902e-05, + "loss": 0.6558, + "step": 14810 + }, + { + "epoch": 2.1480595716925754, + "grad_norm": 1.111206293106079, + "learning_rate": 2.8525873314973183e-05, + "loss": 0.5465, + "step": 14820 + }, + { + "epoch": 2.1495090046019496, + "grad_norm": 1.54293692111969, + "learning_rate": 2.851137846064647e-05, + "loss": 0.5884, + "step": 14830 + }, + { + "epoch": 2.1509584375113238, + "grad_norm": 0.9174100160598755, + "learning_rate": 2.8496883606319758e-05, + "loss": 0.5876, + "step": 14840 + }, + { + "epoch": 2.152407870420698, + "grad_norm": 0.9613193869590759, + "learning_rate": 2.8482388751993046e-05, + "loss": 0.5343, + "step": 14850 + }, + { + "epoch": 2.153857303330072, + "grad_norm": 0.9370414018630981, + "learning_rate": 2.846789389766633e-05, + "loss": 0.5604, + "step": 14860 + }, + { + "epoch": 2.1553067362394462, + "grad_norm": 0.9880764484405518, + "learning_rate": 2.8453399043339618e-05, + "loss": 0.5512, + "step": 14870 + }, + { + "epoch": 2.1567561691488204, + "grad_norm": 0.9697717428207397, + "learning_rate": 2.84389041890129e-05, + "loss": 0.5975, + "step": 14880 + }, + { + "epoch": 2.1582056020581946, + "grad_norm": 2.6026132106781006, + "learning_rate": 2.8424409334686187e-05, + "loss": 0.5156, + "step": 14890 + }, + { + "epoch": 2.159655034967569, + "grad_norm": 1.054138422012329, + "learning_rate": 2.8409914480359475e-05, + "loss": 0.568, + "step": 14900 + }, + { + "epoch": 2.1611044678769433, + "grad_norm": 1.9634565114974976, + "learning_rate": 2.839541962603276e-05, + "loss": 0.4952, + "step": 14910 + }, + { + "epoch": 2.1625539007863175, + "grad_norm": 2.1717028617858887, + "learning_rate": 2.8380924771706047e-05, + "loss": 0.5208, + "step": 14920 + }, + { + "epoch": 2.1640033336956916, + "grad_norm": 1.046442985534668, + "learning_rate": 2.8366429917379334e-05, + "loss": 0.5488, + "step": 14930 + }, + { + "epoch": 2.165452766605066, + "grad_norm": 0.7042348980903625, + "learning_rate": 2.835193506305262e-05, + "loss": 0.5599, + "step": 14940 + }, + { + "epoch": 2.16690219951444, + "grad_norm": 0.8977163434028625, + "learning_rate": 2.8337440208725903e-05, + "loss": 0.611, + "step": 14950 + }, + { + "epoch": 2.168351632423814, + "grad_norm": 3.2048516273498535, + "learning_rate": 2.8322945354399187e-05, + "loss": 0.5933, + "step": 14960 + }, + { + "epoch": 2.1698010653331883, + "grad_norm": 2.3574509620666504, + "learning_rate": 2.8308450500072475e-05, + "loss": 0.5766, + "step": 14970 + }, + { + "epoch": 2.1712504982425624, + "grad_norm": 1.741523265838623, + "learning_rate": 2.8293955645745763e-05, + "loss": 0.5783, + "step": 14980 + }, + { + "epoch": 2.1726999311519366, + "grad_norm": 1.014177680015564, + "learning_rate": 2.827946079141905e-05, + "loss": 0.5648, + "step": 14990 + }, + { + "epoch": 2.174149364061311, + "grad_norm": 0.8169074058532715, + "learning_rate": 2.8264965937092335e-05, + "loss": 0.6333, + "step": 15000 + }, + { + "epoch": 2.174149364061311, + "eval_loss": 0.7357287406921387, + "eval_runtime": 671.8324, + "eval_samples_per_second": 51.345, + "eval_steps_per_second": 2.568, + "eval_token_accuracy": 0.00039937001511268895, + "step": 15000 + }, + { + "epoch": 2.1755987969706854, + "grad_norm": 1.113046407699585, + "learning_rate": 2.8250471082765616e-05, + "loss": 0.5788, + "step": 15010 + }, + { + "epoch": 2.1770482298800595, + "grad_norm": 2.7498059272766113, + "learning_rate": 2.8235976228438904e-05, + "loss": 0.5749, + "step": 15020 + }, + { + "epoch": 2.1784976627894337, + "grad_norm": 0.8708633780479431, + "learning_rate": 2.822148137411219e-05, + "loss": 0.5504, + "step": 15030 + }, + { + "epoch": 2.179947095698808, + "grad_norm": 1.1141761541366577, + "learning_rate": 2.820698651978548e-05, + "loss": 0.4918, + "step": 15040 + }, + { + "epoch": 2.181396528608182, + "grad_norm": 1.0088549852371216, + "learning_rate": 2.8192491665458764e-05, + "loss": 0.5798, + "step": 15050 + }, + { + "epoch": 2.182845961517556, + "grad_norm": 0.924612283706665, + "learning_rate": 2.817799681113205e-05, + "loss": 0.5411, + "step": 15060 + }, + { + "epoch": 2.1842953944269303, + "grad_norm": 3.6293270587921143, + "learning_rate": 2.816350195680534e-05, + "loss": 0.5907, + "step": 15070 + }, + { + "epoch": 2.1857448273363045, + "grad_norm": 1.6444721221923828, + "learning_rate": 2.814900710247862e-05, + "loss": 0.5934, + "step": 15080 + }, + { + "epoch": 2.187194260245679, + "grad_norm": 0.8702876567840576, + "learning_rate": 2.8134512248151908e-05, + "loss": 0.5257, + "step": 15090 + }, + { + "epoch": 2.1886436931550532, + "grad_norm": 1.1607176065444946, + "learning_rate": 2.8120017393825192e-05, + "loss": 0.6226, + "step": 15100 + }, + { + "epoch": 2.1900931260644274, + "grad_norm": 0.8363760113716125, + "learning_rate": 2.810552253949848e-05, + "loss": 0.555, + "step": 15110 + }, + { + "epoch": 2.1915425589738016, + "grad_norm": 0.882907509803772, + "learning_rate": 2.8091027685171768e-05, + "loss": 0.5416, + "step": 15120 + }, + { + "epoch": 2.1929919918831757, + "grad_norm": 1.025321364402771, + "learning_rate": 2.8076532830845052e-05, + "loss": 0.5778, + "step": 15130 + }, + { + "epoch": 2.19444142479255, + "grad_norm": 1.1927074193954468, + "learning_rate": 2.8062037976518336e-05, + "loss": 0.4824, + "step": 15140 + }, + { + "epoch": 2.195890857701924, + "grad_norm": 2.4487786293029785, + "learning_rate": 2.804754312219162e-05, + "loss": 0.5273, + "step": 15150 + }, + { + "epoch": 2.197340290611298, + "grad_norm": 0.8211236596107483, + "learning_rate": 2.803304826786491e-05, + "loss": 0.5425, + "step": 15160 + }, + { + "epoch": 2.1987897235206724, + "grad_norm": 1.0167851448059082, + "learning_rate": 2.8018553413538196e-05, + "loss": 0.5707, + "step": 15170 + }, + { + "epoch": 2.2002391564300465, + "grad_norm": 0.7350836396217346, + "learning_rate": 2.8004058559211484e-05, + "loss": 0.5831, + "step": 15180 + }, + { + "epoch": 2.201688589339421, + "grad_norm": 1.0720088481903076, + "learning_rate": 2.798956370488477e-05, + "loss": 0.6097, + "step": 15190 + }, + { + "epoch": 2.2031380222487953, + "grad_norm": 2.9851224422454834, + "learning_rate": 2.797506885055805e-05, + "loss": 0.6407, + "step": 15200 + }, + { + "epoch": 2.2045874551581695, + "grad_norm": 2.150064706802368, + "learning_rate": 2.7960573996231337e-05, + "loss": 0.565, + "step": 15210 + }, + { + "epoch": 2.2060368880675436, + "grad_norm": 2.035189151763916, + "learning_rate": 2.7946079141904625e-05, + "loss": 0.5112, + "step": 15220 + }, + { + "epoch": 2.2074863209769178, + "grad_norm": 1.1495574712753296, + "learning_rate": 2.7931584287577913e-05, + "loss": 0.5142, + "step": 15230 + }, + { + "epoch": 2.208935753886292, + "grad_norm": 0.8203104734420776, + "learning_rate": 2.7917089433251197e-05, + "loss": 0.569, + "step": 15240 + }, + { + "epoch": 2.210385186795666, + "grad_norm": 0.7035321593284607, + "learning_rate": 2.7902594578924485e-05, + "loss": 0.5688, + "step": 15250 + }, + { + "epoch": 2.2118346197050403, + "grad_norm": 0.8323057889938354, + "learning_rate": 2.7888099724597772e-05, + "loss": 0.515, + "step": 15260 + }, + { + "epoch": 2.2132840526144144, + "grad_norm": 2.930710792541504, + "learning_rate": 2.7873604870271053e-05, + "loss": 0.5306, + "step": 15270 + }, + { + "epoch": 2.214733485523789, + "grad_norm": 0.955631673336029, + "learning_rate": 2.785911001594434e-05, + "loss": 0.6126, + "step": 15280 + }, + { + "epoch": 2.216182918433163, + "grad_norm": 2.184109926223755, + "learning_rate": 2.7844615161617626e-05, + "loss": 0.5994, + "step": 15290 + }, + { + "epoch": 2.2176323513425373, + "grad_norm": 1.508414626121521, + "learning_rate": 2.7830120307290913e-05, + "loss": 0.5375, + "step": 15300 + }, + { + "epoch": 2.2190817842519115, + "grad_norm": 2.4226200580596924, + "learning_rate": 2.78156254529642e-05, + "loss": 0.567, + "step": 15310 + }, + { + "epoch": 2.2205312171612857, + "grad_norm": 0.8827345371246338, + "learning_rate": 2.7801130598637485e-05, + "loss": 0.5822, + "step": 15320 + }, + { + "epoch": 2.22198065007066, + "grad_norm": 2.3095834255218506, + "learning_rate": 2.778663574431077e-05, + "loss": 0.5613, + "step": 15330 + }, + { + "epoch": 2.223430082980034, + "grad_norm": 3.297713279724121, + "learning_rate": 2.7772140889984054e-05, + "loss": 0.5968, + "step": 15340 + }, + { + "epoch": 2.224879515889408, + "grad_norm": 3.122631788253784, + "learning_rate": 2.7757646035657342e-05, + "loss": 0.5367, + "step": 15350 + }, + { + "epoch": 2.2263289487987823, + "grad_norm": 0.7298904061317444, + "learning_rate": 2.774315118133063e-05, + "loss": 0.5646, + "step": 15360 + }, + { + "epoch": 2.2277783817081565, + "grad_norm": 2.29826283454895, + "learning_rate": 2.7728656327003917e-05, + "loss": 0.5341, + "step": 15370 + }, + { + "epoch": 2.229227814617531, + "grad_norm": 3.6760542392730713, + "learning_rate": 2.7714161472677202e-05, + "loss": 0.5884, + "step": 15380 + }, + { + "epoch": 2.230677247526905, + "grad_norm": 1.860216736793518, + "learning_rate": 2.769966661835049e-05, + "loss": 0.6191, + "step": 15390 + }, + { + "epoch": 2.2321266804362794, + "grad_norm": 1.4394019842147827, + "learning_rate": 2.768517176402377e-05, + "loss": 0.5397, + "step": 15400 + }, + { + "epoch": 2.2335761133456535, + "grad_norm": 0.8453474044799805, + "learning_rate": 2.7670676909697058e-05, + "loss": 0.5987, + "step": 15410 + }, + { + "epoch": 2.2350255462550277, + "grad_norm": 0.797804594039917, + "learning_rate": 2.7656182055370346e-05, + "loss": 0.5442, + "step": 15420 + }, + { + "epoch": 2.236474979164402, + "grad_norm": 0.9788017868995667, + "learning_rate": 2.764168720104363e-05, + "loss": 0.5397, + "step": 15430 + }, + { + "epoch": 2.237924412073776, + "grad_norm": 1.2205966711044312, + "learning_rate": 2.7627192346716918e-05, + "loss": 0.5546, + "step": 15440 + }, + { + "epoch": 2.23937384498315, + "grad_norm": 0.9380795359611511, + "learning_rate": 2.7612697492390206e-05, + "loss": 0.5002, + "step": 15450 + }, + { + "epoch": 2.240823277892525, + "grad_norm": 0.9293228983879089, + "learning_rate": 2.7598202638063487e-05, + "loss": 0.5216, + "step": 15460 + }, + { + "epoch": 2.242272710801899, + "grad_norm": 3.3147010803222656, + "learning_rate": 2.7583707783736774e-05, + "loss": 0.5185, + "step": 15470 + }, + { + "epoch": 2.243722143711273, + "grad_norm": 2.7040462493896484, + "learning_rate": 2.756921292941006e-05, + "loss": 0.5597, + "step": 15480 + }, + { + "epoch": 2.2451715766206473, + "grad_norm": 1.8201930522918701, + "learning_rate": 2.7554718075083347e-05, + "loss": 0.5687, + "step": 15490 + }, + { + "epoch": 2.2466210095300214, + "grad_norm": 2.6963207721710205, + "learning_rate": 2.7540223220756634e-05, + "loss": 0.548, + "step": 15500 + }, + { + "epoch": 2.2480704424393956, + "grad_norm": 2.536465644836426, + "learning_rate": 2.752572836642992e-05, + "loss": 0.5446, + "step": 15510 + }, + { + "epoch": 2.2495198753487697, + "grad_norm": 1.1007113456726074, + "learning_rate": 2.7511233512103206e-05, + "loss": 0.5694, + "step": 15520 + }, + { + "epoch": 2.250969308258144, + "grad_norm": 0.933037519454956, + "learning_rate": 2.7496738657776487e-05, + "loss": 0.5369, + "step": 15530 + }, + { + "epoch": 2.252418741167518, + "grad_norm": 1.0118540525436401, + "learning_rate": 2.7482243803449775e-05, + "loss": 0.6256, + "step": 15540 + }, + { + "epoch": 2.253868174076892, + "grad_norm": 1.152823805809021, + "learning_rate": 2.7467748949123063e-05, + "loss": 0.6073, + "step": 15550 + }, + { + "epoch": 2.2553176069862664, + "grad_norm": 1.694582462310791, + "learning_rate": 2.745325409479635e-05, + "loss": 0.5662, + "step": 15560 + }, + { + "epoch": 2.256767039895641, + "grad_norm": 1.02156400680542, + "learning_rate": 2.7438759240469635e-05, + "loss": 0.5219, + "step": 15570 + }, + { + "epoch": 2.258216472805015, + "grad_norm": 1.1287500858306885, + "learning_rate": 2.7424264386142923e-05, + "loss": 0.5136, + "step": 15580 + }, + { + "epoch": 2.2596659057143893, + "grad_norm": 3.2117209434509277, + "learning_rate": 2.7409769531816204e-05, + "loss": 0.536, + "step": 15590 + }, + { + "epoch": 2.2611153386237635, + "grad_norm": 2.113884925842285, + "learning_rate": 2.739527467748949e-05, + "loss": 0.6149, + "step": 15600 + }, + { + "epoch": 2.2625647715331376, + "grad_norm": 0.9687091708183289, + "learning_rate": 2.738077982316278e-05, + "loss": 0.5503, + "step": 15610 + }, + { + "epoch": 2.264014204442512, + "grad_norm": 2.7099952697753906, + "learning_rate": 2.7366284968836064e-05, + "loss": 0.5276, + "step": 15620 + }, + { + "epoch": 2.265463637351886, + "grad_norm": 3.803027629852295, + "learning_rate": 2.735179011450935e-05, + "loss": 0.5589, + "step": 15630 + }, + { + "epoch": 2.26691307026126, + "grad_norm": 3.0070250034332275, + "learning_rate": 2.733729526018264e-05, + "loss": 0.6067, + "step": 15640 + }, + { + "epoch": 2.2683625031706347, + "grad_norm": 1.7724757194519043, + "learning_rate": 2.7322800405855923e-05, + "loss": 0.6155, + "step": 15650 + }, + { + "epoch": 2.269811936080009, + "grad_norm": 0.9404800534248352, + "learning_rate": 2.7308305551529208e-05, + "loss": 0.522, + "step": 15660 + }, + { + "epoch": 2.271261368989383, + "grad_norm": 1.0227422714233398, + "learning_rate": 2.7293810697202492e-05, + "loss": 0.582, + "step": 15670 + }, + { + "epoch": 2.272710801898757, + "grad_norm": 1.8441566228866577, + "learning_rate": 2.727931584287578e-05, + "loss": 0.6357, + "step": 15680 + }, + { + "epoch": 2.2741602348081313, + "grad_norm": 1.2445653676986694, + "learning_rate": 2.7264820988549068e-05, + "loss": 0.5996, + "step": 15690 + }, + { + "epoch": 2.2756096677175055, + "grad_norm": 1.0026644468307495, + "learning_rate": 2.7250326134222352e-05, + "loss": 0.5768, + "step": 15700 + }, + { + "epoch": 2.2770591006268797, + "grad_norm": 1.053460717201233, + "learning_rate": 2.723583127989564e-05, + "loss": 0.5642, + "step": 15710 + }, + { + "epoch": 2.278508533536254, + "grad_norm": 0.7311908006668091, + "learning_rate": 2.722133642556892e-05, + "loss": 0.5459, + "step": 15720 + }, + { + "epoch": 2.279957966445628, + "grad_norm": 0.796303927898407, + "learning_rate": 2.720684157124221e-05, + "loss": 0.5471, + "step": 15730 + }, + { + "epoch": 2.281407399355002, + "grad_norm": 1.0040631294250488, + "learning_rate": 2.7192346716915496e-05, + "loss": 0.5486, + "step": 15740 + }, + { + "epoch": 2.2828568322643767, + "grad_norm": 0.8615463376045227, + "learning_rate": 2.7177851862588784e-05, + "loss": 0.5496, + "step": 15750 + }, + { + "epoch": 2.284306265173751, + "grad_norm": 2.0899055004119873, + "learning_rate": 2.716335700826207e-05, + "loss": 0.553, + "step": 15760 + }, + { + "epoch": 2.285755698083125, + "grad_norm": 1.0100775957107544, + "learning_rate": 2.7148862153935356e-05, + "loss": 0.5986, + "step": 15770 + }, + { + "epoch": 2.2872051309924992, + "grad_norm": 2.7385852336883545, + "learning_rate": 2.7134367299608644e-05, + "loss": 0.5775, + "step": 15780 + }, + { + "epoch": 2.2886545639018734, + "grad_norm": 1.0600306987762451, + "learning_rate": 2.7119872445281925e-05, + "loss": 0.5824, + "step": 15790 + }, + { + "epoch": 2.2901039968112475, + "grad_norm": 2.653311014175415, + "learning_rate": 2.7105377590955213e-05, + "loss": 0.6391, + "step": 15800 + }, + { + "epoch": 2.2915534297206217, + "grad_norm": 2.0966145992279053, + "learning_rate": 2.7090882736628497e-05, + "loss": 0.5532, + "step": 15810 + }, + { + "epoch": 2.293002862629996, + "grad_norm": 1.9514657258987427, + "learning_rate": 2.7076387882301785e-05, + "loss": 0.5264, + "step": 15820 + }, + { + "epoch": 2.2944522955393705, + "grad_norm": 0.8025791049003601, + "learning_rate": 2.7061893027975072e-05, + "loss": 0.5974, + "step": 15830 + }, + { + "epoch": 2.2959017284487446, + "grad_norm": 0.8524109125137329, + "learning_rate": 2.7047398173648357e-05, + "loss": 0.4788, + "step": 15840 + }, + { + "epoch": 2.297351161358119, + "grad_norm": 0.9747781157493591, + "learning_rate": 2.703290331932164e-05, + "loss": 0.5226, + "step": 15850 + }, + { + "epoch": 2.298800594267493, + "grad_norm": 1.0575156211853027, + "learning_rate": 2.7018408464994925e-05, + "loss": 0.5671, + "step": 15860 + }, + { + "epoch": 2.300250027176867, + "grad_norm": 1.6193407773971558, + "learning_rate": 2.7003913610668213e-05, + "loss": 0.5991, + "step": 15870 + }, + { + "epoch": 2.3016994600862413, + "grad_norm": 2.8316709995269775, + "learning_rate": 2.69894187563415e-05, + "loss": 0.5654, + "step": 15880 + }, + { + "epoch": 2.3031488929956154, + "grad_norm": 2.5655696392059326, + "learning_rate": 2.6974923902014785e-05, + "loss": 0.5776, + "step": 15890 + }, + { + "epoch": 2.3045983259049896, + "grad_norm": 0.8109912276268005, + "learning_rate": 2.6960429047688073e-05, + "loss": 0.5703, + "step": 15900 + }, + { + "epoch": 2.3060477588143637, + "grad_norm": 1.0215001106262207, + "learning_rate": 2.694593419336136e-05, + "loss": 0.6276, + "step": 15910 + }, + { + "epoch": 2.307497191723738, + "grad_norm": 0.8743044137954712, + "learning_rate": 2.6931439339034642e-05, + "loss": 0.5279, + "step": 15920 + }, + { + "epoch": 2.308946624633112, + "grad_norm": 3.493328809738159, + "learning_rate": 2.691694448470793e-05, + "loss": 0.5984, + "step": 15930 + }, + { + "epoch": 2.3103960575424867, + "grad_norm": 0.842359185218811, + "learning_rate": 2.6902449630381217e-05, + "loss": 0.5373, + "step": 15940 + }, + { + "epoch": 2.311845490451861, + "grad_norm": 0.8587477207183838, + "learning_rate": 2.68879547760545e-05, + "loss": 0.6291, + "step": 15950 + }, + { + "epoch": 2.313294923361235, + "grad_norm": 1.6115646362304688, + "learning_rate": 2.687345992172779e-05, + "loss": 0.5439, + "step": 15960 + }, + { + "epoch": 2.314744356270609, + "grad_norm": 1.8800400495529175, + "learning_rate": 2.6858965067401077e-05, + "loss": 0.5898, + "step": 15970 + }, + { + "epoch": 2.3161937891799833, + "grad_norm": 1.8140876293182373, + "learning_rate": 2.6844470213074358e-05, + "loss": 0.6311, + "step": 15980 + }, + { + "epoch": 2.3176432220893575, + "grad_norm": 0.8107464909553528, + "learning_rate": 2.6829975358747646e-05, + "loss": 0.5671, + "step": 15990 + }, + { + "epoch": 2.3190926549987316, + "grad_norm": 2.7339236736297607, + "learning_rate": 2.681548050442093e-05, + "loss": 0.5767, + "step": 16000 + }, + { + "epoch": 2.3190926549987316, + "eval_loss": 0.7302293181419373, + "eval_runtime": 671.5101, + "eval_samples_per_second": 51.369, + "eval_steps_per_second": 2.569, + "eval_token_accuracy": 0.0004098247275501939, + "step": 16000 + }, + { + "epoch": 2.320542087908106, + "grad_norm": 0.9360762238502502, + "learning_rate": 2.6800985650094218e-05, + "loss": 0.5811, + "step": 16010 + }, + { + "epoch": 2.3219915208174804, + "grad_norm": 1.0557974576950073, + "learning_rate": 2.6786490795767506e-05, + "loss": 0.5832, + "step": 16020 + }, + { + "epoch": 2.3234409537268546, + "grad_norm": 0.8322176337242126, + "learning_rate": 2.677199594144079e-05, + "loss": 0.5846, + "step": 16030 + }, + { + "epoch": 2.3248903866362287, + "grad_norm": 1.0784193277359009, + "learning_rate": 2.6757501087114078e-05, + "loss": 0.5706, + "step": 16040 + }, + { + "epoch": 2.326339819545603, + "grad_norm": 1.1414878368377686, + "learning_rate": 2.674300623278736e-05, + "loss": 0.615, + "step": 16050 + }, + { + "epoch": 2.327789252454977, + "grad_norm": 1.0786707401275635, + "learning_rate": 2.6728511378460647e-05, + "loss": 0.5238, + "step": 16060 + }, + { + "epoch": 2.329238685364351, + "grad_norm": 2.1784827709198, + "learning_rate": 2.6714016524133934e-05, + "loss": 0.5548, + "step": 16070 + }, + { + "epoch": 2.3306881182737254, + "grad_norm": 1.0003679990768433, + "learning_rate": 2.6699521669807222e-05, + "loss": 0.5994, + "step": 16080 + }, + { + "epoch": 2.3321375511830995, + "grad_norm": 0.9344300627708435, + "learning_rate": 2.6685026815480506e-05, + "loss": 0.5763, + "step": 16090 + }, + { + "epoch": 2.3335869840924737, + "grad_norm": 0.8955085277557373, + "learning_rate": 2.6670531961153794e-05, + "loss": 0.5758, + "step": 16100 + }, + { + "epoch": 2.335036417001848, + "grad_norm": 0.9898104667663574, + "learning_rate": 2.6656037106827075e-05, + "loss": 0.5956, + "step": 16110 + }, + { + "epoch": 2.336485849911222, + "grad_norm": 0.795774519443512, + "learning_rate": 2.6641542252500363e-05, + "loss": 0.5701, + "step": 16120 + }, + { + "epoch": 2.3379352828205966, + "grad_norm": 0.7755706906318665, + "learning_rate": 2.662704739817365e-05, + "loss": 0.5359, + "step": 16130 + }, + { + "epoch": 2.3393847157299708, + "grad_norm": 0.9177207946777344, + "learning_rate": 2.6612552543846935e-05, + "loss": 0.5405, + "step": 16140 + }, + { + "epoch": 2.340834148639345, + "grad_norm": 0.9761675000190735, + "learning_rate": 2.6598057689520223e-05, + "loss": 0.6509, + "step": 16150 + }, + { + "epoch": 2.342283581548719, + "grad_norm": 1.5821987390518188, + "learning_rate": 2.658356283519351e-05, + "loss": 0.5317, + "step": 16160 + }, + { + "epoch": 2.3437330144580932, + "grad_norm": 1.0644711256027222, + "learning_rate": 2.656906798086679e-05, + "loss": 0.5593, + "step": 16170 + }, + { + "epoch": 2.3451824473674674, + "grad_norm": 0.7431843280792236, + "learning_rate": 2.655457312654008e-05, + "loss": 0.5443, + "step": 16180 + }, + { + "epoch": 2.3466318802768416, + "grad_norm": 1.9291008710861206, + "learning_rate": 2.6540078272213364e-05, + "loss": 0.5501, + "step": 16190 + }, + { + "epoch": 2.3480813131862157, + "grad_norm": 1.1686913967132568, + "learning_rate": 2.652558341788665e-05, + "loss": 0.5686, + "step": 16200 + }, + { + "epoch": 2.3495307460955903, + "grad_norm": 0.8081609606742859, + "learning_rate": 2.651108856355994e-05, + "loss": 0.5372, + "step": 16210 + }, + { + "epoch": 2.3509801790049645, + "grad_norm": 2.6243176460266113, + "learning_rate": 2.6496593709233223e-05, + "loss": 0.5796, + "step": 16220 + }, + { + "epoch": 2.3524296119143386, + "grad_norm": 2.3926889896392822, + "learning_rate": 2.648209885490651e-05, + "loss": 0.5658, + "step": 16230 + }, + { + "epoch": 2.353879044823713, + "grad_norm": 0.7738615870475769, + "learning_rate": 2.6467604000579792e-05, + "loss": 0.5407, + "step": 16240 + }, + { + "epoch": 2.355328477733087, + "grad_norm": 1.307656168937683, + "learning_rate": 2.645310914625308e-05, + "loss": 0.4653, + "step": 16250 + }, + { + "epoch": 2.356777910642461, + "grad_norm": 3.530689001083374, + "learning_rate": 2.6438614291926368e-05, + "loss": 0.5606, + "step": 16260 + }, + { + "epoch": 2.3582273435518353, + "grad_norm": 3.2263526916503906, + "learning_rate": 2.6424119437599655e-05, + "loss": 0.5717, + "step": 16270 + }, + { + "epoch": 2.3596767764612094, + "grad_norm": 1.393611192703247, + "learning_rate": 2.640962458327294e-05, + "loss": 0.6187, + "step": 16280 + }, + { + "epoch": 2.3611262093705836, + "grad_norm": 1.1633098125457764, + "learning_rate": 2.6395129728946227e-05, + "loss": 0.6338, + "step": 16290 + }, + { + "epoch": 2.3625756422799578, + "grad_norm": 1.472760796546936, + "learning_rate": 2.638063487461951e-05, + "loss": 0.5886, + "step": 16300 + }, + { + "epoch": 2.3640250751893324, + "grad_norm": 0.9567728042602539, + "learning_rate": 2.6366140020292796e-05, + "loss": 0.5278, + "step": 16310 + }, + { + "epoch": 2.3654745080987065, + "grad_norm": 1.845215916633606, + "learning_rate": 2.6351645165966084e-05, + "loss": 0.6091, + "step": 16320 + }, + { + "epoch": 2.3669239410080807, + "grad_norm": 2.0079545974731445, + "learning_rate": 2.6337150311639368e-05, + "loss": 0.6078, + "step": 16330 + }, + { + "epoch": 2.368373373917455, + "grad_norm": 3.1055665016174316, + "learning_rate": 2.6322655457312656e-05, + "loss": 0.5967, + "step": 16340 + }, + { + "epoch": 2.369822806826829, + "grad_norm": 2.2276206016540527, + "learning_rate": 2.6308160602985944e-05, + "loss": 0.546, + "step": 16350 + }, + { + "epoch": 2.371272239736203, + "grad_norm": 1.090536117553711, + "learning_rate": 2.6293665748659228e-05, + "loss": 0.5972, + "step": 16360 + }, + { + "epoch": 2.3727216726455773, + "grad_norm": 3.283720016479492, + "learning_rate": 2.6279170894332513e-05, + "loss": 0.5906, + "step": 16370 + }, + { + "epoch": 2.3741711055549515, + "grad_norm": 1.0914537906646729, + "learning_rate": 2.6264676040005797e-05, + "loss": 0.6173, + "step": 16380 + }, + { + "epoch": 2.375620538464326, + "grad_norm": 1.635926604270935, + "learning_rate": 2.6250181185679085e-05, + "loss": 0.6039, + "step": 16390 + }, + { + "epoch": 2.3770699713737002, + "grad_norm": 0.960096001625061, + "learning_rate": 2.6235686331352372e-05, + "loss": 0.6001, + "step": 16400 + }, + { + "epoch": 2.3785194042830744, + "grad_norm": 0.8694259524345398, + "learning_rate": 2.6221191477025657e-05, + "loss": 0.5376, + "step": 16410 + }, + { + "epoch": 2.3799688371924486, + "grad_norm": 0.9353076815605164, + "learning_rate": 2.6206696622698944e-05, + "loss": 0.6184, + "step": 16420 + }, + { + "epoch": 2.3814182701018227, + "grad_norm": 2.265793800354004, + "learning_rate": 2.6192201768372225e-05, + "loss": 0.5653, + "step": 16430 + }, + { + "epoch": 2.382867703011197, + "grad_norm": 0.8421527147293091, + "learning_rate": 2.6177706914045513e-05, + "loss": 0.6106, + "step": 16440 + }, + { + "epoch": 2.384317135920571, + "grad_norm": 1.8163224458694458, + "learning_rate": 2.61632120597188e-05, + "loss": 0.519, + "step": 16450 + }, + { + "epoch": 2.385766568829945, + "grad_norm": 2.589510679244995, + "learning_rate": 2.614871720539209e-05, + "loss": 0.629, + "step": 16460 + }, + { + "epoch": 2.3872160017393194, + "grad_norm": 0.9429428577423096, + "learning_rate": 2.6134222351065373e-05, + "loss": 0.616, + "step": 16470 + }, + { + "epoch": 2.3886654346486935, + "grad_norm": 1.0219968557357788, + "learning_rate": 2.611972749673866e-05, + "loss": 0.5683, + "step": 16480 + }, + { + "epoch": 2.3901148675580677, + "grad_norm": 0.883083701133728, + "learning_rate": 2.610523264241195e-05, + "loss": 0.5917, + "step": 16490 + }, + { + "epoch": 2.3915643004674423, + "grad_norm": 2.5757100582122803, + "learning_rate": 2.609073778808523e-05, + "loss": 0.5856, + "step": 16500 + }, + { + "epoch": 2.3930137333768164, + "grad_norm": 2.343125581741333, + "learning_rate": 2.6076242933758517e-05, + "loss": 0.5585, + "step": 16510 + }, + { + "epoch": 2.3944631662861906, + "grad_norm": 2.2022042274475098, + "learning_rate": 2.60617480794318e-05, + "loss": 0.5742, + "step": 16520 + }, + { + "epoch": 2.3959125991955648, + "grad_norm": 1.8321778774261475, + "learning_rate": 2.604725322510509e-05, + "loss": 0.6063, + "step": 16530 + }, + { + "epoch": 2.397362032104939, + "grad_norm": 3.5156893730163574, + "learning_rate": 2.6032758370778377e-05, + "loss": 0.5967, + "step": 16540 + }, + { + "epoch": 2.398811465014313, + "grad_norm": 0.9662124514579773, + "learning_rate": 2.601826351645166e-05, + "loss": 0.5819, + "step": 16550 + }, + { + "epoch": 2.4002608979236872, + "grad_norm": 2.2844319343566895, + "learning_rate": 2.6003768662124946e-05, + "loss": 0.5899, + "step": 16560 + }, + { + "epoch": 2.4017103308330614, + "grad_norm": 1.8318297863006592, + "learning_rate": 2.598927380779823e-05, + "loss": 0.5363, + "step": 16570 + }, + { + "epoch": 2.403159763742436, + "grad_norm": 0.8952181935310364, + "learning_rate": 2.5974778953471518e-05, + "loss": 0.5589, + "step": 16580 + }, + { + "epoch": 2.40460919665181, + "grad_norm": 0.9220480918884277, + "learning_rate": 2.5960284099144806e-05, + "loss": 0.5619, + "step": 16590 + }, + { + "epoch": 2.4060586295611843, + "grad_norm": 1.3954976797103882, + "learning_rate": 2.594578924481809e-05, + "loss": 0.5288, + "step": 16600 + }, + { + "epoch": 2.4075080624705585, + "grad_norm": 0.9671636819839478, + "learning_rate": 2.5931294390491378e-05, + "loss": 0.5749, + "step": 16610 + }, + { + "epoch": 2.4089574953799326, + "grad_norm": 2.2830398082733154, + "learning_rate": 2.5916799536164666e-05, + "loss": 0.5306, + "step": 16620 + }, + { + "epoch": 2.410406928289307, + "grad_norm": 2.176110029220581, + "learning_rate": 2.5902304681837947e-05, + "loss": 0.5812, + "step": 16630 + }, + { + "epoch": 2.411856361198681, + "grad_norm": 0.8190110325813293, + "learning_rate": 2.5887809827511234e-05, + "loss": 0.5436, + "step": 16640 + }, + { + "epoch": 2.413305794108055, + "grad_norm": 1.3935105800628662, + "learning_rate": 2.5873314973184522e-05, + "loss": 0.5721, + "step": 16650 + }, + { + "epoch": 2.4147552270174293, + "grad_norm": 1.1035131216049194, + "learning_rate": 2.5858820118857806e-05, + "loss": 0.6168, + "step": 16660 + }, + { + "epoch": 2.4162046599268034, + "grad_norm": 2.679525136947632, + "learning_rate": 2.5844325264531094e-05, + "loss": 0.6231, + "step": 16670 + }, + { + "epoch": 2.4176540928361776, + "grad_norm": 0.9634791016578674, + "learning_rate": 2.5829830410204382e-05, + "loss": 0.5816, + "step": 16680 + }, + { + "epoch": 2.419103525745552, + "grad_norm": 1.4482085704803467, + "learning_rate": 2.5815335555877663e-05, + "loss": 0.5483, + "step": 16690 + }, + { + "epoch": 2.4205529586549264, + "grad_norm": 0.7800090312957764, + "learning_rate": 2.580084070155095e-05, + "loss": 0.524, + "step": 16700 + }, + { + "epoch": 2.4220023915643005, + "grad_norm": 0.8343473076820374, + "learning_rate": 2.5786345847224235e-05, + "loss": 0.5098, + "step": 16710 + }, + { + "epoch": 2.4234518244736747, + "grad_norm": 3.29060435295105, + "learning_rate": 2.5771850992897523e-05, + "loss": 0.588, + "step": 16720 + }, + { + "epoch": 2.424901257383049, + "grad_norm": 0.890139639377594, + "learning_rate": 2.575735613857081e-05, + "loss": 0.6131, + "step": 16730 + }, + { + "epoch": 2.426350690292423, + "grad_norm": 2.1215193271636963, + "learning_rate": 2.5742861284244095e-05, + "loss": 0.5887, + "step": 16740 + }, + { + "epoch": 2.427800123201797, + "grad_norm": 1.007075548171997, + "learning_rate": 2.5728366429917383e-05, + "loss": 0.5369, + "step": 16750 + }, + { + "epoch": 2.4292495561111713, + "grad_norm": 2.85379695892334, + "learning_rate": 2.5713871575590663e-05, + "loss": 0.5485, + "step": 16760 + }, + { + "epoch": 2.430698989020546, + "grad_norm": 1.8373109102249146, + "learning_rate": 2.569937672126395e-05, + "loss": 0.5603, + "step": 16770 + }, + { + "epoch": 2.43214842192992, + "grad_norm": 1.6939905881881714, + "learning_rate": 2.568488186693724e-05, + "loss": 0.5318, + "step": 16780 + }, + { + "epoch": 2.4335978548392942, + "grad_norm": 1.1219695806503296, + "learning_rate": 2.5670387012610523e-05, + "loss": 0.5429, + "step": 16790 + }, + { + "epoch": 2.4350472877486684, + "grad_norm": 3.929163932800293, + "learning_rate": 2.565589215828381e-05, + "loss": 0.5971, + "step": 16800 + }, + { + "epoch": 2.4364967206580426, + "grad_norm": 2.685321807861328, + "learning_rate": 2.56413973039571e-05, + "loss": 0.6142, + "step": 16810 + }, + { + "epoch": 2.4379461535674167, + "grad_norm": 2.1034326553344727, + "learning_rate": 2.562690244963038e-05, + "loss": 0.6629, + "step": 16820 + }, + { + "epoch": 2.439395586476791, + "grad_norm": 3.248325824737549, + "learning_rate": 2.5612407595303668e-05, + "loss": 0.5933, + "step": 16830 + }, + { + "epoch": 2.440845019386165, + "grad_norm": 0.8918882012367249, + "learning_rate": 2.5597912740976955e-05, + "loss": 0.608, + "step": 16840 + }, + { + "epoch": 2.442294452295539, + "grad_norm": 2.837606906890869, + "learning_rate": 2.558341788665024e-05, + "loss": 0.51, + "step": 16850 + }, + { + "epoch": 2.4437438852049134, + "grad_norm": 2.903970241546631, + "learning_rate": 2.5568923032323527e-05, + "loss": 0.6183, + "step": 16860 + }, + { + "epoch": 2.445193318114288, + "grad_norm": 0.7252295017242432, + "learning_rate": 2.5554428177996815e-05, + "loss": 0.5576, + "step": 16870 + }, + { + "epoch": 2.446642751023662, + "grad_norm": 2.373798370361328, + "learning_rate": 2.55399333236701e-05, + "loss": 0.5976, + "step": 16880 + }, + { + "epoch": 2.4480921839330363, + "grad_norm": 0.8515360355377197, + "learning_rate": 2.5525438469343384e-05, + "loss": 0.576, + "step": 16890 + }, + { + "epoch": 2.4495416168424105, + "grad_norm": 1.0345165729522705, + "learning_rate": 2.5510943615016668e-05, + "loss": 0.5842, + "step": 16900 + }, + { + "epoch": 2.4509910497517846, + "grad_norm": 0.9085811972618103, + "learning_rate": 2.5496448760689956e-05, + "loss": 0.5532, + "step": 16910 + }, + { + "epoch": 2.4524404826611588, + "grad_norm": 2.3138504028320312, + "learning_rate": 2.5481953906363244e-05, + "loss": 0.5709, + "step": 16920 + }, + { + "epoch": 2.453889915570533, + "grad_norm": 2.1807808876037598, + "learning_rate": 2.5467459052036528e-05, + "loss": 0.5493, + "step": 16930 + }, + { + "epoch": 2.455339348479907, + "grad_norm": 0.8590166568756104, + "learning_rate": 2.5452964197709816e-05, + "loss": 0.5581, + "step": 16940 + }, + { + "epoch": 2.4567887813892813, + "grad_norm": 1.3868404626846313, + "learning_rate": 2.5438469343383097e-05, + "loss": 0.5393, + "step": 16950 + }, + { + "epoch": 2.458238214298656, + "grad_norm": 2.7364912033081055, + "learning_rate": 2.5423974489056385e-05, + "loss": 0.5478, + "step": 16960 + }, + { + "epoch": 2.45968764720803, + "grad_norm": 0.6827089786529541, + "learning_rate": 2.5409479634729672e-05, + "loss": 0.59, + "step": 16970 + }, + { + "epoch": 2.461137080117404, + "grad_norm": 0.8788642287254333, + "learning_rate": 2.5394984780402957e-05, + "loss": 0.5632, + "step": 16980 + }, + { + "epoch": 2.4625865130267783, + "grad_norm": 1.083719253540039, + "learning_rate": 2.5380489926076244e-05, + "loss": 0.5878, + "step": 16990 + }, + { + "epoch": 2.4640359459361525, + "grad_norm": 0.7993260025978088, + "learning_rate": 2.5365995071749532e-05, + "loss": 0.5351, + "step": 17000 + }, + { + "epoch": 2.4640359459361525, + "eval_loss": 0.7283534407615662, + "eval_runtime": 670.6485, + "eval_samples_per_second": 51.435, + "eval_steps_per_second": 2.572, + "eval_token_accuracy": 0.00039727907262518793, + "step": 17000 + }, + { + "epoch": 2.4654853788455267, + "grad_norm": 2.387397527694702, + "learning_rate": 2.535150021742282e-05, + "loss": 0.5472, + "step": 17010 + }, + { + "epoch": 2.466934811754901, + "grad_norm": 2.854649305343628, + "learning_rate": 2.53370053630961e-05, + "loss": 0.5452, + "step": 17020 + }, + { + "epoch": 2.468384244664275, + "grad_norm": 1.1013128757476807, + "learning_rate": 2.532251050876939e-05, + "loss": 0.6453, + "step": 17030 + }, + { + "epoch": 2.469833677573649, + "grad_norm": 0.925757110118866, + "learning_rate": 2.5308015654442673e-05, + "loss": 0.5083, + "step": 17040 + }, + { + "epoch": 2.4712831104830233, + "grad_norm": 0.8227369785308838, + "learning_rate": 2.529352080011596e-05, + "loss": 0.5315, + "step": 17050 + }, + { + "epoch": 2.472732543392398, + "grad_norm": 2.7134206295013428, + "learning_rate": 2.527902594578925e-05, + "loss": 0.548, + "step": 17060 + }, + { + "epoch": 2.474181976301772, + "grad_norm": 1.0173161029815674, + "learning_rate": 2.5264531091462533e-05, + "loss": 0.5493, + "step": 17070 + }, + { + "epoch": 2.475631409211146, + "grad_norm": 1.1902835369110107, + "learning_rate": 2.5250036237135817e-05, + "loss": 0.5379, + "step": 17080 + }, + { + "epoch": 2.4770808421205204, + "grad_norm": 2.2522265911102295, + "learning_rate": 2.52355413828091e-05, + "loss": 0.5814, + "step": 17090 + }, + { + "epoch": 2.4785302750298945, + "grad_norm": 1.12079918384552, + "learning_rate": 2.522104652848239e-05, + "loss": 0.578, + "step": 17100 + }, + { + "epoch": 2.4799797079392687, + "grad_norm": 1.099763035774231, + "learning_rate": 2.5206551674155677e-05, + "loss": 0.5816, + "step": 17110 + }, + { + "epoch": 2.481429140848643, + "grad_norm": 2.820364475250244, + "learning_rate": 2.519205681982896e-05, + "loss": 0.5926, + "step": 17120 + }, + { + "epoch": 2.482878573758017, + "grad_norm": 1.0384719371795654, + "learning_rate": 2.517756196550225e-05, + "loss": 0.6269, + "step": 17130 + }, + { + "epoch": 2.4843280066673916, + "grad_norm": 0.8136078715324402, + "learning_rate": 2.516306711117553e-05, + "loss": 0.5686, + "step": 17140 + }, + { + "epoch": 2.485777439576766, + "grad_norm": 1.984144926071167, + "learning_rate": 2.5148572256848818e-05, + "loss": 0.5829, + "step": 17150 + }, + { + "epoch": 2.48722687248614, + "grad_norm": 1.0443241596221924, + "learning_rate": 2.5134077402522106e-05, + "loss": 0.5972, + "step": 17160 + }, + { + "epoch": 2.488676305395514, + "grad_norm": 2.3265018463134766, + "learning_rate": 2.5119582548195393e-05, + "loss": 0.5548, + "step": 17170 + }, + { + "epoch": 2.4901257383048883, + "grad_norm": 2.045551061630249, + "learning_rate": 2.5105087693868678e-05, + "loss": 0.5481, + "step": 17180 + }, + { + "epoch": 2.4915751712142624, + "grad_norm": 2.1172616481781006, + "learning_rate": 2.5090592839541965e-05, + "loss": 0.6316, + "step": 17190 + }, + { + "epoch": 2.4930246041236366, + "grad_norm": 2.111454963684082, + "learning_rate": 2.5076097985215253e-05, + "loss": 0.5581, + "step": 17200 + }, + { + "epoch": 2.4944740370330107, + "grad_norm": 0.8427057266235352, + "learning_rate": 2.5061603130888534e-05, + "loss": 0.5431, + "step": 17210 + }, + { + "epoch": 2.495923469942385, + "grad_norm": 1.2653619050979614, + "learning_rate": 2.5047108276561822e-05, + "loss": 0.5838, + "step": 17220 + }, + { + "epoch": 2.497372902851759, + "grad_norm": 0.9283726215362549, + "learning_rate": 2.5032613422235106e-05, + "loss": 0.6017, + "step": 17230 + }, + { + "epoch": 2.498822335761133, + "grad_norm": 3.0021955966949463, + "learning_rate": 2.5018118567908394e-05, + "loss": 0.5924, + "step": 17240 + }, + { + "epoch": 2.500271768670508, + "grad_norm": 0.9704183340072632, + "learning_rate": 2.5003623713581682e-05, + "loss": 0.6268, + "step": 17250 + }, + { + "epoch": 2.501721201579882, + "grad_norm": 1.6761257648468018, + "learning_rate": 2.4989128859254966e-05, + "loss": 0.527, + "step": 17260 + }, + { + "epoch": 2.503170634489256, + "grad_norm": 0.9880573153495789, + "learning_rate": 2.497463400492825e-05, + "loss": 0.5573, + "step": 17270 + }, + { + "epoch": 2.5046200673986303, + "grad_norm": 0.8700310587882996, + "learning_rate": 2.4960139150601538e-05, + "loss": 0.6175, + "step": 17280 + }, + { + "epoch": 2.5060695003080045, + "grad_norm": 2.353101968765259, + "learning_rate": 2.4945644296274823e-05, + "loss": 0.5717, + "step": 17290 + }, + { + "epoch": 2.5075189332173786, + "grad_norm": 0.9893775582313538, + "learning_rate": 2.493114944194811e-05, + "loss": 0.5915, + "step": 17300 + }, + { + "epoch": 2.508968366126753, + "grad_norm": 1.2474043369293213, + "learning_rate": 2.4916654587621395e-05, + "loss": 0.5724, + "step": 17310 + }, + { + "epoch": 2.5104177990361274, + "grad_norm": 1.757016658782959, + "learning_rate": 2.490215973329468e-05, + "loss": 0.6423, + "step": 17320 + }, + { + "epoch": 2.5118672319455015, + "grad_norm": 2.3559505939483643, + "learning_rate": 2.4887664878967967e-05, + "loss": 0.6315, + "step": 17330 + }, + { + "epoch": 2.5133166648548757, + "grad_norm": 2.5416176319122314, + "learning_rate": 2.4873170024641255e-05, + "loss": 0.5803, + "step": 17340 + }, + { + "epoch": 2.51476609776425, + "grad_norm": 0.9849221110343933, + "learning_rate": 2.485867517031454e-05, + "loss": 0.5358, + "step": 17350 + }, + { + "epoch": 2.516215530673624, + "grad_norm": 1.1437593698501587, + "learning_rate": 2.4844180315987827e-05, + "loss": 0.6097, + "step": 17360 + }, + { + "epoch": 2.517664963582998, + "grad_norm": 1.940948486328125, + "learning_rate": 2.482968546166111e-05, + "loss": 0.5221, + "step": 17370 + }, + { + "epoch": 2.5191143964923723, + "grad_norm": 1.1979233026504517, + "learning_rate": 2.48151906073344e-05, + "loss": 0.5467, + "step": 17380 + }, + { + "epoch": 2.5205638294017465, + "grad_norm": 0.8096486330032349, + "learning_rate": 2.4800695753007683e-05, + "loss": 0.6043, + "step": 17390 + }, + { + "epoch": 2.5220132623111207, + "grad_norm": 1.0167595148086548, + "learning_rate": 2.478620089868097e-05, + "loss": 0.5643, + "step": 17400 + }, + { + "epoch": 2.523462695220495, + "grad_norm": 0.9136523008346558, + "learning_rate": 2.4771706044354255e-05, + "loss": 0.5803, + "step": 17410 + }, + { + "epoch": 2.524912128129869, + "grad_norm": 0.9761309027671814, + "learning_rate": 2.475721119002754e-05, + "loss": 0.5619, + "step": 17420 + }, + { + "epoch": 2.526361561039243, + "grad_norm": 0.8645803928375244, + "learning_rate": 2.4742716335700827e-05, + "loss": 0.5925, + "step": 17430 + }, + { + "epoch": 2.5278109939486177, + "grad_norm": 1.0616345405578613, + "learning_rate": 2.4728221481374115e-05, + "loss": 0.5879, + "step": 17440 + }, + { + "epoch": 2.529260426857992, + "grad_norm": 1.0399123430252075, + "learning_rate": 2.47137266270474e-05, + "loss": 0.5642, + "step": 17450 + }, + { + "epoch": 2.530709859767366, + "grad_norm": 1.3098094463348389, + "learning_rate": 2.4699231772720684e-05, + "loss": 0.5524, + "step": 17460 + }, + { + "epoch": 2.5321592926767402, + "grad_norm": 2.086995840072632, + "learning_rate": 2.468473691839397e-05, + "loss": 0.5656, + "step": 17470 + }, + { + "epoch": 2.5336087255861144, + "grad_norm": 0.8559460043907166, + "learning_rate": 2.4670242064067256e-05, + "loss": 0.5693, + "step": 17480 + }, + { + "epoch": 2.5350581584954885, + "grad_norm": 1.3102972507476807, + "learning_rate": 2.4655747209740544e-05, + "loss": 0.6378, + "step": 17490 + }, + { + "epoch": 2.5365075914048627, + "grad_norm": 2.715803623199463, + "learning_rate": 2.4641252355413828e-05, + "loss": 0.589, + "step": 17500 + }, + { + "epoch": 2.5379570243142373, + "grad_norm": 1.1338856220245361, + "learning_rate": 2.4626757501087116e-05, + "loss": 0.573, + "step": 17510 + }, + { + "epoch": 2.5394064572236115, + "grad_norm": 1.6766473054885864, + "learning_rate": 2.46122626467604e-05, + "loss": 0.5306, + "step": 17520 + }, + { + "epoch": 2.5408558901329856, + "grad_norm": 1.0871402025222778, + "learning_rate": 2.4597767792433688e-05, + "loss": 0.5491, + "step": 17530 + }, + { + "epoch": 2.54230532304236, + "grad_norm": 1.2214182615280151, + "learning_rate": 2.4583272938106976e-05, + "loss": 0.6273, + "step": 17540 + }, + { + "epoch": 2.543754755951734, + "grad_norm": 1.6520607471466064, + "learning_rate": 2.456877808378026e-05, + "loss": 0.6235, + "step": 17550 + }, + { + "epoch": 2.545204188861108, + "grad_norm": 2.7489123344421387, + "learning_rate": 2.4554283229453544e-05, + "loss": 0.5907, + "step": 17560 + }, + { + "epoch": 2.5466536217704823, + "grad_norm": 1.1337274312973022, + "learning_rate": 2.4539788375126832e-05, + "loss": 0.5336, + "step": 17570 + }, + { + "epoch": 2.5481030546798564, + "grad_norm": 1.1375880241394043, + "learning_rate": 2.4525293520800116e-05, + "loss": 0.5142, + "step": 17580 + }, + { + "epoch": 2.5495524875892306, + "grad_norm": 0.8471434712409973, + "learning_rate": 2.4510798666473404e-05, + "loss": 0.5173, + "step": 17590 + }, + { + "epoch": 2.5510019204986047, + "grad_norm": 0.8759787678718567, + "learning_rate": 2.449630381214669e-05, + "loss": 0.5195, + "step": 17600 + }, + { + "epoch": 2.552451353407979, + "grad_norm": 0.8988997936248779, + "learning_rate": 2.4481808957819973e-05, + "loss": 0.5688, + "step": 17610 + }, + { + "epoch": 2.553900786317353, + "grad_norm": 2.0469727516174316, + "learning_rate": 2.446731410349326e-05, + "loss": 0.5593, + "step": 17620 + }, + { + "epoch": 2.5553502192267277, + "grad_norm": 1.4637449979782104, + "learning_rate": 2.445281924916655e-05, + "loss": 0.5109, + "step": 17630 + }, + { + "epoch": 2.556799652136102, + "grad_norm": 1.2323262691497803, + "learning_rate": 2.4438324394839833e-05, + "loss": 0.6227, + "step": 17640 + }, + { + "epoch": 2.558249085045476, + "grad_norm": 4.2845940589904785, + "learning_rate": 2.4423829540513117e-05, + "loss": 0.5726, + "step": 17650 + }, + { + "epoch": 2.55969851795485, + "grad_norm": 0.966750979423523, + "learning_rate": 2.4409334686186405e-05, + "loss": 0.629, + "step": 17660 + }, + { + "epoch": 2.5611479508642243, + "grad_norm": 2.271350145339966, + "learning_rate": 2.4394839831859693e-05, + "loss": 0.6151, + "step": 17670 + }, + { + "epoch": 2.5625973837735985, + "grad_norm": 0.9974566698074341, + "learning_rate": 2.4380344977532977e-05, + "loss": 0.4867, + "step": 17680 + }, + { + "epoch": 2.5640468166829726, + "grad_norm": 1.0083458423614502, + "learning_rate": 2.436585012320626e-05, + "loss": 0.531, + "step": 17690 + }, + { + "epoch": 2.5654962495923472, + "grad_norm": 1.5749166011810303, + "learning_rate": 2.435135526887955e-05, + "loss": 0.5425, + "step": 17700 + }, + { + "epoch": 2.5669456825017214, + "grad_norm": 1.1030534505844116, + "learning_rate": 2.4336860414552833e-05, + "loss": 0.5484, + "step": 17710 + }, + { + "epoch": 2.5683951154110956, + "grad_norm": 1.7335549592971802, + "learning_rate": 2.432236556022612e-05, + "loss": 0.6168, + "step": 17720 + }, + { + "epoch": 2.5698445483204697, + "grad_norm": 1.9986166954040527, + "learning_rate": 2.430787070589941e-05, + "loss": 0.5892, + "step": 17730 + }, + { + "epoch": 2.571293981229844, + "grad_norm": 0.8699630498886108, + "learning_rate": 2.4293375851572693e-05, + "loss": 0.472, + "step": 17740 + }, + { + "epoch": 2.572743414139218, + "grad_norm": 0.9094012379646301, + "learning_rate": 2.4278880997245978e-05, + "loss": 0.5479, + "step": 17750 + }, + { + "epoch": 2.574192847048592, + "grad_norm": 1.0403581857681274, + "learning_rate": 2.4264386142919265e-05, + "loss": 0.5751, + "step": 17760 + }, + { + "epoch": 2.5756422799579664, + "grad_norm": 2.722165822982788, + "learning_rate": 2.424989128859255e-05, + "loss": 0.5864, + "step": 17770 + }, + { + "epoch": 2.5770917128673405, + "grad_norm": 1.9678312540054321, + "learning_rate": 2.4235396434265838e-05, + "loss": 0.6081, + "step": 17780 + }, + { + "epoch": 2.5785411457767147, + "grad_norm": 0.9112073183059692, + "learning_rate": 2.4220901579939122e-05, + "loss": 0.5357, + "step": 17790 + }, + { + "epoch": 2.579990578686089, + "grad_norm": 2.1713218688964844, + "learning_rate": 2.420640672561241e-05, + "loss": 0.5997, + "step": 17800 + }, + { + "epoch": 2.5814400115954634, + "grad_norm": 1.1149449348449707, + "learning_rate": 2.4191911871285694e-05, + "loss": 0.5664, + "step": 17810 + }, + { + "epoch": 2.5828894445048376, + "grad_norm": 2.689030885696411, + "learning_rate": 2.4177417016958982e-05, + "loss": 0.5555, + "step": 17820 + }, + { + "epoch": 2.5843388774142118, + "grad_norm": 0.8640225529670715, + "learning_rate": 2.4162922162632266e-05, + "loss": 0.5226, + "step": 17830 + }, + { + "epoch": 2.585788310323586, + "grad_norm": 2.8747336864471436, + "learning_rate": 2.414842730830555e-05, + "loss": 0.5797, + "step": 17840 + }, + { + "epoch": 2.58723774323296, + "grad_norm": 1.1739059686660767, + "learning_rate": 2.4133932453978838e-05, + "loss": 0.5475, + "step": 17850 + }, + { + "epoch": 2.5886871761423342, + "grad_norm": 1.213647723197937, + "learning_rate": 2.4119437599652126e-05, + "loss": 0.5722, + "step": 17860 + }, + { + "epoch": 2.5901366090517084, + "grad_norm": 1.078586459159851, + "learning_rate": 2.410494274532541e-05, + "loss": 0.5696, + "step": 17870 + }, + { + "epoch": 2.591586041961083, + "grad_norm": 0.7184905409812927, + "learning_rate": 2.4090447890998695e-05, + "loss": 0.5729, + "step": 17880 + }, + { + "epoch": 2.593035474870457, + "grad_norm": 1.021785020828247, + "learning_rate": 2.4075953036671982e-05, + "loss": 0.6097, + "step": 17890 + }, + { + "epoch": 2.5944849077798313, + "grad_norm": 2.1129040718078613, + "learning_rate": 2.4061458182345267e-05, + "loss": 0.5692, + "step": 17900 + }, + { + "epoch": 2.5959343406892055, + "grad_norm": 2.2645630836486816, + "learning_rate": 2.4046963328018555e-05, + "loss": 0.5428, + "step": 17910 + }, + { + "epoch": 2.5973837735985796, + "grad_norm": 2.277709484100342, + "learning_rate": 2.4032468473691842e-05, + "loss": 0.5822, + "step": 17920 + }, + { + "epoch": 2.598833206507954, + "grad_norm": 2.4047906398773193, + "learning_rate": 2.4017973619365127e-05, + "loss": 0.5544, + "step": 17930 + }, + { + "epoch": 2.600282639417328, + "grad_norm": 1.217633605003357, + "learning_rate": 2.400347876503841e-05, + "loss": 0.5927, + "step": 17940 + }, + { + "epoch": 2.601732072326702, + "grad_norm": 1.1137984991073608, + "learning_rate": 2.39889839107117e-05, + "loss": 0.5649, + "step": 17950 + }, + { + "epoch": 2.6031815052360763, + "grad_norm": 2.8685598373413086, + "learning_rate": 2.3974489056384987e-05, + "loss": 0.63, + "step": 17960 + }, + { + "epoch": 2.6046309381454504, + "grad_norm": 2.009880781173706, + "learning_rate": 2.395999420205827e-05, + "loss": 0.6152, + "step": 17970 + }, + { + "epoch": 2.6060803710548246, + "grad_norm": 2.669762134552002, + "learning_rate": 2.3945499347731555e-05, + "loss": 0.63, + "step": 17980 + }, + { + "epoch": 2.6075298039641988, + "grad_norm": 4.854875564575195, + "learning_rate": 2.3931004493404843e-05, + "loss": 0.6127, + "step": 17990 + }, + { + "epoch": 2.6089792368735734, + "grad_norm": 0.7523910403251648, + "learning_rate": 2.3916509639078127e-05, + "loss": 0.567, + "step": 18000 + }, + { + "epoch": 2.6089792368735734, + "eval_loss": 0.7176042795181274, + "eval_runtime": 670.7992, + "eval_samples_per_second": 51.424, + "eval_steps_per_second": 2.572, + "eval_token_accuracy": 0.00039052371997326166, + "step": 18000 + }, + { + "epoch": 2.6104286697829475, + "grad_norm": 1.8868845701217651, + "learning_rate": 2.3902014784751415e-05, + "loss": 0.5227, + "step": 18010 + }, + { + "epoch": 2.6118781026923217, + "grad_norm": 3.6070969104766846, + "learning_rate": 2.38875199304247e-05, + "loss": 0.5724, + "step": 18020 + }, + { + "epoch": 2.613327535601696, + "grad_norm": 1.313196063041687, + "learning_rate": 2.3873025076097984e-05, + "loss": 0.6014, + "step": 18030 + }, + { + "epoch": 2.61477696851107, + "grad_norm": 2.521747350692749, + "learning_rate": 2.385853022177127e-05, + "loss": 0.5432, + "step": 18040 + }, + { + "epoch": 2.616226401420444, + "grad_norm": 4.846094131469727, + "learning_rate": 2.384403536744456e-05, + "loss": 0.528, + "step": 18050 + }, + { + "epoch": 2.6176758343298183, + "grad_norm": 1.277256727218628, + "learning_rate": 2.3829540513117847e-05, + "loss": 0.5767, + "step": 18060 + }, + { + "epoch": 2.619125267239193, + "grad_norm": 1.5727423429489136, + "learning_rate": 2.381504565879113e-05, + "loss": 0.4942, + "step": 18070 + }, + { + "epoch": 2.620574700148567, + "grad_norm": 0.8757898211479187, + "learning_rate": 2.3800550804464416e-05, + "loss": 0.5481, + "step": 18080 + }, + { + "epoch": 2.6220241330579412, + "grad_norm": 2.6296439170837402, + "learning_rate": 2.3786055950137703e-05, + "loss": 0.5735, + "step": 18090 + }, + { + "epoch": 2.6234735659673154, + "grad_norm": 1.3403879404067993, + "learning_rate": 2.3771561095810988e-05, + "loss": 0.5564, + "step": 18100 + }, + { + "epoch": 2.6249229988766896, + "grad_norm": 1.4756869077682495, + "learning_rate": 2.3757066241484276e-05, + "loss": 0.5626, + "step": 18110 + }, + { + "epoch": 2.6263724317860637, + "grad_norm": 1.0589386224746704, + "learning_rate": 2.374257138715756e-05, + "loss": 0.5912, + "step": 18120 + }, + { + "epoch": 2.627821864695438, + "grad_norm": 1.1331684589385986, + "learning_rate": 2.3728076532830844e-05, + "loss": 0.5754, + "step": 18130 + }, + { + "epoch": 2.629271297604812, + "grad_norm": 3.5398993492126465, + "learning_rate": 2.3713581678504132e-05, + "loss": 0.5491, + "step": 18140 + }, + { + "epoch": 2.630720730514186, + "grad_norm": 1.6487380266189575, + "learning_rate": 2.369908682417742e-05, + "loss": 0.5244, + "step": 18150 + }, + { + "epoch": 2.6321701634235604, + "grad_norm": 2.910534620285034, + "learning_rate": 2.3684591969850704e-05, + "loss": 0.5973, + "step": 18160 + }, + { + "epoch": 2.6336195963329345, + "grad_norm": 1.6422079801559448, + "learning_rate": 2.367009711552399e-05, + "loss": 0.6234, + "step": 18170 + }, + { + "epoch": 2.6350690292423087, + "grad_norm": 1.9400933980941772, + "learning_rate": 2.3655602261197276e-05, + "loss": 0.5992, + "step": 18180 + }, + { + "epoch": 2.6365184621516833, + "grad_norm": 0.6542257070541382, + "learning_rate": 2.3641107406870564e-05, + "loss": 0.6516, + "step": 18190 + }, + { + "epoch": 2.6379678950610574, + "grad_norm": 2.389939308166504, + "learning_rate": 2.362661255254385e-05, + "loss": 0.4967, + "step": 18200 + }, + { + "epoch": 2.6394173279704316, + "grad_norm": 4.163430213928223, + "learning_rate": 2.3612117698217133e-05, + "loss": 0.5762, + "step": 18210 + }, + { + "epoch": 2.6408667608798058, + "grad_norm": 0.9543486833572388, + "learning_rate": 2.359762284389042e-05, + "loss": 0.6407, + "step": 18220 + }, + { + "epoch": 2.64231619378918, + "grad_norm": 2.0527260303497314, + "learning_rate": 2.3583127989563705e-05, + "loss": 0.5274, + "step": 18230 + }, + { + "epoch": 2.643765626698554, + "grad_norm": 2.0211446285247803, + "learning_rate": 2.3568633135236993e-05, + "loss": 0.5337, + "step": 18240 + }, + { + "epoch": 2.6452150596079282, + "grad_norm": 0.931024968624115, + "learning_rate": 2.355413828091028e-05, + "loss": 0.65, + "step": 18250 + }, + { + "epoch": 2.646664492517303, + "grad_norm": 1.0443034172058105, + "learning_rate": 2.3539643426583565e-05, + "loss": 0.629, + "step": 18260 + }, + { + "epoch": 2.648113925426677, + "grad_norm": 1.0292446613311768, + "learning_rate": 2.352514857225685e-05, + "loss": 0.556, + "step": 18270 + }, + { + "epoch": 2.649563358336051, + "grad_norm": 3.1219959259033203, + "learning_rate": 2.3510653717930137e-05, + "loss": 0.568, + "step": 18280 + }, + { + "epoch": 2.6510127912454253, + "grad_norm": 1.1402798891067505, + "learning_rate": 2.349615886360342e-05, + "loss": 0.6201, + "step": 18290 + }, + { + "epoch": 2.6524622241547995, + "grad_norm": 1.141597867012024, + "learning_rate": 2.348166400927671e-05, + "loss": 0.638, + "step": 18300 + }, + { + "epoch": 2.6539116570641736, + "grad_norm": 3.5763368606567383, + "learning_rate": 2.3468618640382665e-05, + "loss": 0.5554, + "step": 18310 + }, + { + "epoch": 2.655361089973548, + "grad_norm": 0.9100473523139954, + "learning_rate": 2.3454123786055953e-05, + "loss": 0.549, + "step": 18320 + }, + { + "epoch": 2.656810522882922, + "grad_norm": 2.972250461578369, + "learning_rate": 2.3439628931729237e-05, + "loss": 0.5549, + "step": 18330 + }, + { + "epoch": 2.658259955792296, + "grad_norm": 0.8812468647956848, + "learning_rate": 2.342513407740252e-05, + "loss": 0.5164, + "step": 18340 + }, + { + "epoch": 2.6597093887016703, + "grad_norm": 1.1236684322357178, + "learning_rate": 2.341063922307581e-05, + "loss": 0.5589, + "step": 18350 + }, + { + "epoch": 2.6611588216110444, + "grad_norm": 2.689864158630371, + "learning_rate": 2.3396144368749097e-05, + "loss": 0.535, + "step": 18360 + }, + { + "epoch": 2.662608254520419, + "grad_norm": 1.1456810235977173, + "learning_rate": 2.338164951442238e-05, + "loss": 0.5154, + "step": 18370 + }, + { + "epoch": 2.664057687429793, + "grad_norm": 2.042848587036133, + "learning_rate": 2.3367154660095666e-05, + "loss": 0.5722, + "step": 18380 + }, + { + "epoch": 2.6655071203391674, + "grad_norm": 1.8200641870498657, + "learning_rate": 2.3352659805768953e-05, + "loss": 0.5991, + "step": 18390 + }, + { + "epoch": 2.6669565532485415, + "grad_norm": 3.0939741134643555, + "learning_rate": 2.333816495144224e-05, + "loss": 0.5762, + "step": 18400 + }, + { + "epoch": 2.6684059861579157, + "grad_norm": 1.2191381454467773, + "learning_rate": 2.3323670097115525e-05, + "loss": 0.5775, + "step": 18410 + }, + { + "epoch": 2.66985541906729, + "grad_norm": 3.6405093669891357, + "learning_rate": 2.330917524278881e-05, + "loss": 0.5567, + "step": 18420 + }, + { + "epoch": 2.671304851976664, + "grad_norm": 0.6857460737228394, + "learning_rate": 2.3294680388462097e-05, + "loss": 0.531, + "step": 18430 + }, + { + "epoch": 2.6727542848860386, + "grad_norm": 1.7805752754211426, + "learning_rate": 2.3280185534135382e-05, + "loss": 0.5528, + "step": 18440 + }, + { + "epoch": 2.6742037177954128, + "grad_norm": 0.9542523622512817, + "learning_rate": 2.326569067980867e-05, + "loss": 0.4993, + "step": 18450 + }, + { + "epoch": 2.675653150704787, + "grad_norm": 1.0148526430130005, + "learning_rate": 2.3251195825481954e-05, + "loss": 0.516, + "step": 18460 + }, + { + "epoch": 2.677102583614161, + "grad_norm": 1.0944411754608154, + "learning_rate": 2.323670097115524e-05, + "loss": 0.6308, + "step": 18470 + }, + { + "epoch": 2.6785520165235353, + "grad_norm": 1.1389912366867065, + "learning_rate": 2.3222206116828526e-05, + "loss": 0.5836, + "step": 18480 + }, + { + "epoch": 2.6800014494329094, + "grad_norm": 1.4398833513259888, + "learning_rate": 2.3207711262501814e-05, + "loss": 0.5429, + "step": 18490 + }, + { + "epoch": 2.6814508823422836, + "grad_norm": 1.2716155052185059, + "learning_rate": 2.31932164081751e-05, + "loss": 0.5753, + "step": 18500 + }, + { + "epoch": 2.6829003152516577, + "grad_norm": 1.1771233081817627, + "learning_rate": 2.3178721553848386e-05, + "loss": 0.5961, + "step": 18510 + }, + { + "epoch": 2.684349748161032, + "grad_norm": 3.2915737628936768, + "learning_rate": 2.316422669952167e-05, + "loss": 0.6194, + "step": 18520 + }, + { + "epoch": 2.685799181070406, + "grad_norm": 3.6810009479522705, + "learning_rate": 2.3149731845194958e-05, + "loss": 0.5164, + "step": 18530 + }, + { + "epoch": 2.68724861397978, + "grad_norm": 1.319347858428955, + "learning_rate": 2.3135236990868242e-05, + "loss": 0.4943, + "step": 18540 + }, + { + "epoch": 2.6886980468891544, + "grad_norm": 0.9938089847564697, + "learning_rate": 2.312074213654153e-05, + "loss": 0.5421, + "step": 18550 + }, + { + "epoch": 2.690147479798529, + "grad_norm": 0.9541263580322266, + "learning_rate": 2.3106247282214814e-05, + "loss": 0.5477, + "step": 18560 + }, + { + "epoch": 2.691596912707903, + "grad_norm": 2.2706756591796875, + "learning_rate": 2.30917524278881e-05, + "loss": 0.5099, + "step": 18570 + }, + { + "epoch": 2.6930463456172773, + "grad_norm": 1.2195298671722412, + "learning_rate": 2.3077257573561387e-05, + "loss": 0.6338, + "step": 18580 + }, + { + "epoch": 2.6944957785266515, + "grad_norm": 0.7414268851280212, + "learning_rate": 2.3062762719234674e-05, + "loss": 0.5689, + "step": 18590 + }, + { + "epoch": 2.6959452114360256, + "grad_norm": 3.0787389278411865, + "learning_rate": 2.304826786490796e-05, + "loss": 0.5427, + "step": 18600 + }, + { + "epoch": 2.6973946443453998, + "grad_norm": 0.7223326563835144, + "learning_rate": 2.3033773010581243e-05, + "loss": 0.5665, + "step": 18610 + }, + { + "epoch": 2.698844077254774, + "grad_norm": 0.889962375164032, + "learning_rate": 2.301927815625453e-05, + "loss": 0.5911, + "step": 18620 + }, + { + "epoch": 2.7002935101641485, + "grad_norm": 1.0519202947616577, + "learning_rate": 2.300478330192782e-05, + "loss": 0.5512, + "step": 18630 + }, + { + "epoch": 2.7017429430735227, + "grad_norm": 0.8857706785202026, + "learning_rate": 2.2990288447601103e-05, + "loss": 0.5119, + "step": 18640 + }, + { + "epoch": 2.703192375982897, + "grad_norm": 1.0033340454101562, + "learning_rate": 2.2975793593274387e-05, + "loss": 0.5544, + "step": 18650 + }, + { + "epoch": 2.704641808892271, + "grad_norm": 0.8942063450813293, + "learning_rate": 2.2961298738947675e-05, + "loss": 0.5107, + "step": 18660 + }, + { + "epoch": 2.706091241801645, + "grad_norm": 1.006726861000061, + "learning_rate": 2.294680388462096e-05, + "loss": 0.5571, + "step": 18670 + }, + { + "epoch": 2.7075406747110193, + "grad_norm": 1.1389527320861816, + "learning_rate": 2.2932309030294247e-05, + "loss": 0.5514, + "step": 18680 + }, + { + "epoch": 2.7089901076203935, + "grad_norm": 2.5659804344177246, + "learning_rate": 2.2917814175967535e-05, + "loss": 0.6332, + "step": 18690 + }, + { + "epoch": 2.7104395405297677, + "grad_norm": 1.050179123878479, + "learning_rate": 2.290331932164082e-05, + "loss": 0.5082, + "step": 18700 + }, + { + "epoch": 2.711888973439142, + "grad_norm": 2.12483286857605, + "learning_rate": 2.2888824467314104e-05, + "loss": 0.5867, + "step": 18710 + }, + { + "epoch": 2.713338406348516, + "grad_norm": 1.6364705562591553, + "learning_rate": 2.287432961298739e-05, + "loss": 0.6449, + "step": 18720 + }, + { + "epoch": 2.71478783925789, + "grad_norm": 2.376473903656006, + "learning_rate": 2.2859834758660676e-05, + "loss": 0.562, + "step": 18730 + }, + { + "epoch": 2.7162372721672643, + "grad_norm": 3.524937152862549, + "learning_rate": 2.2845339904333963e-05, + "loss": 0.5788, + "step": 18740 + }, + { + "epoch": 2.717686705076639, + "grad_norm": 1.0858070850372314, + "learning_rate": 2.2830845050007248e-05, + "loss": 0.5169, + "step": 18750 + }, + { + "epoch": 2.719136137986013, + "grad_norm": 0.9738995432853699, + "learning_rate": 2.2816350195680536e-05, + "loss": 0.5891, + "step": 18760 + }, + { + "epoch": 2.720585570895387, + "grad_norm": 2.339165687561035, + "learning_rate": 2.280185534135382e-05, + "loss": 0.57, + "step": 18770 + }, + { + "epoch": 2.7220350038047614, + "grad_norm": 1.0681856870651245, + "learning_rate": 2.2787360487027108e-05, + "loss": 0.6183, + "step": 18780 + }, + { + "epoch": 2.7234844367141355, + "grad_norm": 0.8750693798065186, + "learning_rate": 2.2772865632700392e-05, + "loss": 0.5597, + "step": 18790 + }, + { + "epoch": 2.7249338696235097, + "grad_norm": 1.375166654586792, + "learning_rate": 2.2758370778373676e-05, + "loss": 0.6153, + "step": 18800 + }, + { + "epoch": 2.726383302532884, + "grad_norm": 1.8082159757614136, + "learning_rate": 2.2743875924046964e-05, + "loss": 0.6993, + "step": 18810 + }, + { + "epoch": 2.7278327354422585, + "grad_norm": 2.435235023498535, + "learning_rate": 2.2729381069720252e-05, + "loss": 0.5326, + "step": 18820 + }, + { + "epoch": 2.7292821683516326, + "grad_norm": 1.028337001800537, + "learning_rate": 2.2714886215393536e-05, + "loss": 0.5088, + "step": 18830 + }, + { + "epoch": 2.730731601261007, + "grad_norm": 1.501955509185791, + "learning_rate": 2.2700391361066824e-05, + "loss": 0.4502, + "step": 18840 + }, + { + "epoch": 2.732181034170381, + "grad_norm": 2.1301772594451904, + "learning_rate": 2.268589650674011e-05, + "loss": 0.5966, + "step": 18850 + }, + { + "epoch": 2.733630467079755, + "grad_norm": 2.629920244216919, + "learning_rate": 2.2671401652413393e-05, + "loss": 0.5694, + "step": 18860 + }, + { + "epoch": 2.7350798999891293, + "grad_norm": 2.9983248710632324, + "learning_rate": 2.265690679808668e-05, + "loss": 0.5938, + "step": 18870 + }, + { + "epoch": 2.7365293328985034, + "grad_norm": 4.4188995361328125, + "learning_rate": 2.2642411943759968e-05, + "loss": 0.5527, + "step": 18880 + }, + { + "epoch": 2.7379787658078776, + "grad_norm": 0.8108800053596497, + "learning_rate": 2.2627917089433253e-05, + "loss": 0.5222, + "step": 18890 + }, + { + "epoch": 2.7394281987172517, + "grad_norm": 1.078866958618164, + "learning_rate": 2.2613422235106537e-05, + "loss": 0.5275, + "step": 18900 + }, + { + "epoch": 2.740877631626626, + "grad_norm": 2.8870749473571777, + "learning_rate": 2.2598927380779825e-05, + "loss": 0.6341, + "step": 18910 + }, + { + "epoch": 2.742327064536, + "grad_norm": 2.361414670944214, + "learning_rate": 2.2584432526453112e-05, + "loss": 0.5177, + "step": 18920 + }, + { + "epoch": 2.7437764974453747, + "grad_norm": 0.6874875426292419, + "learning_rate": 2.2569937672126397e-05, + "loss": 0.5358, + "step": 18930 + }, + { + "epoch": 2.745225930354749, + "grad_norm": 1.24635648727417, + "learning_rate": 2.255544281779968e-05, + "loss": 0.5636, + "step": 18940 + }, + { + "epoch": 2.746675363264123, + "grad_norm": 1.061892032623291, + "learning_rate": 2.254094796347297e-05, + "loss": 0.5179, + "step": 18950 + }, + { + "epoch": 2.748124796173497, + "grad_norm": 2.9943783283233643, + "learning_rate": 2.2526453109146253e-05, + "loss": 0.5643, + "step": 18960 + }, + { + "epoch": 2.7495742290828713, + "grad_norm": 1.0257405042648315, + "learning_rate": 2.251195825481954e-05, + "loss": 0.5869, + "step": 18970 + }, + { + "epoch": 2.7510236619922455, + "grad_norm": 0.9652470946311951, + "learning_rate": 2.2497463400492825e-05, + "loss": 0.5397, + "step": 18980 + }, + { + "epoch": 2.7524730949016196, + "grad_norm": 2.92497181892395, + "learning_rate": 2.248296854616611e-05, + "loss": 0.5578, + "step": 18990 + }, + { + "epoch": 2.7539225278109942, + "grad_norm": 1.0754202604293823, + "learning_rate": 2.2468473691839397e-05, + "loss": 0.6103, + "step": 19000 + }, + { + "epoch": 2.7539225278109942, + "eval_loss": 0.7155176401138306, + "eval_runtime": 669.6957, + "eval_samples_per_second": 51.508, + "eval_steps_per_second": 2.576, + "eval_token_accuracy": 0.0004030693748982676, + "step": 19000 + }, + { + "epoch": 2.7553719607203684, + "grad_norm": 2.33913516998291, + "learning_rate": 2.2453978837512685e-05, + "loss": 0.5592, + "step": 19010 + }, + { + "epoch": 2.7568213936297425, + "grad_norm": 1.0931929349899292, + "learning_rate": 2.243948398318597e-05, + "loss": 0.5391, + "step": 19020 + }, + { + "epoch": 2.7582708265391167, + "grad_norm": 2.506253480911255, + "learning_rate": 2.2424989128859257e-05, + "loss": 0.5415, + "step": 19030 + }, + { + "epoch": 2.759720259448491, + "grad_norm": 1.6211233139038086, + "learning_rate": 2.241049427453254e-05, + "loss": 0.5904, + "step": 19040 + }, + { + "epoch": 2.761169692357865, + "grad_norm": 0.7718712091445923, + "learning_rate": 2.239599942020583e-05, + "loss": 0.5529, + "step": 19050 + }, + { + "epoch": 2.762619125267239, + "grad_norm": 2.422839641571045, + "learning_rate": 2.2381504565879114e-05, + "loss": 0.5402, + "step": 19060 + }, + { + "epoch": 2.7640685581766133, + "grad_norm": 2.453468084335327, + "learning_rate": 2.23670097115524e-05, + "loss": 0.5041, + "step": 19070 + }, + { + "epoch": 2.7655179910859875, + "grad_norm": 1.3519423007965088, + "learning_rate": 2.2352514857225686e-05, + "loss": 0.5736, + "step": 19080 + }, + { + "epoch": 2.7669674239953617, + "grad_norm": 3.2335236072540283, + "learning_rate": 2.233802000289897e-05, + "loss": 0.5825, + "step": 19090 + }, + { + "epoch": 2.768416856904736, + "grad_norm": 2.3906261920928955, + "learning_rate": 2.2323525148572258e-05, + "loss": 0.6199, + "step": 19100 + }, + { + "epoch": 2.76986628981411, + "grad_norm": 3.0553524494171143, + "learning_rate": 2.2309030294245546e-05, + "loss": 0.5798, + "step": 19110 + }, + { + "epoch": 2.7713157227234846, + "grad_norm": 2.046018600463867, + "learning_rate": 2.229453543991883e-05, + "loss": 0.5871, + "step": 19120 + }, + { + "epoch": 2.7727651556328587, + "grad_norm": 2.1828296184539795, + "learning_rate": 2.2280040585592114e-05, + "loss": 0.5908, + "step": 19130 + }, + { + "epoch": 2.774214588542233, + "grad_norm": 2.1796507835388184, + "learning_rate": 2.2265545731265402e-05, + "loss": 0.5838, + "step": 19140 + }, + { + "epoch": 2.775664021451607, + "grad_norm": 1.3011929988861084, + "learning_rate": 2.2251050876938687e-05, + "loss": 0.6146, + "step": 19150 + }, + { + "epoch": 2.7771134543609812, + "grad_norm": 0.858228862285614, + "learning_rate": 2.2236556022611974e-05, + "loss": 0.5222, + "step": 19160 + }, + { + "epoch": 2.7785628872703554, + "grad_norm": 3.342522144317627, + "learning_rate": 2.222206116828526e-05, + "loss": 0.6894, + "step": 19170 + }, + { + "epoch": 2.7800123201797295, + "grad_norm": 0.9294348359107971, + "learning_rate": 2.2207566313958546e-05, + "loss": 0.5157, + "step": 19180 + }, + { + "epoch": 2.781461753089104, + "grad_norm": 0.7711198329925537, + "learning_rate": 2.219307145963183e-05, + "loss": 0.5299, + "step": 19190 + }, + { + "epoch": 2.7829111859984783, + "grad_norm": 1.0399785041809082, + "learning_rate": 2.217857660530512e-05, + "loss": 0.524, + "step": 19200 + }, + { + "epoch": 2.7843606189078525, + "grad_norm": 1.0538372993469238, + "learning_rate": 2.2164081750978406e-05, + "loss": 0.5483, + "step": 19210 + }, + { + "epoch": 2.7858100518172266, + "grad_norm": 0.8236547708511353, + "learning_rate": 2.214958689665169e-05, + "loss": 0.5912, + "step": 19220 + }, + { + "epoch": 2.787259484726601, + "grad_norm": 1.0914356708526611, + "learning_rate": 2.2135092042324975e-05, + "loss": 0.5343, + "step": 19230 + }, + { + "epoch": 2.788708917635975, + "grad_norm": 2.484463691711426, + "learning_rate": 2.2120597187998263e-05, + "loss": 0.5842, + "step": 19240 + }, + { + "epoch": 2.790158350545349, + "grad_norm": 1.2179654836654663, + "learning_rate": 2.2106102333671547e-05, + "loss": 0.4972, + "step": 19250 + }, + { + "epoch": 2.7916077834547233, + "grad_norm": 3.0674054622650146, + "learning_rate": 2.2091607479344835e-05, + "loss": 0.5962, + "step": 19260 + }, + { + "epoch": 2.7930572163640974, + "grad_norm": 1.1092416048049927, + "learning_rate": 2.207711262501812e-05, + "loss": 0.5121, + "step": 19270 + }, + { + "epoch": 2.7945066492734716, + "grad_norm": 3.062148094177246, + "learning_rate": 2.2062617770691404e-05, + "loss": 0.5796, + "step": 19280 + }, + { + "epoch": 2.7959560821828457, + "grad_norm": 0.682876467704773, + "learning_rate": 2.204812291636469e-05, + "loss": 0.5505, + "step": 19290 + }, + { + "epoch": 2.79740551509222, + "grad_norm": 0.7456188201904297, + "learning_rate": 2.203362806203798e-05, + "loss": 0.6132, + "step": 19300 + }, + { + "epoch": 2.7988549480015945, + "grad_norm": 1.0670653581619263, + "learning_rate": 2.2019133207711263e-05, + "loss": 0.6285, + "step": 19310 + }, + { + "epoch": 2.8003043809109687, + "grad_norm": 2.5444772243499756, + "learning_rate": 2.2004638353384548e-05, + "loss": 0.532, + "step": 19320 + }, + { + "epoch": 2.801753813820343, + "grad_norm": 3.2199554443359375, + "learning_rate": 2.1990143499057836e-05, + "loss": 0.5256, + "step": 19330 + }, + { + "epoch": 2.803203246729717, + "grad_norm": 3.2121224403381348, + "learning_rate": 2.1975648644731123e-05, + "loss": 0.5406, + "step": 19340 + }, + { + "epoch": 2.804652679639091, + "grad_norm": 2.8595330715179443, + "learning_rate": 2.1961153790404408e-05, + "loss": 0.6079, + "step": 19350 + }, + { + "epoch": 2.8061021125484653, + "grad_norm": 1.9917750358581543, + "learning_rate": 2.1946658936077692e-05, + "loss": 0.5897, + "step": 19360 + }, + { + "epoch": 2.8075515454578395, + "grad_norm": 0.9936667084693909, + "learning_rate": 2.193216408175098e-05, + "loss": 0.5673, + "step": 19370 + }, + { + "epoch": 2.809000978367214, + "grad_norm": 0.9017758369445801, + "learning_rate": 2.1917669227424264e-05, + "loss": 0.5452, + "step": 19380 + }, + { + "epoch": 2.8104504112765882, + "grad_norm": 0.9994916319847107, + "learning_rate": 2.1903174373097552e-05, + "loss": 0.5504, + "step": 19390 + }, + { + "epoch": 2.8118998441859624, + "grad_norm": 2.0900986194610596, + "learning_rate": 2.188867951877084e-05, + "loss": 0.5033, + "step": 19400 + }, + { + "epoch": 2.8133492770953366, + "grad_norm": 0.9001177549362183, + "learning_rate": 2.1874184664444124e-05, + "loss": 0.5446, + "step": 19410 + }, + { + "epoch": 2.8147987100047107, + "grad_norm": 1.0415499210357666, + "learning_rate": 2.1859689810117408e-05, + "loss": 0.6218, + "step": 19420 + }, + { + "epoch": 2.816248142914085, + "grad_norm": 0.9950922727584839, + "learning_rate": 2.1845194955790696e-05, + "loss": 0.5994, + "step": 19430 + }, + { + "epoch": 2.817697575823459, + "grad_norm": 0.9685205817222595, + "learning_rate": 2.183070010146398e-05, + "loss": 0.609, + "step": 19440 + }, + { + "epoch": 2.819147008732833, + "grad_norm": 1.086944341659546, + "learning_rate": 2.1816205247137268e-05, + "loss": 0.5858, + "step": 19450 + }, + { + "epoch": 2.8205964416422074, + "grad_norm": 1.3267871141433716, + "learning_rate": 2.1801710392810553e-05, + "loss": 0.5554, + "step": 19460 + }, + { + "epoch": 2.8220458745515815, + "grad_norm": 0.9111933708190918, + "learning_rate": 2.178721553848384e-05, + "loss": 0.5218, + "step": 19470 + }, + { + "epoch": 2.8234953074609557, + "grad_norm": 2.4886646270751953, + "learning_rate": 2.1772720684157125e-05, + "loss": 0.51, + "step": 19480 + }, + { + "epoch": 2.82494474037033, + "grad_norm": 2.937021017074585, + "learning_rate": 2.1758225829830412e-05, + "loss": 0.5775, + "step": 19490 + }, + { + "epoch": 2.8263941732797044, + "grad_norm": 1.8829514980316162, + "learning_rate": 2.1743730975503697e-05, + "loss": 0.5318, + "step": 19500 + }, + { + "epoch": 2.8278436061890786, + "grad_norm": 2.7370100021362305, + "learning_rate": 2.172923612117698e-05, + "loss": 0.5865, + "step": 19510 + }, + { + "epoch": 2.8292930390984528, + "grad_norm": 2.0809028148651123, + "learning_rate": 2.171474126685027e-05, + "loss": 0.5616, + "step": 19520 + }, + { + "epoch": 2.830742472007827, + "grad_norm": 1.7841269969940186, + "learning_rate": 2.1700246412523557e-05, + "loss": 0.5817, + "step": 19530 + }, + { + "epoch": 2.832191904917201, + "grad_norm": 3.6458749771118164, + "learning_rate": 2.168575155819684e-05, + "loss": 0.5487, + "step": 19540 + }, + { + "epoch": 2.8336413378265752, + "grad_norm": 0.8706045746803284, + "learning_rate": 2.1671256703870125e-05, + "loss": 0.5208, + "step": 19550 + }, + { + "epoch": 2.8350907707359494, + "grad_norm": 0.9333102107048035, + "learning_rate": 2.1656761849543413e-05, + "loss": 0.5952, + "step": 19560 + }, + { + "epoch": 2.836540203645324, + "grad_norm": 2.157334566116333, + "learning_rate": 2.1642266995216697e-05, + "loss": 0.5428, + "step": 19570 + }, + { + "epoch": 2.837989636554698, + "grad_norm": 1.2280299663543701, + "learning_rate": 2.1627772140889985e-05, + "loss": 0.5521, + "step": 19580 + }, + { + "epoch": 2.8394390694640723, + "grad_norm": 1.0699831247329712, + "learning_rate": 2.1613277286563273e-05, + "loss": 0.5647, + "step": 19590 + }, + { + "epoch": 2.8408885023734465, + "grad_norm": 2.3743484020233154, + "learning_rate": 2.1598782432236557e-05, + "loss": 0.5353, + "step": 19600 + }, + { + "epoch": 2.8423379352828206, + "grad_norm": 1.8679091930389404, + "learning_rate": 2.158428757790984e-05, + "loss": 0.5263, + "step": 19610 + }, + { + "epoch": 2.843787368192195, + "grad_norm": 1.9208581447601318, + "learning_rate": 2.156979272358313e-05, + "loss": 0.5266, + "step": 19620 + }, + { + "epoch": 2.845236801101569, + "grad_norm": 3.6413888931274414, + "learning_rate": 2.1555297869256417e-05, + "loss": 0.5512, + "step": 19630 + }, + { + "epoch": 2.846686234010943, + "grad_norm": 2.0212066173553467, + "learning_rate": 2.15408030149297e-05, + "loss": 0.5954, + "step": 19640 + }, + { + "epoch": 2.8481356669203173, + "grad_norm": 2.341249704360962, + "learning_rate": 2.1526308160602986e-05, + "loss": 0.5832, + "step": 19650 + }, + { + "epoch": 2.8495850998296914, + "grad_norm": 3.7906134128570557, + "learning_rate": 2.1511813306276274e-05, + "loss": 0.5869, + "step": 19660 + }, + { + "epoch": 2.8510345327390656, + "grad_norm": 2.7214527130126953, + "learning_rate": 2.1497318451949558e-05, + "loss": 0.6188, + "step": 19670 + }, + { + "epoch": 2.85248396564844, + "grad_norm": 1.1105526685714722, + "learning_rate": 2.1482823597622846e-05, + "loss": 0.5926, + "step": 19680 + }, + { + "epoch": 2.8539333985578144, + "grad_norm": 1.4884237051010132, + "learning_rate": 2.146832874329613e-05, + "loss": 0.5329, + "step": 19690 + }, + { + "epoch": 2.8553828314671885, + "grad_norm": 1.7720947265625, + "learning_rate": 2.1453833888969414e-05, + "loss": 0.4844, + "step": 19700 + }, + { + "epoch": 2.8568322643765627, + "grad_norm": 1.2303423881530762, + "learning_rate": 2.1439339034642702e-05, + "loss": 0.5637, + "step": 19710 + }, + { + "epoch": 2.858281697285937, + "grad_norm": 3.0960593223571777, + "learning_rate": 2.142484418031599e-05, + "loss": 0.5773, + "step": 19720 + }, + { + "epoch": 2.859731130195311, + "grad_norm": 3.4920639991760254, + "learning_rate": 2.1410349325989274e-05, + "loss": 0.5379, + "step": 19730 + }, + { + "epoch": 2.861180563104685, + "grad_norm": 1.1619203090667725, + "learning_rate": 2.139585447166256e-05, + "loss": 0.5732, + "step": 19740 + }, + { + "epoch": 2.8626299960140598, + "grad_norm": 1.4657340049743652, + "learning_rate": 2.1381359617335846e-05, + "loss": 0.5264, + "step": 19750 + }, + { + "epoch": 2.864079428923434, + "grad_norm": 1.1278634071350098, + "learning_rate": 2.1366864763009134e-05, + "loss": 0.5607, + "step": 19760 + }, + { + "epoch": 2.865528861832808, + "grad_norm": 1.0407209396362305, + "learning_rate": 2.135236990868242e-05, + "loss": 0.5141, + "step": 19770 + }, + { + "epoch": 2.8669782947421822, + "grad_norm": 2.0300920009613037, + "learning_rate": 2.1337875054355706e-05, + "loss": 0.5624, + "step": 19780 + }, + { + "epoch": 2.8684277276515564, + "grad_norm": 1.0135087966918945, + "learning_rate": 2.132338020002899e-05, + "loss": 0.5987, + "step": 19790 + }, + { + "epoch": 2.8698771605609306, + "grad_norm": 1.8810256719589233, + "learning_rate": 2.1308885345702275e-05, + "loss": 0.5018, + "step": 19800 + }, + { + "epoch": 2.8713265934703047, + "grad_norm": 1.8214755058288574, + "learning_rate": 2.1294390491375563e-05, + "loss": 0.6013, + "step": 19810 + }, + { + "epoch": 2.872776026379679, + "grad_norm": 1.2672983407974243, + "learning_rate": 2.127989563704885e-05, + "loss": 0.5856, + "step": 19820 + }, + { + "epoch": 2.874225459289053, + "grad_norm": 1.973645806312561, + "learning_rate": 2.1265400782722135e-05, + "loss": 0.5301, + "step": 19830 + }, + { + "epoch": 2.875674892198427, + "grad_norm": 3.644392251968384, + "learning_rate": 2.125090592839542e-05, + "loss": 0.5789, + "step": 19840 + }, + { + "epoch": 2.8771243251078014, + "grad_norm": 0.8183895349502563, + "learning_rate": 2.1236411074068707e-05, + "loss": 0.5546, + "step": 19850 + }, + { + "epoch": 2.8785737580171755, + "grad_norm": 0.9706283211708069, + "learning_rate": 2.122191621974199e-05, + "loss": 0.5987, + "step": 19860 + }, + { + "epoch": 2.88002319092655, + "grad_norm": 0.9787003993988037, + "learning_rate": 2.120742136541528e-05, + "loss": 0.5661, + "step": 19870 + }, + { + "epoch": 2.8814726238359243, + "grad_norm": 3.2533276081085205, + "learning_rate": 2.1192926511088563e-05, + "loss": 0.5659, + "step": 19880 + }, + { + "epoch": 2.8829220567452984, + "grad_norm": 1.3525904417037964, + "learning_rate": 2.117843165676185e-05, + "loss": 0.5648, + "step": 19890 + }, + { + "epoch": 2.8843714896546726, + "grad_norm": 1.7455861568450928, + "learning_rate": 2.1163936802435135e-05, + "loss": 0.6197, + "step": 19900 + }, + { + "epoch": 2.8858209225640468, + "grad_norm": 1.0630838871002197, + "learning_rate": 2.1149441948108423e-05, + "loss": 0.6389, + "step": 19910 + }, + { + "epoch": 2.887270355473421, + "grad_norm": 1.7146954536437988, + "learning_rate": 2.113494709378171e-05, + "loss": 0.5689, + "step": 19920 + }, + { + "epoch": 2.888719788382795, + "grad_norm": 1.6422052383422852, + "learning_rate": 2.1120452239454995e-05, + "loss": 0.5172, + "step": 19930 + }, + { + "epoch": 2.8901692212921697, + "grad_norm": 2.0748939514160156, + "learning_rate": 2.110595738512828e-05, + "loss": 0.6374, + "step": 19940 + }, + { + "epoch": 2.891618654201544, + "grad_norm": 1.256318211555481, + "learning_rate": 2.1091462530801567e-05, + "loss": 0.627, + "step": 19950 + }, + { + "epoch": 2.893068087110918, + "grad_norm": 2.529940128326416, + "learning_rate": 2.1076967676474852e-05, + "loss": 0.5437, + "step": 19960 + }, + { + "epoch": 2.894517520020292, + "grad_norm": 3.2431910037994385, + "learning_rate": 2.106247282214814e-05, + "loss": 0.5335, + "step": 19970 + }, + { + "epoch": 2.8959669529296663, + "grad_norm": 2.3461062908172607, + "learning_rate": 2.1047977967821424e-05, + "loss": 0.5081, + "step": 19980 + }, + { + "epoch": 2.8974163858390405, + "grad_norm": 1.2446269989013672, + "learning_rate": 2.1033483113494708e-05, + "loss": 0.5553, + "step": 19990 + }, + { + "epoch": 2.8988658187484146, + "grad_norm": 2.994905948638916, + "learning_rate": 2.1018988259167996e-05, + "loss": 0.5612, + "step": 20000 + }, + { + "epoch": 2.8988658187484146, + "eval_loss": 0.7098406553268433, + "eval_runtime": 669.4934, + "eval_samples_per_second": 51.524, + "eval_steps_per_second": 2.577, + "eval_token_accuracy": 0.0004001742237617278, + "step": 20000 + }, + { + "epoch": 2.900315251657789, + "grad_norm": 2.2802417278289795, + "learning_rate": 2.1004493404841284e-05, + "loss": 0.6105, + "step": 20010 + }, + { + "epoch": 2.901764684567163, + "grad_norm": 3.3665335178375244, + "learning_rate": 2.0989998550514568e-05, + "loss": 0.5633, + "step": 20020 + }, + { + "epoch": 2.903214117476537, + "grad_norm": 2.1991968154907227, + "learning_rate": 2.0975503696187852e-05, + "loss": 0.5318, + "step": 20030 + }, + { + "epoch": 2.9046635503859113, + "grad_norm": 2.621366024017334, + "learning_rate": 2.096100884186114e-05, + "loss": 0.5798, + "step": 20040 + }, + { + "epoch": 2.9061129832952854, + "grad_norm": 0.9481135606765747, + "learning_rate": 2.0946513987534428e-05, + "loss": 0.5695, + "step": 20050 + }, + { + "epoch": 2.90756241620466, + "grad_norm": 0.9128068685531616, + "learning_rate": 2.0932019133207712e-05, + "loss": 0.5408, + "step": 20060 + }, + { + "epoch": 2.909011849114034, + "grad_norm": 2.810906171798706, + "learning_rate": 2.0917524278880997e-05, + "loss": 0.5821, + "step": 20070 + }, + { + "epoch": 2.9104612820234084, + "grad_norm": 1.2242695093154907, + "learning_rate": 2.0903029424554284e-05, + "loss": 0.5532, + "step": 20080 + }, + { + "epoch": 2.9119107149327825, + "grad_norm": 1.5578938722610474, + "learning_rate": 2.088853457022757e-05, + "loss": 0.5768, + "step": 20090 + }, + { + "epoch": 2.9133601478421567, + "grad_norm": 0.8750723600387573, + "learning_rate": 2.0874039715900857e-05, + "loss": 0.5902, + "step": 20100 + }, + { + "epoch": 2.914809580751531, + "grad_norm": 2.7830746173858643, + "learning_rate": 2.0859544861574144e-05, + "loss": 0.6105, + "step": 20110 + }, + { + "epoch": 2.916259013660905, + "grad_norm": 0.8406007885932922, + "learning_rate": 2.084505000724743e-05, + "loss": 0.5396, + "step": 20120 + }, + { + "epoch": 2.9177084465702796, + "grad_norm": 2.036412477493286, + "learning_rate": 2.0830555152920713e-05, + "loss": 0.4757, + "step": 20130 + }, + { + "epoch": 2.9191578794796538, + "grad_norm": 0.9003493785858154, + "learning_rate": 2.0816060298594e-05, + "loss": 0.6022, + "step": 20140 + }, + { + "epoch": 2.920607312389028, + "grad_norm": 1.2599937915802002, + "learning_rate": 2.080156544426729e-05, + "loss": 0.5633, + "step": 20150 + }, + { + "epoch": 2.922056745298402, + "grad_norm": 1.3828151226043701, + "learning_rate": 2.0787070589940573e-05, + "loss": 0.5638, + "step": 20160 + }, + { + "epoch": 2.9235061782077763, + "grad_norm": 1.4246114492416382, + "learning_rate": 2.0772575735613857e-05, + "loss": 0.526, + "step": 20170 + }, + { + "epoch": 2.9249556111171504, + "grad_norm": 2.006903886795044, + "learning_rate": 2.0758080881287145e-05, + "loss": 0.5185, + "step": 20180 + }, + { + "epoch": 2.9264050440265246, + "grad_norm": 1.0413299798965454, + "learning_rate": 2.074358602696043e-05, + "loss": 0.5867, + "step": 20190 + }, + { + "epoch": 2.9278544769358987, + "grad_norm": 2.7793540954589844, + "learning_rate": 2.0729091172633717e-05, + "loss": 0.5626, + "step": 20200 + }, + { + "epoch": 2.929303909845273, + "grad_norm": 1.0333194732666016, + "learning_rate": 2.0714596318307e-05, + "loss": 0.6196, + "step": 20210 + }, + { + "epoch": 2.930753342754647, + "grad_norm": 1.1859326362609863, + "learning_rate": 2.0700101463980286e-05, + "loss": 0.6377, + "step": 20220 + }, + { + "epoch": 2.932202775664021, + "grad_norm": 2.745549440383911, + "learning_rate": 2.0685606609653574e-05, + "loss": 0.5695, + "step": 20230 + }, + { + "epoch": 2.933652208573396, + "grad_norm": 2.405184030532837, + "learning_rate": 2.067111175532686e-05, + "loss": 0.6088, + "step": 20240 + }, + { + "epoch": 2.93510164148277, + "grad_norm": 1.1689724922180176, + "learning_rate": 2.0656616901000146e-05, + "loss": 0.5437, + "step": 20250 + }, + { + "epoch": 2.936551074392144, + "grad_norm": 0.8889197111129761, + "learning_rate": 2.064212204667343e-05, + "loss": 0.5489, + "step": 20260 + }, + { + "epoch": 2.9380005073015183, + "grad_norm": 3.05747127532959, + "learning_rate": 2.0627627192346718e-05, + "loss": 0.5463, + "step": 20270 + }, + { + "epoch": 2.9394499402108925, + "grad_norm": 1.3492549657821655, + "learning_rate": 2.0613132338020002e-05, + "loss": 0.6216, + "step": 20280 + }, + { + "epoch": 2.9408993731202666, + "grad_norm": 0.8226122856140137, + "learning_rate": 2.059863748369329e-05, + "loss": 0.5107, + "step": 20290 + }, + { + "epoch": 2.9423488060296408, + "grad_norm": 0.8221601843833923, + "learning_rate": 2.0584142629366578e-05, + "loss": 0.5439, + "step": 20300 + }, + { + "epoch": 2.9437982389390154, + "grad_norm": 0.9002091288566589, + "learning_rate": 2.0571097260472534e-05, + "loss": 0.5844, + "step": 20310 + }, + { + "epoch": 2.9452476718483895, + "grad_norm": 2.082839012145996, + "learning_rate": 2.0556602406145818e-05, + "loss": 0.5639, + "step": 20320 + }, + { + "epoch": 2.9466971047577637, + "grad_norm": 0.8620240688323975, + "learning_rate": 2.0542107551819106e-05, + "loss": 0.5234, + "step": 20330 + }, + { + "epoch": 2.948146537667138, + "grad_norm": 2.8008594512939453, + "learning_rate": 2.052761269749239e-05, + "loss": 0.5547, + "step": 20340 + }, + { + "epoch": 2.949595970576512, + "grad_norm": 0.930914580821991, + "learning_rate": 2.0513117843165678e-05, + "loss": 0.5172, + "step": 20350 + }, + { + "epoch": 2.951045403485886, + "grad_norm": 0.9638407826423645, + "learning_rate": 2.0498622988838965e-05, + "loss": 0.6278, + "step": 20360 + }, + { + "epoch": 2.9524948363952603, + "grad_norm": 1.2510921955108643, + "learning_rate": 2.048412813451225e-05, + "loss": 0.5396, + "step": 20370 + }, + { + "epoch": 2.9539442693046345, + "grad_norm": 0.7251255512237549, + "learning_rate": 2.0469633280185534e-05, + "loss": 0.5134, + "step": 20380 + }, + { + "epoch": 2.9553937022140087, + "grad_norm": 3.2369771003723145, + "learning_rate": 2.0455138425858822e-05, + "loss": 0.5612, + "step": 20390 + }, + { + "epoch": 2.956843135123383, + "grad_norm": 1.9414756298065186, + "learning_rate": 2.0440643571532106e-05, + "loss": 0.5565, + "step": 20400 + }, + { + "epoch": 2.958292568032757, + "grad_norm": 0.9527379274368286, + "learning_rate": 2.0426148717205394e-05, + "loss": 0.5756, + "step": 20410 + }, + { + "epoch": 2.959742000942131, + "grad_norm": 2.340003728866577, + "learning_rate": 2.041165386287868e-05, + "loss": 0.5586, + "step": 20420 + }, + { + "epoch": 2.9611914338515057, + "grad_norm": 1.1174806356430054, + "learning_rate": 2.0397159008551963e-05, + "loss": 0.5186, + "step": 20430 + }, + { + "epoch": 2.96264086676088, + "grad_norm": 1.0091946125030518, + "learning_rate": 2.038266415422525e-05, + "loss": 0.5589, + "step": 20440 + }, + { + "epoch": 2.964090299670254, + "grad_norm": 0.8579798936843872, + "learning_rate": 2.0368169299898538e-05, + "loss": 0.5222, + "step": 20450 + }, + { + "epoch": 2.965539732579628, + "grad_norm": 1.2191307544708252, + "learning_rate": 2.0353674445571823e-05, + "loss": 0.5434, + "step": 20460 + }, + { + "epoch": 2.9669891654890024, + "grad_norm": 3.384915828704834, + "learning_rate": 2.0339179591245107e-05, + "loss": 0.578, + "step": 20470 + }, + { + "epoch": 2.9684385983983765, + "grad_norm": 3.718766927719116, + "learning_rate": 2.0324684736918395e-05, + "loss": 0.5868, + "step": 20480 + }, + { + "epoch": 2.9698880313077507, + "grad_norm": 1.0504790544509888, + "learning_rate": 2.0310189882591682e-05, + "loss": 0.541, + "step": 20490 + }, + { + "epoch": 2.9713374642171253, + "grad_norm": 0.9422523975372314, + "learning_rate": 2.0295695028264967e-05, + "loss": 0.5067, + "step": 20500 + }, + { + "epoch": 2.9727868971264995, + "grad_norm": 0.9271829724311829, + "learning_rate": 2.0281200173938255e-05, + "loss": 0.5226, + "step": 20510 + }, + { + "epoch": 2.9742363300358736, + "grad_norm": 1.016239881515503, + "learning_rate": 2.026670531961154e-05, + "loss": 0.5355, + "step": 20520 + }, + { + "epoch": 2.975685762945248, + "grad_norm": 1.2245512008666992, + "learning_rate": 2.0252210465284823e-05, + "loss": 0.595, + "step": 20530 + }, + { + "epoch": 2.977135195854622, + "grad_norm": 1.3465006351470947, + "learning_rate": 2.023771561095811e-05, + "loss": 0.544, + "step": 20540 + }, + { + "epoch": 2.978584628763996, + "grad_norm": 1.4868266582489014, + "learning_rate": 2.02232207566314e-05, + "loss": 0.541, + "step": 20550 + }, + { + "epoch": 2.9800340616733703, + "grad_norm": 3.016294479370117, + "learning_rate": 2.0208725902304683e-05, + "loss": 0.5838, + "step": 20560 + }, + { + "epoch": 2.9814834945827444, + "grad_norm": 1.2055611610412598, + "learning_rate": 2.0194231047977968e-05, + "loss": 0.6035, + "step": 20570 + }, + { + "epoch": 2.9829329274921186, + "grad_norm": 1.2255357503890991, + "learning_rate": 2.0179736193651255e-05, + "loss": 0.524, + "step": 20580 + }, + { + "epoch": 2.9843823604014927, + "grad_norm": 2.1640145778656006, + "learning_rate": 2.0165241339324543e-05, + "loss": 0.453, + "step": 20590 + }, + { + "epoch": 2.985831793310867, + "grad_norm": 1.263148546218872, + "learning_rate": 2.0150746484997827e-05, + "loss": 0.577, + "step": 20600 + }, + { + "epoch": 2.987281226220241, + "grad_norm": 1.0417917966842651, + "learning_rate": 2.0136251630671112e-05, + "loss": 0.494, + "step": 20610 + }, + { + "epoch": 2.9887306591296157, + "grad_norm": 2.8763279914855957, + "learning_rate": 2.01217567763444e-05, + "loss": 0.5378, + "step": 20620 + }, + { + "epoch": 2.99018009203899, + "grad_norm": 2.2581329345703125, + "learning_rate": 2.0107261922017684e-05, + "loss": 0.5647, + "step": 20630 + }, + { + "epoch": 2.991629524948364, + "grad_norm": 0.999770998954773, + "learning_rate": 2.009276706769097e-05, + "loss": 0.6144, + "step": 20640 + }, + { + "epoch": 2.993078957857738, + "grad_norm": 2.5331544876098633, + "learning_rate": 2.0078272213364256e-05, + "loss": 0.5512, + "step": 20650 + }, + { + "epoch": 2.9945283907671123, + "grad_norm": 1.2347875833511353, + "learning_rate": 2.006377735903754e-05, + "loss": 0.5147, + "step": 20660 + }, + { + "epoch": 2.9959778236764865, + "grad_norm": 0.8321196436882019, + "learning_rate": 2.0049282504710828e-05, + "loss": 0.5601, + "step": 20670 + }, + { + "epoch": 2.9974272565858606, + "grad_norm": 1.9766823053359985, + "learning_rate": 2.0034787650384116e-05, + "loss": 0.5128, + "step": 20680 + }, + { + "epoch": 2.9988766894952352, + "grad_norm": 0.8170207738876343, + "learning_rate": 2.00202927960574e-05, + "loss": 0.6103, + "step": 20690 + }, + { + "epoch": 3.0003261224046094, + "grad_norm": 1.5131596326828003, + "learning_rate": 2.0005797941730688e-05, + "loss": 0.5042, + "step": 20700 + }, + { + "epoch": 3.0017755553139835, + "grad_norm": 1.2082328796386719, + "learning_rate": 1.9991303087403972e-05, + "loss": 0.4591, + "step": 20710 + }, + { + "epoch": 3.0032249882233577, + "grad_norm": 0.784325361251831, + "learning_rate": 1.997680823307726e-05, + "loss": 0.4597, + "step": 20720 + }, + { + "epoch": 3.004674421132732, + "grad_norm": 3.104140043258667, + "learning_rate": 1.9962313378750544e-05, + "loss": 0.4864, + "step": 20730 + }, + { + "epoch": 3.006123854042106, + "grad_norm": 0.9828206896781921, + "learning_rate": 1.9947818524423832e-05, + "loss": 0.5007, + "step": 20740 + }, + { + "epoch": 3.00757328695148, + "grad_norm": 2.5246028900146484, + "learning_rate": 1.9933323670097116e-05, + "loss": 0.5046, + "step": 20750 + }, + { + "epoch": 3.0090227198608543, + "grad_norm": 1.0861430168151855, + "learning_rate": 1.99188288157704e-05, + "loss": 0.4932, + "step": 20760 + }, + { + "epoch": 3.0104721527702285, + "grad_norm": 0.9487363696098328, + "learning_rate": 1.990433396144369e-05, + "loss": 0.471, + "step": 20770 + }, + { + "epoch": 3.0119215856796027, + "grad_norm": 1.6627943515777588, + "learning_rate": 1.9889839107116976e-05, + "loss": 0.4333, + "step": 20780 + }, + { + "epoch": 3.0133710185889773, + "grad_norm": 1.0334659814834595, + "learning_rate": 1.987534425279026e-05, + "loss": 0.4825, + "step": 20790 + }, + { + "epoch": 3.0148204514983514, + "grad_norm": 0.946616530418396, + "learning_rate": 1.9860849398463545e-05, + "loss": 0.4572, + "step": 20800 + }, + { + "epoch": 3.0162698844077256, + "grad_norm": 1.0239499807357788, + "learning_rate": 1.9846354544136833e-05, + "loss": 0.579, + "step": 20810 + }, + { + "epoch": 3.0177193173170997, + "grad_norm": 1.330289602279663, + "learning_rate": 1.9831859689810117e-05, + "loss": 0.4576, + "step": 20820 + }, + { + "epoch": 3.019168750226474, + "grad_norm": 2.2307474613189697, + "learning_rate": 1.9817364835483405e-05, + "loss": 0.4538, + "step": 20830 + }, + { + "epoch": 3.020618183135848, + "grad_norm": 2.4169399738311768, + "learning_rate": 1.980286998115669e-05, + "loss": 0.4184, + "step": 20840 + }, + { + "epoch": 3.0220676160452222, + "grad_norm": 2.4117231369018555, + "learning_rate": 1.9788375126829974e-05, + "loss": 0.5314, + "step": 20850 + }, + { + "epoch": 3.0235170489545964, + "grad_norm": 1.6371862888336182, + "learning_rate": 1.977388027250326e-05, + "loss": 0.4839, + "step": 20860 + }, + { + "epoch": 3.0249664818639705, + "grad_norm": 1.996282935142517, + "learning_rate": 1.975938541817655e-05, + "loss": 0.4951, + "step": 20870 + }, + { + "epoch": 3.026415914773345, + "grad_norm": 1.1968421936035156, + "learning_rate": 1.9744890563849837e-05, + "loss": 0.4765, + "step": 20880 + }, + { + "epoch": 3.0278653476827193, + "grad_norm": 0.709713876247406, + "learning_rate": 1.973039570952312e-05, + "loss": 0.4464, + "step": 20890 + }, + { + "epoch": 3.0293147805920935, + "grad_norm": 2.853358507156372, + "learning_rate": 1.9715900855196406e-05, + "loss": 0.4124, + "step": 20900 + }, + { + "epoch": 3.0307642135014676, + "grad_norm": 0.9398220777511597, + "learning_rate": 1.9701406000869693e-05, + "loss": 0.491, + "step": 20910 + }, + { + "epoch": 3.032213646410842, + "grad_norm": 0.9328126311302185, + "learning_rate": 1.9686911146542978e-05, + "loss": 0.5014, + "step": 20920 + }, + { + "epoch": 3.033663079320216, + "grad_norm": 1.1154359579086304, + "learning_rate": 1.9672416292216265e-05, + "loss": 0.4352, + "step": 20930 + }, + { + "epoch": 3.03511251222959, + "grad_norm": 0.8593978881835938, + "learning_rate": 1.965792143788955e-05, + "loss": 0.4374, + "step": 20940 + }, + { + "epoch": 3.0365619451389643, + "grad_norm": 1.374656319618225, + "learning_rate": 1.9643426583562834e-05, + "loss": 0.475, + "step": 20950 + }, + { + "epoch": 3.0380113780483384, + "grad_norm": 2.1968801021575928, + "learning_rate": 1.9628931729236122e-05, + "loss": 0.4854, + "step": 20960 + }, + { + "epoch": 3.0394608109577126, + "grad_norm": 2.48079252243042, + "learning_rate": 1.961443687490941e-05, + "loss": 0.4832, + "step": 20970 + }, + { + "epoch": 3.040910243867087, + "grad_norm": 3.2282943725585938, + "learning_rate": 1.9599942020582694e-05, + "loss": 0.45, + "step": 20980 + }, + { + "epoch": 3.0423596767764614, + "grad_norm": 1.1852225065231323, + "learning_rate": 1.958544716625598e-05, + "loss": 0.4655, + "step": 20990 + }, + { + "epoch": 3.0438091096858355, + "grad_norm": 0.8068433403968811, + "learning_rate": 1.9570952311929266e-05, + "loss": 0.4486, + "step": 21000 + }, + { + "epoch": 3.0438091096858355, + "eval_loss": 0.7307726740837097, + "eval_runtime": 670.7043, + "eval_samples_per_second": 51.431, + "eval_steps_per_second": 2.572, + "eval_token_accuracy": 0.0003918104538117238, + "step": 21000 + }, + { + "epoch": 3.0452585425952097, + "grad_norm": 0.8493704795837402, + "learning_rate": 1.9556457457602554e-05, + "loss": 0.4957, + "step": 21010 + }, + { + "epoch": 3.046707975504584, + "grad_norm": 1.2335125207901, + "learning_rate": 1.9541962603275838e-05, + "loss": 0.3801, + "step": 21020 + }, + { + "epoch": 3.048157408413958, + "grad_norm": 1.2187851667404175, + "learning_rate": 1.9527467748949123e-05, + "loss": 0.472, + "step": 21030 + }, + { + "epoch": 3.049606841323332, + "grad_norm": 1.7487982511520386, + "learning_rate": 1.951297289462241e-05, + "loss": 0.4714, + "step": 21040 + }, + { + "epoch": 3.0510562742327063, + "grad_norm": 0.8521775007247925, + "learning_rate": 1.9498478040295695e-05, + "loss": 0.409, + "step": 21050 + }, + { + "epoch": 3.0525057071420805, + "grad_norm": 1.2800102233886719, + "learning_rate": 1.9483983185968982e-05, + "loss": 0.4213, + "step": 21060 + }, + { + "epoch": 3.053955140051455, + "grad_norm": 2.577768087387085, + "learning_rate": 1.946948833164227e-05, + "loss": 0.5315, + "step": 21070 + }, + { + "epoch": 3.0554045729608292, + "grad_norm": 1.125487208366394, + "learning_rate": 1.9454993477315555e-05, + "loss": 0.4036, + "step": 21080 + }, + { + "epoch": 3.0568540058702034, + "grad_norm": 1.9074199199676514, + "learning_rate": 1.944049862298884e-05, + "loss": 0.4442, + "step": 21090 + }, + { + "epoch": 3.0583034387795776, + "grad_norm": 0.7598525285720825, + "learning_rate": 1.9426003768662127e-05, + "loss": 0.5219, + "step": 21100 + }, + { + "epoch": 3.0597528716889517, + "grad_norm": 2.014727830886841, + "learning_rate": 1.941150891433541e-05, + "loss": 0.548, + "step": 21110 + }, + { + "epoch": 3.061202304598326, + "grad_norm": 2.676212787628174, + "learning_rate": 1.93970140600087e-05, + "loss": 0.4656, + "step": 21120 + }, + { + "epoch": 3.0626517375077, + "grad_norm": 2.4987432956695557, + "learning_rate": 1.9382519205681983e-05, + "loss": 0.4591, + "step": 21130 + }, + { + "epoch": 3.064101170417074, + "grad_norm": 0.8059296011924744, + "learning_rate": 1.936802435135527e-05, + "loss": 0.4621, + "step": 21140 + }, + { + "epoch": 3.0655506033264484, + "grad_norm": 2.7726733684539795, + "learning_rate": 1.9353529497028555e-05, + "loss": 0.488, + "step": 21150 + }, + { + "epoch": 3.0670000362358225, + "grad_norm": 1.1009929180145264, + "learning_rate": 1.9339034642701843e-05, + "loss": 0.4934, + "step": 21160 + }, + { + "epoch": 3.068449469145197, + "grad_norm": 0.9859609007835388, + "learning_rate": 1.9324539788375127e-05, + "loss": 0.4611, + "step": 21170 + }, + { + "epoch": 3.0698989020545713, + "grad_norm": 1.0177518129348755, + "learning_rate": 1.931004493404841e-05, + "loss": 0.4368, + "step": 21180 + }, + { + "epoch": 3.0713483349639454, + "grad_norm": 2.414206027984619, + "learning_rate": 1.92955500797217e-05, + "loss": 0.4755, + "step": 21190 + }, + { + "epoch": 3.0727977678733196, + "grad_norm": 3.308610677719116, + "learning_rate": 1.9281055225394987e-05, + "loss": 0.4878, + "step": 21200 + }, + { + "epoch": 3.0742472007826938, + "grad_norm": 1.0151442289352417, + "learning_rate": 1.926656037106827e-05, + "loss": 0.4834, + "step": 21210 + }, + { + "epoch": 3.075696633692068, + "grad_norm": 0.8535460233688354, + "learning_rate": 1.9252065516741556e-05, + "loss": 0.52, + "step": 21220 + }, + { + "epoch": 3.077146066601442, + "grad_norm": 0.9755415320396423, + "learning_rate": 1.9237570662414844e-05, + "loss": 0.4589, + "step": 21230 + }, + { + "epoch": 3.0785954995108162, + "grad_norm": 0.7422819137573242, + "learning_rate": 1.9223075808088128e-05, + "loss": 0.4364, + "step": 21240 + }, + { + "epoch": 3.0800449324201904, + "grad_norm": 1.1758068799972534, + "learning_rate": 1.9208580953761416e-05, + "loss": 0.4246, + "step": 21250 + }, + { + "epoch": 3.081494365329565, + "grad_norm": 2.122065305709839, + "learning_rate": 1.9194086099434703e-05, + "loss": 0.4369, + "step": 21260 + }, + { + "epoch": 3.082943798238939, + "grad_norm": 0.7626737356185913, + "learning_rate": 1.9179591245107988e-05, + "loss": 0.4277, + "step": 21270 + }, + { + "epoch": 3.0843932311483133, + "grad_norm": 2.114098072052002, + "learning_rate": 1.9165096390781272e-05, + "loss": 0.486, + "step": 21280 + }, + { + "epoch": 3.0858426640576875, + "grad_norm": 2.370668888092041, + "learning_rate": 1.915060153645456e-05, + "loss": 0.4631, + "step": 21290 + }, + { + "epoch": 3.0872920969670616, + "grad_norm": 1.3175185918807983, + "learning_rate": 1.9136106682127848e-05, + "loss": 0.42, + "step": 21300 + }, + { + "epoch": 3.088741529876436, + "grad_norm": 1.1968013048171997, + "learning_rate": 1.9121611827801132e-05, + "loss": 0.476, + "step": 21310 + }, + { + "epoch": 3.09019096278581, + "grad_norm": 0.7684279084205627, + "learning_rate": 1.9107116973474416e-05, + "loss": 0.5142, + "step": 21320 + }, + { + "epoch": 3.091640395695184, + "grad_norm": 0.9500318169593811, + "learning_rate": 1.9092622119147704e-05, + "loss": 0.4577, + "step": 21330 + }, + { + "epoch": 3.0930898286045583, + "grad_norm": 2.970555067062378, + "learning_rate": 1.907812726482099e-05, + "loss": 0.4624, + "step": 21340 + }, + { + "epoch": 3.094539261513933, + "grad_norm": 2.508944511413574, + "learning_rate": 1.9063632410494276e-05, + "loss": 0.3773, + "step": 21350 + }, + { + "epoch": 3.095988694423307, + "grad_norm": 1.0545679330825806, + "learning_rate": 1.904913755616756e-05, + "loss": 0.4906, + "step": 21360 + }, + { + "epoch": 3.097438127332681, + "grad_norm": 0.9826215505599976, + "learning_rate": 1.9034642701840845e-05, + "loss": 0.4465, + "step": 21370 + }, + { + "epoch": 3.0988875602420554, + "grad_norm": 0.921008825302124, + "learning_rate": 1.9020147847514133e-05, + "loss": 0.4526, + "step": 21380 + }, + { + "epoch": 3.1003369931514295, + "grad_norm": 1.1149829626083374, + "learning_rate": 1.900565299318742e-05, + "loss": 0.4398, + "step": 21390 + }, + { + "epoch": 3.1017864260608037, + "grad_norm": 1.2042686939239502, + "learning_rate": 1.8991158138860705e-05, + "loss": 0.5067, + "step": 21400 + }, + { + "epoch": 3.103235858970178, + "grad_norm": 1.7478713989257812, + "learning_rate": 1.897666328453399e-05, + "loss": 0.4704, + "step": 21410 + }, + { + "epoch": 3.104685291879552, + "grad_norm": 1.2265759706497192, + "learning_rate": 1.8962168430207277e-05, + "loss": 0.4861, + "step": 21420 + }, + { + "epoch": 3.106134724788926, + "grad_norm": 0.9611939191818237, + "learning_rate": 1.8947673575880565e-05, + "loss": 0.4485, + "step": 21430 + }, + { + "epoch": 3.1075841576983008, + "grad_norm": 3.710465908050537, + "learning_rate": 1.893317872155385e-05, + "loss": 0.499, + "step": 21440 + }, + { + "epoch": 3.109033590607675, + "grad_norm": 2.539863348007202, + "learning_rate": 1.8918683867227137e-05, + "loss": 0.4285, + "step": 21450 + }, + { + "epoch": 3.110483023517049, + "grad_norm": 1.1959104537963867, + "learning_rate": 1.890418901290042e-05, + "loss": 0.4061, + "step": 21460 + }, + { + "epoch": 3.1119324564264232, + "grad_norm": 3.4300787448883057, + "learning_rate": 1.8889694158573706e-05, + "loss": 0.4918, + "step": 21470 + }, + { + "epoch": 3.1133818893357974, + "grad_norm": 1.6281919479370117, + "learning_rate": 1.8875199304246993e-05, + "loss": 0.4336, + "step": 21480 + }, + { + "epoch": 3.1148313222451716, + "grad_norm": 1.0834424495697021, + "learning_rate": 1.886070444992028e-05, + "loss": 0.462, + "step": 21490 + }, + { + "epoch": 3.1162807551545457, + "grad_norm": 1.0968761444091797, + "learning_rate": 1.8846209595593565e-05, + "loss": 0.4493, + "step": 21500 + }, + { + "epoch": 3.11773018806392, + "grad_norm": 1.9969043731689453, + "learning_rate": 1.883171474126685e-05, + "loss": 0.4911, + "step": 21510 + }, + { + "epoch": 3.119179620973294, + "grad_norm": 1.7604318857192993, + "learning_rate": 1.8817219886940137e-05, + "loss": 0.418, + "step": 21520 + }, + { + "epoch": 3.120629053882668, + "grad_norm": 1.1388682126998901, + "learning_rate": 1.8802725032613422e-05, + "loss": 0.4105, + "step": 21530 + }, + { + "epoch": 3.122078486792043, + "grad_norm": 0.8295134902000427, + "learning_rate": 1.878823017828671e-05, + "loss": 0.4364, + "step": 21540 + }, + { + "epoch": 3.123527919701417, + "grad_norm": 1.134069561958313, + "learning_rate": 1.8773735323959994e-05, + "loss": 0.4599, + "step": 21550 + }, + { + "epoch": 3.124977352610791, + "grad_norm": 1.7226959466934204, + "learning_rate": 1.8759240469633282e-05, + "loss": 0.4642, + "step": 21560 + }, + { + "epoch": 3.1264267855201653, + "grad_norm": 1.3124228715896606, + "learning_rate": 1.8744745615306566e-05, + "loss": 0.4198, + "step": 21570 + }, + { + "epoch": 3.1278762184295394, + "grad_norm": 1.0341984033584595, + "learning_rate": 1.8730250760979854e-05, + "loss": 0.4535, + "step": 21580 + }, + { + "epoch": 3.1293256513389136, + "grad_norm": 0.8503829836845398, + "learning_rate": 1.871575590665314e-05, + "loss": 0.4629, + "step": 21590 + }, + { + "epoch": 3.1307750842482878, + "grad_norm": 2.9683518409729004, + "learning_rate": 1.8701261052326426e-05, + "loss": 0.46, + "step": 21600 + }, + { + "epoch": 3.132224517157662, + "grad_norm": 3.3407230377197266, + "learning_rate": 1.868676619799971e-05, + "loss": 0.4943, + "step": 21610 + }, + { + "epoch": 3.133673950067036, + "grad_norm": 1.2744535207748413, + "learning_rate": 1.8672271343672998e-05, + "loss": 0.446, + "step": 21620 + }, + { + "epoch": 3.1351233829764107, + "grad_norm": 1.004381775856018, + "learning_rate": 1.8657776489346282e-05, + "loss": 0.4428, + "step": 21630 + }, + { + "epoch": 3.136572815885785, + "grad_norm": 1.5023181438446045, + "learning_rate": 1.864328163501957e-05, + "loss": 0.4492, + "step": 21640 + }, + { + "epoch": 3.138022248795159, + "grad_norm": 0.7856612205505371, + "learning_rate": 1.8628786780692854e-05, + "loss": 0.4583, + "step": 21650 + }, + { + "epoch": 3.139471681704533, + "grad_norm": 0.7462323904037476, + "learning_rate": 1.861429192636614e-05, + "loss": 0.4353, + "step": 21660 + }, + { + "epoch": 3.1409211146139073, + "grad_norm": 0.8192468881607056, + "learning_rate": 1.8599797072039427e-05, + "loss": 0.4507, + "step": 21670 + }, + { + "epoch": 3.1423705475232815, + "grad_norm": 1.2236969470977783, + "learning_rate": 1.8585302217712714e-05, + "loss": 0.5054, + "step": 21680 + }, + { + "epoch": 3.1438199804326556, + "grad_norm": 1.431766152381897, + "learning_rate": 1.8570807363386e-05, + "loss": 0.4206, + "step": 21690 + }, + { + "epoch": 3.14526941334203, + "grad_norm": 2.7869577407836914, + "learning_rate": 1.8556312509059283e-05, + "loss": 0.4735, + "step": 21700 + }, + { + "epoch": 3.146718846251404, + "grad_norm": 1.9017903804779053, + "learning_rate": 1.854181765473257e-05, + "loss": 0.4565, + "step": 21710 + }, + { + "epoch": 3.148168279160778, + "grad_norm": 1.2527071237564087, + "learning_rate": 1.852732280040586e-05, + "loss": 0.4762, + "step": 21720 + }, + { + "epoch": 3.1496177120701527, + "grad_norm": 1.243003249168396, + "learning_rate": 1.8512827946079143e-05, + "loss": 0.4709, + "step": 21730 + }, + { + "epoch": 3.151067144979527, + "grad_norm": 2.1310229301452637, + "learning_rate": 1.8498333091752427e-05, + "loss": 0.4461, + "step": 21740 + }, + { + "epoch": 3.152516577888901, + "grad_norm": 2.815653085708618, + "learning_rate": 1.8483838237425715e-05, + "loss": 0.4761, + "step": 21750 + }, + { + "epoch": 3.153966010798275, + "grad_norm": 3.001795768737793, + "learning_rate": 1.8469343383099e-05, + "loss": 0.4694, + "step": 21760 + }, + { + "epoch": 3.1554154437076494, + "grad_norm": 2.344364643096924, + "learning_rate": 1.8454848528772287e-05, + "loss": 0.5223, + "step": 21770 + }, + { + "epoch": 3.1568648766170235, + "grad_norm": 0.849949061870575, + "learning_rate": 1.8440353674445575e-05, + "loss": 0.4398, + "step": 21780 + }, + { + "epoch": 3.1583143095263977, + "grad_norm": 0.9206576347351074, + "learning_rate": 1.842585882011886e-05, + "loss": 0.4655, + "step": 21790 + }, + { + "epoch": 3.159763742435772, + "grad_norm": 2.563059091567993, + "learning_rate": 1.8411363965792144e-05, + "loss": 0.4491, + "step": 21800 + }, + { + "epoch": 3.161213175345146, + "grad_norm": 2.454758882522583, + "learning_rate": 1.839686911146543e-05, + "loss": 0.4564, + "step": 21810 + }, + { + "epoch": 3.1626626082545206, + "grad_norm": 2.338440418243408, + "learning_rate": 1.8382374257138716e-05, + "loss": 0.4938, + "step": 21820 + }, + { + "epoch": 3.1641120411638948, + "grad_norm": 0.8359615802764893, + "learning_rate": 1.8367879402812003e-05, + "loss": 0.4533, + "step": 21830 + }, + { + "epoch": 3.165561474073269, + "grad_norm": 1.198116660118103, + "learning_rate": 1.8353384548485288e-05, + "loss": 0.4597, + "step": 21840 + }, + { + "epoch": 3.167010906982643, + "grad_norm": 1.0076746940612793, + "learning_rate": 1.8338889694158576e-05, + "loss": 0.4855, + "step": 21850 + }, + { + "epoch": 3.1684603398920173, + "grad_norm": 0.7770813703536987, + "learning_rate": 1.832584432526453e-05, + "loss": 0.4484, + "step": 21860 + }, + { + "epoch": 3.1699097728013914, + "grad_norm": 1.7894219160079956, + "learning_rate": 1.831134947093782e-05, + "loss": 0.4467, + "step": 21870 + }, + { + "epoch": 3.1713592057107656, + "grad_norm": 0.9608204364776611, + "learning_rate": 1.8296854616611104e-05, + "loss": 0.4081, + "step": 21880 + }, + { + "epoch": 3.1728086386201397, + "grad_norm": 3.3298492431640625, + "learning_rate": 1.828235976228439e-05, + "loss": 0.4777, + "step": 21890 + }, + { + "epoch": 3.174258071529514, + "grad_norm": 2.6890292167663574, + "learning_rate": 1.8267864907957676e-05, + "loss": 0.4917, + "step": 21900 + }, + { + "epoch": 3.175707504438888, + "grad_norm": 1.2616606950759888, + "learning_rate": 1.825337005363096e-05, + "loss": 0.4995, + "step": 21910 + }, + { + "epoch": 3.1771569373482627, + "grad_norm": 1.0045479536056519, + "learning_rate": 1.8238875199304248e-05, + "loss": 0.4002, + "step": 21920 + }, + { + "epoch": 3.178606370257637, + "grad_norm": 1.0455669164657593, + "learning_rate": 1.8224380344977536e-05, + "loss": 0.4975, + "step": 21930 + }, + { + "epoch": 3.180055803167011, + "grad_norm": 0.8399646282196045, + "learning_rate": 1.820988549065082e-05, + "loss": 0.4358, + "step": 21940 + }, + { + "epoch": 3.181505236076385, + "grad_norm": 1.2526215314865112, + "learning_rate": 1.8195390636324104e-05, + "loss": 0.4948, + "step": 21950 + }, + { + "epoch": 3.1829546689857593, + "grad_norm": 1.0016404390335083, + "learning_rate": 1.8180895781997392e-05, + "loss": 0.4968, + "step": 21960 + }, + { + "epoch": 3.1844041018951335, + "grad_norm": 0.9354867339134216, + "learning_rate": 1.8166400927670676e-05, + "loss": 0.4508, + "step": 21970 + }, + { + "epoch": 3.1858535348045076, + "grad_norm": 1.2305155992507935, + "learning_rate": 1.8151906073343964e-05, + "loss": 0.4875, + "step": 21980 + }, + { + "epoch": 3.1873029677138818, + "grad_norm": 2.4976658821105957, + "learning_rate": 1.813741121901725e-05, + "loss": 0.4869, + "step": 21990 + }, + { + "epoch": 3.1887524006232564, + "grad_norm": 1.661704182624817, + "learning_rate": 1.8122916364690536e-05, + "loss": 0.4676, + "step": 22000 + }, + { + "epoch": 3.1887524006232564, + "eval_loss": 0.725195050239563, + "eval_runtime": 670.7568, + "eval_samples_per_second": 51.427, + "eval_steps_per_second": 2.572, + "eval_token_accuracy": 0.000388915302675184, + "step": 22000 + }, + { + "epoch": 3.1902018335326305, + "grad_norm": 1.16428542137146, + "learning_rate": 1.810842151036382e-05, + "loss": 0.4443, + "step": 22010 + }, + { + "epoch": 3.1916512664420047, + "grad_norm": 3.376563549041748, + "learning_rate": 1.809392665603711e-05, + "loss": 0.4316, + "step": 22020 + }, + { + "epoch": 3.193100699351379, + "grad_norm": 1.1599247455596924, + "learning_rate": 1.8079431801710396e-05, + "loss": 0.4499, + "step": 22030 + }, + { + "epoch": 3.194550132260753, + "grad_norm": 1.023087739944458, + "learning_rate": 1.806493694738368e-05, + "loss": 0.514, + "step": 22040 + }, + { + "epoch": 3.195999565170127, + "grad_norm": 1.0364000797271729, + "learning_rate": 1.8050442093056965e-05, + "loss": 0.3974, + "step": 22050 + }, + { + "epoch": 3.1974489980795013, + "grad_norm": 1.0344669818878174, + "learning_rate": 1.8035947238730253e-05, + "loss": 0.5006, + "step": 22060 + }, + { + "epoch": 3.1988984309888755, + "grad_norm": 0.8084372282028198, + "learning_rate": 1.8021452384403537e-05, + "loss": 0.4475, + "step": 22070 + }, + { + "epoch": 3.2003478638982497, + "grad_norm": 2.4641897678375244, + "learning_rate": 1.8006957530076825e-05, + "loss": 0.4526, + "step": 22080 + }, + { + "epoch": 3.201797296807624, + "grad_norm": 1.0928001403808594, + "learning_rate": 1.799246267575011e-05, + "loss": 0.435, + "step": 22090 + }, + { + "epoch": 3.2032467297169984, + "grad_norm": 1.0270906686782837, + "learning_rate": 1.7977967821423393e-05, + "loss": 0.4208, + "step": 22100 + }, + { + "epoch": 3.2046961626263726, + "grad_norm": 2.1553866863250732, + "learning_rate": 1.796347296709668e-05, + "loss": 0.4887, + "step": 22110 + }, + { + "epoch": 3.2061455955357467, + "grad_norm": 2.221139907836914, + "learning_rate": 1.794897811276997e-05, + "loss": 0.4922, + "step": 22120 + }, + { + "epoch": 3.207595028445121, + "grad_norm": 1.4236232042312622, + "learning_rate": 1.7934483258443253e-05, + "loss": 0.4619, + "step": 22130 + }, + { + "epoch": 3.209044461354495, + "grad_norm": 0.9401782155036926, + "learning_rate": 1.7919988404116538e-05, + "loss": 0.491, + "step": 22140 + }, + { + "epoch": 3.210493894263869, + "grad_norm": 2.1895623207092285, + "learning_rate": 1.7905493549789825e-05, + "loss": 0.4168, + "step": 22150 + }, + { + "epoch": 3.2119433271732434, + "grad_norm": 2.5863196849823, + "learning_rate": 1.7890998695463113e-05, + "loss": 0.427, + "step": 22160 + }, + { + "epoch": 3.2133927600826175, + "grad_norm": 0.9865674376487732, + "learning_rate": 1.7876503841136397e-05, + "loss": 0.5734, + "step": 22170 + }, + { + "epoch": 3.2148421929919917, + "grad_norm": 1.3625892400741577, + "learning_rate": 1.7862008986809685e-05, + "loss": 0.434, + "step": 22180 + }, + { + "epoch": 3.2162916259013663, + "grad_norm": 2.1608617305755615, + "learning_rate": 1.784751413248297e-05, + "loss": 0.4579, + "step": 22190 + }, + { + "epoch": 3.2177410588107405, + "grad_norm": 1.851686716079712, + "learning_rate": 1.7833019278156254e-05, + "loss": 0.4108, + "step": 22200 + }, + { + "epoch": 3.2191904917201146, + "grad_norm": 1.1519325971603394, + "learning_rate": 1.781852442382954e-05, + "loss": 0.4558, + "step": 22210 + }, + { + "epoch": 3.220639924629489, + "grad_norm": 3.0943803787231445, + "learning_rate": 1.780402956950283e-05, + "loss": 0.4681, + "step": 22220 + }, + { + "epoch": 3.222089357538863, + "grad_norm": 1.34933340549469, + "learning_rate": 1.7789534715176114e-05, + "loss": 0.4719, + "step": 22230 + }, + { + "epoch": 3.223538790448237, + "grad_norm": 0.900416374206543, + "learning_rate": 1.7775039860849398e-05, + "loss": 0.4007, + "step": 22240 + }, + { + "epoch": 3.2249882233576113, + "grad_norm": 1.4920786619186401, + "learning_rate": 1.7760545006522686e-05, + "loss": 0.377, + "step": 22250 + }, + { + "epoch": 3.2264376562669854, + "grad_norm": 1.3333457708358765, + "learning_rate": 1.7746050152195974e-05, + "loss": 0.5223, + "step": 22260 + }, + { + "epoch": 3.2278870891763596, + "grad_norm": 0.8491736054420471, + "learning_rate": 1.7731555297869258e-05, + "loss": 0.4507, + "step": 22270 + }, + { + "epoch": 3.2293365220857337, + "grad_norm": 1.204391360282898, + "learning_rate": 1.7717060443542542e-05, + "loss": 0.4531, + "step": 22280 + }, + { + "epoch": 3.2307859549951083, + "grad_norm": 0.8898743987083435, + "learning_rate": 1.770256558921583e-05, + "loss": 0.4785, + "step": 22290 + }, + { + "epoch": 3.2322353879044825, + "grad_norm": 2.0492045879364014, + "learning_rate": 1.7688070734889114e-05, + "loss": 0.5007, + "step": 22300 + }, + { + "epoch": 3.2336848208138567, + "grad_norm": 1.5305896997451782, + "learning_rate": 1.7673575880562402e-05, + "loss": 0.458, + "step": 22310 + }, + { + "epoch": 3.235134253723231, + "grad_norm": 2.20937442779541, + "learning_rate": 1.7659081026235687e-05, + "loss": 0.4776, + "step": 22320 + }, + { + "epoch": 3.236583686632605, + "grad_norm": 1.2759435176849365, + "learning_rate": 1.764458617190897e-05, + "loss": 0.4559, + "step": 22330 + }, + { + "epoch": 3.238033119541979, + "grad_norm": 2.635709762573242, + "learning_rate": 1.763009131758226e-05, + "loss": 0.5438, + "step": 22340 + }, + { + "epoch": 3.2394825524513533, + "grad_norm": 0.9792227745056152, + "learning_rate": 1.7615596463255546e-05, + "loss": 0.4645, + "step": 22350 + }, + { + "epoch": 3.2409319853607275, + "grad_norm": 3.481822967529297, + "learning_rate": 1.760110160892883e-05, + "loss": 0.4575, + "step": 22360 + }, + { + "epoch": 3.2423814182701016, + "grad_norm": 1.1393392086029053, + "learning_rate": 1.758660675460212e-05, + "loss": 0.4472, + "step": 22370 + }, + { + "epoch": 3.2438308511794762, + "grad_norm": 2.524489641189575, + "learning_rate": 1.7572111900275403e-05, + "loss": 0.5124, + "step": 22380 + }, + { + "epoch": 3.2452802840888504, + "grad_norm": 2.6743130683898926, + "learning_rate": 1.7557617045948687e-05, + "loss": 0.4173, + "step": 22390 + }, + { + "epoch": 3.2467297169982245, + "grad_norm": 0.8293477892875671, + "learning_rate": 1.7543122191621975e-05, + "loss": 0.4444, + "step": 22400 + }, + { + "epoch": 3.2481791499075987, + "grad_norm": 1.2171900272369385, + "learning_rate": 1.7528627337295263e-05, + "loss": 0.475, + "step": 22410 + }, + { + "epoch": 3.249628582816973, + "grad_norm": 2.4464237689971924, + "learning_rate": 1.7514132482968547e-05, + "loss": 0.4502, + "step": 22420 + }, + { + "epoch": 3.251078015726347, + "grad_norm": 2.3313779830932617, + "learning_rate": 1.749963762864183e-05, + "loss": 0.4472, + "step": 22430 + }, + { + "epoch": 3.252527448635721, + "grad_norm": 1.9434154033660889, + "learning_rate": 1.748514277431512e-05, + "loss": 0.4169, + "step": 22440 + }, + { + "epoch": 3.2539768815450953, + "grad_norm": 0.9746271371841431, + "learning_rate": 1.7470647919988407e-05, + "loss": 0.4508, + "step": 22450 + }, + { + "epoch": 3.2554263144544695, + "grad_norm": 1.096774935722351, + "learning_rate": 1.745615306566169e-05, + "loss": 0.4454, + "step": 22460 + }, + { + "epoch": 3.2568757473638437, + "grad_norm": 1.1862661838531494, + "learning_rate": 1.7441658211334976e-05, + "loss": 0.5229, + "step": 22470 + }, + { + "epoch": 3.2583251802732183, + "grad_norm": 1.1553421020507812, + "learning_rate": 1.7427163357008263e-05, + "loss": 0.478, + "step": 22480 + }, + { + "epoch": 3.2597746131825924, + "grad_norm": 3.2264418601989746, + "learning_rate": 1.7412668502681548e-05, + "loss": 0.4547, + "step": 22490 + }, + { + "epoch": 3.2612240460919666, + "grad_norm": 1.1565039157867432, + "learning_rate": 1.7398173648354835e-05, + "loss": 0.4711, + "step": 22500 + }, + { + "epoch": 3.2626734790013407, + "grad_norm": 2.3308446407318115, + "learning_rate": 1.738367879402812e-05, + "loss": 0.4084, + "step": 22510 + }, + { + "epoch": 3.264122911910715, + "grad_norm": 1.196081519126892, + "learning_rate": 1.7369183939701404e-05, + "loss": 0.4103, + "step": 22520 + }, + { + "epoch": 3.265572344820089, + "grad_norm": 0.8371307253837585, + "learning_rate": 1.7354689085374692e-05, + "loss": 0.4792, + "step": 22530 + }, + { + "epoch": 3.2670217777294632, + "grad_norm": 0.8609314560890198, + "learning_rate": 1.734019423104798e-05, + "loss": 0.4364, + "step": 22540 + }, + { + "epoch": 3.2684712106388374, + "grad_norm": 3.442795753479004, + "learning_rate": 1.7325699376721267e-05, + "loss": 0.45, + "step": 22550 + }, + { + "epoch": 3.269920643548212, + "grad_norm": 2.094744920730591, + "learning_rate": 1.7311204522394552e-05, + "loss": 0.4826, + "step": 22560 + }, + { + "epoch": 3.271370076457586, + "grad_norm": 1.333853840827942, + "learning_rate": 1.7296709668067836e-05, + "loss": 0.4741, + "step": 22570 + }, + { + "epoch": 3.2728195093669603, + "grad_norm": 2.6992194652557373, + "learning_rate": 1.7282214813741124e-05, + "loss": 0.4722, + "step": 22580 + }, + { + "epoch": 3.2742689422763345, + "grad_norm": 1.3147931098937988, + "learning_rate": 1.7267719959414408e-05, + "loss": 0.4681, + "step": 22590 + }, + { + "epoch": 3.2757183751857086, + "grad_norm": 0.9021698236465454, + "learning_rate": 1.7253225105087696e-05, + "loss": 0.4493, + "step": 22600 + }, + { + "epoch": 3.277167808095083, + "grad_norm": 1.2024173736572266, + "learning_rate": 1.723873025076098e-05, + "loss": 0.4583, + "step": 22610 + }, + { + "epoch": 3.278617241004457, + "grad_norm": 3.715240716934204, + "learning_rate": 1.7224235396434265e-05, + "loss": 0.4554, + "step": 22620 + }, + { + "epoch": 3.280066673913831, + "grad_norm": 2.2712244987487793, + "learning_rate": 1.7209740542107552e-05, + "loss": 0.4563, + "step": 22630 + }, + { + "epoch": 3.2815161068232053, + "grad_norm": 1.429622769355774, + "learning_rate": 1.719524568778084e-05, + "loss": 0.4739, + "step": 22640 + }, + { + "epoch": 3.2829655397325794, + "grad_norm": 3.115586757659912, + "learning_rate": 1.7180750833454125e-05, + "loss": 0.5281, + "step": 22650 + }, + { + "epoch": 3.2844149726419536, + "grad_norm": 2.4304490089416504, + "learning_rate": 1.716625597912741e-05, + "loss": 0.3748, + "step": 22660 + }, + { + "epoch": 3.285864405551328, + "grad_norm": 0.8875736594200134, + "learning_rate": 1.7151761124800697e-05, + "loss": 0.5341, + "step": 22670 + }, + { + "epoch": 3.2873138384607024, + "grad_norm": 3.3635129928588867, + "learning_rate": 1.7137266270473984e-05, + "loss": 0.4654, + "step": 22680 + }, + { + "epoch": 3.2887632713700765, + "grad_norm": 1.1657626628875732, + "learning_rate": 1.712277141614727e-05, + "loss": 0.436, + "step": 22690 + }, + { + "epoch": 3.2902127042794507, + "grad_norm": 1.698906660079956, + "learning_rate": 1.7108276561820553e-05, + "loss": 0.4469, + "step": 22700 + }, + { + "epoch": 3.291662137188825, + "grad_norm": 2.1452109813690186, + "learning_rate": 1.709378170749384e-05, + "loss": 0.4585, + "step": 22710 + }, + { + "epoch": 3.293111570098199, + "grad_norm": 1.1009830236434937, + "learning_rate": 1.7079286853167125e-05, + "loss": 0.504, + "step": 22720 + }, + { + "epoch": 3.294561003007573, + "grad_norm": 0.9174827933311462, + "learning_rate": 1.7064791998840413e-05, + "loss": 0.4765, + "step": 22730 + }, + { + "epoch": 3.2960104359169473, + "grad_norm": 1.4893231391906738, + "learning_rate": 1.70502971445137e-05, + "loss": 0.4736, + "step": 22740 + }, + { + "epoch": 3.297459868826322, + "grad_norm": 1.3025282621383667, + "learning_rate": 1.7035802290186985e-05, + "loss": 0.4695, + "step": 22750 + }, + { + "epoch": 3.298909301735696, + "grad_norm": 1.2915959358215332, + "learning_rate": 1.702130743586027e-05, + "loss": 0.4178, + "step": 22760 + }, + { + "epoch": 3.3003587346450702, + "grad_norm": 0.8622018098831177, + "learning_rate": 1.7006812581533557e-05, + "loss": 0.4237, + "step": 22770 + }, + { + "epoch": 3.3018081675544444, + "grad_norm": 0.8027140498161316, + "learning_rate": 1.699231772720684e-05, + "loss": 0.4728, + "step": 22780 + }, + { + "epoch": 3.3032576004638186, + "grad_norm": 0.9985853433609009, + "learning_rate": 1.697782287288013e-05, + "loss": 0.4351, + "step": 22790 + }, + { + "epoch": 3.3047070333731927, + "grad_norm": 2.357565402984619, + "learning_rate": 1.6963328018553414e-05, + "loss": 0.4458, + "step": 22800 + }, + { + "epoch": 3.306156466282567, + "grad_norm": 1.0484607219696045, + "learning_rate": 1.6948833164226698e-05, + "loss": 0.4322, + "step": 22810 + }, + { + "epoch": 3.307605899191941, + "grad_norm": 1.152421474456787, + "learning_rate": 1.6934338309899986e-05, + "loss": 0.4471, + "step": 22820 + }, + { + "epoch": 3.309055332101315, + "grad_norm": 1.0169322490692139, + "learning_rate": 1.6919843455573274e-05, + "loss": 0.5096, + "step": 22830 + }, + { + "epoch": 3.3105047650106894, + "grad_norm": 1.0829248428344727, + "learning_rate": 1.6905348601246558e-05, + "loss": 0.4855, + "step": 22840 + }, + { + "epoch": 3.311954197920064, + "grad_norm": 0.8866340517997742, + "learning_rate": 1.6890853746919842e-05, + "loss": 0.4509, + "step": 22850 + }, + { + "epoch": 3.313403630829438, + "grad_norm": 1.7526586055755615, + "learning_rate": 1.687635889259313e-05, + "loss": 0.4671, + "step": 22860 + }, + { + "epoch": 3.3148530637388123, + "grad_norm": 1.9207016229629517, + "learning_rate": 1.6861864038266418e-05, + "loss": 0.4679, + "step": 22870 + }, + { + "epoch": 3.3163024966481864, + "grad_norm": 1.329699993133545, + "learning_rate": 1.6847369183939702e-05, + "loss": 0.4632, + "step": 22880 + }, + { + "epoch": 3.3177519295575606, + "grad_norm": 3.8258862495422363, + "learning_rate": 1.6832874329612986e-05, + "loss": 0.4822, + "step": 22890 + }, + { + "epoch": 3.3192013624669348, + "grad_norm": 1.1815471649169922, + "learning_rate": 1.6818379475286274e-05, + "loss": 0.4462, + "step": 22900 + }, + { + "epoch": 3.320650795376309, + "grad_norm": 0.9723221063613892, + "learning_rate": 1.680388462095956e-05, + "loss": 0.4768, + "step": 22910 + }, + { + "epoch": 3.322100228285683, + "grad_norm": 3.022918939590454, + "learning_rate": 1.6789389766632846e-05, + "loss": 0.4284, + "step": 22920 + }, + { + "epoch": 3.3235496611950577, + "grad_norm": 0.8839090466499329, + "learning_rate": 1.6774894912306134e-05, + "loss": 0.5082, + "step": 22930 + }, + { + "epoch": 3.324999094104432, + "grad_norm": 0.8981729745864868, + "learning_rate": 1.676040005797942e-05, + "loss": 0.4147, + "step": 22940 + }, + { + "epoch": 3.326448527013806, + "grad_norm": 2.855755567550659, + "learning_rate": 1.6745905203652703e-05, + "loss": 0.4797, + "step": 22950 + }, + { + "epoch": 3.32789795992318, + "grad_norm": 3.2790029048919678, + "learning_rate": 1.673141034932599e-05, + "loss": 0.5264, + "step": 22960 + }, + { + "epoch": 3.3293473928325543, + "grad_norm": 0.9484438896179199, + "learning_rate": 1.6716915494999278e-05, + "loss": 0.4662, + "step": 22970 + }, + { + "epoch": 3.3307968257419285, + "grad_norm": 1.0549322366714478, + "learning_rate": 1.6702420640672563e-05, + "loss": 0.4659, + "step": 22980 + }, + { + "epoch": 3.3322462586513026, + "grad_norm": 1.038637399673462, + "learning_rate": 1.6687925786345847e-05, + "loss": 0.4936, + "step": 22990 + }, + { + "epoch": 3.333695691560677, + "grad_norm": 2.0607287883758545, + "learning_rate": 1.6673430932019135e-05, + "loss": 0.4444, + "step": 23000 + }, + { + "epoch": 3.333695691560677, + "eval_loss": 0.7252270579338074, + "eval_runtime": 671.9965, + "eval_samples_per_second": 51.332, + "eval_steps_per_second": 2.567, + "eval_token_accuracy": 0.0003898803530540306, + "step": 23000 + }, + { + "epoch": 3.335145124470051, + "grad_norm": 1.2621514797210693, + "learning_rate": 1.665893607769242e-05, + "loss": 0.498, + "step": 23010 + }, + { + "epoch": 3.336594557379425, + "grad_norm": 2.548382043838501, + "learning_rate": 1.6644441223365707e-05, + "loss": 0.5305, + "step": 23020 + }, + { + "epoch": 3.3380439902887993, + "grad_norm": 1.046783208847046, + "learning_rate": 1.662994636903899e-05, + "loss": 0.4193, + "step": 23030 + }, + { + "epoch": 3.339493423198174, + "grad_norm": 0.898941159248352, + "learning_rate": 1.6615451514712276e-05, + "loss": 0.4921, + "step": 23040 + }, + { + "epoch": 3.340942856107548, + "grad_norm": 1.0653246641159058, + "learning_rate": 1.6600956660385563e-05, + "loss": 0.5408, + "step": 23050 + }, + { + "epoch": 3.342392289016922, + "grad_norm": 1.2510879039764404, + "learning_rate": 1.658646180605885e-05, + "loss": 0.4975, + "step": 23060 + }, + { + "epoch": 3.3438417219262964, + "grad_norm": 1.3344258069992065, + "learning_rate": 1.6571966951732135e-05, + "loss": 0.4376, + "step": 23070 + }, + { + "epoch": 3.3452911548356705, + "grad_norm": 1.0579745769500732, + "learning_rate": 1.655747209740542e-05, + "loss": 0.4425, + "step": 23080 + }, + { + "epoch": 3.3467405877450447, + "grad_norm": 1.7280727624893188, + "learning_rate": 1.6542977243078708e-05, + "loss": 0.5217, + "step": 23090 + }, + { + "epoch": 3.348190020654419, + "grad_norm": 0.845836341381073, + "learning_rate": 1.6528482388751995e-05, + "loss": 0.4382, + "step": 23100 + }, + { + "epoch": 3.349639453563793, + "grad_norm": 2.331657648086548, + "learning_rate": 1.651398753442528e-05, + "loss": 0.4631, + "step": 23110 + }, + { + "epoch": 3.3510888864731676, + "grad_norm": 1.2817667722702026, + "learning_rate": 1.6499492680098567e-05, + "loss": 0.4699, + "step": 23120 + }, + { + "epoch": 3.3525383193825418, + "grad_norm": 0.9075936675071716, + "learning_rate": 1.6484997825771852e-05, + "loss": 0.4654, + "step": 23130 + }, + { + "epoch": 3.353987752291916, + "grad_norm": 1.164581537246704, + "learning_rate": 1.6470502971445136e-05, + "loss": 0.4603, + "step": 23140 + }, + { + "epoch": 3.35543718520129, + "grad_norm": 2.0525588989257812, + "learning_rate": 1.6456008117118424e-05, + "loss": 0.4169, + "step": 23150 + }, + { + "epoch": 3.3568866181106642, + "grad_norm": 1.2242342233657837, + "learning_rate": 1.644151326279171e-05, + "loss": 0.4956, + "step": 23160 + }, + { + "epoch": 3.3583360510200384, + "grad_norm": 1.5872743129730225, + "learning_rate": 1.6427018408464996e-05, + "loss": 0.4644, + "step": 23170 + }, + { + "epoch": 3.3597854839294126, + "grad_norm": 2.8896257877349854, + "learning_rate": 1.641252355413828e-05, + "loss": 0.4392, + "step": 23180 + }, + { + "epoch": 3.3612349168387867, + "grad_norm": 3.553441286087036, + "learning_rate": 1.6398028699811568e-05, + "loss": 0.4607, + "step": 23190 + }, + { + "epoch": 3.362684349748161, + "grad_norm": 1.3294955492019653, + "learning_rate": 1.6383533845484852e-05, + "loss": 0.4963, + "step": 23200 + }, + { + "epoch": 3.364133782657535, + "grad_norm": 0.9303203821182251, + "learning_rate": 1.636903899115814e-05, + "loss": 0.4213, + "step": 23210 + }, + { + "epoch": 3.365583215566909, + "grad_norm": 1.2010650634765625, + "learning_rate": 1.6354544136831425e-05, + "loss": 0.3944, + "step": 23220 + }, + { + "epoch": 3.367032648476284, + "grad_norm": 1.1837284564971924, + "learning_rate": 1.634004928250471e-05, + "loss": 0.4574, + "step": 23230 + }, + { + "epoch": 3.368482081385658, + "grad_norm": 0.9718520045280457, + "learning_rate": 1.6325554428177997e-05, + "loss": 0.4507, + "step": 23240 + }, + { + "epoch": 3.369931514295032, + "grad_norm": 3.4572932720184326, + "learning_rate": 1.6311059573851284e-05, + "loss": 0.4998, + "step": 23250 + }, + { + "epoch": 3.3713809472044063, + "grad_norm": 3.0571515560150146, + "learning_rate": 1.6296564719524572e-05, + "loss": 0.4589, + "step": 23260 + }, + { + "epoch": 3.3728303801137804, + "grad_norm": 1.004155158996582, + "learning_rate": 1.6282069865197857e-05, + "loss": 0.4638, + "step": 23270 + }, + { + "epoch": 3.3742798130231546, + "grad_norm": 3.1334617137908936, + "learning_rate": 1.626757501087114e-05, + "loss": 0.5358, + "step": 23280 + }, + { + "epoch": 3.3757292459325288, + "grad_norm": 3.0823802947998047, + "learning_rate": 1.625308015654443e-05, + "loss": 0.4908, + "step": 23290 + }, + { + "epoch": 3.377178678841903, + "grad_norm": 0.82496178150177, + "learning_rate": 1.6238585302217713e-05, + "loss": 0.4883, + "step": 23300 + }, + { + "epoch": 3.3786281117512775, + "grad_norm": 1.1200013160705566, + "learning_rate": 1.6224090447891e-05, + "loss": 0.478, + "step": 23310 + }, + { + "epoch": 3.3800775446606517, + "grad_norm": 3.1949923038482666, + "learning_rate": 1.6209595593564285e-05, + "loss": 0.5102, + "step": 23320 + }, + { + "epoch": 3.381526977570026, + "grad_norm": 2.673584461212158, + "learning_rate": 1.619510073923757e-05, + "loss": 0.5098, + "step": 23330 + }, + { + "epoch": 3.3829764104794, + "grad_norm": 0.9602708220481873, + "learning_rate": 1.6180605884910857e-05, + "loss": 0.4518, + "step": 23340 + }, + { + "epoch": 3.384425843388774, + "grad_norm": 2.1943914890289307, + "learning_rate": 1.6166111030584145e-05, + "loss": 0.4692, + "step": 23350 + }, + { + "epoch": 3.3858752762981483, + "grad_norm": 2.179941415786743, + "learning_rate": 1.615161617625743e-05, + "loss": 0.4706, + "step": 23360 + }, + { + "epoch": 3.3873247092075225, + "grad_norm": 1.5116748809814453, + "learning_rate": 1.6137121321930714e-05, + "loss": 0.4411, + "step": 23370 + }, + { + "epoch": 3.3887741421168966, + "grad_norm": 0.8765755295753479, + "learning_rate": 1.6122626467604e-05, + "loss": 0.4853, + "step": 23380 + }, + { + "epoch": 3.390223575026271, + "grad_norm": 1.2487914562225342, + "learning_rate": 1.610813161327729e-05, + "loss": 0.4711, + "step": 23390 + }, + { + "epoch": 3.391673007935645, + "grad_norm": 0.8878582715988159, + "learning_rate": 1.6093636758950574e-05, + "loss": 0.4899, + "step": 23400 + }, + { + "epoch": 3.3931224408450196, + "grad_norm": 1.0657848119735718, + "learning_rate": 1.6079141904623858e-05, + "loss": 0.511, + "step": 23410 + }, + { + "epoch": 3.3945718737543937, + "grad_norm": 1.696035385131836, + "learning_rate": 1.6064647050297146e-05, + "loss": 0.4409, + "step": 23420 + }, + { + "epoch": 3.396021306663768, + "grad_norm": 1.4846292734146118, + "learning_rate": 1.605015219597043e-05, + "loss": 0.4929, + "step": 23430 + }, + { + "epoch": 3.397470739573142, + "grad_norm": 5.0850419998168945, + "learning_rate": 1.6035657341643718e-05, + "loss": 0.5647, + "step": 23440 + }, + { + "epoch": 3.398920172482516, + "grad_norm": 3.0048062801361084, + "learning_rate": 1.6021162487317005e-05, + "loss": 0.4346, + "step": 23450 + }, + { + "epoch": 3.4003696053918904, + "grad_norm": 0.7665842175483704, + "learning_rate": 1.600666763299029e-05, + "loss": 0.4606, + "step": 23460 + }, + { + "epoch": 3.4018190383012645, + "grad_norm": 2.541459798812866, + "learning_rate": 1.5992172778663574e-05, + "loss": 0.5251, + "step": 23470 + }, + { + "epoch": 3.4032684712106387, + "grad_norm": 1.6156889200210571, + "learning_rate": 1.5977677924336862e-05, + "loss": 0.4589, + "step": 23480 + }, + { + "epoch": 3.4047179041200133, + "grad_norm": 3.334068536758423, + "learning_rate": 1.5963183070010146e-05, + "loss": 0.4495, + "step": 23490 + }, + { + "epoch": 3.4061673370293875, + "grad_norm": 1.9630388021469116, + "learning_rate": 1.5948688215683434e-05, + "loss": 0.4894, + "step": 23500 + }, + { + "epoch": 3.4076167699387616, + "grad_norm": 1.2060139179229736, + "learning_rate": 1.593419336135672e-05, + "loss": 0.4561, + "step": 23510 + }, + { + "epoch": 3.4090662028481358, + "grad_norm": 1.082797884941101, + "learning_rate": 1.5919698507030006e-05, + "loss": 0.4288, + "step": 23520 + }, + { + "epoch": 3.41051563575751, + "grad_norm": 2.201314926147461, + "learning_rate": 1.590520365270329e-05, + "loss": 0.4776, + "step": 23530 + }, + { + "epoch": 3.411965068666884, + "grad_norm": 0.8573411107063293, + "learning_rate": 1.5890708798376578e-05, + "loss": 0.4428, + "step": 23540 + }, + { + "epoch": 3.4134145015762583, + "grad_norm": 0.9044166803359985, + "learning_rate": 1.5876213944049863e-05, + "loss": 0.4825, + "step": 23550 + }, + { + "epoch": 3.4148639344856324, + "grad_norm": 2.2872838973999023, + "learning_rate": 1.5861719089723147e-05, + "loss": 0.4554, + "step": 23560 + }, + { + "epoch": 3.4163133673950066, + "grad_norm": 1.6215200424194336, + "learning_rate": 1.5847224235396435e-05, + "loss": 0.4603, + "step": 23570 + }, + { + "epoch": 3.4177628003043807, + "grad_norm": 1.0931036472320557, + "learning_rate": 1.5832729381069722e-05, + "loss": 0.4911, + "step": 23580 + }, + { + "epoch": 3.419212233213755, + "grad_norm": 0.8152222037315369, + "learning_rate": 1.5818234526743007e-05, + "loss": 0.4648, + "step": 23590 + }, + { + "epoch": 3.4206616661231295, + "grad_norm": 2.398775815963745, + "learning_rate": 1.580373967241629e-05, + "loss": 0.4543, + "step": 23600 + }, + { + "epoch": 3.4221110990325037, + "grad_norm": 1.1491453647613525, + "learning_rate": 1.578924481808958e-05, + "loss": 0.4195, + "step": 23610 + }, + { + "epoch": 3.423560531941878, + "grad_norm": 1.175913691520691, + "learning_rate": 1.5774749963762863e-05, + "loss": 0.5195, + "step": 23620 + }, + { + "epoch": 3.425009964851252, + "grad_norm": 0.8074049949645996, + "learning_rate": 1.576025510943615e-05, + "loss": 0.5339, + "step": 23630 + }, + { + "epoch": 3.426459397760626, + "grad_norm": 1.1793062686920166, + "learning_rate": 1.574576025510944e-05, + "loss": 0.4522, + "step": 23640 + }, + { + "epoch": 3.4279088306700003, + "grad_norm": 0.9329249858856201, + "learning_rate": 1.5731265400782723e-05, + "loss": 0.4521, + "step": 23650 + }, + { + "epoch": 3.4293582635793745, + "grad_norm": 1.9298598766326904, + "learning_rate": 1.5716770546456008e-05, + "loss": 0.4866, + "step": 23660 + }, + { + "epoch": 3.4308076964887486, + "grad_norm": 1.1167508363723755, + "learning_rate": 1.5702275692129295e-05, + "loss": 0.4857, + "step": 23670 + }, + { + "epoch": 3.432257129398123, + "grad_norm": 0.9384124875068665, + "learning_rate": 1.5687780837802583e-05, + "loss": 0.5053, + "step": 23680 + }, + { + "epoch": 3.4337065623074974, + "grad_norm": 0.8454510569572449, + "learning_rate": 1.5673285983475867e-05, + "loss": 0.4679, + "step": 23690 + }, + { + "epoch": 3.4351559952168715, + "grad_norm": 1.1908695697784424, + "learning_rate": 1.5658791129149152e-05, + "loss": 0.4517, + "step": 23700 + }, + { + "epoch": 3.4366054281262457, + "grad_norm": 0.7445909976959229, + "learning_rate": 1.564429627482244e-05, + "loss": 0.4075, + "step": 23710 + }, + { + "epoch": 3.43805486103562, + "grad_norm": 2.431640148162842, + "learning_rate": 1.5629801420495724e-05, + "loss": 0.4361, + "step": 23720 + }, + { + "epoch": 3.439504293944994, + "grad_norm": 2.70326828956604, + "learning_rate": 1.561530656616901e-05, + "loss": 0.472, + "step": 23730 + }, + { + "epoch": 3.440953726854368, + "grad_norm": 1.1950196027755737, + "learning_rate": 1.5600811711842296e-05, + "loss": 0.5528, + "step": 23740 + }, + { + "epoch": 3.4424031597637423, + "grad_norm": 2.5100083351135254, + "learning_rate": 1.558631685751558e-05, + "loss": 0.4966, + "step": 23750 + }, + { + "epoch": 3.4438525926731165, + "grad_norm": 0.8543462753295898, + "learning_rate": 1.5571822003188868e-05, + "loss": 0.4825, + "step": 23760 + }, + { + "epoch": 3.4453020255824907, + "grad_norm": 0.789966344833374, + "learning_rate": 1.5557327148862156e-05, + "loss": 0.4606, + "step": 23770 + }, + { + "epoch": 3.446751458491865, + "grad_norm": 1.8987724781036377, + "learning_rate": 1.554283229453544e-05, + "loss": 0.4361, + "step": 23780 + }, + { + "epoch": 3.4482008914012394, + "grad_norm": 1.3493493795394897, + "learning_rate": 1.5528337440208725e-05, + "loss": 0.4777, + "step": 23790 + }, + { + "epoch": 3.4496503243106136, + "grad_norm": 1.1933780908584595, + "learning_rate": 1.5513842585882012e-05, + "loss": 0.4228, + "step": 23800 + }, + { + "epoch": 3.4510997572199877, + "grad_norm": 1.8975234031677246, + "learning_rate": 1.54993477315553e-05, + "loss": 0.4701, + "step": 23810 + }, + { + "epoch": 3.452549190129362, + "grad_norm": 1.3678011894226074, + "learning_rate": 1.5484852877228584e-05, + "loss": 0.4882, + "step": 23820 + }, + { + "epoch": 3.453998623038736, + "grad_norm": 1.091180443763733, + "learning_rate": 1.5470358022901872e-05, + "loss": 0.4821, + "step": 23830 + }, + { + "epoch": 3.45544805594811, + "grad_norm": 3.8083252906799316, + "learning_rate": 1.5455863168575156e-05, + "loss": 0.4612, + "step": 23840 + }, + { + "epoch": 3.4568974888574844, + "grad_norm": 1.3760696649551392, + "learning_rate": 1.544136831424844e-05, + "loss": 0.4554, + "step": 23850 + }, + { + "epoch": 3.4583469217668585, + "grad_norm": 1.2829561233520508, + "learning_rate": 1.542687345992173e-05, + "loss": 0.4219, + "step": 23860 + }, + { + "epoch": 3.459796354676233, + "grad_norm": 2.6034016609191895, + "learning_rate": 1.5412378605595016e-05, + "loss": 0.46, + "step": 23870 + }, + { + "epoch": 3.4612457875856073, + "grad_norm": 3.3234379291534424, + "learning_rate": 1.53978837512683e-05, + "loss": 0.5564, + "step": 23880 + }, + { + "epoch": 3.4626952204949815, + "grad_norm": 0.9328240156173706, + "learning_rate": 1.5383388896941585e-05, + "loss": 0.4576, + "step": 23890 + }, + { + "epoch": 3.4641446534043556, + "grad_norm": 1.895174264907837, + "learning_rate": 1.5368894042614873e-05, + "loss": 0.4811, + "step": 23900 + }, + { + "epoch": 3.46559408631373, + "grad_norm": 1.6813945770263672, + "learning_rate": 1.5354399188288157e-05, + "loss": 0.4107, + "step": 23910 + }, + { + "epoch": 3.467043519223104, + "grad_norm": 1.6949057579040527, + "learning_rate": 1.5339904333961445e-05, + "loss": 0.4491, + "step": 23920 + }, + { + "epoch": 3.468492952132478, + "grad_norm": 0.9117002487182617, + "learning_rate": 1.532540947963473e-05, + "loss": 0.4608, + "step": 23930 + }, + { + "epoch": 3.4699423850418523, + "grad_norm": 1.4225014448165894, + "learning_rate": 1.5310914625308017e-05, + "loss": 0.4462, + "step": 23940 + }, + { + "epoch": 3.4713918179512264, + "grad_norm": 0.9624481201171875, + "learning_rate": 1.52964197709813e-05, + "loss": 0.4547, + "step": 23950 + }, + { + "epoch": 3.4728412508606006, + "grad_norm": 1.0208147764205933, + "learning_rate": 1.528192491665459e-05, + "loss": 0.4626, + "step": 23960 + }, + { + "epoch": 3.474290683769975, + "grad_norm": 1.079161524772644, + "learning_rate": 1.5267430062327877e-05, + "loss": 0.4571, + "step": 23970 + }, + { + "epoch": 3.4757401166793493, + "grad_norm": 2.516960859298706, + "learning_rate": 1.525293520800116e-05, + "loss": 0.4935, + "step": 23980 + }, + { + "epoch": 3.4771895495887235, + "grad_norm": 1.0414551496505737, + "learning_rate": 1.5238440353674446e-05, + "loss": 0.5177, + "step": 23990 + }, + { + "epoch": 3.4786389824980977, + "grad_norm": 2.013028383255005, + "learning_rate": 1.5223945499347733e-05, + "loss": 0.5026, + "step": 24000 + }, + { + "epoch": 3.4786389824980977, + "eval_loss": 0.7205393314361572, + "eval_runtime": 671.6003, + "eval_samples_per_second": 51.362, + "eval_steps_per_second": 2.568, + "eval_token_accuracy": 0.0003924538207309549, + "step": 24000 + }, + { + "epoch": 3.480088415407472, + "grad_norm": 1.3144657611846924, + "learning_rate": 1.5209450645021018e-05, + "loss": 0.4393, + "step": 24010 + }, + { + "epoch": 3.481537848316846, + "grad_norm": 1.0083630084991455, + "learning_rate": 1.5194955790694304e-05, + "loss": 0.5047, + "step": 24020 + }, + { + "epoch": 3.48298728122622, + "grad_norm": 1.2430204153060913, + "learning_rate": 1.5180460936367591e-05, + "loss": 0.4527, + "step": 24030 + }, + { + "epoch": 3.4844367141355943, + "grad_norm": 2.474379062652588, + "learning_rate": 1.5165966082040876e-05, + "loss": 0.4234, + "step": 24040 + }, + { + "epoch": 3.4858861470449685, + "grad_norm": 1.0105928182601929, + "learning_rate": 1.5151471227714162e-05, + "loss": 0.4968, + "step": 24050 + }, + { + "epoch": 3.487335579954343, + "grad_norm": 1.6667371988296509, + "learning_rate": 1.5136976373387448e-05, + "loss": 0.4341, + "step": 24060 + }, + { + "epoch": 3.4887850128637172, + "grad_norm": 5.3816914558410645, + "learning_rate": 1.5122481519060736e-05, + "loss": 0.4355, + "step": 24070 + }, + { + "epoch": 3.4902344457730914, + "grad_norm": 1.5800060033798218, + "learning_rate": 1.510798666473402e-05, + "loss": 0.4395, + "step": 24080 + }, + { + "epoch": 3.4916838786824655, + "grad_norm": 2.9925456047058105, + "learning_rate": 1.5093491810407306e-05, + "loss": 0.4635, + "step": 24090 + }, + { + "epoch": 3.4931333115918397, + "grad_norm": 0.9235763549804688, + "learning_rate": 1.5078996956080594e-05, + "loss": 0.4665, + "step": 24100 + }, + { + "epoch": 3.494582744501214, + "grad_norm": 2.027843952178955, + "learning_rate": 1.5064502101753877e-05, + "loss": 0.401, + "step": 24110 + }, + { + "epoch": 3.496032177410588, + "grad_norm": 0.9020404815673828, + "learning_rate": 1.5050007247427164e-05, + "loss": 0.4479, + "step": 24120 + }, + { + "epoch": 3.497481610319962, + "grad_norm": 0.9779691696166992, + "learning_rate": 1.503551239310045e-05, + "loss": 0.4969, + "step": 24130 + }, + { + "epoch": 3.4989310432293363, + "grad_norm": 1.1350491046905518, + "learning_rate": 1.5021017538773735e-05, + "loss": 0.4676, + "step": 24140 + }, + { + "epoch": 3.5003804761387105, + "grad_norm": 1.9279160499572754, + "learning_rate": 1.5006522684447022e-05, + "loss": 0.439, + "step": 24150 + }, + { + "epoch": 3.5018299090480847, + "grad_norm": 1.7692646980285645, + "learning_rate": 1.4992027830120308e-05, + "loss": 0.4132, + "step": 24160 + }, + { + "epoch": 3.5032793419574593, + "grad_norm": 1.7417184114456177, + "learning_rate": 1.4977532975793593e-05, + "loss": 0.4349, + "step": 24170 + }, + { + "epoch": 3.5047287748668334, + "grad_norm": 0.9393298625946045, + "learning_rate": 1.4963038121466879e-05, + "loss": 0.4626, + "step": 24180 + }, + { + "epoch": 3.5061782077762076, + "grad_norm": 1.083738923072815, + "learning_rate": 1.4948543267140167e-05, + "loss": 0.4049, + "step": 24190 + }, + { + "epoch": 3.5076276406855817, + "grad_norm": 1.1180866956710815, + "learning_rate": 1.4934048412813451e-05, + "loss": 0.4522, + "step": 24200 + }, + { + "epoch": 3.509077073594956, + "grad_norm": 0.866869330406189, + "learning_rate": 1.4919553558486737e-05, + "loss": 0.4841, + "step": 24210 + }, + { + "epoch": 3.51052650650433, + "grad_norm": 1.915380835533142, + "learning_rate": 1.4905058704160025e-05, + "loss": 0.4705, + "step": 24220 + }, + { + "epoch": 3.5119759394137042, + "grad_norm": 1.2995796203613281, + "learning_rate": 1.489056384983331e-05, + "loss": 0.4524, + "step": 24230 + }, + { + "epoch": 3.513425372323079, + "grad_norm": 1.4961055517196655, + "learning_rate": 1.4876068995506595e-05, + "loss": 0.4159, + "step": 24240 + }, + { + "epoch": 3.514874805232453, + "grad_norm": 1.1718025207519531, + "learning_rate": 1.4861574141179881e-05, + "loss": 0.4582, + "step": 24250 + }, + { + "epoch": 3.516324238141827, + "grad_norm": 1.1462128162384033, + "learning_rate": 1.4847079286853169e-05, + "loss": 0.4214, + "step": 24260 + }, + { + "epoch": 3.5177736710512013, + "grad_norm": 1.2410391569137573, + "learning_rate": 1.4832584432526453e-05, + "loss": 0.4753, + "step": 24270 + }, + { + "epoch": 3.5192231039605755, + "grad_norm": 0.9186839461326599, + "learning_rate": 1.481808957819974e-05, + "loss": 0.5567, + "step": 24280 + }, + { + "epoch": 3.5206725368699496, + "grad_norm": 1.1599667072296143, + "learning_rate": 1.4803594723873027e-05, + "loss": 0.398, + "step": 24290 + }, + { + "epoch": 3.522121969779324, + "grad_norm": 1.4874101877212524, + "learning_rate": 1.478909986954631e-05, + "loss": 0.4761, + "step": 24300 + }, + { + "epoch": 3.523571402688698, + "grad_norm": 1.1158050298690796, + "learning_rate": 1.4774605015219598e-05, + "loss": 0.469, + "step": 24310 + }, + { + "epoch": 3.525020835598072, + "grad_norm": 1.2028007507324219, + "learning_rate": 1.4760110160892884e-05, + "loss": 0.4738, + "step": 24320 + }, + { + "epoch": 3.5264702685074463, + "grad_norm": 2.6865055561065674, + "learning_rate": 1.4745615306566168e-05, + "loss": 0.5044, + "step": 24330 + }, + { + "epoch": 3.5279197014168204, + "grad_norm": 1.1461601257324219, + "learning_rate": 1.4731120452239456e-05, + "loss": 0.4434, + "step": 24340 + }, + { + "epoch": 3.529369134326195, + "grad_norm": 1.2691434621810913, + "learning_rate": 1.4716625597912742e-05, + "loss": 0.4977, + "step": 24350 + }, + { + "epoch": 3.530818567235569, + "grad_norm": 1.963690996170044, + "learning_rate": 1.470213074358603e-05, + "loss": 0.4618, + "step": 24360 + }, + { + "epoch": 3.5322680001449434, + "grad_norm": 1.308236837387085, + "learning_rate": 1.4687635889259312e-05, + "loss": 0.4043, + "step": 24370 + }, + { + "epoch": 3.5337174330543175, + "grad_norm": 2.2788124084472656, + "learning_rate": 1.46731410349326e-05, + "loss": 0.4942, + "step": 24380 + }, + { + "epoch": 3.5351668659636917, + "grad_norm": 2.374098300933838, + "learning_rate": 1.4658646180605886e-05, + "loss": 0.5085, + "step": 24390 + }, + { + "epoch": 3.536616298873066, + "grad_norm": 0.970146119594574, + "learning_rate": 1.464415132627917e-05, + "loss": 0.5168, + "step": 24400 + }, + { + "epoch": 3.53806573178244, + "grad_norm": 0.8941761255264282, + "learning_rate": 1.4629656471952458e-05, + "loss": 0.5228, + "step": 24410 + }, + { + "epoch": 3.5395151646918146, + "grad_norm": 3.1685407161712646, + "learning_rate": 1.4615161617625744e-05, + "loss": 0.4554, + "step": 24420 + }, + { + "epoch": 3.5409645976011888, + "grad_norm": 2.12044620513916, + "learning_rate": 1.4600666763299029e-05, + "loss": 0.449, + "step": 24430 + }, + { + "epoch": 3.542414030510563, + "grad_norm": 1.0807843208312988, + "learning_rate": 1.4586171908972315e-05, + "loss": 0.4941, + "step": 24440 + }, + { + "epoch": 3.543863463419937, + "grad_norm": 0.8228974938392639, + "learning_rate": 1.4571677054645602e-05, + "loss": 0.4542, + "step": 24450 + }, + { + "epoch": 3.5453128963293112, + "grad_norm": 1.6381698846817017, + "learning_rate": 1.4557182200318887e-05, + "loss": 0.436, + "step": 24460 + }, + { + "epoch": 3.5467623292386854, + "grad_norm": 0.8940735459327698, + "learning_rate": 1.4542687345992173e-05, + "loss": 0.415, + "step": 24470 + }, + { + "epoch": 3.5482117621480596, + "grad_norm": 2.5338544845581055, + "learning_rate": 1.452819249166546e-05, + "loss": 0.4676, + "step": 24480 + }, + { + "epoch": 3.5496611950574337, + "grad_norm": 2.5289785861968994, + "learning_rate": 1.4513697637338747e-05, + "loss": 0.4682, + "step": 24490 + }, + { + "epoch": 3.551110627966808, + "grad_norm": 1.0406261682510376, + "learning_rate": 1.4499202783012031e-05, + "loss": 0.428, + "step": 24500 + }, + { + "epoch": 3.552560060876182, + "grad_norm": 2.039503335952759, + "learning_rate": 1.4484707928685317e-05, + "loss": 0.437, + "step": 24510 + }, + { + "epoch": 3.554009493785556, + "grad_norm": 1.3291372060775757, + "learning_rate": 1.4470213074358605e-05, + "loss": 0.4816, + "step": 24520 + }, + { + "epoch": 3.5554589266949304, + "grad_norm": 2.6131370067596436, + "learning_rate": 1.4455718220031889e-05, + "loss": 0.4719, + "step": 24530 + }, + { + "epoch": 3.556908359604305, + "grad_norm": 1.0953946113586426, + "learning_rate": 1.4441223365705175e-05, + "loss": 0.4539, + "step": 24540 + }, + { + "epoch": 3.558357792513679, + "grad_norm": 0.8537795543670654, + "learning_rate": 1.4426728511378463e-05, + "loss": 0.4534, + "step": 24550 + }, + { + "epoch": 3.5598072254230533, + "grad_norm": 1.268684983253479, + "learning_rate": 1.4412233657051746e-05, + "loss": 0.4564, + "step": 24560 + }, + { + "epoch": 3.5612566583324274, + "grad_norm": 1.0174052715301514, + "learning_rate": 1.4397738802725033e-05, + "loss": 0.4801, + "step": 24570 + }, + { + "epoch": 3.5627060912418016, + "grad_norm": 0.8783228397369385, + "learning_rate": 1.438324394839832e-05, + "loss": 0.4757, + "step": 24580 + }, + { + "epoch": 3.5641555241511758, + "grad_norm": 1.3714172840118408, + "learning_rate": 1.4368749094071604e-05, + "loss": 0.4513, + "step": 24590 + }, + { + "epoch": 3.56560495706055, + "grad_norm": 1.7273422479629517, + "learning_rate": 1.4354254239744891e-05, + "loss": 0.4579, + "step": 24600 + }, + { + "epoch": 3.5670543899699245, + "grad_norm": 3.6264455318450928, + "learning_rate": 1.4339759385418177e-05, + "loss": 0.4518, + "step": 24610 + }, + { + "epoch": 3.5685038228792987, + "grad_norm": 1.7638481855392456, + "learning_rate": 1.4325264531091465e-05, + "loss": 0.4583, + "step": 24620 + }, + { + "epoch": 3.569953255788673, + "grad_norm": 1.263018012046814, + "learning_rate": 1.4310769676764748e-05, + "loss": 0.425, + "step": 24630 + }, + { + "epoch": 3.571402688698047, + "grad_norm": 1.0282634496688843, + "learning_rate": 1.4296274822438036e-05, + "loss": 0.4828, + "step": 24640 + }, + { + "epoch": 3.572852121607421, + "grad_norm": 1.0690590143203735, + "learning_rate": 1.4281779968111322e-05, + "loss": 0.4938, + "step": 24650 + }, + { + "epoch": 3.5743015545167953, + "grad_norm": 1.256479024887085, + "learning_rate": 1.4267285113784606e-05, + "loss": 0.4262, + "step": 24660 + }, + { + "epoch": 3.5757509874261695, + "grad_norm": 2.001762628555298, + "learning_rate": 1.4252790259457894e-05, + "loss": 0.4471, + "step": 24670 + }, + { + "epoch": 3.5772004203355436, + "grad_norm": 0.9218072295188904, + "learning_rate": 1.423829540513118e-05, + "loss": 0.3911, + "step": 24680 + }, + { + "epoch": 3.578649853244918, + "grad_norm": 2.517094135284424, + "learning_rate": 1.4223800550804464e-05, + "loss": 0.4531, + "step": 24690 + }, + { + "epoch": 3.580099286154292, + "grad_norm": 2.5413553714752197, + "learning_rate": 1.420930569647775e-05, + "loss": 0.4515, + "step": 24700 + }, + { + "epoch": 3.581548719063666, + "grad_norm": 2.471299409866333, + "learning_rate": 1.4194810842151038e-05, + "loss": 0.4395, + "step": 24710 + }, + { + "epoch": 3.5829981519730403, + "grad_norm": 2.179800271987915, + "learning_rate": 1.4180315987824322e-05, + "loss": 0.3825, + "step": 24720 + }, + { + "epoch": 3.584447584882415, + "grad_norm": 2.7383923530578613, + "learning_rate": 1.4165821133497608e-05, + "loss": 0.473, + "step": 24730 + }, + { + "epoch": 3.585897017791789, + "grad_norm": 1.0128542184829712, + "learning_rate": 1.4151326279170896e-05, + "loss": 0.4712, + "step": 24740 + }, + { + "epoch": 3.587346450701163, + "grad_norm": 2.4596316814422607, + "learning_rate": 1.4136831424844179e-05, + "loss": 0.4806, + "step": 24750 + }, + { + "epoch": 3.5887958836105374, + "grad_norm": 1.1265087127685547, + "learning_rate": 1.4122336570517467e-05, + "loss": 0.4706, + "step": 24760 + }, + { + "epoch": 3.5902453165199115, + "grad_norm": 2.9591550827026367, + "learning_rate": 1.4107841716190753e-05, + "loss": 0.4352, + "step": 24770 + }, + { + "epoch": 3.5916947494292857, + "grad_norm": 1.0448544025421143, + "learning_rate": 1.409334686186404e-05, + "loss": 0.4267, + "step": 24780 + }, + { + "epoch": 3.59314418233866, + "grad_norm": 2.992042064666748, + "learning_rate": 1.4078852007537325e-05, + "loss": 0.4428, + "step": 24790 + }, + { + "epoch": 3.5945936152480344, + "grad_norm": 2.647773265838623, + "learning_rate": 1.406435715321061e-05, + "loss": 0.4718, + "step": 24800 + }, + { + "epoch": 3.5960430481574086, + "grad_norm": 1.7813819646835327, + "learning_rate": 1.4049862298883899e-05, + "loss": 0.4702, + "step": 24810 + }, + { + "epoch": 3.5974924810667828, + "grad_norm": 0.9270603656768799, + "learning_rate": 1.4035367444557181e-05, + "loss": 0.4603, + "step": 24820 + }, + { + "epoch": 3.598941913976157, + "grad_norm": 5.342667102813721, + "learning_rate": 1.4020872590230469e-05, + "loss": 0.4678, + "step": 24830 + }, + { + "epoch": 3.600391346885531, + "grad_norm": 0.9789586663246155, + "learning_rate": 1.4006377735903755e-05, + "loss": 0.4544, + "step": 24840 + }, + { + "epoch": 3.6018407797949052, + "grad_norm": 1.0158215761184692, + "learning_rate": 1.399188288157704e-05, + "loss": 0.4339, + "step": 24850 + }, + { + "epoch": 3.6032902127042794, + "grad_norm": 1.652602195739746, + "learning_rate": 1.3977388027250327e-05, + "loss": 0.457, + "step": 24860 + }, + { + "epoch": 3.6047396456136536, + "grad_norm": 1.1833007335662842, + "learning_rate": 1.3962893172923613e-05, + "loss": 0.3642, + "step": 24870 + }, + { + "epoch": 3.6061890785230277, + "grad_norm": 2.9165823459625244, + "learning_rate": 1.3948398318596898e-05, + "loss": 0.4549, + "step": 24880 + }, + { + "epoch": 3.607638511432402, + "grad_norm": 1.0205824375152588, + "learning_rate": 1.3933903464270184e-05, + "loss": 0.5003, + "step": 24890 + }, + { + "epoch": 3.609087944341776, + "grad_norm": 0.9283063411712646, + "learning_rate": 1.3919408609943471e-05, + "loss": 0.4788, + "step": 24900 + }, + { + "epoch": 3.6105373772511506, + "grad_norm": 2.781982898712158, + "learning_rate": 1.3904913755616757e-05, + "loss": 0.4448, + "step": 24910 + }, + { + "epoch": 3.611986810160525, + "grad_norm": 1.2095791101455688, + "learning_rate": 1.3890418901290042e-05, + "loss": 0.4371, + "step": 24920 + }, + { + "epoch": 3.613436243069899, + "grad_norm": 1.2724969387054443, + "learning_rate": 1.387592404696333e-05, + "loss": 0.4208, + "step": 24930 + }, + { + "epoch": 3.614885675979273, + "grad_norm": 0.857598066329956, + "learning_rate": 1.3861429192636616e-05, + "loss": 0.433, + "step": 24940 + }, + { + "epoch": 3.6163351088886473, + "grad_norm": 3.5676772594451904, + "learning_rate": 1.38469343383099e-05, + "loss": 0.4787, + "step": 24950 + }, + { + "epoch": 3.6177845417980214, + "grad_norm": 2.6916184425354004, + "learning_rate": 1.3832439483983186e-05, + "loss": 0.4176, + "step": 24960 + }, + { + "epoch": 3.6192339747073956, + "grad_norm": 1.305999755859375, + "learning_rate": 1.3817944629656474e-05, + "loss": 0.4208, + "step": 24970 + }, + { + "epoch": 3.62068340761677, + "grad_norm": 0.7682250738143921, + "learning_rate": 1.3803449775329758e-05, + "loss": 0.5436, + "step": 24980 + }, + { + "epoch": 3.6221328405261444, + "grad_norm": 4.667750835418701, + "learning_rate": 1.3788954921003044e-05, + "loss": 0.4955, + "step": 24990 + }, + { + "epoch": 3.6235822734355185, + "grad_norm": 1.856907844543457, + "learning_rate": 1.3774460066676332e-05, + "loss": 0.4153, + "step": 25000 + }, + { + "epoch": 3.6235822734355185, + "eval_loss": 0.7244583964347839, + "eval_runtime": 671.9338, + "eval_samples_per_second": 51.337, + "eval_steps_per_second": 2.567, + "eval_token_accuracy": 0.0003887544609453762, + "step": 25000 + }, + { + "epoch": 3.6250317063448927, + "grad_norm": 3.083552122116089, + "learning_rate": 1.3759965212349615e-05, + "loss": 0.4554, + "step": 25010 + }, + { + "epoch": 3.626481139254267, + "grad_norm": 2.2491211891174316, + "learning_rate": 1.3745470358022902e-05, + "loss": 0.4488, + "step": 25020 + }, + { + "epoch": 3.627930572163641, + "grad_norm": 1.8625367879867554, + "learning_rate": 1.3730975503696188e-05, + "loss": 0.4801, + "step": 25030 + }, + { + "epoch": 3.629380005073015, + "grad_norm": 1.394662857055664, + "learning_rate": 1.3716480649369476e-05, + "loss": 0.4696, + "step": 25040 + }, + { + "epoch": 3.6308294379823893, + "grad_norm": 2.025663137435913, + "learning_rate": 1.370198579504276e-05, + "loss": 0.5045, + "step": 25050 + }, + { + "epoch": 3.6322788708917635, + "grad_norm": 0.8624992370605469, + "learning_rate": 1.3687490940716046e-05, + "loss": 0.4331, + "step": 25060 + }, + { + "epoch": 3.6337283038011376, + "grad_norm": 1.1813052892684937, + "learning_rate": 1.3674445571822002e-05, + "loss": 0.4484, + "step": 25070 + }, + { + "epoch": 3.635177736710512, + "grad_norm": 2.8747799396514893, + "learning_rate": 1.365995071749529e-05, + "loss": 0.4863, + "step": 25080 + }, + { + "epoch": 3.636627169619886, + "grad_norm": 2.4268813133239746, + "learning_rate": 1.3645455863168576e-05, + "loss": 0.4073, + "step": 25090 + }, + { + "epoch": 3.6380766025292606, + "grad_norm": 2.0211360454559326, + "learning_rate": 1.363096100884186e-05, + "loss": 0.4031, + "step": 25100 + }, + { + "epoch": 3.6395260354386347, + "grad_norm": 1.2551864385604858, + "learning_rate": 1.3616466154515148e-05, + "loss": 0.4496, + "step": 25110 + }, + { + "epoch": 3.640975468348009, + "grad_norm": 1.1632791757583618, + "learning_rate": 1.3601971300188434e-05, + "loss": 0.4563, + "step": 25120 + }, + { + "epoch": 3.642424901257383, + "grad_norm": 1.17465078830719, + "learning_rate": 1.3587476445861719e-05, + "loss": 0.4994, + "step": 25130 + }, + { + "epoch": 3.643874334166757, + "grad_norm": 3.1795096397399902, + "learning_rate": 1.3572981591535005e-05, + "loss": 0.4513, + "step": 25140 + }, + { + "epoch": 3.6453237670761314, + "grad_norm": 2.9688425064086914, + "learning_rate": 1.3558486737208293e-05, + "loss": 0.4774, + "step": 25150 + }, + { + "epoch": 3.6467731999855055, + "grad_norm": 1.1822007894515991, + "learning_rate": 1.3543991882881577e-05, + "loss": 0.4411, + "step": 25160 + }, + { + "epoch": 3.64822263289488, + "grad_norm": 1.930434226989746, + "learning_rate": 1.3529497028554863e-05, + "loss": 0.48, + "step": 25170 + }, + { + "epoch": 3.6496720658042543, + "grad_norm": 2.6842262744903564, + "learning_rate": 1.351500217422815e-05, + "loss": 0.4468, + "step": 25180 + }, + { + "epoch": 3.6511214987136285, + "grad_norm": 2.477809190750122, + "learning_rate": 1.3500507319901437e-05, + "loss": 0.4006, + "step": 25190 + }, + { + "epoch": 3.6525709316230026, + "grad_norm": 1.701682209968567, + "learning_rate": 1.3486012465574721e-05, + "loss": 0.4274, + "step": 25200 + }, + { + "epoch": 3.6540203645323768, + "grad_norm": 2.3307900428771973, + "learning_rate": 1.3471517611248007e-05, + "loss": 0.462, + "step": 25210 + }, + { + "epoch": 3.655469797441751, + "grad_norm": 2.1112964153289795, + "learning_rate": 1.3457022756921295e-05, + "loss": 0.4564, + "step": 25220 + }, + { + "epoch": 3.656919230351125, + "grad_norm": 1.1961227655410767, + "learning_rate": 1.344252790259458e-05, + "loss": 0.4937, + "step": 25230 + }, + { + "epoch": 3.6583686632604993, + "grad_norm": 3.18811297416687, + "learning_rate": 1.3428033048267865e-05, + "loss": 0.4762, + "step": 25240 + }, + { + "epoch": 3.6598180961698734, + "grad_norm": 1.4780430793762207, + "learning_rate": 1.3413538193941153e-05, + "loss": 0.4465, + "step": 25250 + }, + { + "epoch": 3.6612675290792476, + "grad_norm": 2.3059370517730713, + "learning_rate": 1.3399043339614437e-05, + "loss": 0.4737, + "step": 25260 + }, + { + "epoch": 3.6627169619886217, + "grad_norm": 1.1574058532714844, + "learning_rate": 1.3384548485287723e-05, + "loss": 0.4123, + "step": 25270 + }, + { + "epoch": 3.664166394897996, + "grad_norm": 2.186791181564331, + "learning_rate": 1.337005363096101e-05, + "loss": 0.5098, + "step": 25280 + }, + { + "epoch": 3.6656158278073705, + "grad_norm": 1.4572516679763794, + "learning_rate": 1.3355558776634294e-05, + "loss": 0.4766, + "step": 25290 + }, + { + "epoch": 3.6670652607167447, + "grad_norm": 3.103440523147583, + "learning_rate": 1.3341063922307582e-05, + "loss": 0.4361, + "step": 25300 + }, + { + "epoch": 3.668514693626119, + "grad_norm": 2.1486423015594482, + "learning_rate": 1.3326569067980868e-05, + "loss": 0.4026, + "step": 25310 + }, + { + "epoch": 3.669964126535493, + "grad_norm": 1.179584264755249, + "learning_rate": 1.3312074213654152e-05, + "loss": 0.4723, + "step": 25320 + }, + { + "epoch": 3.671413559444867, + "grad_norm": 1.3920953273773193, + "learning_rate": 1.3297579359327438e-05, + "loss": 0.4538, + "step": 25330 + }, + { + "epoch": 3.6728629923542413, + "grad_norm": 2.763362169265747, + "learning_rate": 1.3283084505000726e-05, + "loss": 0.4749, + "step": 25340 + }, + { + "epoch": 3.6743124252636155, + "grad_norm": 2.5469725131988525, + "learning_rate": 1.3268589650674012e-05, + "loss": 0.4669, + "step": 25350 + }, + { + "epoch": 3.67576185817299, + "grad_norm": 2.4919934272766113, + "learning_rate": 1.3254094796347296e-05, + "loss": 0.4622, + "step": 25360 + }, + { + "epoch": 3.677211291082364, + "grad_norm": 1.1692922115325928, + "learning_rate": 1.3239599942020584e-05, + "loss": 0.4486, + "step": 25370 + }, + { + "epoch": 3.6786607239917384, + "grad_norm": 2.3812007904052734, + "learning_rate": 1.322510508769387e-05, + "loss": 0.487, + "step": 25380 + }, + { + "epoch": 3.6801101569011125, + "grad_norm": 0.8145487904548645, + "learning_rate": 1.3210610233367154e-05, + "loss": 0.3698, + "step": 25390 + }, + { + "epoch": 3.6815595898104867, + "grad_norm": 1.1780279874801636, + "learning_rate": 1.319611537904044e-05, + "loss": 0.4306, + "step": 25400 + }, + { + "epoch": 3.683009022719861, + "grad_norm": 1.7611922025680542, + "learning_rate": 1.3181620524713728e-05, + "loss": 0.4532, + "step": 25410 + }, + { + "epoch": 3.684458455629235, + "grad_norm": 4.195759296417236, + "learning_rate": 1.3167125670387013e-05, + "loss": 0.4641, + "step": 25420 + }, + { + "epoch": 3.685907888538609, + "grad_norm": 1.1716632843017578, + "learning_rate": 1.3152630816060299e-05, + "loss": 0.5277, + "step": 25430 + }, + { + "epoch": 3.6873573214479833, + "grad_norm": 2.7256765365600586, + "learning_rate": 1.3138135961733586e-05, + "loss": 0.4893, + "step": 25440 + }, + { + "epoch": 3.6888067543573575, + "grad_norm": 0.9985123872756958, + "learning_rate": 1.312364110740687e-05, + "loss": 0.4418, + "step": 25450 + }, + { + "epoch": 3.6902561872667317, + "grad_norm": 1.2271908521652222, + "learning_rate": 1.3109146253080157e-05, + "loss": 0.4941, + "step": 25460 + }, + { + "epoch": 3.6917056201761063, + "grad_norm": 2.9102699756622314, + "learning_rate": 1.3094651398753443e-05, + "loss": 0.4088, + "step": 25470 + }, + { + "epoch": 3.6931550530854804, + "grad_norm": 2.339369773864746, + "learning_rate": 1.308015654442673e-05, + "loss": 0.4207, + "step": 25480 + }, + { + "epoch": 3.6946044859948546, + "grad_norm": 2.232353687286377, + "learning_rate": 1.3065661690100015e-05, + "loss": 0.4714, + "step": 25490 + }, + { + "epoch": 3.6960539189042287, + "grad_norm": 0.9081094861030579, + "learning_rate": 1.3051166835773301e-05, + "loss": 0.4106, + "step": 25500 + }, + { + "epoch": 3.697503351813603, + "grad_norm": 0.7285069227218628, + "learning_rate": 1.3036671981446589e-05, + "loss": 0.4218, + "step": 25510 + }, + { + "epoch": 3.698952784722977, + "grad_norm": 3.198972225189209, + "learning_rate": 1.3022177127119871e-05, + "loss": 0.4388, + "step": 25520 + }, + { + "epoch": 3.700402217632351, + "grad_norm": 1.26154625415802, + "learning_rate": 1.300768227279316e-05, + "loss": 0.4395, + "step": 25530 + }, + { + "epoch": 3.701851650541726, + "grad_norm": 3.3266167640686035, + "learning_rate": 1.2993187418466445e-05, + "loss": 0.4497, + "step": 25540 + }, + { + "epoch": 3.7033010834511, + "grad_norm": 1.1532198190689087, + "learning_rate": 1.297869256413973e-05, + "loss": 0.4678, + "step": 25550 + }, + { + "epoch": 3.704750516360474, + "grad_norm": 1.0286312103271484, + "learning_rate": 1.2964197709813017e-05, + "loss": 0.5017, + "step": 25560 + }, + { + "epoch": 3.7061999492698483, + "grad_norm": 2.742030382156372, + "learning_rate": 1.2949702855486303e-05, + "loss": 0.4423, + "step": 25570 + }, + { + "epoch": 3.7076493821792225, + "grad_norm": 1.203541874885559, + "learning_rate": 1.2935208001159588e-05, + "loss": 0.4548, + "step": 25580 + }, + { + "epoch": 3.7090988150885966, + "grad_norm": 0.8006945848464966, + "learning_rate": 1.2920713146832874e-05, + "loss": 0.4748, + "step": 25590 + }, + { + "epoch": 3.710548247997971, + "grad_norm": 1.4705841541290283, + "learning_rate": 1.2906218292506162e-05, + "loss": 0.4989, + "step": 25600 + }, + { + "epoch": 3.711997680907345, + "grad_norm": 2.7385144233703613, + "learning_rate": 1.2891723438179448e-05, + "loss": 0.4733, + "step": 25610 + }, + { + "epoch": 3.713447113816719, + "grad_norm": 1.4703665971755981, + "learning_rate": 1.2877228583852732e-05, + "loss": 0.4562, + "step": 25620 + }, + { + "epoch": 3.7148965467260933, + "grad_norm": 1.0603382587432861, + "learning_rate": 1.286273372952602e-05, + "loss": 0.4353, + "step": 25630 + }, + { + "epoch": 3.7163459796354674, + "grad_norm": 0.9223389625549316, + "learning_rate": 1.2848238875199306e-05, + "loss": 0.4356, + "step": 25640 + }, + { + "epoch": 3.7177954125448416, + "grad_norm": 1.1404002904891968, + "learning_rate": 1.283374402087259e-05, + "loss": 0.4852, + "step": 25650 + }, + { + "epoch": 3.719244845454216, + "grad_norm": 6.546042442321777, + "learning_rate": 1.2819249166545876e-05, + "loss": 0.4939, + "step": 25660 + }, + { + "epoch": 3.7206942783635903, + "grad_norm": 1.3828874826431274, + "learning_rate": 1.2804754312219164e-05, + "loss": 0.4718, + "step": 25670 + }, + { + "epoch": 3.7221437112729645, + "grad_norm": 4.71876859664917, + "learning_rate": 1.2790259457892448e-05, + "loss": 0.4888, + "step": 25680 + }, + { + "epoch": 3.7235931441823387, + "grad_norm": 0.9733912944793701, + "learning_rate": 1.2775764603565734e-05, + "loss": 0.4908, + "step": 25690 + }, + { + "epoch": 3.725042577091713, + "grad_norm": 1.7667779922485352, + "learning_rate": 1.2761269749239022e-05, + "loss": 0.4729, + "step": 25700 + }, + { + "epoch": 3.726492010001087, + "grad_norm": 2.0237643718719482, + "learning_rate": 1.2746774894912306e-05, + "loss": 0.3917, + "step": 25710 + }, + { + "epoch": 3.727941442910461, + "grad_norm": 1.8370978832244873, + "learning_rate": 1.2732280040585592e-05, + "loss": 0.565, + "step": 25720 + }, + { + "epoch": 3.7293908758198357, + "grad_norm": 1.0702518224716187, + "learning_rate": 1.2717785186258879e-05, + "loss": 0.4429, + "step": 25730 + }, + { + "epoch": 3.73084030872921, + "grad_norm": 2.3258419036865234, + "learning_rate": 1.2703290331932163e-05, + "loss": 0.4304, + "step": 25740 + }, + { + "epoch": 3.732289741638584, + "grad_norm": 0.7933976650238037, + "learning_rate": 1.268879547760545e-05, + "loss": 0.5605, + "step": 25750 + }, + { + "epoch": 3.7337391745479582, + "grad_norm": 0.957318127155304, + "learning_rate": 1.2674300623278737e-05, + "loss": 0.4661, + "step": 25760 + }, + { + "epoch": 3.7351886074573324, + "grad_norm": 2.946476697921753, + "learning_rate": 1.2659805768952024e-05, + "loss": 0.4515, + "step": 25770 + }, + { + "epoch": 3.7366380403667065, + "grad_norm": 1.1148138046264648, + "learning_rate": 1.2645310914625307e-05, + "loss": 0.4709, + "step": 25780 + }, + { + "epoch": 3.7380874732760807, + "grad_norm": 1.095963716506958, + "learning_rate": 1.2630816060298595e-05, + "loss": 0.4869, + "step": 25790 + }, + { + "epoch": 3.739536906185455, + "grad_norm": 1.272081971168518, + "learning_rate": 1.2616321205971881e-05, + "loss": 0.5348, + "step": 25800 + }, + { + "epoch": 3.740986339094829, + "grad_norm": 0.9788115620613098, + "learning_rate": 1.2601826351645165e-05, + "loss": 0.4189, + "step": 25810 + }, + { + "epoch": 3.742435772004203, + "grad_norm": 1.0667412281036377, + "learning_rate": 1.2587331497318453e-05, + "loss": 0.4168, + "step": 25820 + }, + { + "epoch": 3.7438852049135773, + "grad_norm": 1.109317660331726, + "learning_rate": 1.2572836642991739e-05, + "loss": 0.4634, + "step": 25830 + }, + { + "epoch": 3.7453346378229515, + "grad_norm": 1.8557586669921875, + "learning_rate": 1.2558341788665023e-05, + "loss": 0.4367, + "step": 25840 + }, + { + "epoch": 3.746784070732326, + "grad_norm": 0.8158989548683167, + "learning_rate": 1.254384693433831e-05, + "loss": 0.4793, + "step": 25850 + }, + { + "epoch": 3.7482335036417003, + "grad_norm": 3.7515580654144287, + "learning_rate": 1.2529352080011597e-05, + "loss": 0.4282, + "step": 25860 + }, + { + "epoch": 3.7496829365510744, + "grad_norm": 2.240804433822632, + "learning_rate": 1.2514857225684882e-05, + "loss": 0.4173, + "step": 25870 + }, + { + "epoch": 3.7511323694604486, + "grad_norm": 1.3406633138656616, + "learning_rate": 1.2500362371358168e-05, + "loss": 0.4622, + "step": 25880 + }, + { + "epoch": 3.7525818023698227, + "grad_norm": 1.781441330909729, + "learning_rate": 1.2485867517031455e-05, + "loss": 0.4107, + "step": 25890 + }, + { + "epoch": 3.754031235279197, + "grad_norm": 1.1386650800704956, + "learning_rate": 1.247137266270474e-05, + "loss": 0.4773, + "step": 25900 + }, + { + "epoch": 3.755480668188571, + "grad_norm": 1.1496398448944092, + "learning_rate": 1.2456877808378026e-05, + "loss": 0.467, + "step": 25910 + }, + { + "epoch": 3.7569301010979457, + "grad_norm": 0.9759547710418701, + "learning_rate": 1.2442382954051312e-05, + "loss": 0.4276, + "step": 25920 + }, + { + "epoch": 3.75837953400732, + "grad_norm": 2.8767662048339844, + "learning_rate": 1.2427888099724598e-05, + "loss": 0.4146, + "step": 25930 + }, + { + "epoch": 3.759828966916694, + "grad_norm": 0.8361555933952332, + "learning_rate": 1.2413393245397886e-05, + "loss": 0.4557, + "step": 25940 + }, + { + "epoch": 3.761278399826068, + "grad_norm": 1.1116259098052979, + "learning_rate": 1.239889839107117e-05, + "loss": 0.4553, + "step": 25950 + }, + { + "epoch": 3.7627278327354423, + "grad_norm": 1.1028730869293213, + "learning_rate": 1.2384403536744456e-05, + "loss": 0.438, + "step": 25960 + }, + { + "epoch": 3.7641772656448165, + "grad_norm": 2.509413242340088, + "learning_rate": 1.2369908682417742e-05, + "loss": 0.4404, + "step": 25970 + }, + { + "epoch": 3.7656266985541906, + "grad_norm": 0.8821387887001038, + "learning_rate": 1.2355413828091028e-05, + "loss": 0.4269, + "step": 25980 + }, + { + "epoch": 3.767076131463565, + "grad_norm": 1.5066933631896973, + "learning_rate": 1.2340918973764314e-05, + "loss": 0.4466, + "step": 25990 + }, + { + "epoch": 3.768525564372939, + "grad_norm": 0.8262946605682373, + "learning_rate": 1.23264241194376e-05, + "loss": 0.413, + "step": 26000 + }, + { + "epoch": 3.768525564372939, + "eval_loss": 0.7187819480895996, + "eval_runtime": 671.7423, + "eval_samples_per_second": 51.352, + "eval_steps_per_second": 2.568, + "eval_token_accuracy": 0.00039164961208191607, + "step": 26000 + }, + { + "epoch": 3.769974997282313, + "grad_norm": 2.168036699295044, + "learning_rate": 1.2311929265110886e-05, + "loss": 0.3854, + "step": 26010 + }, + { + "epoch": 3.7714244301916873, + "grad_norm": 1.015336275100708, + "learning_rate": 1.2297434410784172e-05, + "loss": 0.4675, + "step": 26020 + }, + { + "epoch": 3.772873863101062, + "grad_norm": 1.6049621105194092, + "learning_rate": 1.2282939556457458e-05, + "loss": 0.4444, + "step": 26030 + }, + { + "epoch": 3.774323296010436, + "grad_norm": 1.115004301071167, + "learning_rate": 1.2268444702130743e-05, + "loss": 0.483, + "step": 26040 + }, + { + "epoch": 3.77577272891981, + "grad_norm": 1.3188127279281616, + "learning_rate": 1.225394984780403e-05, + "loss": 0.4628, + "step": 26050 + }, + { + "epoch": 3.7772221618291844, + "grad_norm": 1.8667047023773193, + "learning_rate": 1.2239454993477317e-05, + "loss": 0.4742, + "step": 26060 + }, + { + "epoch": 3.7786715947385585, + "grad_norm": 1.2756224870681763, + "learning_rate": 1.2224960139150603e-05, + "loss": 0.4479, + "step": 26070 + }, + { + "epoch": 3.7801210276479327, + "grad_norm": 1.8248677253723145, + "learning_rate": 1.2210465284823889e-05, + "loss": 0.4322, + "step": 26080 + }, + { + "epoch": 3.781570460557307, + "grad_norm": 1.1138015985488892, + "learning_rate": 1.2195970430497173e-05, + "loss": 0.489, + "step": 26090 + }, + { + "epoch": 3.7830198934666814, + "grad_norm": 1.01140558719635, + "learning_rate": 1.218147557617046e-05, + "loss": 0.4234, + "step": 26100 + }, + { + "epoch": 3.7844693263760556, + "grad_norm": 0.93239426612854, + "learning_rate": 1.2166980721843745e-05, + "loss": 0.4982, + "step": 26110 + }, + { + "epoch": 3.7859187592854298, + "grad_norm": 1.03866708278656, + "learning_rate": 1.2152485867517031e-05, + "loss": 0.495, + "step": 26120 + }, + { + "epoch": 3.787368192194804, + "grad_norm": 1.112883448600769, + "learning_rate": 1.2137991013190319e-05, + "loss": 0.4461, + "step": 26130 + }, + { + "epoch": 3.788817625104178, + "grad_norm": 1.2414758205413818, + "learning_rate": 1.2123496158863603e-05, + "loss": 0.4634, + "step": 26140 + }, + { + "epoch": 3.7902670580135522, + "grad_norm": 1.368895411491394, + "learning_rate": 1.2109001304536891e-05, + "loss": 0.4817, + "step": 26150 + }, + { + "epoch": 3.7917164909229264, + "grad_norm": 0.937118411064148, + "learning_rate": 1.2094506450210175e-05, + "loss": 0.448, + "step": 26160 + }, + { + "epoch": 3.7931659238323006, + "grad_norm": 1.29416823387146, + "learning_rate": 1.2080011595883461e-05, + "loss": 0.5016, + "step": 26170 + }, + { + "epoch": 3.7946153567416747, + "grad_norm": 1.7470028400421143, + "learning_rate": 1.2065516741556748e-05, + "loss": 0.4548, + "step": 26180 + }, + { + "epoch": 3.796064789651049, + "grad_norm": 3.248729705810547, + "learning_rate": 1.2051021887230034e-05, + "loss": 0.4589, + "step": 26190 + }, + { + "epoch": 3.797514222560423, + "grad_norm": 3.4089272022247314, + "learning_rate": 1.2036527032903321e-05, + "loss": 0.4598, + "step": 26200 + }, + { + "epoch": 3.798963655469797, + "grad_norm": 1.076757550239563, + "learning_rate": 1.2022032178576606e-05, + "loss": 0.4611, + "step": 26210 + }, + { + "epoch": 3.800413088379172, + "grad_norm": 1.8356128931045532, + "learning_rate": 1.2007537324249892e-05, + "loss": 0.4543, + "step": 26220 + }, + { + "epoch": 3.801862521288546, + "grad_norm": 0.8461928367614746, + "learning_rate": 1.1993042469923178e-05, + "loss": 0.4862, + "step": 26230 + }, + { + "epoch": 3.80331195419792, + "grad_norm": 2.1040289402008057, + "learning_rate": 1.1978547615596464e-05, + "loss": 0.5152, + "step": 26240 + }, + { + "epoch": 3.8047613871072943, + "grad_norm": 0.9999257922172546, + "learning_rate": 1.196405276126975e-05, + "loss": 0.4309, + "step": 26250 + }, + { + "epoch": 3.8062108200166684, + "grad_norm": 1.2932193279266357, + "learning_rate": 1.1949557906943036e-05, + "loss": 0.4541, + "step": 26260 + }, + { + "epoch": 3.8076602529260426, + "grad_norm": 1.0218082666397095, + "learning_rate": 1.1935063052616322e-05, + "loss": 0.4744, + "step": 26270 + }, + { + "epoch": 3.8091096858354168, + "grad_norm": 1.7632607221603394, + "learning_rate": 1.1920568198289608e-05, + "loss": 0.4709, + "step": 26280 + }, + { + "epoch": 3.8105591187447914, + "grad_norm": 1.796879768371582, + "learning_rate": 1.1906073343962894e-05, + "loss": 0.4939, + "step": 26290 + }, + { + "epoch": 3.8120085516541655, + "grad_norm": 1.1654222011566162, + "learning_rate": 1.1891578489636178e-05, + "loss": 0.5227, + "step": 26300 + }, + { + "epoch": 3.8134579845635397, + "grad_norm": 2.0588326454162598, + "learning_rate": 1.1877083635309466e-05, + "loss": 0.4955, + "step": 26310 + }, + { + "epoch": 3.814907417472914, + "grad_norm": 2.2136971950531006, + "learning_rate": 1.1862588780982752e-05, + "loss": 0.4586, + "step": 26320 + }, + { + "epoch": 3.816356850382288, + "grad_norm": 1.157351016998291, + "learning_rate": 1.1848093926656037e-05, + "loss": 0.4496, + "step": 26330 + }, + { + "epoch": 3.817806283291662, + "grad_norm": 1.9036961793899536, + "learning_rate": 1.1833599072329324e-05, + "loss": 0.4533, + "step": 26340 + }, + { + "epoch": 3.8192557162010363, + "grad_norm": 2.055697441101074, + "learning_rate": 1.1819104218002609e-05, + "loss": 0.4858, + "step": 26350 + }, + { + "epoch": 3.8207051491104105, + "grad_norm": 3.0185952186584473, + "learning_rate": 1.1804609363675897e-05, + "loss": 0.4604, + "step": 26360 + }, + { + "epoch": 3.8221545820197846, + "grad_norm": 0.9421207308769226, + "learning_rate": 1.1790114509349181e-05, + "loss": 0.4845, + "step": 26370 + }, + { + "epoch": 3.823604014929159, + "grad_norm": 1.070281982421875, + "learning_rate": 1.1775619655022467e-05, + "loss": 0.3999, + "step": 26380 + }, + { + "epoch": 3.825053447838533, + "grad_norm": 1.2085710763931274, + "learning_rate": 1.1761124800695755e-05, + "loss": 0.4798, + "step": 26390 + }, + { + "epoch": 3.826502880747907, + "grad_norm": 1.1355644464492798, + "learning_rate": 1.1746629946369039e-05, + "loss": 0.4467, + "step": 26400 + }, + { + "epoch": 3.8279523136572817, + "grad_norm": 0.9834990501403809, + "learning_rate": 1.1732135092042327e-05, + "loss": 0.4489, + "step": 26410 + }, + { + "epoch": 3.829401746566656, + "grad_norm": 2.566068649291992, + "learning_rate": 1.1717640237715611e-05, + "loss": 0.4419, + "step": 26420 + }, + { + "epoch": 3.83085117947603, + "grad_norm": 1.464672327041626, + "learning_rate": 1.1703145383388897e-05, + "loss": 0.411, + "step": 26430 + }, + { + "epoch": 3.832300612385404, + "grad_norm": 1.2169744968414307, + "learning_rate": 1.1688650529062183e-05, + "loss": 0.4677, + "step": 26440 + }, + { + "epoch": 3.8337500452947784, + "grad_norm": 1.1080471277236938, + "learning_rate": 1.167415567473547e-05, + "loss": 0.5183, + "step": 26450 + }, + { + "epoch": 3.8351994782041525, + "grad_norm": 2.5729031562805176, + "learning_rate": 1.1659660820408755e-05, + "loss": 0.4575, + "step": 26460 + }, + { + "epoch": 3.8366489111135267, + "grad_norm": 2.8056209087371826, + "learning_rate": 1.1645165966082041e-05, + "loss": 0.4587, + "step": 26470 + }, + { + "epoch": 3.8380983440229013, + "grad_norm": 2.494399070739746, + "learning_rate": 1.1630671111755327e-05, + "loss": 0.4396, + "step": 26480 + }, + { + "epoch": 3.8395477769322754, + "grad_norm": 1.3486852645874023, + "learning_rate": 1.1616176257428614e-05, + "loss": 0.4613, + "step": 26490 + }, + { + "epoch": 3.8409972098416496, + "grad_norm": 0.9714551568031311, + "learning_rate": 1.16016814031019e-05, + "loss": 0.4485, + "step": 26500 + }, + { + "epoch": 3.8424466427510238, + "grad_norm": 1.7452738285064697, + "learning_rate": 1.1587186548775186e-05, + "loss": 0.4754, + "step": 26510 + }, + { + "epoch": 3.843896075660398, + "grad_norm": 2.8123321533203125, + "learning_rate": 1.1572691694448472e-05, + "loss": 0.5065, + "step": 26520 + }, + { + "epoch": 3.845345508569772, + "grad_norm": 1.3239609003067017, + "learning_rate": 1.1558196840121758e-05, + "loss": 0.4508, + "step": 26530 + }, + { + "epoch": 3.8467949414791462, + "grad_norm": 2.126450777053833, + "learning_rate": 1.1543701985795042e-05, + "loss": 0.4637, + "step": 26540 + }, + { + "epoch": 3.8482443743885204, + "grad_norm": 1.0921045541763306, + "learning_rate": 1.152920713146833e-05, + "loss": 0.5123, + "step": 26550 + }, + { + "epoch": 3.8496938072978946, + "grad_norm": 2.9524543285369873, + "learning_rate": 1.1514712277141614e-05, + "loss": 0.4508, + "step": 26560 + }, + { + "epoch": 3.8511432402072687, + "grad_norm": 0.8647069334983826, + "learning_rate": 1.1500217422814902e-05, + "loss": 0.4644, + "step": 26570 + }, + { + "epoch": 3.852592673116643, + "grad_norm": 2.2827160358428955, + "learning_rate": 1.1485722568488188e-05, + "loss": 0.4451, + "step": 26580 + }, + { + "epoch": 3.854042106026017, + "grad_norm": 2.2924423217773438, + "learning_rate": 1.1471227714161472e-05, + "loss": 0.4475, + "step": 26590 + }, + { + "epoch": 3.8554915389353916, + "grad_norm": 1.771622896194458, + "learning_rate": 1.145673285983476e-05, + "loss": 0.4433, + "step": 26600 + }, + { + "epoch": 3.856940971844766, + "grad_norm": 1.3076894283294678, + "learning_rate": 1.1442238005508044e-05, + "loss": 0.3952, + "step": 26610 + }, + { + "epoch": 3.85839040475414, + "grad_norm": 1.0110349655151367, + "learning_rate": 1.1427743151181332e-05, + "loss": 0.4517, + "step": 26620 + }, + { + "epoch": 3.859839837663514, + "grad_norm": 3.608510732650757, + "learning_rate": 1.1413248296854617e-05, + "loss": 0.4661, + "step": 26630 + }, + { + "epoch": 3.8612892705728883, + "grad_norm": 2.3432466983795166, + "learning_rate": 1.1398753442527903e-05, + "loss": 0.4391, + "step": 26640 + }, + { + "epoch": 3.8627387034822624, + "grad_norm": 1.1964057683944702, + "learning_rate": 1.138425858820119e-05, + "loss": 0.4588, + "step": 26650 + }, + { + "epoch": 3.8641881363916366, + "grad_norm": 2.234063148498535, + "learning_rate": 1.1369763733874475e-05, + "loss": 0.4173, + "step": 26660 + }, + { + "epoch": 3.865637569301011, + "grad_norm": 3.324658155441284, + "learning_rate": 1.135526887954776e-05, + "loss": 0.4125, + "step": 26670 + }, + { + "epoch": 3.8670870022103854, + "grad_norm": 1.9230751991271973, + "learning_rate": 1.1340774025221047e-05, + "loss": 0.4814, + "step": 26680 + }, + { + "epoch": 3.8685364351197595, + "grad_norm": 3.1007161140441895, + "learning_rate": 1.1326279170894333e-05, + "loss": 0.4195, + "step": 26690 + }, + { + "epoch": 3.8699858680291337, + "grad_norm": 2.6854794025421143, + "learning_rate": 1.1311784316567619e-05, + "loss": 0.4355, + "step": 26700 + }, + { + "epoch": 3.871435300938508, + "grad_norm": 0.9971222281455994, + "learning_rate": 1.1297289462240905e-05, + "loss": 0.4344, + "step": 26710 + }, + { + "epoch": 3.872884733847882, + "grad_norm": 1.202845811843872, + "learning_rate": 1.1282794607914191e-05, + "loss": 0.5113, + "step": 26720 + }, + { + "epoch": 3.874334166757256, + "grad_norm": 2.6811609268188477, + "learning_rate": 1.1268299753587477e-05, + "loss": 0.4481, + "step": 26730 + }, + { + "epoch": 3.8757835996666303, + "grad_norm": 2.278353452682495, + "learning_rate": 1.1253804899260763e-05, + "loss": 0.4706, + "step": 26740 + }, + { + "epoch": 3.8772330325760045, + "grad_norm": 3.3476059436798096, + "learning_rate": 1.1239310044934047e-05, + "loss": 0.513, + "step": 26750 + }, + { + "epoch": 3.8786824654853786, + "grad_norm": 1.9155887365341187, + "learning_rate": 1.1224815190607335e-05, + "loss": 0.4431, + "step": 26760 + }, + { + "epoch": 3.880131898394753, + "grad_norm": 1.286580204963684, + "learning_rate": 1.1210320336280621e-05, + "loss": 0.5217, + "step": 26770 + }, + { + "epoch": 3.8815813313041274, + "grad_norm": 1.9572170972824097, + "learning_rate": 1.1195825481953907e-05, + "loss": 0.5095, + "step": 26780 + }, + { + "epoch": 3.8830307642135016, + "grad_norm": 0.9819499254226685, + "learning_rate": 1.1181330627627193e-05, + "loss": 0.4438, + "step": 26790 + }, + { + "epoch": 3.8844801971228757, + "grad_norm": 2.0708963871002197, + "learning_rate": 1.1166835773300478e-05, + "loss": 0.4139, + "step": 26800 + }, + { + "epoch": 3.88592963003225, + "grad_norm": 3.3944318294525146, + "learning_rate": 1.1152340918973766e-05, + "loss": 0.459, + "step": 26810 + }, + { + "epoch": 3.887379062941624, + "grad_norm": 1.255771279335022, + "learning_rate": 1.113784606464705e-05, + "loss": 0.438, + "step": 26820 + }, + { + "epoch": 3.888828495850998, + "grad_norm": 0.9337121844291687, + "learning_rate": 1.1123351210320338e-05, + "loss": 0.4895, + "step": 26830 + }, + { + "epoch": 3.8902779287603724, + "grad_norm": 1.0607869625091553, + "learning_rate": 1.1108856355993624e-05, + "loss": 0.4407, + "step": 26840 + }, + { + "epoch": 3.891727361669747, + "grad_norm": 0.9897153377532959, + "learning_rate": 1.1094361501666908e-05, + "loss": 0.4146, + "step": 26850 + }, + { + "epoch": 3.893176794579121, + "grad_norm": 0.8319249749183655, + "learning_rate": 1.1079866647340196e-05, + "loss": 0.4798, + "step": 26860 + }, + { + "epoch": 3.8946262274884953, + "grad_norm": 2.0311408042907715, + "learning_rate": 1.106537179301348e-05, + "loss": 0.4787, + "step": 26870 + }, + { + "epoch": 3.8960756603978695, + "grad_norm": 1.3111026287078857, + "learning_rate": 1.1050876938686766e-05, + "loss": 0.4265, + "step": 26880 + }, + { + "epoch": 3.8975250933072436, + "grad_norm": 2.4597795009613037, + "learning_rate": 1.1036382084360052e-05, + "loss": 0.429, + "step": 26890 + }, + { + "epoch": 3.8989745262166178, + "grad_norm": 2.432349443435669, + "learning_rate": 1.1021887230033338e-05, + "loss": 0.4206, + "step": 26900 + }, + { + "epoch": 3.900423959125992, + "grad_norm": 0.9375098347663879, + "learning_rate": 1.1007392375706626e-05, + "loss": 0.4607, + "step": 26910 + }, + { + "epoch": 3.901873392035366, + "grad_norm": 0.8173941373825073, + "learning_rate": 1.099289752137991e-05, + "loss": 0.3743, + "step": 26920 + }, + { + "epoch": 3.9033228249447403, + "grad_norm": 1.374294638633728, + "learning_rate": 1.0978402667053196e-05, + "loss": 0.5601, + "step": 26930 + }, + { + "epoch": 3.9047722578541144, + "grad_norm": 1.6747016906738281, + "learning_rate": 1.0963907812726483e-05, + "loss": 0.4217, + "step": 26940 + }, + { + "epoch": 3.9062216907634886, + "grad_norm": 1.1631803512573242, + "learning_rate": 1.0949412958399769e-05, + "loss": 0.5497, + "step": 26950 + }, + { + "epoch": 3.9076711236728627, + "grad_norm": 1.1909114122390747, + "learning_rate": 1.0934918104073055e-05, + "loss": 0.4395, + "step": 26960 + }, + { + "epoch": 3.9091205565822373, + "grad_norm": 1.498050570487976, + "learning_rate": 1.092042324974634e-05, + "loss": 0.4709, + "step": 26970 + }, + { + "epoch": 3.9105699894916115, + "grad_norm": 2.889643669128418, + "learning_rate": 1.0905928395419627e-05, + "loss": 0.4511, + "step": 26980 + }, + { + "epoch": 3.9120194224009857, + "grad_norm": 1.0335190296173096, + "learning_rate": 1.0891433541092913e-05, + "loss": 0.5079, + "step": 26990 + }, + { + "epoch": 3.91346885531036, + "grad_norm": 0.8706920146942139, + "learning_rate": 1.0876938686766199e-05, + "loss": 0.4817, + "step": 27000 + }, + { + "epoch": 3.91346885531036, + "eval_loss": 0.7067095041275024, + "eval_runtime": 670.7773, + "eval_samples_per_second": 51.425, + "eval_steps_per_second": 2.572, + "eval_token_accuracy": 0.00039116708689249276, + "step": 27000 + }, + { + "epoch": 3.914918288219734, + "grad_norm": 1.17312753200531, + "learning_rate": 1.0862443832439483e-05, + "loss": 0.438, + "step": 27010 + }, + { + "epoch": 3.916367721129108, + "grad_norm": 1.008259892463684, + "learning_rate": 1.0847948978112771e-05, + "loss": 0.4378, + "step": 27020 + }, + { + "epoch": 3.9178171540384823, + "grad_norm": 1.0406113862991333, + "learning_rate": 1.0833454123786057e-05, + "loss": 0.413, + "step": 27030 + }, + { + "epoch": 3.919266586947857, + "grad_norm": 2.5961663722991943, + "learning_rate": 1.0818959269459343e-05, + "loss": 0.5262, + "step": 27040 + }, + { + "epoch": 3.920716019857231, + "grad_norm": 0.8761188387870789, + "learning_rate": 1.0804464415132629e-05, + "loss": 0.5301, + "step": 27050 + }, + { + "epoch": 3.922165452766605, + "grad_norm": 0.9454947113990784, + "learning_rate": 1.0789969560805913e-05, + "loss": 0.481, + "step": 27060 + }, + { + "epoch": 3.9236148856759794, + "grad_norm": 3.1509249210357666, + "learning_rate": 1.0775474706479201e-05, + "loss": 0.4608, + "step": 27070 + }, + { + "epoch": 3.9250643185853535, + "grad_norm": 3.173954963684082, + "learning_rate": 1.0760979852152486e-05, + "loss": 0.4451, + "step": 27080 + }, + { + "epoch": 3.9265137514947277, + "grad_norm": 0.879859209060669, + "learning_rate": 1.0746484997825772e-05, + "loss": 0.4301, + "step": 27090 + }, + { + "epoch": 3.927963184404102, + "grad_norm": 1.0729224681854248, + "learning_rate": 1.073199014349906e-05, + "loss": 0.4324, + "step": 27100 + }, + { + "epoch": 3.929412617313476, + "grad_norm": 1.933181643486023, + "learning_rate": 1.0717495289172344e-05, + "loss": 0.4767, + "step": 27110 + }, + { + "epoch": 3.93086205022285, + "grad_norm": 1.12887442111969, + "learning_rate": 1.0703000434845631e-05, + "loss": 0.3971, + "step": 27120 + }, + { + "epoch": 3.9323114831322243, + "grad_norm": 0.8794549703598022, + "learning_rate": 1.0688505580518916e-05, + "loss": 0.4423, + "step": 27130 + }, + { + "epoch": 3.9337609160415985, + "grad_norm": 1.0780173540115356, + "learning_rate": 1.0674010726192202e-05, + "loss": 0.4795, + "step": 27140 + }, + { + "epoch": 3.9352103489509727, + "grad_norm": 1.5967097282409668, + "learning_rate": 1.0659515871865488e-05, + "loss": 0.4333, + "step": 27150 + }, + { + "epoch": 3.9366597818603473, + "grad_norm": 1.123369574546814, + "learning_rate": 1.0645021017538774e-05, + "loss": 0.4561, + "step": 27160 + }, + { + "epoch": 3.9381092147697214, + "grad_norm": 3.6029701232910156, + "learning_rate": 1.0630526163212062e-05, + "loss": 0.4613, + "step": 27170 + }, + { + "epoch": 3.9395586476790956, + "grad_norm": 0.8784819841384888, + "learning_rate": 1.0616031308885346e-05, + "loss": 0.4745, + "step": 27180 + }, + { + "epoch": 3.9410080805884697, + "grad_norm": 2.145846366882324, + "learning_rate": 1.0601536454558632e-05, + "loss": 0.4747, + "step": 27190 + }, + { + "epoch": 3.942457513497844, + "grad_norm": 1.1199047565460205, + "learning_rate": 1.0587041600231918e-05, + "loss": 0.4514, + "step": 27200 + }, + { + "epoch": 3.943906946407218, + "grad_norm": 0.8590757250785828, + "learning_rate": 1.0572546745905204e-05, + "loss": 0.3595, + "step": 27210 + }, + { + "epoch": 3.945356379316592, + "grad_norm": 2.8308565616607666, + "learning_rate": 1.055805189157849e-05, + "loss": 0.4527, + "step": 27220 + }, + { + "epoch": 3.946805812225967, + "grad_norm": 0.7825255990028381, + "learning_rate": 1.0543557037251776e-05, + "loss": 0.4501, + "step": 27230 + }, + { + "epoch": 3.948255245135341, + "grad_norm": 2.58693790435791, + "learning_rate": 1.0530511668357732e-05, + "loss": 0.4467, + "step": 27240 + }, + { + "epoch": 3.949704678044715, + "grad_norm": 0.8955137133598328, + "learning_rate": 1.051601681403102e-05, + "loss": 0.479, + "step": 27250 + }, + { + "epoch": 3.9511541109540893, + "grad_norm": 1.1574593782424927, + "learning_rate": 1.0501521959704304e-05, + "loss": 0.456, + "step": 27260 + }, + { + "epoch": 3.9526035438634635, + "grad_norm": 1.5674717426300049, + "learning_rate": 1.0487027105377592e-05, + "loss": 0.4149, + "step": 27270 + }, + { + "epoch": 3.9540529767728376, + "grad_norm": 3.8193156719207764, + "learning_rate": 1.0472532251050878e-05, + "loss": 0.3872, + "step": 27280 + }, + { + "epoch": 3.955502409682212, + "grad_norm": 4.670598030090332, + "learning_rate": 1.0458037396724163e-05, + "loss": 0.4626, + "step": 27290 + }, + { + "epoch": 3.956951842591586, + "grad_norm": 1.0468988418579102, + "learning_rate": 1.044354254239745e-05, + "loss": 0.4514, + "step": 27300 + }, + { + "epoch": 3.95840127550096, + "grad_norm": 1.9248346090316772, + "learning_rate": 1.0429047688070735e-05, + "loss": 0.4064, + "step": 27310 + }, + { + "epoch": 3.9598507084103343, + "grad_norm": 1.1533076763153076, + "learning_rate": 1.041455283374402e-05, + "loss": 0.4469, + "step": 27320 + }, + { + "epoch": 3.9613001413197084, + "grad_norm": 1.8082165718078613, + "learning_rate": 1.0400057979417307e-05, + "loss": 0.4182, + "step": 27330 + }, + { + "epoch": 3.962749574229083, + "grad_norm": 2.489048957824707, + "learning_rate": 1.0385563125090593e-05, + "loss": 0.4603, + "step": 27340 + }, + { + "epoch": 3.964199007138457, + "grad_norm": 1.018362045288086, + "learning_rate": 1.037106827076388e-05, + "loss": 0.415, + "step": 27350 + }, + { + "epoch": 3.9656484400478313, + "grad_norm": 1.5398354530334473, + "learning_rate": 1.0356573416437165e-05, + "loss": 0.4022, + "step": 27360 + }, + { + "epoch": 3.9670978729572055, + "grad_norm": 3.7499420642852783, + "learning_rate": 1.0342078562110451e-05, + "loss": 0.4887, + "step": 27370 + }, + { + "epoch": 3.9685473058665797, + "grad_norm": 3.1090011596679688, + "learning_rate": 1.0327583707783737e-05, + "loss": 0.4385, + "step": 27380 + }, + { + "epoch": 3.969996738775954, + "grad_norm": 2.216132402420044, + "learning_rate": 1.0313088853457023e-05, + "loss": 0.4689, + "step": 27390 + }, + { + "epoch": 3.971446171685328, + "grad_norm": 1.6161932945251465, + "learning_rate": 1.0298593999130309e-05, + "loss": 0.4268, + "step": 27400 + }, + { + "epoch": 3.9728956045947026, + "grad_norm": 0.9366248846054077, + "learning_rate": 1.0284099144803595e-05, + "loss": 0.4378, + "step": 27410 + }, + { + "epoch": 3.9743450375040767, + "grad_norm": 1.3713431358337402, + "learning_rate": 1.0269604290476881e-05, + "loss": 0.429, + "step": 27420 + }, + { + "epoch": 3.975794470413451, + "grad_norm": 1.966168999671936, + "learning_rate": 1.0255109436150167e-05, + "loss": 0.4333, + "step": 27430 + }, + { + "epoch": 3.977243903322825, + "grad_norm": 3.127967119216919, + "learning_rate": 1.0240614581823453e-05, + "loss": 0.4401, + "step": 27440 + }, + { + "epoch": 3.9786933362321992, + "grad_norm": 1.8882118463516235, + "learning_rate": 1.0226119727496738e-05, + "loss": 0.4623, + "step": 27450 + }, + { + "epoch": 3.9801427691415734, + "grad_norm": 0.9691013097763062, + "learning_rate": 1.0211624873170025e-05, + "loss": 0.4796, + "step": 27460 + }, + { + "epoch": 3.9815922020509475, + "grad_norm": 1.0078339576721191, + "learning_rate": 1.0197130018843312e-05, + "loss": 0.445, + "step": 27470 + }, + { + "epoch": 3.9830416349603217, + "grad_norm": 4.346449375152588, + "learning_rate": 1.0182635164516598e-05, + "loss": 0.53, + "step": 27480 + }, + { + "epoch": 3.984491067869696, + "grad_norm": 1.0477688312530518, + "learning_rate": 1.0168140310189884e-05, + "loss": 0.4503, + "step": 27490 + }, + { + "epoch": 3.98594050077907, + "grad_norm": 2.4577136039733887, + "learning_rate": 1.0153645455863168e-05, + "loss": 0.4389, + "step": 27500 + }, + { + "epoch": 3.987389933688444, + "grad_norm": 1.7137912511825562, + "learning_rate": 1.0139150601536456e-05, + "loss": 0.4215, + "step": 27510 + }, + { + "epoch": 3.9888393665978183, + "grad_norm": 2.337385892868042, + "learning_rate": 1.012465574720974e-05, + "loss": 0.488, + "step": 27520 + }, + { + "epoch": 3.990288799507193, + "grad_norm": 1.083498477935791, + "learning_rate": 1.0110160892883028e-05, + "loss": 0.4287, + "step": 27530 + }, + { + "epoch": 3.991738232416567, + "grad_norm": 2.4905881881713867, + "learning_rate": 1.0095666038556314e-05, + "loss": 0.4793, + "step": 27540 + }, + { + "epoch": 3.9931876653259413, + "grad_norm": 1.1195241212844849, + "learning_rate": 1.0081171184229598e-05, + "loss": 0.4409, + "step": 27550 + }, + { + "epoch": 3.9946370982353154, + "grad_norm": 1.6588102579116821, + "learning_rate": 1.0066676329902886e-05, + "loss": 0.4468, + "step": 27560 + }, + { + "epoch": 3.9960865311446896, + "grad_norm": 1.223429560661316, + "learning_rate": 1.005218147557617e-05, + "loss": 0.4396, + "step": 27570 + }, + { + "epoch": 3.9975359640540638, + "grad_norm": 1.179884910583496, + "learning_rate": 1.0037686621249456e-05, + "loss": 0.4471, + "step": 27580 + }, + { + "epoch": 3.998985396963438, + "grad_norm": 1.0562238693237305, + "learning_rate": 1.0023191766922742e-05, + "loss": 0.414, + "step": 27590 + }, + { + "epoch": 4.0004348298728125, + "grad_norm": 1.1050264835357666, + "learning_rate": 1.0008696912596029e-05, + "loss": 0.3992, + "step": 27600 + }, + { + "epoch": 4.001884262782187, + "grad_norm": 2.0935981273651123, + "learning_rate": 9.994202058269316e-06, + "loss": 0.3756, + "step": 27610 + }, + { + "epoch": 4.003333695691561, + "grad_norm": 1.0844813585281372, + "learning_rate": 9.9797072039426e-06, + "loss": 0.4063, + "step": 27620 + }, + { + "epoch": 4.004783128600935, + "grad_norm": 0.849988579750061, + "learning_rate": 9.965212349615887e-06, + "loss": 0.3845, + "step": 27630 + }, + { + "epoch": 4.006232561510309, + "grad_norm": 0.8378141522407532, + "learning_rate": 9.950717495289173e-06, + "loss": 0.3691, + "step": 27640 + }, + { + "epoch": 4.007681994419683, + "grad_norm": 2.816222906112671, + "learning_rate": 9.936222640962459e-06, + "loss": 0.3848, + "step": 27650 + }, + { + "epoch": 4.0091314273290575, + "grad_norm": 1.0942838191986084, + "learning_rate": 9.921727786635745e-06, + "loss": 0.3793, + "step": 27660 + }, + { + "epoch": 4.010580860238432, + "grad_norm": 0.8341204524040222, + "learning_rate": 9.907232932309031e-06, + "loss": 0.3884, + "step": 27670 + }, + { + "epoch": 4.012030293147806, + "grad_norm": 1.0412321090698242, + "learning_rate": 9.892738077982317e-06, + "loss": 0.3347, + "step": 27680 + }, + { + "epoch": 4.01347972605718, + "grad_norm": 1.2412534952163696, + "learning_rate": 9.878243223655603e-06, + "loss": 0.4247, + "step": 27690 + }, + { + "epoch": 4.014929158966554, + "grad_norm": 1.1662929058074951, + "learning_rate": 9.863748369328889e-06, + "loss": 0.3439, + "step": 27700 + }, + { + "epoch": 4.016378591875928, + "grad_norm": 1.7275515794754028, + "learning_rate": 9.849253515002173e-06, + "loss": 0.3666, + "step": 27710 + }, + { + "epoch": 4.017828024785302, + "grad_norm": 1.2136681079864502, + "learning_rate": 9.834758660675461e-06, + "loss": 0.353, + "step": 27720 + }, + { + "epoch": 4.019277457694677, + "grad_norm": 1.1961663961410522, + "learning_rate": 9.820263806348747e-06, + "loss": 0.3749, + "step": 27730 + }, + { + "epoch": 4.020726890604051, + "grad_norm": 1.3632068634033203, + "learning_rate": 9.805768952022033e-06, + "loss": 0.3846, + "step": 27740 + }, + { + "epoch": 4.022176323513426, + "grad_norm": 1.477921724319458, + "learning_rate": 9.79127409769532e-06, + "loss": 0.3621, + "step": 27750 + }, + { + "epoch": 4.0236257564228, + "grad_norm": 1.5257829427719116, + "learning_rate": 9.776779243368604e-06, + "loss": 0.3435, + "step": 27760 + }, + { + "epoch": 4.025075189332174, + "grad_norm": 1.464673638343811, + "learning_rate": 9.762284389041891e-06, + "loss": 0.3477, + "step": 27770 + }, + { + "epoch": 4.026524622241548, + "grad_norm": 1.0875053405761719, + "learning_rate": 9.747789534715176e-06, + "loss": 0.414, + "step": 27780 + }, + { + "epoch": 4.027974055150922, + "grad_norm": 2.292750120162964, + "learning_rate": 9.733294680388462e-06, + "loss": 0.3562, + "step": 27790 + }, + { + "epoch": 4.029423488060297, + "grad_norm": 2.2468414306640625, + "learning_rate": 9.71879982606175e-06, + "loss": 0.3949, + "step": 27800 + }, + { + "epoch": 4.030872920969671, + "grad_norm": 0.8002530932426453, + "learning_rate": 9.704304971735034e-06, + "loss": 0.362, + "step": 27810 + }, + { + "epoch": 4.032322353879045, + "grad_norm": 0.9956921935081482, + "learning_rate": 9.689810117408322e-06, + "loss": 0.3498, + "step": 27820 + }, + { + "epoch": 4.033771786788419, + "grad_norm": 2.087887763977051, + "learning_rate": 9.675315263081606e-06, + "loss": 0.3751, + "step": 27830 + }, + { + "epoch": 4.035221219697793, + "grad_norm": 3.2509827613830566, + "learning_rate": 9.660820408754892e-06, + "loss": 0.3628, + "step": 27840 + }, + { + "epoch": 4.036670652607167, + "grad_norm": 2.119020462036133, + "learning_rate": 9.646325554428178e-06, + "loss": 0.4038, + "step": 27850 + }, + { + "epoch": 4.038120085516542, + "grad_norm": 1.0762407779693604, + "learning_rate": 9.631830700101464e-06, + "loss": 0.4082, + "step": 27860 + }, + { + "epoch": 4.039569518425916, + "grad_norm": 1.4908643960952759, + "learning_rate": 9.61733584577475e-06, + "loss": 0.421, + "step": 27870 + }, + { + "epoch": 4.04101895133529, + "grad_norm": 1.3782938718795776, + "learning_rate": 9.602840991448036e-06, + "loss": 0.3493, + "step": 27880 + }, + { + "epoch": 4.042468384244664, + "grad_norm": 1.111978530883789, + "learning_rate": 9.588346137121322e-06, + "loss": 0.4287, + "step": 27890 + }, + { + "epoch": 4.043917817154038, + "grad_norm": 1.9733977317810059, + "learning_rate": 9.573851282794608e-06, + "loss": 0.3812, + "step": 27900 + }, + { + "epoch": 4.045367250063412, + "grad_norm": 0.8419144153594971, + "learning_rate": 9.559356428467894e-06, + "loss": 0.3388, + "step": 27910 + }, + { + "epoch": 4.0468166829727865, + "grad_norm": 1.2305309772491455, + "learning_rate": 9.54486157414118e-06, + "loss": 0.4008, + "step": 27920 + }, + { + "epoch": 4.048266115882161, + "grad_norm": 1.153856873512268, + "learning_rate": 9.530366719814467e-06, + "loss": 0.4178, + "step": 27930 + }, + { + "epoch": 4.049715548791536, + "grad_norm": 0.8963118195533752, + "learning_rate": 9.515871865487753e-06, + "loss": 0.334, + "step": 27940 + }, + { + "epoch": 4.05116498170091, + "grad_norm": 1.9021250009536743, + "learning_rate": 9.501377011161039e-06, + "loss": 0.3836, + "step": 27950 + }, + { + "epoch": 4.052614414610284, + "grad_norm": 1.6093897819519043, + "learning_rate": 9.486882156834325e-06, + "loss": 0.3618, + "step": 27960 + }, + { + "epoch": 4.054063847519658, + "grad_norm": 2.0067999362945557, + "learning_rate": 9.472387302507609e-06, + "loss": 0.3485, + "step": 27970 + }, + { + "epoch": 4.055513280429032, + "grad_norm": 1.1295496225357056, + "learning_rate": 9.457892448180897e-06, + "loss": 0.4286, + "step": 27980 + }, + { + "epoch": 4.0569627133384065, + "grad_norm": 1.2019520998001099, + "learning_rate": 9.443397593854183e-06, + "loss": 0.3784, + "step": 27990 + }, + { + "epoch": 4.058412146247781, + "grad_norm": 1.1256000995635986, + "learning_rate": 9.428902739527467e-06, + "loss": 0.401, + "step": 28000 + }, + { + "epoch": 4.058412146247781, + "eval_loss": 0.7432445287704468, + "eval_runtime": 670.6325, + "eval_samples_per_second": 51.437, + "eval_steps_per_second": 2.572, + "eval_token_accuracy": 0.0003882719357559529, + "step": 28000 + }, + { + "epoch": 4.059861579157155, + "grad_norm": 1.4858769178390503, + "learning_rate": 9.414407885200755e-06, + "loss": 0.3862, + "step": 28010 + }, + { + "epoch": 4.061311012066529, + "grad_norm": 3.2917826175689697, + "learning_rate": 9.39991303087404e-06, + "loss": 0.3758, + "step": 28020 + }, + { + "epoch": 4.062760444975903, + "grad_norm": 1.42050302028656, + "learning_rate": 9.385418176547327e-06, + "loss": 0.3977, + "step": 28030 + }, + { + "epoch": 4.064209877885277, + "grad_norm": 2.3631317615509033, + "learning_rate": 9.370923322220611e-06, + "loss": 0.3469, + "step": 28040 + }, + { + "epoch": 4.0656593107946515, + "grad_norm": 0.9823781847953796, + "learning_rate": 9.356428467893898e-06, + "loss": 0.3718, + "step": 28050 + }, + { + "epoch": 4.067108743704026, + "grad_norm": 2.334778070449829, + "learning_rate": 9.341933613567185e-06, + "loss": 0.3862, + "step": 28060 + }, + { + "epoch": 4.0685581766134, + "grad_norm": 1.390881896018982, + "learning_rate": 9.32743875924047e-06, + "loss": 0.3643, + "step": 28070 + }, + { + "epoch": 4.070007609522774, + "grad_norm": 1.4181934595108032, + "learning_rate": 9.312943904913756e-06, + "loss": 0.4207, + "step": 28080 + }, + { + "epoch": 4.071457042432148, + "grad_norm": 0.8947294354438782, + "learning_rate": 9.298449050587042e-06, + "loss": 0.3836, + "step": 28090 + }, + { + "epoch": 4.072906475341522, + "grad_norm": 0.939887285232544, + "learning_rate": 9.283954196260328e-06, + "loss": 0.3692, + "step": 28100 + }, + { + "epoch": 4.074355908250896, + "grad_norm": 3.092376470565796, + "learning_rate": 9.269459341933614e-06, + "loss": 0.4199, + "step": 28110 + }, + { + "epoch": 4.075805341160271, + "grad_norm": 1.2078293561935425, + "learning_rate": 9.2549644876069e-06, + "loss": 0.3482, + "step": 28120 + }, + { + "epoch": 4.077254774069646, + "grad_norm": 2.018075704574585, + "learning_rate": 9.240469633280186e-06, + "loss": 0.3607, + "step": 28130 + }, + { + "epoch": 4.07870420697902, + "grad_norm": 1.0871379375457764, + "learning_rate": 9.225974778953472e-06, + "loss": 0.3976, + "step": 28140 + }, + { + "epoch": 4.080153639888394, + "grad_norm": 1.2211644649505615, + "learning_rate": 9.211479924626758e-06, + "loss": 0.3996, + "step": 28150 + }, + { + "epoch": 4.081603072797768, + "grad_norm": 4.177424907684326, + "learning_rate": 9.196985070300044e-06, + "loss": 0.4018, + "step": 28160 + }, + { + "epoch": 4.083052505707142, + "grad_norm": 0.9837602972984314, + "learning_rate": 9.18249021597333e-06, + "loss": 0.3776, + "step": 28170 + }, + { + "epoch": 4.0845019386165164, + "grad_norm": 1.2474371194839478, + "learning_rate": 9.167995361646616e-06, + "loss": 0.3645, + "step": 28180 + }, + { + "epoch": 4.085951371525891, + "grad_norm": 0.8499113917350769, + "learning_rate": 9.153500507319902e-06, + "loss": 0.3986, + "step": 28190 + }, + { + "epoch": 4.087400804435265, + "grad_norm": 1.1272683143615723, + "learning_rate": 9.139005652993188e-06, + "loss": 0.3665, + "step": 28200 + }, + { + "epoch": 4.088850237344639, + "grad_norm": 0.9529480338096619, + "learning_rate": 9.124510798666473e-06, + "loss": 0.3491, + "step": 28210 + }, + { + "epoch": 4.090299670254013, + "grad_norm": 2.408130645751953, + "learning_rate": 9.11001594433976e-06, + "loss": 0.3772, + "step": 28220 + }, + { + "epoch": 4.091749103163387, + "grad_norm": 1.2539433240890503, + "learning_rate": 9.095521090013045e-06, + "loss": 0.4046, + "step": 28230 + }, + { + "epoch": 4.093198536072761, + "grad_norm": 2.582306146621704, + "learning_rate": 9.081026235686333e-06, + "loss": 0.3939, + "step": 28240 + }, + { + "epoch": 4.094647968982136, + "grad_norm": 1.1347997188568115, + "learning_rate": 9.066531381359619e-06, + "loss": 0.3405, + "step": 28250 + }, + { + "epoch": 4.09609740189151, + "grad_norm": 2.0996315479278564, + "learning_rate": 9.052036527032903e-06, + "loss": 0.3538, + "step": 28260 + }, + { + "epoch": 4.097546834800884, + "grad_norm": 1.378403663635254, + "learning_rate": 9.03754167270619e-06, + "loss": 0.3505, + "step": 28270 + }, + { + "epoch": 4.098996267710258, + "grad_norm": 1.0263172388076782, + "learning_rate": 9.023046818379475e-06, + "loss": 0.3402, + "step": 28280 + }, + { + "epoch": 4.100445700619632, + "grad_norm": 1.7301150560379028, + "learning_rate": 9.008551964052761e-06, + "loss": 0.3679, + "step": 28290 + }, + { + "epoch": 4.101895133529006, + "grad_norm": 1.9480246305465698, + "learning_rate": 8.994057109726047e-06, + "loss": 0.4301, + "step": 28300 + }, + { + "epoch": 4.103344566438381, + "grad_norm": 1.4775640964508057, + "learning_rate": 8.979562255399333e-06, + "loss": 0.3665, + "step": 28310 + }, + { + "epoch": 4.104793999347756, + "grad_norm": 2.209846258163452, + "learning_rate": 8.965067401072621e-06, + "loss": 0.3446, + "step": 28320 + }, + { + "epoch": 4.10624343225713, + "grad_norm": 1.0079455375671387, + "learning_rate": 8.950572546745905e-06, + "loss": 0.3878, + "step": 28330 + }, + { + "epoch": 4.107692865166504, + "grad_norm": 0.8966060876846313, + "learning_rate": 8.936077692419191e-06, + "loss": 0.3611, + "step": 28340 + }, + { + "epoch": 4.109142298075878, + "grad_norm": 1.1710530519485474, + "learning_rate": 8.921582838092477e-06, + "loss": 0.3578, + "step": 28350 + }, + { + "epoch": 4.110591730985252, + "grad_norm": 1.261860966682434, + "learning_rate": 8.907087983765763e-06, + "loss": 0.4166, + "step": 28360 + }, + { + "epoch": 4.112041163894626, + "grad_norm": 1.0835785865783691, + "learning_rate": 8.89259312943905e-06, + "loss": 0.3574, + "step": 28370 + }, + { + "epoch": 4.1134905968040005, + "grad_norm": 2.9820594787597656, + "learning_rate": 8.878098275112336e-06, + "loss": 0.3804, + "step": 28380 + }, + { + "epoch": 4.114940029713375, + "grad_norm": 3.3930695056915283, + "learning_rate": 8.863603420785622e-06, + "loss": 0.3602, + "step": 28390 + }, + { + "epoch": 4.116389462622749, + "grad_norm": 2.611999034881592, + "learning_rate": 8.849108566458908e-06, + "loss": 0.3743, + "step": 28400 + }, + { + "epoch": 4.117838895532123, + "grad_norm": 1.090427279472351, + "learning_rate": 8.834613712132194e-06, + "loss": 0.3773, + "step": 28410 + }, + { + "epoch": 4.119288328441497, + "grad_norm": 1.434935212135315, + "learning_rate": 8.820118857805478e-06, + "loss": 0.3808, + "step": 28420 + }, + { + "epoch": 4.120737761350871, + "grad_norm": 6.215574264526367, + "learning_rate": 8.805624003478766e-06, + "loss": 0.3795, + "step": 28430 + }, + { + "epoch": 4.1221871942602455, + "grad_norm": 1.3055042028427124, + "learning_rate": 8.791129149152052e-06, + "loss": 0.376, + "step": 28440 + }, + { + "epoch": 4.12363662716962, + "grad_norm": 1.7235387563705444, + "learning_rate": 8.776634294825338e-06, + "loss": 0.4442, + "step": 28450 + }, + { + "epoch": 4.125086060078994, + "grad_norm": 2.1163599491119385, + "learning_rate": 8.762139440498624e-06, + "loss": 0.3433, + "step": 28460 + }, + { + "epoch": 4.126535492988368, + "grad_norm": 1.0409663915634155, + "learning_rate": 8.747644586171908e-06, + "loss": 0.3704, + "step": 28470 + }, + { + "epoch": 4.127984925897742, + "grad_norm": 1.1953526735305786, + "learning_rate": 8.733149731845196e-06, + "loss": 0.3988, + "step": 28480 + }, + { + "epoch": 4.129434358807116, + "grad_norm": 3.2460741996765137, + "learning_rate": 8.71865487751848e-06, + "loss": 0.3214, + "step": 28490 + }, + { + "epoch": 4.130883791716491, + "grad_norm": 1.576264500617981, + "learning_rate": 8.704160023191767e-06, + "loss": 0.3375, + "step": 28500 + }, + { + "epoch": 4.1323332246258655, + "grad_norm": 3.3589694499969482, + "learning_rate": 8.689665168865054e-06, + "loss": 0.3652, + "step": 28510 + }, + { + "epoch": 4.13378265753524, + "grad_norm": 0.7416440844535828, + "learning_rate": 8.675170314538339e-06, + "loss": 0.3622, + "step": 28520 + }, + { + "epoch": 4.135232090444614, + "grad_norm": 1.1319648027420044, + "learning_rate": 8.660675460211626e-06, + "loss": 0.371, + "step": 28530 + }, + { + "epoch": 4.136681523353988, + "grad_norm": 1.8906817436218262, + "learning_rate": 8.64618060588491e-06, + "loss": 0.3673, + "step": 28540 + }, + { + "epoch": 4.138130956263362, + "grad_norm": 0.9824037551879883, + "learning_rate": 8.631685751558197e-06, + "loss": 0.3147, + "step": 28550 + }, + { + "epoch": 4.139580389172736, + "grad_norm": 1.198995590209961, + "learning_rate": 8.617190897231483e-06, + "loss": 0.3112, + "step": 28560 + }, + { + "epoch": 4.1410298220821105, + "grad_norm": 1.260464072227478, + "learning_rate": 8.602696042904769e-06, + "loss": 0.326, + "step": 28570 + }, + { + "epoch": 4.142479254991485, + "grad_norm": 1.0763384103775024, + "learning_rate": 8.588201188578057e-06, + "loss": 0.3248, + "step": 28580 + }, + { + "epoch": 4.143928687900859, + "grad_norm": 0.7927284836769104, + "learning_rate": 8.573706334251341e-06, + "loss": 0.3897, + "step": 28590 + }, + { + "epoch": 4.145378120810233, + "grad_norm": 0.9664852619171143, + "learning_rate": 8.559211479924627e-06, + "loss": 0.3691, + "step": 28600 + }, + { + "epoch": 4.146827553719607, + "grad_norm": 1.0483916997909546, + "learning_rate": 8.544716625597913e-06, + "loss": 0.3442, + "step": 28610 + }, + { + "epoch": 4.148276986628981, + "grad_norm": 0.9723681807518005, + "learning_rate": 8.5302217712712e-06, + "loss": 0.3085, + "step": 28620 + }, + { + "epoch": 4.149726419538355, + "grad_norm": 1.37981116771698, + "learning_rate": 8.515726916944485e-06, + "loss": 0.4007, + "step": 28630 + }, + { + "epoch": 4.15117585244773, + "grad_norm": 2.6564297676086426, + "learning_rate": 8.501232062617771e-06, + "loss": 0.3669, + "step": 28640 + }, + { + "epoch": 4.152625285357104, + "grad_norm": 1.5353738069534302, + "learning_rate": 8.486737208291057e-06, + "loss": 0.4073, + "step": 28650 + }, + { + "epoch": 4.154074718266478, + "grad_norm": 1.42356538772583, + "learning_rate": 8.472242353964343e-06, + "loss": 0.421, + "step": 28660 + }, + { + "epoch": 4.155524151175852, + "grad_norm": 1.5719921588897705, + "learning_rate": 8.45774749963763e-06, + "loss": 0.3538, + "step": 28670 + }, + { + "epoch": 4.156973584085227, + "grad_norm": 1.324820876121521, + "learning_rate": 8.443252645310914e-06, + "loss": 0.4142, + "step": 28680 + }, + { + "epoch": 4.158423016994601, + "grad_norm": 1.1727113723754883, + "learning_rate": 8.428757790984202e-06, + "loss": 0.3242, + "step": 28690 + }, + { + "epoch": 4.159872449903975, + "grad_norm": 1.2460103034973145, + "learning_rate": 8.414262936657488e-06, + "loss": 0.345, + "step": 28700 + }, + { + "epoch": 4.16132188281335, + "grad_norm": 0.9975523352622986, + "learning_rate": 8.399768082330774e-06, + "loss": 0.36, + "step": 28710 + }, + { + "epoch": 4.162771315722724, + "grad_norm": 0.8736883401870728, + "learning_rate": 8.38527322800406e-06, + "loss": 0.3209, + "step": 28720 + }, + { + "epoch": 4.164220748632098, + "grad_norm": 1.1812388896942139, + "learning_rate": 8.370778373677344e-06, + "loss": 0.4303, + "step": 28730 + }, + { + "epoch": 4.165670181541472, + "grad_norm": 3.161813259124756, + "learning_rate": 8.356283519350632e-06, + "loss": 0.4146, + "step": 28740 + }, + { + "epoch": 4.167119614450846, + "grad_norm": 2.7170424461364746, + "learning_rate": 8.341788665023916e-06, + "loss": 0.3542, + "step": 28750 + }, + { + "epoch": 4.16856904736022, + "grad_norm": 1.185248613357544, + "learning_rate": 8.327293810697202e-06, + "loss": 0.3726, + "step": 28760 + }, + { + "epoch": 4.1700184802695945, + "grad_norm": 2.6338613033294678, + "learning_rate": 8.31279895637049e-06, + "loss": 0.3621, + "step": 28770 + }, + { + "epoch": 4.171467913178969, + "grad_norm": 1.1241165399551392, + "learning_rate": 8.298304102043774e-06, + "loss": 0.3779, + "step": 28780 + }, + { + "epoch": 4.172917346088343, + "grad_norm": 1.1666885614395142, + "learning_rate": 8.283809247717062e-06, + "loss": 0.3321, + "step": 28790 + }, + { + "epoch": 4.174366778997717, + "grad_norm": 1.3057632446289062, + "learning_rate": 8.269314393390346e-06, + "loss": 0.3854, + "step": 28800 + }, + { + "epoch": 4.175816211907091, + "grad_norm": 1.140620231628418, + "learning_rate": 8.254819539063632e-06, + "loss": 0.3756, + "step": 28810 + }, + { + "epoch": 4.177265644816465, + "grad_norm": 1.1446179151535034, + "learning_rate": 8.240324684736919e-06, + "loss": 0.3853, + "step": 28820 + }, + { + "epoch": 4.1787150777258395, + "grad_norm": 1.6761678457260132, + "learning_rate": 8.225829830410205e-06, + "loss": 0.3084, + "step": 28830 + }, + { + "epoch": 4.180164510635214, + "grad_norm": 1.3657466173171997, + "learning_rate": 8.21133497608349e-06, + "loss": 0.3652, + "step": 28840 + }, + { + "epoch": 4.181613943544588, + "grad_norm": 2.4275379180908203, + "learning_rate": 8.196840121756777e-06, + "loss": 0.3708, + "step": 28850 + }, + { + "epoch": 4.183063376453962, + "grad_norm": 1.454341173171997, + "learning_rate": 8.182345267430063e-06, + "loss": 0.319, + "step": 28860 + }, + { + "epoch": 4.184512809363337, + "grad_norm": 0.9970882534980774, + "learning_rate": 8.167850413103349e-06, + "loss": 0.3553, + "step": 28870 + }, + { + "epoch": 4.185962242272711, + "grad_norm": 2.611583709716797, + "learning_rate": 8.153355558776635e-06, + "loss": 0.3761, + "step": 28880 + }, + { + "epoch": 4.187411675182085, + "grad_norm": 2.846203327178955, + "learning_rate": 8.138860704449921e-06, + "loss": 0.3141, + "step": 28890 + }, + { + "epoch": 4.1888611080914595, + "grad_norm": 4.5427422523498535, + "learning_rate": 8.124365850123207e-06, + "loss": 0.3629, + "step": 28900 + }, + { + "epoch": 4.190310541000834, + "grad_norm": 0.8743450045585632, + "learning_rate": 8.109870995796493e-06, + "loss": 0.3784, + "step": 28910 + }, + { + "epoch": 4.191759973910208, + "grad_norm": 1.2091394662857056, + "learning_rate": 8.095376141469779e-06, + "loss": 0.3752, + "step": 28920 + }, + { + "epoch": 4.193209406819582, + "grad_norm": 0.9753203988075256, + "learning_rate": 8.080881287143065e-06, + "loss": 0.3474, + "step": 28930 + }, + { + "epoch": 4.194658839728956, + "grad_norm": 1.3854143619537354, + "learning_rate": 8.06638643281635e-06, + "loss": 0.3687, + "step": 28940 + }, + { + "epoch": 4.19610827263833, + "grad_norm": 0.9765649437904358, + "learning_rate": 8.051891578489637e-06, + "loss": 0.317, + "step": 28950 + }, + { + "epoch": 4.1975577055477045, + "grad_norm": 1.0090285539627075, + "learning_rate": 8.037396724162923e-06, + "loss": 0.3461, + "step": 28960 + }, + { + "epoch": 4.199007138457079, + "grad_norm": 4.256155014038086, + "learning_rate": 8.022901869836208e-06, + "loss": 0.3616, + "step": 28970 + }, + { + "epoch": 4.200456571366453, + "grad_norm": 1.2576302289962769, + "learning_rate": 8.008407015509495e-06, + "loss": 0.4056, + "step": 28980 + }, + { + "epoch": 4.201906004275827, + "grad_norm": 1.1981736421585083, + "learning_rate": 7.99391216118278e-06, + "loss": 0.3322, + "step": 28990 + }, + { + "epoch": 4.203355437185201, + "grad_norm": 2.0531351566314697, + "learning_rate": 7.979417306856067e-06, + "loss": 0.3387, + "step": 29000 + }, + { + "epoch": 4.203355437185201, + "eval_loss": 0.7457379698753357, + "eval_runtime": 671.0276, + "eval_samples_per_second": 51.406, + "eval_steps_per_second": 2.571, + "eval_token_accuracy": 0.00039100624516268497, + "step": 29000 + }, + { + "epoch": 4.204804870094575, + "grad_norm": 1.163179874420166, + "learning_rate": 7.964922452529352e-06, + "loss": 0.3684, + "step": 29010 + }, + { + "epoch": 4.206254303003949, + "grad_norm": 0.9141797423362732, + "learning_rate": 7.950427598202638e-06, + "loss": 0.3168, + "step": 29020 + }, + { + "epoch": 4.207703735913324, + "grad_norm": 4.031828880310059, + "learning_rate": 7.935932743875926e-06, + "loss": 0.3199, + "step": 29030 + }, + { + "epoch": 4.209153168822698, + "grad_norm": 0.893057644367218, + "learning_rate": 7.92143788954921e-06, + "loss": 0.3713, + "step": 29040 + }, + { + "epoch": 4.210602601732072, + "grad_norm": 2.213886022567749, + "learning_rate": 7.906943035222496e-06, + "loss": 0.416, + "step": 29050 + }, + { + "epoch": 4.212052034641447, + "grad_norm": 1.2855664491653442, + "learning_rate": 7.892448180895782e-06, + "loss": 0.3832, + "step": 29060 + }, + { + "epoch": 4.213501467550821, + "grad_norm": 3.5074546337127686, + "learning_rate": 7.877953326569068e-06, + "loss": 0.3621, + "step": 29070 + }, + { + "epoch": 4.214950900460195, + "grad_norm": 0.9053571224212646, + "learning_rate": 7.863458472242354e-06, + "loss": 0.3863, + "step": 29080 + }, + { + "epoch": 4.216400333369569, + "grad_norm": 1.0677660703659058, + "learning_rate": 7.84896361791564e-06, + "loss": 0.3479, + "step": 29090 + }, + { + "epoch": 4.217849766278944, + "grad_norm": 1.8787217140197754, + "learning_rate": 7.834468763588926e-06, + "loss": 0.3097, + "step": 29100 + }, + { + "epoch": 4.219299199188318, + "grad_norm": 1.2158981561660767, + "learning_rate": 7.819973909262212e-06, + "loss": 0.3648, + "step": 29110 + }, + { + "epoch": 4.220748632097692, + "grad_norm": 1.449982762336731, + "learning_rate": 7.805479054935498e-06, + "loss": 0.4157, + "step": 29120 + }, + { + "epoch": 4.222198065007066, + "grad_norm": 1.1894491910934448, + "learning_rate": 7.790984200608784e-06, + "loss": 0.4087, + "step": 29130 + }, + { + "epoch": 4.22364749791644, + "grad_norm": 1.0339592695236206, + "learning_rate": 7.77648934628207e-06, + "loss": 0.4189, + "step": 29140 + }, + { + "epoch": 4.225096930825814, + "grad_norm": 1.3686072826385498, + "learning_rate": 7.761994491955357e-06, + "loss": 0.3937, + "step": 29150 + }, + { + "epoch": 4.2265463637351885, + "grad_norm": 1.0015712976455688, + "learning_rate": 7.747499637628643e-06, + "loss": 0.3284, + "step": 29160 + }, + { + "epoch": 4.227995796644563, + "grad_norm": 1.5643901824951172, + "learning_rate": 7.733004783301929e-06, + "loss": 0.38, + "step": 29170 + }, + { + "epoch": 4.229445229553937, + "grad_norm": 2.5827622413635254, + "learning_rate": 7.718509928975213e-06, + "loss": 0.3809, + "step": 29180 + }, + { + "epoch": 4.230894662463311, + "grad_norm": 1.25447678565979, + "learning_rate": 7.7040150746485e-06, + "loss": 0.3496, + "step": 29190 + }, + { + "epoch": 4.232344095372685, + "grad_norm": 1.0713152885437012, + "learning_rate": 7.689520220321785e-06, + "loss": 0.3992, + "step": 29200 + }, + { + "epoch": 4.233793528282059, + "grad_norm": 1.319899559020996, + "learning_rate": 7.675025365995073e-06, + "loss": 0.355, + "step": 29210 + }, + { + "epoch": 4.2352429611914335, + "grad_norm": 4.824685096740723, + "learning_rate": 7.660530511668359e-06, + "loss": 0.3656, + "step": 29220 + }, + { + "epoch": 4.236692394100808, + "grad_norm": 1.3379300832748413, + "learning_rate": 7.646035657341643e-06, + "loss": 0.3793, + "step": 29230 + }, + { + "epoch": 4.238141827010182, + "grad_norm": 0.7790377140045166, + "learning_rate": 7.631540803014931e-06, + "loss": 0.3265, + "step": 29240 + }, + { + "epoch": 4.239591259919557, + "grad_norm": 2.0793423652648926, + "learning_rate": 7.617045948688216e-06, + "loss": 0.4222, + "step": 29250 + }, + { + "epoch": 4.241040692828931, + "grad_norm": 2.919572114944458, + "learning_rate": 7.6025510943615015e-06, + "loss": 0.3479, + "step": 29260 + }, + { + "epoch": 4.242490125738305, + "grad_norm": 2.324187755584717, + "learning_rate": 7.588056240034788e-06, + "loss": 0.3964, + "step": 29270 + }, + { + "epoch": 4.243939558647679, + "grad_norm": 1.03130304813385, + "learning_rate": 7.573561385708074e-06, + "loss": 0.3583, + "step": 29280 + }, + { + "epoch": 4.2453889915570535, + "grad_norm": 1.1990162134170532, + "learning_rate": 7.5590665313813605e-06, + "loss": 0.3342, + "step": 29290 + }, + { + "epoch": 4.246838424466428, + "grad_norm": 0.9026666879653931, + "learning_rate": 7.544571677054646e-06, + "loss": 0.3746, + "step": 29300 + }, + { + "epoch": 4.248287857375802, + "grad_norm": 1.1271377801895142, + "learning_rate": 7.530076822727932e-06, + "loss": 0.3457, + "step": 29310 + }, + { + "epoch": 4.249737290285176, + "grad_norm": 0.9937952756881714, + "learning_rate": 7.515581968401219e-06, + "loss": 0.343, + "step": 29320 + }, + { + "epoch": 4.25118672319455, + "grad_norm": 1.5706512928009033, + "learning_rate": 7.501087114074504e-06, + "loss": 0.398, + "step": 29330 + }, + { + "epoch": 4.252636156103924, + "grad_norm": 1.5114712715148926, + "learning_rate": 7.486592259747791e-06, + "loss": 0.4031, + "step": 29340 + }, + { + "epoch": 4.2540855890132985, + "grad_norm": 1.1789222955703735, + "learning_rate": 7.472097405421076e-06, + "loss": 0.358, + "step": 29350 + }, + { + "epoch": 4.255535021922673, + "grad_norm": 1.2037668228149414, + "learning_rate": 7.457602551094361e-06, + "loss": 0.3892, + "step": 29360 + }, + { + "epoch": 4.256984454832047, + "grad_norm": 2.206874370574951, + "learning_rate": 7.443107696767648e-06, + "loss": 0.4155, + "step": 29370 + }, + { + "epoch": 4.258433887741421, + "grad_norm": 1.2747567892074585, + "learning_rate": 7.428612842440934e-06, + "loss": 0.3568, + "step": 29380 + }, + { + "epoch": 4.259883320650795, + "grad_norm": 3.8064351081848145, + "learning_rate": 7.414117988114219e-06, + "loss": 0.3754, + "step": 29390 + }, + { + "epoch": 4.261332753560169, + "grad_norm": 1.3781629800796509, + "learning_rate": 7.399623133787506e-06, + "loss": 0.3825, + "step": 29400 + }, + { + "epoch": 4.262782186469543, + "grad_norm": 1.3373839855194092, + "learning_rate": 7.3851282794607914e-06, + "loss": 0.3651, + "step": 29410 + }, + { + "epoch": 4.264231619378918, + "grad_norm": 1.3949230909347534, + "learning_rate": 7.370633425134078e-06, + "loss": 0.3145, + "step": 29420 + }, + { + "epoch": 4.265681052288292, + "grad_norm": 1.4587570428848267, + "learning_rate": 7.3561385708073635e-06, + "loss": 0.4271, + "step": 29430 + }, + { + "epoch": 4.267130485197667, + "grad_norm": 1.9194419384002686, + "learning_rate": 7.34164371648065e-06, + "loss": 0.4094, + "step": 29440 + }, + { + "epoch": 4.268579918107041, + "grad_norm": 1.3405871391296387, + "learning_rate": 7.3271488621539365e-06, + "loss": 0.3578, + "step": 29450 + }, + { + "epoch": 4.270029351016415, + "grad_norm": 2.9382286071777344, + "learning_rate": 7.312654007827222e-06, + "loss": 0.3905, + "step": 29460 + }, + { + "epoch": 4.271478783925789, + "grad_norm": 1.1766321659088135, + "learning_rate": 7.298159153500507e-06, + "loss": 0.3477, + "step": 29470 + }, + { + "epoch": 4.272928216835163, + "grad_norm": 1.8907839059829712, + "learning_rate": 7.283664299173794e-06, + "loss": 0.3483, + "step": 29480 + }, + { + "epoch": 4.274377649744538, + "grad_norm": 0.7925496697425842, + "learning_rate": 7.269169444847079e-06, + "loss": 0.3633, + "step": 29490 + }, + { + "epoch": 4.275827082653912, + "grad_norm": 2.068272352218628, + "learning_rate": 7.254674590520366e-06, + "loss": 0.4196, + "step": 29500 + }, + { + "epoch": 4.277276515563286, + "grad_norm": 1.5290910005569458, + "learning_rate": 7.240179736193652e-06, + "loss": 0.3769, + "step": 29510 + }, + { + "epoch": 4.27872594847266, + "grad_norm": 1.2152941226959229, + "learning_rate": 7.225684881866937e-06, + "loss": 0.3677, + "step": 29520 + }, + { + "epoch": 4.280175381382034, + "grad_norm": 1.2992326021194458, + "learning_rate": 7.211190027540224e-06, + "loss": 0.325, + "step": 29530 + }, + { + "epoch": 4.281624814291408, + "grad_norm": 1.1786214113235474, + "learning_rate": 7.196695173213509e-06, + "loss": 0.3846, + "step": 29540 + }, + { + "epoch": 4.283074247200783, + "grad_norm": 1.2369322776794434, + "learning_rate": 7.182200318886796e-06, + "loss": 0.3818, + "step": 29550 + }, + { + "epoch": 4.284523680110157, + "grad_norm": 0.9638352990150452, + "learning_rate": 7.167705464560081e-06, + "loss": 0.3796, + "step": 29560 + }, + { + "epoch": 4.285973113019531, + "grad_norm": 3.598480463027954, + "learning_rate": 7.1532106102333674e-06, + "loss": 0.4363, + "step": 29570 + }, + { + "epoch": 4.287422545928905, + "grad_norm": 0.931704044342041, + "learning_rate": 7.1387157559066535e-06, + "loss": 0.3293, + "step": 29580 + }, + { + "epoch": 4.288871978838279, + "grad_norm": 2.29419207572937, + "learning_rate": 7.1242209015799396e-06, + "loss": 0.4097, + "step": 29590 + }, + { + "epoch": 4.290321411747653, + "grad_norm": 1.5176444053649902, + "learning_rate": 7.109726047253225e-06, + "loss": 0.388, + "step": 29600 + }, + { + "epoch": 4.291770844657028, + "grad_norm": 2.4443917274475098, + "learning_rate": 7.095231192926512e-06, + "loss": 0.3435, + "step": 29610 + }, + { + "epoch": 4.293220277566402, + "grad_norm": 0.9792112708091736, + "learning_rate": 7.080736338599797e-06, + "loss": 0.4226, + "step": 29620 + }, + { + "epoch": 4.294669710475777, + "grad_norm": 2.7311971187591553, + "learning_rate": 7.066241484273084e-06, + "loss": 0.3917, + "step": 29630 + }, + { + "epoch": 4.296119143385151, + "grad_norm": 1.7587502002716064, + "learning_rate": 7.05174662994637e-06, + "loss": 0.3605, + "step": 29640 + }, + { + "epoch": 4.297568576294525, + "grad_norm": 1.7742453813552856, + "learning_rate": 7.037251775619655e-06, + "loss": 0.3729, + "step": 29650 + }, + { + "epoch": 4.299018009203899, + "grad_norm": 2.9178965091705322, + "learning_rate": 7.022756921292942e-06, + "loss": 0.3903, + "step": 29660 + }, + { + "epoch": 4.300467442113273, + "grad_norm": 2.7828633785247803, + "learning_rate": 7.008262066966227e-06, + "loss": 0.3831, + "step": 29670 + }, + { + "epoch": 4.3019168750226475, + "grad_norm": 1.2201550006866455, + "learning_rate": 6.993767212639512e-06, + "loss": 0.3784, + "step": 29680 + }, + { + "epoch": 4.303366307932022, + "grad_norm": 1.3287190198898315, + "learning_rate": 6.979272358312799e-06, + "loss": 0.3481, + "step": 29690 + }, + { + "epoch": 4.304815740841396, + "grad_norm": 2.4197843074798584, + "learning_rate": 6.964777503986085e-06, + "loss": 0.4058, + "step": 29700 + }, + { + "epoch": 4.30626517375077, + "grad_norm": 0.7661453485488892, + "learning_rate": 6.950282649659371e-06, + "loss": 0.42, + "step": 29710 + }, + { + "epoch": 4.307714606660144, + "grad_norm": 1.3475944995880127, + "learning_rate": 6.935787795332657e-06, + "loss": 0.383, + "step": 29720 + }, + { + "epoch": 4.309164039569518, + "grad_norm": 1.3460301160812378, + "learning_rate": 6.921292941005943e-06, + "loss": 0.3737, + "step": 29730 + }, + { + "epoch": 4.3106134724788925, + "grad_norm": 1.1138921976089478, + "learning_rate": 6.9067980866792295e-06, + "loss": 0.3157, + "step": 29740 + }, + { + "epoch": 4.312062905388267, + "grad_norm": 1.7547369003295898, + "learning_rate": 6.892303232352515e-06, + "loss": 0.3319, + "step": 29750 + }, + { + "epoch": 4.313512338297641, + "grad_norm": 2.247833013534546, + "learning_rate": 6.877808378025802e-06, + "loss": 0.3575, + "step": 29760 + }, + { + "epoch": 4.314961771207015, + "grad_norm": 1.2264424562454224, + "learning_rate": 6.863313523699088e-06, + "loss": 0.4161, + "step": 29770 + }, + { + "epoch": 4.316411204116389, + "grad_norm": 3.117626428604126, + "learning_rate": 6.848818669372373e-06, + "loss": 0.4135, + "step": 29780 + }, + { + "epoch": 4.317860637025763, + "grad_norm": 1.0561470985412598, + "learning_rate": 6.83432381504566e-06, + "loss": 0.3621, + "step": 29790 + }, + { + "epoch": 4.319310069935138, + "grad_norm": 2.285572052001953, + "learning_rate": 6.819828960718945e-06, + "loss": 0.3992, + "step": 29800 + }, + { + "epoch": 4.3207595028445125, + "grad_norm": 1.8720812797546387, + "learning_rate": 6.80533410639223e-06, + "loss": 0.4166, + "step": 29810 + }, + { + "epoch": 4.322208935753887, + "grad_norm": 1.1257436275482178, + "learning_rate": 6.790839252065517e-06, + "loss": 0.4057, + "step": 29820 + }, + { + "epoch": 4.323658368663261, + "grad_norm": 1.3093199729919434, + "learning_rate": 6.776344397738803e-06, + "loss": 0.396, + "step": 29830 + }, + { + "epoch": 4.325107801572635, + "grad_norm": 1.3871891498565674, + "learning_rate": 6.761849543412089e-06, + "loss": 0.362, + "step": 29840 + }, + { + "epoch": 4.326557234482009, + "grad_norm": 1.465741753578186, + "learning_rate": 6.747354689085375e-06, + "loss": 0.4141, + "step": 29850 + }, + { + "epoch": 4.328006667391383, + "grad_norm": 2.450000047683716, + "learning_rate": 6.7328598347586605e-06, + "loss": 0.3643, + "step": 29860 + }, + { + "epoch": 4.3294561003007574, + "grad_norm": 2.5313851833343506, + "learning_rate": 6.718364980431947e-06, + "loss": 0.3701, + "step": 29870 + }, + { + "epoch": 4.330905533210132, + "grad_norm": 4.837574005126953, + "learning_rate": 6.7038701261052326e-06, + "loss": 0.3432, + "step": 29880 + }, + { + "epoch": 4.332354966119506, + "grad_norm": 1.1457256078720093, + "learning_rate": 6.6893752717785195e-06, + "loss": 0.4141, + "step": 29890 + }, + { + "epoch": 4.33380439902888, + "grad_norm": 2.2787954807281494, + "learning_rate": 6.6748804174518055e-06, + "loss": 0.3558, + "step": 29900 + }, + { + "epoch": 4.335253831938254, + "grad_norm": 1.28756582736969, + "learning_rate": 6.660385563125091e-06, + "loss": 0.331, + "step": 29910 + }, + { + "epoch": 4.336703264847628, + "grad_norm": 1.1767174005508423, + "learning_rate": 6.645890708798378e-06, + "loss": 0.375, + "step": 29920 + }, + { + "epoch": 4.338152697757002, + "grad_norm": 0.9055870771408081, + "learning_rate": 6.631395854471663e-06, + "loss": 0.3931, + "step": 29930 + }, + { + "epoch": 4.339602130666377, + "grad_norm": 0.9743188619613647, + "learning_rate": 6.616901000144948e-06, + "loss": 0.322, + "step": 29940 + }, + { + "epoch": 4.341051563575751, + "grad_norm": 0.8511983156204224, + "learning_rate": 6.602406145818235e-06, + "loss": 0.3795, + "step": 29950 + }, + { + "epoch": 4.342500996485125, + "grad_norm": 2.069638252258301, + "learning_rate": 6.587911291491521e-06, + "loss": 0.4144, + "step": 29960 + }, + { + "epoch": 4.343950429394499, + "grad_norm": 1.1656533479690552, + "learning_rate": 6.573416437164807e-06, + "loss": 0.3852, + "step": 29970 + }, + { + "epoch": 4.345399862303873, + "grad_norm": 0.7122425436973572, + "learning_rate": 6.558921582838093e-06, + "loss": 0.3483, + "step": 29980 + }, + { + "epoch": 4.346849295213248, + "grad_norm": 2.2737338542938232, + "learning_rate": 6.544426728511378e-06, + "loss": 0.3555, + "step": 29990 + }, + { + "epoch": 4.348298728122622, + "grad_norm": 3.028883934020996, + "learning_rate": 6.529931874184665e-06, + "loss": 0.3979, + "step": 30000 + }, + { + "epoch": 4.348298728122622, + "eval_loss": 0.7482135891914368, + "eval_runtime": 670.7938, + "eval_samples_per_second": 51.424, + "eval_steps_per_second": 2.572, + "eval_token_accuracy": 0.00039020203651364614, + "step": 30000 + }, + { + "epoch": 4.349748161031997, + "grad_norm": 5.186731815338135, + "learning_rate": 6.51543701985795e-06, + "loss": 0.3573, + "step": 30010 + }, + { + "epoch": 4.351197593941371, + "grad_norm": 1.2703396081924438, + "learning_rate": 6.5009421655312365e-06, + "loss": 0.3683, + "step": 30020 + }, + { + "epoch": 4.352647026850745, + "grad_norm": 1.4026705026626587, + "learning_rate": 6.4864473112045225e-06, + "loss": 0.3419, + "step": 30030 + }, + { + "epoch": 4.354096459760119, + "grad_norm": 2.418346405029297, + "learning_rate": 6.4719524568778086e-06, + "loss": 0.3841, + "step": 30040 + }, + { + "epoch": 4.355545892669493, + "grad_norm": 1.4166592359542847, + "learning_rate": 6.4574576025510955e-06, + "loss": 0.3962, + "step": 30050 + }, + { + "epoch": 4.356995325578867, + "grad_norm": 3.7106871604919434, + "learning_rate": 6.442962748224381e-06, + "loss": 0.3566, + "step": 30060 + }, + { + "epoch": 4.3584447584882415, + "grad_norm": 2.121419906616211, + "learning_rate": 6.428467893897666e-06, + "loss": 0.3455, + "step": 30070 + }, + { + "epoch": 4.359894191397616, + "grad_norm": 1.0121147632598877, + "learning_rate": 6.413973039570953e-06, + "loss": 0.3533, + "step": 30080 + }, + { + "epoch": 4.36134362430699, + "grad_norm": 1.1510685682296753, + "learning_rate": 6.399478185244239e-06, + "loss": 0.3555, + "step": 30090 + }, + { + "epoch": 4.362793057216364, + "grad_norm": 1.9346082210540771, + "learning_rate": 6.384983330917525e-06, + "loss": 0.3558, + "step": 30100 + }, + { + "epoch": 4.364242490125738, + "grad_norm": 0.8266393542289734, + "learning_rate": 6.370488476590811e-06, + "loss": 0.3293, + "step": 30110 + }, + { + "epoch": 4.365691923035112, + "grad_norm": 3.2215356826782227, + "learning_rate": 6.355993622264096e-06, + "loss": 0.4042, + "step": 30120 + }, + { + "epoch": 4.3671413559444865, + "grad_norm": 1.1464142799377441, + "learning_rate": 6.341498767937383e-06, + "loss": 0.3664, + "step": 30130 + }, + { + "epoch": 4.368590788853861, + "grad_norm": 1.333441972732544, + "learning_rate": 6.327003913610668e-06, + "loss": 0.3791, + "step": 30140 + }, + { + "epoch": 4.370040221763235, + "grad_norm": 2.17280912399292, + "learning_rate": 6.312509059283954e-06, + "loss": 0.3384, + "step": 30150 + }, + { + "epoch": 4.371489654672609, + "grad_norm": 0.8691934943199158, + "learning_rate": 6.29801420495724e-06, + "loss": 0.3567, + "step": 30160 + }, + { + "epoch": 4.372939087581983, + "grad_norm": 3.7443151473999023, + "learning_rate": 6.283519350630526e-06, + "loss": 0.351, + "step": 30170 + }, + { + "epoch": 4.374388520491358, + "grad_norm": 1.8633484840393066, + "learning_rate": 6.269024496303813e-06, + "loss": 0.3379, + "step": 30180 + }, + { + "epoch": 4.375837953400732, + "grad_norm": 1.3033164739608765, + "learning_rate": 6.2545296419770985e-06, + "loss": 0.3508, + "step": 30190 + }, + { + "epoch": 4.3772873863101065, + "grad_norm": 1.2720485925674438, + "learning_rate": 6.2400347876503846e-06, + "loss": 0.3582, + "step": 30200 + }, + { + "epoch": 4.378736819219481, + "grad_norm": 1.2477245330810547, + "learning_rate": 6.22553993332367e-06, + "loss": 0.3825, + "step": 30210 + }, + { + "epoch": 4.380186252128855, + "grad_norm": 0.7797627449035645, + "learning_rate": 6.211045078996957e-06, + "loss": 0.3619, + "step": 30220 + }, + { + "epoch": 4.381635685038229, + "grad_norm": 1.2631372213363647, + "learning_rate": 6.196550224670243e-06, + "loss": 0.4119, + "step": 30230 + }, + { + "epoch": 4.383085117947603, + "grad_norm": 1.0257337093353271, + "learning_rate": 6.182055370343529e-06, + "loss": 0.3778, + "step": 30240 + }, + { + "epoch": 4.384534550856977, + "grad_norm": 1.3090648651123047, + "learning_rate": 6.167560516016814e-06, + "loss": 0.389, + "step": 30250 + }, + { + "epoch": 4.3859839837663515, + "grad_norm": 2.94443941116333, + "learning_rate": 6.1530656616901e-06, + "loss": 0.3542, + "step": 30260 + }, + { + "epoch": 4.387433416675726, + "grad_norm": 2.2124414443969727, + "learning_rate": 6.138570807363386e-06, + "loss": 0.3965, + "step": 30270 + }, + { + "epoch": 4.3888828495851, + "grad_norm": 0.953989565372467, + "learning_rate": 6.124075953036672e-06, + "loss": 0.3712, + "step": 30280 + }, + { + "epoch": 4.390332282494474, + "grad_norm": 1.1157747507095337, + "learning_rate": 6.109581098709958e-06, + "loss": 0.3971, + "step": 30290 + }, + { + "epoch": 4.391781715403848, + "grad_norm": 2.8602752685546875, + "learning_rate": 6.095086244383244e-06, + "loss": 0.3393, + "step": 30300 + }, + { + "epoch": 4.393231148313222, + "grad_norm": 1.1085230112075806, + "learning_rate": 6.08059139005653e-06, + "loss": 0.3075, + "step": 30310 + }, + { + "epoch": 4.394680581222596, + "grad_norm": 1.6991569995880127, + "learning_rate": 6.066096535729816e-06, + "loss": 0.3769, + "step": 30320 + }, + { + "epoch": 4.396130014131971, + "grad_norm": 2.418412923812866, + "learning_rate": 6.051601681403102e-06, + "loss": 0.3531, + "step": 30330 + }, + { + "epoch": 4.397579447041345, + "grad_norm": 3.8710434436798096, + "learning_rate": 6.037106827076388e-06, + "loss": 0.3861, + "step": 30340 + }, + { + "epoch": 4.399028879950719, + "grad_norm": 1.5485104322433472, + "learning_rate": 6.022611972749674e-06, + "loss": 0.3489, + "step": 30350 + }, + { + "epoch": 4.400478312860093, + "grad_norm": 1.4378596544265747, + "learning_rate": 6.0081171184229606e-06, + "loss": 0.3723, + "step": 30360 + }, + { + "epoch": 4.401927745769468, + "grad_norm": 1.7529176473617554, + "learning_rate": 5.993622264096247e-06, + "loss": 0.3547, + "step": 30370 + }, + { + "epoch": 4.403377178678842, + "grad_norm": 5.1034770011901855, + "learning_rate": 5.979127409769532e-06, + "loss": 0.3742, + "step": 30380 + }, + { + "epoch": 4.404826611588216, + "grad_norm": 1.0158805847167969, + "learning_rate": 5.964632555442818e-06, + "loss": 0.2926, + "step": 30390 + }, + { + "epoch": 4.406276044497591, + "grad_norm": 1.960871934890747, + "learning_rate": 5.950137701116104e-06, + "loss": 0.3627, + "step": 30400 + }, + { + "epoch": 4.407725477406965, + "grad_norm": 1.0288090705871582, + "learning_rate": 5.93564284678939e-06, + "loss": 0.3859, + "step": 30410 + }, + { + "epoch": 4.409174910316339, + "grad_norm": 2.829598903656006, + "learning_rate": 5.921147992462676e-06, + "loss": 0.411, + "step": 30420 + }, + { + "epoch": 4.410624343225713, + "grad_norm": 2.1486783027648926, + "learning_rate": 5.906653138135962e-06, + "loss": 0.437, + "step": 30430 + }, + { + "epoch": 4.412073776135087, + "grad_norm": 3.272394895553589, + "learning_rate": 5.892158283809248e-06, + "loss": 0.3843, + "step": 30440 + }, + { + "epoch": 4.413523209044461, + "grad_norm": 2.6068198680877686, + "learning_rate": 5.877663429482534e-06, + "loss": 0.341, + "step": 30450 + }, + { + "epoch": 4.4149726419538355, + "grad_norm": 1.2102731466293335, + "learning_rate": 5.863168575155819e-06, + "loss": 0.3798, + "step": 30460 + }, + { + "epoch": 4.41642207486321, + "grad_norm": 1.095699429512024, + "learning_rate": 5.8486737208291055e-06, + "loss": 0.3586, + "step": 30470 + }, + { + "epoch": 4.417871507772584, + "grad_norm": 0.7822348475456238, + "learning_rate": 5.8341788665023915e-06, + "loss": 0.4051, + "step": 30480 + }, + { + "epoch": 4.419320940681958, + "grad_norm": 1.0242159366607666, + "learning_rate": 5.819684012175678e-06, + "loss": 0.3461, + "step": 30490 + }, + { + "epoch": 4.420770373591332, + "grad_norm": 0.9580021500587463, + "learning_rate": 5.8051891578489645e-06, + "loss": 0.3512, + "step": 30500 + }, + { + "epoch": 4.422219806500706, + "grad_norm": 0.9196089506149292, + "learning_rate": 5.79069430352225e-06, + "loss": 0.3654, + "step": 30510 + }, + { + "epoch": 4.4236692394100805, + "grad_norm": 1.0600301027297974, + "learning_rate": 5.776199449195536e-06, + "loss": 0.36, + "step": 30520 + }, + { + "epoch": 4.425118672319455, + "grad_norm": 1.2911509275436401, + "learning_rate": 5.761704594868822e-06, + "loss": 0.3656, + "step": 30530 + }, + { + "epoch": 4.426568105228829, + "grad_norm": 1.0180575847625732, + "learning_rate": 5.747209740542108e-06, + "loss": 0.3845, + "step": 30540 + }, + { + "epoch": 4.428017538138203, + "grad_norm": 2.772061586380005, + "learning_rate": 5.732714886215394e-06, + "loss": 0.4467, + "step": 30550 + }, + { + "epoch": 4.429466971047578, + "grad_norm": 1.3060688972473145, + "learning_rate": 5.71822003188868e-06, + "loss": 0.4538, + "step": 30560 + }, + { + "epoch": 4.430916403956952, + "grad_norm": 1.272159218788147, + "learning_rate": 5.703725177561966e-06, + "loss": 0.3817, + "step": 30570 + }, + { + "epoch": 4.432365836866326, + "grad_norm": 1.1166160106658936, + "learning_rate": 5.690679808667923e-06, + "loss": 0.3476, + "step": 30580 + }, + { + "epoch": 4.4338152697757005, + "grad_norm": 3.2157490253448486, + "learning_rate": 5.676184954341209e-06, + "loss": 0.3477, + "step": 30590 + }, + { + "epoch": 4.435264702685075, + "grad_norm": 3.557138204574585, + "learning_rate": 5.661690100014496e-06, + "loss": 0.4244, + "step": 30600 + }, + { + "epoch": 4.436714135594449, + "grad_norm": 1.1806175708770752, + "learning_rate": 5.647195245687781e-06, + "loss": 0.3459, + "step": 30610 + }, + { + "epoch": 4.438163568503823, + "grad_norm": 2.0846927165985107, + "learning_rate": 5.632700391361067e-06, + "loss": 0.3608, + "step": 30620 + }, + { + "epoch": 4.439613001413197, + "grad_norm": 1.833366870880127, + "learning_rate": 5.618205537034353e-06, + "loss": 0.3432, + "step": 30630 + }, + { + "epoch": 4.441062434322571, + "grad_norm": 1.0325963497161865, + "learning_rate": 5.603710682707639e-06, + "loss": 0.3604, + "step": 30640 + }, + { + "epoch": 4.4425118672319455, + "grad_norm": 2.9289095401763916, + "learning_rate": 5.589215828380924e-06, + "loss": 0.4177, + "step": 30650 + }, + { + "epoch": 4.44396130014132, + "grad_norm": 5.7591233253479, + "learning_rate": 5.574720974054211e-06, + "loss": 0.3608, + "step": 30660 + }, + { + "epoch": 4.445410733050694, + "grad_norm": 1.84727144241333, + "learning_rate": 5.560226119727497e-06, + "loss": 0.381, + "step": 30670 + }, + { + "epoch": 4.446860165960068, + "grad_norm": 2.1353752613067627, + "learning_rate": 5.545731265400783e-06, + "loss": 0.3827, + "step": 30680 + }, + { + "epoch": 4.448309598869442, + "grad_norm": 0.8009002208709717, + "learning_rate": 5.531236411074069e-06, + "loss": 0.3945, + "step": 30690 + }, + { + "epoch": 4.449759031778816, + "grad_norm": 1.4904606342315674, + "learning_rate": 5.5167415567473546e-06, + "loss": 0.3419, + "step": 30700 + }, + { + "epoch": 4.45120846468819, + "grad_norm": 0.9975236654281616, + "learning_rate": 5.502246702420641e-06, + "loss": 0.3785, + "step": 30710 + }, + { + "epoch": 4.452657897597565, + "grad_norm": 1.2959935665130615, + "learning_rate": 5.487751848093927e-06, + "loss": 0.3375, + "step": 30720 + }, + { + "epoch": 4.454107330506939, + "grad_norm": 0.96599942445755, + "learning_rate": 5.4732569937672136e-06, + "loss": 0.4297, + "step": 30730 + }, + { + "epoch": 4.455556763416313, + "grad_norm": 3.204017400741577, + "learning_rate": 5.458762139440499e-06, + "loss": 0.4341, + "step": 30740 + }, + { + "epoch": 4.457006196325688, + "grad_norm": 2.8199219703674316, + "learning_rate": 5.444267285113785e-06, + "loss": 0.4114, + "step": 30750 + }, + { + "epoch": 4.458455629235062, + "grad_norm": 1.0257399082183838, + "learning_rate": 5.429772430787071e-06, + "loss": 0.3839, + "step": 30760 + }, + { + "epoch": 4.459905062144436, + "grad_norm": 1.6263266801834106, + "learning_rate": 5.415277576460357e-06, + "loss": 0.3479, + "step": 30770 + }, + { + "epoch": 4.46135449505381, + "grad_norm": 1.4419124126434326, + "learning_rate": 5.400782722133642e-06, + "loss": 0.3483, + "step": 30780 + }, + { + "epoch": 4.462803927963185, + "grad_norm": 1.30050790309906, + "learning_rate": 5.386287867806929e-06, + "loss": 0.3943, + "step": 30790 + }, + { + "epoch": 4.464253360872559, + "grad_norm": 1.2906863689422607, + "learning_rate": 5.371793013480215e-06, + "loss": 0.371, + "step": 30800 + }, + { + "epoch": 4.465702793781933, + "grad_norm": 3.1720147132873535, + "learning_rate": 5.357298159153501e-06, + "loss": 0.4104, + "step": 30810 + }, + { + "epoch": 4.467152226691307, + "grad_norm": 3.066178321838379, + "learning_rate": 5.342803304826786e-06, + "loss": 0.3952, + "step": 30820 + }, + { + "epoch": 4.468601659600681, + "grad_norm": 1.2431939840316772, + "learning_rate": 5.328308450500072e-06, + "loss": 0.3695, + "step": 30830 + }, + { + "epoch": 4.470051092510055, + "grad_norm": 1.892478108406067, + "learning_rate": 5.3138135961733585e-06, + "loss": 0.4132, + "step": 30840 + }, + { + "epoch": 4.4715005254194296, + "grad_norm": 3.275731325149536, + "learning_rate": 5.2993187418466445e-06, + "loss": 0.4039, + "step": 30850 + }, + { + "epoch": 4.472949958328804, + "grad_norm": 1.752671718597412, + "learning_rate": 5.284823887519931e-06, + "loss": 0.3586, + "step": 30860 + }, + { + "epoch": 4.474399391238178, + "grad_norm": 1.0183453559875488, + "learning_rate": 5.270329033193217e-06, + "loss": 0.3996, + "step": 30870 + }, + { + "epoch": 4.475848824147552, + "grad_norm": 1.460972547531128, + "learning_rate": 5.255834178866503e-06, + "loss": 0.4028, + "step": 30880 + }, + { + "epoch": 4.477298257056926, + "grad_norm": 0.9415088295936584, + "learning_rate": 5.241339324539789e-06, + "loss": 0.3684, + "step": 30890 + }, + { + "epoch": 4.4787476899663, + "grad_norm": 1.0596305131912231, + "learning_rate": 5.226844470213075e-06, + "loss": 0.3759, + "step": 30900 + }, + { + "epoch": 4.4801971228756745, + "grad_norm": 1.233249306678772, + "learning_rate": 5.21234961588636e-06, + "loss": 0.3843, + "step": 30910 + }, + { + "epoch": 4.48164655578505, + "grad_norm": 0.9509343504905701, + "learning_rate": 5.197854761559647e-06, + "loss": 0.3638, + "step": 30920 + }, + { + "epoch": 4.483095988694424, + "grad_norm": 1.282771348953247, + "learning_rate": 5.183359907232933e-06, + "loss": 0.3996, + "step": 30930 + }, + { + "epoch": 4.484545421603798, + "grad_norm": 0.8737091422080994, + "learning_rate": 5.168865052906219e-06, + "loss": 0.4465, + "step": 30940 + }, + { + "epoch": 4.485994854513172, + "grad_norm": 1.1933667659759521, + "learning_rate": 5.154370198579504e-06, + "loss": 0.4358, + "step": 30950 + }, + { + "epoch": 4.487444287422546, + "grad_norm": 3.5740418434143066, + "learning_rate": 5.13987534425279e-06, + "loss": 0.4641, + "step": 30960 + }, + { + "epoch": 4.48889372033192, + "grad_norm": 1.8437187671661377, + "learning_rate": 5.125380489926076e-06, + "loss": 0.3242, + "step": 30970 + }, + { + "epoch": 4.4903431532412945, + "grad_norm": 0.7734196782112122, + "learning_rate": 5.110885635599362e-06, + "loss": 0.3591, + "step": 30980 + }, + { + "epoch": 4.491792586150669, + "grad_norm": 1.2379523515701294, + "learning_rate": 5.096390781272648e-06, + "loss": 0.3943, + "step": 30990 + }, + { + "epoch": 4.493242019060043, + "grad_norm": 1.5033332109451294, + "learning_rate": 5.0818959269459345e-06, + "loss": 0.3712, + "step": 31000 + }, + { + "epoch": 4.493242019060043, + "eval_loss": 0.7379248738288879, + "eval_runtime": 671.531, + "eval_samples_per_second": 51.368, + "eval_steps_per_second": 2.569, + "eval_token_accuracy": 0.00039438392148864813, + "step": 31000 + }, + { + "epoch": 4.494691451969417, + "grad_norm": 0.9957663416862488, + "learning_rate": 5.0674010726192205e-06, + "loss": 0.3583, + "step": 31010 + }, + { + "epoch": 4.496140884878791, + "grad_norm": 2.0881576538085938, + "learning_rate": 5.0529062182925066e-06, + "loss": 0.457, + "step": 31020 + }, + { + "epoch": 4.497590317788165, + "grad_norm": 2.050384283065796, + "learning_rate": 5.038411363965792e-06, + "loss": 0.3787, + "step": 31030 + }, + { + "epoch": 4.4990397506975395, + "grad_norm": 1.6135433912277222, + "learning_rate": 5.023916509639078e-06, + "loss": 0.3959, + "step": 31040 + }, + { + "epoch": 4.500489183606914, + "grad_norm": 1.5975104570388794, + "learning_rate": 5.009421655312365e-06, + "loss": 0.3822, + "step": 31050 + }, + { + "epoch": 4.501938616516288, + "grad_norm": 0.8000165820121765, + "learning_rate": 4.994926800985651e-06, + "loss": 0.4042, + "step": 31060 + }, + { + "epoch": 4.503388049425662, + "grad_norm": 2.54587984085083, + "learning_rate": 4.980431946658937e-06, + "loss": 0.3736, + "step": 31070 + }, + { + "epoch": 4.504837482335036, + "grad_norm": 1.709140419960022, + "learning_rate": 4.965937092332222e-06, + "loss": 0.3642, + "step": 31080 + }, + { + "epoch": 4.50628691524441, + "grad_norm": 1.4000811576843262, + "learning_rate": 4.951442238005508e-06, + "loss": 0.4011, + "step": 31090 + }, + { + "epoch": 4.507736348153784, + "grad_norm": 1.3397213220596313, + "learning_rate": 4.936947383678794e-06, + "loss": 0.3643, + "step": 31100 + }, + { + "epoch": 4.5091857810631595, + "grad_norm": 1.9789170026779175, + "learning_rate": 4.92245252935208e-06, + "loss": 0.3884, + "step": 31110 + }, + { + "epoch": 4.510635213972533, + "grad_norm": 1.190403938293457, + "learning_rate": 4.907957675025366e-06, + "loss": 0.3527, + "step": 31120 + }, + { + "epoch": 4.512084646881908, + "grad_norm": 2.6957547664642334, + "learning_rate": 4.893462820698652e-06, + "loss": 0.3774, + "step": 31130 + }, + { + "epoch": 4.513534079791282, + "grad_norm": 2.3824095726013184, + "learning_rate": 4.878967966371938e-06, + "loss": 0.3082, + "step": 31140 + }, + { + "epoch": 4.514983512700656, + "grad_norm": 2.6183104515075684, + "learning_rate": 4.864473112045224e-06, + "loss": 0.384, + "step": 31150 + }, + { + "epoch": 4.51643294561003, + "grad_norm": 3.184983491897583, + "learning_rate": 4.84997825771851e-06, + "loss": 0.4113, + "step": 31160 + }, + { + "epoch": 4.517882378519404, + "grad_norm": 1.9513596296310425, + "learning_rate": 4.835483403391796e-06, + "loss": 0.3987, + "step": 31170 + }, + { + "epoch": 4.519331811428779, + "grad_norm": 1.6810030937194824, + "learning_rate": 4.8209885490650826e-06, + "loss": 0.3304, + "step": 31180 + }, + { + "epoch": 4.520781244338153, + "grad_norm": 1.9544678926467896, + "learning_rate": 4.806493694738369e-06, + "loss": 0.3758, + "step": 31190 + }, + { + "epoch": 4.522230677247527, + "grad_norm": 1.5571215152740479, + "learning_rate": 4.791998840411654e-06, + "loss": 0.387, + "step": 31200 + }, + { + "epoch": 4.523680110156901, + "grad_norm": 1.1763455867767334, + "learning_rate": 4.77750398608494e-06, + "loss": 0.3823, + "step": 31210 + }, + { + "epoch": 4.525129543066275, + "grad_norm": 0.8894882798194885, + "learning_rate": 4.763009131758226e-06, + "loss": 0.3887, + "step": 31220 + }, + { + "epoch": 4.526578975975649, + "grad_norm": 1.1514997482299805, + "learning_rate": 4.748514277431512e-06, + "loss": 0.3829, + "step": 31230 + }, + { + "epoch": 4.528028408885024, + "grad_norm": 2.9833333492279053, + "learning_rate": 4.734019423104798e-06, + "loss": 0.3914, + "step": 31240 + }, + { + "epoch": 4.529477841794398, + "grad_norm": 2.5646626949310303, + "learning_rate": 4.719524568778084e-06, + "loss": 0.3732, + "step": 31250 + }, + { + "epoch": 4.530927274703772, + "grad_norm": 1.1297060251235962, + "learning_rate": 4.70502971445137e-06, + "loss": 0.4017, + "step": 31260 + }, + { + "epoch": 4.532376707613146, + "grad_norm": 0.9327825903892517, + "learning_rate": 4.690534860124656e-06, + "loss": 0.4008, + "step": 31270 + }, + { + "epoch": 4.53382614052252, + "grad_norm": 3.265214681625366, + "learning_rate": 4.676040005797942e-06, + "loss": 0.3892, + "step": 31280 + }, + { + "epoch": 4.535275573431894, + "grad_norm": 1.352654218673706, + "learning_rate": 4.6615451514712275e-06, + "loss": 0.3863, + "step": 31290 + }, + { + "epoch": 4.536725006341269, + "grad_norm": 2.224597692489624, + "learning_rate": 4.6470502971445135e-06, + "loss": 0.3573, + "step": 31300 + }, + { + "epoch": 4.538174439250644, + "grad_norm": 3.5135717391967773, + "learning_rate": 4.6325554428178e-06, + "loss": 0.3702, + "step": 31310 + }, + { + "epoch": 4.539623872160018, + "grad_norm": 1.3063206672668457, + "learning_rate": 4.6180605884910865e-06, + "loss": 0.3273, + "step": 31320 + }, + { + "epoch": 4.541073305069392, + "grad_norm": 2.603647232055664, + "learning_rate": 4.603565734164372e-06, + "loss": 0.3497, + "step": 31330 + }, + { + "epoch": 4.542522737978766, + "grad_norm": 2.3394999504089355, + "learning_rate": 4.589070879837658e-06, + "loss": 0.3514, + "step": 31340 + }, + { + "epoch": 4.54397217088814, + "grad_norm": 2.2680232524871826, + "learning_rate": 4.574576025510944e-06, + "loss": 0.3498, + "step": 31350 + }, + { + "epoch": 4.545421603797514, + "grad_norm": 3.2815279960632324, + "learning_rate": 4.56008117118423e-06, + "loss": 0.3533, + "step": 31360 + }, + { + "epoch": 4.5468710367068885, + "grad_norm": 1.1699095964431763, + "learning_rate": 4.545586316857516e-06, + "loss": 0.3879, + "step": 31370 + }, + { + "epoch": 4.548320469616263, + "grad_norm": 1.0367563962936401, + "learning_rate": 4.531091462530802e-06, + "loss": 0.3172, + "step": 31380 + }, + { + "epoch": 4.549769902525637, + "grad_norm": 0.8867725133895874, + "learning_rate": 4.516596608204088e-06, + "loss": 0.3398, + "step": 31390 + }, + { + "epoch": 4.551219335435011, + "grad_norm": 2.650059700012207, + "learning_rate": 4.502101753877374e-06, + "loss": 0.349, + "step": 31400 + }, + { + "epoch": 4.552668768344385, + "grad_norm": 1.0593748092651367, + "learning_rate": 4.487606899550659e-06, + "loss": 0.3405, + "step": 31410 + }, + { + "epoch": 4.554118201253759, + "grad_norm": 2.0027034282684326, + "learning_rate": 4.473112045223945e-06, + "loss": 0.4048, + "step": 31420 + }, + { + "epoch": 4.5555676341631335, + "grad_norm": 0.9918128252029419, + "learning_rate": 4.458617190897231e-06, + "loss": 0.3633, + "step": 31430 + }, + { + "epoch": 4.557017067072508, + "grad_norm": 1.1196961402893066, + "learning_rate": 4.444122336570518e-06, + "loss": 0.3797, + "step": 31440 + }, + { + "epoch": 4.558466499981882, + "grad_norm": 0.9278882145881653, + "learning_rate": 4.429627482243804e-06, + "loss": 0.3514, + "step": 31450 + }, + { + "epoch": 4.559915932891256, + "grad_norm": 3.254080057144165, + "learning_rate": 4.4151326279170895e-06, + "loss": 0.3482, + "step": 31460 + }, + { + "epoch": 4.56136536580063, + "grad_norm": 1.4721205234527588, + "learning_rate": 4.400637773590376e-06, + "loss": 0.3894, + "step": 31470 + }, + { + "epoch": 4.562814798710004, + "grad_norm": 2.2425873279571533, + "learning_rate": 4.386142919263662e-06, + "loss": 0.3487, + "step": 31480 + }, + { + "epoch": 4.564264231619379, + "grad_norm": 1.2692406177520752, + "learning_rate": 4.371648064936948e-06, + "loss": 0.3727, + "step": 31490 + }, + { + "epoch": 4.5657136645287535, + "grad_norm": 3.3129732608795166, + "learning_rate": 4.357153210610234e-06, + "loss": 0.378, + "step": 31500 + }, + { + "epoch": 4.567163097438128, + "grad_norm": 1.16659414768219, + "learning_rate": 4.34265835628352e-06, + "loss": 0.363, + "step": 31510 + }, + { + "epoch": 4.568612530347502, + "grad_norm": 2.6467912197113037, + "learning_rate": 4.328163501956806e-06, + "loss": 0.368, + "step": 31520 + }, + { + "epoch": 4.570061963256876, + "grad_norm": 1.0563710927963257, + "learning_rate": 4.313668647630092e-06, + "loss": 0.3764, + "step": 31530 + }, + { + "epoch": 4.57151139616625, + "grad_norm": 2.4709951877593994, + "learning_rate": 4.299173793303377e-06, + "loss": 0.4012, + "step": 31540 + }, + { + "epoch": 4.572960829075624, + "grad_norm": 1.983708143234253, + "learning_rate": 4.284678938976663e-06, + "loss": 0.3856, + "step": 31550 + }, + { + "epoch": 4.5744102619849984, + "grad_norm": 2.240994691848755, + "learning_rate": 4.270184084649949e-06, + "loss": 0.3514, + "step": 31560 + }, + { + "epoch": 4.575859694894373, + "grad_norm": 0.9982314705848694, + "learning_rate": 4.255689230323236e-06, + "loss": 0.3412, + "step": 31570 + }, + { + "epoch": 4.577309127803747, + "grad_norm": 1.9218682050704956, + "learning_rate": 4.241194375996521e-06, + "loss": 0.4052, + "step": 31580 + }, + { + "epoch": 4.578758560713121, + "grad_norm": 0.9534339308738708, + "learning_rate": 4.226699521669807e-06, + "loss": 0.375, + "step": 31590 + }, + { + "epoch": 4.580207993622495, + "grad_norm": 1.9388453960418701, + "learning_rate": 4.212204667343093e-06, + "loss": 0.2817, + "step": 31600 + }, + { + "epoch": 4.581657426531869, + "grad_norm": 2.833406925201416, + "learning_rate": 4.1977098130163795e-06, + "loss": 0.3794, + "step": 31610 + }, + { + "epoch": 4.583106859441243, + "grad_norm": 3.2463910579681396, + "learning_rate": 4.183214958689665e-06, + "loss": 0.3999, + "step": 31620 + }, + { + "epoch": 4.584556292350618, + "grad_norm": 1.149850606918335, + "learning_rate": 4.168720104362952e-06, + "loss": 0.3686, + "step": 31630 + }, + { + "epoch": 4.586005725259992, + "grad_norm": 1.5383622646331787, + "learning_rate": 4.154225250036238e-06, + "loss": 0.3935, + "step": 31640 + }, + { + "epoch": 4.587455158169366, + "grad_norm": 2.9202868938446045, + "learning_rate": 4.139730395709524e-06, + "loss": 0.4011, + "step": 31650 + }, + { + "epoch": 4.588904591078741, + "grad_norm": 2.718942403793335, + "learning_rate": 4.12523554138281e-06, + "loss": 0.3847, + "step": 31660 + }, + { + "epoch": 4.590354023988114, + "grad_norm": 1.3251688480377197, + "learning_rate": 4.110740687056095e-06, + "loss": 0.3841, + "step": 31670 + }, + { + "epoch": 4.591803456897489, + "grad_norm": 2.328781843185425, + "learning_rate": 4.096245832729381e-06, + "loss": 0.3276, + "step": 31680 + }, + { + "epoch": 4.593252889806863, + "grad_norm": 1.1885885000228882, + "learning_rate": 4.081750978402667e-06, + "loss": 0.3735, + "step": 31690 + }, + { + "epoch": 4.594702322716238, + "grad_norm": 1.1832793951034546, + "learning_rate": 4.067256124075953e-06, + "loss": 0.3657, + "step": 31700 + }, + { + "epoch": 4.596151755625612, + "grad_norm": 1.5288221836090088, + "learning_rate": 4.052761269749239e-06, + "loss": 0.4204, + "step": 31710 + }, + { + "epoch": 4.597601188534986, + "grad_norm": 4.330370903015137, + "learning_rate": 4.038266415422525e-06, + "loss": 0.378, + "step": 31720 + }, + { + "epoch": 4.59905062144436, + "grad_norm": 3.168311595916748, + "learning_rate": 4.023771561095811e-06, + "loss": 0.3494, + "step": 31730 + }, + { + "epoch": 4.600500054353734, + "grad_norm": 1.6006698608398438, + "learning_rate": 4.009276706769097e-06, + "loss": 0.3926, + "step": 31740 + }, + { + "epoch": 4.601949487263108, + "grad_norm": 1.0302852392196655, + "learning_rate": 3.9947818524423825e-06, + "loss": 0.3911, + "step": 31750 + }, + { + "epoch": 4.6033989201724825, + "grad_norm": 1.771335482597351, + "learning_rate": 3.9802869981156694e-06, + "loss": 0.3601, + "step": 31760 + }, + { + "epoch": 4.604848353081857, + "grad_norm": 2.2682509422302246, + "learning_rate": 3.9657921437889555e-06, + "loss": 0.3695, + "step": 31770 + }, + { + "epoch": 4.606297785991231, + "grad_norm": 3.5409722328186035, + "learning_rate": 3.9512972894622415e-06, + "loss": 0.335, + "step": 31780 + }, + { + "epoch": 4.607747218900605, + "grad_norm": 1.0471943616867065, + "learning_rate": 3.936802435135527e-06, + "loss": 0.3629, + "step": 31790 + }, + { + "epoch": 4.609196651809979, + "grad_norm": 3.680050849914551, + "learning_rate": 3.922307580808813e-06, + "loss": 0.4099, + "step": 31800 + }, + { + "epoch": 4.610646084719353, + "grad_norm": 2.532383441925049, + "learning_rate": 3.907812726482099e-06, + "loss": 0.3722, + "step": 31810 + }, + { + "epoch": 4.6120955176287275, + "grad_norm": 1.3008819818496704, + "learning_rate": 3.893317872155385e-06, + "loss": 0.3492, + "step": 31820 + }, + { + "epoch": 4.613544950538102, + "grad_norm": 2.2614519596099854, + "learning_rate": 3.878823017828671e-06, + "loss": 0.3966, + "step": 31830 + }, + { + "epoch": 4.614994383447476, + "grad_norm": 1.2360858917236328, + "learning_rate": 3.864328163501957e-06, + "loss": 0.3602, + "step": 31840 + }, + { + "epoch": 4.616443816356851, + "grad_norm": 1.06869375705719, + "learning_rate": 3.849833309175243e-06, + "loss": 0.3444, + "step": 31850 + }, + { + "epoch": 4.617893249266224, + "grad_norm": 1.8802971839904785, + "learning_rate": 3.835338454848529e-06, + "loss": 0.366, + "step": 31860 + }, + { + "epoch": 4.619342682175599, + "grad_norm": 1.0579355955123901, + "learning_rate": 3.820843600521815e-06, + "loss": 0.3805, + "step": 31870 + }, + { + "epoch": 4.620792115084973, + "grad_norm": 2.116001844406128, + "learning_rate": 3.806348746195101e-06, + "loss": 0.3232, + "step": 31880 + }, + { + "epoch": 4.6222415479943475, + "grad_norm": 1.0102404356002808, + "learning_rate": 3.791853891868387e-06, + "loss": 0.3467, + "step": 31890 + }, + { + "epoch": 4.623690980903722, + "grad_norm": 1.343885064125061, + "learning_rate": 3.777359037541673e-06, + "loss": 0.4108, + "step": 31900 + }, + { + "epoch": 4.625140413813096, + "grad_norm": 1.089756727218628, + "learning_rate": 3.762864183214959e-06, + "loss": 0.3257, + "step": 31910 + }, + { + "epoch": 4.62658984672247, + "grad_norm": 3.171680450439453, + "learning_rate": 3.7483693288882446e-06, + "loss": 0.3808, + "step": 31920 + }, + { + "epoch": 4.628039279631844, + "grad_norm": 1.396655559539795, + "learning_rate": 3.7338744745615306e-06, + "loss": 0.3759, + "step": 31930 + }, + { + "epoch": 4.629488712541218, + "grad_norm": 1.0053515434265137, + "learning_rate": 3.7193796202348167e-06, + "loss": 0.3902, + "step": 31940 + }, + { + "epoch": 4.6309381454505925, + "grad_norm": 2.9222521781921387, + "learning_rate": 3.704884765908103e-06, + "loss": 0.428, + "step": 31950 + }, + { + "epoch": 4.632387578359967, + "grad_norm": 1.056503176689148, + "learning_rate": 3.6903899115813884e-06, + "loss": 0.3976, + "step": 31960 + }, + { + "epoch": 4.633837011269341, + "grad_norm": 1.6677240133285522, + "learning_rate": 3.675895057254675e-06, + "loss": 0.4082, + "step": 31970 + }, + { + "epoch": 4.635286444178715, + "grad_norm": 4.744880676269531, + "learning_rate": 3.661400202927961e-06, + "loss": 0.4138, + "step": 31980 + }, + { + "epoch": 4.636735877088089, + "grad_norm": 1.129930019378662, + "learning_rate": 3.646905348601247e-06, + "loss": 0.4117, + "step": 31990 + }, + { + "epoch": 4.638185309997463, + "grad_norm": 1.0966359376907349, + "learning_rate": 3.6324104942745326e-06, + "loss": 0.3454, + "step": 32000 + }, + { + "epoch": 4.638185309997463, + "eval_loss": 0.7402629256248474, + "eval_runtime": 670.9833, + "eval_samples_per_second": 51.41, + "eval_steps_per_second": 2.571, + "eval_token_accuracy": 0.0003950272884078792, + "step": 32000 + }, + { + "epoch": 4.639634742906837, + "grad_norm": 0.9030367732048035, + "learning_rate": 3.6179156399478186e-06, + "loss": 0.3426, + "step": 32010 + }, + { + "epoch": 4.641084175816212, + "grad_norm": 1.3874012231826782, + "learning_rate": 3.6034207856211047e-06, + "loss": 0.3564, + "step": 32020 + }, + { + "epoch": 4.642533608725586, + "grad_norm": 1.7644416093826294, + "learning_rate": 3.5889259312943907e-06, + "loss": 0.4444, + "step": 32030 + }, + { + "epoch": 4.643983041634961, + "grad_norm": 1.5777500867843628, + "learning_rate": 3.574431076967677e-06, + "loss": 0.4593, + "step": 32040 + }, + { + "epoch": 4.645432474544334, + "grad_norm": 0.8278200030326843, + "learning_rate": 3.5599362226409624e-06, + "loss": 0.3529, + "step": 32050 + }, + { + "epoch": 4.646881907453709, + "grad_norm": 2.230210542678833, + "learning_rate": 3.5454413683142485e-06, + "loss": 0.3893, + "step": 32060 + }, + { + "epoch": 4.648331340363083, + "grad_norm": 1.252874493598938, + "learning_rate": 3.5309465139875345e-06, + "loss": 0.3803, + "step": 32070 + }, + { + "epoch": 4.649780773272457, + "grad_norm": 1.3750890493392944, + "learning_rate": 3.516451659660821e-06, + "loss": 0.3525, + "step": 32080 + }, + { + "epoch": 4.651230206181832, + "grad_norm": 2.124300241470337, + "learning_rate": 3.5019568053341062e-06, + "loss": 0.3254, + "step": 32090 + }, + { + "epoch": 4.652679639091206, + "grad_norm": 2.438168525695801, + "learning_rate": 3.4874619510073923e-06, + "loss": 0.3373, + "step": 32100 + }, + { + "epoch": 4.65412907200058, + "grad_norm": 2.202852487564087, + "learning_rate": 3.4729670966806788e-06, + "loss": 0.3415, + "step": 32110 + }, + { + "epoch": 4.655578504909954, + "grad_norm": 2.5048999786376953, + "learning_rate": 3.458472242353965e-06, + "loss": 0.3981, + "step": 32120 + }, + { + "epoch": 4.657027937819328, + "grad_norm": 1.2640185356140137, + "learning_rate": 3.4439773880272504e-06, + "loss": 0.4032, + "step": 32130 + }, + { + "epoch": 4.658477370728702, + "grad_norm": 2.459134578704834, + "learning_rate": 3.4294825337005365e-06, + "loss": 0.3492, + "step": 32140 + }, + { + "epoch": 4.6599268036380765, + "grad_norm": 1.477692723274231, + "learning_rate": 3.4149876793738225e-06, + "loss": 0.3743, + "step": 32150 + }, + { + "epoch": 4.661376236547451, + "grad_norm": 1.0264941453933716, + "learning_rate": 3.4004928250471086e-06, + "loss": 0.3192, + "step": 32160 + }, + { + "epoch": 4.662825669456825, + "grad_norm": 2.095033645629883, + "learning_rate": 3.3859979707203942e-06, + "loss": 0.4001, + "step": 32170 + }, + { + "epoch": 4.664275102366199, + "grad_norm": 2.331382989883423, + "learning_rate": 3.3715031163936803e-06, + "loss": 0.3358, + "step": 32180 + }, + { + "epoch": 4.665724535275573, + "grad_norm": 1.119020938873291, + "learning_rate": 3.3570082620669663e-06, + "loss": 0.3441, + "step": 32190 + }, + { + "epoch": 4.667173968184947, + "grad_norm": 2.7784945964813232, + "learning_rate": 3.3425134077402524e-06, + "loss": 0.3859, + "step": 32200 + }, + { + "epoch": 4.6686234010943215, + "grad_norm": 2.008835554122925, + "learning_rate": 3.328018553413538e-06, + "loss": 0.3245, + "step": 32210 + }, + { + "epoch": 4.670072834003696, + "grad_norm": 1.0234019756317139, + "learning_rate": 3.313523699086824e-06, + "loss": 0.3272, + "step": 32220 + }, + { + "epoch": 4.671522266913071, + "grad_norm": 1.2887885570526123, + "learning_rate": 3.29902884476011e-06, + "loss": 0.3998, + "step": 32230 + }, + { + "epoch": 4.672971699822444, + "grad_norm": 1.0463289022445679, + "learning_rate": 3.2845339904333966e-06, + "loss": 0.4122, + "step": 32240 + }, + { + "epoch": 4.674421132731819, + "grad_norm": 1.143078327178955, + "learning_rate": 3.2700391361066826e-06, + "loss": 0.4343, + "step": 32250 + }, + { + "epoch": 4.675870565641193, + "grad_norm": 1.4409193992614746, + "learning_rate": 3.255544281779968e-06, + "loss": 0.3785, + "step": 32260 + }, + { + "epoch": 4.677319998550567, + "grad_norm": 0.8818618059158325, + "learning_rate": 3.2410494274532543e-06, + "loss": 0.3965, + "step": 32270 + }, + { + "epoch": 4.6787694314599415, + "grad_norm": 1.2765010595321655, + "learning_rate": 3.2265545731265404e-06, + "loss": 0.4304, + "step": 32280 + }, + { + "epoch": 4.680218864369316, + "grad_norm": 1.0178990364074707, + "learning_rate": 3.2120597187998264e-06, + "loss": 0.3661, + "step": 32290 + }, + { + "epoch": 4.68166829727869, + "grad_norm": 1.806789755821228, + "learning_rate": 3.197564864473112e-06, + "loss": 0.3513, + "step": 32300 + }, + { + "epoch": 4.683117730188064, + "grad_norm": 2.861187219619751, + "learning_rate": 3.183070010146398e-06, + "loss": 0.38, + "step": 32310 + }, + { + "epoch": 4.684567163097438, + "grad_norm": 1.3312263488769531, + "learning_rate": 3.168575155819684e-06, + "loss": 0.4133, + "step": 32320 + }, + { + "epoch": 4.686016596006812, + "grad_norm": 2.996910572052002, + "learning_rate": 3.1540803014929702e-06, + "loss": 0.3746, + "step": 32330 + }, + { + "epoch": 4.6874660289161865, + "grad_norm": 1.8551859855651855, + "learning_rate": 3.139585447166256e-06, + "loss": 0.3787, + "step": 32340 + }, + { + "epoch": 4.688915461825561, + "grad_norm": 2.761601448059082, + "learning_rate": 3.125090592839542e-06, + "loss": 0.3392, + "step": 32350 + }, + { + "epoch": 4.690364894734935, + "grad_norm": 1.2694810628890991, + "learning_rate": 3.110595738512828e-06, + "loss": 0.3822, + "step": 32360 + }, + { + "epoch": 4.691814327644309, + "grad_norm": 3.686568260192871, + "learning_rate": 3.096100884186114e-06, + "loss": 0.4155, + "step": 32370 + }, + { + "epoch": 4.693263760553683, + "grad_norm": 3.2763731479644775, + "learning_rate": 3.0816060298594e-06, + "loss": 0.4185, + "step": 32380 + }, + { + "epoch": 4.694713193463057, + "grad_norm": 1.6245061159133911, + "learning_rate": 3.0671111755326857e-06, + "loss": 0.386, + "step": 32390 + }, + { + "epoch": 4.696162626372431, + "grad_norm": 1.0963075160980225, + "learning_rate": 3.052616321205972e-06, + "loss": 0.4002, + "step": 32400 + }, + { + "epoch": 4.697612059281806, + "grad_norm": 1.1938602924346924, + "learning_rate": 3.038121466879258e-06, + "loss": 0.3925, + "step": 32410 + }, + { + "epoch": 4.699061492191181, + "grad_norm": 2.0994720458984375, + "learning_rate": 3.023626612552544e-06, + "loss": 0.3838, + "step": 32420 + }, + { + "epoch": 4.700510925100555, + "grad_norm": 0.9986076354980469, + "learning_rate": 3.0091317582258303e-06, + "loss": 0.3786, + "step": 32430 + }, + { + "epoch": 4.701960358009929, + "grad_norm": 3.9333064556121826, + "learning_rate": 2.994636903899116e-06, + "loss": 0.4101, + "step": 32440 + }, + { + "epoch": 4.703409790919303, + "grad_norm": 2.4673099517822266, + "learning_rate": 2.980142049572402e-06, + "loss": 0.3625, + "step": 32450 + }, + { + "epoch": 4.704859223828677, + "grad_norm": 2.997154712677002, + "learning_rate": 2.965647195245688e-06, + "loss": 0.3262, + "step": 32460 + }, + { + "epoch": 4.706308656738051, + "grad_norm": 2.245668411254883, + "learning_rate": 2.951152340918974e-06, + "loss": 0.3691, + "step": 32470 + }, + { + "epoch": 4.707758089647426, + "grad_norm": 1.5757455825805664, + "learning_rate": 2.9366574865922598e-06, + "loss": 0.4091, + "step": 32480 + }, + { + "epoch": 4.7092075225568, + "grad_norm": 3.344550609588623, + "learning_rate": 2.922162632265546e-06, + "loss": 0.4019, + "step": 32490 + }, + { + "epoch": 4.710656955466174, + "grad_norm": 1.346003532409668, + "learning_rate": 2.907667777938832e-06, + "loss": 0.3991, + "step": 32500 + }, + { + "epoch": 4.712106388375548, + "grad_norm": 1.0615825653076172, + "learning_rate": 2.893172923612118e-06, + "loss": 0.3754, + "step": 32510 + }, + { + "epoch": 4.713555821284922, + "grad_norm": 1.41044282913208, + "learning_rate": 2.8786780692854035e-06, + "loss": 0.3001, + "step": 32520 + }, + { + "epoch": 4.715005254194296, + "grad_norm": 0.9503432512283325, + "learning_rate": 2.86418321495869e-06, + "loss": 0.381, + "step": 32530 + }, + { + "epoch": 4.7164546871036706, + "grad_norm": 2.1342263221740723, + "learning_rate": 2.8496883606319757e-06, + "loss": 0.3472, + "step": 32540 + }, + { + "epoch": 4.717904120013045, + "grad_norm": 1.4033797979354858, + "learning_rate": 2.8351935063052617e-06, + "loss": 0.387, + "step": 32550 + }, + { + "epoch": 4.719353552922419, + "grad_norm": 1.4123448133468628, + "learning_rate": 2.8206986519785478e-06, + "loss": 0.3624, + "step": 32560 + }, + { + "epoch": 4.720802985831793, + "grad_norm": 1.1497482061386108, + "learning_rate": 2.806203797651834e-06, + "loss": 0.3973, + "step": 32570 + }, + { + "epoch": 4.722252418741167, + "grad_norm": 1.3808242082595825, + "learning_rate": 2.7917089433251194e-06, + "loss": 0.3993, + "step": 32580 + }, + { + "epoch": 4.723701851650541, + "grad_norm": 1.8661115169525146, + "learning_rate": 2.777214088998406e-06, + "loss": 0.3679, + "step": 32590 + }, + { + "epoch": 4.7251512845599155, + "grad_norm": 3.0913310050964355, + "learning_rate": 2.7627192346716915e-06, + "loss": 0.2996, + "step": 32600 + }, + { + "epoch": 4.726600717469291, + "grad_norm": 2.8135464191436768, + "learning_rate": 2.7482243803449776e-06, + "loss": 0.3693, + "step": 32610 + }, + { + "epoch": 4.728050150378665, + "grad_norm": 2.949455738067627, + "learning_rate": 2.7337295260182637e-06, + "loss": 0.3799, + "step": 32620 + }, + { + "epoch": 4.729499583288039, + "grad_norm": 1.1281206607818604, + "learning_rate": 2.7192346716915497e-06, + "loss": 0.4465, + "step": 32630 + }, + { + "epoch": 4.730949016197413, + "grad_norm": 1.0962039232254028, + "learning_rate": 2.7047398173648358e-06, + "loss": 0.3521, + "step": 32640 + }, + { + "epoch": 4.732398449106787, + "grad_norm": 1.1542834043502808, + "learning_rate": 2.6902449630381214e-06, + "loss": 0.3267, + "step": 32650 + }, + { + "epoch": 4.733847882016161, + "grad_norm": 2.2527689933776855, + "learning_rate": 2.675750108711408e-06, + "loss": 0.3637, + "step": 32660 + }, + { + "epoch": 4.7352973149255355, + "grad_norm": 3.0720131397247314, + "learning_rate": 2.6612552543846935e-06, + "loss": 0.4143, + "step": 32670 + }, + { + "epoch": 4.73674674783491, + "grad_norm": 3.2450637817382812, + "learning_rate": 2.6467604000579796e-06, + "loss": 0.3778, + "step": 32680 + }, + { + "epoch": 4.738196180744284, + "grad_norm": 0.8772507309913635, + "learning_rate": 2.6322655457312656e-06, + "loss": 0.3518, + "step": 32690 + }, + { + "epoch": 4.739645613653658, + "grad_norm": 2.1515049934387207, + "learning_rate": 2.6177706914045517e-06, + "loss": 0.3454, + "step": 32700 + }, + { + "epoch": 4.741095046563032, + "grad_norm": 0.9839628338813782, + "learning_rate": 2.6032758370778373e-06, + "loss": 0.3805, + "step": 32710 + }, + { + "epoch": 4.742544479472406, + "grad_norm": 1.3019304275512695, + "learning_rate": 2.5887809827511238e-06, + "loss": 0.3661, + "step": 32720 + }, + { + "epoch": 4.7439939123817805, + "grad_norm": 1.898655891418457, + "learning_rate": 2.5742861284244094e-06, + "loss": 0.3405, + "step": 32730 + }, + { + "epoch": 4.745443345291155, + "grad_norm": 1.5310337543487549, + "learning_rate": 2.5597912740976954e-06, + "loss": 0.3885, + "step": 32740 + }, + { + "epoch": 4.746892778200529, + "grad_norm": 2.6126482486724854, + "learning_rate": 2.5452964197709815e-06, + "loss": 0.4087, + "step": 32750 + }, + { + "epoch": 4.748342211109903, + "grad_norm": 2.2393836975097656, + "learning_rate": 2.5308015654442676e-06, + "loss": 0.3558, + "step": 32760 + }, + { + "epoch": 4.749791644019277, + "grad_norm": 1.1990814208984375, + "learning_rate": 2.516306711117553e-06, + "loss": 0.3635, + "step": 32770 + }, + { + "epoch": 4.751241076928652, + "grad_norm": 0.9254428148269653, + "learning_rate": 2.5018118567908392e-06, + "loss": 0.4035, + "step": 32780 + }, + { + "epoch": 4.752690509838025, + "grad_norm": 3.164818525314331, + "learning_rate": 2.4873170024641253e-06, + "loss": 0.3199, + "step": 32790 + }, + { + "epoch": 4.7541399427474005, + "grad_norm": 0.9472362995147705, + "learning_rate": 2.4728221481374113e-06, + "loss": 0.3885, + "step": 32800 + }, + { + "epoch": 4.755589375656775, + "grad_norm": 1.333412528038025, + "learning_rate": 2.458327293810697e-06, + "loss": 0.4131, + "step": 32810 + }, + { + "epoch": 4.757038808566149, + "grad_norm": 0.9033955335617065, + "learning_rate": 2.4438324394839834e-06, + "loss": 0.3789, + "step": 32820 + }, + { + "epoch": 4.758488241475523, + "grad_norm": 2.3149867057800293, + "learning_rate": 2.4293375851572695e-06, + "loss": 0.3417, + "step": 32830 + }, + { + "epoch": 4.759937674384897, + "grad_norm": 5.298091888427734, + "learning_rate": 2.414842730830555e-06, + "loss": 0.3469, + "step": 32840 + }, + { + "epoch": 4.761387107294271, + "grad_norm": 1.0421656370162964, + "learning_rate": 2.400347876503841e-06, + "loss": 0.3931, + "step": 32850 + }, + { + "epoch": 4.762836540203645, + "grad_norm": 1.1229223012924194, + "learning_rate": 2.3858530221771272e-06, + "loss": 0.3572, + "step": 32860 + }, + { + "epoch": 4.76428597311302, + "grad_norm": 1.4950625896453857, + "learning_rate": 2.3713581678504133e-06, + "loss": 0.434, + "step": 32870 + }, + { + "epoch": 4.765735406022394, + "grad_norm": 0.9826526641845703, + "learning_rate": 2.3568633135236993e-06, + "loss": 0.3881, + "step": 32880 + }, + { + "epoch": 4.767184838931768, + "grad_norm": 1.790209174156189, + "learning_rate": 2.3423684591969854e-06, + "loss": 0.3745, + "step": 32890 + }, + { + "epoch": 4.768634271841142, + "grad_norm": 3.7698655128479004, + "learning_rate": 2.327873604870271e-06, + "loss": 0.4164, + "step": 32900 + }, + { + "epoch": 4.770083704750516, + "grad_norm": 1.2466737031936646, + "learning_rate": 2.313378750543557e-06, + "loss": 0.3579, + "step": 32910 + }, + { + "epoch": 4.77153313765989, + "grad_norm": 1.1752427816390991, + "learning_rate": 2.298883896216843e-06, + "loss": 0.3975, + "step": 32920 + }, + { + "epoch": 4.772982570569265, + "grad_norm": 4.076375961303711, + "learning_rate": 2.284389041890129e-06, + "loss": 0.4107, + "step": 32930 + }, + { + "epoch": 4.774432003478639, + "grad_norm": 2.440988540649414, + "learning_rate": 2.269894187563415e-06, + "loss": 0.3744, + "step": 32940 + }, + { + "epoch": 4.775881436388013, + "grad_norm": 2.235698699951172, + "learning_rate": 2.2553993332367013e-06, + "loss": 0.3726, + "step": 32950 + }, + { + "epoch": 4.777330869297387, + "grad_norm": 2.558980941772461, + "learning_rate": 2.240904478909987e-06, + "loss": 0.4063, + "step": 32960 + }, + { + "epoch": 4.778780302206762, + "grad_norm": 0.951149582862854, + "learning_rate": 2.226409624583273e-06, + "loss": 0.3733, + "step": 32970 + }, + { + "epoch": 4.780229735116135, + "grad_norm": 1.0211766958236694, + "learning_rate": 2.211914770256559e-06, + "loss": 0.3332, + "step": 32980 + }, + { + "epoch": 4.78167916802551, + "grad_norm": 2.6187455654144287, + "learning_rate": 2.197419915929845e-06, + "loss": 0.367, + "step": 32990 + }, + { + "epoch": 4.783128600934885, + "grad_norm": 2.046088933944702, + "learning_rate": 2.1829250616031307e-06, + "loss": 0.3534, + "step": 33000 + }, + { + "epoch": 4.783128600934885, + "eval_loss": 0.7387080788612366, + "eval_runtime": 671.8242, + "eval_samples_per_second": 51.345, + "eval_steps_per_second": 2.568, + "eval_token_accuracy": 0.00039422307975884034, + "step": 33000 + }, + { + "epoch": 4.784578033844259, + "grad_norm": 1.1268091201782227, + "learning_rate": 2.1684302072764168e-06, + "loss": 0.3607, + "step": 33010 + }, + { + "epoch": 4.786027466753633, + "grad_norm": 0.9580293297767639, + "learning_rate": 2.1539353529497032e-06, + "loss": 0.3174, + "step": 33020 + }, + { + "epoch": 4.787476899663007, + "grad_norm": 0.8425546288490295, + "learning_rate": 2.139440498622989e-06, + "loss": 0.3413, + "step": 33030 + }, + { + "epoch": 4.788926332572381, + "grad_norm": 2.578343391418457, + "learning_rate": 2.124945644296275e-06, + "loss": 0.3708, + "step": 33040 + }, + { + "epoch": 4.790375765481755, + "grad_norm": 1.0360602140426636, + "learning_rate": 2.110450789969561e-06, + "loss": 0.4026, + "step": 33050 + }, + { + "epoch": 4.7918251983911295, + "grad_norm": 1.632294774055481, + "learning_rate": 2.095955935642847e-06, + "loss": 0.3823, + "step": 33060 + }, + { + "epoch": 4.793274631300504, + "grad_norm": 1.0471699237823486, + "learning_rate": 2.0814610813161327e-06, + "loss": 0.4129, + "step": 33070 + }, + { + "epoch": 4.794724064209878, + "grad_norm": 1.4320305585861206, + "learning_rate": 2.066966226989419e-06, + "loss": 0.3969, + "step": 33080 + }, + { + "epoch": 4.796173497119252, + "grad_norm": 0.9930534362792969, + "learning_rate": 2.0524713726627048e-06, + "loss": 0.349, + "step": 33090 + }, + { + "epoch": 4.797622930028626, + "grad_norm": 1.012718915939331, + "learning_rate": 2.037976518335991e-06, + "loss": 0.3413, + "step": 33100 + }, + { + "epoch": 4.799072362938, + "grad_norm": 1.2036629915237427, + "learning_rate": 2.023481664009277e-06, + "loss": 0.3922, + "step": 33110 + }, + { + "epoch": 4.8005217958473745, + "grad_norm": 2.238356351852417, + "learning_rate": 2.008986809682563e-06, + "loss": 0.3291, + "step": 33120 + }, + { + "epoch": 4.801971228756749, + "grad_norm": 1.101607084274292, + "learning_rate": 1.9944919553558486e-06, + "loss": 0.3901, + "step": 33130 + }, + { + "epoch": 4.803420661666123, + "grad_norm": 2.1101644039154053, + "learning_rate": 1.9799971010291346e-06, + "loss": 0.3392, + "step": 33140 + }, + { + "epoch": 4.804870094575497, + "grad_norm": 2.539505958557129, + "learning_rate": 1.9655022467024207e-06, + "loss": 0.3621, + "step": 33150 + }, + { + "epoch": 4.806319527484872, + "grad_norm": 4.139045715332031, + "learning_rate": 1.9510073923757067e-06, + "loss": 0.4016, + "step": 33160 + }, + { + "epoch": 4.807768960394245, + "grad_norm": 0.9701805114746094, + "learning_rate": 1.9365125380489928e-06, + "loss": 0.3427, + "step": 33170 + }, + { + "epoch": 4.80921839330362, + "grad_norm": 0.8436054587364197, + "learning_rate": 1.922017683722279e-06, + "loss": 0.3647, + "step": 33180 + }, + { + "epoch": 4.8106678262129945, + "grad_norm": 1.258991003036499, + "learning_rate": 1.9075228293955645e-06, + "loss": 0.3411, + "step": 33190 + }, + { + "epoch": 4.812117259122369, + "grad_norm": 3.107102394104004, + "learning_rate": 1.8930279750688507e-06, + "loss": 0.4033, + "step": 33200 + }, + { + "epoch": 4.813566692031743, + "grad_norm": 0.9850480556488037, + "learning_rate": 1.8785331207421368e-06, + "loss": 0.3227, + "step": 33210 + }, + { + "epoch": 4.815016124941117, + "grad_norm": 1.4181708097457886, + "learning_rate": 1.8640382664154226e-06, + "loss": 0.3293, + "step": 33220 + }, + { + "epoch": 4.816465557850491, + "grad_norm": 2.7566471099853516, + "learning_rate": 1.8495434120887087e-06, + "loss": 0.4135, + "step": 33230 + }, + { + "epoch": 4.817914990759865, + "grad_norm": 2.4442412853240967, + "learning_rate": 1.8350485577619945e-06, + "loss": 0.383, + "step": 33240 + }, + { + "epoch": 4.8193644236692395, + "grad_norm": 2.687974691390991, + "learning_rate": 1.8205537034352808e-06, + "loss": 0.3687, + "step": 33250 + }, + { + "epoch": 4.820813856578614, + "grad_norm": 1.3777414560317993, + "learning_rate": 1.8060588491085664e-06, + "loss": 0.3538, + "step": 33260 + }, + { + "epoch": 4.822263289487988, + "grad_norm": 1.8097695112228394, + "learning_rate": 1.7915639947818527e-06, + "loss": 0.2637, + "step": 33270 + }, + { + "epoch": 4.823712722397362, + "grad_norm": 2.2218167781829834, + "learning_rate": 1.7770691404551385e-06, + "loss": 0.4544, + "step": 33280 + }, + { + "epoch": 4.825162155306736, + "grad_norm": 1.296929955482483, + "learning_rate": 1.7625742861284246e-06, + "loss": 0.3599, + "step": 33290 + }, + { + "epoch": 4.82661158821611, + "grad_norm": 2.5712153911590576, + "learning_rate": 1.7480794318017104e-06, + "loss": 0.4087, + "step": 33300 + }, + { + "epoch": 4.828061021125484, + "grad_norm": 1.5026534795761108, + "learning_rate": 1.7335845774749965e-06, + "loss": 0.3474, + "step": 33310 + }, + { + "epoch": 4.829510454034859, + "grad_norm": 1.357690691947937, + "learning_rate": 1.7190897231482823e-06, + "loss": 0.3939, + "step": 33320 + }, + { + "epoch": 4.830959886944233, + "grad_norm": 1.5788393020629883, + "learning_rate": 1.7045948688215686e-06, + "loss": 0.3603, + "step": 33330 + }, + { + "epoch": 4.832409319853607, + "grad_norm": 0.8207820653915405, + "learning_rate": 1.6901000144948542e-06, + "loss": 0.4107, + "step": 33340 + }, + { + "epoch": 4.833858752762982, + "grad_norm": 1.154213309288025, + "learning_rate": 1.6756051601681405e-06, + "loss": 0.3741, + "step": 33350 + }, + { + "epoch": 4.835308185672355, + "grad_norm": 1.2206467390060425, + "learning_rate": 1.6611103058414263e-06, + "loss": 0.3595, + "step": 33360 + }, + { + "epoch": 4.83675761858173, + "grad_norm": 1.109948992729187, + "learning_rate": 1.6466154515147124e-06, + "loss": 0.4087, + "step": 33370 + }, + { + "epoch": 4.838207051491104, + "grad_norm": 3.8707070350646973, + "learning_rate": 1.6321205971879982e-06, + "loss": 0.3811, + "step": 33380 + }, + { + "epoch": 4.839656484400479, + "grad_norm": 2.4858224391937256, + "learning_rate": 1.6176257428612842e-06, + "loss": 0.3446, + "step": 33390 + }, + { + "epoch": 4.841105917309853, + "grad_norm": 1.0518187284469604, + "learning_rate": 1.60313088853457e-06, + "loss": 0.384, + "step": 33400 + }, + { + "epoch": 4.842555350219227, + "grad_norm": 1.2034097909927368, + "learning_rate": 1.5886360342078564e-06, + "loss": 0.3816, + "step": 33410 + }, + { + "epoch": 4.844004783128601, + "grad_norm": 2.01949405670166, + "learning_rate": 1.5741411798811424e-06, + "loss": 0.4041, + "step": 33420 + }, + { + "epoch": 4.845454216037975, + "grad_norm": 2.740180492401123, + "learning_rate": 1.5596463255544283e-06, + "loss": 0.3867, + "step": 33430 + }, + { + "epoch": 4.846903648947349, + "grad_norm": 1.3823065757751465, + "learning_rate": 1.5451514712277143e-06, + "loss": 0.3484, + "step": 33440 + }, + { + "epoch": 4.8483530818567235, + "grad_norm": 1.4719023704528809, + "learning_rate": 1.5306566169010001e-06, + "loss": 0.355, + "step": 33450 + }, + { + "epoch": 4.849802514766098, + "grad_norm": 1.2143726348876953, + "learning_rate": 1.5161617625742862e-06, + "loss": 0.3884, + "step": 33460 + }, + { + "epoch": 4.851251947675472, + "grad_norm": 3.5473649501800537, + "learning_rate": 1.501666908247572e-06, + "loss": 0.38, + "step": 33470 + }, + { + "epoch": 4.852701380584846, + "grad_norm": 1.1043490171432495, + "learning_rate": 1.487172053920858e-06, + "loss": 0.3983, + "step": 33480 + }, + { + "epoch": 4.85415081349422, + "grad_norm": 1.6615132093429565, + "learning_rate": 1.4726771995941441e-06, + "loss": 0.3192, + "step": 33490 + }, + { + "epoch": 4.855600246403594, + "grad_norm": 3.8564488887786865, + "learning_rate": 1.45818234526743e-06, + "loss": 0.403, + "step": 33500 + }, + { + "epoch": 4.8570496793129685, + "grad_norm": 1.4073970317840576, + "learning_rate": 1.4436874909407163e-06, + "loss": 0.3609, + "step": 33510 + }, + { + "epoch": 4.858499112222343, + "grad_norm": 2.5036427974700928, + "learning_rate": 1.429192636614002e-06, + "loss": 0.3634, + "step": 33520 + }, + { + "epoch": 4.859948545131717, + "grad_norm": 2.0004215240478516, + "learning_rate": 1.4146977822872881e-06, + "loss": 0.33, + "step": 33530 + }, + { + "epoch": 4.861397978041092, + "grad_norm": 3.5735065937042236, + "learning_rate": 1.4002029279605742e-06, + "loss": 0.3731, + "step": 33540 + }, + { + "epoch": 4.862847410950465, + "grad_norm": 1.5266069173812866, + "learning_rate": 1.38570807363386e-06, + "loss": 0.3769, + "step": 33550 + }, + { + "epoch": 4.86429684385984, + "grad_norm": 2.06255841255188, + "learning_rate": 1.371213219307146e-06, + "loss": 0.3752, + "step": 33560 + }, + { + "epoch": 4.865746276769214, + "grad_norm": 3.314347267150879, + "learning_rate": 1.356718364980432e-06, + "loss": 0.3887, + "step": 33570 + }, + { + "epoch": 4.8671957096785885, + "grad_norm": 1.280548095703125, + "learning_rate": 1.342223510653718e-06, + "loss": 0.3756, + "step": 33580 + }, + { + "epoch": 4.868645142587963, + "grad_norm": 2.8850739002227783, + "learning_rate": 1.327728656327004e-06, + "loss": 0.334, + "step": 33590 + }, + { + "epoch": 4.870094575497337, + "grad_norm": 1.094756841659546, + "learning_rate": 1.3132338020002899e-06, + "loss": 0.3001, + "step": 33600 + }, + { + "epoch": 4.871544008406711, + "grad_norm": 5.066037654876709, + "learning_rate": 1.298738947673576e-06, + "loss": 0.3696, + "step": 33610 + }, + { + "epoch": 4.872993441316085, + "grad_norm": 3.2792046070098877, + "learning_rate": 1.284244093346862e-06, + "loss": 0.3982, + "step": 33620 + }, + { + "epoch": 4.874442874225459, + "grad_norm": 0.984911322593689, + "learning_rate": 1.2697492390201478e-06, + "loss": 0.364, + "step": 33630 + }, + { + "epoch": 4.8758923071348335, + "grad_norm": 1.0168277025222778, + "learning_rate": 1.2552543846934339e-06, + "loss": 0.4063, + "step": 33640 + }, + { + "epoch": 4.877341740044208, + "grad_norm": 1.043861746788025, + "learning_rate": 1.2407595303667197e-06, + "loss": 0.3545, + "step": 33650 + }, + { + "epoch": 4.878791172953582, + "grad_norm": 1.2901207208633423, + "learning_rate": 1.2262646760400058e-06, + "loss": 0.3993, + "step": 33660 + }, + { + "epoch": 4.880240605862956, + "grad_norm": 2.493473768234253, + "learning_rate": 1.2117698217132918e-06, + "loss": 0.3616, + "step": 33670 + }, + { + "epoch": 4.88169003877233, + "grad_norm": 1.1101723909378052, + "learning_rate": 1.1972749673865777e-06, + "loss": 0.3554, + "step": 33680 + }, + { + "epoch": 4.883139471681704, + "grad_norm": 1.0349514484405518, + "learning_rate": 1.1827801130598637e-06, + "loss": 0.3509, + "step": 33690 + }, + { + "epoch": 4.884588904591078, + "grad_norm": 1.1507887840270996, + "learning_rate": 1.1682852587331498e-06, + "loss": 0.364, + "step": 33700 + }, + { + "epoch": 4.886038337500453, + "grad_norm": 3.162672281265259, + "learning_rate": 1.1537904044064358e-06, + "loss": 0.3366, + "step": 33710 + }, + { + "epoch": 4.887487770409827, + "grad_norm": 1.3397330045700073, + "learning_rate": 1.1392955500797219e-06, + "loss": 0.4009, + "step": 33720 + }, + { + "epoch": 4.888937203319202, + "grad_norm": 1.1104768514633179, + "learning_rate": 1.1248006957530077e-06, + "loss": 0.383, + "step": 33730 + }, + { + "epoch": 4.890386636228576, + "grad_norm": 2.326978921890259, + "learning_rate": 1.1103058414262938e-06, + "loss": 0.3212, + "step": 33740 + }, + { + "epoch": 4.89183606913795, + "grad_norm": 1.2988479137420654, + "learning_rate": 1.0958109870995798e-06, + "loss": 0.3584, + "step": 33750 + }, + { + "epoch": 4.893285502047324, + "grad_norm": 2.5861144065856934, + "learning_rate": 1.0813161327728657e-06, + "loss": 0.3753, + "step": 33760 + }, + { + "epoch": 4.894734934956698, + "grad_norm": 1.090281367301941, + "learning_rate": 1.0668212784461517e-06, + "loss": 0.3217, + "step": 33770 + }, + { + "epoch": 4.896184367866073, + "grad_norm": 3.0620217323303223, + "learning_rate": 1.0523264241194376e-06, + "loss": 0.3265, + "step": 33780 + }, + { + "epoch": 4.897633800775447, + "grad_norm": 1.767822265625, + "learning_rate": 1.0378315697927236e-06, + "loss": 0.3413, + "step": 33790 + }, + { + "epoch": 4.899083233684821, + "grad_norm": 0.9307612776756287, + "learning_rate": 1.0233367154660097e-06, + "loss": 0.3716, + "step": 33800 + }, + { + "epoch": 4.900532666594195, + "grad_norm": 2.0926198959350586, + "learning_rate": 1.0088418611392955e-06, + "loss": 0.2855, + "step": 33810 + }, + { + "epoch": 4.901982099503569, + "grad_norm": 2.8857176303863525, + "learning_rate": 9.943470068125816e-07, + "loss": 0.3717, + "step": 33820 + }, + { + "epoch": 4.903431532412943, + "grad_norm": 1.6708345413208008, + "learning_rate": 9.798521524858676e-07, + "loss": 0.3262, + "step": 33830 + }, + { + "epoch": 4.9048809653223175, + "grad_norm": 1.5556457042694092, + "learning_rate": 9.653572981591535e-07, + "loss": 0.3121, + "step": 33840 + }, + { + "epoch": 4.906330398231692, + "grad_norm": 3.1761248111724854, + "learning_rate": 9.508624438324395e-07, + "loss": 0.3898, + "step": 33850 + }, + { + "epoch": 4.907779831141066, + "grad_norm": 1.3463460206985474, + "learning_rate": 9.363675895057255e-07, + "loss": 0.3161, + "step": 33860 + }, + { + "epoch": 4.90922926405044, + "grad_norm": 2.4650721549987793, + "learning_rate": 9.218727351790114e-07, + "loss": 0.3677, + "step": 33870 + }, + { + "epoch": 4.910678696959814, + "grad_norm": 1.9204206466674805, + "learning_rate": 9.073778808522974e-07, + "loss": 0.3759, + "step": 33880 + }, + { + "epoch": 4.912128129869188, + "grad_norm": 4.739460468292236, + "learning_rate": 8.928830265255834e-07, + "loss": 0.37, + "step": 33890 + }, + { + "epoch": 4.9135775627785625, + "grad_norm": 1.053930401802063, + "learning_rate": 8.783881721988696e-07, + "loss": 0.3451, + "step": 33900 + }, + { + "epoch": 4.915026995687937, + "grad_norm": 0.9955337643623352, + "learning_rate": 8.638933178721555e-07, + "loss": 0.3697, + "step": 33910 + }, + { + "epoch": 4.916476428597312, + "grad_norm": 2.0659215450286865, + "learning_rate": 8.493984635454415e-07, + "loss": 0.3888, + "step": 33920 + }, + { + "epoch": 4.917925861506686, + "grad_norm": 0.9193480610847473, + "learning_rate": 8.349036092187274e-07, + "loss": 0.345, + "step": 33930 + }, + { + "epoch": 4.91937529441606, + "grad_norm": 0.9872389435768127, + "learning_rate": 8.204087548920135e-07, + "loss": 0.3537, + "step": 33940 + }, + { + "epoch": 4.920824727325434, + "grad_norm": 1.2617017030715942, + "learning_rate": 8.059139005652994e-07, + "loss": 0.3635, + "step": 33950 + }, + { + "epoch": 4.922274160234808, + "grad_norm": 1.9014856815338135, + "learning_rate": 7.914190462385854e-07, + "loss": 0.306, + "step": 33960 + }, + { + "epoch": 4.9237235931441825, + "grad_norm": 2.8198678493499756, + "learning_rate": 7.769241919118713e-07, + "loss": 0.2914, + "step": 33970 + }, + { + "epoch": 4.925173026053557, + "grad_norm": 1.2952039241790771, + "learning_rate": 7.624293375851574e-07, + "loss": 0.3804, + "step": 33980 + }, + { + "epoch": 4.926622458962931, + "grad_norm": 1.6215705871582031, + "learning_rate": 7.479344832584433e-07, + "loss": 0.3993, + "step": 33990 + }, + { + "epoch": 4.928071891872305, + "grad_norm": 2.0749127864837646, + "learning_rate": 7.334396289317293e-07, + "loss": 0.37, + "step": 34000 + }, + { + "epoch": 4.928071891872305, + "eval_loss": 0.7409716248512268, + "eval_runtime": 672.128, + "eval_samples_per_second": 51.322, + "eval_steps_per_second": 2.566, + "eval_token_accuracy": 0.0003939013962992248, + "step": 34000 + }, + { + "epoch": 4.929521324781679, + "grad_norm": 1.1606099605560303, + "learning_rate": 7.189447746050152e-07, + "loss": 0.3569, + "step": 34010 + }, + { + "epoch": 4.930970757691053, + "grad_norm": 1.0649526119232178, + "learning_rate": 7.044499202783013e-07, + "loss": 0.3471, + "step": 34020 + }, + { + "epoch": 4.9324201906004275, + "grad_norm": 2.766159772872925, + "learning_rate": 6.899550659515872e-07, + "loss": 0.3769, + "step": 34030 + }, + { + "epoch": 4.933869623509802, + "grad_norm": 2.2011117935180664, + "learning_rate": 6.754602116248732e-07, + "loss": 0.3361, + "step": 34040 + }, + { + "epoch": 4.935319056419176, + "grad_norm": 1.4634935855865479, + "learning_rate": 6.609653572981592e-07, + "loss": 0.3589, + "step": 34050 + }, + { + "epoch": 4.93676848932855, + "grad_norm": 2.5267245769500732, + "learning_rate": 6.464705029714452e-07, + "loss": 0.3581, + "step": 34060 + }, + { + "epoch": 4.938217922237924, + "grad_norm": 3.63118314743042, + "learning_rate": 6.319756486447312e-07, + "loss": 0.3609, + "step": 34070 + }, + { + "epoch": 4.939667355147298, + "grad_norm": 1.3346004486083984, + "learning_rate": 6.174807943180172e-07, + "loss": 0.4164, + "step": 34080 + }, + { + "epoch": 4.941116788056673, + "grad_norm": 1.1010299921035767, + "learning_rate": 6.029859399913031e-07, + "loss": 0.3551, + "step": 34090 + }, + { + "epoch": 4.942566220966047, + "grad_norm": 1.9690850973129272, + "learning_rate": 5.884910856645891e-07, + "loss": 0.3622, + "step": 34100 + }, + { + "epoch": 4.944015653875422, + "grad_norm": 2.8707642555236816, + "learning_rate": 5.739962313378751e-07, + "loss": 0.3133, + "step": 34110 + }, + { + "epoch": 4.945465086784796, + "grad_norm": 2.438140869140625, + "learning_rate": 5.595013770111611e-07, + "loss": 0.4127, + "step": 34120 + }, + { + "epoch": 4.94691451969417, + "grad_norm": 1.2852033376693726, + "learning_rate": 5.45006522684447e-07, + "loss": 0.3827, + "step": 34130 + }, + { + "epoch": 4.948363952603544, + "grad_norm": 1.318913221359253, + "learning_rate": 5.30511668357733e-07, + "loss": 0.4089, + "step": 34140 + }, + { + "epoch": 4.949813385512918, + "grad_norm": 1.2271665334701538, + "learning_rate": 5.16016814031019e-07, + "loss": 0.3706, + "step": 34150 + }, + { + "epoch": 4.951262818422292, + "grad_norm": 1.5632954835891724, + "learning_rate": 5.015219597043051e-07, + "loss": 0.3624, + "step": 34160 + }, + { + "epoch": 4.952712251331667, + "grad_norm": 3.3097076416015625, + "learning_rate": 4.87027105377591e-07, + "loss": 0.3476, + "step": 34170 + }, + { + "epoch": 4.954161684241041, + "grad_norm": 1.1198471784591675, + "learning_rate": 4.7253225105087695e-07, + "loss": 0.405, + "step": 34180 + }, + { + "epoch": 4.955611117150415, + "grad_norm": 0.8534343838691711, + "learning_rate": 4.5803739672416295e-07, + "loss": 0.3386, + "step": 34190 + }, + { + "epoch": 4.957060550059789, + "grad_norm": 2.1439709663391113, + "learning_rate": 4.435425423974489e-07, + "loss": 0.3165, + "step": 34200 + }, + { + "epoch": 4.958509982969163, + "grad_norm": 1.0483152866363525, + "learning_rate": 4.290476880707349e-07, + "loss": 0.3532, + "step": 34210 + }, + { + "epoch": 4.959959415878537, + "grad_norm": 0.9778922200202942, + "learning_rate": 4.1455283374402085e-07, + "loss": 0.3545, + "step": 34220 + }, + { + "epoch": 4.9614088487879116, + "grad_norm": 1.4083807468414307, + "learning_rate": 4.0005797941730685e-07, + "loss": 0.4085, + "step": 34230 + }, + { + "epoch": 4.962858281697286, + "grad_norm": 3.145470142364502, + "learning_rate": 3.8556312509059285e-07, + "loss": 0.3465, + "step": 34240 + }, + { + "epoch": 4.96430771460666, + "grad_norm": 1.1022526025772095, + "learning_rate": 3.7106827076387885e-07, + "loss": 0.338, + "step": 34250 + }, + { + "epoch": 4.965757147516034, + "grad_norm": 1.3441957235336304, + "learning_rate": 3.565734164371648e-07, + "loss": 0.3876, + "step": 34260 + }, + { + "epoch": 4.967206580425408, + "grad_norm": 1.3486835956573486, + "learning_rate": 3.4207856211045085e-07, + "loss": 0.339, + "step": 34270 + }, + { + "epoch": 4.968656013334783, + "grad_norm": 2.366464853286743, + "learning_rate": 3.275837077837368e-07, + "loss": 0.4093, + "step": 34280 + }, + { + "epoch": 4.9701054462441565, + "grad_norm": 2.5643863677978516, + "learning_rate": 3.130888534570228e-07, + "loss": 0.3802, + "step": 34290 + }, + { + "epoch": 4.971554879153532, + "grad_norm": 1.02238130569458, + "learning_rate": 2.9859399913030874e-07, + "loss": 0.3807, + "step": 34300 + }, + { + "epoch": 4.973004312062906, + "grad_norm": 3.854581594467163, + "learning_rate": 2.8409914480359474e-07, + "loss": 0.3604, + "step": 34310 + }, + { + "epoch": 4.97445374497228, + "grad_norm": 2.844658374786377, + "learning_rate": 2.6960429047688074e-07, + "loss": 0.3959, + "step": 34320 + }, + { + "epoch": 4.975903177881654, + "grad_norm": 1.9395471811294556, + "learning_rate": 2.551094361501667e-07, + "loss": 0.3671, + "step": 34330 + }, + { + "epoch": 4.977352610791028, + "grad_norm": 1.0512335300445557, + "learning_rate": 2.420640672561241e-07, + "loss": 0.3511, + "step": 34340 + }, + { + "epoch": 4.978802043700402, + "grad_norm": 1.3229668140411377, + "learning_rate": 2.275692129294101e-07, + "loss": 0.4025, + "step": 34350 + }, + { + "epoch": 4.9802514766097765, + "grad_norm": 0.8195211291313171, + "learning_rate": 2.1307435860269607e-07, + "loss": 0.3992, + "step": 34360 + }, + { + "epoch": 4.981700909519151, + "grad_norm": 1.1004067659378052, + "learning_rate": 1.9857950427598204e-07, + "loss": 0.3877, + "step": 34370 + }, + { + "epoch": 4.983150342428525, + "grad_norm": 1.3482115268707275, + "learning_rate": 1.8408464994926802e-07, + "loss": 0.3713, + "step": 34380 + }, + { + "epoch": 4.984599775337899, + "grad_norm": 1.212549090385437, + "learning_rate": 1.69589795622554e-07, + "loss": 0.4233, + "step": 34390 + }, + { + "epoch": 4.986049208247273, + "grad_norm": 3.041564702987671, + "learning_rate": 1.5509494129583997e-07, + "loss": 0.3125, + "step": 34400 + }, + { + "epoch": 4.987498641156647, + "grad_norm": 2.7439684867858887, + "learning_rate": 1.4060008696912597e-07, + "loss": 0.3424, + "step": 34410 + }, + { + "epoch": 4.9889480740660215, + "grad_norm": 2.762303590774536, + "learning_rate": 1.2610523264241194e-07, + "loss": 0.3415, + "step": 34420 + }, + { + "epoch": 4.990397506975396, + "grad_norm": 2.809271812438965, + "learning_rate": 1.1161037831569793e-07, + "loss": 0.3471, + "step": 34430 + }, + { + "epoch": 4.99184693988477, + "grad_norm": 2.674682855606079, + "learning_rate": 9.711552398898391e-08, + "loss": 0.359, + "step": 34440 + }, + { + "epoch": 4.993296372794144, + "grad_norm": 1.3303627967834473, + "learning_rate": 8.26206696622699e-08, + "loss": 0.3938, + "step": 34450 + }, + { + "epoch": 4.994745805703518, + "grad_norm": 1.5275235176086426, + "learning_rate": 6.812581533555588e-08, + "loss": 0.4174, + "step": 34460 + }, + { + "epoch": 4.996195238612893, + "grad_norm": 1.2861908674240112, + "learning_rate": 5.363096100884186e-08, + "loss": 0.3857, + "step": 34470 + }, + { + "epoch": 4.997644671522266, + "grad_norm": 3.7214603424072266, + "learning_rate": 3.913610668212785e-08, + "loss": 0.3666, + "step": 34480 + }, + { + "epoch": 4.9990941044316415, + "grad_norm": 4.302470684051514, + "learning_rate": 2.464125235541383e-08, + "loss": 0.3139, + "step": 34490 + } + ], + "logging_steps": 10, + "max_steps": 34495, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.471826509366821e+18, + "train_batch_size": 10, + "trial_name": null, + "trial_params": null +}