{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 2.131298542022705, "learning_rate": 0.00019962000000000002, "loss": 2.0121, "mean_token_accuracy": 0.6703190118074417, "num_tokens": 2348.0, "step": 20 }, { "epoch": 0.008, "grad_norm": 1.2890418767929077, "learning_rate": 0.00019922, "loss": 0.2751, "mean_token_accuracy": 0.9120438575744629, "num_tokens": 4697.0, "step": 40 }, { "epoch": 0.012, "grad_norm": 0.8126867413520813, "learning_rate": 0.00019882, "loss": 0.1966, "mean_token_accuracy": 0.9203487157821655, "num_tokens": 7014.0, "step": 60 }, { "epoch": 0.016, "grad_norm": 0.6051881313323975, "learning_rate": 0.00019842000000000001, "loss": 0.1851, "mean_token_accuracy": 0.9291799515485764, "num_tokens": 9327.0, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.6378348469734192, "learning_rate": 0.00019802, "loss": 0.1766, "mean_token_accuracy": 0.9286984890699387, "num_tokens": 11670.0, "step": 100 }, { "epoch": 0.024, "grad_norm": 0.624138593673706, "learning_rate": 0.00019762, "loss": 0.1784, "mean_token_accuracy": 0.9264282643795013, "num_tokens": 14000.0, "step": 120 }, { "epoch": 0.028, "grad_norm": 0.2111046463251114, "learning_rate": 0.00019722, "loss": 0.1702, "mean_token_accuracy": 0.9321970880031586, "num_tokens": 16329.0, "step": 140 }, { "epoch": 0.032, "grad_norm": 0.5350440740585327, "learning_rate": 0.00019682, "loss": 0.171, "mean_token_accuracy": 0.9311463803052902, "num_tokens": 18667.0, "step": 160 }, { "epoch": 0.036, "grad_norm": 0.19237647950649261, "learning_rate": 0.00019642, "loss": 0.167, "mean_token_accuracy": 0.9345656305551528, "num_tokens": 20985.0, "step": 180 }, { "epoch": 0.04, "grad_norm": 0.28153756260871887, "learning_rate": 0.00019602, "loss": 0.1674, "mean_token_accuracy": 0.9329667061567306, "num_tokens": 23325.0, "step": 200 }, { "epoch": 0.044, "grad_norm": 0.8545331954956055, "learning_rate": 0.00019562, "loss": 0.166, "mean_token_accuracy": 0.9344212204217911, "num_tokens": 25670.0, "step": 220 }, { "epoch": 0.048, "grad_norm": 0.24941129982471466, "learning_rate": 0.00019522, "loss": 0.1661, "mean_token_accuracy": 0.934859549999237, "num_tokens": 28016.0, "step": 240 }, { "epoch": 0.052, "grad_norm": 0.29549548029899597, "learning_rate": 0.00019482, "loss": 0.1707, "mean_token_accuracy": 0.9345517784357071, "num_tokens": 30345.0, "step": 260 }, { "epoch": 0.056, "grad_norm": 0.20388178527355194, "learning_rate": 0.00019442, "loss": 0.1673, "mean_token_accuracy": 0.9353183209896088, "num_tokens": 32691.0, "step": 280 }, { "epoch": 0.06, "grad_norm": 0.10762794315814972, "learning_rate": 0.00019402, "loss": 0.1642, "mean_token_accuracy": 0.9325390756130219, "num_tokens": 35030.0, "step": 300 }, { "epoch": 0.064, "grad_norm": 0.07676753401756287, "learning_rate": 0.00019362, "loss": 0.1633, "mean_token_accuracy": 0.9348760217428207, "num_tokens": 37359.0, "step": 320 }, { "epoch": 0.068, "grad_norm": 0.06781225651502609, "learning_rate": 0.00019322, "loss": 0.1589, "mean_token_accuracy": 0.936154904961586, "num_tokens": 39707.0, "step": 340 }, { "epoch": 0.072, "grad_norm": 0.10010460019111633, "learning_rate": 0.00019282000000000001, "loss": 0.1583, "mean_token_accuracy": 0.9410124599933625, "num_tokens": 42071.0, "step": 360 }, { "epoch": 0.076, "grad_norm": 0.07932794839143753, "learning_rate": 0.00019242, "loss": 0.1608, "mean_token_accuracy": 0.9380002528429031, "num_tokens": 44404.0, "step": 380 }, { "epoch": 0.08, "grad_norm": 0.06678586453199387, "learning_rate": 0.00019202, "loss": 0.1633, "mean_token_accuracy": 0.9347729980945587, "num_tokens": 46715.0, "step": 400 }, { "epoch": 0.084, "grad_norm": 0.05118393525481224, "learning_rate": 0.00019162, "loss": 0.1621, "mean_token_accuracy": 0.9356200367212295, "num_tokens": 49030.0, "step": 420 }, { "epoch": 0.088, "grad_norm": 0.07563836127519608, "learning_rate": 0.00019122, "loss": 0.1603, "mean_token_accuracy": 0.9362942427396774, "num_tokens": 51366.0, "step": 440 }, { "epoch": 0.092, "grad_norm": 0.053388580679893494, "learning_rate": 0.00019082, "loss": 0.1585, "mean_token_accuracy": 0.9377258807420731, "num_tokens": 53706.0, "step": 460 }, { "epoch": 0.096, "grad_norm": 0.05659119412302971, "learning_rate": 0.00019042, "loss": 0.1575, "mean_token_accuracy": 0.937972965836525, "num_tokens": 56052.0, "step": 480 }, { "epoch": 0.1, "grad_norm": 0.04934714362025261, "learning_rate": 0.00019002, "loss": 0.1606, "mean_token_accuracy": 0.9356311202049256, "num_tokens": 58374.0, "step": 500 }, { "epoch": 0.104, "grad_norm": 0.05647804215550423, "learning_rate": 0.00018962000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9353525519371033, "num_tokens": 60706.0, "step": 520 }, { "epoch": 0.108, "grad_norm": 0.058523017913103104, "learning_rate": 0.00018922, "loss": 0.1595, "mean_token_accuracy": 0.9371987581253052, "num_tokens": 63021.0, "step": 540 }, { "epoch": 0.112, "grad_norm": 0.03793497756123543, "learning_rate": 0.00018882000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9341553807258606, "num_tokens": 65369.0, "step": 560 }, { "epoch": 0.116, "grad_norm": 0.04743633046746254, "learning_rate": 0.00018842000000000002, "loss": 0.1586, "mean_token_accuracy": 0.936157900094986, "num_tokens": 67700.0, "step": 580 }, { "epoch": 0.12, "grad_norm": 0.05463261529803276, "learning_rate": 0.00018802, "loss": 0.1578, "mean_token_accuracy": 0.937694975733757, "num_tokens": 70038.0, "step": 600 }, { "epoch": 0.124, "grad_norm": 0.11279874294996262, "learning_rate": 0.00018762000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9382745862007141, "num_tokens": 72367.0, "step": 620 }, { "epoch": 0.128, "grad_norm": 0.050823234021663666, "learning_rate": 0.00018722, "loss": 0.1602, "mean_token_accuracy": 0.9388011395931244, "num_tokens": 74677.0, "step": 640 }, { "epoch": 0.132, "grad_norm": 0.04983159899711609, "learning_rate": 0.00018682000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9374112606048584, "num_tokens": 77014.0, "step": 660 }, { "epoch": 0.136, "grad_norm": 0.05084273964166641, "learning_rate": 0.00018642000000000002, "loss": 0.1591, "mean_token_accuracy": 0.9387890577316285, "num_tokens": 79339.0, "step": 680 }, { "epoch": 0.14, "grad_norm": 0.04936506226658821, "learning_rate": 0.00018602, "loss": 0.157, "mean_token_accuracy": 0.9359721839427948, "num_tokens": 81688.0, "step": 700 }, { "epoch": 0.144, "grad_norm": 0.04119481146335602, "learning_rate": 0.00018562000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9359663873910904, "num_tokens": 84025.0, "step": 720 }, { "epoch": 0.148, "grad_norm": 0.03988514095544815, "learning_rate": 0.00018522000000000002, "loss": 0.1595, "mean_token_accuracy": 0.9372004926204681, "num_tokens": 86341.0, "step": 740 }, { "epoch": 0.152, "grad_norm": 0.04381653666496277, "learning_rate": 0.00018482, "loss": 0.1591, "mean_token_accuracy": 0.9373360633850097, "num_tokens": 88665.0, "step": 760 }, { "epoch": 0.156, "grad_norm": 0.03504428267478943, "learning_rate": 0.00018442000000000003, "loss": 0.1579, "mean_token_accuracy": 0.9403579801321029, "num_tokens": 90997.0, "step": 780 }, { "epoch": 0.16, "grad_norm": 0.03745226562023163, "learning_rate": 0.00018402000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9388439536094666, "num_tokens": 93322.0, "step": 800 }, { "epoch": 0.164, "grad_norm": 0.033838506788015366, "learning_rate": 0.00018362, "loss": 0.1562, "mean_token_accuracy": 0.9360319077968597, "num_tokens": 95688.0, "step": 820 }, { "epoch": 0.168, "grad_norm": 0.0314440056681633, "learning_rate": 0.00018322000000000002, "loss": 0.1596, "mean_token_accuracy": 0.9349893003702163, "num_tokens": 98005.0, "step": 840 }, { "epoch": 0.172, "grad_norm": 0.06577116250991821, "learning_rate": 0.00018282000000000001, "loss": 0.1589, "mean_token_accuracy": 0.9364802747964859, "num_tokens": 100327.0, "step": 860 }, { "epoch": 0.176, "grad_norm": 0.029735982418060303, "learning_rate": 0.00018242, "loss": 0.1576, "mean_token_accuracy": 0.9365677893161773, "num_tokens": 102671.0, "step": 880 }, { "epoch": 0.18, "grad_norm": 0.03155644237995148, "learning_rate": 0.00018202000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9387517213821411, "num_tokens": 104996.0, "step": 900 }, { "epoch": 0.184, "grad_norm": 0.027411427348852158, "learning_rate": 0.00018162, "loss": 0.1581, "mean_token_accuracy": 0.9342827945947647, "num_tokens": 107337.0, "step": 920 }, { "epoch": 0.188, "grad_norm": 0.024014495313167572, "learning_rate": 0.00018122, "loss": 0.1583, "mean_token_accuracy": 0.9355534523725509, "num_tokens": 109674.0, "step": 940 }, { "epoch": 0.192, "grad_norm": 0.02990046516060829, "learning_rate": 0.00018082000000000002, "loss": 0.1577, "mean_token_accuracy": 0.9373938798904419, "num_tokens": 112015.0, "step": 960 }, { "epoch": 0.196, "grad_norm": 0.020676879212260246, "learning_rate": 0.00018042, "loss": 0.1582, "mean_token_accuracy": 0.9363098949193954, "num_tokens": 114349.0, "step": 980 }, { "epoch": 0.2, "grad_norm": 0.024556942284107208, "learning_rate": 0.00018002, "loss": 0.159, "mean_token_accuracy": 0.9355833351612091, "num_tokens": 116672.0, "step": 1000 }, { "epoch": 0.204, "grad_norm": 0.22343556582927704, "learning_rate": 0.00017962000000000002, "loss": 0.157, "mean_token_accuracy": 0.9363467574119568, "num_tokens": 119023.0, "step": 1020 }, { "epoch": 0.208, "grad_norm": 0.515048623085022, "learning_rate": 0.00017922, "loss": 0.1585, "mean_token_accuracy": 0.9374267637729645, "num_tokens": 121366.0, "step": 1040 }, { "epoch": 0.212, "grad_norm": 0.2672664225101471, "learning_rate": 0.00017882, "loss": 0.1704, "mean_token_accuracy": 0.938492265343666, "num_tokens": 123719.0, "step": 1060 }, { "epoch": 0.216, "grad_norm": 0.11909265071153641, "learning_rate": 0.00017842000000000002, "loss": 0.1752, "mean_token_accuracy": 0.9297878712415695, "num_tokens": 126053.0, "step": 1080 }, { "epoch": 0.22, "grad_norm": 0.11977271735668182, "learning_rate": 0.00017802, "loss": 0.1652, "mean_token_accuracy": 0.9360411554574967, "num_tokens": 128379.0, "step": 1100 }, { "epoch": 0.224, "grad_norm": 0.17722292244434357, "learning_rate": 0.00017762, "loss": 0.1697, "mean_token_accuracy": 0.9314894318580628, "num_tokens": 130700.0, "step": 1120 }, { "epoch": 0.228, "grad_norm": 0.8375388979911804, "learning_rate": 0.00017722000000000001, "loss": 0.1895, "mean_token_accuracy": 0.9316652357578278, "num_tokens": 133026.0, "step": 1140 }, { "epoch": 0.232, "grad_norm": 0.10569056123495102, "learning_rate": 0.00017682, "loss": 0.1679, "mean_token_accuracy": 0.9333775132894516, "num_tokens": 135371.0, "step": 1160 }, { "epoch": 0.236, "grad_norm": 0.07626856118440628, "learning_rate": 0.00017642, "loss": 0.1613, "mean_token_accuracy": 0.9380175620317459, "num_tokens": 137695.0, "step": 1180 }, { "epoch": 0.24, "grad_norm": 0.06852507591247559, "learning_rate": 0.00017602, "loss": 0.1693, "mean_token_accuracy": 0.9329774439334869, "num_tokens": 140007.0, "step": 1200 }, { "epoch": 0.244, "grad_norm": 0.11691898107528687, "learning_rate": 0.00017562, "loss": 0.159, "mean_token_accuracy": 0.9360336065292358, "num_tokens": 142345.0, "step": 1220 }, { "epoch": 0.248, "grad_norm": 0.10780195891857147, "learning_rate": 0.00017522000000000002, "loss": 0.1586, "mean_token_accuracy": 0.9376411676406861, "num_tokens": 144693.0, "step": 1240 }, { "epoch": 0.252, "grad_norm": 0.07631397247314453, "learning_rate": 0.00017482, "loss": 0.1633, "mean_token_accuracy": 0.9390978574752807, "num_tokens": 147031.0, "step": 1260 }, { "epoch": 0.256, "grad_norm": 0.05569858103990555, "learning_rate": 0.00017442, "loss": 0.1615, "mean_token_accuracy": 0.9400094121694564, "num_tokens": 149371.0, "step": 1280 }, { "epoch": 0.26, "grad_norm": 0.04815123230218887, "learning_rate": 0.00017402000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9370361328125, "num_tokens": 151716.0, "step": 1300 }, { "epoch": 0.264, "grad_norm": 0.11904877424240112, "learning_rate": 0.00017362, "loss": 0.159, "mean_token_accuracy": 0.941026845574379, "num_tokens": 154046.0, "step": 1320 }, { "epoch": 0.268, "grad_norm": 0.12382964044809341, "learning_rate": 0.00017322, "loss": 0.1594, "mean_token_accuracy": 0.9340965986251831, "num_tokens": 156382.0, "step": 1340 }, { "epoch": 0.272, "grad_norm": 0.05523005872964859, "learning_rate": 0.00017282000000000002, "loss": 0.1593, "mean_token_accuracy": 0.9396583586931229, "num_tokens": 158705.0, "step": 1360 }, { "epoch": 0.276, "grad_norm": 0.05591598525643349, "learning_rate": 0.00017242, "loss": 0.1592, "mean_token_accuracy": 0.9364501267671586, "num_tokens": 161025.0, "step": 1380 }, { "epoch": 0.28, "grad_norm": 0.06780663877725601, "learning_rate": 0.00017202, "loss": 0.1618, "mean_token_accuracy": 0.936437115073204, "num_tokens": 163345.0, "step": 1400 }, { "epoch": 0.284, "grad_norm": 0.03291817009449005, "learning_rate": 0.00017162000000000001, "loss": 0.1569, "mean_token_accuracy": 0.9380638599395752, "num_tokens": 165711.0, "step": 1420 }, { "epoch": 0.288, "grad_norm": 0.04884820431470871, "learning_rate": 0.00017122, "loss": 0.1608, "mean_token_accuracy": 0.9370934247970581, "num_tokens": 168007.0, "step": 1440 }, { "epoch": 0.292, "grad_norm": 0.04577581211924553, "learning_rate": 0.00017082, "loss": 0.1595, "mean_token_accuracy": 0.9374603897333145, "num_tokens": 170332.0, "step": 1460 }, { "epoch": 0.296, "grad_norm": 0.03866467997431755, "learning_rate": 0.00017042, "loss": 0.1582, "mean_token_accuracy": 0.9355076909065246, "num_tokens": 172667.0, "step": 1480 }, { "epoch": 0.3, "grad_norm": 0.06204424798488617, "learning_rate": 0.00017002, "loss": 0.1599, "mean_token_accuracy": 0.9372841835021972, "num_tokens": 174970.0, "step": 1500 }, { "epoch": 0.304, "grad_norm": 0.03288702666759491, "learning_rate": 0.00016962, "loss": 0.1578, "mean_token_accuracy": 0.93585424721241, "num_tokens": 177317.0, "step": 1520 }, { "epoch": 0.308, "grad_norm": 0.03605024516582489, "learning_rate": 0.00016922, "loss": 0.1597, "mean_token_accuracy": 0.9366346269845962, "num_tokens": 179629.0, "step": 1540 }, { "epoch": 0.312, "grad_norm": 0.03328383341431618, "learning_rate": 0.00016882, "loss": 0.1582, "mean_token_accuracy": 0.9372009009122848, "num_tokens": 181960.0, "step": 1560 }, { "epoch": 0.316, "grad_norm": 0.03522924706339836, "learning_rate": 0.00016842, "loss": 0.1572, "mean_token_accuracy": 0.9355730235576629, "num_tokens": 184314.0, "step": 1580 }, { "epoch": 0.32, "grad_norm": 0.0317777544260025, "learning_rate": 0.00016802, "loss": 0.1572, "mean_token_accuracy": 0.9378984242677688, "num_tokens": 186658.0, "step": 1600 }, { "epoch": 0.324, "grad_norm": 0.07111163437366486, "learning_rate": 0.00016762, "loss": 0.1613, "mean_token_accuracy": 0.9344337552785873, "num_tokens": 188974.0, "step": 1620 }, { "epoch": 0.328, "grad_norm": 0.04765714704990387, "learning_rate": 0.00016722, "loss": 0.1608, "mean_token_accuracy": 0.9345141768455505, "num_tokens": 191276.0, "step": 1640 }, { "epoch": 0.332, "grad_norm": 0.041960619390010834, "learning_rate": 0.00016682, "loss": 0.1594, "mean_token_accuracy": 0.9375987917184829, "num_tokens": 193597.0, "step": 1660 }, { "epoch": 0.336, "grad_norm": 0.041757769882678986, "learning_rate": 0.00016642, "loss": 0.157, "mean_token_accuracy": 0.9367031455039978, "num_tokens": 195949.0, "step": 1680 }, { "epoch": 0.34, "grad_norm": 0.05323236435651779, "learning_rate": 0.00016601999999999999, "loss": 0.1654, "mean_token_accuracy": 0.9359392642974853, "num_tokens": 198267.0, "step": 1700 }, { "epoch": 0.344, "grad_norm": 0.08934314548969269, "learning_rate": 0.00016562, "loss": 0.1619, "mean_token_accuracy": 0.9331722050905228, "num_tokens": 200613.0, "step": 1720 }, { "epoch": 0.348, "grad_norm": 0.033347178250551224, "learning_rate": 0.00016522, "loss": 0.1611, "mean_token_accuracy": 0.9317500472068787, "num_tokens": 202927.0, "step": 1740 }, { "epoch": 0.352, "grad_norm": 0.03238425776362419, "learning_rate": 0.00016482, "loss": 0.1582, "mean_token_accuracy": 0.9358267247676849, "num_tokens": 205273.0, "step": 1760 }, { "epoch": 0.356, "grad_norm": 0.03249628096818924, "learning_rate": 0.00016442000000000003, "loss": 0.1591, "mean_token_accuracy": 0.9367831707000732, "num_tokens": 207595.0, "step": 1780 }, { "epoch": 0.36, "grad_norm": 0.034572117030620575, "learning_rate": 0.00016402000000000002, "loss": 0.1612, "mean_token_accuracy": 0.9356453567743301, "num_tokens": 209892.0, "step": 1800 }, { "epoch": 0.364, "grad_norm": 0.04566624388098717, "learning_rate": 0.00016362, "loss": 0.1605, "mean_token_accuracy": 0.9359289228916168, "num_tokens": 212194.0, "step": 1820 }, { "epoch": 0.368, "grad_norm": 0.029180865734815598, "learning_rate": 0.00016322000000000003, "loss": 0.1579, "mean_token_accuracy": 0.9378338158130646, "num_tokens": 214535.0, "step": 1840 }, { "epoch": 0.372, "grad_norm": 0.04812979698181152, "learning_rate": 0.00016282000000000002, "loss": 0.1576, "mean_token_accuracy": 0.9391636937856674, "num_tokens": 216872.0, "step": 1860 }, { "epoch": 0.376, "grad_norm": 0.06872449070215225, "learning_rate": 0.00016242, "loss": 0.1606, "mean_token_accuracy": 0.9389324098825454, "num_tokens": 219184.0, "step": 1880 }, { "epoch": 0.38, "grad_norm": 0.05308040603995323, "learning_rate": 0.00016202000000000002, "loss": 0.1592, "mean_token_accuracy": 0.9385550439357757, "num_tokens": 221499.0, "step": 1900 }, { "epoch": 0.384, "grad_norm": 0.13082465529441833, "learning_rate": 0.00016162000000000001, "loss": 0.1579, "mean_token_accuracy": 0.9380007416009903, "num_tokens": 223847.0, "step": 1920 }, { "epoch": 0.388, "grad_norm": 0.03414028137922287, "learning_rate": 0.00016122, "loss": 0.1602, "mean_token_accuracy": 0.9369382321834564, "num_tokens": 226156.0, "step": 1940 }, { "epoch": 0.392, "grad_norm": 0.04112333804368973, "learning_rate": 0.00016082000000000002, "loss": 0.1597, "mean_token_accuracy": 0.935492268204689, "num_tokens": 228475.0, "step": 1960 }, { "epoch": 0.396, "grad_norm": 0.02955610118806362, "learning_rate": 0.00016042, "loss": 0.1591, "mean_token_accuracy": 0.93908212184906, "num_tokens": 230792.0, "step": 1980 }, { "epoch": 0.4, "grad_norm": 0.024307863786816597, "learning_rate": 0.00016002, "loss": 0.1595, "mean_token_accuracy": 0.9393438696861267, "num_tokens": 233105.0, "step": 2000 }, { "epoch": 0.404, "grad_norm": 0.031049860641360283, "learning_rate": 0.00015962000000000002, "loss": 0.157, "mean_token_accuracy": 0.9350471049547195, "num_tokens": 235457.0, "step": 2020 }, { "epoch": 0.408, "grad_norm": 0.031201306730508804, "learning_rate": 0.00015922, "loss": 0.1597, "mean_token_accuracy": 0.9370091885328293, "num_tokens": 237770.0, "step": 2040 }, { "epoch": 0.412, "grad_norm": 0.03218454122543335, "learning_rate": 0.00015882, "loss": 0.159, "mean_token_accuracy": 0.9386389076709747, "num_tokens": 240090.0, "step": 2060 }, { "epoch": 0.416, "grad_norm": 0.03348623961210251, "learning_rate": 0.00015842000000000002, "loss": 0.1597, "mean_token_accuracy": 0.9362810254096985, "num_tokens": 242393.0, "step": 2080 }, { "epoch": 0.42, "grad_norm": 0.04007818177342415, "learning_rate": 0.00015802, "loss": 0.1581, "mean_token_accuracy": 0.9370237767696381, "num_tokens": 244733.0, "step": 2100 }, { "epoch": 0.424, "grad_norm": 0.03378809243440628, "learning_rate": 0.00015762, "loss": 0.1568, "mean_token_accuracy": 0.9377921044826507, "num_tokens": 247088.0, "step": 2120 }, { "epoch": 0.428, "grad_norm": 0.02798735350370407, "learning_rate": 0.00015722000000000002, "loss": 0.1586, "mean_token_accuracy": 0.9383140057325363, "num_tokens": 249415.0, "step": 2140 }, { "epoch": 0.432, "grad_norm": 0.02400992065668106, "learning_rate": 0.00015682, "loss": 0.1581, "mean_token_accuracy": 0.9389418184757232, "num_tokens": 251751.0, "step": 2160 }, { "epoch": 0.436, "grad_norm": 0.028334975242614746, "learning_rate": 0.00015642000000000002, "loss": 0.1578, "mean_token_accuracy": 0.936533722281456, "num_tokens": 254093.0, "step": 2180 }, { "epoch": 0.44, "grad_norm": 0.024794427677989006, "learning_rate": 0.00015602000000000001, "loss": 0.1587, "mean_token_accuracy": 0.9344067484140396, "num_tokens": 256422.0, "step": 2200 }, { "epoch": 0.444, "grad_norm": 0.024761928245425224, "learning_rate": 0.00015562, "loss": 0.1589, "mean_token_accuracy": 0.936401879787445, "num_tokens": 258747.0, "step": 2220 }, { "epoch": 0.448, "grad_norm": 0.023300737142562866, "learning_rate": 0.00015522000000000002, "loss": 0.1594, "mean_token_accuracy": 0.9409920126199722, "num_tokens": 261059.0, "step": 2240 }, { "epoch": 0.452, "grad_norm": 0.023498738184571266, "learning_rate": 0.00015482, "loss": 0.1595, "mean_token_accuracy": 0.9378518283367157, "num_tokens": 263369.0, "step": 2260 }, { "epoch": 0.456, "grad_norm": 0.020731788128614426, "learning_rate": 0.00015442, "loss": 0.1586, "mean_token_accuracy": 0.9395518034696579, "num_tokens": 265691.0, "step": 2280 }, { "epoch": 0.46, "grad_norm": 0.03587990626692772, "learning_rate": 0.00015402000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9378482937812805, "num_tokens": 268016.0, "step": 2300 }, { "epoch": 0.464, "grad_norm": 0.03809090331196785, "learning_rate": 0.00015362, "loss": 0.1588, "mean_token_accuracy": 0.9367815405130386, "num_tokens": 270334.0, "step": 2320 }, { "epoch": 0.468, "grad_norm": 0.04313996061682701, "learning_rate": 0.00015322, "loss": 0.1593, "mean_token_accuracy": 0.9363488733768464, "num_tokens": 272651.0, "step": 2340 }, { "epoch": 0.472, "grad_norm": 0.033811088651418686, "learning_rate": 0.00015282000000000002, "loss": 0.1594, "mean_token_accuracy": 0.9361798793077469, "num_tokens": 274964.0, "step": 2360 }, { "epoch": 0.476, "grad_norm": 0.03164658322930336, "learning_rate": 0.00015242, "loss": 0.1598, "mean_token_accuracy": 0.9361578047275543, "num_tokens": 277276.0, "step": 2380 }, { "epoch": 0.48, "grad_norm": 0.030314739793539047, "learning_rate": 0.00015202, "loss": 0.1601, "mean_token_accuracy": 0.9364828914403915, "num_tokens": 279583.0, "step": 2400 }, { "epoch": 0.484, "grad_norm": 0.03359575197100639, "learning_rate": 0.00015162000000000002, "loss": 0.1577, "mean_token_accuracy": 0.9370669215917588, "num_tokens": 281927.0, "step": 2420 }, { "epoch": 0.488, "grad_norm": 0.025949697941541672, "learning_rate": 0.00015122, "loss": 0.1582, "mean_token_accuracy": 0.9364959686994553, "num_tokens": 284267.0, "step": 2440 }, { "epoch": 0.492, "grad_norm": 0.031149016693234444, "learning_rate": 0.00015082, "loss": 0.1589, "mean_token_accuracy": 0.9384445637464524, "num_tokens": 286585.0, "step": 2460 }, { "epoch": 0.496, "grad_norm": 0.031569018959999084, "learning_rate": 0.00015042, "loss": 0.1589, "mean_token_accuracy": 0.9359267175197601, "num_tokens": 288907.0, "step": 2480 }, { "epoch": 0.5, "grad_norm": 0.02912713773548603, "learning_rate": 0.00015002, "loss": 0.1582, "mean_token_accuracy": 0.9354292452335358, "num_tokens": 291240.0, "step": 2500 }, { "epoch": 0.504, "grad_norm": 0.029648004099726677, "learning_rate": 0.00014962, "loss": 0.1577, "mean_token_accuracy": 0.9371567130088806, "num_tokens": 293587.0, "step": 2520 }, { "epoch": 0.508, "grad_norm": 0.01994331367313862, "learning_rate": 0.00014922, "loss": 0.1565, "mean_token_accuracy": 0.9374659866094589, "num_tokens": 295947.0, "step": 2540 }, { "epoch": 0.512, "grad_norm": 0.022220291197299957, "learning_rate": 0.00014882, "loss": 0.1576, "mean_token_accuracy": 0.9352899432182312, "num_tokens": 298291.0, "step": 2560 }, { "epoch": 0.516, "grad_norm": 0.019389133900403976, "learning_rate": 0.00014842, "loss": 0.1579, "mean_token_accuracy": 0.9369660496711731, "num_tokens": 300630.0, "step": 2580 }, { "epoch": 0.52, "grad_norm": 0.025073856115341187, "learning_rate": 0.00014802, "loss": 0.1594, "mean_token_accuracy": 0.9384920775890351, "num_tokens": 302943.0, "step": 2600 }, { "epoch": 0.524, "grad_norm": 0.02601858787238598, "learning_rate": 0.00014762, "loss": 0.1589, "mean_token_accuracy": 0.9373117983341217, "num_tokens": 305265.0, "step": 2620 }, { "epoch": 0.528, "grad_norm": 0.0248605664819479, "learning_rate": 0.00014722, "loss": 0.1594, "mean_token_accuracy": 0.9345557481050492, "num_tokens": 307583.0, "step": 2640 }, { "epoch": 0.532, "grad_norm": 0.022037120535969734, "learning_rate": 0.00014682, "loss": 0.1596, "mean_token_accuracy": 0.9348031014204026, "num_tokens": 309896.0, "step": 2660 }, { "epoch": 0.536, "grad_norm": 0.03458873927593231, "learning_rate": 0.00014642, "loss": 0.1572, "mean_token_accuracy": 0.9376107037067414, "num_tokens": 312244.0, "step": 2680 }, { "epoch": 0.54, "grad_norm": 0.016396528109908104, "learning_rate": 0.00014602, "loss": 0.1582, "mean_token_accuracy": 0.9370677560567856, "num_tokens": 314578.0, "step": 2700 }, { "epoch": 0.544, "grad_norm": 0.01821085438132286, "learning_rate": 0.00014562, "loss": 0.1596, "mean_token_accuracy": 0.9343250393867493, "num_tokens": 316892.0, "step": 2720 }, { "epoch": 0.548, "grad_norm": 0.025619324296712875, "learning_rate": 0.00014522, "loss": 0.1591, "mean_token_accuracy": 0.9362540364265441, "num_tokens": 319213.0, "step": 2740 }, { "epoch": 0.552, "grad_norm": 0.01870078593492508, "learning_rate": 0.00014482, "loss": 0.1598, "mean_token_accuracy": 0.9338186293840408, "num_tokens": 321525.0, "step": 2760 }, { "epoch": 0.556, "grad_norm": 0.018730677664279938, "learning_rate": 0.00014442, "loss": 0.1566, "mean_token_accuracy": 0.9394799619913101, "num_tokens": 323880.0, "step": 2780 }, { "epoch": 0.56, "grad_norm": 0.019898803904652596, "learning_rate": 0.00014402, "loss": 0.1579, "mean_token_accuracy": 0.9384922862052918, "num_tokens": 326214.0, "step": 2800 }, { "epoch": 0.564, "grad_norm": 0.021964257583022118, "learning_rate": 0.00014362, "loss": 0.1588, "mean_token_accuracy": 0.9378354996442795, "num_tokens": 328536.0, "step": 2820 }, { "epoch": 0.568, "grad_norm": 0.02397042326629162, "learning_rate": 0.00014322, "loss": 0.157, "mean_token_accuracy": 0.9381829768419265, "num_tokens": 330888.0, "step": 2840 }, { "epoch": 0.572, "grad_norm": 0.017819812521338463, "learning_rate": 0.00014282, "loss": 0.1569, "mean_token_accuracy": 0.940555801987648, "num_tokens": 333234.0, "step": 2860 }, { "epoch": 0.576, "grad_norm": 0.019056344404816628, "learning_rate": 0.00014242, "loss": 0.1621, "mean_token_accuracy": 0.9321106940507888, "num_tokens": 335516.0, "step": 2880 }, { "epoch": 0.58, "grad_norm": 0.020357482135295868, "learning_rate": 0.00014202, "loss": 0.1586, "mean_token_accuracy": 0.9384677648544312, "num_tokens": 337843.0, "step": 2900 }, { "epoch": 0.584, "grad_norm": 0.02105647511780262, "learning_rate": 0.00014162, "loss": 0.1592, "mean_token_accuracy": 0.9367755681276322, "num_tokens": 340159.0, "step": 2920 }, { "epoch": 0.588, "grad_norm": 0.02058851346373558, "learning_rate": 0.00014122, "loss": 0.1583, "mean_token_accuracy": 0.9383471548557282, "num_tokens": 342487.0, "step": 2940 }, { "epoch": 0.592, "grad_norm": 0.022488698363304138, "learning_rate": 0.00014082, "loss": 0.1575, "mean_token_accuracy": 0.9383687317371369, "num_tokens": 344831.0, "step": 2960 }, { "epoch": 0.596, "grad_norm": 0.01753912679851055, "learning_rate": 0.00014042, "loss": 0.1583, "mean_token_accuracy": 0.9367416232824326, "num_tokens": 347163.0, "step": 2980 }, { "epoch": 0.6, "grad_norm": 0.01856599561870098, "learning_rate": 0.00014002, "loss": 0.156, "mean_token_accuracy": 0.9410459071397781, "num_tokens": 349528.0, "step": 3000 }, { "epoch": 0.604, "grad_norm": 0.018140017986297607, "learning_rate": 0.00013962000000000002, "loss": 0.1588, "mean_token_accuracy": 0.9355444282293319, "num_tokens": 351852.0, "step": 3020 }, { "epoch": 0.608, "grad_norm": 0.01846504583954811, "learning_rate": 0.00013922, "loss": 0.1592, "mean_token_accuracy": 0.9346548557281494, "num_tokens": 354175.0, "step": 3040 }, { "epoch": 0.612, "grad_norm": 0.020237931981682777, "learning_rate": 0.00013882000000000003, "loss": 0.1578, "mean_token_accuracy": 0.9351754993200302, "num_tokens": 356517.0, "step": 3060 }, { "epoch": 0.616, "grad_norm": 0.017988894134759903, "learning_rate": 0.00013842000000000002, "loss": 0.1596, "mean_token_accuracy": 0.9330779641866684, "num_tokens": 358833.0, "step": 3080 }, { "epoch": 0.62, "grad_norm": 0.01684187725186348, "learning_rate": 0.00013802, "loss": 0.1571, "mean_token_accuracy": 0.9366971403360367, "num_tokens": 361182.0, "step": 3100 }, { "epoch": 0.624, "grad_norm": 0.019796263426542282, "learning_rate": 0.00013762000000000003, "loss": 0.1594, "mean_token_accuracy": 0.937585511803627, "num_tokens": 363494.0, "step": 3120 }, { "epoch": 0.628, "grad_norm": 0.027366606518626213, "learning_rate": 0.00013722000000000002, "loss": 0.1579, "mean_token_accuracy": 0.9360651940107345, "num_tokens": 365832.0, "step": 3140 }, { "epoch": 0.632, "grad_norm": 0.01635519415140152, "learning_rate": 0.00013682, "loss": 0.1575, "mean_token_accuracy": 0.9379144310951233, "num_tokens": 368176.0, "step": 3160 }, { "epoch": 0.636, "grad_norm": 0.01604699157178402, "learning_rate": 0.00013642000000000003, "loss": 0.1597, "mean_token_accuracy": 0.9374749541282654, "num_tokens": 370489.0, "step": 3180 }, { "epoch": 0.64, "grad_norm": 0.01789054647088051, "learning_rate": 0.00013602000000000002, "loss": 0.1577, "mean_token_accuracy": 0.9346486628055573, "num_tokens": 372828.0, "step": 3200 }, { "epoch": 0.644, "grad_norm": 0.018880745396018028, "learning_rate": 0.00013562, "loss": 0.1591, "mean_token_accuracy": 0.9359114378690719, "num_tokens": 375150.0, "step": 3220 }, { "epoch": 0.648, "grad_norm": 0.017004717141389847, "learning_rate": 0.00013522000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9370229512453079, "num_tokens": 377478.0, "step": 3240 }, { "epoch": 0.652, "grad_norm": 0.016740689054131508, "learning_rate": 0.00013482000000000001, "loss": 0.1581, "mean_token_accuracy": 0.9358916640281677, "num_tokens": 379814.0, "step": 3260 }, { "epoch": 0.656, "grad_norm": 0.015598557889461517, "learning_rate": 0.00013442, "loss": 0.157, "mean_token_accuracy": 0.9368194431066513, "num_tokens": 382165.0, "step": 3280 }, { "epoch": 0.66, "grad_norm": 0.017107274383306503, "learning_rate": 0.00013402000000000002, "loss": 0.158, "mean_token_accuracy": 0.9370757102966308, "num_tokens": 384499.0, "step": 3300 }, { "epoch": 0.664, "grad_norm": 0.019015047699213028, "learning_rate": 0.00013362, "loss": 0.1578, "mean_token_accuracy": 0.938113397359848, "num_tokens": 386835.0, "step": 3320 }, { "epoch": 0.668, "grad_norm": 0.02084503509104252, "learning_rate": 0.00013322, "loss": 0.1594, "mean_token_accuracy": 0.9364703267812728, "num_tokens": 389148.0, "step": 3340 }, { "epoch": 0.672, "grad_norm": 0.02378230169415474, "learning_rate": 0.00013282000000000002, "loss": 0.1598, "mean_token_accuracy": 0.9386055022478104, "num_tokens": 391454.0, "step": 3360 }, { "epoch": 0.676, "grad_norm": 0.014733811840415001, "learning_rate": 0.00013242, "loss": 0.1585, "mean_token_accuracy": 0.938000163435936, "num_tokens": 393786.0, "step": 3380 }, { "epoch": 0.68, "grad_norm": 0.015965940430760384, "learning_rate": 0.00013202, "loss": 0.1594, "mean_token_accuracy": 0.9366149872541427, "num_tokens": 396103.0, "step": 3400 }, { "epoch": 0.684, "grad_norm": 0.014542631804943085, "learning_rate": 0.00013162000000000002, "loss": 0.161, "mean_token_accuracy": 0.9346071958541871, "num_tokens": 398396.0, "step": 3420 }, { "epoch": 0.688, "grad_norm": 0.014583873562514782, "learning_rate": 0.00013122, "loss": 0.1587, "mean_token_accuracy": 0.9348091840744018, "num_tokens": 400725.0, "step": 3440 }, { "epoch": 0.692, "grad_norm": 0.017750371247529984, "learning_rate": 0.00013082, "loss": 0.1577, "mean_token_accuracy": 0.9374497979879379, "num_tokens": 403064.0, "step": 3460 }, { "epoch": 0.696, "grad_norm": 0.01569240354001522, "learning_rate": 0.00013042000000000002, "loss": 0.1566, "mean_token_accuracy": 0.938677328824997, "num_tokens": 405418.0, "step": 3480 }, { "epoch": 0.7, "grad_norm": 0.012828970327973366, "learning_rate": 0.00013002, "loss": 0.1579, "mean_token_accuracy": 0.9367502212524415, "num_tokens": 407754.0, "step": 3500 }, { "epoch": 0.704, "grad_norm": 0.01372049655765295, "learning_rate": 0.00012962, "loss": 0.1559, "mean_token_accuracy": 0.9384414672851562, "num_tokens": 410118.0, "step": 3520 }, { "epoch": 0.708, "grad_norm": 0.015669073909521103, "learning_rate": 0.00012922, "loss": 0.159, "mean_token_accuracy": 0.9381854623556137, "num_tokens": 412440.0, "step": 3540 }, { "epoch": 0.712, "grad_norm": 0.017576146870851517, "learning_rate": 0.00012882, "loss": 0.1585, "mean_token_accuracy": 0.9391775250434875, "num_tokens": 414766.0, "step": 3560 }, { "epoch": 0.716, "grad_norm": 0.02452530339360237, "learning_rate": 0.00012842, "loss": 0.1559, "mean_token_accuracy": 0.9388908207416534, "num_tokens": 417117.0, "step": 3580 }, { "epoch": 0.72, "grad_norm": 0.019447200000286102, "learning_rate": 0.00012802, "loss": 0.1602, "mean_token_accuracy": 0.936455848813057, "num_tokens": 419428.0, "step": 3600 }, { "epoch": 0.724, "grad_norm": 0.021487407386302948, "learning_rate": 0.00012762, "loss": 0.158, "mean_token_accuracy": 0.9349821031093597, "num_tokens": 421763.0, "step": 3620 }, { "epoch": 0.728, "grad_norm": 0.013334971852600574, "learning_rate": 0.00012722000000000002, "loss": 0.1566, "mean_token_accuracy": 0.9395059019327163, "num_tokens": 424115.0, "step": 3640 }, { "epoch": 0.732, "grad_norm": 0.023315824568271637, "learning_rate": 0.00012682, "loss": 0.1596, "mean_token_accuracy": 0.936040785908699, "num_tokens": 426426.0, "step": 3660 }, { "epoch": 0.736, "grad_norm": 0.021865224465727806, "learning_rate": 0.00012642, "loss": 0.1582, "mean_token_accuracy": 0.9385276228189469, "num_tokens": 428761.0, "step": 3680 }, { "epoch": 0.74, "grad_norm": 0.014885502867400646, "learning_rate": 0.00012602000000000002, "loss": 0.1579, "mean_token_accuracy": 0.9356612026691437, "num_tokens": 431102.0, "step": 3700 }, { "epoch": 0.744, "grad_norm": 0.020914755761623383, "learning_rate": 0.00012562, "loss": 0.1576, "mean_token_accuracy": 0.9347645163536071, "num_tokens": 433444.0, "step": 3720 }, { "epoch": 0.748, "grad_norm": 0.011823791079223156, "learning_rate": 0.00012522, "loss": 0.1587, "mean_token_accuracy": 0.93475821018219, "num_tokens": 435772.0, "step": 3740 }, { "epoch": 0.752, "grad_norm": 0.01868574135005474, "learning_rate": 0.00012482000000000001, "loss": 0.1616, "mean_token_accuracy": 0.9349653989076614, "num_tokens": 438056.0, "step": 3760 }, { "epoch": 0.756, "grad_norm": 0.013049358502030373, "learning_rate": 0.00012442, "loss": 0.1581, "mean_token_accuracy": 0.9381139695644378, "num_tokens": 440390.0, "step": 3780 }, { "epoch": 0.76, "grad_norm": 0.01722385175526142, "learning_rate": 0.00012402, "loss": 0.1568, "mean_token_accuracy": 0.9363266348838806, "num_tokens": 442741.0, "step": 3800 }, { "epoch": 0.764, "grad_norm": 0.014651446603238583, "learning_rate": 0.00012362, "loss": 0.1568, "mean_token_accuracy": 0.9367681205272674, "num_tokens": 445092.0, "step": 3820 }, { "epoch": 0.768, "grad_norm": 0.012667631730437279, "learning_rate": 0.00012322, "loss": 0.1583, "mean_token_accuracy": 0.9367698729038239, "num_tokens": 447426.0, "step": 3840 }, { "epoch": 0.772, "grad_norm": 0.017640771344304085, "learning_rate": 0.00012282, "loss": 0.1607, "mean_token_accuracy": 0.9369511902332306, "num_tokens": 449722.0, "step": 3860 }, { "epoch": 0.776, "grad_norm": 0.0181003175675869, "learning_rate": 0.00012242, "loss": 0.158, "mean_token_accuracy": 0.9366753160953522, "num_tokens": 452055.0, "step": 3880 }, { "epoch": 0.78, "grad_norm": 0.025726528838276863, "learning_rate": 0.00012202, "loss": 0.1575, "mean_token_accuracy": 0.937316569685936, "num_tokens": 454397.0, "step": 3900 }, { "epoch": 0.784, "grad_norm": 0.019859878346323967, "learning_rate": 0.00012162, "loss": 0.1601, "mean_token_accuracy": 0.9387007981538773, "num_tokens": 456702.0, "step": 3920 }, { "epoch": 0.788, "grad_norm": 0.013250220566987991, "learning_rate": 0.00012122, "loss": 0.1574, "mean_token_accuracy": 0.936699178814888, "num_tokens": 459047.0, "step": 3940 }, { "epoch": 0.792, "grad_norm": 0.013169913552701473, "learning_rate": 0.00012082, "loss": 0.1594, "mean_token_accuracy": 0.936829337477684, "num_tokens": 461362.0, "step": 3960 }, { "epoch": 0.796, "grad_norm": 0.012993971817195415, "learning_rate": 0.00012042, "loss": 0.1601, "mean_token_accuracy": 0.9353927552700043, "num_tokens": 463667.0, "step": 3980 }, { "epoch": 0.8, "grad_norm": 0.013329907320439816, "learning_rate": 0.00012001999999999999, "loss": 0.1585, "mean_token_accuracy": 0.9350113153457642, "num_tokens": 465998.0, "step": 4000 }, { "epoch": 0.804, "grad_norm": 0.01984967105090618, "learning_rate": 0.00011962, "loss": 0.1585, "mean_token_accuracy": 0.9361959755420685, "num_tokens": 468327.0, "step": 4020 }, { "epoch": 0.808, "grad_norm": 0.01188992615789175, "learning_rate": 0.00011922, "loss": 0.1574, "mean_token_accuracy": 0.9366391479969025, "num_tokens": 470674.0, "step": 4040 }, { "epoch": 0.812, "grad_norm": 0.011742761358618736, "learning_rate": 0.00011882, "loss": 0.1589, "mean_token_accuracy": 0.9355545520782471, "num_tokens": 472995.0, "step": 4060 }, { "epoch": 0.816, "grad_norm": 0.014725361950695515, "learning_rate": 0.00011842, "loss": 0.1579, "mean_token_accuracy": 0.9372966170310975, "num_tokens": 475328.0, "step": 4080 }, { "epoch": 0.82, "grad_norm": 0.013188617303967476, "learning_rate": 0.00011802, "loss": 0.1591, "mean_token_accuracy": 0.9364562898874282, "num_tokens": 477648.0, "step": 4100 }, { "epoch": 0.824, "grad_norm": 0.01250818558037281, "learning_rate": 0.00011762, "loss": 0.1599, "mean_token_accuracy": 0.9361166715621948, "num_tokens": 479955.0, "step": 4120 }, { "epoch": 0.828, "grad_norm": 0.012422624975442886, "learning_rate": 0.00011721999999999999, "loss": 0.1578, "mean_token_accuracy": 0.9352377116680145, "num_tokens": 482295.0, "step": 4140 }, { "epoch": 0.832, "grad_norm": 0.013321579433977604, "learning_rate": 0.00011682, "loss": 0.1585, "mean_token_accuracy": 0.9374675869941711, "num_tokens": 484619.0, "step": 4160 }, { "epoch": 0.836, "grad_norm": 0.010815759189426899, "learning_rate": 0.00011642, "loss": 0.158, "mean_token_accuracy": 0.937660351395607, "num_tokens": 486953.0, "step": 4180 }, { "epoch": 0.84, "grad_norm": 0.014364980161190033, "learning_rate": 0.00011601999999999999, "loss": 0.1612, "mean_token_accuracy": 0.9387158721685409, "num_tokens": 489241.0, "step": 4200 }, { "epoch": 0.844, "grad_norm": 0.01238577626645565, "learning_rate": 0.00011562, "loss": 0.1581, "mean_token_accuracy": 0.9387918084859848, "num_tokens": 491572.0, "step": 4220 }, { "epoch": 0.848, "grad_norm": 0.01074713934212923, "learning_rate": 0.00011522, "loss": 0.1579, "mean_token_accuracy": 0.9364918410778046, "num_tokens": 493911.0, "step": 4240 }, { "epoch": 0.852, "grad_norm": 0.018538950011134148, "learning_rate": 0.00011482000000000002, "loss": 0.1596, "mean_token_accuracy": 0.9368883103132248, "num_tokens": 496218.0, "step": 4260 }, { "epoch": 0.856, "grad_norm": 0.012234380468726158, "learning_rate": 0.00011442000000000002, "loss": 0.1563, "mean_token_accuracy": 0.9358471721410752, "num_tokens": 498579.0, "step": 4280 }, { "epoch": 0.86, "grad_norm": 0.012620363384485245, "learning_rate": 0.00011402000000000001, "loss": 0.1584, "mean_token_accuracy": 0.9370616048574447, "num_tokens": 500910.0, "step": 4300 }, { "epoch": 0.864, "grad_norm": 0.014111168682575226, "learning_rate": 0.00011362000000000001, "loss": 0.1599, "mean_token_accuracy": 0.9376936018466949, "num_tokens": 503215.0, "step": 4320 }, { "epoch": 0.868, "grad_norm": 0.01211662869900465, "learning_rate": 0.00011322000000000002, "loss": 0.1582, "mean_token_accuracy": 0.9375888526439666, "num_tokens": 505546.0, "step": 4340 }, { "epoch": 0.872, "grad_norm": 0.012831459753215313, "learning_rate": 0.00011282000000000002, "loss": 0.1603, "mean_token_accuracy": 0.9398003369569778, "num_tokens": 507842.0, "step": 4360 }, { "epoch": 0.876, "grad_norm": 0.018729697912931442, "learning_rate": 0.00011242000000000001, "loss": 0.1568, "mean_token_accuracy": 0.9386621713638306, "num_tokens": 510196.0, "step": 4380 }, { "epoch": 0.88, "grad_norm": 0.020225845277309418, "learning_rate": 0.00011202000000000002, "loss": 0.1586, "mean_token_accuracy": 0.9374362021684647, "num_tokens": 512521.0, "step": 4400 }, { "epoch": 0.884, "grad_norm": 0.03628004714846611, "learning_rate": 0.00011162000000000002, "loss": 0.1598, "mean_token_accuracy": 0.9386918365955352, "num_tokens": 514827.0, "step": 4420 }, { "epoch": 0.888, "grad_norm": 0.02236984297633171, "learning_rate": 0.00011122000000000001, "loss": 0.1574, "mean_token_accuracy": 0.9361942201852799, "num_tokens": 517171.0, "step": 4440 }, { "epoch": 0.892, "grad_norm": 0.015914643183350563, "learning_rate": 0.00011082000000000001, "loss": 0.1569, "mean_token_accuracy": 0.9363924354314804, "num_tokens": 519523.0, "step": 4460 }, { "epoch": 0.896, "grad_norm": 0.011004773899912834, "learning_rate": 0.00011042000000000002, "loss": 0.1575, "mean_token_accuracy": 0.9354314595460892, "num_tokens": 521871.0, "step": 4480 }, { "epoch": 0.9, "grad_norm": 0.01320959534496069, "learning_rate": 0.00011002000000000001, "loss": 0.1582, "mean_token_accuracy": 0.9367985218763352, "num_tokens": 524205.0, "step": 4500 }, { "epoch": 0.904, "grad_norm": 0.016331197693943977, "learning_rate": 0.00010962000000000001, "loss": 0.1577, "mean_token_accuracy": 0.9359941184520721, "num_tokens": 526544.0, "step": 4520 }, { "epoch": 0.908, "grad_norm": 0.011050857603549957, "learning_rate": 0.00010922000000000001, "loss": 0.1577, "mean_token_accuracy": 0.9339568525552749, "num_tokens": 528887.0, "step": 4540 }, { "epoch": 0.912, "grad_norm": 0.010212858207523823, "learning_rate": 0.00010882, "loss": 0.1573, "mean_token_accuracy": 0.9375766962766647, "num_tokens": 531234.0, "step": 4560 }, { "epoch": 0.916, "grad_norm": 0.02018802985548973, "learning_rate": 0.00010842000000000001, "loss": 0.1569, "mean_token_accuracy": 0.9405841529369354, "num_tokens": 533583.0, "step": 4580 }, { "epoch": 0.92, "grad_norm": 0.020224014297127724, "learning_rate": 0.00010802000000000001, "loss": 0.159, "mean_token_accuracy": 0.9367424637079239, "num_tokens": 535905.0, "step": 4600 }, { "epoch": 0.924, "grad_norm": 0.01311077456921339, "learning_rate": 0.00010762, "loss": 0.1604, "mean_token_accuracy": 0.9359079092741013, "num_tokens": 538208.0, "step": 4620 }, { "epoch": 0.928, "grad_norm": 0.012784361839294434, "learning_rate": 0.00010722000000000001, "loss": 0.1588, "mean_token_accuracy": 0.933400297164917, "num_tokens": 540532.0, "step": 4640 }, { "epoch": 0.932, "grad_norm": 0.01144441869109869, "learning_rate": 0.00010682000000000001, "loss": 0.1576, "mean_token_accuracy": 0.9386591941118241, "num_tokens": 542872.0, "step": 4660 }, { "epoch": 0.936, "grad_norm": 0.012654446065425873, "learning_rate": 0.00010642000000000001, "loss": 0.1582, "mean_token_accuracy": 0.9370446026325225, "num_tokens": 545204.0, "step": 4680 }, { "epoch": 0.94, "grad_norm": 0.011317115277051926, "learning_rate": 0.00010602, "loss": 0.1561, "mean_token_accuracy": 0.940504989027977, "num_tokens": 547566.0, "step": 4700 }, { "epoch": 0.944, "grad_norm": 0.010913815349340439, "learning_rate": 0.00010562000000000001, "loss": 0.1594, "mean_token_accuracy": 0.9370900303125381, "num_tokens": 549883.0, "step": 4720 }, { "epoch": 0.948, "grad_norm": 0.013113110326230526, "learning_rate": 0.00010522000000000001, "loss": 0.1591, "mean_token_accuracy": 0.9380158931016922, "num_tokens": 552201.0, "step": 4740 }, { "epoch": 0.952, "grad_norm": 0.011078396812081337, "learning_rate": 0.00010482, "loss": 0.1576, "mean_token_accuracy": 0.9394772380590439, "num_tokens": 554539.0, "step": 4760 }, { "epoch": 0.956, "grad_norm": 0.011660384945571423, "learning_rate": 0.00010442, "loss": 0.1601, "mean_token_accuracy": 0.9355088382959366, "num_tokens": 556846.0, "step": 4780 }, { "epoch": 0.96, "grad_norm": 0.011648285202682018, "learning_rate": 0.00010402000000000001, "loss": 0.1573, "mean_token_accuracy": 0.9378848969936371, "num_tokens": 559191.0, "step": 4800 }, { "epoch": 0.964, "grad_norm": 0.012828272767364979, "learning_rate": 0.00010362, "loss": 0.1566, "mean_token_accuracy": 0.936909893155098, "num_tokens": 561548.0, "step": 4820 }, { "epoch": 0.968, "grad_norm": 0.015896698459982872, "learning_rate": 0.00010322, "loss": 0.1596, "mean_token_accuracy": 0.9348096013069153, "num_tokens": 563862.0, "step": 4840 }, { "epoch": 0.972, "grad_norm": 0.016044626012444496, "learning_rate": 0.00010282000000000001, "loss": 0.1606, "mean_token_accuracy": 0.9357615917921066, "num_tokens": 566161.0, "step": 4860 }, { "epoch": 0.976, "grad_norm": 0.010811380110681057, "learning_rate": 0.00010242, "loss": 0.1576, "mean_token_accuracy": 0.9412378251552582, "num_tokens": 568500.0, "step": 4880 }, { "epoch": 0.98, "grad_norm": 0.01213027909398079, "learning_rate": 0.00010202, "loss": 0.1589, "mean_token_accuracy": 0.9359610795974731, "num_tokens": 570823.0, "step": 4900 }, { "epoch": 0.984, "grad_norm": 0.012461444362998009, "learning_rate": 0.00010162, "loss": 0.1578, "mean_token_accuracy": 0.9338973581790924, "num_tokens": 573165.0, "step": 4920 }, { "epoch": 0.988, "grad_norm": 0.009025659412145615, "learning_rate": 0.00010122000000000001, "loss": 0.1568, "mean_token_accuracy": 0.9369413673877716, "num_tokens": 575520.0, "step": 4940 }, { "epoch": 0.992, "grad_norm": 0.010953028686344624, "learning_rate": 0.00010082, "loss": 0.1591, "mean_token_accuracy": 0.9341312050819397, "num_tokens": 577840.0, "step": 4960 }, { "epoch": 0.996, "grad_norm": 0.00946497917175293, "learning_rate": 0.00010042, "loss": 0.1575, "mean_token_accuracy": 0.9408338129520416, "num_tokens": 580179.0, "step": 4980 }, { "epoch": 1.0, "grad_norm": 0.009777408093214035, "learning_rate": 0.00010002000000000001, "loss": 0.159, "mean_token_accuracy": 0.935635381937027, "num_tokens": 582500.0, "step": 5000 } ], "logging_steps": 20, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7056874607118336.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }