{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 2.131298542022705, "learning_rate": 0.00019962000000000002, "loss": 2.0121, "mean_token_accuracy": 0.6703190118074417, "num_tokens": 2348.0, "step": 20 }, { "epoch": 0.008, "grad_norm": 1.2890418767929077, "learning_rate": 0.00019922, "loss": 0.2751, "mean_token_accuracy": 0.9120438575744629, "num_tokens": 4697.0, "step": 40 }, { "epoch": 0.012, "grad_norm": 0.8126867413520813, "learning_rate": 0.00019882, "loss": 0.1966, "mean_token_accuracy": 0.9203487157821655, "num_tokens": 7014.0, "step": 60 }, { "epoch": 0.016, "grad_norm": 0.6051881313323975, "learning_rate": 0.00019842000000000001, "loss": 0.1851, "mean_token_accuracy": 0.9291799515485764, "num_tokens": 9327.0, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.6378348469734192, "learning_rate": 0.00019802, "loss": 0.1766, "mean_token_accuracy": 0.9286984890699387, "num_tokens": 11670.0, "step": 100 }, { "epoch": 0.024, "grad_norm": 0.624138593673706, "learning_rate": 0.00019762, "loss": 0.1784, "mean_token_accuracy": 0.9264282643795013, "num_tokens": 14000.0, "step": 120 }, { "epoch": 0.028, "grad_norm": 0.2111046463251114, "learning_rate": 0.00019722, "loss": 0.1702, "mean_token_accuracy": 0.9321970880031586, "num_tokens": 16329.0, "step": 140 }, { "epoch": 0.032, "grad_norm": 0.5350440740585327, "learning_rate": 0.00019682, "loss": 0.171, "mean_token_accuracy": 0.9311463803052902, "num_tokens": 18667.0, "step": 160 }, { "epoch": 0.036, "grad_norm": 0.19237647950649261, "learning_rate": 0.00019642, "loss": 0.167, "mean_token_accuracy": 0.9345656305551528, "num_tokens": 20985.0, "step": 180 }, { "epoch": 0.04, "grad_norm": 0.28153756260871887, "learning_rate": 0.00019602, "loss": 0.1674, "mean_token_accuracy": 0.9329667061567306, "num_tokens": 23325.0, "step": 200 }, { "epoch": 0.044, "grad_norm": 0.8545331954956055, "learning_rate": 0.00019562, "loss": 0.166, "mean_token_accuracy": 0.9344212204217911, "num_tokens": 25670.0, "step": 220 }, { "epoch": 0.048, "grad_norm": 0.24941129982471466, "learning_rate": 0.00019522, "loss": 0.1661, "mean_token_accuracy": 0.934859549999237, "num_tokens": 28016.0, "step": 240 }, { "epoch": 0.052, "grad_norm": 0.29549548029899597, "learning_rate": 0.00019482, "loss": 0.1707, "mean_token_accuracy": 0.9345517784357071, "num_tokens": 30345.0, "step": 260 }, { "epoch": 0.056, "grad_norm": 0.20388178527355194, "learning_rate": 0.00019442, "loss": 0.1673, "mean_token_accuracy": 0.9353183209896088, "num_tokens": 32691.0, "step": 280 }, { "epoch": 0.06, "grad_norm": 0.10762794315814972, "learning_rate": 0.00019402, "loss": 0.1642, "mean_token_accuracy": 0.9325390756130219, "num_tokens": 35030.0, "step": 300 }, { "epoch": 0.064, "grad_norm": 0.07676753401756287, "learning_rate": 0.00019362, "loss": 0.1633, "mean_token_accuracy": 0.9348760217428207, "num_tokens": 37359.0, "step": 320 }, { "epoch": 0.068, "grad_norm": 0.06781225651502609, "learning_rate": 0.00019322, "loss": 0.1589, "mean_token_accuracy": 0.936154904961586, "num_tokens": 39707.0, "step": 340 }, { "epoch": 0.072, "grad_norm": 0.10010460019111633, "learning_rate": 0.00019282000000000001, "loss": 0.1583, "mean_token_accuracy": 0.9410124599933625, "num_tokens": 42071.0, "step": 360 }, { "epoch": 0.076, "grad_norm": 0.07932794839143753, "learning_rate": 0.00019242, "loss": 0.1608, "mean_token_accuracy": 0.9380002528429031, "num_tokens": 44404.0, "step": 380 }, { "epoch": 0.08, "grad_norm": 0.06678586453199387, "learning_rate": 0.00019202, "loss": 0.1633, "mean_token_accuracy": 0.9347729980945587, "num_tokens": 46715.0, "step": 400 }, { "epoch": 0.084, "grad_norm": 0.05118393525481224, "learning_rate": 0.00019162, "loss": 0.1621, "mean_token_accuracy": 0.9356200367212295, "num_tokens": 49030.0, "step": 420 }, { "epoch": 0.088, "grad_norm": 0.07563836127519608, "learning_rate": 0.00019122, "loss": 0.1603, "mean_token_accuracy": 0.9362942427396774, "num_tokens": 51366.0, "step": 440 }, { "epoch": 0.092, "grad_norm": 0.053388580679893494, "learning_rate": 0.00019082, "loss": 0.1585, "mean_token_accuracy": 0.9377258807420731, "num_tokens": 53706.0, "step": 460 }, { "epoch": 0.096, "grad_norm": 0.05659119412302971, "learning_rate": 0.00019042, "loss": 0.1575, "mean_token_accuracy": 0.937972965836525, "num_tokens": 56052.0, "step": 480 }, { "epoch": 0.1, "grad_norm": 0.04934714362025261, "learning_rate": 0.00019002, "loss": 0.1606, "mean_token_accuracy": 0.9356311202049256, "num_tokens": 58374.0, "step": 500 }, { "epoch": 0.104, "grad_norm": 0.05647804215550423, "learning_rate": 0.00018962000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9353525519371033, "num_tokens": 60706.0, "step": 520 }, { "epoch": 0.108, "grad_norm": 0.058523017913103104, "learning_rate": 0.00018922, "loss": 0.1595, "mean_token_accuracy": 0.9371987581253052, "num_tokens": 63021.0, "step": 540 }, { "epoch": 0.112, "grad_norm": 0.03793497756123543, "learning_rate": 0.00018882000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9341553807258606, "num_tokens": 65369.0, "step": 560 }, { "epoch": 0.116, "grad_norm": 0.04743633046746254, "learning_rate": 0.00018842000000000002, "loss": 0.1586, "mean_token_accuracy": 0.936157900094986, "num_tokens": 67700.0, "step": 580 }, { "epoch": 0.12, "grad_norm": 0.05463261529803276, "learning_rate": 0.00018802, "loss": 0.1578, "mean_token_accuracy": 0.937694975733757, "num_tokens": 70038.0, "step": 600 }, { "epoch": 0.124, "grad_norm": 0.11279874294996262, "learning_rate": 0.00018762000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9382745862007141, "num_tokens": 72367.0, "step": 620 }, { "epoch": 0.128, "grad_norm": 0.050823234021663666, "learning_rate": 0.00018722, "loss": 0.1602, "mean_token_accuracy": 0.9388011395931244, "num_tokens": 74677.0, "step": 640 }, { "epoch": 0.132, "grad_norm": 0.04983159899711609, "learning_rate": 0.00018682000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9374112606048584, "num_tokens": 77014.0, "step": 660 }, { "epoch": 0.136, "grad_norm": 0.05084273964166641, "learning_rate": 0.00018642000000000002, "loss": 0.1591, "mean_token_accuracy": 0.9387890577316285, "num_tokens": 79339.0, "step": 680 }, { "epoch": 0.14, "grad_norm": 0.04936506226658821, "learning_rate": 0.00018602, "loss": 0.157, "mean_token_accuracy": 0.9359721839427948, "num_tokens": 81688.0, "step": 700 }, { "epoch": 0.144, "grad_norm": 0.04119481146335602, "learning_rate": 0.00018562000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9359663873910904, "num_tokens": 84025.0, "step": 720 }, { "epoch": 0.148, "grad_norm": 0.03988514095544815, "learning_rate": 0.00018522000000000002, "loss": 0.1595, "mean_token_accuracy": 0.9372004926204681, "num_tokens": 86341.0, "step": 740 }, { "epoch": 0.152, "grad_norm": 0.04381653666496277, "learning_rate": 0.00018482, "loss": 0.1591, "mean_token_accuracy": 0.9373360633850097, "num_tokens": 88665.0, "step": 760 }, { "epoch": 0.156, "grad_norm": 0.03504428267478943, "learning_rate": 0.00018442000000000003, "loss": 0.1579, "mean_token_accuracy": 0.9403579801321029, "num_tokens": 90997.0, "step": 780 }, { "epoch": 0.16, "grad_norm": 0.03745226562023163, "learning_rate": 0.00018402000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9388439536094666, "num_tokens": 93322.0, "step": 800 }, { "epoch": 0.164, "grad_norm": 0.033838506788015366, "learning_rate": 0.00018362, "loss": 0.1562, "mean_token_accuracy": 0.9360319077968597, "num_tokens": 95688.0, "step": 820 }, { "epoch": 0.168, "grad_norm": 0.0314440056681633, "learning_rate": 0.00018322000000000002, "loss": 0.1596, "mean_token_accuracy": 0.9349893003702163, "num_tokens": 98005.0, "step": 840 }, { "epoch": 0.172, "grad_norm": 0.06577116250991821, "learning_rate": 0.00018282000000000001, "loss": 0.1589, "mean_token_accuracy": 0.9364802747964859, "num_tokens": 100327.0, "step": 860 }, { "epoch": 0.176, "grad_norm": 0.029735982418060303, "learning_rate": 0.00018242, "loss": 0.1576, "mean_token_accuracy": 0.9365677893161773, "num_tokens": 102671.0, "step": 880 }, { "epoch": 0.18, "grad_norm": 0.03155644237995148, "learning_rate": 0.00018202000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9387517213821411, "num_tokens": 104996.0, "step": 900 }, { "epoch": 0.184, "grad_norm": 0.027411427348852158, "learning_rate": 0.00018162, "loss": 0.1581, "mean_token_accuracy": 0.9342827945947647, "num_tokens": 107337.0, "step": 920 }, { "epoch": 0.188, "grad_norm": 0.024014495313167572, "learning_rate": 0.00018122, "loss": 0.1583, "mean_token_accuracy": 0.9355534523725509, "num_tokens": 109674.0, "step": 940 }, { "epoch": 0.192, "grad_norm": 0.02990046516060829, "learning_rate": 0.00018082000000000002, "loss": 0.1577, "mean_token_accuracy": 0.9373938798904419, "num_tokens": 112015.0, "step": 960 }, { "epoch": 0.196, "grad_norm": 0.020676879212260246, "learning_rate": 0.00018042, "loss": 0.1582, "mean_token_accuracy": 0.9363098949193954, "num_tokens": 114349.0, "step": 980 }, { "epoch": 0.2, "grad_norm": 0.024556942284107208, "learning_rate": 0.00018002, "loss": 0.159, "mean_token_accuracy": 0.9355833351612091, "num_tokens": 116672.0, "step": 1000 }, { "epoch": 0.204, "grad_norm": 0.22343556582927704, "learning_rate": 0.00017962000000000002, "loss": 0.157, "mean_token_accuracy": 0.9363467574119568, "num_tokens": 119023.0, "step": 1020 }, { "epoch": 0.208, "grad_norm": 0.515048623085022, "learning_rate": 0.00017922, "loss": 0.1585, "mean_token_accuracy": 0.9374267637729645, "num_tokens": 121366.0, "step": 1040 }, { "epoch": 0.212, "grad_norm": 0.2672664225101471, "learning_rate": 0.00017882, "loss": 0.1704, "mean_token_accuracy": 0.938492265343666, "num_tokens": 123719.0, "step": 1060 }, { "epoch": 0.216, "grad_norm": 0.11909265071153641, "learning_rate": 0.00017842000000000002, "loss": 0.1752, "mean_token_accuracy": 0.9297878712415695, "num_tokens": 126053.0, "step": 1080 }, { "epoch": 0.22, "grad_norm": 0.11977271735668182, "learning_rate": 0.00017802, "loss": 0.1652, "mean_token_accuracy": 0.9360411554574967, "num_tokens": 128379.0, "step": 1100 }, { "epoch": 0.224, "grad_norm": 0.17722292244434357, "learning_rate": 0.00017762, "loss": 0.1697, "mean_token_accuracy": 0.9314894318580628, "num_tokens": 130700.0, "step": 1120 }, { "epoch": 0.228, "grad_norm": 0.8375388979911804, "learning_rate": 0.00017722000000000001, "loss": 0.1895, "mean_token_accuracy": 0.9316652357578278, "num_tokens": 133026.0, "step": 1140 }, { "epoch": 0.232, "grad_norm": 0.10569056123495102, "learning_rate": 0.00017682, "loss": 0.1679, "mean_token_accuracy": 0.9333775132894516, "num_tokens": 135371.0, "step": 1160 }, { "epoch": 0.236, "grad_norm": 0.07626856118440628, "learning_rate": 0.00017642, "loss": 0.1613, "mean_token_accuracy": 0.9380175620317459, "num_tokens": 137695.0, "step": 1180 }, { "epoch": 0.24, "grad_norm": 0.06852507591247559, "learning_rate": 0.00017602, "loss": 0.1693, "mean_token_accuracy": 0.9329774439334869, "num_tokens": 140007.0, "step": 1200 }, { "epoch": 0.244, "grad_norm": 0.11691898107528687, "learning_rate": 0.00017562, "loss": 0.159, "mean_token_accuracy": 0.9360336065292358, "num_tokens": 142345.0, "step": 1220 }, { "epoch": 0.248, "grad_norm": 0.10780195891857147, "learning_rate": 0.00017522000000000002, "loss": 0.1586, "mean_token_accuracy": 0.9376411676406861, "num_tokens": 144693.0, "step": 1240 }, { "epoch": 0.252, "grad_norm": 0.07631397247314453, "learning_rate": 0.00017482, "loss": 0.1633, "mean_token_accuracy": 0.9390978574752807, "num_tokens": 147031.0, "step": 1260 }, { "epoch": 0.256, "grad_norm": 0.05569858103990555, "learning_rate": 0.00017442, "loss": 0.1615, "mean_token_accuracy": 0.9400094121694564, "num_tokens": 149371.0, "step": 1280 }, { "epoch": 0.26, "grad_norm": 0.04815123230218887, "learning_rate": 0.00017402000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9370361328125, "num_tokens": 151716.0, "step": 1300 }, { "epoch": 0.264, "grad_norm": 0.11904877424240112, "learning_rate": 0.00017362, "loss": 0.159, "mean_token_accuracy": 0.941026845574379, "num_tokens": 154046.0, "step": 1320 }, { "epoch": 0.268, "grad_norm": 0.12382964044809341, "learning_rate": 0.00017322, "loss": 0.1594, "mean_token_accuracy": 0.9340965986251831, "num_tokens": 156382.0, "step": 1340 }, { "epoch": 0.272, "grad_norm": 0.05523005872964859, "learning_rate": 0.00017282000000000002, "loss": 0.1593, "mean_token_accuracy": 0.9396583586931229, "num_tokens": 158705.0, "step": 1360 }, { "epoch": 0.276, "grad_norm": 0.05591598525643349, "learning_rate": 0.00017242, "loss": 0.1592, "mean_token_accuracy": 0.9364501267671586, "num_tokens": 161025.0, "step": 1380 }, { "epoch": 0.28, "grad_norm": 0.06780663877725601, "learning_rate": 0.00017202, "loss": 0.1618, "mean_token_accuracy": 0.936437115073204, "num_tokens": 163345.0, "step": 1400 }, { "epoch": 0.284, "grad_norm": 0.03291817009449005, "learning_rate": 0.00017162000000000001, "loss": 0.1569, "mean_token_accuracy": 0.9380638599395752, "num_tokens": 165711.0, "step": 1420 }, { "epoch": 0.288, "grad_norm": 0.04884820431470871, "learning_rate": 0.00017122, "loss": 0.1608, "mean_token_accuracy": 0.9370934247970581, "num_tokens": 168007.0, "step": 1440 }, { "epoch": 0.292, "grad_norm": 0.04577581211924553, "learning_rate": 0.00017082, "loss": 0.1595, "mean_token_accuracy": 0.9374603897333145, "num_tokens": 170332.0, "step": 1460 }, { "epoch": 0.296, "grad_norm": 0.03866467997431755, "learning_rate": 0.00017042, "loss": 0.1582, "mean_token_accuracy": 0.9355076909065246, "num_tokens": 172667.0, "step": 1480 }, { "epoch": 0.3, "grad_norm": 0.06204424798488617, "learning_rate": 0.00017002, "loss": 0.1599, "mean_token_accuracy": 0.9372841835021972, "num_tokens": 174970.0, "step": 1500 }, { "epoch": 0.304, "grad_norm": 0.03288702666759491, "learning_rate": 0.00016962, "loss": 0.1578, "mean_token_accuracy": 0.93585424721241, "num_tokens": 177317.0, "step": 1520 }, { "epoch": 0.308, "grad_norm": 0.03605024516582489, "learning_rate": 0.00016922, "loss": 0.1597, "mean_token_accuracy": 0.9366346269845962, "num_tokens": 179629.0, "step": 1540 }, { "epoch": 0.312, "grad_norm": 0.03328383341431618, "learning_rate": 0.00016882, "loss": 0.1582, "mean_token_accuracy": 0.9372009009122848, "num_tokens": 181960.0, "step": 1560 }, { "epoch": 0.316, "grad_norm": 0.03522924706339836, "learning_rate": 0.00016842, "loss": 0.1572, "mean_token_accuracy": 0.9355730235576629, "num_tokens": 184314.0, "step": 1580 }, { "epoch": 0.32, "grad_norm": 0.0317777544260025, "learning_rate": 0.00016802, "loss": 0.1572, "mean_token_accuracy": 0.9378984242677688, "num_tokens": 186658.0, "step": 1600 }, { "epoch": 0.324, "grad_norm": 0.07111163437366486, "learning_rate": 0.00016762, "loss": 0.1613, "mean_token_accuracy": 0.9344337552785873, "num_tokens": 188974.0, "step": 1620 }, { "epoch": 0.328, "grad_norm": 0.04765714704990387, "learning_rate": 0.00016722, "loss": 0.1608, "mean_token_accuracy": 0.9345141768455505, "num_tokens": 191276.0, "step": 1640 }, { "epoch": 0.332, "grad_norm": 0.041960619390010834, "learning_rate": 0.00016682, "loss": 0.1594, "mean_token_accuracy": 0.9375987917184829, "num_tokens": 193597.0, "step": 1660 }, { "epoch": 0.336, "grad_norm": 0.041757769882678986, "learning_rate": 0.00016642, "loss": 0.157, "mean_token_accuracy": 0.9367031455039978, "num_tokens": 195949.0, "step": 1680 }, { "epoch": 0.34, "grad_norm": 0.05323236435651779, "learning_rate": 0.00016601999999999999, "loss": 0.1654, "mean_token_accuracy": 0.9359392642974853, "num_tokens": 198267.0, "step": 1700 }, { "epoch": 0.344, "grad_norm": 0.08934314548969269, "learning_rate": 0.00016562, "loss": 0.1619, "mean_token_accuracy": 0.9331722050905228, "num_tokens": 200613.0, "step": 1720 }, { "epoch": 0.348, "grad_norm": 0.033347178250551224, "learning_rate": 0.00016522, "loss": 0.1611, "mean_token_accuracy": 0.9317500472068787, "num_tokens": 202927.0, "step": 1740 }, { "epoch": 0.352, "grad_norm": 0.03238425776362419, "learning_rate": 0.00016482, "loss": 0.1582, "mean_token_accuracy": 0.9358267247676849, "num_tokens": 205273.0, "step": 1760 }, { "epoch": 0.356, "grad_norm": 0.03249628096818924, "learning_rate": 0.00016442000000000003, "loss": 0.1591, "mean_token_accuracy": 0.9367831707000732, "num_tokens": 207595.0, "step": 1780 }, { "epoch": 0.36, "grad_norm": 0.034572117030620575, "learning_rate": 0.00016402000000000002, "loss": 0.1612, "mean_token_accuracy": 0.9356453567743301, "num_tokens": 209892.0, "step": 1800 }, { "epoch": 0.364, "grad_norm": 0.04566624388098717, "learning_rate": 0.00016362, "loss": 0.1605, "mean_token_accuracy": 0.9359289228916168, "num_tokens": 212194.0, "step": 1820 }, { "epoch": 0.368, "grad_norm": 0.029180865734815598, "learning_rate": 0.00016322000000000003, "loss": 0.1579, "mean_token_accuracy": 0.9378338158130646, "num_tokens": 214535.0, "step": 1840 }, { "epoch": 0.372, "grad_norm": 0.04812979698181152, "learning_rate": 0.00016282000000000002, "loss": 0.1576, "mean_token_accuracy": 0.9391636937856674, "num_tokens": 216872.0, "step": 1860 }, { "epoch": 0.376, "grad_norm": 0.06872449070215225, "learning_rate": 0.00016242, "loss": 0.1606, "mean_token_accuracy": 0.9389324098825454, "num_tokens": 219184.0, "step": 1880 }, { "epoch": 0.38, "grad_norm": 0.05308040603995323, "learning_rate": 0.00016202000000000002, "loss": 0.1592, "mean_token_accuracy": 0.9385550439357757, "num_tokens": 221499.0, "step": 1900 }, { "epoch": 0.384, "grad_norm": 0.13082465529441833, "learning_rate": 0.00016162000000000001, "loss": 0.1579, "mean_token_accuracy": 0.9380007416009903, "num_tokens": 223847.0, "step": 1920 }, { "epoch": 0.388, "grad_norm": 0.03414028137922287, "learning_rate": 0.00016122, "loss": 0.1602, "mean_token_accuracy": 0.9369382321834564, "num_tokens": 226156.0, "step": 1940 }, { "epoch": 0.392, "grad_norm": 0.04112333804368973, "learning_rate": 0.00016082000000000002, "loss": 0.1597, "mean_token_accuracy": 0.935492268204689, "num_tokens": 228475.0, "step": 1960 }, { "epoch": 0.396, "grad_norm": 0.02955610118806362, "learning_rate": 0.00016042, "loss": 0.1591, "mean_token_accuracy": 0.93908212184906, "num_tokens": 230792.0, "step": 1980 }, { "epoch": 0.4, "grad_norm": 0.024307863786816597, "learning_rate": 0.00016002, "loss": 0.1595, "mean_token_accuracy": 0.9393438696861267, "num_tokens": 233105.0, "step": 2000 }, { "epoch": 0.404, "grad_norm": 0.031049860641360283, "learning_rate": 0.00015962000000000002, "loss": 0.157, "mean_token_accuracy": 0.9350471049547195, "num_tokens": 235457.0, "step": 2020 }, { "epoch": 0.408, "grad_norm": 0.031201306730508804, "learning_rate": 0.00015922, "loss": 0.1597, "mean_token_accuracy": 0.9370091885328293, "num_tokens": 237770.0, "step": 2040 }, { "epoch": 0.412, "grad_norm": 0.03218454122543335, "learning_rate": 0.00015882, "loss": 0.159, "mean_token_accuracy": 0.9386389076709747, "num_tokens": 240090.0, "step": 2060 }, { "epoch": 0.416, "grad_norm": 0.03348623961210251, "learning_rate": 0.00015842000000000002, "loss": 0.1597, "mean_token_accuracy": 0.9362810254096985, "num_tokens": 242393.0, "step": 2080 }, { "epoch": 0.42, "grad_norm": 0.04007818177342415, "learning_rate": 0.00015802, "loss": 0.1581, "mean_token_accuracy": 0.9370237767696381, "num_tokens": 244733.0, "step": 2100 }, { "epoch": 0.424, "grad_norm": 0.03378809243440628, "learning_rate": 0.00015762, "loss": 0.1568, "mean_token_accuracy": 0.9377921044826507, "num_tokens": 247088.0, "step": 2120 }, { "epoch": 0.428, "grad_norm": 0.02798735350370407, "learning_rate": 0.00015722000000000002, "loss": 0.1586, "mean_token_accuracy": 0.9383140057325363, "num_tokens": 249415.0, "step": 2140 }, { "epoch": 0.432, "grad_norm": 0.02400992065668106, "learning_rate": 0.00015682, "loss": 0.1581, "mean_token_accuracy": 0.9389418184757232, "num_tokens": 251751.0, "step": 2160 }, { "epoch": 0.436, "grad_norm": 0.028334975242614746, "learning_rate": 0.00015642000000000002, "loss": 0.1578, "mean_token_accuracy": 0.936533722281456, "num_tokens": 254093.0, "step": 2180 }, { "epoch": 0.44, "grad_norm": 0.024794427677989006, "learning_rate": 0.00015602000000000001, "loss": 0.1587, "mean_token_accuracy": 0.9344067484140396, "num_tokens": 256422.0, "step": 2200 }, { "epoch": 0.444, "grad_norm": 0.024761928245425224, "learning_rate": 0.00015562, "loss": 0.1589, "mean_token_accuracy": 0.936401879787445, "num_tokens": 258747.0, "step": 2220 }, { "epoch": 0.448, "grad_norm": 0.023300737142562866, "learning_rate": 0.00015522000000000002, "loss": 0.1594, "mean_token_accuracy": 0.9409920126199722, "num_tokens": 261059.0, "step": 2240 }, { "epoch": 0.452, "grad_norm": 0.023498738184571266, "learning_rate": 0.00015482, "loss": 0.1595, "mean_token_accuracy": 0.9378518283367157, "num_tokens": 263369.0, "step": 2260 }, { "epoch": 0.456, "grad_norm": 0.020731788128614426, "learning_rate": 0.00015442, "loss": 0.1586, "mean_token_accuracy": 0.9395518034696579, "num_tokens": 265691.0, "step": 2280 }, { "epoch": 0.46, "grad_norm": 0.03587990626692772, "learning_rate": 0.00015402000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9378482937812805, "num_tokens": 268016.0, "step": 2300 }, { "epoch": 0.464, "grad_norm": 0.03809090331196785, "learning_rate": 0.00015362, "loss": 0.1588, "mean_token_accuracy": 0.9367815405130386, "num_tokens": 270334.0, "step": 2320 }, { "epoch": 0.468, "grad_norm": 0.04313996061682701, "learning_rate": 0.00015322, "loss": 0.1593, "mean_token_accuracy": 0.9363488733768464, "num_tokens": 272651.0, "step": 2340 }, { "epoch": 0.472, "grad_norm": 0.033811088651418686, "learning_rate": 0.00015282000000000002, "loss": 0.1594, "mean_token_accuracy": 0.9361798793077469, "num_tokens": 274964.0, "step": 2360 }, { "epoch": 0.476, "grad_norm": 0.03164658322930336, "learning_rate": 0.00015242, "loss": 0.1598, "mean_token_accuracy": 0.9361578047275543, "num_tokens": 277276.0, "step": 2380 }, { "epoch": 0.48, "grad_norm": 0.030314739793539047, "learning_rate": 0.00015202, "loss": 0.1601, "mean_token_accuracy": 0.9364828914403915, "num_tokens": 279583.0, "step": 2400 }, { "epoch": 0.484, "grad_norm": 0.03359575197100639, "learning_rate": 0.00015162000000000002, "loss": 0.1577, "mean_token_accuracy": 0.9370669215917588, "num_tokens": 281927.0, "step": 2420 }, { "epoch": 0.488, "grad_norm": 0.025949697941541672, "learning_rate": 0.00015122, "loss": 0.1582, "mean_token_accuracy": 0.9364959686994553, "num_tokens": 284267.0, "step": 2440 }, { "epoch": 0.492, "grad_norm": 0.031149016693234444, "learning_rate": 0.00015082, "loss": 0.1589, "mean_token_accuracy": 0.9384445637464524, "num_tokens": 286585.0, "step": 2460 }, { "epoch": 0.496, "grad_norm": 0.031569018959999084, "learning_rate": 0.00015042, "loss": 0.1589, "mean_token_accuracy": 0.9359267175197601, "num_tokens": 288907.0, "step": 2480 }, { "epoch": 0.5, "grad_norm": 0.02912713773548603, "learning_rate": 0.00015002, "loss": 0.1582, "mean_token_accuracy": 0.9354292452335358, "num_tokens": 291240.0, "step": 2500 }, { "epoch": 0.504, "grad_norm": 0.029648004099726677, "learning_rate": 0.00014962, "loss": 0.1577, "mean_token_accuracy": 0.9371567130088806, "num_tokens": 293587.0, "step": 2520 }, { "epoch": 0.508, "grad_norm": 0.01994331367313862, "learning_rate": 0.00014922, "loss": 0.1565, "mean_token_accuracy": 0.9374659866094589, "num_tokens": 295947.0, "step": 2540 }, { "epoch": 0.512, "grad_norm": 0.022220291197299957, "learning_rate": 0.00014882, "loss": 0.1576, "mean_token_accuracy": 0.9352899432182312, "num_tokens": 298291.0, "step": 2560 }, { "epoch": 0.516, "grad_norm": 0.019389133900403976, "learning_rate": 0.00014842, "loss": 0.1579, "mean_token_accuracy": 0.9369660496711731, "num_tokens": 300630.0, "step": 2580 }, { "epoch": 0.52, "grad_norm": 0.025073856115341187, "learning_rate": 0.00014802, "loss": 0.1594, "mean_token_accuracy": 0.9384920775890351, "num_tokens": 302943.0, "step": 2600 }, { "epoch": 0.524, "grad_norm": 0.02601858787238598, "learning_rate": 0.00014762, "loss": 0.1589, "mean_token_accuracy": 0.9373117983341217, "num_tokens": 305265.0, "step": 2620 }, { "epoch": 0.528, "grad_norm": 0.0248605664819479, "learning_rate": 0.00014722, "loss": 0.1594, "mean_token_accuracy": 0.9345557481050492, "num_tokens": 307583.0, "step": 2640 }, { "epoch": 0.532, "grad_norm": 0.022037120535969734, "learning_rate": 0.00014682, "loss": 0.1596, "mean_token_accuracy": 0.9348031014204026, "num_tokens": 309896.0, "step": 2660 }, { "epoch": 0.536, "grad_norm": 0.03458873927593231, "learning_rate": 0.00014642, "loss": 0.1572, "mean_token_accuracy": 0.9376107037067414, "num_tokens": 312244.0, "step": 2680 }, { "epoch": 0.54, "grad_norm": 0.016396528109908104, "learning_rate": 0.00014602, "loss": 0.1582, "mean_token_accuracy": 0.9370677560567856, "num_tokens": 314578.0, "step": 2700 }, { "epoch": 0.544, "grad_norm": 0.01821085438132286, "learning_rate": 0.00014562, "loss": 0.1596, "mean_token_accuracy": 0.9343250393867493, "num_tokens": 316892.0, "step": 2720 }, { "epoch": 0.548, "grad_norm": 0.025619324296712875, "learning_rate": 0.00014522, "loss": 0.1591, "mean_token_accuracy": 0.9362540364265441, "num_tokens": 319213.0, "step": 2740 }, { "epoch": 0.552, "grad_norm": 0.01870078593492508, "learning_rate": 0.00014482, "loss": 0.1598, "mean_token_accuracy": 0.9338186293840408, "num_tokens": 321525.0, "step": 2760 }, { "epoch": 0.556, "grad_norm": 0.018730677664279938, "learning_rate": 0.00014442, "loss": 0.1566, "mean_token_accuracy": 0.9394799619913101, "num_tokens": 323880.0, "step": 2780 }, { "epoch": 0.56, "grad_norm": 0.019898803904652596, "learning_rate": 0.00014402, "loss": 0.1579, "mean_token_accuracy": 0.9384922862052918, "num_tokens": 326214.0, "step": 2800 }, { "epoch": 0.564, "grad_norm": 0.021964257583022118, "learning_rate": 0.00014362, "loss": 0.1588, "mean_token_accuracy": 0.9378354996442795, "num_tokens": 328536.0, "step": 2820 }, { "epoch": 0.568, "grad_norm": 0.02397042326629162, "learning_rate": 0.00014322, "loss": 0.157, "mean_token_accuracy": 0.9381829768419265, "num_tokens": 330888.0, "step": 2840 }, { "epoch": 0.572, "grad_norm": 0.017819812521338463, "learning_rate": 0.00014282, "loss": 0.1569, "mean_token_accuracy": 0.940555801987648, "num_tokens": 333234.0, "step": 2860 }, { "epoch": 0.576, "grad_norm": 0.019056344404816628, "learning_rate": 0.00014242, "loss": 0.1621, "mean_token_accuracy": 0.9321106940507888, "num_tokens": 335516.0, "step": 2880 }, { "epoch": 0.58, "grad_norm": 0.020357482135295868, "learning_rate": 0.00014202, "loss": 0.1586, "mean_token_accuracy": 0.9384677648544312, "num_tokens": 337843.0, "step": 2900 }, { "epoch": 0.584, "grad_norm": 0.02105647511780262, "learning_rate": 0.00014162, "loss": 0.1592, "mean_token_accuracy": 0.9367755681276322, "num_tokens": 340159.0, "step": 2920 }, { "epoch": 0.588, "grad_norm": 0.02058851346373558, "learning_rate": 0.00014122, "loss": 0.1583, "mean_token_accuracy": 0.9383471548557282, "num_tokens": 342487.0, "step": 2940 }, { "epoch": 0.592, "grad_norm": 0.022488698363304138, "learning_rate": 0.00014082, "loss": 0.1575, "mean_token_accuracy": 0.9383687317371369, "num_tokens": 344831.0, "step": 2960 }, { "epoch": 0.596, "grad_norm": 0.01753912679851055, "learning_rate": 0.00014042, "loss": 0.1583, "mean_token_accuracy": 0.9367416232824326, "num_tokens": 347163.0, "step": 2980 }, { "epoch": 0.6, "grad_norm": 0.01856599561870098, "learning_rate": 0.00014002, "loss": 0.156, "mean_token_accuracy": 0.9410459071397781, "num_tokens": 349528.0, "step": 3000 } ], "logging_steps": 20, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4234514422376448.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }