{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 2.131298542022705, "learning_rate": 0.00019962000000000002, "loss": 2.0121, "mean_token_accuracy": 0.6703190118074417, "num_tokens": 2348.0, "step": 20 }, { "epoch": 0.008, "grad_norm": 1.2890418767929077, "learning_rate": 0.00019922, "loss": 0.2751, "mean_token_accuracy": 0.9120438575744629, "num_tokens": 4697.0, "step": 40 }, { "epoch": 0.012, "grad_norm": 0.8126867413520813, "learning_rate": 0.00019882, "loss": 0.1966, "mean_token_accuracy": 0.9203487157821655, "num_tokens": 7014.0, "step": 60 }, { "epoch": 0.016, "grad_norm": 0.6051881313323975, "learning_rate": 0.00019842000000000001, "loss": 0.1851, "mean_token_accuracy": 0.9291799515485764, "num_tokens": 9327.0, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.6378348469734192, "learning_rate": 0.00019802, "loss": 0.1766, "mean_token_accuracy": 0.9286984890699387, "num_tokens": 11670.0, "step": 100 }, { "epoch": 0.024, "grad_norm": 0.624138593673706, "learning_rate": 0.00019762, "loss": 0.1784, "mean_token_accuracy": 0.9264282643795013, "num_tokens": 14000.0, "step": 120 }, { "epoch": 0.028, "grad_norm": 0.2111046463251114, "learning_rate": 0.00019722, "loss": 0.1702, "mean_token_accuracy": 0.9321970880031586, "num_tokens": 16329.0, "step": 140 }, { "epoch": 0.032, "grad_norm": 0.5350440740585327, "learning_rate": 0.00019682, "loss": 0.171, "mean_token_accuracy": 0.9311463803052902, "num_tokens": 18667.0, "step": 160 }, { "epoch": 0.036, "grad_norm": 0.19237647950649261, "learning_rate": 0.00019642, "loss": 0.167, "mean_token_accuracy": 0.9345656305551528, "num_tokens": 20985.0, "step": 180 }, { "epoch": 0.04, "grad_norm": 0.28153756260871887, "learning_rate": 0.00019602, "loss": 0.1674, "mean_token_accuracy": 0.9329667061567306, "num_tokens": 23325.0, "step": 200 }, { "epoch": 0.044, "grad_norm": 0.8545331954956055, "learning_rate": 0.00019562, "loss": 0.166, "mean_token_accuracy": 0.9344212204217911, "num_tokens": 25670.0, "step": 220 }, { "epoch": 0.048, "grad_norm": 0.24941129982471466, "learning_rate": 0.00019522, "loss": 0.1661, "mean_token_accuracy": 0.934859549999237, "num_tokens": 28016.0, "step": 240 }, { "epoch": 0.052, "grad_norm": 0.29549548029899597, "learning_rate": 0.00019482, "loss": 0.1707, "mean_token_accuracy": 0.9345517784357071, "num_tokens": 30345.0, "step": 260 }, { "epoch": 0.056, "grad_norm": 0.20388178527355194, "learning_rate": 0.00019442, "loss": 0.1673, "mean_token_accuracy": 0.9353183209896088, "num_tokens": 32691.0, "step": 280 }, { "epoch": 0.06, "grad_norm": 0.10762794315814972, "learning_rate": 0.00019402, "loss": 0.1642, "mean_token_accuracy": 0.9325390756130219, "num_tokens": 35030.0, "step": 300 }, { "epoch": 0.064, "grad_norm": 0.07676753401756287, "learning_rate": 0.00019362, "loss": 0.1633, "mean_token_accuracy": 0.9348760217428207, "num_tokens": 37359.0, "step": 320 }, { "epoch": 0.068, "grad_norm": 0.06781225651502609, "learning_rate": 0.00019322, "loss": 0.1589, "mean_token_accuracy": 0.936154904961586, "num_tokens": 39707.0, "step": 340 }, { "epoch": 0.072, "grad_norm": 0.10010460019111633, "learning_rate": 0.00019282000000000001, "loss": 0.1583, "mean_token_accuracy": 0.9410124599933625, "num_tokens": 42071.0, "step": 360 }, { "epoch": 0.076, "grad_norm": 0.07932794839143753, "learning_rate": 0.00019242, "loss": 0.1608, "mean_token_accuracy": 0.9380002528429031, "num_tokens": 44404.0, "step": 380 }, { "epoch": 0.08, "grad_norm": 0.06678586453199387, "learning_rate": 0.00019202, "loss": 0.1633, "mean_token_accuracy": 0.9347729980945587, "num_tokens": 46715.0, "step": 400 }, { "epoch": 0.084, "grad_norm": 0.05118393525481224, "learning_rate": 0.00019162, "loss": 0.1621, "mean_token_accuracy": 0.9356200367212295, "num_tokens": 49030.0, "step": 420 }, { "epoch": 0.088, "grad_norm": 0.07563836127519608, "learning_rate": 0.00019122, "loss": 0.1603, "mean_token_accuracy": 0.9362942427396774, "num_tokens": 51366.0, "step": 440 }, { "epoch": 0.092, "grad_norm": 0.053388580679893494, "learning_rate": 0.00019082, "loss": 0.1585, "mean_token_accuracy": 0.9377258807420731, "num_tokens": 53706.0, "step": 460 }, { "epoch": 0.096, "grad_norm": 0.05659119412302971, "learning_rate": 0.00019042, "loss": 0.1575, "mean_token_accuracy": 0.937972965836525, "num_tokens": 56052.0, "step": 480 }, { "epoch": 0.1, "grad_norm": 0.04934714362025261, "learning_rate": 0.00019002, "loss": 0.1606, "mean_token_accuracy": 0.9356311202049256, "num_tokens": 58374.0, "step": 500 }, { "epoch": 0.104, "grad_norm": 0.05647804215550423, "learning_rate": 0.00018962000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9353525519371033, "num_tokens": 60706.0, "step": 520 }, { "epoch": 0.108, "grad_norm": 0.058523017913103104, "learning_rate": 0.00018922, "loss": 0.1595, "mean_token_accuracy": 0.9371987581253052, "num_tokens": 63021.0, "step": 540 }, { "epoch": 0.112, "grad_norm": 0.03793497756123543, "learning_rate": 0.00018882000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9341553807258606, "num_tokens": 65369.0, "step": 560 }, { "epoch": 0.116, "grad_norm": 0.04743633046746254, "learning_rate": 0.00018842000000000002, "loss": 0.1586, "mean_token_accuracy": 0.936157900094986, "num_tokens": 67700.0, "step": 580 }, { "epoch": 0.12, "grad_norm": 0.05463261529803276, "learning_rate": 0.00018802, "loss": 0.1578, "mean_token_accuracy": 0.937694975733757, "num_tokens": 70038.0, "step": 600 }, { "epoch": 0.124, "grad_norm": 0.11279874294996262, "learning_rate": 0.00018762000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9382745862007141, "num_tokens": 72367.0, "step": 620 }, { "epoch": 0.128, "grad_norm": 0.050823234021663666, "learning_rate": 0.00018722, "loss": 0.1602, "mean_token_accuracy": 0.9388011395931244, "num_tokens": 74677.0, "step": 640 }, { "epoch": 0.132, "grad_norm": 0.04983159899711609, "learning_rate": 0.00018682000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9374112606048584, "num_tokens": 77014.0, "step": 660 }, { "epoch": 0.136, "grad_norm": 0.05084273964166641, "learning_rate": 0.00018642000000000002, "loss": 0.1591, "mean_token_accuracy": 0.9387890577316285, "num_tokens": 79339.0, "step": 680 }, { "epoch": 0.14, "grad_norm": 0.04936506226658821, "learning_rate": 0.00018602, "loss": 0.157, "mean_token_accuracy": 0.9359721839427948, "num_tokens": 81688.0, "step": 700 }, { "epoch": 0.144, "grad_norm": 0.04119481146335602, "learning_rate": 0.00018562000000000003, "loss": 0.1584, "mean_token_accuracy": 0.9359663873910904, "num_tokens": 84025.0, "step": 720 }, { "epoch": 0.148, "grad_norm": 0.03988514095544815, "learning_rate": 0.00018522000000000002, "loss": 0.1595, "mean_token_accuracy": 0.9372004926204681, "num_tokens": 86341.0, "step": 740 }, { "epoch": 0.152, "grad_norm": 0.04381653666496277, "learning_rate": 0.00018482, "loss": 0.1591, "mean_token_accuracy": 0.9373360633850097, "num_tokens": 88665.0, "step": 760 }, { "epoch": 0.156, "grad_norm": 0.03504428267478943, "learning_rate": 0.00018442000000000003, "loss": 0.1579, "mean_token_accuracy": 0.9403579801321029, "num_tokens": 90997.0, "step": 780 }, { "epoch": 0.16, "grad_norm": 0.03745226562023163, "learning_rate": 0.00018402000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9388439536094666, "num_tokens": 93322.0, "step": 800 }, { "epoch": 0.164, "grad_norm": 0.033838506788015366, "learning_rate": 0.00018362, "loss": 0.1562, "mean_token_accuracy": 0.9360319077968597, "num_tokens": 95688.0, "step": 820 }, { "epoch": 0.168, "grad_norm": 0.0314440056681633, "learning_rate": 0.00018322000000000002, "loss": 0.1596, "mean_token_accuracy": 0.9349893003702163, "num_tokens": 98005.0, "step": 840 }, { "epoch": 0.172, "grad_norm": 0.06577116250991821, "learning_rate": 0.00018282000000000001, "loss": 0.1589, "mean_token_accuracy": 0.9364802747964859, "num_tokens": 100327.0, "step": 860 }, { "epoch": 0.176, "grad_norm": 0.029735982418060303, "learning_rate": 0.00018242, "loss": 0.1576, "mean_token_accuracy": 0.9365677893161773, "num_tokens": 102671.0, "step": 880 }, { "epoch": 0.18, "grad_norm": 0.03155644237995148, "learning_rate": 0.00018202000000000002, "loss": 0.1587, "mean_token_accuracy": 0.9387517213821411, "num_tokens": 104996.0, "step": 900 }, { "epoch": 0.184, "grad_norm": 0.027411427348852158, "learning_rate": 0.00018162, "loss": 0.1581, "mean_token_accuracy": 0.9342827945947647, "num_tokens": 107337.0, "step": 920 }, { "epoch": 0.188, "grad_norm": 0.024014495313167572, "learning_rate": 0.00018122, "loss": 0.1583, "mean_token_accuracy": 0.9355534523725509, "num_tokens": 109674.0, "step": 940 }, { "epoch": 0.192, "grad_norm": 0.02990046516060829, "learning_rate": 0.00018082000000000002, "loss": 0.1577, "mean_token_accuracy": 0.9373938798904419, "num_tokens": 112015.0, "step": 960 }, { "epoch": 0.196, "grad_norm": 0.020676879212260246, "learning_rate": 0.00018042, "loss": 0.1582, "mean_token_accuracy": 0.9363098949193954, "num_tokens": 114349.0, "step": 980 }, { "epoch": 0.2, "grad_norm": 0.024556942284107208, "learning_rate": 0.00018002, "loss": 0.159, "mean_token_accuracy": 0.9355833351612091, "num_tokens": 116672.0, "step": 1000 }, { "epoch": 0.204, "grad_norm": 0.22343556582927704, "learning_rate": 0.00017962000000000002, "loss": 0.157, "mean_token_accuracy": 0.9363467574119568, "num_tokens": 119023.0, "step": 1020 }, { "epoch": 0.208, "grad_norm": 0.515048623085022, "learning_rate": 0.00017922, "loss": 0.1585, "mean_token_accuracy": 0.9374267637729645, "num_tokens": 121366.0, "step": 1040 }, { "epoch": 0.212, "grad_norm": 0.2672664225101471, "learning_rate": 0.00017882, "loss": 0.1704, "mean_token_accuracy": 0.938492265343666, "num_tokens": 123719.0, "step": 1060 }, { "epoch": 0.216, "grad_norm": 0.11909265071153641, "learning_rate": 0.00017842000000000002, "loss": 0.1752, "mean_token_accuracy": 0.9297878712415695, "num_tokens": 126053.0, "step": 1080 }, { "epoch": 0.22, "grad_norm": 0.11977271735668182, "learning_rate": 0.00017802, "loss": 0.1652, "mean_token_accuracy": 0.9360411554574967, "num_tokens": 128379.0, "step": 1100 }, { "epoch": 0.224, "grad_norm": 0.17722292244434357, "learning_rate": 0.00017762, "loss": 0.1697, "mean_token_accuracy": 0.9314894318580628, "num_tokens": 130700.0, "step": 1120 }, { "epoch": 0.228, "grad_norm": 0.8375388979911804, "learning_rate": 0.00017722000000000001, "loss": 0.1895, "mean_token_accuracy": 0.9316652357578278, "num_tokens": 133026.0, "step": 1140 }, { "epoch": 0.232, "grad_norm": 0.10569056123495102, "learning_rate": 0.00017682, "loss": 0.1679, "mean_token_accuracy": 0.9333775132894516, "num_tokens": 135371.0, "step": 1160 }, { "epoch": 0.236, "grad_norm": 0.07626856118440628, "learning_rate": 0.00017642, "loss": 0.1613, "mean_token_accuracy": 0.9380175620317459, "num_tokens": 137695.0, "step": 1180 }, { "epoch": 0.24, "grad_norm": 0.06852507591247559, "learning_rate": 0.00017602, "loss": 0.1693, "mean_token_accuracy": 0.9329774439334869, "num_tokens": 140007.0, "step": 1200 }, { "epoch": 0.244, "grad_norm": 0.11691898107528687, "learning_rate": 0.00017562, "loss": 0.159, "mean_token_accuracy": 0.9360336065292358, "num_tokens": 142345.0, "step": 1220 }, { "epoch": 0.248, "grad_norm": 0.10780195891857147, "learning_rate": 0.00017522000000000002, "loss": 0.1586, "mean_token_accuracy": 0.9376411676406861, "num_tokens": 144693.0, "step": 1240 }, { "epoch": 0.252, "grad_norm": 0.07631397247314453, "learning_rate": 0.00017482, "loss": 0.1633, "mean_token_accuracy": 0.9390978574752807, "num_tokens": 147031.0, "step": 1260 }, { "epoch": 0.256, "grad_norm": 0.05569858103990555, "learning_rate": 0.00017442, "loss": 0.1615, "mean_token_accuracy": 0.9400094121694564, "num_tokens": 149371.0, "step": 1280 }, { "epoch": 0.26, "grad_norm": 0.04815123230218887, "learning_rate": 0.00017402000000000002, "loss": 0.1584, "mean_token_accuracy": 0.9370361328125, "num_tokens": 151716.0, "step": 1300 }, { "epoch": 0.264, "grad_norm": 0.11904877424240112, "learning_rate": 0.00017362, "loss": 0.159, "mean_token_accuracy": 0.941026845574379, "num_tokens": 154046.0, "step": 1320 }, { "epoch": 0.268, "grad_norm": 0.12382964044809341, "learning_rate": 0.00017322, "loss": 0.1594, "mean_token_accuracy": 0.9340965986251831, "num_tokens": 156382.0, "step": 1340 }, { "epoch": 0.272, "grad_norm": 0.05523005872964859, "learning_rate": 0.00017282000000000002, "loss": 0.1593, "mean_token_accuracy": 0.9396583586931229, "num_tokens": 158705.0, "step": 1360 }, { "epoch": 0.276, "grad_norm": 0.05591598525643349, "learning_rate": 0.00017242, "loss": 0.1592, "mean_token_accuracy": 0.9364501267671586, "num_tokens": 161025.0, "step": 1380 }, { "epoch": 0.28, "grad_norm": 0.06780663877725601, "learning_rate": 0.00017202, "loss": 0.1618, "mean_token_accuracy": 0.936437115073204, "num_tokens": 163345.0, "step": 1400 }, { "epoch": 0.284, "grad_norm": 0.03291817009449005, "learning_rate": 0.00017162000000000001, "loss": 0.1569, "mean_token_accuracy": 0.9380638599395752, "num_tokens": 165711.0, "step": 1420 }, { "epoch": 0.288, "grad_norm": 0.04884820431470871, "learning_rate": 0.00017122, "loss": 0.1608, "mean_token_accuracy": 0.9370934247970581, "num_tokens": 168007.0, "step": 1440 }, { "epoch": 0.292, "grad_norm": 0.04577581211924553, "learning_rate": 0.00017082, "loss": 0.1595, "mean_token_accuracy": 0.9374603897333145, "num_tokens": 170332.0, "step": 1460 }, { "epoch": 0.296, "grad_norm": 0.03866467997431755, "learning_rate": 0.00017042, "loss": 0.1582, "mean_token_accuracy": 0.9355076909065246, "num_tokens": 172667.0, "step": 1480 }, { "epoch": 0.3, "grad_norm": 0.06204424798488617, "learning_rate": 0.00017002, "loss": 0.1599, "mean_token_accuracy": 0.9372841835021972, "num_tokens": 174970.0, "step": 1500 }, { "epoch": 0.304, "grad_norm": 0.03288702666759491, "learning_rate": 0.00016962, "loss": 0.1578, "mean_token_accuracy": 0.93585424721241, "num_tokens": 177317.0, "step": 1520 }, { "epoch": 0.308, "grad_norm": 0.03605024516582489, "learning_rate": 0.00016922, "loss": 0.1597, "mean_token_accuracy": 0.9366346269845962, "num_tokens": 179629.0, "step": 1540 }, { "epoch": 0.312, "grad_norm": 0.03328383341431618, "learning_rate": 0.00016882, "loss": 0.1582, "mean_token_accuracy": 0.9372009009122848, "num_tokens": 181960.0, "step": 1560 }, { "epoch": 0.316, "grad_norm": 0.03522924706339836, "learning_rate": 0.00016842, "loss": 0.1572, "mean_token_accuracy": 0.9355730235576629, "num_tokens": 184314.0, "step": 1580 }, { "epoch": 0.32, "grad_norm": 0.0317777544260025, "learning_rate": 0.00016802, "loss": 0.1572, "mean_token_accuracy": 0.9378984242677688, "num_tokens": 186658.0, "step": 1600 }, { "epoch": 0.324, "grad_norm": 0.07111163437366486, "learning_rate": 0.00016762, "loss": 0.1613, "mean_token_accuracy": 0.9344337552785873, "num_tokens": 188974.0, "step": 1620 }, { "epoch": 0.328, "grad_norm": 0.04765714704990387, "learning_rate": 0.00016722, "loss": 0.1608, "mean_token_accuracy": 0.9345141768455505, "num_tokens": 191276.0, "step": 1640 }, { "epoch": 0.332, "grad_norm": 0.041960619390010834, "learning_rate": 0.00016682, "loss": 0.1594, "mean_token_accuracy": 0.9375987917184829, "num_tokens": 193597.0, "step": 1660 }, { "epoch": 0.336, "grad_norm": 0.041757769882678986, "learning_rate": 0.00016642, "loss": 0.157, "mean_token_accuracy": 0.9367031455039978, "num_tokens": 195949.0, "step": 1680 }, { "epoch": 0.34, "grad_norm": 0.05323236435651779, "learning_rate": 0.00016601999999999999, "loss": 0.1654, "mean_token_accuracy": 0.9359392642974853, "num_tokens": 198267.0, "step": 1700 }, { "epoch": 0.344, "grad_norm": 0.08934314548969269, "learning_rate": 0.00016562, "loss": 0.1619, "mean_token_accuracy": 0.9331722050905228, "num_tokens": 200613.0, "step": 1720 }, { "epoch": 0.348, "grad_norm": 0.033347178250551224, "learning_rate": 0.00016522, "loss": 0.1611, "mean_token_accuracy": 0.9317500472068787, "num_tokens": 202927.0, "step": 1740 }, { "epoch": 0.352, "grad_norm": 0.03238425776362419, "learning_rate": 0.00016482, "loss": 0.1582, "mean_token_accuracy": 0.9358267247676849, "num_tokens": 205273.0, "step": 1760 }, { "epoch": 0.356, "grad_norm": 0.03249628096818924, "learning_rate": 0.00016442000000000003, "loss": 0.1591, "mean_token_accuracy": 0.9367831707000732, "num_tokens": 207595.0, "step": 1780 }, { "epoch": 0.36, "grad_norm": 0.034572117030620575, "learning_rate": 0.00016402000000000002, "loss": 0.1612, "mean_token_accuracy": 0.9356453567743301, "num_tokens": 209892.0, "step": 1800 }, { "epoch": 0.364, "grad_norm": 0.04566624388098717, "learning_rate": 0.00016362, "loss": 0.1605, "mean_token_accuracy": 0.9359289228916168, "num_tokens": 212194.0, "step": 1820 }, { "epoch": 0.368, "grad_norm": 0.029180865734815598, "learning_rate": 0.00016322000000000003, "loss": 0.1579, "mean_token_accuracy": 0.9378338158130646, "num_tokens": 214535.0, "step": 1840 }, { "epoch": 0.372, "grad_norm": 0.04812979698181152, "learning_rate": 0.00016282000000000002, "loss": 0.1576, "mean_token_accuracy": 0.9391636937856674, "num_tokens": 216872.0, "step": 1860 }, { "epoch": 0.376, "grad_norm": 0.06872449070215225, "learning_rate": 0.00016242, "loss": 0.1606, "mean_token_accuracy": 0.9389324098825454, "num_tokens": 219184.0, "step": 1880 }, { "epoch": 0.38, "grad_norm": 0.05308040603995323, "learning_rate": 0.00016202000000000002, "loss": 0.1592, "mean_token_accuracy": 0.9385550439357757, "num_tokens": 221499.0, "step": 1900 }, { "epoch": 0.384, "grad_norm": 0.13082465529441833, "learning_rate": 0.00016162000000000001, "loss": 0.1579, "mean_token_accuracy": 0.9380007416009903, "num_tokens": 223847.0, "step": 1920 }, { "epoch": 0.388, "grad_norm": 0.03414028137922287, "learning_rate": 0.00016122, "loss": 0.1602, "mean_token_accuracy": 0.9369382321834564, "num_tokens": 226156.0, "step": 1940 }, { "epoch": 0.392, "grad_norm": 0.04112333804368973, "learning_rate": 0.00016082000000000002, "loss": 0.1597, "mean_token_accuracy": 0.935492268204689, "num_tokens": 228475.0, "step": 1960 }, { "epoch": 0.396, "grad_norm": 0.02955610118806362, "learning_rate": 0.00016042, "loss": 0.1591, "mean_token_accuracy": 0.93908212184906, "num_tokens": 230792.0, "step": 1980 }, { "epoch": 0.4, "grad_norm": 0.024307863786816597, "learning_rate": 0.00016002, "loss": 0.1595, "mean_token_accuracy": 0.9393438696861267, "num_tokens": 233105.0, "step": 2000 } ], "logging_steps": 20, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2823215531802624.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }