banking77-intent-classifier / metrics_rank0000.jsonl
stefanwebb's picture
everything except large files
b874299
{"loss": 2.9876, "grad_norm": 157.0, "learning_rate": 5.925925925925926e-06, "entropy": 0.825, "num_tokens": 114489.0, "mean_token_accuracy": 0.6893173575401306, "epoch": 0.01890359168241966, "num_input_tokens_seen": 115216, "train_runtime": 4.4379, "train_tokens_per_second": 25961.602}
{"loss": 0.6662, "grad_norm": 24.5, "learning_rate": 1.3333333333333333e-05, "entropy": 0.8890625, "num_tokens": 229109.0, "mean_token_accuracy": 0.8326915562152862, "epoch": 0.03780718336483932, "num_input_tokens_seen": 230592, "train_runtime": 7.7641, "train_tokens_per_second": 29699.859}
{"loss": 0.6027, "grad_norm": 18.625, "learning_rate": 2.074074074074074e-05, "entropy": 1.02890625, "num_tokens": 343545.0, "mean_token_accuracy": 0.8529165983200073, "epoch": 0.05671077504725898, "num_input_tokens_seen": 345600, "train_runtime": 13.2817, "train_tokens_per_second": 26020.836}
{"loss": 0.4447, "grad_norm": 19.125, "learning_rate": 2.814814814814815e-05, "entropy": 1.1421875, "num_tokens": 458335.0, "mean_token_accuracy": 0.881816154718399, "epoch": 0.07561436672967864, "num_input_tokens_seen": 461282, "train_runtime": 16.8766, "train_tokens_per_second": 27332.56}
{"loss": 0.3989, "grad_norm": 21.75, "learning_rate": 3.555555555555555e-05, "entropy": 1.21640625, "num_tokens": 572824.0, "mean_token_accuracy": 0.8929793298244476, "epoch": 0.0945179584120983, "num_input_tokens_seen": 576346, "train_runtime": 22.5256, "train_tokens_per_second": 25586.277}
{"loss": 0.4872, "grad_norm": 13.625, "learning_rate": 3.999843966403289e-05, "entropy": 1.26484375, "num_tokens": 687152.0, "mean_token_accuracy": 0.8781549751758575, "epoch": 0.11342155009451796, "num_input_tokens_seen": 691188, "train_runtime": 26.0077, "train_tokens_per_second": 26576.234}
{"loss": 0.28, "grad_norm": 17.375, "learning_rate": 3.99808886803243e-05, "entropy": 1.28515625, "num_tokens": 801973.0, "mean_token_accuracy": 0.9074305832386017, "epoch": 0.1323251417769376, "num_input_tokens_seen": 806740, "train_runtime": 29.7225, "train_tokens_per_second": 27142.403}
{"loss": 0.356, "grad_norm": 12.625, "learning_rate": 3.994385346473689e-05, "entropy": 1.2984375, "num_tokens": 916426.0, "mean_token_accuracy": 0.9146295249462127, "epoch": 0.15122873345935728, "num_input_tokens_seen": 921796, "train_runtime": 34.6767, "train_tokens_per_second": 26582.553}
{"loss": 0.3933, "grad_norm": 11.4375, "learning_rate": 3.9887370131917e-05, "entropy": 1.56953125, "num_tokens": 1030824.0, "mean_token_accuracy": 0.9064954161643982, "epoch": 0.17013232514177692, "num_input_tokens_seen": 1036824, "train_runtime": 38.0075, "train_tokens_per_second": 27279.472}
{"loss": 0.2873, "grad_norm": 15.1875, "learning_rate": 3.981149376121427e-05, "entropy": 1.7515625, "num_tokens": 1145500.0, "mean_token_accuracy": 0.9260397672653198, "epoch": 0.1890359168241966, "num_input_tokens_seen": 1152356, "train_runtime": 43.1792, "train_tokens_per_second": 26687.759}
{"loss": 0.3322, "grad_norm": 8.5625, "learning_rate": 3.97162983429714e-05, "entropy": 1.75859375, "num_tokens": 1260057.0, "mean_token_accuracy": 0.9256749033927918, "epoch": 0.20793950850661624, "num_input_tokens_seen": 1267634, "train_runtime": 46.6166, "train_tokens_per_second": 27192.755}
{"loss": 0.2865, "grad_norm": 11.6875, "learning_rate": 3.960187670637294e-05, "entropy": 1.734375, "num_tokens": 1374973.0, "mean_token_accuracy": 0.9282522916793823, "epoch": 0.22684310018903592, "num_input_tokens_seen": 1383494, "train_runtime": 50.5007, "train_tokens_per_second": 27395.522}
{"loss": 0.2277, "grad_norm": 9.75, "learning_rate": 3.946834042892355e-05, "entropy": 1.70859375, "num_tokens": 1489683.0, "mean_token_accuracy": 0.9320353448390961, "epoch": 0.24574669187145556, "num_input_tokens_seen": 1499052, "train_runtime": 55.2907, "train_tokens_per_second": 27112.206}
{"loss": 0.2733, "grad_norm": 10.125, "learning_rate": 3.931581972764386e-05, "entropy": 1.6578125, "num_tokens": 1604106.0, "mean_token_accuracy": 0.9363594233989716, "epoch": 0.2646502835538752, "num_input_tokens_seen": 1614146, "train_runtime": 58.5263, "train_tokens_per_second": 27579.832}
{"loss": 0.2164, "grad_norm": 6.34375, "learning_rate": 3.91444633320903e-05, "entropy": 1.57109375, "num_tokens": 1718632.0, "mean_token_accuracy": 0.9349239528179168, "epoch": 0.2835538752362949, "num_input_tokens_seen": 1729362, "train_runtime": 63.5177, "train_tokens_per_second": 27226.454}
{"loss": 0.2173, "grad_norm": 10.875, "learning_rate": 3.8954438339322366e-05, "entropy": 1.57734375, "num_tokens": 1833068.0, "mean_token_accuracy": 0.9350460767745972, "epoch": 0.30245746691871456, "num_input_tokens_seen": 1844444, "train_runtime": 66.8194, "train_tokens_per_second": 27603.402}
{"loss": 0.2337, "grad_norm": 11.0, "learning_rate": 3.874593005095909e-05, "entropy": 1.62421875, "num_tokens": 1947640.0, "mean_token_accuracy": 0.929820317029953, "epoch": 0.32136105860113423, "num_input_tokens_seen": 1959682, "train_runtime": 70.4744, "train_tokens_per_second": 27806.99}
{"loss": 0.2156, "grad_norm": 8.625, "learning_rate": 3.851914179248333e-05, "entropy": 1.71171875, "num_tokens": 2062310.0, "mean_token_accuracy": 0.9308744966983795, "epoch": 0.34026465028355385, "num_input_tokens_seen": 2075138, "train_runtime": 75.9345, "train_tokens_per_second": 27327.991}
{"loss": 0.2718, "grad_norm": 13.5625, "learning_rate": 3.82742947149703e-05, "entropy": 1.86875, "num_tokens": 2176716.0, "mean_token_accuracy": 0.9264281988143921, "epoch": 0.3591682419659735, "num_input_tokens_seen": 2190160, "train_runtime": 79.4416, "train_tokens_per_second": 27569.42}
{"loss": 0.3385, "grad_norm": 6.125, "learning_rate": 3.801162757943359e-05, "entropy": 1.94765625, "num_tokens": 2291230.0, "mean_token_accuracy": 0.9164456725120544, "epoch": 0.3780718336483932, "num_input_tokens_seen": 2305250, "train_runtime": 84.7105, "train_tokens_per_second": 27213.265}
{"loss": 0.1811, "grad_norm": 15.1875, "learning_rate": 3.773139652399884e-05, "entropy": 1.846875, "num_tokens": 2405904.0, "mean_token_accuracy": 0.944804173707962, "epoch": 0.39697542533081287, "num_input_tokens_seen": 2420666, "train_runtime": 88.7231, "train_tokens_per_second": 27283.383}
{"loss": 0.1974, "grad_norm": 6.3125, "learning_rate": 3.743387481413243e-05, "entropy": 1.84765625, "num_tokens": 2520235.0, "mean_token_accuracy": 0.9379207909107208, "epoch": 0.4158790170132325, "num_input_tokens_seen": 2535606, "train_runtime": 93.0343, "train_tokens_per_second": 27254.523}
{"loss": 0.1266, "grad_norm": 2.875, "learning_rate": 3.711935257616842e-05, "entropy": 1.83984375, "num_tokens": 2634592.0, "mean_token_accuracy": 0.9594786465167999, "epoch": 0.43478260869565216, "num_input_tokens_seen": 2650514, "train_runtime": 98.3865, "train_tokens_per_second": 26939.815}
{"loss": 0.1993, "grad_norm": 3.6875, "learning_rate": 3.678813651439376e-05, "entropy": 1.78984375, "num_tokens": 2749299.0, "mean_token_accuracy": 0.9459972441196441, "epoch": 0.45368620037807184, "num_input_tokens_seen": 2766004, "train_runtime": 102.0946, "train_tokens_per_second": 27092.567}
{"loss": 0.2075, "grad_norm": 9.1875, "learning_rate": 3.6440549611967656e-05, "entropy": 1.78828125, "num_tokens": 2863713.0, "mean_token_accuracy": 0.940614128112793, "epoch": 0.4725897920604915, "num_input_tokens_seen": 2880990, "train_runtime": 107.8735, "train_tokens_per_second": 26707.121}
{"loss": 0.236, "grad_norm": 7.15625, "learning_rate": 3.6076930815966654e-05, "entropy": 1.85234375, "num_tokens": 2978032.0, "mean_token_accuracy": 0.9343804061412812, "epoch": 0.4914933837429111, "num_input_tokens_seen": 2995844, "train_runtime": 111.3362, "train_tokens_per_second": 26908.095}
{"loss": 0.162, "grad_norm": 7.375, "learning_rate": 3.569763470686262e-05, "entropy": 1.95859375, "num_tokens": 3092605.0, "mean_token_accuracy": 0.9484993875026703, "epoch": 0.5103969754253308, "num_input_tokens_seen": 3111092, "train_runtime": 115.8418, "train_tokens_per_second": 26856.393}
{"loss": 0.1892, "grad_norm": 7.125, "learning_rate": 3.530303115275597e-05, "entropy": 1.99921875, "num_tokens": 3207190.0, "mean_token_accuracy": 0.9394895970821381, "epoch": 0.5293005671077504, "num_input_tokens_seen": 3226396, "train_runtime": 120.7172, "train_tokens_per_second": 26726.892}
{"loss": 0.1614, "grad_norm": 4.5625, "learning_rate": 3.4893504948701185e-05, "entropy": 1.96484375, "num_tokens": 3321840.0, "mean_token_accuracy": 0.9600624740123749, "epoch": 0.5482041587901701, "num_input_tokens_seen": 3341802, "train_runtime": 124.4268, "train_tokens_per_second": 26857.576}
{"loss": 0.1334, "grad_norm": 6.96875, "learning_rate": 3.4469455441476475e-05, "entropy": 1.90859375, "num_tokens": 3436339.0, "mean_token_accuracy": 0.9625543296337128, "epoch": 0.5671077504725898, "num_input_tokens_seen": 3456964, "train_runtime": 130.3081, "train_tokens_per_second": 26529.148}
{"loss": 0.1427, "grad_norm": 12.4375, "learning_rate": 3.403129614016339e-05, "entropy": 1.92421875, "num_tokens": 3550813.0, "mean_token_accuracy": 0.9588114261627197, "epoch": 0.5860113421550095, "num_input_tokens_seen": 3572084, "train_runtime": 133.8989, "train_tokens_per_second": 26677.47}
{"loss": 0.2129, "grad_norm": 7.3125, "learning_rate": 3.357945431291618e-05, "entropy": 1.98671875, "num_tokens": 3665300.0, "mean_token_accuracy": 0.9367718935012818, "epoch": 0.6049149338374291, "num_input_tokens_seen": 3687248, "train_runtime": 138.2948, "train_tokens_per_second": 26662.235}
{"loss": 0.2219, "grad_norm": 3.09375, "learning_rate": 3.311437057031406e-05, "entropy": 2.1359375, "num_tokens": 3779809.0, "mean_token_accuracy": 0.9387097895145416, "epoch": 0.6238185255198487, "num_input_tokens_seen": 3802458, "train_runtime": 142.569, "train_tokens_per_second": 26671.004}
{"loss": 0.1355, "grad_norm": 4.53125, "learning_rate": 3.263649843570271e-05, "entropy": 2.0859375, "num_tokens": 3894322.0, "mean_token_accuracy": 0.9585716307163239, "epoch": 0.6427221172022685, "num_input_tokens_seen": 3917580, "train_runtime": 145.9767, "train_tokens_per_second": 26837.021}
{"loss": 0.2962, "grad_norm": 6.53125, "learning_rate": 3.214630390294396e-05, "entropy": 1.946875, "num_tokens": 4008844.0, "mean_token_accuracy": 0.9372412860393524, "epoch": 0.6616257088846881, "num_input_tokens_seen": 4032748, "train_runtime": 151.6027, "train_tokens_per_second": 26600.765}
{"loss": 0.1841, "grad_norm": 6.96875, "learning_rate": 3.1644264982005e-05, "entropy": 1.98671875, "num_tokens": 4123487.0, "mean_token_accuracy": 0.9490657150745392, "epoch": 0.6805293005671077, "num_input_tokens_seen": 4148142, "train_runtime": 154.9764, "train_tokens_per_second": 26766.274}
{"loss": 0.124, "grad_norm": 2.953125, "learning_rate": 3.113087123283002e-05, "entropy": 2.021875, "num_tokens": 4238014.0, "mean_token_accuracy": 0.964401924610138, "epoch": 0.6994328922495274, "num_input_tokens_seen": 4263312, "train_runtime": 159.4694, "train_tokens_per_second": 26734.354}
{"loss": 0.1498, "grad_norm": 3.4375, "learning_rate": 3.060662328794916e-05, "entropy": 1.96171875, "num_tokens": 4352627.0, "mean_token_accuracy": 0.9481843888759613, "epoch": 0.718336483931947, "num_input_tokens_seen": 4378630, "train_runtime": 163.6223, "train_tokens_per_second": 26760.595}
{"loss": 0.1076, "grad_norm": 4.1875, "learning_rate": 3.0072032364289914e-05, "entropy": 1.9640625, "num_tokens": 4467053.0, "mean_token_accuracy": 0.9691859900951385, "epoch": 0.7372400756143668, "num_input_tokens_seen": 4493600, "train_runtime": 166.9247, "train_tokens_per_second": 26919.915}
{"loss": 0.2501, "grad_norm": 3.875, "learning_rate": 2.9527619764667376e-05, "entropy": 2.02734375, "num_tokens": 4581812.0, "mean_token_accuracy": 0.9455641567707062, "epoch": 0.7561436672967864, "num_input_tokens_seen": 4609216, "train_runtime": 172.0695, "train_tokens_per_second": 26786.938}
{"loss": 0.2157, "grad_norm": 5.4375, "learning_rate": 2.8973916369439194e-05, "entropy": 2.14375, "num_tokens": 4696178.0, "mean_token_accuracy": 0.9492439985275268, "epoch": 0.775047258979206, "num_input_tokens_seen": 4724086, "train_runtime": 175.6473, "train_tokens_per_second": 26895.294}
{"loss": 0.1762, "grad_norm": 3.75, "learning_rate": 2.84114621188211e-05, "entropy": 2.2625, "num_tokens": 4810939.0, "mean_token_accuracy": 0.9574925601482391, "epoch": 0.7939508506616257, "num_input_tokens_seen": 4839702, "train_runtime": 180.4712, "train_tokens_per_second": 26817.036}
{"loss": 0.1703, "grad_norm": 3.9375, "learning_rate": 2.7840805486367792e-05, "entropy": 2.2953125, "num_tokens": 4925591.0, "mean_token_accuracy": 0.9540181159973145, "epoch": 0.8128544423440454, "num_input_tokens_seen": 4955098, "train_runtime": 184.4177, "train_tokens_per_second": 26868.891}
{"loss": 0.0938, "grad_norm": 4.625, "learning_rate": 2.7262502944132526e-05, "entropy": 2.2828125, "num_tokens": 5040089.0, "mean_token_accuracy": 0.9725252389907837, "epoch": 0.831758034026465, "num_input_tokens_seen": 5070258, "train_runtime": 188.065, "train_tokens_per_second": 26960.132}
{"loss": 0.1704, "grad_norm": 5.71875, "learning_rate": 2.667711842002707e-05, "entropy": 2.1265625, "num_tokens": 5154604.0, "mean_token_accuracy": 0.9579161703586578, "epoch": 0.8506616257088847, "num_input_tokens_seen": 5185478, "train_runtime": 192.8301, "train_tokens_per_second": 26891.43}
{"loss": 0.4284, "grad_norm": 4.3125, "learning_rate": 2.6085222747911155e-05, "entropy": 2.0484375, "num_tokens": 5269357.0, "mean_token_accuracy": 0.9190201222896576, "epoch": 0.8695652173913043, "num_input_tokens_seen": 5301020, "train_runtime": 196.1744, "train_tokens_per_second": 27021.971}
{"loss": 0.1346, "grad_norm": 13.3125, "learning_rate": 2.5487393110947557e-05, "entropy": 1.98671875, "num_tokens": 5384069.0, "mean_token_accuracy": 0.9579481542110443, "epoch": 0.888468809073724, "num_input_tokens_seen": 5416464, "train_runtime": 201.21, "train_tokens_per_second": 26919.463}
{"loss": 0.097, "grad_norm": 3.84375, "learning_rate": 2.4884212478765747e-05, "entropy": 1.9875, "num_tokens": 5498568.0, "mean_token_accuracy": 0.9672803819179535, "epoch": 0.9073724007561437, "num_input_tokens_seen": 5531644, "train_runtime": 205.075, "train_tokens_per_second": 26973.766}
{"loss": 0.2298, "grad_norm": 4.9375, "learning_rate": 2.427626903898292e-05, "entropy": 2.00234375, "num_tokens": 5613157.0, "mean_token_accuracy": 0.9443018674850464, "epoch": 0.9262759924385633, "num_input_tokens_seen": 5646952, "train_runtime": 208.4891, "train_tokens_per_second": 27085.115}
{"loss": 0.1732, "grad_norm": 6.03125, "learning_rate": 2.3664155623636715e-05, "entropy": 2.0140625, "num_tokens": 5727795.0, "mean_token_accuracy": 0.9442705571651459, "epoch": 0.945179584120983, "num_input_tokens_seen": 5762366, "train_runtime": 214.059, "train_tokens_per_second": 26919.525}
{"loss": 0.1083, "grad_norm": 3.546875, "learning_rate": 2.304846913108891e-05, "entropy": 2.0125, "num_tokens": 5842437.0, "mean_token_accuracy": 0.9664817750453949, "epoch": 0.9640831758034026, "num_input_tokens_seen": 5877646, "train_runtime": 275.7098, "train_tokens_per_second": 21318.232}
{"loss": 0.0875, "grad_norm": 2.671875, "learning_rate": 2.242980994396401e-05, "entropy": 2.0, "num_tokens": 5956870.0, "mean_token_accuracy": 0.9795427262783051, "epoch": 0.9829867674858223, "num_input_tokens_seen": 5992710, "train_runtime": 280.9684, "train_tokens_per_second": 21328.766}
{"loss": 0.1654, "grad_norm": 7.4375, "learning_rate": 2.1808781343690027e-05, "entropy": 1.9513888888888888, "num_tokens": 6059927.0, "mean_token_accuracy": 0.9603289763132731, "epoch": 1.0, "num_input_tokens_seen": 6096342, "train_runtime": 284.3725, "train_tokens_per_second": 21437.877}
{"loss": 0.0783, "grad_norm": 3.453125, "learning_rate": 2.118598892221257e-05, "entropy": 1.903125, "num_tokens": 6174483.0, "mean_token_accuracy": 0.9817151129245758, "epoch": 1.0189035916824196, "num_input_tokens_seen": 6211574, "train_runtime": 288.2049, "train_tokens_per_second": 21552.63}
{"loss": 0.1214, "grad_norm": 1.734375, "learning_rate": 2.0562039991455877e-05, "entropy": 1.84375, "num_tokens": 6289163.0, "mean_token_accuracy": 0.9741188943386078, "epoch": 1.0378071833648392, "num_input_tokens_seen": 6327000, "train_runtime": 293.7126, "train_tokens_per_second": 21541.469}
{"loss": 0.1393, "grad_norm": 3.78125, "learning_rate": 1.99375429911066e-05, "entropy": 1.8421875, "num_tokens": 6403766.0, "mean_token_accuracy": 0.9579156279563904, "epoch": 1.056710775047259, "num_input_tokens_seen": 6442290, "train_runtime": 297.1668, "train_tokens_per_second": 21679.038}
{"loss": 0.0872, "grad_norm": 3.953125, "learning_rate": 1.931310689529781e-05, "entropy": 1.85078125, "num_tokens": 6518469.0, "mean_token_accuracy": 0.9788394093513488, "epoch": 1.0756143667296787, "num_input_tokens_seen": 6557852, "train_runtime": 301.7702, "train_tokens_per_second": 21731.276}
{"loss": 0.0637, "grad_norm": 7.1875, "learning_rate": 1.8689340618771937e-05, "entropy": 1.8234375, "num_tokens": 6632963.0, "mean_token_accuracy": 0.972537738084793, "epoch": 1.0945179584120983, "num_input_tokens_seen": 6673032, "train_runtime": 306.4769, "train_tokens_per_second": 21773.362}
{"loss": 0.0565, "grad_norm": 5.78125, "learning_rate": 1.806685242310156e-05, "entropy": 1.78359375, "num_tokens": 6747403.0, "mean_token_accuracy": 0.9854797184467315, "epoch": 1.113421550094518, "num_input_tokens_seen": 6788174, "train_runtime": 310.3851, "train_tokens_per_second": 21870.17}
{"loss": 0.0973, "grad_norm": 8.8125, "learning_rate": 1.7446249323547117e-05, "entropy": 1.76015625, "num_tokens": 6861788.0, "mean_token_accuracy": 0.9734237968921662, "epoch": 1.1323251417769375, "num_input_tokens_seen": 6903146, "train_runtime": 315.4655, "train_tokens_per_second": 21882.41}
{"loss": 0.0681, "grad_norm": 1.4453125, "learning_rate": 1.6828136497130014e-05, "entropy": 1.75078125, "num_tokens": 6976277.0, "mean_token_accuracy": 0.9820096373558045, "epoch": 1.1512287334593574, "num_input_tokens_seen": 7018350, "train_runtime": 319.0527, "train_tokens_per_second": 21997.465}
{"loss": 0.0625, "grad_norm": 4.90625, "learning_rate": 1.6213116692498206e-05, "entropy": 1.740625, "num_tokens": 7090874.0, "mean_token_accuracy": 0.9826828062534332, "epoch": 1.170132325141777, "num_input_tokens_seen": 7133636, "train_runtime": 323.8986, "train_tokens_per_second": 22024.29}
{"loss": 0.077, "grad_norm": 0.66015625, "learning_rate": 1.560178964215987e-05, "entropy": 1.7328125, "num_tokens": 7205391.0, "mean_token_accuracy": 0.978941410779953, "epoch": 1.1890359168241966, "num_input_tokens_seen": 7248866, "train_runtime": 327.5895, "train_tokens_per_second": 22127.897}
{"loss": 0.067, "grad_norm": 4.0625, "learning_rate": 1.4994751477658139e-05, "entropy": 1.73203125, "num_tokens": 7319827.0, "mean_token_accuracy": 0.9818780541419982, "epoch": 1.2079395085066162, "num_input_tokens_seen": 7363900, "train_runtime": 331.4598, "train_tokens_per_second": 22216.571}
{"loss": 0.1153, "grad_norm": 2.734375, "learning_rate": 1.4392594148257426e-05, "entropy": 1.73515625, "num_tokens": 7434543.0, "mean_token_accuracy": 0.9638942897319793, "epoch": 1.2268431001890359, "num_input_tokens_seen": 7479394, "train_runtime": 336.2629, "train_tokens_per_second": 22242.696}
{"loss": 0.0359, "grad_norm": 2.046875, "learning_rate": 1.3795904843707959e-05, "entropy": 1.74609375, "num_tokens": 7549134.0, "mean_token_accuracy": 0.9886789560317993, "epoch": 1.2457466918714555, "num_input_tokens_seen": 7594632, "train_runtime": 339.6052, "train_tokens_per_second": 22363.12}
{"loss": 0.0808, "grad_norm": 2.25, "learning_rate": 1.3205265421651588e-05, "entropy": 1.740625, "num_tokens": 7663583.0, "mean_token_accuracy": 0.9852688193321228, "epoch": 1.264650283553875, "num_input_tokens_seen": 7709704, "train_runtime": 344.9458, "train_tokens_per_second": 22350.48}
{"loss": 0.0663, "grad_norm": 1.8125, "learning_rate": 1.2621251840227112e-05, "entropy": 1.75078125, "num_tokens": 7778064.0, "mean_token_accuracy": 0.9817369997501373, "epoch": 1.283553875236295, "num_input_tokens_seen": 7824834, "train_runtime": 348.223, "train_tokens_per_second": 22470.756}
{"loss": 0.0678, "grad_norm": 4.28125, "learning_rate": 1.2044433596428537e-05, "entropy": 1.75234375, "num_tokens": 7892415.0, "mean_token_accuracy": 0.9812626421451569, "epoch": 1.3024574669187146, "num_input_tokens_seen": 7939832, "train_runtime": 352.0847, "train_tokens_per_second": 22550.916}
{"loss": 0.0465, "grad_norm": 2.703125, "learning_rate": 1.1475373170763819e-05, "entropy": 1.746875, "num_tokens": 8006926.0, "mean_token_accuracy": 0.9823280215263367, "epoch": 1.3213610586011342, "num_input_tokens_seen": 8054988, "train_runtime": 357.1271, "train_tokens_per_second": 22554.962}
{"loss": 0.1174, "grad_norm": 1.4921875, "learning_rate": 1.0914625478755672e-05, "entropy": 1.74765625, "num_tokens": 8121373.0, "mean_token_accuracy": 0.9695515096187591, "epoch": 1.3402646502835538, "num_input_tokens_seen": 8170098, "train_runtime": 360.7524, "train_tokens_per_second": 22647.381}
{"loss": 0.045, "grad_norm": 1.1015625, "learning_rate": 1.0362737329819413e-05, "entropy": 1.74453125, "num_tokens": 8235981.0, "mean_token_accuracy": 0.9885900497436524, "epoch": 1.3591682419659734, "num_input_tokens_seen": 8285346, "train_runtime": 366.0216, "train_tokens_per_second": 22636.221}
{"loss": 0.0428, "grad_norm": 5.15625, "learning_rate": 9.820246894045316e-06, "entropy": 1.74296875, "num_tokens": 8350356.0, "mean_token_accuracy": 0.9822307825088501, "epoch": 1.3780718336483933, "num_input_tokens_seen": 8400240, "train_runtime": 369.6364, "train_tokens_per_second": 22725.685}
{"loss": 0.099, "grad_norm": 4.09375, "learning_rate": 9.28768317740564e-06, "entropy": 1.73515625, "num_tokens": 8465025.0, "mean_token_accuracy": 0.9710565328598022, "epoch": 1.3969754253308129, "num_input_tokens_seen": 8515740, "train_runtime": 373.5701, "train_tokens_per_second": 22795.56}
{"loss": 0.0736, "grad_norm": 4.96875, "learning_rate": 8.765565505897902e-06, "entropy": 1.7328125, "num_tokens": 8579648.0, "mean_token_accuracy": 0.9741575241088867, "epoch": 1.4158790170132325, "num_input_tokens_seen": 8631054, "train_runtime": 378.7394, "train_tokens_per_second": 22788.901}
{"loss": 0.0806, "grad_norm": 3.265625, "learning_rate": 8.254403019127566e-06, "entropy": 1.73359375, "num_tokens": 8694249.0, "mean_token_accuracy": 0.9791056990623475, "epoch": 1.434782608695652, "num_input_tokens_seen": 8746364, "train_runtime": 382.0615, "train_tokens_per_second": 22892.552}
{"loss": 0.0404, "grad_norm": 3.75, "learning_rate": 7.754694173823947e-06, "entropy": 1.73515625, "num_tokens": 8808789.0, "mean_token_accuracy": 0.9839386224746705, "epoch": 1.4536862003780717, "num_input_tokens_seen": 8861574, "train_runtime": 387.2205, "train_tokens_per_second": 22885.084}
{"loss": 0.0926, "grad_norm": 5.09375, "learning_rate": 7.266926257773346e-06, "entropy": 1.73359375, "num_tokens": 8923407.0, "mean_token_accuracy": 0.9714232623577118, "epoch": 1.4725897920604916, "num_input_tokens_seen": 8976944, "train_runtime": 390.891, "train_tokens_per_second": 22965.336}
{"loss": 0.0519, "grad_norm": 5.0, "learning_rate": 6.7915749146436415e-06, "entropy": 1.72265625, "num_tokens": 9037924.0, "mean_token_accuracy": 0.9837916433811188, "epoch": 1.4914933837429112, "num_input_tokens_seen": 9092050, "train_runtime": 395.3397, "train_tokens_per_second": 22998.071}
{"loss": 0.2115, "grad_norm": 3.875, "learning_rate": 6.329103680163495e-06, "entropy": 1.71796875, "num_tokens": 9152659.0, "mean_token_accuracy": 0.9516554296016693, "epoch": 1.5103969754253308, "num_input_tokens_seen": 9207594, "train_runtime": 399.5499, "train_tokens_per_second": 23044.916}
{"loss": 0.0348, "grad_norm": 0.51953125, "learning_rate": 5.879963530108506e-06, "entropy": 1.71640625, "num_tokens": 9267059.0, "mean_token_accuracy": 0.9919346511363983, "epoch": 1.5293005671077504, "num_input_tokens_seen": 9322572, "train_runtime": 403.4031, "train_tokens_per_second": 23109.815}
{"loss": 0.0374, "grad_norm": 2.234375, "learning_rate": 5.444592440535177e-06, "entropy": 1.7125, "num_tokens": 9381725.0, "mean_token_accuracy": 0.9837370038032531, "epoch": 1.54820415879017, "num_input_tokens_seen": 9438004, "train_runtime": 407.9692, "train_tokens_per_second": 23134.111}
{"loss": 0.0325, "grad_norm": 3.09375, "learning_rate": 5.023414960691469e-06, "entropy": 1.7078125, "num_tokens": 9496255.0, "mean_token_accuracy": 0.9918534696102143, "epoch": 1.5671077504725899, "num_input_tokens_seen": 9553156, "train_runtime": 412.1408, "train_tokens_per_second": 23179.35}
{"loss": 0.0618, "grad_norm": 4.78125, "learning_rate": 4.616841799020364e-06, "entropy": 1.703125, "num_tokens": 9610808.0, "mean_token_accuracy": 0.9808044970035553, "epoch": 1.5860113421550095, "num_input_tokens_seen": 9668364, "train_runtime": 416.3235, "train_tokens_per_second": 23223.203}
{"loss": 0.0493, "grad_norm": 6.1875, "learning_rate": 4.225269422660258e-06, "entropy": 1.703125, "num_tokens": 9725283.0, "mean_token_accuracy": 0.9843941271305084, "epoch": 1.6049149338374291, "num_input_tokens_seen": 9783552, "train_runtime": 421.1316, "train_tokens_per_second": 23231.576}
{"loss": 0.0595, "grad_norm": 1.5078125, "learning_rate": 3.8490796708326404e-06, "entropy": 1.70390625, "num_tokens": 9839878.0, "mean_token_accuracy": 0.9822299420833588, "epoch": 1.6238185255198487, "num_input_tokens_seen": 9898934, "train_runtime": 424.7606, "train_tokens_per_second": 23304.735}
{"loss": 0.059, "grad_norm": 1.328125, "learning_rate": 3.4886393824940924e-06, "entropy": 1.7015625, "num_tokens": 9954403.0, "mean_token_accuracy": 0.9807979345321656, "epoch": 1.6427221172022684, "num_input_tokens_seen": 10014142, "train_runtime": 429.8927, "train_tokens_per_second": 23294.514}
{"loss": 0.0574, "grad_norm": 2.09375, "learning_rate": 3.144300038615691e-06, "entropy": 1.70390625, "num_tokens": 10068933.0, "mean_token_accuracy": 0.9839386105537414, "epoch": 1.6616257088846882, "num_input_tokens_seen": 10129264, "train_runtime": 433.4828, "train_tokens_per_second": 23367.164}
{"loss": 0.0669, "grad_norm": 4.25, "learning_rate": 2.8163974194386766e-06, "entropy": 1.6984375, "num_tokens": 10183591.0, "mean_token_accuracy": 0.9792383193969727, "epoch": 1.6805293005671076, "num_input_tokens_seen": 10244732, "train_runtime": 437.9792, "train_tokens_per_second": 23390.909}
{"loss": 0.0801, "grad_norm": 3.46875, "learning_rate": 2.5052512770405434e-06, "entropy": 1.7015625, "num_tokens": 10298251.0, "mean_token_accuracy": 0.9761136710643769, "epoch": 1.6994328922495274, "num_input_tokens_seen": 10360212, "train_runtime": 442.481, "train_tokens_per_second": 23413.915}
{"loss": 0.0297, "grad_norm": 0.59765625, "learning_rate": 2.2111650235309147e-06, "entropy": 1.70234375, "num_tokens": 10412810.0, "mean_token_accuracy": 0.9904489517211914, "epoch": 1.718336483931947, "num_input_tokens_seen": 10475400, "train_runtime": 446.3738, "train_tokens_per_second": 23467.773}
{"loss": 0.0989, "grad_norm": 4.0625, "learning_rate": 1.9344254351812287e-06, "entropy": 1.69921875, "num_tokens": 10527389.0, "mean_token_accuracy": 0.9743396818637848, "epoch": 1.7372400756143667, "num_input_tokens_seen": 10590710, "train_runtime": 451.1755, "train_tokens_per_second": 23473.591}
{"loss": 0.0476, "grad_norm": 0.890625, "learning_rate": 1.6753023727767436e-06, "entropy": 1.703125, "num_tokens": 10641918.0, "mean_token_accuracy": 0.9838890075683594, "epoch": 1.7561436672967865, "num_input_tokens_seen": 10705900, "train_runtime": 454.754, "train_tokens_per_second": 23542.179}
{"loss": 0.0556, "grad_norm": 1.359375, "learning_rate": 1.4340485184635712e-06, "entropy": 1.7, "num_tokens": 10756496.0, "mean_token_accuracy": 0.9777659058570862, "epoch": 1.775047258979206, "num_input_tokens_seen": 10821144, "train_runtime": 459.2027, "train_tokens_per_second": 23565.072}
{"loss": 0.0595, "grad_norm": 1.171875, "learning_rate": 1.2108991293473627e-06, "entropy": 1.69921875, "num_tokens": 10871124.0, "mean_token_accuracy": 0.9741835057735443, "epoch": 1.7939508506616257, "num_input_tokens_seen": 10936460, "train_runtime": 463.6099, "train_tokens_per_second": 23589.79}
{"loss": 0.0541, "grad_norm": 3.265625, "learning_rate": 1.0060718080838683e-06, "entropy": 1.69765625, "num_tokens": 10985594.0, "mean_token_accuracy": 0.9831156551837921, "epoch": 1.8128544423440454, "num_input_tokens_seen": 11051508, "train_runtime": 467.1593, "train_tokens_per_second": 23656.828}
{"loss": 0.0835, "grad_norm": 2.4375, "learning_rate": 8.197662906851534e-07, "entropy": 1.70078125, "num_tokens": 11100230.0, "mean_token_accuracy": 0.9726030707359314, "epoch": 1.831758034026465, "num_input_tokens_seen": 11166904, "train_runtime": 472.195, "train_tokens_per_second": 23648.922}
{"loss": 0.0532, "grad_norm": 2.765625, "learning_rate": 6.521642517483573e-07, "entropy": 1.69921875, "num_tokens": 11214624.0, "mean_token_accuracy": 0.9853454470634461, "epoch": 1.8506616257088848, "num_input_tokens_seen": 11281802, "train_runtime": 475.7718, "train_tokens_per_second": 23712.635}
{"loss": 0.027, "grad_norm": 2.171875, "learning_rate": 5.034291272968772e-07, "entropy": 1.70078125, "num_tokens": 11329098.0, "mean_token_accuracy": 0.9934648215770722, "epoch": 1.8695652173913042, "num_input_tokens_seen": 11396946, "train_runtime": 480.2436, "train_tokens_per_second": 23731.596}
{"loss": 0.0742, "grad_norm": 4.0625, "learning_rate": 3.737059554068334e-07, "entropy": 1.6984375, "num_tokens": 11443715.0, "mean_token_accuracy": 0.9744843065738678, "epoch": 1.888468809073724, "num_input_tokens_seen": 11512282, "train_runtime": 484.6792, "train_tokens_per_second": 23752.376}
{"loss": 0.1322, "grad_norm": 6.84375, "learning_rate": 2.631212347741352e-07, "entropy": 1.69921875, "num_tokens": 11558513.0, "mean_token_accuracy": 0.9680740118026734, "epoch": 1.9073724007561437, "num_input_tokens_seen": 11627828, "train_runtime": 544.5283, "train_tokens_per_second": 21353.945}
{"loss": 0.0864, "grad_norm": 1.0078125, "learning_rate": 1.7178280136011417e-07, "entropy": 1.69921875, "num_tokens": 11673010.0, "mean_token_accuracy": 0.9749818980693817, "epoch": 1.9262759924385633, "num_input_tokens_seen": 11743010, "train_runtime": 549.7569, "train_tokens_per_second": 21360.369}
{"loss": 0.1175, "grad_norm": 2.5625, "learning_rate": 9.977972323599095e-08, "entropy": 1.69921875, "num_tokens": 11787637.0, "mean_token_accuracy": 0.9680160820484162, "epoch": 1.9451795841209831, "num_input_tokens_seen": 11858430, "train_runtime": 553.6509, "train_tokens_per_second": 21418.605}
{"loss": 0.0695, "grad_norm": 2.921875, "learning_rate": 4.718221372874254e-08, "entropy": 1.69765625, "num_tokens": 11902111.0, "mean_token_accuracy": 0.9804269134998321, "epoch": 1.9640831758034025, "num_input_tokens_seen": 11973576, "train_runtime": 557.8609, "train_tokens_per_second": 21463.371}
{"loss": 0.1152, "grad_norm": 5.8125, "learning_rate": 1.4041562953031051e-08, "entropy": 1.69609375, "num_tokens": 12016759.0, "mean_token_accuracy": 0.9696780204772949, "epoch": 1.9829867674858224, "num_input_tokens_seen": 12088990, "train_runtime": 561.9991, "train_tokens_per_second": 21510.694}
{"loss": 0.0989, "grad_norm": 3.75, "learning_rate": 3.900877959917004e-10, "entropy": 1.6961805555555556, "num_tokens": 12119827.0, "mean_token_accuracy": 0.9715293182267083, "epoch": 2.0, "num_input_tokens_seen": 12192662, "train_runtime": 565.5622, "train_tokens_per_second": 21558.482}
{"train_runtime": 612.9949, "train_samples_per_second": 27.592, "train_steps_per_second": 0.865, "train_tokens_per_second": 2486.879, "total_flos": 3.3226637176733696e+16, "train_loss": 0.1822078584218925, "epoch": 2.0, "num_input_tokens_seen": 12192662}