diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,3364 +0,0 @@ -{ - "best_global_step": 30000, - "best_metric": 1.5021542310714722, - "best_model_checkpoint": "/mnt/mydata2/MoE/SLMOE/FINAL-MODEL-V2/checkpoint-30000", - "epoch": 3.9583848780246425, - "eval_steps": 1000, - "global_step": 30000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "entropy": 1.9970872095227241, - "epoch": 0.013195440975143089, - "grad_norm": 4.293704986572266, - "learning_rate": 3.96e-06, - "loss": 16.2682, - "mean_token_accuracy": 0.5875154070556163, - "num_tokens": 4681608.0, - "step": 100 - }, - { - "entropy": 1.9974382193386555, - "epoch": 0.026390881950286178, - "grad_norm": 4.16009521484375, - "learning_rate": 7.960000000000002e-06, - "loss": 16.2883, - "mean_token_accuracy": 0.5872169394791126, - "num_tokens": 9422456.0, - "step": 200 - }, - { - "entropy": 1.9942241810262202, - "epoch": 0.03958632292542926, - "grad_norm": 4.352612495422363, - "learning_rate": 1.196e-05, - "loss": 16.2576, - "mean_token_accuracy": 0.5878808307275176, - "num_tokens": 14133809.0, - "step": 300 - }, - { - "entropy": 1.9993504942953586, - "epoch": 0.052781763900572355, - "grad_norm": 4.423626899719238, - "learning_rate": 1.5960000000000003e-05, - "loss": 16.3033, - "mean_token_accuracy": 0.5866739044710994, - "num_tokens": 18832160.0, - "step": 400 - }, - { - "entropy": 2.0050615316629408, - "epoch": 0.06597720487571544, - "grad_norm": 4.602761268615723, - "learning_rate": 1.9960000000000002e-05, - "loss": 16.3501, - "mean_token_accuracy": 0.5865391325205564, - "num_tokens": 23552733.0, - "step": 500 - }, - { - "entropy": 1.9940124328434468, - "epoch": 0.07917264585085852, - "grad_norm": 4.151139736175537, - "learning_rate": 1.9997385793504095e-05, - "loss": 16.2651, - "mean_token_accuracy": 0.5878189096227289, - "num_tokens": 28254208.0, - "step": 600 - }, - { - "entropy": 1.9896119074523448, - "epoch": 0.09236808682600162, - "grad_norm": 4.60593843460083, - "learning_rate": 1.9994745180881968e-05, - "loss": 16.233, - "mean_token_accuracy": 0.5883422317355871, - "num_tokens": 32941633.0, - "step": 700 - }, - { - "entropy": 1.9750585129857063, - "epoch": 0.10556352780114471, - "grad_norm": 4.440836429595947, - "learning_rate": 1.9992104568259838e-05, - "loss": 16.089, - "mean_token_accuracy": 0.5906037037447095, - "num_tokens": 37665687.0, - "step": 800 - }, - { - "entropy": 1.959184586405754, - "epoch": 0.11875896877628779, - "grad_norm": 4.4237213134765625, - "learning_rate": 1.998946395563771e-05, - "loss": 15.9685, - "mean_token_accuracy": 0.5921973436698318, - "num_tokens": 42408986.0, - "step": 900 - }, - { - "entropy": 1.9661301617324352, - "epoch": 0.13195440975143088, - "grad_norm": 4.37338399887085, - "learning_rate": 1.9986823343015583e-05, - "loss": 16.0303, - "mean_token_accuracy": 0.5912668254226446, - "num_tokens": 47206895.0, - "step": 1000 - }, - { - "epoch": 0.13195440975143088, - "eval_entropy": 1.7280410604585583, - "eval_loss": 1.876280426979065, - "eval_mean_token_accuracy": 0.6125666955363589, - "eval_num_tokens": 47206895.0, - "eval_runtime": 3192.5073, - "eval_samples_per_second": 33.76, - "eval_steps_per_second": 4.22, - "step": 1000 - }, - { - "entropy": 1.952514111250639, - "epoch": 0.14514985072657396, - "grad_norm": 4.2272047996521, - "learning_rate": 1.9984182730393453e-05, - "loss": 15.9104, - "mean_token_accuracy": 0.5935716019570827, - "num_tokens": 51930574.0, - "step": 1100 - }, - { - "entropy": 1.951762547492981, - "epoch": 0.15834529170171704, - "grad_norm": 4.368087291717529, - "learning_rate": 1.9981542117771325e-05, - "loss": 15.9057, - "mean_token_accuracy": 0.5942437520623207, - "num_tokens": 56651451.0, - "step": 1200 - }, - { - "entropy": 1.9393150758743287, - "epoch": 0.17154073267686015, - "grad_norm": 4.117947101593018, - "learning_rate": 1.9978901505149195e-05, - "loss": 15.7951, - "mean_token_accuracy": 0.5956410552933812, - "num_tokens": 61366203.0, - "step": 1300 - }, - { - "entropy": 1.9590581172704697, - "epoch": 0.18473617365200323, - "grad_norm": 4.429177761077881, - "learning_rate": 1.9976260892527068e-05, - "loss": 15.9623, - "mean_token_accuracy": 0.5927011578902602, - "num_tokens": 66105310.0, - "step": 1400 - }, - { - "entropy": 1.9479418122768402, - "epoch": 0.1979316146271463, - "grad_norm": 4.270752906799316, - "learning_rate": 1.997362027990494e-05, - "loss": 15.8612, - "mean_token_accuracy": 0.5947995102033019, - "num_tokens": 70828886.0, - "step": 1500 - }, - { - "entropy": 1.9247798535227776, - "epoch": 0.21112705560228942, - "grad_norm": 4.055892467498779, - "learning_rate": 1.997097966728281e-05, - "loss": 15.6661, - "mean_token_accuracy": 0.598046317063272, - "num_tokens": 75498479.0, - "step": 1600 - }, - { - "entropy": 1.920890159010887, - "epoch": 0.2243224965774325, - "grad_norm": 4.256282329559326, - "learning_rate": 1.9968339054660683e-05, - "loss": 15.6489, - "mean_token_accuracy": 0.598283537067473, - "num_tokens": 80183990.0, - "step": 1700 - }, - { - "entropy": 1.9235932590067386, - "epoch": 0.23751793755257558, - "grad_norm": 4.397217750549316, - "learning_rate": 1.9965698442038555e-05, - "loss": 15.6559, - "mean_token_accuracy": 0.5974558148160577, - "num_tokens": 84876881.0, - "step": 1800 - }, - { - "entropy": 1.9103332214057445, - "epoch": 0.2507133785277187, - "grad_norm": 4.097607612609863, - "learning_rate": 1.9963057829416425e-05, - "loss": 15.5497, - "mean_token_accuracy": 0.6003026623278856, - "num_tokens": 89611584.0, - "step": 1900 - }, - { - "entropy": 1.914695471227169, - "epoch": 0.26390881950286177, - "grad_norm": 4.328080177307129, - "learning_rate": 1.9960417216794298e-05, - "loss": 15.5798, - "mean_token_accuracy": 0.5991538398340345, - "num_tokens": 94341018.0, - "step": 2000 - }, - { - "epoch": 0.26390881950286177, - "eval_entropy": 1.6988523987462736, - "eval_loss": 1.8283307552337646, - "eval_mean_token_accuracy": 0.6190770778948521, - "eval_num_tokens": 94341018.0, - "eval_runtime": 3191.3926, - "eval_samples_per_second": 33.772, - "eval_steps_per_second": 4.222, - "step": 2000 - }, - { - "entropy": 1.9157458925247193, - "epoch": 0.27710426047800485, - "grad_norm": 4.077412128448486, - "learning_rate": 1.995777660417217e-05, - "loss": 15.5759, - "mean_token_accuracy": 0.599464968368411, - "num_tokens": 99101131.0, - "step": 2100 - }, - { - "entropy": 1.9121157658100127, - "epoch": 0.29029970145314793, - "grad_norm": 4.194200038909912, - "learning_rate": 1.995513599155004e-05, - "loss": 15.5592, - "mean_token_accuracy": 0.5996096661686897, - "num_tokens": 103821529.0, - "step": 2200 - }, - { - "entropy": 1.8956915408372879, - "epoch": 0.303495142428291, - "grad_norm": 4.202551364898682, - "learning_rate": 1.9952495378927913e-05, - "loss": 15.4098, - "mean_token_accuracy": 0.6023901195079088, - "num_tokens": 108538002.0, - "step": 2300 - }, - { - "entropy": 1.9013544350862503, - "epoch": 0.3166905834034341, - "grad_norm": 4.200172424316406, - "learning_rate": 1.9949854766305785e-05, - "loss": 15.4667, - "mean_token_accuracy": 0.6008677286282181, - "num_tokens": 113208708.0, - "step": 2400 - }, - { - "entropy": 1.8889785696566106, - "epoch": 0.3298860243785772, - "grad_norm": 4.3031110763549805, - "learning_rate": 1.9947214153683655e-05, - "loss": 15.3555, - "mean_token_accuracy": 0.6025829451531172, - "num_tokens": 117941057.0, - "step": 2500 - }, - { - "entropy": 1.8911775602400303, - "epoch": 0.3430814653537203, - "grad_norm": 4.05417013168335, - "learning_rate": 1.9944573541061528e-05, - "loss": 15.3777, - "mean_token_accuracy": 0.6022695601731539, - "num_tokens": 122654207.0, - "step": 2600 - }, - { - "entropy": 1.8944527316093445, - "epoch": 0.3562769063288634, - "grad_norm": 4.3383097648620605, - "learning_rate": 1.99419329284394e-05, - "loss": 15.3979, - "mean_token_accuracy": 0.6022358436137438, - "num_tokens": 127395162.0, - "step": 2700 - }, - { - "entropy": 1.8847786201536656, - "epoch": 0.36947234730400647, - "grad_norm": 4.202676296234131, - "learning_rate": 1.993929231581727e-05, - "loss": 15.3206, - "mean_token_accuracy": 0.6034166327491403, - "num_tokens": 132115907.0, - "step": 2800 - }, - { - "entropy": 1.8890833668410778, - "epoch": 0.38266778827914955, - "grad_norm": 4.259827613830566, - "learning_rate": 1.9936651703195143e-05, - "loss": 15.3557, - "mean_token_accuracy": 0.6027516862004996, - "num_tokens": 136834999.0, - "step": 2900 - }, - { - "entropy": 1.8763156107068062, - "epoch": 0.3958632292542926, - "grad_norm": 3.9494423866271973, - "learning_rate": 1.9934011090573016e-05, - "loss": 15.2545, - "mean_token_accuracy": 0.6045393405854702, - "num_tokens": 141601677.0, - "step": 3000 - }, - { - "epoch": 0.3958632292542926, - "eval_entropy": 1.6155824041097597, - "eval_loss": 1.7954074144363403, - "eval_mean_token_accuracy": 0.6239857971845204, - "eval_num_tokens": 141601677.0, - "eval_runtime": 3193.3391, - "eval_samples_per_second": 33.752, - "eval_steps_per_second": 4.219, - "step": 3000 - }, - { - "entropy": 1.8736148147284986, - "epoch": 0.4090586702294357, - "grad_norm": 4.154598712921143, - "learning_rate": 1.9931370477950885e-05, - "loss": 15.2248, - "mean_token_accuracy": 0.6050427352637052, - "num_tokens": 146320541.0, - "step": 3100 - }, - { - "entropy": 1.8728535547852516, - "epoch": 0.42225411120457884, - "grad_norm": 4.243231296539307, - "learning_rate": 1.9928729865328758e-05, - "loss": 15.2124, - "mean_token_accuracy": 0.6051392666250467, - "num_tokens": 151034322.0, - "step": 3200 - }, - { - "entropy": 1.8753738756477834, - "epoch": 0.4354495521797219, - "grad_norm": 4.269708633422852, - "learning_rate": 1.9926089252706627e-05, - "loss": 15.2319, - "mean_token_accuracy": 0.6053581718355417, - "num_tokens": 155764395.0, - "step": 3300 - }, - { - "entropy": 1.8592823737859725, - "epoch": 0.448644993154865, - "grad_norm": 4.155630588531494, - "learning_rate": 1.9923448640084503e-05, - "loss": 15.1026, - "mean_token_accuracy": 0.6073809728398919, - "num_tokens": 160546122.0, - "step": 3400 - }, - { - "entropy": 1.8590475974977017, - "epoch": 0.4618404341300081, - "grad_norm": 4.049267768859863, - "learning_rate": 1.9920808027462373e-05, - "loss": 15.1112, - "mean_token_accuracy": 0.607349122017622, - "num_tokens": 165237636.0, - "step": 3500 - }, - { - "entropy": 1.8729989735782147, - "epoch": 0.47503587510515116, - "grad_norm": 4.30539608001709, - "learning_rate": 1.9918167414840242e-05, - "loss": 15.2137, - "mean_token_accuracy": 0.6055576696619391, - "num_tokens": 169961118.0, - "step": 3600 - }, - { - "entropy": 1.8558836616575718, - "epoch": 0.48823131608029424, - "grad_norm": 4.059924125671387, - "learning_rate": 1.991552680221812e-05, - "loss": 15.079, - "mean_token_accuracy": 0.6075002931430936, - "num_tokens": 174694060.0, - "step": 3700 - }, - { - "entropy": 1.8584099148213864, - "epoch": 0.5014267570554374, - "grad_norm": 4.13109016418457, - "learning_rate": 1.9912886189595988e-05, - "loss": 15.0896, - "mean_token_accuracy": 0.6074171752110124, - "num_tokens": 179400845.0, - "step": 3800 - }, - { - "entropy": 1.8450819233059883, - "epoch": 0.5146221980305804, - "grad_norm": 3.8707573413848877, - "learning_rate": 1.991024557697386e-05, - "loss": 14.9808, - "mean_token_accuracy": 0.6096408772468567, - "num_tokens": 184084788.0, - "step": 3900 - }, - { - "entropy": 1.837720358669758, - "epoch": 0.5278176390057235, - "grad_norm": 4.098764419555664, - "learning_rate": 1.9907604964351733e-05, - "loss": 14.9232, - "mean_token_accuracy": 0.6106652595847845, - "num_tokens": 188774170.0, - "step": 4000 - }, - { - "epoch": 0.5278176390057235, - "eval_entropy": 1.5803278617990546, - "eval_loss": 1.7662431001663208, - "eval_mean_token_accuracy": 0.6280910855415341, - "eval_num_tokens": 188774170.0, - "eval_runtime": 3188.7876, - "eval_samples_per_second": 33.8, - "eval_steps_per_second": 4.225, - "step": 4000 - }, - { - "entropy": 1.8657746087014675, - "epoch": 0.5410130799808666, - "grad_norm": 4.129471778869629, - "learning_rate": 1.9904964351729603e-05, - "loss": 15.1413, - "mean_token_accuracy": 0.6065389148145914, - "num_tokens": 193461030.0, - "step": 4100 - }, - { - "entropy": 1.8531225949525834, - "epoch": 0.5542085209560097, - "grad_norm": 4.1151604652404785, - "learning_rate": 1.9902323739107476e-05, - "loss": 15.0425, - "mean_token_accuracy": 0.60759482935071, - "num_tokens": 198128722.0, - "step": 4200 - }, - { - "entropy": 1.842765960842371, - "epoch": 0.5674039619311528, - "grad_norm": 4.083479404449463, - "learning_rate": 1.9899683126485345e-05, - "loss": 14.9646, - "mean_token_accuracy": 0.6094940543919801, - "num_tokens": 202860373.0, - "step": 4300 - }, - { - "entropy": 1.8503095911443233, - "epoch": 0.5805994029062959, - "grad_norm": 4.04292631149292, - "learning_rate": 1.9897042513863218e-05, - "loss": 15.0189, - "mean_token_accuracy": 0.6088438139855862, - "num_tokens": 207589820.0, - "step": 4400 - }, - { - "entropy": 1.8336368641257286, - "epoch": 0.593794843881439, - "grad_norm": 4.082155704498291, - "learning_rate": 1.989440190124109e-05, - "loss": 14.881, - "mean_token_accuracy": 0.6111773661524057, - "num_tokens": 212317338.0, - "step": 4500 - }, - { - "entropy": 1.8509277951717378, - "epoch": 0.606990284856582, - "grad_norm": 4.0600996017456055, - "learning_rate": 1.989176128861896e-05, - "loss": 15.0153, - "mean_token_accuracy": 0.6084930662810802, - "num_tokens": 217036645.0, - "step": 4600 - }, - { - "entropy": 1.8213860428333282, - "epoch": 0.6201857258317252, - "grad_norm": 4.324210166931152, - "learning_rate": 1.9889120675996833e-05, - "loss": 14.7757, - "mean_token_accuracy": 0.6129432079568505, - "num_tokens": 221721335.0, - "step": 4700 - }, - { - "entropy": 1.8302579675614834, - "epoch": 0.6333811668068682, - "grad_norm": 4.18501091003418, - "learning_rate": 1.9886480063374706e-05, - "loss": 14.8499, - "mean_token_accuracy": 0.610900132805109, - "num_tokens": 226424536.0, - "step": 4800 - }, - { - "entropy": 1.8055568043887615, - "epoch": 0.6465766077820113, - "grad_norm": 4.094173431396484, - "learning_rate": 1.9883839450752575e-05, - "loss": 14.6405, - "mean_token_accuracy": 0.6150890862569213, - "num_tokens": 231108868.0, - "step": 4900 - }, - { - "entropy": 1.818359861969948, - "epoch": 0.6597720487571545, - "grad_norm": 4.1406097412109375, - "learning_rate": 1.9881198838130448e-05, - "loss": 14.7503, - "mean_token_accuracy": 0.6136984185874462, - "num_tokens": 235827953.0, - "step": 5000 - }, - { - "epoch": 0.6597720487571545, - "eval_entropy": 1.5863489991730397, - "eval_loss": 1.7410829067230225, - "eval_mean_token_accuracy": 0.6313715932218462, - "eval_num_tokens": 235827953.0, - "eval_runtime": 3189.3586, - "eval_samples_per_second": 33.794, - "eval_steps_per_second": 4.224, - "step": 5000 - }, - { - "entropy": 1.828395222723484, - "epoch": 0.6729674897322975, - "grad_norm": 4.1239495277404785, - "learning_rate": 1.987855822550832e-05, - "loss": 14.8314, - "mean_token_accuracy": 0.611795287951827, - "num_tokens": 240569274.0, - "step": 5100 - }, - { - "entropy": 1.8127701422572136, - "epoch": 0.6861629307074406, - "grad_norm": 4.038999080657959, - "learning_rate": 1.987591761288619e-05, - "loss": 14.7074, - "mean_token_accuracy": 0.613722600787878, - "num_tokens": 245260313.0, - "step": 5200 - }, - { - "entropy": 1.8250135909020901, - "epoch": 0.6993583716825836, - "grad_norm": 4.045648097991943, - "learning_rate": 1.9873277000264063e-05, - "loss": 14.8054, - "mean_token_accuracy": 0.6120235136896371, - "num_tokens": 249954606.0, - "step": 5300 - }, - { - "entropy": 1.8105684253573417, - "epoch": 0.7125538126577268, - "grad_norm": 3.9636144638061523, - "learning_rate": 1.9870636387641936e-05, - "loss": 14.6757, - "mean_token_accuracy": 0.6151208320260048, - "num_tokens": 254701573.0, - "step": 5400 - }, - { - "entropy": 1.8140485088527203, - "epoch": 0.7257492536328698, - "grad_norm": 3.958618640899658, - "learning_rate": 1.9867995775019805e-05, - "loss": 14.7097, - "mean_token_accuracy": 0.613787483125925, - "num_tokens": 259356993.0, - "step": 5500 - }, - { - "entropy": 1.804336573332548, - "epoch": 0.7389446946080129, - "grad_norm": 3.6845266819000244, - "learning_rate": 1.9865355162397678e-05, - "loss": 14.6348, - "mean_token_accuracy": 0.6151521971076727, - "num_tokens": 264064776.0, - "step": 5600 - }, - { - "entropy": 1.7955251815915108, - "epoch": 0.7521401355831561, - "grad_norm": 4.289984703063965, - "learning_rate": 1.986271454977555e-05, - "loss": 14.5556, - "mean_token_accuracy": 0.6166081204265356, - "num_tokens": 268747372.0, - "step": 5700 - }, - { - "entropy": 1.8180018638074398, - "epoch": 0.7653355765582991, - "grad_norm": 4.030449390411377, - "learning_rate": 1.986007393715342e-05, - "loss": 14.7427, - "mean_token_accuracy": 0.613333948738873, - "num_tokens": 273463250.0, - "step": 5800 - }, - { - "entropy": 1.8108704054355622, - "epoch": 0.7785310175334422, - "grad_norm": 4.096724033355713, - "learning_rate": 1.9857433324531293e-05, - "loss": 14.6724, - "mean_token_accuracy": 0.6149687469750643, - "num_tokens": 278183524.0, - "step": 5900 - }, - { - "entropy": 1.8036722446978093, - "epoch": 0.7917264585085853, - "grad_norm": 4.013925552368164, - "learning_rate": 1.9854792711909166e-05, - "loss": 14.6154, - "mean_token_accuracy": 0.6153397902101279, - "num_tokens": 282897630.0, - "step": 6000 - }, - { - "epoch": 0.7917264585085853, - "eval_entropy": 1.5602015991178377, - "eval_loss": 1.718701958656311, - "eval_mean_token_accuracy": 0.6347915723799314, - "eval_num_tokens": 282897630.0, - "eval_runtime": 3190.0929, - "eval_samples_per_second": 33.786, - "eval_steps_per_second": 4.223, - "step": 6000 - }, - { - "entropy": 1.798267685174942, - "epoch": 0.8049218994837284, - "grad_norm": 3.8725955486297607, - "learning_rate": 1.9852152099287035e-05, - "loss": 14.5751, - "mean_token_accuracy": 0.6157875391095877, - "num_tokens": 287635938.0, - "step": 6100 - }, - { - "entropy": 1.7878125695884228, - "epoch": 0.8181173404588714, - "grad_norm": 4.046728134155273, - "learning_rate": 1.9849511486664908e-05, - "loss": 14.4771, - "mean_token_accuracy": 0.6187311994284391, - "num_tokens": 292290134.0, - "step": 6200 - }, - { - "entropy": 1.8043781124055385, - "epoch": 0.8313127814340145, - "grad_norm": 3.8358700275421143, - "learning_rate": 1.9846870874042777e-05, - "loss": 14.6268, - "mean_token_accuracy": 0.6149816115200519, - "num_tokens": 296943359.0, - "step": 6300 - }, - { - "entropy": 1.808346015959978, - "epoch": 0.8445082224091577, - "grad_norm": 4.048934459686279, - "learning_rate": 1.984423026142065e-05, - "loss": 14.6547, - "mean_token_accuracy": 0.6149462160468101, - "num_tokens": 301696338.0, - "step": 6400 - }, - { - "entropy": 1.7909628981351853, - "epoch": 0.8577036633843007, - "grad_norm": 4.050400733947754, - "learning_rate": 1.9841589648798523e-05, - "loss": 14.5162, - "mean_token_accuracy": 0.6173636147379875, - "num_tokens": 306409757.0, - "step": 6500 - }, - { - "entropy": 1.7796907857060433, - "epoch": 0.8708991043594438, - "grad_norm": 3.80794358253479, - "learning_rate": 1.9838949036176392e-05, - "loss": 14.4154, - "mean_token_accuracy": 0.6189755406975747, - "num_tokens": 311094592.0, - "step": 6600 - }, - { - "entropy": 1.7770866174995898, - "epoch": 0.8840945453345869, - "grad_norm": 4.046851634979248, - "learning_rate": 1.9836308423554265e-05, - "loss": 14.3936, - "mean_token_accuracy": 0.6197400981932879, - "num_tokens": 315777049.0, - "step": 6700 - }, - { - "entropy": 1.7914468431472779, - "epoch": 0.89728998630973, - "grad_norm": 3.747300863265991, - "learning_rate": 1.9833667810932138e-05, - "loss": 14.5052, - "mean_token_accuracy": 0.6180075034499168, - "num_tokens": 320507396.0, - "step": 6800 - }, - { - "entropy": 1.7918499463796616, - "epoch": 0.910485427284873, - "grad_norm": 3.889294147491455, - "learning_rate": 1.9831027198310007e-05, - "loss": 14.5115, - "mean_token_accuracy": 0.6171831817179918, - "num_tokens": 325185054.0, - "step": 6900 - }, - { - "entropy": 1.787924758642912, - "epoch": 0.9236808682600162, - "grad_norm": 3.9466371536254883, - "learning_rate": 1.9828386585687884e-05, - "loss": 14.4799, - "mean_token_accuracy": 0.6181329232081771, - "num_tokens": 329928778.0, - "step": 7000 - }, - { - "epoch": 0.9236808682600162, - "eval_entropy": 1.5404707675060503, - "eval_loss": 1.700563669204712, - "eval_mean_token_accuracy": 0.6374752920914234, - "eval_num_tokens": 329928778.0, - "eval_runtime": 3190.0879, - "eval_samples_per_second": 33.786, - "eval_steps_per_second": 4.223, - "step": 7000 - }, - { - "entropy": 1.7860313929617404, - "epoch": 0.9368763092351593, - "grad_norm": 3.9548611640930176, - "learning_rate": 1.9825745973065753e-05, - "loss": 14.4618, - "mean_token_accuracy": 0.6185688901692629, - "num_tokens": 334641739.0, - "step": 7100 - }, - { - "entropy": 1.7841411991417409, - "epoch": 0.9500717502103023, - "grad_norm": 3.9234402179718018, - "learning_rate": 1.9823105360443622e-05, - "loss": 14.4508, - "mean_token_accuracy": 0.6186311930418015, - "num_tokens": 339338811.0, - "step": 7200 - }, - { - "entropy": 1.7960207970440387, - "epoch": 0.9632671911854455, - "grad_norm": 3.974130392074585, - "learning_rate": 1.9820464747821495e-05, - "loss": 14.552, - "mean_token_accuracy": 0.6171760141849518, - "num_tokens": 344012186.0, - "step": 7300 - }, - { - "entropy": 1.7755667209625243, - "epoch": 0.9764626321605885, - "grad_norm": 3.956102132797241, - "learning_rate": 1.9817824135199368e-05, - "loss": 14.3741, - "mean_token_accuracy": 0.6195540763065219, - "num_tokens": 348731167.0, - "step": 7400 - }, - { - "entropy": 1.7824751836061479, - "epoch": 0.9896580731357316, - "grad_norm": 3.968210220336914, - "learning_rate": 1.981518352257724e-05, - "loss": 14.4282, - "mean_token_accuracy": 0.6188432604074479, - "num_tokens": 353442111.0, - "step": 7500 - }, - { - "entropy": 1.7721347595910606, - "epoch": 1.00277104260478, - "grad_norm": 3.980825185775757, - "learning_rate": 1.981254290995511e-05, - "loss": 14.2534, - "mean_token_accuracy": 0.6206337471428157, - "num_tokens": 358105945.0, - "step": 7600 - }, - { - "entropy": 1.7669402280449866, - "epoch": 1.0159664835799231, - "grad_norm": 3.9514989852905273, - "learning_rate": 1.9809902297332983e-05, - "loss": 14.2841, - "mean_token_accuracy": 0.621123610921204, - "num_tokens": 362858257.0, - "step": 7700 - }, - { - "entropy": 1.764955345094204, - "epoch": 1.0291619245550663, - "grad_norm": 3.9744436740875244, - "learning_rate": 1.9807261684710856e-05, - "loss": 14.2833, - "mean_token_accuracy": 0.6217504210770131, - "num_tokens": 367640143.0, - "step": 7800 - }, - { - "entropy": 1.7551834625005722, - "epoch": 1.0423573655302094, - "grad_norm": 4.042919158935547, - "learning_rate": 1.9804621072088725e-05, - "loss": 14.1949, - "mean_token_accuracy": 0.6225048137456178, - "num_tokens": 372344446.0, - "step": 7900 - }, - { - "entropy": 1.7445960550010204, - "epoch": 1.0555528065053523, - "grad_norm": 4.0731353759765625, - "learning_rate": 1.9801980459466598e-05, - "loss": 14.1036, - "mean_token_accuracy": 0.6247953659668565, - "num_tokens": 377055347.0, - "step": 8000 - }, - { - "epoch": 1.0555528065053523, - "eval_entropy": 1.5357393407745563, - "eval_loss": 1.6831690073013306, - "eval_mean_token_accuracy": 0.6397847914951448, - "eval_num_tokens": 377055347.0, - "eval_runtime": 3188.4458, - "eval_samples_per_second": 33.803, - "eval_steps_per_second": 4.226, - "step": 8000 - }, - { - "entropy": 1.7497526466846467, - "epoch": 1.0687482474804955, - "grad_norm": 4.043092727661133, - "learning_rate": 1.979933984684447e-05, - "loss": 14.1557, - "mean_token_accuracy": 0.6233407002687454, - "num_tokens": 381747520.0, - "step": 8100 - }, - { - "entropy": 1.7488390171527863, - "epoch": 1.0819436884556386, - "grad_norm": 3.92033052444458, - "learning_rate": 1.979669923422234e-05, - "loss": 14.1357, - "mean_token_accuracy": 0.6237702713161707, - "num_tokens": 386500736.0, - "step": 8200 - }, - { - "entropy": 1.746285059452057, - "epoch": 1.0951391294307817, - "grad_norm": 3.9088919162750244, - "learning_rate": 1.9794058621600213e-05, - "loss": 14.1195, - "mean_token_accuracy": 0.6236967007815838, - "num_tokens": 391229707.0, - "step": 8300 - }, - { - "entropy": 1.7552506732940674, - "epoch": 1.1083345704059249, - "grad_norm": 4.078105926513672, - "learning_rate": 1.9791418008978086e-05, - "loss": 14.191, - "mean_token_accuracy": 0.6234133420884609, - "num_tokens": 395919393.0, - "step": 8400 - }, - { - "entropy": 1.7433623734116555, - "epoch": 1.1215300113810678, - "grad_norm": 4.080204963684082, - "learning_rate": 1.9788777396355955e-05, - "loss": 14.0987, - "mean_token_accuracy": 0.6247739800065756, - "num_tokens": 400602316.0, - "step": 8500 - }, - { - "entropy": 1.7413157878816128, - "epoch": 1.134725452356211, - "grad_norm": 4.232221603393555, - "learning_rate": 1.9786136783733828e-05, - "loss": 14.0808, - "mean_token_accuracy": 0.6253204553574324, - "num_tokens": 405317129.0, - "step": 8600 - }, - { - "entropy": 1.7399081835150718, - "epoch": 1.147920893331354, - "grad_norm": 4.332466125488281, - "learning_rate": 1.97834961711117e-05, - "loss": 14.0645, - "mean_token_accuracy": 0.6258289200812578, - "num_tokens": 410076395.0, - "step": 8700 - }, - { - "entropy": 1.749226526170969, - "epoch": 1.1611163343064972, - "grad_norm": 3.86761474609375, - "learning_rate": 1.978085555848957e-05, - "loss": 14.1447, - "mean_token_accuracy": 0.6238241862505675, - "num_tokens": 414801400.0, - "step": 8800 - }, - { - "entropy": 1.7294802324473857, - "epoch": 1.17431177528164, - "grad_norm": 3.7982897758483887, - "learning_rate": 1.9778214945867443e-05, - "loss": 13.991, - "mean_token_accuracy": 0.6271847046166659, - "num_tokens": 419506353.0, - "step": 8900 - }, - { - "entropy": 1.751773677021265, - "epoch": 1.1875072162567832, - "grad_norm": 4.006007671356201, - "learning_rate": 1.9775574333245316e-05, - "loss": 14.1556, - "mean_token_accuracy": 0.6236618124693633, - "num_tokens": 424214730.0, - "step": 9000 - }, - { - "epoch": 1.1875072162567832, - "eval_entropy": 1.5117126926364997, - "eval_loss": 1.669206976890564, - "eval_mean_token_accuracy": 0.642173377870142, - "eval_num_tokens": 424214730.0, - "eval_runtime": 3188.1779, - "eval_samples_per_second": 33.806, - "eval_steps_per_second": 4.226, - "step": 9000 - }, - { - "entropy": 1.7496005721390246, - "epoch": 1.2007026572319264, - "grad_norm": 4.041248798370361, - "learning_rate": 1.9772933720623185e-05, - "loss": 14.1458, - "mean_token_accuracy": 0.623925342336297, - "num_tokens": 428874431.0, - "step": 9100 - }, - { - "entropy": 1.747845853716135, - "epoch": 1.2138980982070695, - "grad_norm": 4.10402774810791, - "learning_rate": 1.9770293108001058e-05, - "loss": 14.1284, - "mean_token_accuracy": 0.6235801701620222, - "num_tokens": 433631785.0, - "step": 9200 - }, - { - "entropy": 1.742924979031086, - "epoch": 1.2270935391822126, - "grad_norm": 3.7367687225341797, - "learning_rate": 1.976765249537893e-05, - "loss": 14.0895, - "mean_token_accuracy": 0.6245103114843369, - "num_tokens": 438341526.0, - "step": 9300 - }, - { - "entropy": 1.741254171282053, - "epoch": 1.2402889801573556, - "grad_norm": 3.969815969467163, - "learning_rate": 1.97650118827568e-05, - "loss": 14.069, - "mean_token_accuracy": 0.6253565014153719, - "num_tokens": 443056206.0, - "step": 9400 - }, - { - "entropy": 1.7313911478221415, - "epoch": 1.2534844211324987, - "grad_norm": 3.8585336208343506, - "learning_rate": 1.9762371270134673e-05, - "loss": 13.996, - "mean_token_accuracy": 0.6264266113936902, - "num_tokens": 447754874.0, - "step": 9500 - }, - { - "entropy": 1.736898885667324, - "epoch": 1.2666798621076418, - "grad_norm": 3.976346254348755, - "learning_rate": 1.9759730657512543e-05, - "loss": 14.0392, - "mean_token_accuracy": 0.6258945613354444, - "num_tokens": 452410741.0, - "step": 9600 - }, - { - "entropy": 1.728874337822199, - "epoch": 1.279875303082785, - "grad_norm": 4.028810501098633, - "learning_rate": 1.9757090044890415e-05, - "loss": 13.9683, - "mean_token_accuracy": 0.6269886953383684, - "num_tokens": 457091317.0, - "step": 9700 - }, - { - "entropy": 1.7307989183068275, - "epoch": 1.2930707440579279, - "grad_norm": 3.926074504852295, - "learning_rate": 1.9754449432268288e-05, - "loss": 13.9844, - "mean_token_accuracy": 0.6272025952115655, - "num_tokens": 461795304.0, - "step": 9800 - }, - { - "entropy": 1.7376534953713416, - "epoch": 1.306266185033071, - "grad_norm": 3.973266363143921, - "learning_rate": 1.9751808819646158e-05, - "loss": 14.0339, - "mean_token_accuracy": 0.6259337517619133, - "num_tokens": 466537252.0, - "step": 9900 - }, - { - "entropy": 1.7124468161165713, - "epoch": 1.3194616260082142, - "grad_norm": 3.9212160110473633, - "learning_rate": 1.974916820702403e-05, - "loss": 13.8356, - "mean_token_accuracy": 0.6298421548306942, - "num_tokens": 471210653.0, - "step": 10000 - }, - { - "epoch": 1.3194616260082142, - "eval_entropy": 1.515975620940738, - "eval_loss": 1.6549092531204224, - "eval_mean_token_accuracy": 0.6441165252658436, - "eval_num_tokens": 471210653.0, - "eval_runtime": 3185.9569, - "eval_samples_per_second": 33.83, - "eval_steps_per_second": 4.229, - "step": 10000 - }, - { - "entropy": 1.7285879038274288, - "epoch": 1.3326570669833573, - "grad_norm": 3.7557201385498047, - "learning_rate": 1.9746527594401903e-05, - "loss": 13.9691, - "mean_token_accuracy": 0.6267551811784506, - "num_tokens": 475934229.0, - "step": 10100 - }, - { - "entropy": 1.7297876067459583, - "epoch": 1.3458525079585004, - "grad_norm": 3.7093920707702637, - "learning_rate": 1.9743886981779773e-05, - "loss": 13.9748, - "mean_token_accuracy": 0.6267426482588053, - "num_tokens": 480609229.0, - "step": 10200 - }, - { - "entropy": 1.7228832334280013, - "epoch": 1.3590479489336436, - "grad_norm": 4.082423210144043, - "learning_rate": 1.974124636915765e-05, - "loss": 13.9133, - "mean_token_accuracy": 0.6282350146025419, - "num_tokens": 485365238.0, - "step": 10300 - }, - { - "entropy": 1.7269348740577697, - "epoch": 1.3722433899087865, - "grad_norm": 3.929724931716919, - "learning_rate": 1.9738605756535518e-05, - "loss": 13.9444, - "mean_token_accuracy": 0.6276592640578746, - "num_tokens": 490070914.0, - "step": 10400 - }, - { - "entropy": 1.720128181874752, - "epoch": 1.3854388308839296, - "grad_norm": 4.017079830169678, - "learning_rate": 1.9735965143913388e-05, - "loss": 13.8927, - "mean_token_accuracy": 0.6279544594511389, - "num_tokens": 494810156.0, - "step": 10500 - }, - { - "entropy": 1.7191148309409618, - "epoch": 1.3986342718590727, - "grad_norm": 3.982664108276367, - "learning_rate": 1.973332453129126e-05, - "loss": 13.8775, - "mean_token_accuracy": 0.6286891888082028, - "num_tokens": 499562228.0, - "step": 10600 - }, - { - "entropy": 1.7097445997595786, - "epoch": 1.4118297128342157, - "grad_norm": 3.8982956409454346, - "learning_rate": 1.9730683918669133e-05, - "loss": 13.8203, - "mean_token_accuracy": 0.6301189444214106, - "num_tokens": 504283158.0, - "step": 10700 - }, - { - "entropy": 1.7238348364830016, - "epoch": 1.4250251538093588, - "grad_norm": 3.908458948135376, - "learning_rate": 1.9728043306047006e-05, - "loss": 13.9223, - "mean_token_accuracy": 0.628321581557393, - "num_tokens": 508970795.0, - "step": 10800 - }, - { - "entropy": 1.7226741972565651, - "epoch": 1.438220594784502, - "grad_norm": 4.095105171203613, - "learning_rate": 1.9725402693424876e-05, - "loss": 13.9157, - "mean_token_accuracy": 0.6280835216119885, - "num_tokens": 513673694.0, - "step": 10900 - }, - { - "entropy": 1.7151854334771632, - "epoch": 1.451416035759645, - "grad_norm": 3.830979108810425, - "learning_rate": 1.972276208080275e-05, - "loss": 13.843, - "mean_token_accuracy": 0.6288192373514175, - "num_tokens": 518400143.0, - "step": 11000 - }, - { - "epoch": 1.451416035759645, - "eval_entropy": 1.4893881395710238, - "eval_loss": 1.6429299116134644, - "eval_mean_token_accuracy": 0.6459804228470001, - "eval_num_tokens": 518400143.0, - "eval_runtime": 3190.9199, - "eval_samples_per_second": 33.777, - "eval_steps_per_second": 4.222, - "step": 11000 - }, - { - "entropy": 1.7264313192665577, - "epoch": 1.4646114767347882, - "grad_norm": 3.997246503829956, - "learning_rate": 1.972012146818062e-05, - "loss": 13.9428, - "mean_token_accuracy": 0.6274989359080791, - "num_tokens": 523125727.0, - "step": 11100 - }, - { - "entropy": 1.7230025473237038, - "epoch": 1.4778069177099313, - "grad_norm": 3.9877476692199707, - "learning_rate": 1.971748085555849e-05, - "loss": 13.9068, - "mean_token_accuracy": 0.6281301632523537, - "num_tokens": 527844030.0, - "step": 11200 - }, - { - "entropy": 1.7104627051949501, - "epoch": 1.4910023586850742, - "grad_norm": 3.9071784019470215, - "learning_rate": 1.9714840242936363e-05, - "loss": 13.8127, - "mean_token_accuracy": 0.6302814479917288, - "num_tokens": 532587219.0, - "step": 11300 - }, - { - "entropy": 1.7233010344207287, - "epoch": 1.5041977996602174, - "grad_norm": 3.970679759979248, - "learning_rate": 1.9712199630314236e-05, - "loss": 13.9115, - "mean_token_accuracy": 0.627718816101551, - "num_tokens": 537335492.0, - "step": 11400 - }, - { - "entropy": 1.7085300183296204, - "epoch": 1.5173932406353605, - "grad_norm": 4.084446907043457, - "learning_rate": 1.9709559017692106e-05, - "loss": 13.7914, - "mean_token_accuracy": 0.6303271735459566, - "num_tokens": 542052225.0, - "step": 11500 - }, - { - "entropy": 1.7150896434485912, - "epoch": 1.5305886816105034, - "grad_norm": 3.8559019565582275, - "learning_rate": 1.970691840506998e-05, - "loss": 13.8508, - "mean_token_accuracy": 0.6289579905569553, - "num_tokens": 546782905.0, - "step": 11600 - }, - { - "entropy": 1.7150465674698352, - "epoch": 1.5437841225856466, - "grad_norm": 4.106844902038574, - "learning_rate": 1.970427779244785e-05, - "loss": 13.844, - "mean_token_accuracy": 0.6292690277844667, - "num_tokens": 551481827.0, - "step": 11700 - }, - { - "entropy": 1.7132095769047737, - "epoch": 1.5569795635607897, - "grad_norm": 4.053258419036865, - "learning_rate": 1.970163717982572e-05, - "loss": 13.8218, - "mean_token_accuracy": 0.629193360954523, - "num_tokens": 556220417.0, - "step": 11800 - }, - { - "entropy": 1.7086699897050857, - "epoch": 1.5701750045359328, - "grad_norm": 3.9460904598236084, - "learning_rate": 1.9698996567203593e-05, - "loss": 13.7941, - "mean_token_accuracy": 0.6298571369051933, - "num_tokens": 560909913.0, - "step": 11900 - }, - { - "entropy": 1.705731320977211, - "epoch": 1.583370445511076, - "grad_norm": 3.9144344329833984, - "learning_rate": 1.9696355954581466e-05, - "loss": 13.7618, - "mean_token_accuracy": 0.6303364527225495, - "num_tokens": 565678902.0, - "step": 12000 - }, - { - "epoch": 1.583370445511076, - "eval_entropy": 1.4800323092272984, - "eval_loss": 1.6302741765975952, - "eval_mean_token_accuracy": 0.6479193049404268, - "eval_num_tokens": 565678902.0, - "eval_runtime": 3188.8622, - "eval_samples_per_second": 33.799, - "eval_steps_per_second": 4.225, - "step": 12000 - }, - { - "entropy": 1.6993986825644969, - "epoch": 1.596565886486219, - "grad_norm": 4.023446559906006, - "learning_rate": 1.9693715341959336e-05, - "loss": 13.7179, - "mean_token_accuracy": 0.6318010853976012, - "num_tokens": 570386334.0, - "step": 12100 - }, - { - "entropy": 1.7054964397847652, - "epoch": 1.6097613274613622, - "grad_norm": 3.808046340942383, - "learning_rate": 1.969107472933721e-05, - "loss": 13.774, - "mean_token_accuracy": 0.630204633101821, - "num_tokens": 575112992.0, - "step": 12200 - }, - { - "entropy": 1.7120344342291356, - "epoch": 1.6229567684365052, - "grad_norm": 3.8330607414245605, - "learning_rate": 1.968843411671508e-05, - "loss": 13.8098, - "mean_token_accuracy": 0.6296659503132105, - "num_tokens": 579849787.0, - "step": 12300 - }, - { - "entropy": 1.7050182285904885, - "epoch": 1.6361522094116483, - "grad_norm": 4.006343364715576, - "learning_rate": 1.968579350409295e-05, - "loss": 13.7528, - "mean_token_accuracy": 0.6311237644404173, - "num_tokens": 584536225.0, - "step": 12400 - }, - { - "entropy": 1.708528604209423, - "epoch": 1.6493476503867912, - "grad_norm": 3.9078450202941895, - "learning_rate": 1.9683152891470823e-05, - "loss": 13.7855, - "mean_token_accuracy": 0.630221213772893, - "num_tokens": 589243850.0, - "step": 12500 - }, - { - "entropy": 1.7126329486072063, - "epoch": 1.6625430913619343, - "grad_norm": 3.8646676540374756, - "learning_rate": 1.9680512278848693e-05, - "loss": 13.814, - "mean_token_accuracy": 0.6300745321810246, - "num_tokens": 593974243.0, - "step": 12600 - }, - { - "entropy": 1.687779471129179, - "epoch": 1.6757385323370775, - "grad_norm": 3.9465432167053223, - "learning_rate": 1.9677871666226566e-05, - "loss": 13.6162, - "mean_token_accuracy": 0.6335779485851526, - "num_tokens": 598706959.0, - "step": 12700 - }, - { - "entropy": 1.7161117048561574, - "epoch": 1.6889339733122206, - "grad_norm": 3.7726762294769287, - "learning_rate": 1.967523105360444e-05, - "loss": 13.8461, - "mean_token_accuracy": 0.6291085375845432, - "num_tokens": 603417327.0, - "step": 12800 - }, - { - "entropy": 1.6877114294469358, - "epoch": 1.7021294142873638, - "grad_norm": 4.004697799682617, - "learning_rate": 1.9672590440982308e-05, - "loss": 13.6098, - "mean_token_accuracy": 0.6341679825633765, - "num_tokens": 608065925.0, - "step": 12900 - }, - { - "entropy": 1.6961762863397598, - "epoch": 1.7153248552625069, - "grad_norm": 3.9190824031829834, - "learning_rate": 1.966994982836018e-05, - "loss": 13.6786, - "mean_token_accuracy": 0.6323697911947965, - "num_tokens": 612761369.0, - "step": 13000 - }, - { - "epoch": 1.7153248552625069, - "eval_entropy": 1.4835227909274298, - "eval_loss": 1.619583010673523, - "eval_mean_token_accuracy": 0.6494705043581045, - "eval_num_tokens": 612761369.0, - "eval_runtime": 3188.0387, - "eval_samples_per_second": 33.808, - "eval_steps_per_second": 4.226, - "step": 13000 - }, - { - "entropy": 1.7056168286502362, - "epoch": 1.72852029623765, - "grad_norm": 4.131747245788574, - "learning_rate": 1.9667309215738053e-05, - "loss": 13.7522, - "mean_token_accuracy": 0.63100875236094, - "num_tokens": 617469480.0, - "step": 13100 - }, - { - "entropy": 1.6937249195575714, - "epoch": 1.741715737212793, - "grad_norm": 4.118540287017822, - "learning_rate": 1.9664668603115923e-05, - "loss": 13.6638, - "mean_token_accuracy": 0.6324852432310581, - "num_tokens": 622126208.0, - "step": 13200 - }, - { - "entropy": 1.6925005520880223, - "epoch": 1.754911178187936, - "grad_norm": 3.863349199295044, - "learning_rate": 1.9662027990493796e-05, - "loss": 13.6551, - "mean_token_accuracy": 0.6328117294609547, - "num_tokens": 626878868.0, - "step": 13300 - }, - { - "entropy": 1.698338780850172, - "epoch": 1.7681066191630792, - "grad_norm": 3.8205785751342773, - "learning_rate": 1.965938737787167e-05, - "loss": 13.6957, - "mean_token_accuracy": 0.6319728682935238, - "num_tokens": 631580180.0, - "step": 13400 - }, - { - "entropy": 1.6914977538585663, - "epoch": 1.7813020601382221, - "grad_norm": 3.8058321475982666, - "learning_rate": 1.9656746765249538e-05, - "loss": 13.6316, - "mean_token_accuracy": 0.6332664381712675, - "num_tokens": 636256941.0, - "step": 13500 - }, - { - "entropy": 1.6875527657568454, - "epoch": 1.7944975011133653, - "grad_norm": 4.034668922424316, - "learning_rate": 1.965410615262741e-05, - "loss": 13.6047, - "mean_token_accuracy": 0.6338882031291724, - "num_tokens": 640921349.0, - "step": 13600 - }, - { - "entropy": 1.6999479295313358, - "epoch": 1.8076929420885084, - "grad_norm": 3.9597856998443604, - "learning_rate": 1.9651465540005284e-05, - "loss": 13.7185, - "mean_token_accuracy": 0.6316749695688486, - "num_tokens": 645663113.0, - "step": 13700 - }, - { - "entropy": 1.6779197818040847, - "epoch": 1.8208883830636515, - "grad_norm": 3.8675427436828613, - "learning_rate": 1.9648824927383153e-05, - "loss": 13.5157, - "mean_token_accuracy": 0.6357244378328324, - "num_tokens": 650319261.0, - "step": 13800 - }, - { - "entropy": 1.6929333385825158, - "epoch": 1.8340838240387947, - "grad_norm": 3.900453805923462, - "learning_rate": 1.9646184314761026e-05, - "loss": 13.6518, - "mean_token_accuracy": 0.6333543327450752, - "num_tokens": 655031705.0, - "step": 13900 - }, - { - "entropy": 1.6835547630488872, - "epoch": 1.8472792650139378, - "grad_norm": 4.208860874176025, - "learning_rate": 1.96435437021389e-05, - "loss": 13.5688, - "mean_token_accuracy": 0.6342528595775366, - "num_tokens": 659779121.0, - "step": 14000 - }, - { - "epoch": 1.8472792650139378, - "eval_entropy": 1.4634111206028781, - "eval_loss": 1.6098047494888306, - "eval_mean_token_accuracy": 0.6510293728890473, - "eval_num_tokens": 659779121.0, - "eval_runtime": 3190.0344, - "eval_samples_per_second": 33.786, - "eval_steps_per_second": 4.223, - "step": 14000 - }, - { - "entropy": 1.6931584388017655, - "epoch": 1.860474705989081, - "grad_norm": 4.122419357299805, - "learning_rate": 1.9640903089516768e-05, - "loss": 13.653, - "mean_token_accuracy": 0.6321510327607394, - "num_tokens": 664482614.0, - "step": 14100 - }, - { - "entropy": 1.6729721108078957, - "epoch": 1.8736701469642238, - "grad_norm": 3.7705953121185303, - "learning_rate": 1.963826247689464e-05, - "loss": 13.4799, - "mean_token_accuracy": 0.6365410851687193, - "num_tokens": 669182529.0, - "step": 14200 - }, - { - "entropy": 1.6909792493283748, - "epoch": 1.886865587939367, - "grad_norm": 3.6213951110839844, - "learning_rate": 1.9635621864272514e-05, - "loss": 13.6385, - "mean_token_accuracy": 0.6330462139099836, - "num_tokens": 673935978.0, - "step": 14300 - }, - { - "entropy": 1.6902420930564404, - "epoch": 1.90006102891451, - "grad_norm": 3.9693639278411865, - "learning_rate": 1.9632981251650386e-05, - "loss": 13.618, - "mean_token_accuracy": 0.6335649444907904, - "num_tokens": 678713087.0, - "step": 14400 - }, - { - "entropy": 1.6775447849929332, - "epoch": 1.913256469889653, - "grad_norm": 3.9338343143463135, - "learning_rate": 1.9630340639028256e-05, - "loss": 13.5224, - "mean_token_accuracy": 0.6356154507398606, - "num_tokens": 683433667.0, - "step": 14500 - }, - { - "entropy": 1.6908086335659027, - "epoch": 1.9264519108647962, - "grad_norm": 4.041861534118652, - "learning_rate": 1.9627700026406125e-05, - "loss": 13.6255, - "mean_token_accuracy": 0.6331023909151554, - "num_tokens": 688149621.0, - "step": 14600 - }, - { - "entropy": 1.6837576559185983, - "epoch": 1.9396473518399393, - "grad_norm": 3.9566867351531982, - "learning_rate": 1.9625059413784e-05, - "loss": 13.5716, - "mean_token_accuracy": 0.6349082486331463, - "num_tokens": 692827568.0, - "step": 14700 - }, - { - "entropy": 1.7002186079323292, - "epoch": 1.9528427928150824, - "grad_norm": 4.085751533508301, - "learning_rate": 1.962241880116187e-05, - "loss": 13.7057, - "mean_token_accuracy": 0.6317524817958474, - "num_tokens": 697465702.0, - "step": 14800 - }, - { - "entropy": 1.673355882167816, - "epoch": 1.9660382337902256, - "grad_norm": 4.124021053314209, - "learning_rate": 1.9619778188539744e-05, - "loss": 13.4842, - "mean_token_accuracy": 0.6359885314106941, - "num_tokens": 702178812.0, - "step": 14900 - }, - { - "entropy": 1.6702920420467853, - "epoch": 1.9792336747653687, - "grad_norm": 3.9790737628936768, - "learning_rate": 1.9617137575917616e-05, - "loss": 13.4491, - "mean_token_accuracy": 0.6364967184513808, - "num_tokens": 706869072.0, - "step": 15000 - }, - { - "epoch": 1.9792336747653687, - "eval_entropy": 1.4559450233017144, - "eval_loss": 1.6001065969467163, - "eval_mean_token_accuracy": 0.6525887808856017, - "eval_num_tokens": 706869072.0, - "eval_runtime": 3188.4337, - "eval_samples_per_second": 33.803, - "eval_steps_per_second": 4.226, - "step": 15000 - }, - { - "entropy": 1.6789049740135669, - "epoch": 1.9924291157405116, - "grad_norm": 3.97013521194458, - "learning_rate": 1.9614496963295486e-05, - "loss": 13.5376, - "mean_token_accuracy": 0.6352450941503048, - "num_tokens": 711564626.0, - "step": 15100 - }, - { - "entropy": 1.6562597664647132, - "epoch": 2.00554208520956, - "grad_norm": 3.7767651081085205, - "learning_rate": 1.961185635067336e-05, - "loss": 13.2346, - "mean_token_accuracy": 0.6391579893400084, - "num_tokens": 716196843.0, - "step": 15200 - }, - { - "entropy": 1.6494072581827641, - "epoch": 2.018737526184703, - "grad_norm": 3.924259662628174, - "learning_rate": 1.960921573805123e-05, - "loss": 13.2641, - "mean_token_accuracy": 0.6396440506726503, - "num_tokens": 720902841.0, - "step": 15300 - }, - { - "entropy": 1.667148039340973, - "epoch": 2.0319329671598463, - "grad_norm": 4.009494304656982, - "learning_rate": 1.96065751254291e-05, - "loss": 13.4176, - "mean_token_accuracy": 0.6364856123179198, - "num_tokens": 725611095.0, - "step": 15400 - }, - { - "entropy": 1.6549566097557544, - "epoch": 2.0451284081349894, - "grad_norm": 3.6249144077301025, - "learning_rate": 1.9603934512806974e-05, - "loss": 13.3186, - "mean_token_accuracy": 0.6393066050112247, - "num_tokens": 730380517.0, - "step": 15500 - }, - { - "entropy": 1.6757390736043454, - "epoch": 2.0583238491101326, - "grad_norm": 4.016038417816162, - "learning_rate": 1.9601293900184843e-05, - "loss": 13.4928, - "mean_token_accuracy": 0.6354907912015915, - "num_tokens": 735121046.0, - "step": 15600 - }, - { - "entropy": 1.6515671475231648, - "epoch": 2.0715192900852757, - "grad_norm": 3.98551607131958, - "learning_rate": 1.9598653287562716e-05, - "loss": 13.2811, - "mean_token_accuracy": 0.6401347954571247, - "num_tokens": 739800256.0, - "step": 15700 - }, - { - "entropy": 1.6661450408399106, - "epoch": 2.084714731060419, - "grad_norm": 3.937788248062134, - "learning_rate": 1.959601267494059e-05, - "loss": 13.408, - "mean_token_accuracy": 0.6374346616864205, - "num_tokens": 744553871.0, - "step": 15800 - }, - { - "entropy": 1.6551994441449642, - "epoch": 2.0979101720355615, - "grad_norm": 3.7203454971313477, - "learning_rate": 1.9593372062318458e-05, - "loss": 13.31, - "mean_token_accuracy": 0.6383191919326783, - "num_tokens": 749255191.0, - "step": 15900 - }, - { - "entropy": 1.651690663099289, - "epoch": 2.1111056130107047, - "grad_norm": 3.984395980834961, - "learning_rate": 1.959073144969633e-05, - "loss": 13.285, - "mean_token_accuracy": 0.6396138309687376, - "num_tokens": 754009683.0, - "step": 16000 - }, - { - "epoch": 2.1111056130107047, - "eval_entropy": 1.4368949367493393, - "eval_loss": 1.5915658473968506, - "eval_mean_token_accuracy": 0.6540027022879364, - "eval_num_tokens": 754009683.0, - "eval_runtime": 3188.3805, - "eval_samples_per_second": 33.804, - "eval_steps_per_second": 4.226, - "step": 16000 - }, - { - "entropy": 1.6593476708233357, - "epoch": 2.124301053985848, - "grad_norm": 3.950589895248413, - "learning_rate": 1.9588090837074204e-05, - "loss": 13.3504, - "mean_token_accuracy": 0.6379156097769737, - "num_tokens": 758750576.0, - "step": 16100 - }, - { - "entropy": 1.643917052000761, - "epoch": 2.137496494960991, - "grad_norm": 3.8415000438690186, - "learning_rate": 1.9585450224452073e-05, - "loss": 13.2282, - "mean_token_accuracy": 0.6402662719786167, - "num_tokens": 763474293.0, - "step": 16200 - }, - { - "entropy": 1.6558144466578961, - "epoch": 2.150691935936134, - "grad_norm": 3.8246536254882812, - "learning_rate": 1.9582809611829946e-05, - "loss": 13.3193, - "mean_token_accuracy": 0.638944916576147, - "num_tokens": 768160609.0, - "step": 16300 - }, - { - "entropy": 1.6544341269135474, - "epoch": 2.163887376911277, - "grad_norm": 4.140397548675537, - "learning_rate": 1.958016899920782e-05, - "loss": 13.3071, - "mean_token_accuracy": 0.6394369124621153, - "num_tokens": 772837344.0, - "step": 16400 - }, - { - "entropy": 1.6510882955789565, - "epoch": 2.1770828178864203, - "grad_norm": 3.7167270183563232, - "learning_rate": 1.9577528386585688e-05, - "loss": 13.279, - "mean_token_accuracy": 0.6400315296649933, - "num_tokens": 777550627.0, - "step": 16500 - }, - { - "entropy": 1.6512632183730602, - "epoch": 2.1902782588615635, - "grad_norm": 4.07649040222168, - "learning_rate": 1.957488777396356e-05, - "loss": 13.2926, - "mean_token_accuracy": 0.6393875291198492, - "num_tokens": 782253834.0, - "step": 16600 - }, - { - "entropy": 1.6434899391233921, - "epoch": 2.2034736998367066, - "grad_norm": 3.8309640884399414, - "learning_rate": 1.9572247161341434e-05, - "loss": 13.2162, - "mean_token_accuracy": 0.6406759959459305, - "num_tokens": 786939754.0, - "step": 16700 - }, - { - "entropy": 1.6499764910340309, - "epoch": 2.2166691408118497, - "grad_norm": 3.879365921020508, - "learning_rate": 1.9569606548719303e-05, - "loss": 13.2715, - "mean_token_accuracy": 0.6398765755444765, - "num_tokens": 791637741.0, - "step": 16800 - }, - { - "entropy": 1.65800940066576, - "epoch": 2.2298645817869924, - "grad_norm": 3.7924554347991943, - "learning_rate": 1.9566965936097176e-05, - "loss": 13.3308, - "mean_token_accuracy": 0.6387289334088564, - "num_tokens": 796360880.0, - "step": 16900 - }, - { - "entropy": 1.6624345737695694, - "epoch": 2.2430600227621356, - "grad_norm": 3.9134092330932617, - "learning_rate": 1.956432532347505e-05, - "loss": 13.371, - "mean_token_accuracy": 0.6376094933599233, - "num_tokens": 801087791.0, - "step": 17000 - }, - { - "epoch": 2.2430600227621356, - "eval_entropy": 1.448110183363174, - "eval_loss": 1.5824671983718872, - "eval_mean_token_accuracy": 0.6552899748779286, - "eval_num_tokens": 801087791.0, - "eval_runtime": 3192.5207, - "eval_samples_per_second": 33.76, - "eval_steps_per_second": 4.22, - "step": 17000 - }, - { - "entropy": 1.6444039134681225, - "epoch": 2.2562554637372787, - "grad_norm": 3.820003032684326, - "learning_rate": 1.9561684710852918e-05, - "loss": 13.2239, - "mean_token_accuracy": 0.6408811850100755, - "num_tokens": 805773247.0, - "step": 17100 - }, - { - "entropy": 1.6364771522581578, - "epoch": 2.269450904712422, - "grad_norm": 3.975039005279541, - "learning_rate": 1.955904409823079e-05, - "loss": 13.1499, - "mean_token_accuracy": 0.6420202821493148, - "num_tokens": 810487263.0, - "step": 17200 - }, - { - "entropy": 1.6440125972032547, - "epoch": 2.282646345687565, - "grad_norm": 3.7972419261932373, - "learning_rate": 1.9556403485608664e-05, - "loss": 13.223, - "mean_token_accuracy": 0.6403986816108227, - "num_tokens": 815192150.0, - "step": 17300 - }, - { - "entropy": 1.6492359913885593, - "epoch": 2.295841786662708, - "grad_norm": 3.869448184967041, - "learning_rate": 1.9553762872986533e-05, - "loss": 13.2591, - "mean_token_accuracy": 0.6401562896370888, - "num_tokens": 819926950.0, - "step": 17400 - }, - { - "entropy": 1.6649620904028415, - "epoch": 2.3090372276378512, - "grad_norm": 3.9279003143310547, - "learning_rate": 1.9551122260364406e-05, - "loss": 13.3943, - "mean_token_accuracy": 0.6380402848124505, - "num_tokens": 824679306.0, - "step": 17500 - }, - { - "entropy": 1.6454954193532467, - "epoch": 2.3222326686129944, - "grad_norm": 3.772763729095459, - "learning_rate": 1.9548481647742275e-05, - "loss": 13.2317, - "mean_token_accuracy": 0.6408380315452814, - "num_tokens": 829391977.0, - "step": 17600 - }, - { - "entropy": 1.6422753143310547, - "epoch": 2.335428109588137, - "grad_norm": 3.8395848274230957, - "learning_rate": 1.954584103512015e-05, - "loss": 13.2048, - "mean_token_accuracy": 0.6405471435189247, - "num_tokens": 834068376.0, - "step": 17700 - }, - { - "entropy": 1.6545028822124004, - "epoch": 2.34862355056328, - "grad_norm": 3.8454055786132812, - "learning_rate": 1.954320042249802e-05, - "loss": 13.3062, - "mean_token_accuracy": 0.6392966616153717, - "num_tokens": 838797046.0, - "step": 17800 - }, - { - "entropy": 1.645525890737772, - "epoch": 2.3618189915384233, - "grad_norm": 3.923624038696289, - "learning_rate": 1.954055980987589e-05, - "loss": 13.2301, - "mean_token_accuracy": 0.6412591298669577, - "num_tokens": 843531964.0, - "step": 17900 - }, - { - "entropy": 1.6482495306432248, - "epoch": 2.3750144325135665, - "grad_norm": 3.7244012355804443, - "learning_rate": 1.9537919197253767e-05, - "loss": 13.2457, - "mean_token_accuracy": 0.6402912633121014, - "num_tokens": 848222945.0, - "step": 18000 - }, - { - "epoch": 2.3750144325135665, - "eval_entropy": 1.4282245099796693, - "eval_loss": 1.5754202604293823, - "eval_mean_token_accuracy": 0.6565356317000374, - "eval_num_tokens": 848222945.0, - "eval_runtime": 3189.9723, - "eval_samples_per_second": 33.787, - "eval_steps_per_second": 4.224, - "step": 18000 - }, - { - "entropy": 1.6553110727667808, - "epoch": 2.3882098734887096, - "grad_norm": 4.023122310638428, - "learning_rate": 1.9535278584631636e-05, - "loss": 13.3239, - "mean_token_accuracy": 0.6388157194852829, - "num_tokens": 852929556.0, - "step": 18100 - }, - { - "entropy": 1.6394376514852047, - "epoch": 2.4014053144638527, - "grad_norm": 3.670243263244629, - "learning_rate": 1.9532637972009505e-05, - "loss": 13.1698, - "mean_token_accuracy": 0.6420335720479489, - "num_tokens": 857624358.0, - "step": 18200 - }, - { - "entropy": 1.6484889774024487, - "epoch": 2.414600755438996, - "grad_norm": 3.8759660720825195, - "learning_rate": 1.952999735938738e-05, - "loss": 13.2519, - "mean_token_accuracy": 0.6404349724948406, - "num_tokens": 862377133.0, - "step": 18300 - }, - { - "entropy": 1.6585754190385342, - "epoch": 2.427796196414139, - "grad_norm": 3.8007171154022217, - "learning_rate": 1.952735674676525e-05, - "loss": 13.3324, - "mean_token_accuracy": 0.6386233323067426, - "num_tokens": 867068101.0, - "step": 18400 - }, - { - "entropy": 1.6433968134224415, - "epoch": 2.440991637389282, - "grad_norm": 3.977482795715332, - "learning_rate": 1.9524716134143124e-05, - "loss": 13.2035, - "mean_token_accuracy": 0.6406733729690314, - "num_tokens": 871781888.0, - "step": 18500 - }, - { - "entropy": 1.6583472032845021, - "epoch": 2.4541870783644253, - "grad_norm": 3.9003212451934814, - "learning_rate": 1.9522075521520993e-05, - "loss": 13.3365, - "mean_token_accuracy": 0.6388550719618797, - "num_tokens": 876490259.0, - "step": 18600 - }, - { - "entropy": 1.6305412173271179, - "epoch": 2.4673825193395684, - "grad_norm": 3.718053102493286, - "learning_rate": 1.9519434908898866e-05, - "loss": 13.1029, - "mean_token_accuracy": 0.6429571820795537, - "num_tokens": 881226173.0, - "step": 18700 - }, - { - "entropy": 1.639412898272276, - "epoch": 2.480577960314711, - "grad_norm": 3.988676071166992, - "learning_rate": 1.951679429627674e-05, - "loss": 13.1736, - "mean_token_accuracy": 0.6418413355201483, - "num_tokens": 885951801.0, - "step": 18800 - }, - { - "entropy": 1.6380822832882405, - "epoch": 2.4937734012898543, - "grad_norm": 4.214244842529297, - "learning_rate": 1.951415368365461e-05, - "loss": 13.1635, - "mean_token_accuracy": 0.6418183808401227, - "num_tokens": 890633366.0, - "step": 18900 - }, - { - "entropy": 1.6225868400931358, - "epoch": 2.5069688422649974, - "grad_norm": 3.6833455562591553, - "learning_rate": 1.951151307103248e-05, - "loss": 13.031, - "mean_token_accuracy": 0.6444745562970638, - "num_tokens": 895348664.0, - "step": 19000 - }, - { - "epoch": 2.5069688422649974, - "eval_entropy": 1.4229462220639948, - "eval_loss": 1.5674341917037964, - "eval_mean_token_accuracy": 0.6577362913928956, - "eval_num_tokens": 895348664.0, - "eval_runtime": 3191.3643, - "eval_samples_per_second": 33.772, - "eval_steps_per_second": 4.222, - "step": 19000 - }, - { - "entropy": 1.6459231120347977, - "epoch": 2.5201642832401405, - "grad_norm": 3.8315622806549072, - "learning_rate": 1.9508872458410354e-05, - "loss": 13.2281, - "mean_token_accuracy": 0.6401839184761048, - "num_tokens": 900054420.0, - "step": 19100 - }, - { - "entropy": 1.6490216328203677, - "epoch": 2.5333597242152837, - "grad_norm": 4.153664588928223, - "learning_rate": 1.9506231845788223e-05, - "loss": 13.2486, - "mean_token_accuracy": 0.6404328163713217, - "num_tokens": 904751166.0, - "step": 19200 - }, - { - "entropy": 1.6497003653645514, - "epoch": 2.546555165190427, - "grad_norm": 4.136670112609863, - "learning_rate": 1.9503591233166096e-05, - "loss": 13.2643, - "mean_token_accuracy": 0.6399475292861462, - "num_tokens": 909440312.0, - "step": 19300 - }, - { - "entropy": 1.6385792715847491, - "epoch": 2.55975060616557, - "grad_norm": 3.998361110687256, - "learning_rate": 1.950095062054397e-05, - "loss": 13.1585, - "mean_token_accuracy": 0.64167800180614, - "num_tokens": 914149150.0, - "step": 19400 - }, - { - "entropy": 1.640668357759714, - "epoch": 2.5729460471407126, - "grad_norm": 3.94272518157959, - "learning_rate": 1.949831000792184e-05, - "loss": 13.193, - "mean_token_accuracy": 0.6411869799345732, - "num_tokens": 918821751.0, - "step": 19500 - }, - { - "entropy": 1.639274080991745, - "epoch": 2.5861414881158558, - "grad_norm": 3.8270695209503174, - "learning_rate": 1.949566939529971e-05, - "loss": 13.1661, - "mean_token_accuracy": 0.6411959240585565, - "num_tokens": 923531575.0, - "step": 19600 - }, - { - "entropy": 1.6339846841990948, - "epoch": 2.599336929090999, - "grad_norm": 3.880585193634033, - "learning_rate": 1.9493028782677584e-05, - "loss": 13.1204, - "mean_token_accuracy": 0.6424805308878422, - "num_tokens": 928308752.0, - "step": 19700 - }, - { - "entropy": 1.6360827976465224, - "epoch": 2.612532370066142, - "grad_norm": 3.80788254737854, - "learning_rate": 1.9490388170055453e-05, - "loss": 13.1472, - "mean_token_accuracy": 0.6421842590346932, - "num_tokens": 933061721.0, - "step": 19800 - }, - { - "entropy": 1.642481252104044, - "epoch": 2.625727811041285, - "grad_norm": 3.922104835510254, - "learning_rate": 1.9487747557433326e-05, - "loss": 13.1925, - "mean_token_accuracy": 0.6409500490874052, - "num_tokens": 937780844.0, - "step": 19900 - }, - { - "entropy": 1.6298034279048443, - "epoch": 2.6389232520164283, - "grad_norm": 3.8405327796936035, - "learning_rate": 1.94851069448112e-05, - "loss": 13.0954, - "mean_token_accuracy": 0.6432888546586036, - "num_tokens": 942511572.0, - "step": 20000 - }, - { - "epoch": 2.6389232520164283, - "eval_entropy": 1.4283716988299866, - "eval_loss": 1.5600693225860596, - "eval_mean_token_accuracy": 0.6588419941908921, - "eval_num_tokens": 942511572.0, - "eval_runtime": 3187.1949, - "eval_samples_per_second": 33.817, - "eval_steps_per_second": 4.227, - "step": 20000 - }, - { - "entropy": 1.650664220750332, - "epoch": 2.6521186929915714, - "grad_norm": 3.9403529167175293, - "learning_rate": 1.948246633218907e-05, - "loss": 13.2577, - "mean_token_accuracy": 0.6399555268138647, - "num_tokens": 947255243.0, - "step": 20100 - }, - { - "entropy": 1.6283037734031678, - "epoch": 2.6653141339667146, - "grad_norm": 3.7040855884552, - "learning_rate": 1.947982571956694e-05, - "loss": 13.0764, - "mean_token_accuracy": 0.6435702281445265, - "num_tokens": 951924178.0, - "step": 20200 - }, - { - "entropy": 1.6288749648630618, - "epoch": 2.6785095749418577, - "grad_norm": 4.053677558898926, - "learning_rate": 1.9477185106944814e-05, - "loss": 13.0763, - "mean_token_accuracy": 0.6431901397556067, - "num_tokens": 956614112.0, - "step": 20300 - }, - { - "entropy": 1.6154142348468303, - "epoch": 2.691705015917001, - "grad_norm": 3.6884868144989014, - "learning_rate": 1.9474544494322683e-05, - "loss": 12.9704, - "mean_token_accuracy": 0.6454039007425308, - "num_tokens": 961310812.0, - "step": 20400 - }, - { - "entropy": 1.6393168839812278, - "epoch": 2.704900456892144, - "grad_norm": 3.920409679412842, - "learning_rate": 1.9471903881700556e-05, - "loss": 13.1667, - "mean_token_accuracy": 0.6419647770375013, - "num_tokens": 966060016.0, - "step": 20500 - }, - { - "entropy": 1.6280412651598453, - "epoch": 2.718095897867287, - "grad_norm": 3.860715627670288, - "learning_rate": 1.946926326907843e-05, - "loss": 13.0693, - "mean_token_accuracy": 0.6434524042159319, - "num_tokens": 970778070.0, - "step": 20600 - }, - { - "entropy": 1.6182962483167649, - "epoch": 2.73129133884243, - "grad_norm": 3.9382476806640625, - "learning_rate": 1.94666226564563e-05, - "loss": 12.9953, - "mean_token_accuracy": 0.645245413929224, - "num_tokens": 975501164.0, - "step": 20700 - }, - { - "entropy": 1.6398981650918723, - "epoch": 2.744486779817573, - "grad_norm": 3.749861240386963, - "learning_rate": 1.946398204383417e-05, - "loss": 13.1652, - "mean_token_accuracy": 0.6417403563112021, - "num_tokens": 980266218.0, - "step": 20800 - }, - { - "entropy": 1.6464379735291004, - "epoch": 2.757682220792716, - "grad_norm": 3.912741184234619, - "learning_rate": 1.946134143121204e-05, - "loss": 13.2189, - "mean_token_accuracy": 0.6404729437828064, - "num_tokens": 984961898.0, - "step": 20900 - }, - { - "entropy": 1.6274100148677826, - "epoch": 2.770877661767859, - "grad_norm": 3.992455005645752, - "learning_rate": 1.9458700818589913e-05, - "loss": 13.0539, - "mean_token_accuracy": 0.6441804407536984, - "num_tokens": 989614043.0, - "step": 21000 - }, - { - "epoch": 2.770877661767859, - "eval_entropy": 1.4158240256813492, - "eval_loss": 1.5523220300674438, - "eval_mean_token_accuracy": 0.6600379936238364, - "eval_num_tokens": 989614043.0, - "eval_runtime": 3189.5795, - "eval_samples_per_second": 33.791, - "eval_steps_per_second": 4.224, - "step": 21000 - }, - { - "entropy": 1.632586480230093, - "epoch": 2.7840731027430023, - "grad_norm": 3.9651315212249756, - "learning_rate": 1.9456060205967786e-05, - "loss": 13.1083, - "mean_token_accuracy": 0.6430674945563077, - "num_tokens": 994328590.0, - "step": 21100 - }, - { - "entropy": 1.6308821719884872, - "epoch": 2.7972685437181455, - "grad_norm": 3.786008834838867, - "learning_rate": 1.9453419593345656e-05, - "loss": 13.091, - "mean_token_accuracy": 0.6433896777033806, - "num_tokens": 999024614.0, - "step": 21200 - }, - { - "entropy": 1.6474685882031919, - "epoch": 2.8104639846932886, - "grad_norm": 3.9114224910736084, - "learning_rate": 1.9450778980723532e-05, - "loss": 13.2318, - "mean_token_accuracy": 0.640612950772047, - "num_tokens": 1003713201.0, - "step": 21300 - }, - { - "entropy": 1.6211960214376449, - "epoch": 2.8236594256684313, - "grad_norm": 4.09506368637085, - "learning_rate": 1.94481383681014e-05, - "loss": 13.0251, - "mean_token_accuracy": 0.6447122542560101, - "num_tokens": 1008443027.0, - "step": 21400 - }, - { - "entropy": 1.6274728824198246, - "epoch": 2.8368548666435744, - "grad_norm": 3.768113136291504, - "learning_rate": 1.944549775547927e-05, - "loss": 13.0694, - "mean_token_accuracy": 0.6439897135645151, - "num_tokens": 1013177678.0, - "step": 21500 - }, - { - "entropy": 1.6317154209315776, - "epoch": 2.8500503076187176, - "grad_norm": 4.080435276031494, - "learning_rate": 1.9442857142857147e-05, - "loss": 13.0882, - "mean_token_accuracy": 0.6439899149537086, - "num_tokens": 1017879507.0, - "step": 21600 - }, - { - "entropy": 1.6208388382196426, - "epoch": 2.8632457485938607, - "grad_norm": 3.988337993621826, - "learning_rate": 1.9440216530235016e-05, - "loss": 13.0154, - "mean_token_accuracy": 0.6451309756934642, - "num_tokens": 1022592377.0, - "step": 21700 - }, - { - "entropy": 1.633671799302101, - "epoch": 2.876441189569004, - "grad_norm": 3.740370988845825, - "learning_rate": 1.943757591761289e-05, - "loss": 13.1097, - "mean_token_accuracy": 0.6425185710191726, - "num_tokens": 1027278637.0, - "step": 21800 - }, - { - "entropy": 1.6209680989384652, - "epoch": 2.889636630544147, - "grad_norm": 3.957094669342041, - "learning_rate": 1.943493530499076e-05, - "loss": 13.0099, - "mean_token_accuracy": 0.6445255218446255, - "num_tokens": 1031978678.0, - "step": 21900 - }, - { - "entropy": 1.6203568048775197, - "epoch": 2.90283207151929, - "grad_norm": 3.7622430324554443, - "learning_rate": 1.943229469236863e-05, - "loss": 13.0027, - "mean_token_accuracy": 0.6453801936656237, - "num_tokens": 1036715064.0, - "step": 22000 - }, - { - "epoch": 2.90283207151929, - "eval_entropy": 1.4137322844946154, - "eval_loss": 1.5457555055618286, - "eval_mean_token_accuracy": 0.6610697363702012, - "eval_num_tokens": 1036715064.0, - "eval_runtime": 3187.9709, - "eval_samples_per_second": 33.808, - "eval_steps_per_second": 4.226, - "step": 22000 - }, - { - "entropy": 1.6192519588023424, - "epoch": 2.9160275124944333, - "grad_norm": 3.8866078853607178, - "learning_rate": 1.9429654079746504e-05, - "loss": 12.99, - "mean_token_accuracy": 0.6450617261230945, - "num_tokens": 1041372665.0, - "step": 22100 - }, - { - "entropy": 1.6254705637693405, - "epoch": 2.9292229534695764, - "grad_norm": 4.082258701324463, - "learning_rate": 1.9427013467124374e-05, - "loss": 13.0384, - "mean_token_accuracy": 0.6447862467169762, - "num_tokens": 1046068792.0, - "step": 22200 - }, - { - "entropy": 1.6164238581061363, - "epoch": 2.9424183944447195, - "grad_norm": 3.86102557182312, - "learning_rate": 1.9424372854502246e-05, - "loss": 12.953, - "mean_token_accuracy": 0.646661720648408, - "num_tokens": 1050838675.0, - "step": 22300 - }, - { - "entropy": 1.6251713410019875, - "epoch": 2.9556138354198627, - "grad_norm": 4.027069568634033, - "learning_rate": 1.942173224188012e-05, - "loss": 13.035, - "mean_token_accuracy": 0.6444545089453458, - "num_tokens": 1055533401.0, - "step": 22400 - }, - { - "entropy": 1.622108271420002, - "epoch": 2.968809276395006, - "grad_norm": 3.6608834266662598, - "learning_rate": 1.941909162925799e-05, - "loss": 13.0153, - "mean_token_accuracy": 0.6446515038982034, - "num_tokens": 1060259488.0, - "step": 22500 - }, - { - "entropy": 1.6189582243561744, - "epoch": 2.9820047173701485, - "grad_norm": 3.8715226650238037, - "learning_rate": 1.941645101663586e-05, - "loss": 12.9811, - "mean_token_accuracy": 0.6453317078202963, - "num_tokens": 1064896187.0, - "step": 22600 - }, - { - "entropy": 1.6309356051683426, - "epoch": 2.9952001583452916, - "grad_norm": 3.7819671630859375, - "learning_rate": 1.9413810404013734e-05, - "loss": 13.0815, - "mean_token_accuracy": 0.643519636541605, - "num_tokens": 1069620808.0, - "step": 22700 - }, - { - "entropy": 1.6086442614501377, - "epoch": 3.0083131278143402, - "grad_norm": 3.8293449878692627, - "learning_rate": 1.9411169791391604e-05, - "loss": 12.8103, - "mean_token_accuracy": 0.6470718562977869, - "num_tokens": 1074292820.0, - "step": 22800 - }, - { - "entropy": 1.5990345920622349, - "epoch": 3.0215085687894834, - "grad_norm": 3.9333574771881104, - "learning_rate": 1.9408529178769476e-05, - "loss": 12.8028, - "mean_token_accuracy": 0.6484330788999796, - "num_tokens": 1079008586.0, - "step": 22900 - }, - { - "entropy": 1.6048170095682144, - "epoch": 3.0347040097646265, - "grad_norm": 4.1680707931518555, - "learning_rate": 1.940588856614735e-05, - "loss": 12.8522, - "mean_token_accuracy": 0.6468084762245416, - "num_tokens": 1083742693.0, - "step": 23000 - }, - { - "epoch": 3.0347040097646265, - "eval_entropy": 1.4036445253712735, - "eval_loss": 1.5400227308273315, - "eval_mean_token_accuracy": 0.6620142995550259, - "eval_num_tokens": 1083742693.0, - "eval_runtime": 3187.7068, - "eval_samples_per_second": 33.811, - "eval_steps_per_second": 4.227, - "step": 23000 - }, - { - "entropy": 1.59215646982193, - "epoch": 3.047899450739769, - "grad_norm": 3.910896062850952, - "learning_rate": 1.940324795352522e-05, - "loss": 12.7527, - "mean_token_accuracy": 0.6494686865061522, - "num_tokens": 1088416491.0, - "step": 23100 - }, - { - "entropy": 1.6077661024034022, - "epoch": 3.0610948917149123, - "grad_norm": 3.82198429107666, - "learning_rate": 1.940060734090309e-05, - "loss": 12.8693, - "mean_token_accuracy": 0.6476485385000705, - "num_tokens": 1093142922.0, - "step": 23200 - }, - { - "entropy": 1.6198658345639705, - "epoch": 3.0742903326900555, - "grad_norm": 3.7349178791046143, - "learning_rate": 1.9397966728280964e-05, - "loss": 12.9709, - "mean_token_accuracy": 0.6451571603119374, - "num_tokens": 1097858791.0, - "step": 23300 - }, - { - "entropy": 1.6040863755345345, - "epoch": 3.0874857736651986, - "grad_norm": 4.026642799377441, - "learning_rate": 1.9395326115658834e-05, - "loss": 12.8509, - "mean_token_accuracy": 0.6470640433579683, - "num_tokens": 1102555946.0, - "step": 23400 - }, - { - "entropy": 1.6048882656544448, - "epoch": 3.1006812146403417, - "grad_norm": 3.827873706817627, - "learning_rate": 1.9392685503036706e-05, - "loss": 12.8447, - "mean_token_accuracy": 0.6481162996590137, - "num_tokens": 1107293137.0, - "step": 23500 - }, - { - "entropy": 1.6029019843041896, - "epoch": 3.113876655615485, - "grad_norm": 4.017215728759766, - "learning_rate": 1.939004489041458e-05, - "loss": 12.843, - "mean_token_accuracy": 0.6474066472053528, - "num_tokens": 1111954662.0, - "step": 23600 - }, - { - "entropy": 1.6040606062114238, - "epoch": 3.127072096590628, - "grad_norm": 3.802905797958374, - "learning_rate": 1.938740427779245e-05, - "loss": 12.84, - "mean_token_accuracy": 0.6483595797419548, - "num_tokens": 1116658744.0, - "step": 23700 - }, - { - "entropy": 1.6007949909567833, - "epoch": 3.140267537565771, - "grad_norm": 3.832118272781372, - "learning_rate": 1.938476366517032e-05, - "loss": 12.8116, - "mean_token_accuracy": 0.6484786373376846, - "num_tokens": 1121381338.0, - "step": 23800 - }, - { - "entropy": 1.603892664760351, - "epoch": 3.1534629785409143, - "grad_norm": 3.7704367637634277, - "learning_rate": 1.938212305254819e-05, - "loss": 12.8438, - "mean_token_accuracy": 0.6480603955686093, - "num_tokens": 1126116413.0, - "step": 23900 - }, - { - "entropy": 1.6137782764434814, - "epoch": 3.1666584195160574, - "grad_norm": 3.97542142868042, - "learning_rate": 1.9379482439926064e-05, - "loss": 12.9155, - "mean_token_accuracy": 0.6462130547314883, - "num_tokens": 1130879241.0, - "step": 24000 - }, - { - "epoch": 3.1666584195160574, - "eval_entropy": 1.409310864803211, - "eval_loss": 1.532834768295288, - "eval_mean_token_accuracy": 0.6631031635829886, - "eval_num_tokens": 1130879241.0, - "eval_runtime": 3194.5415, - "eval_samples_per_second": 33.739, - "eval_steps_per_second": 4.218, - "step": 24000 - }, - { - "entropy": 1.603936430066824, - "epoch": 3.1798538604912, - "grad_norm": 3.6425111293792725, - "learning_rate": 1.9376841827303937e-05, - "loss": 12.8439, - "mean_token_accuracy": 0.6481852814555168, - "num_tokens": 1135605204.0, - "step": 24100 - }, - { - "entropy": 1.5988010117411613, - "epoch": 3.1930493014663432, - "grad_norm": 3.9451022148132324, - "learning_rate": 1.9374201214681806e-05, - "loss": 12.8002, - "mean_token_accuracy": 0.6487304736673832, - "num_tokens": 1140319680.0, - "step": 24200 - }, - { - "entropy": 1.607831762433052, - "epoch": 3.2062447424414864, - "grad_norm": 3.7540087699890137, - "learning_rate": 1.937156060205968e-05, - "loss": 12.8768, - "mean_token_accuracy": 0.6474707532674074, - "num_tokens": 1145017019.0, - "step": 24300 - }, - { - "entropy": 1.6050450451672078, - "epoch": 3.2194401834166295, - "grad_norm": 3.8583192825317383, - "learning_rate": 1.936891998943755e-05, - "loss": 12.8607, - "mean_token_accuracy": 0.6476561278104782, - "num_tokens": 1149667952.0, - "step": 24400 - }, - { - "entropy": 1.6129796238243579, - "epoch": 3.2326356243917727, - "grad_norm": 3.7425315380096436, - "learning_rate": 1.936627937681542e-05, - "loss": 12.912, - "mean_token_accuracy": 0.6465145403146744, - "num_tokens": 1154393799.0, - "step": 24500 - }, - { - "entropy": 1.6006788285076619, - "epoch": 3.245831065366916, - "grad_norm": 3.9942708015441895, - "learning_rate": 1.9363638764193297e-05, - "loss": 12.8184, - "mean_token_accuracy": 0.6481123934686184, - "num_tokens": 1159132842.0, - "step": 24600 - }, - { - "entropy": 1.5836338178813458, - "epoch": 3.259026506342059, - "grad_norm": 3.8745949268341064, - "learning_rate": 1.9360998151571167e-05, - "loss": 12.6762, - "mean_token_accuracy": 0.6513490001112223, - "num_tokens": 1163839466.0, - "step": 24700 - }, - { - "entropy": 1.5958380722999572, - "epoch": 3.272221947317202, - "grad_norm": 3.5539045333862305, - "learning_rate": 1.9358357538949036e-05, - "loss": 12.7741, - "mean_token_accuracy": 0.6492387424409389, - "num_tokens": 1168609132.0, - "step": 24800 - }, - { - "entropy": 1.6030096143484116, - "epoch": 3.2854173882923448, - "grad_norm": 3.7582950592041016, - "learning_rate": 1.935571692632691e-05, - "loss": 12.8404, - "mean_token_accuracy": 0.6481160232424736, - "num_tokens": 1173311250.0, - "step": 24900 - }, - { - "entropy": 1.5964187413454056, - "epoch": 3.298612829267488, - "grad_norm": 3.8397164344787598, - "learning_rate": 1.935307631370478e-05, - "loss": 12.7728, - "mean_token_accuracy": 0.6495844420790672, - "num_tokens": 1178028693.0, - "step": 25000 - }, - { - "epoch": 3.298612829267488, - "eval_entropy": 1.4047100693374543, - "eval_loss": 1.5277026891708374, - "eval_mean_token_accuracy": 0.6640447715511214, - "eval_num_tokens": 1178028693.0, - "eval_runtime": 3194.6362, - "eval_samples_per_second": 33.738, - "eval_steps_per_second": 4.217, - "step": 25000 - }, - { - "entropy": 1.5998302234709263, - "epoch": 3.311808270242631, - "grad_norm": 3.663982391357422, - "learning_rate": 1.935043570108265e-05, - "loss": 12.8137, - "mean_token_accuracy": 0.6484686867892742, - "num_tokens": 1182755467.0, - "step": 25100 - }, - { - "entropy": 1.5965089382231235, - "epoch": 3.325003711217774, - "grad_norm": 3.9860620498657227, - "learning_rate": 1.9347795088460524e-05, - "loss": 12.7805, - "mean_token_accuracy": 0.6490729383379221, - "num_tokens": 1187484128.0, - "step": 25200 - }, - { - "entropy": 1.5897659668326378, - "epoch": 3.3381991521929173, - "grad_norm": 3.856250286102295, - "learning_rate": 1.9345154475838397e-05, - "loss": 12.7224, - "mean_token_accuracy": 0.6499618509411812, - "num_tokens": 1192183136.0, - "step": 25300 - }, - { - "entropy": 1.5996447187662124, - "epoch": 3.3513945931680604, - "grad_norm": 3.984705686569214, - "learning_rate": 1.934251386321627e-05, - "loss": 12.7979, - "mean_token_accuracy": 0.6491198971122504, - "num_tokens": 1196911691.0, - "step": 25400 - }, - { - "entropy": 1.5892897661030292, - "epoch": 3.3645900341432036, - "grad_norm": 3.9916419982910156, - "learning_rate": 1.933987325059414e-05, - "loss": 12.7189, - "mean_token_accuracy": 0.6509596475213766, - "num_tokens": 1201632490.0, - "step": 25500 - }, - { - "entropy": 1.599773357063532, - "epoch": 3.3777854751183467, - "grad_norm": 4.092468738555908, - "learning_rate": 1.933723263797201e-05, - "loss": 12.7999, - "mean_token_accuracy": 0.6487147487699986, - "num_tokens": 1206369149.0, - "step": 25600 - }, - { - "entropy": 1.5881148573756219, - "epoch": 3.39098091609349, - "grad_norm": 3.912086248397827, - "learning_rate": 1.9334592025349884e-05, - "loss": 12.7128, - "mean_token_accuracy": 0.6499782233685255, - "num_tokens": 1211082760.0, - "step": 25700 - }, - { - "entropy": 1.594959545582533, - "epoch": 3.404176357068633, - "grad_norm": 4.231387138366699, - "learning_rate": 1.9331951412727754e-05, - "loss": 12.758, - "mean_token_accuracy": 0.6493679398298263, - "num_tokens": 1215823737.0, - "step": 25800 - }, - { - "entropy": 1.5961143529415132, - "epoch": 3.417371798043776, - "grad_norm": 3.8411481380462646, - "learning_rate": 1.9329310800105627e-05, - "loss": 12.7719, - "mean_token_accuracy": 0.6493826608359814, - "num_tokens": 1220532274.0, - "step": 25900 - }, - { - "entropy": 1.6002527132630349, - "epoch": 3.430567239018919, - "grad_norm": 3.925577163696289, - "learning_rate": 1.93266701874835e-05, - "loss": 12.8023, - "mean_token_accuracy": 0.6485183215141297, - "num_tokens": 1225247176.0, - "step": 26000 - }, - { - "epoch": 3.430567239018919, - "eval_entropy": 1.3897053414566838, - "eval_loss": 1.5220413208007812, - "eval_mean_token_accuracy": 0.6650882579387893, - "eval_num_tokens": 1225247176.0, - "eval_runtime": 3189.1636, - "eval_samples_per_second": 33.796, - "eval_steps_per_second": 4.225, - "step": 26000 - }, - { - "entropy": 1.590363145917654, - "epoch": 3.443762679994062, - "grad_norm": 4.163620948791504, - "learning_rate": 1.932402957486137e-05, - "loss": 12.7238, - "mean_token_accuracy": 0.6503931378573179, - "num_tokens": 1229952948.0, - "step": 26100 - }, - { - "entropy": 1.5977550886571408, - "epoch": 3.456958120969205, - "grad_norm": 3.7180027961730957, - "learning_rate": 1.932138896223924e-05, - "loss": 12.7795, - "mean_token_accuracy": 0.6490350770950317, - "num_tokens": 1234708040.0, - "step": 26200 - }, - { - "entropy": 1.5859640747308732, - "epoch": 3.470153561944348, - "grad_norm": 5.798464298248291, - "learning_rate": 1.9318748349617114e-05, - "loss": 12.6806, - "mean_token_accuracy": 0.6516380459070206, - "num_tokens": 1239398688.0, - "step": 26300 - }, - { - "entropy": 1.5860621571540832, - "epoch": 3.4833490029194913, - "grad_norm": 3.894265651702881, - "learning_rate": 1.9316107736994984e-05, - "loss": 12.6903, - "mean_token_accuracy": 0.6511745493113995, - "num_tokens": 1244100384.0, - "step": 26400 - }, - { - "entropy": 1.5941654224693775, - "epoch": 3.4965444438946345, - "grad_norm": 3.791640520095825, - "learning_rate": 1.9313467124372857e-05, - "loss": 12.7507, - "mean_token_accuracy": 0.6497996034473181, - "num_tokens": 1248767346.0, - "step": 26500 - }, - { - "entropy": 1.5958171512186528, - "epoch": 3.5097398848697776, - "grad_norm": 4.074944496154785, - "learning_rate": 1.931082651175073e-05, - "loss": 12.7584, - "mean_token_accuracy": 0.649353106468916, - "num_tokens": 1253505675.0, - "step": 26600 - }, - { - "entropy": 1.5861407789587973, - "epoch": 3.5229353258449203, - "grad_norm": 3.7499473094940186, - "learning_rate": 1.93081858991286e-05, - "loss": 12.6829, - "mean_token_accuracy": 0.6514942896366119, - "num_tokens": 1258210555.0, - "step": 26700 - }, - { - "entropy": 1.5875330168008803, - "epoch": 3.5361307668200634, - "grad_norm": 3.857792854309082, - "learning_rate": 1.9305545286506472e-05, - "loss": 12.7021, - "mean_token_accuracy": 0.6506117970496416, - "num_tokens": 1262969138.0, - "step": 26800 - }, - { - "entropy": 1.5890568955242634, - "epoch": 3.5493262077952066, - "grad_norm": 3.789581537246704, - "learning_rate": 1.930290467388434e-05, - "loss": 12.7028, - "mean_token_accuracy": 0.6507829879224301, - "num_tokens": 1267718836.0, - "step": 26900 - }, - { - "entropy": 1.5945152980089188, - "epoch": 3.5625216487703497, - "grad_norm": 3.9242899417877197, - "learning_rate": 1.9300264061262214e-05, - "loss": 12.7496, - "mean_token_accuracy": 0.6498741636425257, - "num_tokens": 1272416713.0, - "step": 27000 - }, - { - "epoch": 3.5625216487703497, - "eval_entropy": 1.3967469545008124, - "eval_loss": 1.5160529613494873, - "eval_mean_token_accuracy": 0.6659562944003312, - "eval_num_tokens": 1272416713.0, - "eval_runtime": 3190.1496, - "eval_samples_per_second": 33.785, - "eval_steps_per_second": 4.223, - "step": 27000 - }, - { - "entropy": 1.5936158713698387, - "epoch": 3.575717089745493, - "grad_norm": 3.866844654083252, - "learning_rate": 1.9297623448640087e-05, - "loss": 12.7489, - "mean_token_accuracy": 0.6495719265192748, - "num_tokens": 1277143213.0, - "step": 27100 - }, - { - "entropy": 1.5857186970114707, - "epoch": 3.588912530720636, - "grad_norm": 3.7745931148529053, - "learning_rate": 1.9294982836017956e-05, - "loss": 12.6741, - "mean_token_accuracy": 0.6512483295798301, - "num_tokens": 1281830839.0, - "step": 27200 - }, - { - "entropy": 1.5887311869859695, - "epoch": 3.602107971695779, - "grad_norm": 3.8768470287323, - "learning_rate": 1.929234222339583e-05, - "loss": 12.7105, - "mean_token_accuracy": 0.6507860495895147, - "num_tokens": 1286521666.0, - "step": 27300 - }, - { - "entropy": 1.5972675527632236, - "epoch": 3.6153034126709223, - "grad_norm": 4.008309841156006, - "learning_rate": 1.9289701610773702e-05, - "loss": 12.7788, - "mean_token_accuracy": 0.648838073015213, - "num_tokens": 1291216900.0, - "step": 27400 - }, - { - "entropy": 1.5907193027436732, - "epoch": 3.6284988536460654, - "grad_norm": 3.7738773822784424, - "learning_rate": 1.928706099815157e-05, - "loss": 12.7136, - "mean_token_accuracy": 0.6508843255788088, - "num_tokens": 1295939930.0, - "step": 27500 - }, - { - "entropy": 1.5870854687690734, - "epoch": 3.6416942946212085, - "grad_norm": 4.030341625213623, - "learning_rate": 1.9284420385529444e-05, - "loss": 12.6917, - "mean_token_accuracy": 0.6508548408001661, - "num_tokens": 1300586613.0, - "step": 27600 - }, - { - "entropy": 1.5814402961730958, - "epoch": 3.6548897355963517, - "grad_norm": 3.9914443492889404, - "learning_rate": 1.9281779772907317e-05, - "loss": 12.647, - "mean_token_accuracy": 0.6520128079503774, - "num_tokens": 1305252391.0, - "step": 27700 - }, - { - "entropy": 1.5867080081999303, - "epoch": 3.668085176571495, - "grad_norm": 3.799436330795288, - "learning_rate": 1.9279139160285186e-05, - "loss": 12.6798, - "mean_token_accuracy": 0.6509626308828592, - "num_tokens": 1310026702.0, - "step": 27800 - }, - { - "entropy": 1.5863153117895126, - "epoch": 3.6812806175466375, - "grad_norm": 3.7359097003936768, - "learning_rate": 1.927649854766306e-05, - "loss": 12.6794, - "mean_token_accuracy": 0.6507589789479971, - "num_tokens": 1314753404.0, - "step": 27900 - }, - { - "entropy": 1.5910515576601028, - "epoch": 3.6944760585217806, - "grad_norm": 3.9891273975372314, - "learning_rate": 1.9273857935040932e-05, - "loss": 12.7211, - "mean_token_accuracy": 0.6505811680108309, - "num_tokens": 1319464143.0, - "step": 28000 - }, - { - "epoch": 3.6944760585217806, - "eval_entropy": 1.383818113944547, - "eval_loss": 1.5120134353637695, - "eval_mean_token_accuracy": 0.6666769426967735, - "eval_num_tokens": 1319464143.0, - "eval_runtime": 3188.8443, - "eval_samples_per_second": 33.799, - "eval_steps_per_second": 4.225, - "step": 28000 - }, - { - "entropy": 1.5939770445227623, - "epoch": 3.7076714994969238, - "grad_norm": 3.8743128776550293, - "learning_rate": 1.92712173224188e-05, - "loss": 12.7487, - "mean_token_accuracy": 0.6503194896131754, - "num_tokens": 1324177416.0, - "step": 28100 - }, - { - "entropy": 1.5899296332895756, - "epoch": 3.720866940472067, - "grad_norm": 4.185421466827393, - "learning_rate": 1.9268576709796674e-05, - "loss": 12.7073, - "mean_token_accuracy": 0.6508678549528122, - "num_tokens": 1328870314.0, - "step": 28200 - }, - { - "entropy": 1.5931289853900672, - "epoch": 3.73406238144721, - "grad_norm": 4.025436878204346, - "learning_rate": 1.9265936097174547e-05, - "loss": 12.7334, - "mean_token_accuracy": 0.650798460394144, - "num_tokens": 1333603965.0, - "step": 28300 - }, - { - "entropy": 1.579432168751955, - "epoch": 3.747257822422353, - "grad_norm": 3.8230814933776855, - "learning_rate": 1.9263295484552416e-05, - "loss": 12.6252, - "mean_token_accuracy": 0.6523202404379844, - "num_tokens": 1338353598.0, - "step": 28400 - }, - { - "entropy": 1.5902026624977588, - "epoch": 3.7604532633974963, - "grad_norm": 3.726195812225342, - "learning_rate": 1.926065487193029e-05, - "loss": 12.7178, - "mean_token_accuracy": 0.6507448834180832, - "num_tokens": 1343045600.0, - "step": 28500 - }, - { - "entropy": 1.5953569206595422, - "epoch": 3.773648704372639, - "grad_norm": 4.179470539093018, - "learning_rate": 1.9258014259308162e-05, - "loss": 12.756, - "mean_token_accuracy": 0.6497294913977385, - "num_tokens": 1347765894.0, - "step": 28600 - }, - { - "entropy": 1.5810581628233193, - "epoch": 3.786844145347782, - "grad_norm": 4.100180625915527, - "learning_rate": 1.9255373646686035e-05, - "loss": 12.6317, - "mean_token_accuracy": 0.6522324250638485, - "num_tokens": 1352440589.0, - "step": 28700 - }, - { - "entropy": 1.588026827275753, - "epoch": 3.8000395863229253, - "grad_norm": 3.9760444164276123, - "learning_rate": 1.9252733034063904e-05, - "loss": 12.6961, - "mean_token_accuracy": 0.6506867495179176, - "num_tokens": 1357132028.0, - "step": 28800 - }, - { - "entropy": 1.5934601209312678, - "epoch": 3.8132350272980684, - "grad_norm": 3.8777272701263428, - "learning_rate": 1.9250092421441773e-05, - "loss": 12.7266, - "mean_token_accuracy": 0.6502763725072146, - "num_tokens": 1361826888.0, - "step": 28900 - }, - { - "entropy": 1.5815143549442292, - "epoch": 3.8264304682732115, - "grad_norm": 3.8989417552948, - "learning_rate": 1.924745180881965e-05, - "loss": 12.6365, - "mean_token_accuracy": 0.6515141806006431, - "num_tokens": 1366504824.0, - "step": 29000 - }, - { - "epoch": 3.8264304682732115, - "eval_entropy": 1.3917201210933459, - "eval_loss": 1.5054519176483154, - "eval_mean_token_accuracy": 0.6676328241900312, - "eval_num_tokens": 1366504824.0, - "eval_runtime": 3191.917, - "eval_samples_per_second": 33.767, - "eval_steps_per_second": 4.221, - "step": 29000 - }, - { - "entropy": 1.585688829421997, - "epoch": 3.8396259092483547, - "grad_norm": 3.791611671447754, - "learning_rate": 1.924481119619752e-05, - "loss": 12.6651, - "mean_token_accuracy": 0.6511684814840555, - "num_tokens": 1371224055.0, - "step": 29100 - }, - { - "entropy": 1.564525719434023, - "epoch": 3.852821350223498, - "grad_norm": 3.8431272506713867, - "learning_rate": 1.9242170583575392e-05, - "loss": 12.4972, - "mean_token_accuracy": 0.6550224151462316, - "num_tokens": 1375950143.0, - "step": 29200 - }, - { - "entropy": 1.5668389943242074, - "epoch": 3.866016791198641, - "grad_norm": 3.915454149246216, - "learning_rate": 1.9239529970953265e-05, - "loss": 12.5206, - "mean_token_accuracy": 0.6548665834218264, - "num_tokens": 1380590024.0, - "step": 29300 - }, - { - "entropy": 1.5666982433199883, - "epoch": 3.879212232173784, - "grad_norm": 3.8743038177490234, - "learning_rate": 1.9236889358331134e-05, - "loss": 12.5167, - "mean_token_accuracy": 0.654480542242527, - "num_tokens": 1385311696.0, - "step": 29400 - }, - { - "entropy": 1.5765258029848337, - "epoch": 3.892407673148927, - "grad_norm": 3.8786461353302, - "learning_rate": 1.9234248745709007e-05, - "loss": 12.5871, - "mean_token_accuracy": 0.6527524341642856, - "num_tokens": 1390063828.0, - "step": 29500 - }, - { - "entropy": 1.5753777919709682, - "epoch": 3.9056031141240704, - "grad_norm": 3.727337598800659, - "learning_rate": 1.923160813308688e-05, - "loss": 12.5866, - "mean_token_accuracy": 0.6533032587170601, - "num_tokens": 1394744367.0, - "step": 29600 - }, - { - "entropy": 1.572633812725544, - "epoch": 3.9187985550992135, - "grad_norm": 3.863529920578003, - "learning_rate": 1.922896752046475e-05, - "loss": 12.563, - "mean_token_accuracy": 0.6538976988196373, - "num_tokens": 1399475298.0, - "step": 29700 - }, - { - "entropy": 1.576025907844305, - "epoch": 3.931993996074356, - "grad_norm": 3.851825714111328, - "learning_rate": 1.9226326907842622e-05, - "loss": 12.5812, - "mean_token_accuracy": 0.6533911099284887, - "num_tokens": 1404131763.0, - "step": 29800 - }, - { - "entropy": 1.5805292607098818, - "epoch": 3.9451894370494993, - "grad_norm": 3.8343303203582764, - "learning_rate": 1.922368629522049e-05, - "loss": 12.6123, - "mean_token_accuracy": 0.6528911211341619, - "num_tokens": 1408894509.0, - "step": 29900 - }, - { - "entropy": 1.5806397613883019, - "epoch": 3.9583848780246425, - "grad_norm": 3.8559279441833496, - "learning_rate": 1.9221045682598364e-05, - "loss": 12.6316, - "mean_token_accuracy": 0.6520166169852019, - "num_tokens": 1413596551.0, - "step": 30000 - }, - { - "epoch": 3.9583848780246425, - "eval_entropy": 1.3611459893354956, - "eval_loss": 1.5021542310714722, - "eval_mean_token_accuracy": 0.6685199348280462, - "eval_num_tokens": 1413596551.0, - "eval_runtime": 3405.6838, - "eval_samples_per_second": 31.647, - "eval_steps_per_second": 3.956, - "step": 30000 - } - ], - "logging_steps": 100, - "max_steps": 757900, - "num_input_tokens_seen": 0, - "num_train_epochs": 100, - "save_steps": 1000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 4.4113451599817784e+18, - "train_batch_size": 16, - "trial_name": null, - "trial_params": null -}