diff --git "a/trainer_state (2).json" "b/trainer_state (2).json" new file mode 100644--- /dev/null +++ "b/trainer_state (2).json" @@ -0,0 +1,5154 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 512, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 3.246363878250122, + "epoch": 0.015625, + "grad_norm": 1.8967702388763428, + "learning_rate": 0.0003, + "loss": 3.182457685470581, + "mean_token_accuracy": 0.5472440719604492, + "num_tokens": 512.0, + "step": 1 + }, + { + "entropy": 5.435122966766357, + "epoch": 0.03125, + "grad_norm": 12.227993965148926, + "learning_rate": 0.0002999971762923901, + "loss": 3.6484947204589844, + "mean_token_accuracy": 0.5498008131980896, + "num_tokens": 1024.0, + "step": 2 + }, + { + "entropy": 3.1172115802764893, + "epoch": 0.046875, + "grad_norm": 5.236744403839111, + "learning_rate": 0.00029998870527587167, + "loss": 3.2157859802246094, + "mean_token_accuracy": 0.5445343852043152, + "num_tokens": 1536.0, + "step": 3 + }, + { + "entropy": 2.616081714630127, + "epoch": 0.0625, + "grad_norm": 5.9123759269714355, + "learning_rate": 0.0002999745872693735, + "loss": 3.129127264022827, + "mean_token_accuracy": 0.5597609281539917, + "num_tokens": 2048.0, + "step": 4 + }, + { + "entropy": 2.709519386291504, + "epoch": 0.078125, + "grad_norm": 6.461076736450195, + "learning_rate": 0.0002999548228044306, + "loss": 3.373767852783203, + "mean_token_accuracy": 0.5413385629653931, + "num_tokens": 2560.0, + "step": 5 + }, + { + "entropy": 3.1329281330108643, + "epoch": 0.09375, + "grad_norm": 3.9384171962738037, + "learning_rate": 0.00029992941262516396, + "loss": 2.912034034729004, + "mean_token_accuracy": 0.5628865957260132, + "num_tokens": 3072.0, + "step": 6 + }, + { + "entropy": 3.3683834075927734, + "epoch": 0.109375, + "grad_norm": 18.561269760131836, + "learning_rate": 0.0002998983576882524, + "loss": 3.3030648231506348, + "mean_token_accuracy": 0.40442654490470886, + "num_tokens": 3584.0, + "step": 7 + }, + { + "entropy": 3.167931079864502, + "epoch": 0.125, + "grad_norm": 16.356603622436523, + "learning_rate": 0.0002998616591628968, + "loss": 3.1349599361419678, + "mean_token_accuracy": 0.48065173625946045, + "num_tokens": 4096.0, + "step": 8 + }, + { + "entropy": 2.8782107830047607, + "epoch": 0.140625, + "grad_norm": 2.5236713886260986, + "learning_rate": 0.00029981931843077583, + "loss": 2.7147700786590576, + "mean_token_accuracy": 0.6140725016593933, + "num_tokens": 4608.0, + "step": 9 + }, + { + "entropy": 2.622675657272339, + "epoch": 0.15625, + "grad_norm": 11.60867691040039, + "learning_rate": 0.0002997713370859942, + "loss": 3.0942561626434326, + "mean_token_accuracy": 0.57485032081604, + "num_tokens": 5120.0, + "step": 10 + }, + { + "entropy": 2.784639835357666, + "epoch": 0.171875, + "grad_norm": 8.239684104919434, + "learning_rate": 0.0002997177169350223, + "loss": 3.309100866317749, + "mean_token_accuracy": 0.5496957302093506, + "num_tokens": 5632.0, + "step": 11 + }, + { + "entropy": 3.0426368713378906, + "epoch": 0.1875, + "grad_norm": 6.443057537078857, + "learning_rate": 0.00029965845999662874, + "loss": 3.346451759338379, + "mean_token_accuracy": 0.5492125749588013, + "num_tokens": 6144.0, + "step": 12 + }, + { + "entropy": 3.314704656600952, + "epoch": 0.203125, + "grad_norm": 3.021179676055908, + "learning_rate": 0.0002995935685018035, + "loss": 3.1350901126861572, + "mean_token_accuracy": 0.5705645084381104, + "num_tokens": 6656.0, + "step": 13 + }, + { + "entropy": 3.367347002029419, + "epoch": 0.21875, + "grad_norm": 6.598303318023682, + "learning_rate": 0.0002995230448936748, + "loss": 3.036130428314209, + "mean_token_accuracy": 0.5705645084381104, + "num_tokens": 7168.0, + "step": 14 + }, + { + "entropy": 3.309898614883423, + "epoch": 0.234375, + "grad_norm": 9.488039016723633, + "learning_rate": 0.00029944689182741664, + "loss": 3.060746431350708, + "mean_token_accuracy": 0.5546558499336243, + "num_tokens": 7680.0, + "step": 15 + }, + { + "entropy": 3.3201732635498047, + "epoch": 0.25, + "grad_norm": 6.782240390777588, + "learning_rate": 0.00029936511217014893, + "loss": 3.138178586959839, + "mean_token_accuracy": 0.5295275449752808, + "num_tokens": 8192.0, + "step": 16 + }, + { + "entropy": 3.141629934310913, + "epoch": 0.265625, + "grad_norm": 2.4261715412139893, + "learning_rate": 0.00029927770900082954, + "loss": 2.9288454055786133, + "mean_token_accuracy": 0.5950413346290588, + "num_tokens": 8704.0, + "step": 17 + }, + { + "entropy": 3.184690475463867, + "epoch": 0.28125, + "grad_norm": 2.769068479537964, + "learning_rate": 0.0002991846856101383, + "loss": 2.7941722869873047, + "mean_token_accuracy": 0.590436577796936, + "num_tokens": 9216.0, + "step": 18 + }, + { + "entropy": 3.0208375453948975, + "epoch": 0.296875, + "grad_norm": 2.609088182449341, + "learning_rate": 0.0002990860455003534, + "loss": 2.9833905696868896, + "mean_token_accuracy": 0.5841785073280334, + "num_tokens": 9728.0, + "step": 19 + }, + { + "entropy": 2.88010573387146, + "epoch": 0.3125, + "grad_norm": 2.748533248901367, + "learning_rate": 0.00029898179238521916, + "loss": 2.6276729106903076, + "mean_token_accuracy": 0.6270833611488342, + "num_tokens": 10240.0, + "step": 20 + }, + { + "entropy": 2.6639840602874756, + "epoch": 0.328125, + "grad_norm": 2.043412923812866, + "learning_rate": 0.0002988719301898065, + "loss": 2.456700325012207, + "mean_token_accuracy": 0.6457023024559021, + "num_tokens": 10752.0, + "step": 21 + }, + { + "entropy": 3.010587453842163, + "epoch": 0.34375, + "grad_norm": 4.867815017700195, + "learning_rate": 0.0002987564630503649, + "loss": 2.8242549896240234, + "mean_token_accuracy": 0.5879917144775391, + "num_tokens": 11264.0, + "step": 22 + }, + { + "entropy": 2.9374547004699707, + "epoch": 0.359375, + "grad_norm": 4.809256076812744, + "learning_rate": 0.000298635395314167, + "loss": 2.4628262519836426, + "mean_token_accuracy": 0.6552462577819824, + "num_tokens": 11776.0, + "step": 23 + }, + { + "entropy": 3.3613696098327637, + "epoch": 0.375, + "grad_norm": 3.2862000465393066, + "learning_rate": 0.00029850873153934457, + "loss": 3.3322877883911133, + "mean_token_accuracy": 0.5728346705436707, + "num_tokens": 12288.0, + "step": 24 + }, + { + "entropy": 3.3719234466552734, + "epoch": 0.390625, + "grad_norm": 2.0418403148651123, + "learning_rate": 0.00029837647649471715, + "loss": 2.932267427444458, + "mean_token_accuracy": 0.5895372033119202, + "num_tokens": 12800.0, + "step": 25 + }, + { + "entropy": 2.842104911804199, + "epoch": 0.40625, + "grad_norm": 2.131535291671753, + "learning_rate": 0.0002982386351596124, + "loss": 2.8979134559631348, + "mean_token_accuracy": 0.6000000238418579, + "num_tokens": 13312.0, + "step": 26 + }, + { + "entropy": 2.60341215133667, + "epoch": 0.421875, + "grad_norm": 1.886840581893921, + "learning_rate": 0.00029809521272367874, + "loss": 2.448031425476074, + "mean_token_accuracy": 0.6652360558509827, + "num_tokens": 13824.0, + "step": 27 + }, + { + "entropy": 2.783334732055664, + "epoch": 0.4375, + "grad_norm": 2.298892021179199, + "learning_rate": 0.0002979462145866898, + "loss": 2.8729963302612305, + "mean_token_accuracy": 0.6239495873451233, + "num_tokens": 14336.0, + "step": 28 + }, + { + "entropy": 3.2722766399383545, + "epoch": 0.453125, + "grad_norm": 1.7541515827178955, + "learning_rate": 0.00029779164635834114, + "loss": 2.9888410568237305, + "mean_token_accuracy": 0.5971943736076355, + "num_tokens": 14848.0, + "step": 29 + }, + { + "entropy": 3.3545026779174805, + "epoch": 0.46875, + "grad_norm": 1.9528993368148804, + "learning_rate": 0.0002976315138580393, + "loss": 2.4215219020843506, + "mean_token_accuracy": 0.6466809511184692, + "num_tokens": 15360.0, + "step": 30 + }, + { + "entropy": 3.3544554710388184, + "epoch": 0.484375, + "grad_norm": 2.7539281845092773, + "learning_rate": 0.00029746582311468244, + "loss": 2.8554189205169678, + "mean_token_accuracy": 0.6028806567192078, + "num_tokens": 15872.0, + "step": 31 + }, + { + "entropy": 2.996553421020508, + "epoch": 0.5, + "grad_norm": 2.825383424758911, + "learning_rate": 0.0002972945803664333, + "loss": 2.8956212997436523, + "mean_token_accuracy": 0.5841785073280334, + "num_tokens": 16384.0, + "step": 32 + }, + { + "entropy": 2.7225828170776367, + "epoch": 0.515625, + "grad_norm": 1.8512521982192993, + "learning_rate": 0.00029711779206048454, + "loss": 2.828862190246582, + "mean_token_accuracy": 0.586614191532135, + "num_tokens": 16896.0, + "step": 33 + }, + { + "entropy": 2.5095906257629395, + "epoch": 0.53125, + "grad_norm": 2.991373062133789, + "learning_rate": 0.00029693546485281603, + "loss": 2.879727363586426, + "mean_token_accuracy": 0.6134969592094421, + "num_tokens": 17408.0, + "step": 34 + }, + { + "entropy": 2.993544340133667, + "epoch": 0.546875, + "grad_norm": 2.760350465774536, + "learning_rate": 0.0002967476056079441, + "loss": 3.0702333450317383, + "mean_token_accuracy": 0.5669291615486145, + "num_tokens": 17920.0, + "step": 35 + }, + { + "entropy": 3.340437650680542, + "epoch": 0.5625, + "grad_norm": 3.0262949466705322, + "learning_rate": 0.0002965542213986631, + "loss": 3.17567777633667, + "mean_token_accuracy": 0.5472440719604492, + "num_tokens": 18432.0, + "step": 36 + }, + { + "entropy": 3.2044739723205566, + "epoch": 0.578125, + "grad_norm": 6.876502990722656, + "learning_rate": 0.00029635531950577925, + "loss": 2.78464674949646, + "mean_token_accuracy": 0.6036961078643799, + "num_tokens": 18944.0, + "step": 37 + }, + { + "entropy": 3.24843692779541, + "epoch": 0.59375, + "grad_norm": 5.545853137969971, + "learning_rate": 0.00029615090741783636, + "loss": 2.7331504821777344, + "mean_token_accuracy": 0.608433723449707, + "num_tokens": 19456.0, + "step": 38 + }, + { + "entropy": 3.1350715160369873, + "epoch": 0.609375, + "grad_norm": 3.3395559787750244, + "learning_rate": 0.000295940992830834, + "loss": 3.300499677658081, + "mean_token_accuracy": 0.5590550899505615, + "num_tokens": 19968.0, + "step": 39 + }, + { + "entropy": 3.0839738845825195, + "epoch": 0.625, + "grad_norm": 2.1904847621917725, + "learning_rate": 0.00029572558364793775, + "loss": 3.1389362812042236, + "mean_token_accuracy": 0.5748031735420227, + "num_tokens": 20480.0, + "step": 40 + }, + { + "entropy": 2.964693546295166, + "epoch": 0.640625, + "grad_norm": 1.7939658164978027, + "learning_rate": 0.0002955046879791816, + "loss": 3.1034960746765137, + "mean_token_accuracy": 0.5610235929489136, + "num_tokens": 20992.0, + "step": 41 + }, + { + "entropy": 2.911698818206787, + "epoch": 0.65625, + "grad_norm": 1.7506498098373413, + "learning_rate": 0.0002952783141411626, + "loss": 2.3598453998565674, + "mean_token_accuracy": 0.6395833492279053, + "num_tokens": 21504.0, + "step": 42 + }, + { + "entropy": 2.961909770965576, + "epoch": 0.671875, + "grad_norm": 3.6559484004974365, + "learning_rate": 0.00029504647065672776, + "loss": 3.004157304763794, + "mean_token_accuracy": 0.5748031735420227, + "num_tokens": 22016.0, + "step": 43 + }, + { + "entropy": 2.7874066829681396, + "epoch": 0.6875, + "grad_norm": 3.728947162628174, + "learning_rate": 0.00029480916625465337, + "loss": 2.615809440612793, + "mean_token_accuracy": 0.6064257025718689, + "num_tokens": 22528.0, + "step": 44 + }, + { + "entropy": 2.7361326217651367, + "epoch": 0.703125, + "grad_norm": 6.112745761871338, + "learning_rate": 0.00029456640986931596, + "loss": 2.6459853649139404, + "mean_token_accuracy": 0.6361746191978455, + "num_tokens": 23040.0, + "step": 45 + }, + { + "entropy": 2.6819934844970703, + "epoch": 0.71875, + "grad_norm": 5.611839294433594, + "learning_rate": 0.0002943182106403562, + "loss": 2.431462287902832, + "mean_token_accuracy": 0.6595744490623474, + "num_tokens": 23552.0, + "step": 46 + }, + { + "entropy": 2.5808193683624268, + "epoch": 0.734375, + "grad_norm": 3.8999392986297607, + "learning_rate": 0.0002940645779123348, + "loss": 2.77005934715271, + "mean_token_accuracy": 0.6206896305084229, + "num_tokens": 24064.0, + "step": 47 + }, + { + "entropy": 2.98895001411438, + "epoch": 0.75, + "grad_norm": 2.57936954498291, + "learning_rate": 0.0002938055212343807, + "loss": 2.8034496307373047, + "mean_token_accuracy": 0.6020202040672302, + "num_tokens": 24576.0, + "step": 48 + }, + { + "entropy": 2.952457904815674, + "epoch": 0.765625, + "grad_norm": 5.904950141906738, + "learning_rate": 0.0002935410503598313, + "loss": 3.1076672077178955, + "mean_token_accuracy": 0.5944882035255432, + "num_tokens": 25088.0, + "step": 49 + }, + { + "entropy": 2.9851343631744385, + "epoch": 0.78125, + "grad_norm": 8.890862464904785, + "learning_rate": 0.0002932711752458656, + "loss": 2.2850301265716553, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 25600.0, + "step": 50 + }, + { + "entropy": 3.0929861068725586, + "epoch": 0.796875, + "grad_norm": 6.197035789489746, + "learning_rate": 0.00029299590605312906, + "loss": 2.604055166244507, + "mean_token_accuracy": 0.6183673739433289, + "num_tokens": 26112.0, + "step": 51 + }, + { + "entropy": 2.9722883701324463, + "epoch": 0.8125, + "grad_norm": 7.0517473220825195, + "learning_rate": 0.00029271525314535123, + "loss": 2.4527275562286377, + "mean_token_accuracy": 0.6301652789115906, + "num_tokens": 26624.0, + "step": 52 + }, + { + "entropy": 2.6360151767730713, + "epoch": 0.828125, + "grad_norm": 9.385331153869629, + "learning_rate": 0.00029242922708895547, + "loss": 2.614161729812622, + "mean_token_accuracy": 0.6169354915618896, + "num_tokens": 27136.0, + "step": 53 + }, + { + "entropy": 2.6233577728271484, + "epoch": 0.84375, + "grad_norm": 9.205692291259766, + "learning_rate": 0.00029213783865266114, + "loss": 2.693942070007324, + "mean_token_accuracy": 0.6159999966621399, + "num_tokens": 27648.0, + "step": 54 + }, + { + "entropy": 2.5832924842834473, + "epoch": 0.859375, + "grad_norm": 4.841373920440674, + "learning_rate": 0.0002918410988070782, + "loss": 2.87924861907959, + "mean_token_accuracy": 0.5991984009742737, + "num_tokens": 28160.0, + "step": 55 + }, + { + "entropy": 2.815551280975342, + "epoch": 0.875, + "grad_norm": 5.897908687591553, + "learning_rate": 0.00029153901872429404, + "loss": 2.737504720687866, + "mean_token_accuracy": 0.5767716765403748, + "num_tokens": 28672.0, + "step": 56 + }, + { + "entropy": 2.7117974758148193, + "epoch": 0.890625, + "grad_norm": 1.9384305477142334, + "learning_rate": 0.00029123160977745306, + "loss": 2.689833164215088, + "mean_token_accuracy": 0.6227180361747742, + "num_tokens": 29184.0, + "step": 57 + }, + { + "entropy": 3.006451368331909, + "epoch": 0.90625, + "grad_norm": 11.659981727600098, + "learning_rate": 0.00029091888354032845, + "loss": 2.6457479000091553, + "mean_token_accuracy": 0.6306122541427612, + "num_tokens": 29696.0, + "step": 58 + }, + { + "entropy": 3.347648859024048, + "epoch": 0.921875, + "grad_norm": 13.3857421875, + "learning_rate": 0.0002906008517868862, + "loss": 3.0881667137145996, + "mean_token_accuracy": 0.5826771855354309, + "num_tokens": 30208.0, + "step": 59 + }, + { + "entropy": 3.1437995433807373, + "epoch": 0.9375, + "grad_norm": 11.784278869628906, + "learning_rate": 0.0002902775264908421, + "loss": 2.768523931503296, + "mean_token_accuracy": 0.6226804256439209, + "num_tokens": 30720.0, + "step": 60 + }, + { + "entropy": 3.3461129665374756, + "epoch": 0.953125, + "grad_norm": 14.32170295715332, + "learning_rate": 0.0002899489198252108, + "loss": 2.813835859298706, + "mean_token_accuracy": 0.608961284160614, + "num_tokens": 31232.0, + "step": 61 + }, + { + "entropy": 3.2309908866882324, + "epoch": 0.96875, + "grad_norm": 10.945052146911621, + "learning_rate": 0.00028961504416184753, + "loss": 2.9945812225341797, + "mean_token_accuracy": 0.586614191532135, + "num_tokens": 31744.0, + "step": 62 + }, + { + "entropy": 3.0333917140960693, + "epoch": 0.984375, + "grad_norm": 10.011848449707031, + "learning_rate": 0.00028927591207098235, + "loss": 2.868558645248413, + "mean_token_accuracy": 0.5984252095222473, + "num_tokens": 32256.0, + "step": 63 + }, + { + "entropy": 2.871372938156128, + "epoch": 1.0, + "grad_norm": 3.1529769897460938, + "learning_rate": 0.0002889315363207467, + "loss": 2.403618335723877, + "mean_token_accuracy": 0.6575630307197571, + "num_tokens": 32768.0, + "step": 64 + }, + { + "entropy": 2.411283016204834, + "epoch": 1.015625, + "grad_norm": 5.358489513397217, + "learning_rate": 0.000288581929876693, + "loss": 2.1208901405334473, + "mean_token_accuracy": 0.6616702079772949, + "num_tokens": 33280.0, + "step": 65 + }, + { + "entropy": 2.3217580318450928, + "epoch": 1.03125, + "grad_norm": 2.0556492805480957, + "learning_rate": 0.0002882271059013063, + "loss": 2.562404155731201, + "mean_token_accuracy": 0.6129032373428345, + "num_tokens": 33792.0, + "step": 66 + }, + { + "entropy": 2.6636924743652344, + "epoch": 1.046875, + "grad_norm": 13.256611824035645, + "learning_rate": 0.0002878670777535087, + "loss": 2.492720127105713, + "mean_token_accuracy": 0.585170328617096, + "num_tokens": 34304.0, + "step": 67 + }, + { + "entropy": 2.7850069999694824, + "epoch": 1.0625, + "grad_norm": 18.333816528320312, + "learning_rate": 0.0002875018589881564, + "loss": 2.679954767227173, + "mean_token_accuracy": 0.573122501373291, + "num_tokens": 34816.0, + "step": 68 + }, + { + "entropy": 2.7594878673553467, + "epoch": 1.078125, + "grad_norm": 15.908726692199707, + "learning_rate": 0.0002871314633555296, + "loss": 2.299175262451172, + "mean_token_accuracy": 0.6004056930541992, + "num_tokens": 35328.0, + "step": 69 + }, + { + "entropy": 3.0297257900238037, + "epoch": 1.09375, + "grad_norm": 9.716331481933594, + "learning_rate": 0.0002867559048008145, + "loss": 2.834432363510132, + "mean_token_accuracy": 0.5748031735420227, + "num_tokens": 35840.0, + "step": 70 + }, + { + "entropy": 3.0106217861175537, + "epoch": 1.109375, + "grad_norm": 6.021564960479736, + "learning_rate": 0.0002863751974635783, + "loss": 2.832149028778076, + "mean_token_accuracy": 0.5708661675453186, + "num_tokens": 36352.0, + "step": 71 + }, + { + "entropy": 2.9219532012939453, + "epoch": 1.125, + "grad_norm": 9.39206314086914, + "learning_rate": 0.0002859893556772373, + "loss": 2.65635085105896, + "mean_token_accuracy": 0.5787401795387268, + "num_tokens": 36864.0, + "step": 72 + }, + { + "entropy": 2.6846885681152344, + "epoch": 1.140625, + "grad_norm": 10.221190452575684, + "learning_rate": 0.0002855983939685165, + "loss": 2.652174949645996, + "mean_token_accuracy": 0.6043307185173035, + "num_tokens": 37376.0, + "step": 73 + }, + { + "entropy": 2.483905792236328, + "epoch": 1.15625, + "grad_norm": 8.940083503723145, + "learning_rate": 0.0002852023270569033, + "loss": 2.270918369293213, + "mean_token_accuracy": 0.6493775844573975, + "num_tokens": 37888.0, + "step": 74 + }, + { + "entropy": 2.4425692558288574, + "epoch": 1.171875, + "grad_norm": 5.5992512702941895, + "learning_rate": 0.00028480116985409303, + "loss": 1.9062901735305786, + "mean_token_accuracy": 0.6832971572875977, + "num_tokens": 38400.0, + "step": 75 + }, + { + "entropy": 2.611058235168457, + "epoch": 1.1875, + "grad_norm": 4.461435317993164, + "learning_rate": 0.00028439493746342773, + "loss": 2.5850863456726074, + "mean_token_accuracy": 0.5925197005271912, + "num_tokens": 38912.0, + "step": 76 + }, + { + "entropy": 2.6180343627929688, + "epoch": 1.203125, + "grad_norm": 8.797931671142578, + "learning_rate": 0.00028398364517932725, + "loss": 2.809943437576294, + "mean_token_accuracy": 0.5925197005271912, + "num_tokens": 39424.0, + "step": 77 + }, + { + "entropy": 2.826270341873169, + "epoch": 1.21875, + "grad_norm": 6.125537872314453, + "learning_rate": 0.0002835673084867137, + "loss": 2.7826144695281982, + "mean_token_accuracy": 0.5708661675453186, + "num_tokens": 39936.0, + "step": 78 + }, + { + "entropy": 2.6673154830932617, + "epoch": 1.234375, + "grad_norm": 3.9816977977752686, + "learning_rate": 0.0002831459430604281, + "loss": 2.5085325241088867, + "mean_token_accuracy": 0.6043307185173035, + "num_tokens": 40448.0, + "step": 79 + }, + { + "entropy": 2.777114152908325, + "epoch": 1.25, + "grad_norm": 4.296886444091797, + "learning_rate": 0.00028271956476464067, + "loss": 2.4793484210968018, + "mean_token_accuracy": 0.6035503149032593, + "num_tokens": 40960.0, + "step": 80 + }, + { + "entropy": 2.671078681945801, + "epoch": 1.265625, + "grad_norm": 3.269895315170288, + "learning_rate": 0.0002822881896522532, + "loss": 2.2512905597686768, + "mean_token_accuracy": 0.6570248007774353, + "num_tokens": 41472.0, + "step": 81 + }, + { + "entropy": 2.6435415744781494, + "epoch": 1.28125, + "grad_norm": 2.195950746536255, + "learning_rate": 0.000281851833964295, + "loss": 2.6383659839630127, + "mean_token_accuracy": 0.5826771855354309, + "num_tokens": 41984.0, + "step": 82 + }, + { + "entropy": 2.531452178955078, + "epoch": 1.296875, + "grad_norm": 11.792201042175293, + "learning_rate": 0.00028141051412931096, + "loss": 2.146348237991333, + "mean_token_accuracy": 0.6505263447761536, + "num_tokens": 42496.0, + "step": 83 + }, + { + "entropy": 2.572371006011963, + "epoch": 1.3125, + "grad_norm": 8.870929718017578, + "learning_rate": 0.00028096424676274346, + "loss": 2.3176331520080566, + "mean_token_accuracy": 0.6183673739433289, + "num_tokens": 43008.0, + "step": 84 + }, + { + "entropy": 2.348723888397217, + "epoch": 1.328125, + "grad_norm": 14.476244926452637, + "learning_rate": 0.0002805130486663067, + "loss": 2.4164490699768066, + "mean_token_accuracy": 0.6023622155189514, + "num_tokens": 43520.0, + "step": 85 + }, + { + "entropy": 2.422470808029175, + "epoch": 1.34375, + "grad_norm": 14.661328315734863, + "learning_rate": 0.00028005693682735385, + "loss": 2.112380027770996, + "mean_token_accuracy": 0.6382536292076111, + "num_tokens": 44032.0, + "step": 86 + }, + { + "entropy": 2.3329508304595947, + "epoch": 1.359375, + "grad_norm": 2.964010000228882, + "learning_rate": 0.000279595928418238, + "loss": 2.1712806224823, + "mean_token_accuracy": 0.6480331420898438, + "num_tokens": 44544.0, + "step": 87 + }, + { + "entropy": 2.69111967086792, + "epoch": 1.375, + "grad_norm": 10.743988990783691, + "learning_rate": 0.00027913004079566507, + "loss": 2.5868759155273438, + "mean_token_accuracy": 0.586614191532135, + "num_tokens": 45056.0, + "step": 88 + }, + { + "entropy": 2.4709086418151855, + "epoch": 1.390625, + "grad_norm": 11.36965560913086, + "learning_rate": 0.0002786592915000408, + "loss": 2.68729829788208, + "mean_token_accuracy": 0.6062992215156555, + "num_tokens": 45568.0, + "step": 89 + }, + { + "entropy": 2.602285385131836, + "epoch": 1.40625, + "grad_norm": 8.707329750061035, + "learning_rate": 0.00027818369825481, + "loss": 2.1857831478118896, + "mean_token_accuracy": 0.6625514626502991, + "num_tokens": 46080.0, + "step": 90 + }, + { + "entropy": 2.6694841384887695, + "epoch": 1.421875, + "grad_norm": 3.59173846244812, + "learning_rate": 0.0002777032789657898, + "loss": 2.162902355194092, + "mean_token_accuracy": 0.6695095896720886, + "num_tokens": 46592.0, + "step": 91 + }, + { + "entropy": 2.4398598670959473, + "epoch": 1.4375, + "grad_norm": 2.533829689025879, + "learning_rate": 0.00027721805172049456, + "loss": 2.2611405849456787, + "mean_token_accuracy": 0.6639510989189148, + "num_tokens": 47104.0, + "step": 92 + }, + { + "entropy": 2.4955785274505615, + "epoch": 1.453125, + "grad_norm": 6.045874118804932, + "learning_rate": 0.000276728034787456, + "loss": 2.6435904502868652, + "mean_token_accuracy": 0.5964567065238953, + "num_tokens": 47616.0, + "step": 93 + }, + { + "entropy": 2.6504571437835693, + "epoch": 1.46875, + "grad_norm": 7.782134532928467, + "learning_rate": 0.00027623324661553477, + "loss": 2.2059261798858643, + "mean_token_accuracy": 0.6631799340248108, + "num_tokens": 48128.0, + "step": 94 + }, + { + "entropy": 2.8098762035369873, + "epoch": 1.484375, + "grad_norm": 4.1661763191223145, + "learning_rate": 0.00027573370583322565, + "loss": 2.0274105072021484, + "mean_token_accuracy": 0.6802574992179871, + "num_tokens": 48640.0, + "step": 95 + }, + { + "entropy": 2.6600871086120605, + "epoch": 1.5, + "grad_norm": 5.105381488800049, + "learning_rate": 0.000275229431247957, + "loss": 2.1700079441070557, + "mean_token_accuracy": 0.6549586653709412, + "num_tokens": 49152.0, + "step": 96 + }, + { + "entropy": 2.47259783744812, + "epoch": 1.515625, + "grad_norm": 3.5480713844299316, + "learning_rate": 0.0002747204418453818, + "loss": 2.4075517654418945, + "mean_token_accuracy": 0.6232464909553528, + "num_tokens": 49664.0, + "step": 97 + }, + { + "entropy": 2.4480819702148438, + "epoch": 1.53125, + "grad_norm": 4.602890968322754, + "learning_rate": 0.00027420675678866335, + "loss": 2.44067120552063, + "mean_token_accuracy": 0.6198019981384277, + "num_tokens": 50176.0, + "step": 98 + }, + { + "entropy": 2.316689968109131, + "epoch": 1.546875, + "grad_norm": 3.827681303024292, + "learning_rate": 0.0002736883954177538, + "loss": 2.0741045475006104, + "mean_token_accuracy": 0.676171064376831, + "num_tokens": 50688.0, + "step": 99 + }, + { + "entropy": 2.5585527420043945, + "epoch": 1.5625, + "grad_norm": 3.0183746814727783, + "learning_rate": 0.00027316537724866565, + "loss": 2.4706835746765137, + "mean_token_accuracy": 0.6270161271095276, + "num_tokens": 51200.0, + "step": 100 + }, + { + "entropy": 2.5568273067474365, + "epoch": 1.578125, + "grad_norm": 5.168515682220459, + "learning_rate": 0.0002726377219727375, + "loss": 2.2053163051605225, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 51712.0, + "step": 101 + }, + { + "entropy": 2.6455063819885254, + "epoch": 1.59375, + "grad_norm": 2.874056100845337, + "learning_rate": 0.00027210544945589223, + "loss": 2.0263545513153076, + "mean_token_accuracy": 0.6826722621917725, + "num_tokens": 52224.0, + "step": 102 + }, + { + "entropy": 2.610264301300049, + "epoch": 1.609375, + "grad_norm": 3.7625222206115723, + "learning_rate": 0.0002715685797378892, + "loss": 2.221018075942993, + "mean_token_accuracy": 0.6285714507102966, + "num_tokens": 52736.0, + "step": 103 + }, + { + "entropy": 2.410818576812744, + "epoch": 1.625, + "grad_norm": 2.1665024757385254, + "learning_rate": 0.0002710271330315699, + "loss": 2.3151586055755615, + "mean_token_accuracy": 0.6328600645065308, + "num_tokens": 53248.0, + "step": 104 + }, + { + "entropy": 2.333315372467041, + "epoch": 1.640625, + "grad_norm": 4.645427227020264, + "learning_rate": 0.0002704811297220967, + "loss": 2.478018045425415, + "mean_token_accuracy": 0.5944882035255432, + "num_tokens": 53760.0, + "step": 105 + }, + { + "entropy": 2.4208340644836426, + "epoch": 1.65625, + "grad_norm": 7.0705952644348145, + "learning_rate": 0.0002699305903661857, + "loss": 1.7686328887939453, + "mean_token_accuracy": 0.7170626521110535, + "num_tokens": 54272.0, + "step": 106 + }, + { + "entropy": 2.5010931491851807, + "epoch": 1.671875, + "grad_norm": 2.786400556564331, + "learning_rate": 0.0002693755356913325, + "loss": 2.338402509689331, + "mean_token_accuracy": 0.613545835018158, + "num_tokens": 54784.0, + "step": 107 + }, + { + "entropy": 2.2968597412109375, + "epoch": 1.6875, + "grad_norm": 3.2380192279815674, + "learning_rate": 0.00026881598659503185, + "loss": 1.6511207818984985, + "mean_token_accuracy": 0.7251082062721252, + "num_tokens": 55296.0, + "step": 108 + }, + { + "entropy": 2.4909420013427734, + "epoch": 1.703125, + "grad_norm": 2.494285821914673, + "learning_rate": 0.00026825196414399094, + "loss": 2.376694440841675, + "mean_token_accuracy": 0.6358267664909363, + "num_tokens": 55808.0, + "step": 109 + }, + { + "entropy": 2.5020644664764404, + "epoch": 1.71875, + "grad_norm": 4.628073692321777, + "learning_rate": 0.00026768348957333625, + "loss": 1.9303858280181885, + "mean_token_accuracy": 0.6993603706359863, + "num_tokens": 56320.0, + "step": 110 + }, + { + "entropy": 2.327064275741577, + "epoch": 1.734375, + "grad_norm": 3.5953564643859863, + "learning_rate": 0.00026711058428581416, + "loss": 2.2964227199554443, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 56832.0, + "step": 111 + }, + { + "entropy": 2.2144789695739746, + "epoch": 1.75, + "grad_norm": 17.284832000732422, + "learning_rate": 0.0002665332698509848, + "loss": 2.381066083908081, + "mean_token_accuracy": 0.6338582634925842, + "num_tokens": 57344.0, + "step": 112 + }, + { + "entropy": 2.454603672027588, + "epoch": 1.765625, + "grad_norm": 18.100194931030273, + "learning_rate": 0.0002659515680044105, + "loss": 2.4117445945739746, + "mean_token_accuracy": 0.6487026214599609, + "num_tokens": 57856.0, + "step": 113 + }, + { + "entropy": 2.4259469509124756, + "epoch": 1.78125, + "grad_norm": 5.293238639831543, + "learning_rate": 0.00026536550064683697, + "loss": 2.249678373336792, + "mean_token_accuracy": 0.6659836173057556, + "num_tokens": 58368.0, + "step": 114 + }, + { + "entropy": 2.419496536254883, + "epoch": 1.796875, + "grad_norm": 3.008138418197632, + "learning_rate": 0.0002647750898433688, + "loss": 1.8539735078811646, + "mean_token_accuracy": 0.7130434513092041, + "num_tokens": 58880.0, + "step": 115 + }, + { + "entropy": 2.444873809814453, + "epoch": 1.8125, + "grad_norm": 5.0132060050964355, + "learning_rate": 0.00026418035782263923, + "loss": 1.8831861019134521, + "mean_token_accuracy": 0.6965811848640442, + "num_tokens": 59392.0, + "step": 116 + }, + { + "entropy": 2.5142552852630615, + "epoch": 1.828125, + "grad_norm": 3.7436330318450928, + "learning_rate": 0.00026358132697597265, + "loss": 2.437363624572754, + "mean_token_accuracy": 0.6220472455024719, + "num_tokens": 59904.0, + "step": 117 + }, + { + "entropy": 2.6384663581848145, + "epoch": 1.84375, + "grad_norm": 3.390326976776123, + "learning_rate": 0.00026297801985654184, + "loss": 2.490990161895752, + "mean_token_accuracy": 0.6102362275123596, + "num_tokens": 60416.0, + "step": 118 + }, + { + "entropy": 2.204195261001587, + "epoch": 1.859375, + "grad_norm": 2.7175426483154297, + "learning_rate": 0.0002623704591785189, + "loss": 2.220968246459961, + "mean_token_accuracy": 0.6434262990951538, + "num_tokens": 60928.0, + "step": 119 + }, + { + "entropy": 2.1468725204467773, + "epoch": 1.875, + "grad_norm": 14.231541633605957, + "learning_rate": 0.0002617586678162199, + "loss": 2.3230385780334473, + "mean_token_accuracy": 0.6220472455024719, + "num_tokens": 61440.0, + "step": 120 + }, + { + "entropy": 2.3221914768218994, + "epoch": 1.890625, + "grad_norm": 19.32585906982422, + "learning_rate": 0.00026114266880324387, + "loss": 2.1758298873901367, + "mean_token_accuracy": 0.6481481194496155, + "num_tokens": 61952.0, + "step": 121 + }, + { + "entropy": 2.2526302337646484, + "epoch": 1.90625, + "grad_norm": 9.51953411102295, + "learning_rate": 0.0002605224853316054, + "loss": 2.22743558883667, + "mean_token_accuracy": 0.6179999709129333, + "num_tokens": 62464.0, + "step": 122 + }, + { + "entropy": 2.501652956008911, + "epoch": 1.921875, + "grad_norm": 11.819076538085938, + "learning_rate": 0.00025989814075086186, + "loss": 2.346043825149536, + "mean_token_accuracy": 0.6244897842407227, + "num_tokens": 62976.0, + "step": 123 + }, + { + "entropy": 2.394810676574707, + "epoch": 1.9375, + "grad_norm": 18.02789306640625, + "learning_rate": 0.00025926965856723375, + "loss": 2.325348138809204, + "mean_token_accuracy": 0.6220472455024719, + "num_tokens": 63488.0, + "step": 124 + }, + { + "entropy": 2.577543020248413, + "epoch": 1.953125, + "grad_norm": 27.88809585571289, + "learning_rate": 0.00025863706244272003, + "loss": 2.1486191749572754, + "mean_token_accuracy": 0.6508264541625977, + "num_tokens": 64000.0, + "step": 125 + }, + { + "entropy": 2.4816625118255615, + "epoch": 1.96875, + "grad_norm": 9.078181266784668, + "learning_rate": 0.0002580003761942072, + "loss": 2.2877392768859863, + "mean_token_accuracy": 0.6141732335090637, + "num_tokens": 64512.0, + "step": 126 + }, + { + "entropy": 2.5372767448425293, + "epoch": 1.984375, + "grad_norm": 5.26969575881958, + "learning_rate": 0.00025735962379257274, + "loss": 2.0592477321624756, + "mean_token_accuracy": 0.6659750938415527, + "num_tokens": 65024.0, + "step": 127 + }, + { + "entropy": 2.397751808166504, + "epoch": 2.0, + "grad_norm": 15.707524299621582, + "learning_rate": 0.00025671482936178244, + "loss": 2.44035267829895, + "mean_token_accuracy": 0.625984251499176, + "num_tokens": 65536.0, + "step": 128 + }, + { + "entropy": 2.3219223022460938, + "epoch": 2.015625, + "grad_norm": 14.231306076049805, + "learning_rate": 0.00025606601717798207, + "loss": 2.2578790187835693, + "mean_token_accuracy": 0.6288032531738281, + "num_tokens": 66048.0, + "step": 129 + }, + { + "entropy": 2.1380038261413574, + "epoch": 2.03125, + "grad_norm": 11.717175483703613, + "learning_rate": 0.00025541321166858377, + "loss": 2.0398218631744385, + "mean_token_accuracy": 0.6378269791603088, + "num_tokens": 66560.0, + "step": 130 + }, + { + "entropy": 2.236997127532959, + "epoch": 2.046875, + "grad_norm": 4.696479797363281, + "learning_rate": 0.00025475643741134594, + "loss": 1.7749477624893188, + "mean_token_accuracy": 0.704016923904419, + "num_tokens": 67072.0, + "step": 131 + }, + { + "entropy": 2.2647347450256348, + "epoch": 2.0625, + "grad_norm": 6.612071990966797, + "learning_rate": 0.0002540957191334481, + "loss": 2.0588979721069336, + "mean_token_accuracy": 0.650306761264801, + "num_tokens": 67584.0, + "step": 132 + }, + { + "entropy": 2.4617621898651123, + "epoch": 2.078125, + "grad_norm": 3.019199848175049, + "learning_rate": 0.00025343108171056, + "loss": 1.6648919582366943, + "mean_token_accuracy": 0.7114967703819275, + "num_tokens": 68096.0, + "step": 133 + }, + { + "entropy": 2.4026403427124023, + "epoch": 2.09375, + "grad_norm": 2.1030118465423584, + "learning_rate": 0.00025276255016590504, + "loss": 2.198902130126953, + "mean_token_accuracy": 0.6279527544975281, + "num_tokens": 68608.0, + "step": 134 + }, + { + "entropy": 2.264892101287842, + "epoch": 2.109375, + "grad_norm": 7.839612007141113, + "learning_rate": 0.0002520901496693179, + "loss": 2.0725574493408203, + "mean_token_accuracy": 0.6606060862541199, + "num_tokens": 69120.0, + "step": 135 + }, + { + "entropy": 2.173431873321533, + "epoch": 2.125, + "grad_norm": 2.4342360496520996, + "learning_rate": 0.00025141390553629734, + "loss": 2.201805353164673, + "mean_token_accuracy": 0.624015748500824, + "num_tokens": 69632.0, + "step": 136 + }, + { + "entropy": 2.3075344562530518, + "epoch": 2.140625, + "grad_norm": 3.552382469177246, + "learning_rate": 0.00025073384322705274, + "loss": 2.0459301471710205, + "mean_token_accuracy": 0.6570841670036316, + "num_tokens": 70144.0, + "step": 137 + }, + { + "entropy": 2.276052951812744, + "epoch": 2.15625, + "grad_norm": 2.7191050052642822, + "learning_rate": 0.0002500499883455456, + "loss": 1.8903124332427979, + "mean_token_accuracy": 0.6797520518302917, + "num_tokens": 70656.0, + "step": 138 + }, + { + "entropy": 2.4734106063842773, + "epoch": 2.171875, + "grad_norm": 3.9799861907958984, + "learning_rate": 0.00024936236663852573, + "loss": 2.1707847118377686, + "mean_token_accuracy": 0.6437007784843445, + "num_tokens": 71168.0, + "step": 139 + }, + { + "entropy": 2.1881818771362305, + "epoch": 2.1875, + "grad_norm": 3.292069435119629, + "learning_rate": 0.0002486710039945618, + "loss": 1.7409114837646484, + "mean_token_accuracy": 0.6932772994041443, + "num_tokens": 71680.0, + "step": 140 + }, + { + "entropy": 2.099501371383667, + "epoch": 2.203125, + "grad_norm": 8.15732479095459, + "learning_rate": 0.00024797592644306646, + "loss": 2.028435707092285, + "mean_token_accuracy": 0.6646586060523987, + "num_tokens": 72192.0, + "step": 141 + }, + { + "entropy": 1.8974528312683105, + "epoch": 2.21875, + "grad_norm": 4.572197914123535, + "learning_rate": 0.00024727716015331683, + "loss": 2.1486008167266846, + "mean_token_accuracy": 0.6338582634925842, + "num_tokens": 72704.0, + "step": 142 + }, + { + "entropy": 2.18802547454834, + "epoch": 2.234375, + "grad_norm": 11.048628807067871, + "learning_rate": 0.0002465747314334687, + "loss": 2.185668468475342, + "mean_token_accuracy": 0.6448979377746582, + "num_tokens": 73216.0, + "step": 143 + }, + { + "entropy": 2.1774299144744873, + "epoch": 2.25, + "grad_norm": 19.580671310424805, + "learning_rate": 0.00024586866672956636, + "loss": 2.223238229751587, + "mean_token_accuracy": 0.6318897604942322, + "num_tokens": 73728.0, + "step": 144 + }, + { + "entropy": 2.1214840412139893, + "epoch": 2.265625, + "grad_norm": 3.102067232131958, + "learning_rate": 0.0002451589926245468, + "loss": 1.4063600301742554, + "mean_token_accuracy": 0.7455357313156128, + "num_tokens": 74240.0, + "step": 145 + }, + { + "entropy": 2.288113832473755, + "epoch": 2.28125, + "grad_norm": 2.9369845390319824, + "learning_rate": 0.00024444573583723905, + "loss": 1.8721083402633667, + "mean_token_accuracy": 0.6897274851799011, + "num_tokens": 74752.0, + "step": 146 + }, + { + "entropy": 2.1388370990753174, + "epoch": 2.296875, + "grad_norm": 4.0269880294799805, + "learning_rate": 0.00024372892322135792, + "loss": 2.0353121757507324, + "mean_token_accuracy": 0.658777117729187, + "num_tokens": 75264.0, + "step": 147 + }, + { + "entropy": 2.038548231124878, + "epoch": 2.3125, + "grad_norm": 5.226291179656982, + "learning_rate": 0.00024300858176449337, + "loss": 1.9185303449630737, + "mean_token_accuracy": 0.6618556976318359, + "num_tokens": 75776.0, + "step": 148 + }, + { + "entropy": 1.952240228652954, + "epoch": 2.328125, + "grad_norm": 4.657230377197266, + "learning_rate": 0.000242284738587094, + "loss": 1.8080577850341797, + "mean_token_accuracy": 0.6946721076965332, + "num_tokens": 76288.0, + "step": 149 + }, + { + "entropy": 2.0790510177612305, + "epoch": 2.34375, + "grad_norm": 3.602454900741577, + "learning_rate": 0.0002415574209414464, + "loss": 1.8954757452011108, + "mean_token_accuracy": 0.6814516186714172, + "num_tokens": 76800.0, + "step": 150 + }, + { + "entropy": 1.993764877319336, + "epoch": 2.359375, + "grad_norm": 3.6864471435546875, + "learning_rate": 0.00024082665621064884, + "loss": 1.8048733472824097, + "mean_token_accuracy": 0.676706850528717, + "num_tokens": 77312.0, + "step": 151 + }, + { + "entropy": 2.1891403198242188, + "epoch": 2.375, + "grad_norm": 11.02034854888916, + "learning_rate": 0.00024009247190758033, + "loss": 2.1008198261260986, + "mean_token_accuracy": 0.6417322754859924, + "num_tokens": 77824.0, + "step": 152 + }, + { + "entropy": 2.02280855178833, + "epoch": 2.390625, + "grad_norm": 10.282227516174316, + "learning_rate": 0.000239354895673865, + "loss": 1.854500412940979, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 78336.0, + "step": 153 + }, + { + "entropy": 1.9524571895599365, + "epoch": 2.40625, + "grad_norm": 4.118484973907471, + "learning_rate": 0.00023861395527883115, + "loss": 1.7170909643173218, + "mean_token_accuracy": 0.695652186870575, + "num_tokens": 78848.0, + "step": 154 + }, + { + "entropy": 1.9783227443695068, + "epoch": 2.421875, + "grad_norm": 2.67722487449646, + "learning_rate": 0.00023786967861846582, + "loss": 1.60783851146698, + "mean_token_accuracy": 0.7184873819351196, + "num_tokens": 79360.0, + "step": 155 + }, + { + "entropy": 2.2170073986053467, + "epoch": 2.4375, + "grad_norm": 8.424147605895996, + "learning_rate": 0.00023712209371436465, + "loss": 1.2314633131027222, + "mean_token_accuracy": 0.7594654560089111, + "num_tokens": 79872.0, + "step": 156 + }, + { + "entropy": 2.0562796592712402, + "epoch": 2.453125, + "grad_norm": 9.240602493286133, + "learning_rate": 0.00023637122871267679, + "loss": 1.9202128648757935, + "mean_token_accuracy": 0.6593625545501709, + "num_tokens": 80384.0, + "step": 157 + }, + { + "entropy": 2.150996685028076, + "epoch": 2.46875, + "grad_norm": 4.457634925842285, + "learning_rate": 0.0002356171118830451, + "loss": 2.1217310428619385, + "mean_token_accuracy": 0.6535432934761047, + "num_tokens": 80896.0, + "step": 158 + }, + { + "entropy": 2.337796211242676, + "epoch": 2.484375, + "grad_norm": 9.230558395385742, + "learning_rate": 0.00023485977161754194, + "loss": 1.9099302291870117, + "mean_token_accuracy": 0.6680412292480469, + "num_tokens": 81408.0, + "step": 159 + }, + { + "entropy": 2.247264862060547, + "epoch": 2.5, + "grad_norm": 7.798185348510742, + "learning_rate": 0.0002340992364296004, + "loss": 1.8917332887649536, + "mean_token_accuracy": 0.6985743641853333, + "num_tokens": 81920.0, + "step": 160 + }, + { + "entropy": 2.262213706970215, + "epoch": 2.515625, + "grad_norm": 2.7314772605895996, + "learning_rate": 0.0002333355349529403, + "loss": 1.941816806793213, + "mean_token_accuracy": 0.6734279990196228, + "num_tokens": 82432.0, + "step": 161 + }, + { + "entropy": 2.2101516723632812, + "epoch": 2.53125, + "grad_norm": 6.667520999908447, + "learning_rate": 0.0002325686959404907, + "loss": 1.9418377876281738, + "mean_token_accuracy": 0.6827309131622314, + "num_tokens": 82944.0, + "step": 162 + }, + { + "entropy": 2.1902859210968018, + "epoch": 2.546875, + "grad_norm": 3.1942148208618164, + "learning_rate": 0.00023179874826330694, + "loss": 2.1716907024383545, + "mean_token_accuracy": 0.663385808467865, + "num_tokens": 83456.0, + "step": 163 + }, + { + "entropy": 2.114820957183838, + "epoch": 2.5625, + "grad_norm": 9.429683685302734, + "learning_rate": 0.00023102572090948393, + "loss": 2.1209301948547363, + "mean_token_accuracy": 0.663385808467865, + "num_tokens": 83968.0, + "step": 164 + }, + { + "entropy": 2.3089890480041504, + "epoch": 2.578125, + "grad_norm": 3.248178005218506, + "learning_rate": 0.00023024964298306458, + "loss": 2.042099714279175, + "mean_token_accuracy": 0.6760563254356384, + "num_tokens": 84480.0, + "step": 165 + }, + { + "entropy": 2.2706456184387207, + "epoch": 2.59375, + "grad_norm": 7.227741718292236, + "learning_rate": 0.00022947054370294422, + "loss": 1.613346815109253, + "mean_token_accuracy": 0.7441860437393188, + "num_tokens": 84992.0, + "step": 166 + }, + { + "entropy": 2.1461312770843506, + "epoch": 2.609375, + "grad_norm": 5.42982816696167, + "learning_rate": 0.00022868845240177032, + "loss": 2.038721799850464, + "mean_token_accuracy": 0.6811023354530334, + "num_tokens": 85504.0, + "step": 167 + }, + { + "entropy": 2.085728645324707, + "epoch": 2.625, + "grad_norm": 4.73753023147583, + "learning_rate": 0.0002279033985248384, + "loss": 1.7772815227508545, + "mean_token_accuracy": 0.7190082669258118, + "num_tokens": 86016.0, + "step": 168 + }, + { + "entropy": 2.0724904537200928, + "epoch": 2.640625, + "grad_norm": 3.573122262954712, + "learning_rate": 0.00022711541162898321, + "loss": 1.9584404230117798, + "mean_token_accuracy": 0.6918489336967468, + "num_tokens": 86528.0, + "step": 169 + }, + { + "entropy": 2.082350492477417, + "epoch": 2.65625, + "grad_norm": 5.604883670806885, + "learning_rate": 0.00022632452138146602, + "loss": 2.0279061794281006, + "mean_token_accuracy": 0.6867470145225525, + "num_tokens": 87040.0, + "step": 170 + }, + { + "entropy": 2.2631471157073975, + "epoch": 2.671875, + "grad_norm": 8.89859390258789, + "learning_rate": 0.00022553075755885762, + "loss": 2.2429392337799072, + "mean_token_accuracy": 0.6515747904777527, + "num_tokens": 87552.0, + "step": 171 + }, + { + "entropy": 2.1096630096435547, + "epoch": 2.6875, + "grad_norm": 15.497108459472656, + "learning_rate": 0.00022473415004591727, + "loss": 1.7870018482208252, + "mean_token_accuracy": 0.7008196711540222, + "num_tokens": 88064.0, + "step": 172 + }, + { + "entropy": 2.14209246635437, + "epoch": 2.703125, + "grad_norm": 18.13780403137207, + "learning_rate": 0.0002239347288344676, + "loss": 2.0227110385894775, + "mean_token_accuracy": 0.6794354915618896, + "num_tokens": 88576.0, + "step": 173 + }, + { + "entropy": 2.0351309776306152, + "epoch": 2.71875, + "grad_norm": 5.5439605712890625, + "learning_rate": 0.00022313252402226538, + "loss": 2.0029079914093018, + "mean_token_accuracy": 0.6673228144645691, + "num_tokens": 89088.0, + "step": 174 + }, + { + "entropy": 2.1739964485168457, + "epoch": 2.734375, + "grad_norm": 19.21829605102539, + "learning_rate": 0.00022232756581186841, + "loss": 2.211519241333008, + "mean_token_accuracy": 0.6594488024711609, + "num_tokens": 89600.0, + "step": 175 + }, + { + "entropy": 2.021829843521118, + "epoch": 2.75, + "grad_norm": 17.91119956970215, + "learning_rate": 0.00022151988450949832, + "loss": 1.7456486225128174, + "mean_token_accuracy": 0.6915322542190552, + "num_tokens": 90112.0, + "step": 176 + }, + { + "entropy": 2.2625017166137695, + "epoch": 2.765625, + "grad_norm": 19.28619956970215, + "learning_rate": 0.00022070951052389966, + "loss": 1.5992084741592407, + "mean_token_accuracy": 0.7397849559783936, + "num_tokens": 90624.0, + "step": 177 + }, + { + "entropy": 2.0457301139831543, + "epoch": 2.78125, + "grad_norm": 6.377933979034424, + "learning_rate": 0.0002198964743651949, + "loss": 2.0016820430755615, + "mean_token_accuracy": 0.6898608207702637, + "num_tokens": 91136.0, + "step": 178 + }, + { + "entropy": 2.1969172954559326, + "epoch": 2.796875, + "grad_norm": 4.351161956787109, + "learning_rate": 0.00021908080664373596, + "loss": 2.069615602493286, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 91648.0, + "step": 179 + }, + { + "entropy": 2.0312910079956055, + "epoch": 2.8125, + "grad_norm": 2.8072102069854736, + "learning_rate": 0.00021826253806895156, + "loss": 1.5687063932418823, + "mean_token_accuracy": 0.7635983228683472, + "num_tokens": 92160.0, + "step": 180 + }, + { + "entropy": 2.0322093963623047, + "epoch": 2.828125, + "grad_norm": 12.090331077575684, + "learning_rate": 0.00021744169944819098, + "loss": 1.9994778633117676, + "mean_token_accuracy": 0.6771653294563293, + "num_tokens": 92672.0, + "step": 181 + }, + { + "entropy": 2.1715853214263916, + "epoch": 2.84375, + "grad_norm": 15.88219165802002, + "learning_rate": 0.00021661832168556438, + "loss": 1.8473044633865356, + "mean_token_accuracy": 0.6991701126098633, + "num_tokens": 93184.0, + "step": 182 + }, + { + "entropy": 2.033459424972534, + "epoch": 2.859375, + "grad_norm": 4.496399402618408, + "learning_rate": 0.00021579243578077913, + "loss": 1.9900826215744019, + "mean_token_accuracy": 0.6948819160461426, + "num_tokens": 93696.0, + "step": 183 + }, + { + "entropy": 2.100759983062744, + "epoch": 2.875, + "grad_norm": 5.62208366394043, + "learning_rate": 0.00021496407282797276, + "loss": 1.7417033910751343, + "mean_token_accuracy": 0.7119675278663635, + "num_tokens": 94208.0, + "step": 184 + }, + { + "entropy": 2.1455624103546143, + "epoch": 2.890625, + "grad_norm": 4.165937423706055, + "learning_rate": 0.0002141332640145423, + "loss": 1.9299179315567017, + "mean_token_accuracy": 0.7094188332557678, + "num_tokens": 94720.0, + "step": 185 + }, + { + "entropy": 2.002096176147461, + "epoch": 2.90625, + "grad_norm": 1.7983911037445068, + "learning_rate": 0.00021330004061996996, + "loss": 1.6952036619186401, + "mean_token_accuracy": 0.7520492076873779, + "num_tokens": 95232.0, + "step": 186 + }, + { + "entropy": 2.0048775672912598, + "epoch": 2.921875, + "grad_norm": 1.7580811977386475, + "learning_rate": 0.00021246443401464558, + "loss": 1.7680833339691162, + "mean_token_accuracy": 0.7364184856414795, + "num_tokens": 95744.0, + "step": 187 + }, + { + "entropy": 2.169144868850708, + "epoch": 2.9375, + "grad_norm": 5.268362522125244, + "learning_rate": 0.00021162647565868556, + "loss": 1.8059192895889282, + "mean_token_accuracy": 0.7242798209190369, + "num_tokens": 96256.0, + "step": 188 + }, + { + "entropy": 2.1363723278045654, + "epoch": 2.953125, + "grad_norm": 3.498081684112549, + "learning_rate": 0.00021078619710074845, + "loss": 2.1745285987854004, + "mean_token_accuracy": 0.6751968264579773, + "num_tokens": 96768.0, + "step": 189 + }, + { + "entropy": 2.148711919784546, + "epoch": 2.96875, + "grad_norm": 6.432483196258545, + "learning_rate": 0.000209943629976847, + "loss": 2.1445229053497314, + "mean_token_accuracy": 0.6830708384513855, + "num_tokens": 97280.0, + "step": 190 + }, + { + "entropy": 2.019306182861328, + "epoch": 2.984375, + "grad_norm": 10.80850601196289, + "learning_rate": 0.0002090988060091572, + "loss": 1.7723709344863892, + "mean_token_accuracy": 0.7313131093978882, + "num_tokens": 97792.0, + "step": 191 + }, + { + "entropy": 1.978491187095642, + "epoch": 3.0, + "grad_norm": 4.951074123382568, + "learning_rate": 0.00020825175700482393, + "loss": 2.0183067321777344, + "mean_token_accuracy": 0.7105788588523865, + "num_tokens": 98304.0, + "step": 192 + }, + { + "entropy": 1.889210820198059, + "epoch": 3.015625, + "grad_norm": 7.480757713317871, + "learning_rate": 0.00020740251485476345, + "loss": 1.4690678119659424, + "mean_token_accuracy": 0.7515657544136047, + "num_tokens": 98816.0, + "step": 193 + }, + { + "entropy": 1.83897066116333, + "epoch": 3.03125, + "grad_norm": 3.0285542011260986, + "learning_rate": 0.00020655111153246273, + "loss": 1.531701922416687, + "mean_token_accuracy": 0.7408906817436218, + "num_tokens": 99328.0, + "step": 194 + }, + { + "entropy": 1.9947177171707153, + "epoch": 3.046875, + "grad_norm": 11.762626647949219, + "learning_rate": 0.00020569757909277562, + "loss": 1.6982847452163696, + "mean_token_accuracy": 0.7285714149475098, + "num_tokens": 99840.0, + "step": 195 + }, + { + "entropy": 2.1106882095336914, + "epoch": 3.0625, + "grad_norm": 13.554593086242676, + "learning_rate": 0.00020484194967071608, + "loss": 1.6521421670913696, + "mean_token_accuracy": 0.7452631592750549, + "num_tokens": 100352.0, + "step": 196 + }, + { + "entropy": 2.124321937561035, + "epoch": 3.078125, + "grad_norm": 7.983447074890137, + "learning_rate": 0.00020398425548024822, + "loss": 2.0003597736358643, + "mean_token_accuracy": 0.6988189220428467, + "num_tokens": 100864.0, + "step": 197 + }, + { + "entropy": 1.918179988861084, + "epoch": 3.09375, + "grad_norm": 9.18137264251709, + "learning_rate": 0.00020312452881307355, + "loss": 1.682350993156433, + "mean_token_accuracy": 0.7344064116477966, + "num_tokens": 101376.0, + "step": 198 + }, + { + "entropy": 1.787161946296692, + "epoch": 3.109375, + "grad_norm": 21.148954391479492, + "learning_rate": 0.00020226280203741514, + "loss": 1.5121514797210693, + "mean_token_accuracy": 0.7413442134857178, + "num_tokens": 101888.0, + "step": 199 + }, + { + "entropy": 1.9293241500854492, + "epoch": 3.125, + "grad_norm": 20.98975944519043, + "learning_rate": 0.00020139910759679915, + "loss": 1.4278969764709473, + "mean_token_accuracy": 0.7689075469970703, + "num_tokens": 102400.0, + "step": 200 + }, + { + "entropy": 1.7874903678894043, + "epoch": 3.140625, + "grad_norm": 16.30684471130371, + "learning_rate": 0.00020053347800883298, + "loss": 1.8084406852722168, + "mean_token_accuracy": 0.7157257795333862, + "num_tokens": 102912.0, + "step": 201 + }, + { + "entropy": 1.6876550912857056, + "epoch": 3.15625, + "grad_norm": 13.622233390808105, + "learning_rate": 0.00019966594586398145, + "loss": 1.6798195838928223, + "mean_token_accuracy": 0.7037773132324219, + "num_tokens": 103424.0, + "step": 202 + }, + { + "entropy": 1.776758074760437, + "epoch": 3.171875, + "grad_norm": 6.037537574768066, + "learning_rate": 0.00019879654382433943, + "loss": 1.6979624032974243, + "mean_token_accuracy": 0.7298387289047241, + "num_tokens": 103936.0, + "step": 203 + }, + { + "entropy": 1.850691318511963, + "epoch": 3.1875, + "grad_norm": 2.61728835105896, + "learning_rate": 0.00019792530462240234, + "loss": 1.6017121076583862, + "mean_token_accuracy": 0.7342799305915833, + "num_tokens": 104448.0, + "step": 204 + }, + { + "entropy": 1.8923485279083252, + "epoch": 3.203125, + "grad_norm": 7.442923069000244, + "learning_rate": 0.00019705226105983374, + "loss": 1.687976360321045, + "mean_token_accuracy": 0.7244094610214233, + "num_tokens": 104960.0, + "step": 205 + }, + { + "entropy": 2.107788324356079, + "epoch": 3.21875, + "grad_norm": 19.86151695251465, + "learning_rate": 0.00019617744600623023, + "loss": 2.016284942626953, + "mean_token_accuracy": 0.6968504190444946, + "num_tokens": 105472.0, + "step": 206 + }, + { + "entropy": 2.0825321674346924, + "epoch": 3.234375, + "grad_norm": 20.006290435791016, + "learning_rate": 0.00019530089239788422, + "loss": 1.8263378143310547, + "mean_token_accuracy": 0.7134020328521729, + "num_tokens": 105984.0, + "step": 207 + }, + { + "entropy": 2.016390800476074, + "epoch": 3.25, + "grad_norm": 15.552451133728027, + "learning_rate": 0.00019442263323654358, + "loss": 1.716286063194275, + "mean_token_accuracy": 0.7065868377685547, + "num_tokens": 106496.0, + "step": 208 + }, + { + "entropy": 1.9402761459350586, + "epoch": 3.265625, + "grad_norm": 10.624086380004883, + "learning_rate": 0.0001935427015881693, + "loss": 1.7164943218231201, + "mean_token_accuracy": 0.7269076108932495, + "num_tokens": 107008.0, + "step": 209 + }, + { + "entropy": 1.8228812217712402, + "epoch": 3.28125, + "grad_norm": 21.39549446105957, + "learning_rate": 0.00019266113058169076, + "loss": 1.7704980373382568, + "mean_token_accuracy": 0.7145669460296631, + "num_tokens": 107520.0, + "step": 210 + }, + { + "entropy": 1.7182958126068115, + "epoch": 3.296875, + "grad_norm": 23.144405364990234, + "learning_rate": 0.00019177795340775792, + "loss": 1.7252445220947266, + "mean_token_accuracy": 0.7263779640197754, + "num_tokens": 108032.0, + "step": 211 + }, + { + "entropy": 1.808842658996582, + "epoch": 3.3125, + "grad_norm": 22.433195114135742, + "learning_rate": 0.00019089320331749235, + "loss": 1.713385820388794, + "mean_token_accuracy": 0.7269076108932495, + "num_tokens": 108544.0, + "step": 212 + }, + { + "entropy": 1.8282392024993896, + "epoch": 3.328125, + "grad_norm": 18.469093322753906, + "learning_rate": 0.00019000691362123473, + "loss": 1.8379396200180054, + "mean_token_accuracy": 0.7134387493133545, + "num_tokens": 109056.0, + "step": 213 + }, + { + "entropy": 1.8023868799209595, + "epoch": 3.34375, + "grad_norm": 19.18539047241211, + "learning_rate": 0.0001891191176872913, + "loss": 1.8020644187927246, + "mean_token_accuracy": 0.7007874250411987, + "num_tokens": 109568.0, + "step": 214 + }, + { + "entropy": 1.677249789237976, + "epoch": 3.359375, + "grad_norm": 18.677223205566406, + "learning_rate": 0.00018822984894067719, + "loss": 1.4826351404190063, + "mean_token_accuracy": 0.7551020383834839, + "num_tokens": 110080.0, + "step": 215 + }, + { + "entropy": 1.930768609046936, + "epoch": 3.375, + "grad_norm": 11.903674125671387, + "learning_rate": 0.00018733914086185803, + "loss": 1.5919502973556519, + "mean_token_accuracy": 0.7355371713638306, + "num_tokens": 110592.0, + "step": 216 + }, + { + "entropy": 1.727242350578308, + "epoch": 3.390625, + "grad_norm": 3.6214687824249268, + "learning_rate": 0.0001864470269854896, + "loss": 1.5873475074768066, + "mean_token_accuracy": 0.7410358786582947, + "num_tokens": 111104.0, + "step": 217 + }, + { + "entropy": 1.7539688348770142, + "epoch": 3.40625, + "grad_norm": 9.966615676879883, + "learning_rate": 0.0001855535408991551, + "loss": 1.5512422323226929, + "mean_token_accuracy": 0.7586911916732788, + "num_tokens": 111616.0, + "step": 218 + }, + { + "entropy": 1.534348726272583, + "epoch": 3.421875, + "grad_norm": 15.220844268798828, + "learning_rate": 0.00018465871624210068, + "loss": 1.485011339187622, + "mean_token_accuracy": 0.7715430855751038, + "num_tokens": 112128.0, + "step": 219 + }, + { + "entropy": 1.7263754606246948, + "epoch": 3.4375, + "grad_norm": 9.868559837341309, + "learning_rate": 0.00018376258670396888, + "loss": 1.5979400873184204, + "mean_token_accuracy": 0.7459016442298889, + "num_tokens": 112640.0, + "step": 220 + }, + { + "entropy": 1.7610721588134766, + "epoch": 3.453125, + "grad_norm": 5.717709541320801, + "learning_rate": 0.00018286518602353045, + "loss": 1.5840563774108887, + "mean_token_accuracy": 0.751028835773468, + "num_tokens": 113152.0, + "step": 221 + }, + { + "entropy": 1.7383480072021484, + "epoch": 3.46875, + "grad_norm": 4.265955448150635, + "learning_rate": 0.00018196654798741368, + "loss": 1.6178569793701172, + "mean_token_accuracy": 0.751968502998352, + "num_tokens": 113664.0, + "step": 222 + }, + { + "entropy": 1.9051017761230469, + "epoch": 3.484375, + "grad_norm": 4.079834461212158, + "learning_rate": 0.00018106670642883277, + "loss": 1.125648856163025, + "mean_token_accuracy": 0.8163716793060303, + "num_tokens": 114176.0, + "step": 223 + }, + { + "entropy": 1.7379391193389893, + "epoch": 3.5, + "grad_norm": 3.7545363903045654, + "learning_rate": 0.00018016569522631378, + "loss": 1.2588374614715576, + "mean_token_accuracy": 0.7928870320320129, + "num_tokens": 114688.0, + "step": 224 + }, + { + "entropy": 1.7147713899612427, + "epoch": 3.515625, + "grad_norm": 3.103086471557617, + "learning_rate": 0.00017926354830241924, + "loss": 1.4433473348617554, + "mean_token_accuracy": 0.7766393423080444, + "num_tokens": 115200.0, + "step": 225 + }, + { + "entropy": 1.804306149482727, + "epoch": 3.53125, + "grad_norm": 2.8875999450683594, + "learning_rate": 0.00017836029962247092, + "loss": 1.563567042350769, + "mean_token_accuracy": 0.7510204315185547, + "num_tokens": 115712.0, + "step": 226 + }, + { + "entropy": 1.7466846704483032, + "epoch": 3.546875, + "grad_norm": 4.622049331665039, + "learning_rate": 0.00017745598319327116, + "loss": 1.6097654104232788, + "mean_token_accuracy": 0.7484909296035767, + "num_tokens": 116224.0, + "step": 227 + }, + { + "entropy": 1.8188687562942505, + "epoch": 3.5625, + "grad_norm": 5.317627906799316, + "learning_rate": 0.00017655063306182232, + "loss": 1.7428910732269287, + "mean_token_accuracy": 0.7283464670181274, + "num_tokens": 116736.0, + "step": 228 + }, + { + "entropy": 1.9028565883636475, + "epoch": 3.578125, + "grad_norm": 2.2038638591766357, + "learning_rate": 0.00017564428331404519, + "loss": 1.5572713613510132, + "mean_token_accuracy": 0.7560975551605225, + "num_tokens": 117248.0, + "step": 229 + }, + { + "entropy": 1.766879677772522, + "epoch": 3.59375, + "grad_norm": 1.8373303413391113, + "learning_rate": 0.0001747369680734955, + "loss": 1.3965166807174683, + "mean_token_accuracy": 0.7731958627700806, + "num_tokens": 117760.0, + "step": 230 + }, + { + "entropy": 2.0388832092285156, + "epoch": 3.609375, + "grad_norm": 2.9878053665161133, + "learning_rate": 0.0001738287215000792, + "loss": 1.5748356580734253, + "mean_token_accuracy": 0.7380457520484924, + "num_tokens": 118272.0, + "step": 231 + }, + { + "entropy": 1.748144268989563, + "epoch": 3.625, + "grad_norm": 2.4010939598083496, + "learning_rate": 0.0001729195777887665, + "loss": 1.605446219444275, + "mean_token_accuracy": 0.753564178943634, + "num_tokens": 118784.0, + "step": 232 + }, + { + "entropy": 1.8476933240890503, + "epoch": 3.640625, + "grad_norm": 2.0416951179504395, + "learning_rate": 0.00017200957116830423, + "loss": 1.83555006980896, + "mean_token_accuracy": 0.7313131093978882, + "num_tokens": 119296.0, + "step": 233 + }, + { + "entropy": 1.730068564414978, + "epoch": 3.65625, + "grad_norm": 2.2305748462677, + "learning_rate": 0.00017109873589992737, + "loss": 1.4430031776428223, + "mean_token_accuracy": 0.7628865838050842, + "num_tokens": 119808.0, + "step": 234 + }, + { + "entropy": 1.5812182426452637, + "epoch": 3.671875, + "grad_norm": 2.0247128009796143, + "learning_rate": 0.00017018710627606892, + "loss": 1.2767280340194702, + "mean_token_accuracy": 0.78925621509552, + "num_tokens": 120320.0, + "step": 235 + }, + { + "entropy": 1.6156758069992065, + "epoch": 3.6875, + "grad_norm": 2.141160488128662, + "learning_rate": 0.00016927471661906898, + "loss": 1.5877039432525635, + "mean_token_accuracy": 0.7560483813285828, + "num_tokens": 120832.0, + "step": 236 + }, + { + "entropy": 1.81034517288208, + "epoch": 3.703125, + "grad_norm": 6.961944580078125, + "learning_rate": 0.00016836160127988242, + "loss": 1.7641907930374146, + "mean_token_accuracy": 0.7263779640197754, + "num_tokens": 121344.0, + "step": 237 + }, + { + "entropy": 1.7935938835144043, + "epoch": 3.71875, + "grad_norm": 8.582378387451172, + "learning_rate": 0.00016744779463678572, + "loss": 1.6680549383163452, + "mean_token_accuracy": 0.7401574850082397, + "num_tokens": 121856.0, + "step": 238 + }, + { + "entropy": 1.9910942316055298, + "epoch": 3.734375, + "grad_norm": 2.0824148654937744, + "learning_rate": 0.00016653333109408248, + "loss": 1.7776023149490356, + "mean_token_accuracy": 0.7283464670181274, + "num_tokens": 122368.0, + "step": 239 + }, + { + "entropy": 1.8618779182434082, + "epoch": 3.75, + "grad_norm": 2.343003988265991, + "learning_rate": 0.00016561824508080819, + "loss": 1.5893044471740723, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 122880.0, + "step": 240 + }, + { + "entropy": 1.6664297580718994, + "epoch": 3.765625, + "grad_norm": 2.6146957874298096, + "learning_rate": 0.0001647025710494341, + "loss": 1.7686117887496948, + "mean_token_accuracy": 0.7185039520263672, + "num_tokens": 123392.0, + "step": 241 + }, + { + "entropy": 1.7106194496154785, + "epoch": 3.78125, + "grad_norm": 1.93027925491333, + "learning_rate": 0.00016378634347456988, + "loss": 1.0115760564804077, + "mean_token_accuracy": 0.8384955525398254, + "num_tokens": 123904.0, + "step": 242 + }, + { + "entropy": 1.7875338792800903, + "epoch": 3.796875, + "grad_norm": 1.9203755855560303, + "learning_rate": 0.000162869596851666, + "loss": 1.6601542234420776, + "mean_token_accuracy": 0.7581967115402222, + "num_tokens": 124416.0, + "step": 243 + }, + { + "entropy": 1.8426944017410278, + "epoch": 3.8125, + "grad_norm": 2.230609178543091, + "learning_rate": 0.0001619523656957145, + "loss": 1.7351903915405273, + "mean_token_accuracy": 0.7386138439178467, + "num_tokens": 124928.0, + "step": 244 + }, + { + "entropy": 1.7839840650558472, + "epoch": 3.828125, + "grad_norm": 2.761748790740967, + "learning_rate": 0.00016103468453995012, + "loss": 1.7518467903137207, + "mean_token_accuracy": 0.7263779640197754, + "num_tokens": 125440.0, + "step": 245 + }, + { + "entropy": 1.8428518772125244, + "epoch": 3.84375, + "grad_norm": 2.753977060317993, + "learning_rate": 0.0001601165879345496, + "loss": 1.6037272214889526, + "mean_token_accuracy": 0.7444218993186951, + "num_tokens": 125952.0, + "step": 246 + }, + { + "entropy": 1.7670025825500488, + "epoch": 3.859375, + "grad_norm": 3.583522319793701, + "learning_rate": 0.00015919811044533128, + "loss": 1.8228002786636353, + "mean_token_accuracy": 0.7185039520263672, + "num_tokens": 126464.0, + "step": 247 + }, + { + "entropy": 1.7220356464385986, + "epoch": 3.875, + "grad_norm": 2.739765167236328, + "learning_rate": 0.0001582792866524535, + "loss": 1.6976571083068848, + "mean_token_accuracy": 0.7404426336288452, + "num_tokens": 126976.0, + "step": 248 + }, + { + "entropy": 1.8917089700698853, + "epoch": 3.890625, + "grad_norm": 2.6218745708465576, + "learning_rate": 0.0001573601511491127, + "loss": 1.3674724102020264, + "mean_token_accuracy": 0.7713097929954529, + "num_tokens": 127488.0, + "step": 249 + }, + { + "entropy": 1.785091757774353, + "epoch": 3.90625, + "grad_norm": 2.275317907333374, + "learning_rate": 0.00015644073854024113, + "loss": 1.5308012962341309, + "mean_token_accuracy": 0.765999972820282, + "num_tokens": 128000.0, + "step": 250 + }, + { + "entropy": 2.054109573364258, + "epoch": 3.921875, + "grad_norm": 2.2449769973754883, + "learning_rate": 0.00015552108344120383, + "loss": 1.0721290111541748, + "mean_token_accuracy": 0.8066666722297668, + "num_tokens": 128512.0, + "step": 251 + }, + { + "entropy": 2.0395283699035645, + "epoch": 3.9375, + "grad_norm": 1.5280213356018066, + "learning_rate": 0.0001546012204764955, + "loss": 1.3407353162765503, + "mean_token_accuracy": 0.7982832789421082, + "num_tokens": 129024.0, + "step": 252 + }, + { + "entropy": 1.815192461013794, + "epoch": 3.953125, + "grad_norm": 2.583451271057129, + "learning_rate": 0.00015368118427843682, + "loss": 1.573038101196289, + "mean_token_accuracy": 0.7582644820213318, + "num_tokens": 129536.0, + "step": 253 + }, + { + "entropy": 1.9515974521636963, + "epoch": 3.96875, + "grad_norm": 3.4174747467041016, + "learning_rate": 0.0001527610094858707, + "loss": 2.025150775909424, + "mean_token_accuracy": 0.6889764070510864, + "num_tokens": 130048.0, + "step": 254 + }, + { + "entropy": 1.8663040399551392, + "epoch": 3.984375, + "grad_norm": 1.9246138334274292, + "learning_rate": 0.00015184073074285797, + "loss": 1.731005311012268, + "mean_token_accuracy": 0.7358871102333069, + "num_tokens": 130560.0, + "step": 255 + }, + { + "entropy": 1.9476964473724365, + "epoch": 4.0, + "grad_norm": 1.8395166397094727, + "learning_rate": 0.00015092038269737317, + "loss": 1.8282963037490845, + "mean_token_accuracy": 0.7285429239273071, + "num_tokens": 131072.0, + "step": 256 + }, + { + "entropy": 1.9428930282592773, + "epoch": 4.015625, + "grad_norm": 2.5020766258239746, + "learning_rate": 0.00015, + "loss": 1.7320835590362549, + "mean_token_accuracy": 0.7224409580230713, + "num_tokens": 131584.0, + "step": 257 + }, + { + "entropy": 1.7875992059707642, + "epoch": 4.03125, + "grad_norm": 2.313523769378662, + "learning_rate": 0.00014907961730262684, + "loss": 1.6047075986862183, + "mean_token_accuracy": 0.7401574850082397, + "num_tokens": 132096.0, + "step": 258 + }, + { + "entropy": 1.5366038084030151, + "epoch": 4.046875, + "grad_norm": 2.181321382522583, + "learning_rate": 0.000148159269257142, + "loss": 1.5011101961135864, + "mean_token_accuracy": 0.7644710540771484, + "num_tokens": 132608.0, + "step": 259 + }, + { + "entropy": 1.6452386379241943, + "epoch": 4.0625, + "grad_norm": 4.90228796005249, + "learning_rate": 0.00014723899051412927, + "loss": 1.4501490592956543, + "mean_token_accuracy": 0.7625754475593567, + "num_tokens": 133120.0, + "step": 260 + }, + { + "entropy": 1.683490514755249, + "epoch": 4.078125, + "grad_norm": 3.2073004245758057, + "learning_rate": 0.00014631881572156315, + "loss": 1.290379285812378, + "mean_token_accuracy": 0.7727272510528564, + "num_tokens": 133632.0, + "step": 261 + }, + { + "entropy": 1.7427382469177246, + "epoch": 4.09375, + "grad_norm": 2.1592955589294434, + "learning_rate": 0.0001453987795235045, + "loss": 1.4221155643463135, + "mean_token_accuracy": 0.782608687877655, + "num_tokens": 134144.0, + "step": 262 + }, + { + "entropy": 1.7729072570800781, + "epoch": 4.109375, + "grad_norm": 1.8065860271453857, + "learning_rate": 0.00014447891655879617, + "loss": 1.433501958847046, + "mean_token_accuracy": 0.7873684167861938, + "num_tokens": 134656.0, + "step": 263 + }, + { + "entropy": 1.8184741735458374, + "epoch": 4.125, + "grad_norm": 2.002441167831421, + "learning_rate": 0.00014355926145975887, + "loss": 1.6424577236175537, + "mean_token_accuracy": 0.7247524857521057, + "num_tokens": 135168.0, + "step": 264 + }, + { + "entropy": 1.6545014381408691, + "epoch": 4.140625, + "grad_norm": 1.860556960105896, + "learning_rate": 0.0001426398488508873, + "loss": 1.4745804071426392, + "mean_token_accuracy": 0.757515013217926, + "num_tokens": 135680.0, + "step": 265 + }, + { + "entropy": 1.6565005779266357, + "epoch": 4.15625, + "grad_norm": 1.6392408609390259, + "learning_rate": 0.0001417207133475465, + "loss": 1.3510708808898926, + "mean_token_accuracy": 0.7739307284355164, + "num_tokens": 136192.0, + "step": 266 + }, + { + "entropy": 1.5082423686981201, + "epoch": 4.171875, + "grad_norm": 8.039023399353027, + "learning_rate": 0.0001408018895546687, + "loss": 1.5158226490020752, + "mean_token_accuracy": 0.7657480239868164, + "num_tokens": 136704.0, + "step": 267 + }, + { + "entropy": 1.58040452003479, + "epoch": 4.1875, + "grad_norm": 2.5063955783843994, + "learning_rate": 0.00013988341206545038, + "loss": 1.3905433416366577, + "mean_token_accuracy": 0.7622950673103333, + "num_tokens": 137216.0, + "step": 268 + }, + { + "entropy": 1.4347314834594727, + "epoch": 4.203125, + "grad_norm": 2.2003631591796875, + "learning_rate": 0.00013896531546004988, + "loss": 1.3757840394973755, + "mean_token_accuracy": 0.7868525981903076, + "num_tokens": 137728.0, + "step": 269 + }, + { + "entropy": 1.5444324016571045, + "epoch": 4.21875, + "grad_norm": 2.6288857460021973, + "learning_rate": 0.00013804763430428548, + "loss": 1.5756033658981323, + "mean_token_accuracy": 0.7539370059967041, + "num_tokens": 138240.0, + "step": 270 + }, + { + "entropy": 1.7162096500396729, + "epoch": 4.234375, + "grad_norm": 1.8113332986831665, + "learning_rate": 0.00013713040314833404, + "loss": 1.460745096206665, + "mean_token_accuracy": 0.7789255976676941, + "num_tokens": 138752.0, + "step": 271 + }, + { + "entropy": 1.6978294849395752, + "epoch": 4.25, + "grad_norm": 2.186129570007324, + "learning_rate": 0.0001362136565254301, + "loss": 1.3529666662216187, + "mean_token_accuracy": 0.7899159789085388, + "num_tokens": 139264.0, + "step": 272 + }, + { + "entropy": 1.716745138168335, + "epoch": 4.265625, + "grad_norm": 1.89174222946167, + "learning_rate": 0.0001352974289505659, + "loss": 1.4731371402740479, + "mean_token_accuracy": 0.7571428418159485, + "num_tokens": 139776.0, + "step": 273 + }, + { + "entropy": 1.6406333446502686, + "epoch": 4.28125, + "grad_norm": 1.6710184812545776, + "learning_rate": 0.0001343817549191918, + "loss": 1.4243406057357788, + "mean_token_accuracy": 0.7645875215530396, + "num_tokens": 140288.0, + "step": 274 + }, + { + "entropy": 1.6350668668746948, + "epoch": 4.296875, + "grad_norm": 2.4510951042175293, + "learning_rate": 0.00013346666890591753, + "loss": 1.5063180923461914, + "mean_token_accuracy": 0.7625754475593567, + "num_tokens": 140800.0, + "step": 275 + }, + { + "entropy": 1.6373059749603271, + "epoch": 4.3125, + "grad_norm": 3.550650119781494, + "learning_rate": 0.00013255220536321428, + "loss": 1.7208104133605957, + "mean_token_accuracy": 0.7244094610214233, + "num_tokens": 141312.0, + "step": 276 + }, + { + "entropy": 1.7569215297698975, + "epoch": 4.328125, + "grad_norm": 2.7330479621887207, + "learning_rate": 0.00013163839872011758, + "loss": 1.7864205837249756, + "mean_token_accuracy": 0.7244094610214233, + "num_tokens": 141824.0, + "step": 277 + }, + { + "entropy": 1.7542294263839722, + "epoch": 4.34375, + "grad_norm": 3.1029021739959717, + "learning_rate": 0.00013072528338093102, + "loss": 1.4170482158660889, + "mean_token_accuracy": 0.7632653117179871, + "num_tokens": 142336.0, + "step": 278 + }, + { + "entropy": 1.7410451173782349, + "epoch": 4.359375, + "grad_norm": 2.644277811050415, + "learning_rate": 0.00012981289372393108, + "loss": 1.7685855627059937, + "mean_token_accuracy": 0.7145669460296631, + "num_tokens": 142848.0, + "step": 279 + }, + { + "entropy": 1.586928129196167, + "epoch": 4.375, + "grad_norm": 2.635669469833374, + "learning_rate": 0.00012890126410007263, + "loss": 1.4982590675354004, + "mean_token_accuracy": 0.7569169998168945, + "num_tokens": 143360.0, + "step": 280 + }, + { + "entropy": 1.649911642074585, + "epoch": 4.390625, + "grad_norm": 1.8594759702682495, + "learning_rate": 0.00012799042883169574, + "loss": 1.5442874431610107, + "mean_token_accuracy": 0.7460629940032959, + "num_tokens": 143872.0, + "step": 281 + }, + { + "entropy": 1.5503339767456055, + "epoch": 4.40625, + "grad_norm": 1.9106673002243042, + "learning_rate": 0.0001270804222112335, + "loss": 1.3214415311813354, + "mean_token_accuracy": 0.7707910537719727, + "num_tokens": 144384.0, + "step": 282 + }, + { + "entropy": 1.6034382581710815, + "epoch": 4.421875, + "grad_norm": 1.9895292520523071, + "learning_rate": 0.0001261712784999208, + "loss": 1.1624211072921753, + "mean_token_accuracy": 0.8087317943572998, + "num_tokens": 144896.0, + "step": 283 + }, + { + "entropy": 1.6620171070098877, + "epoch": 4.4375, + "grad_norm": 1.7769454717636108, + "learning_rate": 0.0001252630319265045, + "loss": 1.5078996419906616, + "mean_token_accuracy": 0.7757575511932373, + "num_tokens": 145408.0, + "step": 284 + }, + { + "entropy": 1.6838772296905518, + "epoch": 4.453125, + "grad_norm": 2.0985946655273438, + "learning_rate": 0.0001243557166859548, + "loss": 1.7390193939208984, + "mean_token_accuracy": 0.7263779640197754, + "num_tokens": 145920.0, + "step": 285 + }, + { + "entropy": 1.6525346040725708, + "epoch": 4.46875, + "grad_norm": 2.0259389877319336, + "learning_rate": 0.00012344936693817768, + "loss": 1.3714038133621216, + "mean_token_accuracy": 0.7653061151504517, + "num_tokens": 146432.0, + "step": 286 + }, + { + "entropy": 1.6501621007919312, + "epoch": 4.484375, + "grad_norm": 1.975203275680542, + "learning_rate": 0.00012254401680672884, + "loss": 1.5183178186416626, + "mean_token_accuracy": 0.75, + "num_tokens": 146944.0, + "step": 287 + }, + { + "entropy": 1.548411250114441, + "epoch": 4.5, + "grad_norm": 2.04488205909729, + "learning_rate": 0.00012163970037752906, + "loss": 1.3318936824798584, + "mean_token_accuracy": 0.7874494194984436, + "num_tokens": 147456.0, + "step": 288 + }, + { + "entropy": 1.626534104347229, + "epoch": 4.515625, + "grad_norm": 1.795937418937683, + "learning_rate": 0.00012073645169758076, + "loss": 1.386396050453186, + "mean_token_accuracy": 0.7818930149078369, + "num_tokens": 147968.0, + "step": 289 + }, + { + "entropy": 1.6640959978103638, + "epoch": 4.53125, + "grad_norm": 1.907228946685791, + "learning_rate": 0.00011983430477368622, + "loss": 1.4364591836929321, + "mean_token_accuracy": 0.7804877758026123, + "num_tokens": 148480.0, + "step": 290 + }, + { + "entropy": 1.5656487941741943, + "epoch": 4.546875, + "grad_norm": 1.5509178638458252, + "learning_rate": 0.00011893329357116722, + "loss": 1.4595284461975098, + "mean_token_accuracy": 0.7677165269851685, + "num_tokens": 148992.0, + "step": 291 + }, + { + "entropy": 1.5234248638153076, + "epoch": 4.5625, + "grad_norm": 2.0590877532958984, + "learning_rate": 0.0001180334520125863, + "loss": 1.4068399667739868, + "mean_token_accuracy": 0.772819459438324, + "num_tokens": 149504.0, + "step": 292 + }, + { + "entropy": 1.8282629251480103, + "epoch": 4.578125, + "grad_norm": 1.5572700500488281, + "learning_rate": 0.00011713481397646953, + "loss": 1.334957242012024, + "mean_token_accuracy": 0.7923728823661804, + "num_tokens": 150016.0, + "step": 293 + }, + { + "entropy": 1.6177082061767578, + "epoch": 4.59375, + "grad_norm": 1.9526519775390625, + "learning_rate": 0.00011623741329603108, + "loss": 1.2470734119415283, + "mean_token_accuracy": 0.7995867729187012, + "num_tokens": 150528.0, + "step": 294 + }, + { + "entropy": 1.8059481382369995, + "epoch": 4.609375, + "grad_norm": 1.610900640487671, + "learning_rate": 0.00011534128375789933, + "loss": 0.9543240070343018, + "mean_token_accuracy": 0.8344519138336182, + "num_tokens": 151040.0, + "step": 295 + }, + { + "entropy": 1.8307008743286133, + "epoch": 4.625, + "grad_norm": 2.0146024227142334, + "learning_rate": 0.0001144464591008449, + "loss": 1.7310861349105835, + "mean_token_accuracy": 0.7263779640197754, + "num_tokens": 151552.0, + "step": 296 + }, + { + "entropy": 1.754088282585144, + "epoch": 4.640625, + "grad_norm": 1.8568058013916016, + "learning_rate": 0.00011355297301451042, + "loss": 1.3614675998687744, + "mean_token_accuracy": 0.7957446575164795, + "num_tokens": 152064.0, + "step": 297 + }, + { + "entropy": 1.800140380859375, + "epoch": 4.65625, + "grad_norm": 1.858344554901123, + "learning_rate": 0.00011266085913814197, + "loss": 1.2782111167907715, + "mean_token_accuracy": 0.7932489514350891, + "num_tokens": 152576.0, + "step": 298 + }, + { + "entropy": 1.6610287427902222, + "epoch": 4.671875, + "grad_norm": 2.0229034423828125, + "learning_rate": 0.00011177015105932281, + "loss": 1.6102267503738403, + "mean_token_accuracy": 0.7490118741989136, + "num_tokens": 153088.0, + "step": 299 + }, + { + "entropy": 1.509531021118164, + "epoch": 4.6875, + "grad_norm": 1.8015098571777344, + "learning_rate": 0.00011088088231270866, + "loss": 1.0522032976150513, + "mean_token_accuracy": 0.8229166865348816, + "num_tokens": 153600.0, + "step": 300 + }, + { + "entropy": 1.5897778272628784, + "epoch": 4.703125, + "grad_norm": 2.122609853744507, + "learning_rate": 0.00010999308637876524, + "loss": 1.3583630323410034, + "mean_token_accuracy": 0.7762096524238586, + "num_tokens": 154112.0, + "step": 301 + }, + { + "entropy": 1.5236281156539917, + "epoch": 4.71875, + "grad_norm": 2.0490593910217285, + "learning_rate": 0.00010910679668250767, + "loss": 1.568931221961975, + "mean_token_accuracy": 0.7618110179901123, + "num_tokens": 154624.0, + "step": 302 + }, + { + "entropy": 1.6798558235168457, + "epoch": 4.734375, + "grad_norm": 1.9797496795654297, + "learning_rate": 0.00010822204659224204, + "loss": 1.6891627311706543, + "mean_token_accuracy": 0.7342519760131836, + "num_tokens": 155136.0, + "step": 303 + }, + { + "entropy": 1.6777228116989136, + "epoch": 4.75, + "grad_norm": 1.6742204427719116, + "learning_rate": 0.00010733886941830923, + "loss": 1.0915815830230713, + "mean_token_accuracy": 0.8258064389228821, + "num_tokens": 155648.0, + "step": 304 + }, + { + "entropy": 1.6578925848007202, + "epoch": 4.765625, + "grad_norm": 1.6061065196990967, + "learning_rate": 0.00010645729841183066, + "loss": 1.469952940940857, + "mean_token_accuracy": 0.7644710540771484, + "num_tokens": 156160.0, + "step": 305 + }, + { + "entropy": 1.7077447175979614, + "epoch": 4.78125, + "grad_norm": 2.1335628032684326, + "learning_rate": 0.0001055773667634564, + "loss": 1.4437124729156494, + "mean_token_accuracy": 0.7602459192276001, + "num_tokens": 156672.0, + "step": 306 + }, + { + "entropy": 1.5449203252792358, + "epoch": 4.796875, + "grad_norm": 2.039146900177002, + "learning_rate": 0.00010469910760211578, + "loss": 1.4151314496994019, + "mean_token_accuracy": 0.772819459438324, + "num_tokens": 157184.0, + "step": 307 + }, + { + "entropy": 1.5264627933502197, + "epoch": 4.8125, + "grad_norm": 1.8946317434310913, + "learning_rate": 0.00010382255399376975, + "loss": 1.4154433012008667, + "mean_token_accuracy": 0.7569721341133118, + "num_tokens": 157696.0, + "step": 308 + }, + { + "entropy": 1.564460277557373, + "epoch": 4.828125, + "grad_norm": 2.3768820762634277, + "learning_rate": 0.00010294773894016627, + "loss": 1.3997899293899536, + "mean_token_accuracy": 0.7730923891067505, + "num_tokens": 158208.0, + "step": 309 + }, + { + "entropy": 1.548466682434082, + "epoch": 4.84375, + "grad_norm": 2.616581439971924, + "learning_rate": 0.00010207469537759764, + "loss": 1.2758865356445312, + "mean_token_accuracy": 0.7893660664558411, + "num_tokens": 158720.0, + "step": 310 + }, + { + "entropy": 1.6836023330688477, + "epoch": 4.859375, + "grad_norm": 1.9081612825393677, + "learning_rate": 0.00010120345617566057, + "loss": 1.1824684143066406, + "mean_token_accuracy": 0.8062499761581421, + "num_tokens": 159232.0, + "step": 311 + }, + { + "entropy": 1.6853163242340088, + "epoch": 4.875, + "grad_norm": 1.8191639184951782, + "learning_rate": 0.00010033405413601855, + "loss": 1.3885215520858765, + "mean_token_accuracy": 0.7745901346206665, + "num_tokens": 159744.0, + "step": 312 + }, + { + "entropy": 1.5399816036224365, + "epoch": 4.890625, + "grad_norm": 1.456498146057129, + "learning_rate": 9.946652199116699e-05, + "loss": 1.1631718873977661, + "mean_token_accuracy": 0.8189300298690796, + "num_tokens": 160256.0, + "step": 313 + }, + { + "entropy": 1.7912606000900269, + "epoch": 4.90625, + "grad_norm": 1.6905421018600464, + "learning_rate": 9.860089240320085e-05, + "loss": 1.2534205913543701, + "mean_token_accuracy": 0.7970085740089417, + "num_tokens": 160768.0, + "step": 314 + }, + { + "entropy": 1.63590407371521, + "epoch": 4.921875, + "grad_norm": 1.4305380582809448, + "learning_rate": 9.773719796258482e-05, + "loss": 1.370961308479309, + "mean_token_accuracy": 0.7987551689147949, + "num_tokens": 161280.0, + "step": 315 + }, + { + "entropy": 1.5808641910552979, + "epoch": 4.9375, + "grad_norm": 1.9045383930206299, + "learning_rate": 9.687547118692643e-05, + "loss": 1.3712960481643677, + "mean_token_accuracy": 0.7773279547691345, + "num_tokens": 161792.0, + "step": 316 + }, + { + "entropy": 1.5849716663360596, + "epoch": 4.953125, + "grad_norm": 2.0240650177001953, + "learning_rate": 9.601574451975175e-05, + "loss": 1.5580910444259644, + "mean_token_accuracy": 0.751968502998352, + "num_tokens": 162304.0, + "step": 317 + }, + { + "entropy": 1.7162737846374512, + "epoch": 4.96875, + "grad_norm": 1.6734753847122192, + "learning_rate": 9.515805032928391e-05, + "loss": 1.4473096132278442, + "mean_token_accuracy": 0.7798354029655457, + "num_tokens": 162816.0, + "step": 318 + }, + { + "entropy": 1.6163382530212402, + "epoch": 4.984375, + "grad_norm": 1.8411775827407837, + "learning_rate": 9.430242090722436e-05, + "loss": 1.35471773147583, + "mean_token_accuracy": 0.7836734652519226, + "num_tokens": 163328.0, + "step": 319 + }, + { + "entropy": 1.4940069913864136, + "epoch": 5.0, + "grad_norm": 1.8255853652954102, + "learning_rate": 9.344888846753726e-05, + "loss": 1.4874554872512817, + "mean_token_accuracy": 0.7618110179901123, + "num_tokens": 163840.0, + "step": 320 + }, + { + "entropy": 1.549060344696045, + "epoch": 5.015625, + "grad_norm": 1.745566964149475, + "learning_rate": 9.259748514523653e-05, + "loss": 1.4019237756729126, + "mean_token_accuracy": 0.7618110179901123, + "num_tokens": 164352.0, + "step": 321 + }, + { + "entropy": 1.5626025199890137, + "epoch": 5.03125, + "grad_norm": 1.8400838375091553, + "learning_rate": 9.174824299517607e-05, + "loss": 1.07392156124115, + "mean_token_accuracy": 0.8119834661483765, + "num_tokens": 164864.0, + "step": 322 + }, + { + "entropy": 1.6569701433181763, + "epoch": 5.046875, + "grad_norm": 1.8855388164520264, + "learning_rate": 9.09011939908428e-05, + "loss": 1.4062210321426392, + "mean_token_accuracy": 0.751968502998352, + "num_tokens": 165376.0, + "step": 323 + }, + { + "entropy": 1.5729684829711914, + "epoch": 5.0625, + "grad_norm": 1.8148961067199707, + "learning_rate": 9.0056370023153e-05, + "loss": 1.2550370693206787, + "mean_token_accuracy": 0.7766599655151367, + "num_tokens": 165888.0, + "step": 324 + }, + { + "entropy": 1.698303461074829, + "epoch": 5.078125, + "grad_norm": 1.404887080192566, + "learning_rate": 8.921380289925153e-05, + "loss": 0.7562521696090698, + "mean_token_accuracy": 0.8741573095321655, + "num_tokens": 166400.0, + "step": 325 + }, + { + "entropy": 1.4567222595214844, + "epoch": 5.09375, + "grad_norm": 2.1354284286499023, + "learning_rate": 8.837352434131443e-05, + "loss": 1.0930452346801758, + "mean_token_accuracy": 0.7897959351539612, + "num_tokens": 166912.0, + "step": 326 + }, + { + "entropy": 1.5501275062561035, + "epoch": 5.109375, + "grad_norm": 1.6018931865692139, + "learning_rate": 8.753556598535444e-05, + "loss": 1.1254503726959229, + "mean_token_accuracy": 0.8037189841270447, + "num_tokens": 167424.0, + "step": 327 + }, + { + "entropy": 1.4922233819961548, + "epoch": 5.125, + "grad_norm": 2.1968183517456055, + "learning_rate": 8.669995938003005e-05, + "loss": 1.4305412769317627, + "mean_token_accuracy": 0.7421259880065918, + "num_tokens": 167936.0, + "step": 328 + }, + { + "entropy": 1.6226767301559448, + "epoch": 5.140625, + "grad_norm": 1.895836591720581, + "learning_rate": 8.586673598545771e-05, + "loss": 1.1839910745620728, + "mean_token_accuracy": 0.7855669856071472, + "num_tokens": 168448.0, + "step": 329 + }, + { + "entropy": 1.453590750694275, + "epoch": 5.15625, + "grad_norm": 1.9998018741607666, + "learning_rate": 8.503592717202721e-05, + "loss": 1.4171252250671387, + "mean_token_accuracy": 0.7578740119934082, + "num_tokens": 168960.0, + "step": 330 + }, + { + "entropy": 1.523696780204773, + "epoch": 5.171875, + "grad_norm": 2.0707786083221436, + "learning_rate": 8.420756421922088e-05, + "loss": 1.198662519454956, + "mean_token_accuracy": 0.790123462677002, + "num_tokens": 169472.0, + "step": 331 + }, + { + "entropy": 1.4649165868759155, + "epoch": 5.1875, + "grad_norm": 1.9728301763534546, + "learning_rate": 8.338167831443563e-05, + "loss": 1.0776567459106445, + "mean_token_accuracy": 0.8117154836654663, + "num_tokens": 169984.0, + "step": 332 + }, + { + "entropy": 1.3908473253250122, + "epoch": 5.203125, + "grad_norm": 1.7057424783706665, + "learning_rate": 8.255830055180899e-05, + "loss": 1.3423693180084229, + "mean_token_accuracy": 0.7771202921867371, + "num_tokens": 170496.0, + "step": 333 + }, + { + "entropy": 1.602295160293579, + "epoch": 5.21875, + "grad_norm": 1.9868454933166504, + "learning_rate": 8.173746193104845e-05, + "loss": 1.1564866304397583, + "mean_token_accuracy": 0.7991631627082825, + "num_tokens": 171008.0, + "step": 334 + }, + { + "entropy": 1.4484628438949585, + "epoch": 5.234375, + "grad_norm": 1.8873497247695923, + "learning_rate": 8.091919335626399e-05, + "loss": 1.2578895092010498, + "mean_token_accuracy": 0.7795275449752808, + "num_tokens": 171520.0, + "step": 335 + }, + { + "entropy": 1.283211350440979, + "epoch": 5.25, + "grad_norm": 2.281681537628174, + "learning_rate": 8.010352563480509e-05, + "loss": 1.1836169958114624, + "mean_token_accuracy": 0.783730149269104, + "num_tokens": 172032.0, + "step": 336 + }, + { + "entropy": 1.3733656406402588, + "epoch": 5.265625, + "grad_norm": 1.8627188205718994, + "learning_rate": 7.929048947610034e-05, + "loss": 1.0594978332519531, + "mean_token_accuracy": 0.8168724179267883, + "num_tokens": 172544.0, + "step": 337 + }, + { + "entropy": 1.3879873752593994, + "epoch": 5.28125, + "grad_norm": 2.2347571849823, + "learning_rate": 7.84801154905017e-05, + "loss": 1.1629652976989746, + "mean_token_accuracy": 0.7951318621635437, + "num_tokens": 173056.0, + "step": 338 + }, + { + "entropy": 1.3931185007095337, + "epoch": 5.296875, + "grad_norm": 2.036378860473633, + "learning_rate": 7.76724341881316e-05, + "loss": 1.1349397897720337, + "mean_token_accuracy": 0.803680956363678, + "num_tokens": 173568.0, + "step": 339 + }, + { + "entropy": 1.6057257652282715, + "epoch": 5.3125, + "grad_norm": 2.337108612060547, + "learning_rate": 7.686747597773462e-05, + "loss": 1.4726322889328003, + "mean_token_accuracy": 0.7322834730148315, + "num_tokens": 174080.0, + "step": 340 + }, + { + "entropy": 1.5440049171447754, + "epoch": 5.328125, + "grad_norm": 2.250331401824951, + "learning_rate": 7.606527116553241e-05, + "loss": 1.4109631776809692, + "mean_token_accuracy": 0.7570281028747559, + "num_tokens": 174592.0, + "step": 341 + }, + { + "entropy": 1.5478792190551758, + "epoch": 5.34375, + "grad_norm": 1.9495255947113037, + "learning_rate": 7.526584995408275e-05, + "loss": 1.334647297859192, + "mean_token_accuracy": 0.7843942642211914, + "num_tokens": 175104.0, + "step": 342 + }, + { + "entropy": 1.4533789157867432, + "epoch": 5.359375, + "grad_norm": 2.0633509159088135, + "learning_rate": 7.446924244114238e-05, + "loss": 0.9381183385848999, + "mean_token_accuracy": 0.8195329308509827, + "num_tokens": 175616.0, + "step": 343 + }, + { + "entropy": 1.5073310136795044, + "epoch": 5.375, + "grad_norm": 2.1370253562927246, + "learning_rate": 7.367547861853393e-05, + "loss": 1.2126432657241821, + "mean_token_accuracy": 0.7962577939033508, + "num_tokens": 176128.0, + "step": 344 + }, + { + "entropy": 1.5421262979507446, + "epoch": 5.390625, + "grad_norm": 2.130643844604492, + "learning_rate": 7.288458837101675e-05, + "loss": 1.481192946434021, + "mean_token_accuracy": 0.75, + "num_tokens": 176640.0, + "step": 345 + }, + { + "entropy": 1.4015882015228271, + "epoch": 5.40625, + "grad_norm": 1.9656389951705933, + "learning_rate": 7.209660147516154e-05, + "loss": 1.4018691778182983, + "mean_token_accuracy": 0.7677165269851685, + "num_tokens": 177152.0, + "step": 346 + }, + { + "entropy": 1.2663908004760742, + "epoch": 5.421875, + "grad_norm": 1.9276113510131836, + "learning_rate": 7.131154759822968e-05, + "loss": 1.1962919235229492, + "mean_token_accuracy": 0.804780900478363, + "num_tokens": 177664.0, + "step": 347 + }, + { + "entropy": 1.369564175605774, + "epoch": 5.4375, + "grad_norm": 2.2459683418273926, + "learning_rate": 7.052945629705579e-05, + "loss": 1.0512068271636963, + "mean_token_accuracy": 0.8061224222183228, + "num_tokens": 178176.0, + "step": 348 + }, + { + "entropy": 1.4165040254592896, + "epoch": 5.453125, + "grad_norm": 1.9474868774414062, + "learning_rate": 6.975035701693544e-05, + "loss": 1.1490492820739746, + "mean_token_accuracy": 0.7983871102333069, + "num_tokens": 178688.0, + "step": 349 + }, + { + "entropy": 1.396820068359375, + "epoch": 5.46875, + "grad_norm": 2.4981231689453125, + "learning_rate": 6.897427909051607e-05, + "loss": 1.2930082082748413, + "mean_token_accuracy": 0.7736220359802246, + "num_tokens": 179200.0, + "step": 350 + }, + { + "entropy": 1.5704162120819092, + "epoch": 5.484375, + "grad_norm": 1.901033639907837, + "learning_rate": 6.820125173669306e-05, + "loss": 1.0180715322494507, + "mean_token_accuracy": 0.8322580456733704, + "num_tokens": 179712.0, + "step": 351 + }, + { + "entropy": 1.5132912397384644, + "epoch": 5.5, + "grad_norm": 2.2790002822875977, + "learning_rate": 6.743130405950929e-05, + "loss": 1.506807565689087, + "mean_token_accuracy": 0.747035562992096, + "num_tokens": 180224.0, + "step": 352 + }, + { + "entropy": 1.4252924919128418, + "epoch": 5.515625, + "grad_norm": 1.6666802167892456, + "learning_rate": 6.66644650470597e-05, + "loss": 0.6767361760139465, + "mean_token_accuracy": 0.8766520023345947, + "num_tokens": 180736.0, + "step": 353 + }, + { + "entropy": 1.3335093259811401, + "epoch": 5.53125, + "grad_norm": 2.164742946624756, + "learning_rate": 6.59007635703996e-05, + "loss": 1.316645622253418, + "mean_token_accuracy": 0.7795275449752808, + "num_tokens": 181248.0, + "step": 354 + }, + { + "entropy": 1.360636830329895, + "epoch": 5.546875, + "grad_norm": 2.145939350128174, + "learning_rate": 6.514022838245801e-05, + "loss": 1.1844747066497803, + "mean_token_accuracy": 0.782868504524231, + "num_tokens": 181760.0, + "step": 355 + }, + { + "entropy": 1.400923490524292, + "epoch": 5.5625, + "grad_norm": 2.355154037475586, + "learning_rate": 6.438288811695492e-05, + "loss": 1.380852222442627, + "mean_token_accuracy": 0.7618110179901123, + "num_tokens": 182272.0, + "step": 356 + }, + { + "entropy": 1.387778878211975, + "epoch": 5.578125, + "grad_norm": 2.233530282974243, + "learning_rate": 6.362877128732319e-05, + "loss": 1.174194097518921, + "mean_token_accuracy": 0.781124472618103, + "num_tokens": 182784.0, + "step": 357 + }, + { + "entropy": 1.5693912506103516, + "epoch": 5.59375, + "grad_norm": 1.7340208292007446, + "learning_rate": 6.287790628563534e-05, + "loss": 0.8804768323898315, + "mean_token_accuracy": 0.8436123132705688, + "num_tokens": 183296.0, + "step": 358 + }, + { + "entropy": 1.3868722915649414, + "epoch": 5.609375, + "grad_norm": 1.980425477027893, + "learning_rate": 6.213032138153417e-05, + "loss": 0.9284123182296753, + "mean_token_accuracy": 0.8329854011535645, + "num_tokens": 183808.0, + "step": 359 + }, + { + "entropy": 1.5730829238891602, + "epoch": 5.625, + "grad_norm": 2.1033363342285156, + "learning_rate": 6.138604472116889e-05, + "loss": 1.3292860984802246, + "mean_token_accuracy": 0.7835671305656433, + "num_tokens": 184320.0, + "step": 360 + }, + { + "entropy": 1.3760509490966797, + "epoch": 5.640625, + "grad_norm": 2.2947301864624023, + "learning_rate": 6.064510432613499e-05, + "loss": 1.3286343812942505, + "mean_token_accuracy": 0.7677165269851685, + "num_tokens": 184832.0, + "step": 361 + }, + { + "entropy": 1.4730992317199707, + "epoch": 5.65625, + "grad_norm": 1.7933112382888794, + "learning_rate": 5.990752809241968e-05, + "loss": 1.1549919843673706, + "mean_token_accuracy": 0.7962962985038757, + "num_tokens": 185344.0, + "step": 362 + }, + { + "entropy": 1.396909236907959, + "epoch": 5.671875, + "grad_norm": 2.2379417419433594, + "learning_rate": 5.917334378935118e-05, + "loss": 1.1229009628295898, + "mean_token_accuracy": 0.7857142686843872, + "num_tokens": 185856.0, + "step": 363 + }, + { + "entropy": 1.545285701751709, + "epoch": 5.6875, + "grad_norm": 2.0039150714874268, + "learning_rate": 5.8442579058553556e-05, + "loss": 1.2438194751739502, + "mean_token_accuracy": 0.7868852615356445, + "num_tokens": 186368.0, + "step": 364 + }, + { + "entropy": 1.264091968536377, + "epoch": 5.703125, + "grad_norm": 2.11838436126709, + "learning_rate": 5.771526141290599e-05, + "loss": 1.0326876640319824, + "mean_token_accuracy": 0.8125, + "num_tokens": 186880.0, + "step": 365 + }, + { + "entropy": 1.390173077583313, + "epoch": 5.71875, + "grad_norm": 2.515826463699341, + "learning_rate": 5.6991418235506615e-05, + "loss": 1.2848618030548096, + "mean_token_accuracy": 0.7696850299835205, + "num_tokens": 187392.0, + "step": 366 + }, + { + "entropy": 1.4828932285308838, + "epoch": 5.734375, + "grad_norm": 3.0285286903381348, + "learning_rate": 5.627107677864206e-05, + "loss": 1.4369527101516724, + "mean_token_accuracy": 0.7539370059967041, + "num_tokens": 187904.0, + "step": 367 + }, + { + "entropy": 1.3886916637420654, + "epoch": 5.75, + "grad_norm": 2.0452868938446045, + "learning_rate": 5.555426416276093e-05, + "loss": 1.0845024585723877, + "mean_token_accuracy": 0.8155737519264221, + "num_tokens": 188416.0, + "step": 368 + }, + { + "entropy": 1.3375890254974365, + "epoch": 5.765625, + "grad_norm": 2.1924948692321777, + "learning_rate": 5.4841007375453186e-05, + "loss": 1.253337025642395, + "mean_token_accuracy": 0.7795275449752808, + "num_tokens": 188928.0, + "step": 369 + }, + { + "entropy": 1.457409381866455, + "epoch": 5.78125, + "grad_norm": 2.147710084915161, + "learning_rate": 5.413133327043364e-05, + "loss": 1.1731246709823608, + "mean_token_accuracy": 0.800407350063324, + "num_tokens": 189440.0, + "step": 370 + }, + { + "entropy": 1.477699637413025, + "epoch": 5.796875, + "grad_norm": 2.23069429397583, + "learning_rate": 5.34252685665313e-05, + "loss": 0.9165684580802917, + "mean_token_accuracy": 0.8441558480262756, + "num_tokens": 189952.0, + "step": 371 + }, + { + "entropy": 1.5255780220031738, + "epoch": 5.8125, + "grad_norm": 2.031313180923462, + "learning_rate": 5.272283984668313e-05, + "loss": 1.2056560516357422, + "mean_token_accuracy": 0.8024948239326477, + "num_tokens": 190464.0, + "step": 372 + }, + { + "entropy": 1.4738668203353882, + "epoch": 5.828125, + "grad_norm": 2.3102340698242188, + "learning_rate": 5.2024073556933516e-05, + "loss": 1.2545979022979736, + "mean_token_accuracy": 0.7870182394981384, + "num_tokens": 190976.0, + "step": 373 + }, + { + "entropy": 1.4832838773727417, + "epoch": 5.84375, + "grad_norm": 1.9114770889282227, + "learning_rate": 5.13289960054382e-05, + "loss": 1.063859224319458, + "mean_token_accuracy": 0.8204593062400818, + "num_tokens": 191488.0, + "step": 374 + }, + { + "entropy": 1.4326761960983276, + "epoch": 5.859375, + "grad_norm": 2.0074758529663086, + "learning_rate": 5.063763336147421e-05, + "loss": 1.0995495319366455, + "mean_token_accuracy": 0.7991718649864197, + "num_tokens": 192000.0, + "step": 375 + }, + { + "entropy": 1.218077301979065, + "epoch": 5.875, + "grad_norm": 2.2023136615753174, + "learning_rate": 4.9950011654454394e-05, + "loss": 1.2278096675872803, + "mean_token_accuracy": 0.7854330539703369, + "num_tokens": 192512.0, + "step": 376 + }, + { + "entropy": 1.4478676319122314, + "epoch": 5.890625, + "grad_norm": 2.902523994445801, + "learning_rate": 4.926615677294723e-05, + "loss": 1.3362658023834229, + "mean_token_accuracy": 0.7757201790809631, + "num_tokens": 193024.0, + "step": 377 + }, + { + "entropy": 1.5202453136444092, + "epoch": 5.90625, + "grad_norm": 2.273052453994751, + "learning_rate": 4.8586094463702626e-05, + "loss": 1.5842170715332031, + "mean_token_accuracy": 0.7618110179901123, + "num_tokens": 193536.0, + "step": 378 + }, + { + "entropy": 1.467264175415039, + "epoch": 5.921875, + "grad_norm": 2.2763044834136963, + "learning_rate": 4.7909850330682046e-05, + "loss": 1.1156023740768433, + "mean_token_accuracy": 0.7871900796890259, + "num_tokens": 194048.0, + "step": 379 + }, + { + "entropy": 1.2735207080841064, + "epoch": 5.9375, + "grad_norm": 2.1325440406799316, + "learning_rate": 4.7237449834094956e-05, + "loss": 1.0693836212158203, + "mean_token_accuracy": 0.8052738308906555, + "num_tokens": 194560.0, + "step": 380 + }, + { + "entropy": 1.4293042421340942, + "epoch": 5.953125, + "grad_norm": 2.122368097305298, + "learning_rate": 4.656891828943996e-05, + "loss": 1.4175359010696411, + "mean_token_accuracy": 0.7657480239868164, + "num_tokens": 195072.0, + "step": 381 + }, + { + "entropy": 1.3723492622375488, + "epoch": 5.96875, + "grad_norm": 2.236246347427368, + "learning_rate": 4.5904280866551926e-05, + "loss": 1.3109025955200195, + "mean_token_accuracy": 0.7814960479736328, + "num_tokens": 195584.0, + "step": 382 + }, + { + "entropy": 1.4019876718521118, + "epoch": 5.984375, + "grad_norm": 2.12497878074646, + "learning_rate": 4.5243562588654076e-05, + "loss": 1.4718176126480103, + "mean_token_accuracy": 0.7657480239868164, + "num_tokens": 196096.0, + "step": 383 + }, + { + "entropy": 1.4074183702468872, + "epoch": 6.0, + "grad_norm": 2.1889452934265137, + "learning_rate": 4.4586788331416235e-05, + "loss": 1.2516783475875854, + "mean_token_accuracy": 0.7814960479736328, + "num_tokens": 196608.0, + "step": 384 + }, + { + "entropy": 1.4472509622573853, + "epoch": 6.015625, + "grad_norm": 2.2552759647369385, + "learning_rate": 4.3933982822017876e-05, + "loss": 1.1190861463546753, + "mean_token_accuracy": 0.7952286005020142, + "num_tokens": 197120.0, + "step": 385 + }, + { + "entropy": 1.3649916648864746, + "epoch": 6.03125, + "grad_norm": 2.2668163776397705, + "learning_rate": 4.3285170638217514e-05, + "loss": 1.1158784627914429, + "mean_token_accuracy": 0.789370059967041, + "num_tokens": 197632.0, + "step": 386 + }, + { + "entropy": 1.3257055282592773, + "epoch": 6.046875, + "grad_norm": 2.023829698562622, + "learning_rate": 4.264037620742721e-05, + "loss": 0.9378941655158997, + "mean_token_accuracy": 0.8217213153839111, + "num_tokens": 198144.0, + "step": 387 + }, + { + "entropy": 1.5115382671356201, + "epoch": 6.0625, + "grad_norm": 1.9996124505996704, + "learning_rate": 4.199962380579275e-05, + "loss": 0.8486489057540894, + "mean_token_accuracy": 0.8340517282485962, + "num_tokens": 198656.0, + "step": 388 + }, + { + "entropy": 1.3277173042297363, + "epoch": 6.078125, + "grad_norm": 2.146892547607422, + "learning_rate": 4.136293755727998e-05, + "loss": 1.0290329456329346, + "mean_token_accuracy": 0.8016194105148315, + "num_tokens": 199168.0, + "step": 389 + }, + { + "entropy": 1.558488130569458, + "epoch": 6.09375, + "grad_norm": 2.4036035537719727, + "learning_rate": 4.073034143276622e-05, + "loss": 1.5012381076812744, + "mean_token_accuracy": 0.7381889820098877, + "num_tokens": 199680.0, + "step": 390 + }, + { + "entropy": 1.4996081590652466, + "epoch": 6.109375, + "grad_norm": 1.952165126800537, + "learning_rate": 4.010185924913809e-05, + "loss": 0.9398556351661682, + "mean_token_accuracy": 0.8113207817077637, + "num_tokens": 200192.0, + "step": 391 + }, + { + "entropy": 1.3826960325241089, + "epoch": 6.125, + "grad_norm": 2.272625207901001, + "learning_rate": 3.947751466839451e-05, + "loss": 1.102223515510559, + "mean_token_accuracy": 0.811475396156311, + "num_tokens": 200704.0, + "step": 392 + }, + { + "entropy": 1.4677839279174805, + "epoch": 6.140625, + "grad_norm": 1.7582485675811768, + "learning_rate": 3.885733119675616e-05, + "loss": 0.5989710092544556, + "mean_token_accuracy": 0.8839285969734192, + "num_tokens": 201216.0, + "step": 393 + }, + { + "entropy": 1.6351354122161865, + "epoch": 6.15625, + "grad_norm": 1.6400773525238037, + "learning_rate": 3.8241332183780105e-05, + "loss": 0.6478846073150635, + "mean_token_accuracy": 0.8755760192871094, + "num_tokens": 201728.0, + "step": 394 + }, + { + "entropy": 1.2197786569595337, + "epoch": 6.171875, + "grad_norm": 2.219877243041992, + "learning_rate": 3.762954082148113e-05, + "loss": 0.9954053163528442, + "mean_token_accuracy": 0.803960382938385, + "num_tokens": 202240.0, + "step": 395 + }, + { + "entropy": 1.3044180870056152, + "epoch": 6.1875, + "grad_norm": 2.2616729736328125, + "learning_rate": 3.702198014345813e-05, + "loss": 1.1857067346572876, + "mean_token_accuracy": 0.7786561250686646, + "num_tokens": 202752.0, + "step": 396 + }, + { + "entropy": 1.4440141916275024, + "epoch": 6.203125, + "grad_norm": 2.3922085762023926, + "learning_rate": 3.641867302402731e-05, + "loss": 1.0443466901779175, + "mean_token_accuracy": 0.8074533939361572, + "num_tokens": 203264.0, + "step": 397 + }, + { + "entropy": 1.4667960405349731, + "epoch": 6.21875, + "grad_norm": 1.9791189432144165, + "learning_rate": 3.5819642177360744e-05, + "loss": 0.9006252884864807, + "mean_token_accuracy": 0.8259023427963257, + "num_tokens": 203776.0, + "step": 398 + }, + { + "entropy": 1.2889478206634521, + "epoch": 6.234375, + "grad_norm": 2.4023540019989014, + "learning_rate": 3.5224910156631154e-05, + "loss": 1.1616065502166748, + "mean_token_accuracy": 0.7854330539703369, + "num_tokens": 204288.0, + "step": 399 + }, + { + "entropy": 1.3888750076293945, + "epoch": 6.25, + "grad_norm": 3.2403616905212402, + "learning_rate": 3.4634499353163075e-05, + "loss": 1.2539849281311035, + "mean_token_accuracy": 0.7775590419769287, + "num_tokens": 204800.0, + "step": 400 + }, + { + "entropy": 1.4797190427780151, + "epoch": 6.265625, + "grad_norm": 2.4358670711517334, + "learning_rate": 3.404843199558945e-05, + "loss": 1.1758122444152832, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 205312.0, + "step": 401 + }, + { + "entropy": 1.337235689163208, + "epoch": 6.28125, + "grad_norm": 2.2013049125671387, + "learning_rate": 3.346673014901515e-05, + "loss": 1.0903222560882568, + "mean_token_accuracy": 0.8020201921463013, + "num_tokens": 205824.0, + "step": 402 + }, + { + "entropy": 1.3701938390731812, + "epoch": 6.296875, + "grad_norm": 2.5289316177368164, + "learning_rate": 3.288941571418582e-05, + "loss": 1.1845871210098267, + "mean_token_accuracy": 0.7736220359802246, + "num_tokens": 206336.0, + "step": 403 + }, + { + "entropy": 1.2536909580230713, + "epoch": 6.3125, + "grad_norm": 2.306061029434204, + "learning_rate": 3.2316510426663745e-05, + "loss": 1.063169240951538, + "mean_token_accuracy": 0.8110235929489136, + "num_tokens": 206848.0, + "step": 404 + }, + { + "entropy": 1.3208762407302856, + "epoch": 6.328125, + "grad_norm": 2.4128077030181885, + "learning_rate": 3.174803585600906e-05, + "loss": 0.8401311039924622, + "mean_token_accuracy": 0.8350951671600342, + "num_tokens": 207360.0, + "step": 405 + }, + { + "entropy": 1.3629928827285767, + "epoch": 6.34375, + "grad_norm": 2.3814520835876465, + "learning_rate": 3.1184013404968174e-05, + "loss": 1.1989407539367676, + "mean_token_accuracy": 0.7854330539703369, + "num_tokens": 207872.0, + "step": 406 + }, + { + "entropy": 1.4353210926055908, + "epoch": 6.359375, + "grad_norm": 2.414461612701416, + "learning_rate": 3.062446430866748e-05, + "loss": 1.0983890295028687, + "mean_token_accuracy": 0.8086419701576233, + "num_tokens": 208384.0, + "step": 407 + }, + { + "entropy": 1.324364185333252, + "epoch": 6.375, + "grad_norm": 2.3120386600494385, + "learning_rate": 3.0069409633814228e-05, + "loss": 1.0871342420578003, + "mean_token_accuracy": 0.7933070659637451, + "num_tokens": 208896.0, + "step": 408 + }, + { + "entropy": 1.4769569635391235, + "epoch": 6.390625, + "grad_norm": 2.1611950397491455, + "learning_rate": 2.9518870277903274e-05, + "loss": 1.0565687417984009, + "mean_token_accuracy": 0.8062499761581421, + "num_tokens": 209408.0, + "step": 409 + }, + { + "entropy": 1.2712613344192505, + "epoch": 6.40625, + "grad_norm": 2.1195592880249023, + "learning_rate": 2.8972866968430098e-05, + "loss": 0.8914839029312134, + "mean_token_accuracy": 0.8381742835044861, + "num_tokens": 209920.0, + "step": 410 + }, + { + "entropy": 1.3642261028289795, + "epoch": 6.421875, + "grad_norm": 2.089002847671509, + "learning_rate": 2.84314202621108e-05, + "loss": 1.042889952659607, + "mean_token_accuracy": 0.8139059543609619, + "num_tokens": 210432.0, + "step": 411 + }, + { + "entropy": 1.2996537685394287, + "epoch": 6.4375, + "grad_norm": 2.5762758255004883, + "learning_rate": 2.7894550544107737e-05, + "loss": 1.1719818115234375, + "mean_token_accuracy": 0.7814960479736328, + "num_tokens": 210944.0, + "step": 412 + }, + { + "entropy": 1.325704574584961, + "epoch": 6.453125, + "grad_norm": 2.469484329223633, + "learning_rate": 2.7362278027262457e-05, + "loss": 0.9921610355377197, + "mean_token_accuracy": 0.7991803288459778, + "num_tokens": 211456.0, + "step": 413 + }, + { + "entropy": 1.2528401613235474, + "epoch": 6.46875, + "grad_norm": 2.5817010402679443, + "learning_rate": 2.68346227513343e-05, + "loss": 1.012644648551941, + "mean_token_accuracy": 0.8104838728904724, + "num_tokens": 211968.0, + "step": 414 + }, + { + "entropy": 1.3440062999725342, + "epoch": 6.484375, + "grad_norm": 2.138012170791626, + "learning_rate": 2.6311604582246238e-05, + "loss": 0.9965952634811401, + "mean_token_accuracy": 0.8103092908859253, + "num_tokens": 212480.0, + "step": 415 + }, + { + "entropy": 1.2636010646820068, + "epoch": 6.5, + "grad_norm": 3.449289083480835, + "learning_rate": 2.5793243211336645e-05, + "loss": 1.2079899311065674, + "mean_token_accuracy": 0.7696850299835205, + "num_tokens": 212992.0, + "step": 416 + }, + { + "entropy": 1.3567794561386108, + "epoch": 6.515625, + "grad_norm": 2.7362849712371826, + "learning_rate": 2.5279558154618197e-05, + "loss": 1.003839373588562, + "mean_token_accuracy": 0.8060606122016907, + "num_tokens": 213504.0, + "step": 417 + }, + { + "entropy": 1.477582335472107, + "epoch": 6.53125, + "grad_norm": 2.272670030593872, + "learning_rate": 2.4770568752042995e-05, + "loss": 0.9059895873069763, + "mean_token_accuracy": 0.8144989609718323, + "num_tokens": 214016.0, + "step": 418 + }, + { + "entropy": 1.4449951648712158, + "epoch": 6.546875, + "grad_norm": 2.270644187927246, + "learning_rate": 2.4266294166774288e-05, + "loss": 1.1380894184112549, + "mean_token_accuracy": 0.7979592084884644, + "num_tokens": 214528.0, + "step": 419 + }, + { + "entropy": 1.210769772529602, + "epoch": 6.5625, + "grad_norm": 2.6737060546875, + "learning_rate": 2.376675338446525e-05, + "loss": 1.0513958930969238, + "mean_token_accuracy": 0.8007968068122864, + "num_tokens": 215040.0, + "step": 420 + }, + { + "entropy": 1.351199984550476, + "epoch": 6.578125, + "grad_norm": 2.685697078704834, + "learning_rate": 2.3271965212543932e-05, + "loss": 1.2178758382797241, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 215552.0, + "step": 421 + }, + { + "entropy": 1.4135276079177856, + "epoch": 6.59375, + "grad_norm": 2.340247869491577, + "learning_rate": 2.278194827950543e-05, + "loss": 1.250877857208252, + "mean_token_accuracy": 0.7775590419769287, + "num_tokens": 216064.0, + "step": 422 + }, + { + "entropy": 1.230191946029663, + "epoch": 6.609375, + "grad_norm": 2.4362449645996094, + "learning_rate": 2.2296721034210218e-05, + "loss": 1.1102575063705444, + "mean_token_accuracy": 0.7924901247024536, + "num_tokens": 216576.0, + "step": 423 + }, + { + "entropy": 1.311639428138733, + "epoch": 6.625, + "grad_norm": 2.4978349208831787, + "learning_rate": 2.1816301745189933e-05, + "loss": 1.133759617805481, + "mean_token_accuracy": 0.7854330539703369, + "num_tokens": 217088.0, + "step": 424 + }, + { + "entropy": 1.344711422920227, + "epoch": 6.640625, + "grad_norm": 2.260749578475952, + "learning_rate": 2.1340708499959197e-05, + "loss": 1.0410059690475464, + "mean_token_accuracy": 0.8065173029899597, + "num_tokens": 217600.0, + "step": 425 + }, + { + "entropy": 1.2411428689956665, + "epoch": 6.65625, + "grad_norm": 2.3936338424682617, + "learning_rate": 2.0869959204334935e-05, + "loss": 1.097914457321167, + "mean_token_accuracy": 0.796407163143158, + "num_tokens": 218112.0, + "step": 426 + }, + { + "entropy": 1.447340965270996, + "epoch": 6.671875, + "grad_norm": 2.2700304985046387, + "learning_rate": 2.0404071581761995e-05, + "loss": 1.098491907119751, + "mean_token_accuracy": 0.7946611642837524, + "num_tokens": 218624.0, + "step": 427 + }, + { + "entropy": 1.4071056842803955, + "epoch": 6.6875, + "grad_norm": 2.3570244312286377, + "learning_rate": 1.9943063172646085e-05, + "loss": 1.2364177703857422, + "mean_token_accuracy": 0.7695390582084656, + "num_tokens": 219136.0, + "step": 428 + }, + { + "entropy": 1.3318933248519897, + "epoch": 6.703125, + "grad_norm": 2.6349446773529053, + "learning_rate": 1.9486951333693296e-05, + "loss": 1.1402510404586792, + "mean_token_accuracy": 0.7907444834709167, + "num_tokens": 219648.0, + "step": 429 + }, + { + "entropy": 1.3377373218536377, + "epoch": 6.71875, + "grad_norm": 2.332634925842285, + "learning_rate": 1.9035753237256486e-05, + "loss": 1.0878653526306152, + "mean_token_accuracy": 0.7923387289047241, + "num_tokens": 220160.0, + "step": 430 + }, + { + "entropy": 1.3920668363571167, + "epoch": 6.734375, + "grad_norm": 2.1408042907714844, + "learning_rate": 1.8589485870689023e-05, + "loss": 1.233846664428711, + "mean_token_accuracy": 0.7933070659637451, + "num_tokens": 220672.0, + "step": 431 + }, + { + "entropy": 1.3373357057571411, + "epoch": 6.75, + "grad_norm": 3.003164768218994, + "learning_rate": 1.814816603570497e-05, + "loss": 1.2591382265090942, + "mean_token_accuracy": 0.7657480239868164, + "num_tokens": 221184.0, + "step": 432 + }, + { + "entropy": 1.3029452562332153, + "epoch": 6.765625, + "grad_norm": 2.2512388229370117, + "learning_rate": 1.7711810347746757e-05, + "loss": 1.1033636331558228, + "mean_token_accuracy": 0.805220901966095, + "num_tokens": 221696.0, + "step": 433 + }, + { + "entropy": 1.3201580047607422, + "epoch": 6.78125, + "grad_norm": 2.216168165206909, + "learning_rate": 1.728043523535933e-05, + "loss": 0.8506653308868408, + "mean_token_accuracy": 0.8407643437385559, + "num_tokens": 222208.0, + "step": 434 + }, + { + "entropy": 1.3668451309204102, + "epoch": 6.796875, + "grad_norm": 2.320620059967041, + "learning_rate": 1.6854056939571925e-05, + "loss": 1.1892409324645996, + "mean_token_accuracy": 0.7755905389785767, + "num_tokens": 222720.0, + "step": 435 + }, + { + "entropy": 1.3634605407714844, + "epoch": 6.8125, + "grad_norm": 2.3351492881774902, + "learning_rate": 1.6432691513286318e-05, + "loss": 1.1596734523773193, + "mean_token_accuracy": 0.787401556968689, + "num_tokens": 223232.0, + "step": 436 + }, + { + "entropy": 1.494618535041809, + "epoch": 6.828125, + "grad_norm": 2.343942165374756, + "learning_rate": 1.6016354820672715e-05, + "loss": 1.0683393478393555, + "mean_token_accuracy": 0.8004158139228821, + "num_tokens": 223744.0, + "step": 437 + }, + { + "entropy": 1.3987553119659424, + "epoch": 6.84375, + "grad_norm": 2.640872001647949, + "learning_rate": 1.560506253657223e-05, + "loss": 0.9901496767997742, + "mean_token_accuracy": 0.8179916143417358, + "num_tokens": 224256.0, + "step": 438 + }, + { + "entropy": 1.3903093338012695, + "epoch": 6.859375, + "grad_norm": 2.8874828815460205, + "learning_rate": 1.519883014590691e-05, + "loss": 1.076893925666809, + "mean_token_accuracy": 0.7962962985038757, + "num_tokens": 224768.0, + "step": 439 + }, + { + "entropy": 1.4098860025405884, + "epoch": 6.875, + "grad_norm": 2.306347370147705, + "learning_rate": 1.4797672943096711e-05, + "loss": 1.2542139291763306, + "mean_token_accuracy": 0.7786720395088196, + "num_tokens": 225280.0, + "step": 440 + }, + { + "entropy": 1.4133609533309937, + "epoch": 6.890625, + "grad_norm": 2.25966739654541, + "learning_rate": 1.4401606031483497e-05, + "loss": 1.2815967798233032, + "mean_token_accuracy": 0.7742574214935303, + "num_tokens": 225792.0, + "step": 441 + }, + { + "entropy": 1.248429775238037, + "epoch": 6.90625, + "grad_norm": 2.32254695892334, + "learning_rate": 1.4010644322762699e-05, + "loss": 0.8687695860862732, + "mean_token_accuracy": 0.8353909254074097, + "num_tokens": 226304.0, + "step": 442 + }, + { + "entropy": 1.3251779079437256, + "epoch": 6.921875, + "grad_norm": 2.5684423446655273, + "learning_rate": 1.3624802536421641e-05, + "loss": 1.2095526456832886, + "mean_token_accuracy": 0.7775590419769287, + "num_tokens": 226816.0, + "step": 443 + }, + { + "entropy": 1.2735978364944458, + "epoch": 6.9375, + "grad_norm": 2.184539794921875, + "learning_rate": 1.3244095199185534e-05, + "loss": 1.0298391580581665, + "mean_token_accuracy": 0.8132529854774475, + "num_tokens": 227328.0, + "step": 444 + }, + { + "entropy": 1.426642656326294, + "epoch": 6.953125, + "grad_norm": 2.2295777797698975, + "learning_rate": 1.2868536644470396e-05, + "loss": 1.2079875469207764, + "mean_token_accuracy": 0.7871485948562622, + "num_tokens": 227840.0, + "step": 445 + }, + { + "entropy": 1.306974172592163, + "epoch": 6.96875, + "grad_norm": 2.530898094177246, + "learning_rate": 1.249814101184361e-05, + "loss": 1.1371548175811768, + "mean_token_accuracy": 0.7834645509719849, + "num_tokens": 228352.0, + "step": 446 + }, + { + "entropy": 1.3690567016601562, + "epoch": 6.984375, + "grad_norm": 2.2345566749572754, + "learning_rate": 1.2132922246491333e-05, + "loss": 1.045675277709961, + "mean_token_accuracy": 0.802874743938446, + "num_tokens": 228864.0, + "step": 447 + }, + { + "entropy": 1.3706583976745605, + "epoch": 7.0, + "grad_norm": 2.3838391304016113, + "learning_rate": 1.177289409869373e-05, + "loss": 0.8126500844955444, + "mean_token_accuracy": 0.8404255509376526, + "num_tokens": 229376.0, + "step": 448 + }, + { + "entropy": 1.4000086784362793, + "epoch": 7.015625, + "grad_norm": 2.8213186264038086, + "learning_rate": 1.1418070123306989e-05, + "loss": 1.2279022932052612, + "mean_token_accuracy": 0.7637795209884644, + "num_tokens": 229888.0, + "step": 449 + }, + { + "entropy": 1.4237322807312012, + "epoch": 7.03125, + "grad_norm": 1.9904886484146118, + "learning_rate": 1.1068463679253293e-05, + "loss": 0.717651903629303, + "mean_token_accuracy": 0.8627451062202454, + "num_tokens": 230400.0, + "step": 450 + }, + { + "entropy": 1.3660708665847778, + "epoch": 7.046875, + "grad_norm": 2.419309139251709, + "learning_rate": 1.0724087929017677e-05, + "loss": 1.1893762350082397, + "mean_token_accuracy": 0.7775590419769287, + "num_tokens": 230912.0, + "step": 451 + }, + { + "entropy": 1.4297083616256714, + "epoch": 7.0625, + "grad_norm": 2.564194679260254, + "learning_rate": 1.0384955838152442e-05, + "loss": 0.9669811725616455, + "mean_token_accuracy": 0.8092243075370789, + "num_tokens": 231424.0, + "step": 452 + }, + { + "entropy": 1.2599018812179565, + "epoch": 7.078125, + "grad_norm": 2.5647976398468018, + "learning_rate": 1.0051080174789172e-05, + "loss": 0.9141256809234619, + "mean_token_accuracy": 0.8220859169960022, + "num_tokens": 231936.0, + "step": 453 + }, + { + "entropy": 1.3683661222457886, + "epoch": 7.09375, + "grad_norm": 2.4807143211364746, + "learning_rate": 9.722473509157857e-06, + "loss": 1.0112833976745605, + "mean_token_accuracy": 0.806584358215332, + "num_tokens": 232448.0, + "step": 454 + }, + { + "entropy": 1.4733227491378784, + "epoch": 7.109375, + "grad_norm": 2.253063440322876, + "learning_rate": 9.399148213113772e-06, + "loss": 0.8170402646064758, + "mean_token_accuracy": 0.850649356842041, + "num_tokens": 232960.0, + "step": 455 + }, + { + "entropy": 1.2353498935699463, + "epoch": 7.125, + "grad_norm": 2.056302547454834, + "learning_rate": 9.081116459671511e-06, + "loss": 0.8609686493873596, + "mean_token_accuracy": 0.8292682766914368, + "num_tokens": 233472.0, + "step": 456 + }, + { + "entropy": 1.3128821849822998, + "epoch": 7.140625, + "grad_norm": 2.351895570755005, + "learning_rate": 8.768390222546895e-06, + "loss": 0.9468564391136169, + "mean_token_accuracy": 0.8200408816337585, + "num_tokens": 233984.0, + "step": 457 + }, + { + "entropy": 1.3826062679290771, + "epoch": 7.15625, + "grad_norm": 2.380359172821045, + "learning_rate": 8.460981275705942e-06, + "loss": 0.9691749811172485, + "mean_token_accuracy": 0.8140496015548706, + "num_tokens": 234496.0, + "step": 458 + }, + { + "entropy": 1.3204487562179565, + "epoch": 7.171875, + "grad_norm": 2.3769736289978027, + "learning_rate": 8.158901192921823e-06, + "loss": 1.018041968345642, + "mean_token_accuracy": 0.805220901966095, + "num_tokens": 235008.0, + "step": 459 + }, + { + "entropy": 1.301025390625, + "epoch": 7.1875, + "grad_norm": 2.4315240383148193, + "learning_rate": 7.862161347338836e-06, + "loss": 0.9299899339675903, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 235520.0, + "step": 460 + }, + { + "entropy": 1.1855802536010742, + "epoch": 7.203125, + "grad_norm": 2.7347376346588135, + "learning_rate": 7.570772911044498e-06, + "loss": 1.0176899433135986, + "mean_token_accuracy": 0.8063241243362427, + "num_tokens": 236032.0, + "step": 461 + }, + { + "entropy": 1.3519090414047241, + "epoch": 7.21875, + "grad_norm": 2.1036648750305176, + "learning_rate": 7.284746854648748e-06, + "loss": 0.780006468296051, + "mean_token_accuracy": 0.8479657173156738, + "num_tokens": 236544.0, + "step": 462 + }, + { + "entropy": 1.3214199542999268, + "epoch": 7.234375, + "grad_norm": 2.382298707962036, + "learning_rate": 7.00409394687092e-06, + "loss": 1.1440752744674683, + "mean_token_accuracy": 0.7913385629653931, + "num_tokens": 237056.0, + "step": 463 + }, + { + "entropy": 1.482852816581726, + "epoch": 7.25, + "grad_norm": 2.64764142036438, + "learning_rate": 6.728824754134398e-06, + "loss": 1.1399593353271484, + "mean_token_accuracy": 0.7877551317214966, + "num_tokens": 237568.0, + "step": 464 + }, + { + "entropy": 1.3064343929290771, + "epoch": 7.265625, + "grad_norm": 2.473623752593994, + "learning_rate": 6.458949640168675e-06, + "loss": 0.8528488874435425, + "mean_token_accuracy": 0.828157365322113, + "num_tokens": 238080.0, + "step": 465 + }, + { + "entropy": 1.1553479433059692, + "epoch": 7.28125, + "grad_norm": 2.495208501815796, + "learning_rate": 6.1944787656192765e-06, + "loss": 0.8627750873565674, + "mean_token_accuracy": 0.8340080976486206, + "num_tokens": 238592.0, + "step": 466 + }, + { + "entropy": 1.411194920539856, + "epoch": 7.296875, + "grad_norm": 2.01597261428833, + "learning_rate": 5.935422087665132e-06, + "loss": 0.8617269992828369, + "mean_token_accuracy": 0.8397436141967773, + "num_tokens": 239104.0, + "step": 467 + }, + { + "entropy": 1.3339228630065918, + "epoch": 7.3125, + "grad_norm": 3.0477869510650635, + "learning_rate": 5.681789359643779e-06, + "loss": 0.9688454270362854, + "mean_token_accuracy": 0.7971311211585999, + "num_tokens": 239616.0, + "step": 468 + }, + { + "entropy": 1.497868299484253, + "epoch": 7.328125, + "grad_norm": 2.4108757972717285, + "learning_rate": 5.4335901306840235e-06, + "loss": 0.8658555746078491, + "mean_token_accuracy": 0.8293736577033997, + "num_tokens": 240128.0, + "step": 469 + }, + { + "entropy": 1.3256721496582031, + "epoch": 7.34375, + "grad_norm": 2.7251064777374268, + "learning_rate": 5.190833745346606e-06, + "loss": 0.959291934967041, + "mean_token_accuracy": 0.8202019929885864, + "num_tokens": 240640.0, + "step": 470 + }, + { + "entropy": 1.3926868438720703, + "epoch": 7.359375, + "grad_norm": 2.09291410446167, + "learning_rate": 4.953529343272189e-06, + "loss": 0.841661274433136, + "mean_token_accuracy": 0.8400852680206299, + "num_tokens": 241152.0, + "step": 471 + }, + { + "entropy": 1.2108948230743408, + "epoch": 7.375, + "grad_norm": 2.283916711807251, + "learning_rate": 4.721685858837393e-06, + "loss": 1.1214765310287476, + "mean_token_accuracy": 0.807539701461792, + "num_tokens": 241664.0, + "step": 472 + }, + { + "entropy": 1.4406850337982178, + "epoch": 7.390625, + "grad_norm": 2.1983139514923096, + "learning_rate": 4.495312020818403e-06, + "loss": 1.072562336921692, + "mean_token_accuracy": 0.8132780194282532, + "num_tokens": 242176.0, + "step": 473 + }, + { + "entropy": 1.4410004615783691, + "epoch": 7.40625, + "grad_norm": 2.522141933441162, + "learning_rate": 4.2744163520622325e-06, + "loss": 1.186017394065857, + "mean_token_accuracy": 0.7716535329818726, + "num_tokens": 242688.0, + "step": 474 + }, + { + "entropy": 1.443795084953308, + "epoch": 7.421875, + "grad_norm": 2.468191385269165, + "learning_rate": 4.05900716916599e-06, + "loss": 1.041429042816162, + "mean_token_accuracy": 0.8024691343307495, + "num_tokens": 243200.0, + "step": 475 + }, + { + "entropy": 1.4273154735565186, + "epoch": 7.4375, + "grad_norm": 2.0933821201324463, + "learning_rate": 3.849092582163621e-06, + "loss": 0.7831029295921326, + "mean_token_accuracy": 0.8481561541557312, + "num_tokens": 243712.0, + "step": 476 + }, + { + "entropy": 1.1794345378875732, + "epoch": 7.453125, + "grad_norm": 2.505176544189453, + "learning_rate": 3.6446804942207306e-06, + "loss": 1.0385328531265259, + "mean_token_accuracy": 0.7952755689620972, + "num_tokens": 244224.0, + "step": 477 + }, + { + "entropy": 1.3042283058166504, + "epoch": 7.46875, + "grad_norm": 2.2837576866149902, + "learning_rate": 3.4457786013368403e-06, + "loss": 1.044230580329895, + "mean_token_accuracy": 0.8084677457809448, + "num_tokens": 244736.0, + "step": 478 + }, + { + "entropy": 1.3156306743621826, + "epoch": 7.484375, + "grad_norm": 3.331399440765381, + "learning_rate": 3.252394392055868e-06, + "loss": 1.091844916343689, + "mean_token_accuracy": 0.789370059967041, + "num_tokens": 245248.0, + "step": 479 + }, + { + "entropy": 1.2917033433914185, + "epoch": 7.5, + "grad_norm": 2.2519593238830566, + "learning_rate": 3.064535147183922e-06, + "loss": 1.050087332725525, + "mean_token_accuracy": 0.8056111931800842, + "num_tokens": 245760.0, + "step": 480 + }, + { + "entropy": 1.3923559188842773, + "epoch": 7.515625, + "grad_norm": 2.422091007232666, + "learning_rate": 2.882207939515435e-06, + "loss": 0.909593403339386, + "mean_token_accuracy": 0.8132780194282532, + "num_tokens": 246272.0, + "step": 481 + }, + { + "entropy": 1.279539942741394, + "epoch": 7.53125, + "grad_norm": 2.5200605392456055, + "learning_rate": 2.7054196335667133e-06, + "loss": 1.2554877996444702, + "mean_token_accuracy": 0.7920792102813721, + "num_tokens": 246784.0, + "step": 482 + }, + { + "entropy": 1.3303519487380981, + "epoch": 7.546875, + "grad_norm": 2.704820394515991, + "learning_rate": 2.534176885317557e-06, + "loss": 1.147330641746521, + "mean_token_accuracy": 0.7834645509719849, + "num_tokens": 247296.0, + "step": 483 + }, + { + "entropy": 1.2642784118652344, + "epoch": 7.5625, + "grad_norm": 2.542236089706421, + "learning_rate": 2.368486141960646e-06, + "loss": 0.8348578810691833, + "mean_token_accuracy": 0.8431771993637085, + "num_tokens": 247808.0, + "step": 484 + }, + { + "entropy": 1.379125952720642, + "epoch": 7.578125, + "grad_norm": 2.5580883026123047, + "learning_rate": 2.2083536416588165e-06, + "loss": 1.251000165939331, + "mean_token_accuracy": 0.7775590419769287, + "num_tokens": 248320.0, + "step": 485 + }, + { + "entropy": 1.2168140411376953, + "epoch": 7.59375, + "grad_norm": 2.39772367477417, + "learning_rate": 2.053785413310216e-06, + "loss": 1.0238333940505981, + "mean_token_accuracy": 0.7952755689620972, + "num_tokens": 248832.0, + "step": 486 + }, + { + "entropy": 1.4885849952697754, + "epoch": 7.609375, + "grad_norm": 2.4192397594451904, + "learning_rate": 1.9047872763212347e-06, + "loss": 1.1900980472564697, + "mean_token_accuracy": 0.7917525768280029, + "num_tokens": 249344.0, + "step": 487 + }, + { + "entropy": 1.2745150327682495, + "epoch": 7.625, + "grad_norm": 2.538703203201294, + "learning_rate": 1.7613648403875802e-06, + "loss": 0.8750802874565125, + "mean_token_accuracy": 0.8223140239715576, + "num_tokens": 249856.0, + "step": 488 + }, + { + "entropy": 1.3218390941619873, + "epoch": 7.640625, + "grad_norm": 2.5100438594818115, + "learning_rate": 1.6235235052828476e-06, + "loss": 1.1404964923858643, + "mean_token_accuracy": 0.787401556968689, + "num_tokens": 250368.0, + "step": 489 + }, + { + "entropy": 1.3542776107788086, + "epoch": 7.65625, + "grad_norm": 2.773221969604492, + "learning_rate": 1.4912684606554482e-06, + "loss": 1.2540241479873657, + "mean_token_accuracy": 0.7559055089950562, + "num_tokens": 250880.0, + "step": 490 + }, + { + "entropy": 1.227868914604187, + "epoch": 7.671875, + "grad_norm": 2.7581679821014404, + "learning_rate": 1.3646046858329984e-06, + "loss": 0.8930553197860718, + "mean_token_accuracy": 0.8255578279495239, + "num_tokens": 251392.0, + "step": 491 + }, + { + "entropy": 1.2694940567016602, + "epoch": 7.6875, + "grad_norm": 2.4182348251342773, + "learning_rate": 1.2435369496350711e-06, + "loss": 1.0677306652069092, + "mean_token_accuracy": 0.7913385629653931, + "num_tokens": 251904.0, + "step": 492 + }, + { + "entropy": 1.341015338897705, + "epoch": 7.703125, + "grad_norm": 2.8310325145721436, + "learning_rate": 1.128069810193505e-06, + "loss": 1.1821932792663574, + "mean_token_accuracy": 0.7696850299835205, + "num_tokens": 252416.0, + "step": 493 + }, + { + "entropy": 1.3577978610992432, + "epoch": 7.71875, + "grad_norm": 2.641374111175537, + "learning_rate": 1.018207614780825e-06, + "loss": 1.157777190208435, + "mean_token_accuracy": 0.7795275449752808, + "num_tokens": 252928.0, + "step": 494 + }, + { + "entropy": 1.3288973569869995, + "epoch": 7.734375, + "grad_norm": 2.7389330863952637, + "learning_rate": 9.139544996465908e-07, + "loss": 1.018500804901123, + "mean_token_accuracy": 0.8004032373428345, + "num_tokens": 253440.0, + "step": 495 + }, + { + "entropy": 1.0912328958511353, + "epoch": 7.75, + "grad_norm": 2.2947134971618652, + "learning_rate": 8.153143898616876e-07, + "loss": 0.9197831749916077, + "mean_token_accuracy": 0.8167330622673035, + "num_tokens": 253952.0, + "step": 496 + }, + { + "entropy": 1.4121466875076294, + "epoch": 7.765625, + "grad_norm": 2.588513135910034, + "learning_rate": 7.222909991704773e-07, + "loss": 1.215504765510559, + "mean_token_accuracy": 0.772455096244812, + "num_tokens": 254464.0, + "step": 497 + }, + { + "entropy": 1.4346033334732056, + "epoch": 7.78125, + "grad_norm": 2.178701400756836, + "learning_rate": 6.348878298510274e-07, + "loss": 0.8305696845054626, + "mean_token_accuracy": 0.8376068472862244, + "num_tokens": 254976.0, + "step": 498 + }, + { + "entropy": 1.3011205196380615, + "epoch": 7.796875, + "grad_norm": 2.3126487731933594, + "learning_rate": 5.531081725832998e-07, + "loss": 1.1374728679656982, + "mean_token_accuracy": 0.7913385629653931, + "num_tokens": 255488.0, + "step": 499 + }, + { + "entropy": 1.401319146156311, + "epoch": 7.8125, + "grad_norm": 2.48214054107666, + "learning_rate": 4.769551063251497e-07, + "loss": 1.0062270164489746, + "mean_token_accuracy": 0.8073770403862, + "num_tokens": 256000.0, + "step": 500 + }, + { + "entropy": 1.2516509294509888, + "epoch": 7.828125, + "grad_norm": 2.50232195854187, + "learning_rate": 4.064314981964689e-07, + "loss": 0.7502802014350891, + "mean_token_accuracy": 0.850210964679718, + "num_tokens": 256512.0, + "step": 501 + }, + { + "entropy": 1.3641161918640137, + "epoch": 7.84375, + "grad_norm": 2.276825428009033, + "learning_rate": 3.415400033712545e-07, + "loss": 1.0526609420776367, + "mean_token_accuracy": 0.7983871102333069, + "num_tokens": 257024.0, + "step": 502 + }, + { + "entropy": 1.1331143379211426, + "epoch": 7.859375, + "grad_norm": 2.2967472076416016, + "learning_rate": 2.822830649776231e-07, + "loss": 0.7948004603385925, + "mean_token_accuracy": 0.8393574357032776, + "num_tokens": 257536.0, + "step": 503 + }, + { + "entropy": 1.3357852697372437, + "epoch": 7.875, + "grad_norm": 2.6608662605285645, + "learning_rate": 2.2866291400578385e-07, + "loss": 1.15070641040802, + "mean_token_accuracy": 0.7736220359802246, + "num_tokens": 258048.0, + "step": 504 + }, + { + "entropy": 1.2608537673950195, + "epoch": 7.890625, + "grad_norm": 2.4359138011932373, + "learning_rate": 1.8068156922413924e-07, + "loss": 1.077385663986206, + "mean_token_accuracy": 0.8011810779571533, + "num_tokens": 258560.0, + "step": 505 + }, + { + "entropy": 1.3851075172424316, + "epoch": 7.90625, + "grad_norm": 2.354517936706543, + "learning_rate": 1.3834083710319577e-07, + "loss": 0.9085444808006287, + "mean_token_accuracy": 0.837837815284729, + "num_tokens": 259072.0, + "step": 506 + }, + { + "entropy": 1.3305258750915527, + "epoch": 7.921875, + "grad_norm": 2.5277912616729736, + "learning_rate": 1.0164231174756843e-07, + "loss": 1.1649365425109863, + "mean_token_accuracy": 0.7972440719604492, + "num_tokens": 259584.0, + "step": 507 + }, + { + "entropy": 1.3707011938095093, + "epoch": 7.9375, + "grad_norm": 2.5459818840026855, + "learning_rate": 7.058737483602861e-08, + "loss": 1.1839475631713867, + "mean_token_accuracy": 0.7755905389785767, + "num_tokens": 260096.0, + "step": 508 + }, + { + "entropy": 1.3563947677612305, + "epoch": 7.953125, + "grad_norm": 2.303511142730713, + "learning_rate": 4.51771955693625e-08, + "loss": 1.1173865795135498, + "mean_token_accuracy": 0.7955911755561829, + "num_tokens": 260608.0, + "step": 509 + }, + { + "entropy": 1.3299047946929932, + "epoch": 7.96875, + "grad_norm": 2.4176976680755615, + "learning_rate": 2.541273062648952e-08, + "loss": 0.9735764265060425, + "mean_token_accuracy": 0.8189300298690796, + "num_tokens": 261120.0, + "step": 510 + }, + { + "entropy": 1.252206563949585, + "epoch": 7.984375, + "grad_norm": 2.453003168106079, + "learning_rate": 1.1294724128324551e-08, + "loss": 1.102551817893982, + "mean_token_accuracy": 0.782608687877655, + "num_tokens": 261632.0, + "step": 511 + }, + { + "entropy": 1.3690651655197144, + "epoch": 8.0, + "grad_norm": 2.252596616744995, + "learning_rate": 2.8237076098336365e-09, + "loss": 1.034975528717041, + "mean_token_accuracy": 0.8102040886878967, + "num_tokens": 262144.0, + "step": 512 + } + ], + "logging_steps": 1, + "max_steps": 512, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 128, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 70078365696000.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}