diff --git "a/VideoRFT-SFT-64/checkpoint-2500/trainer_state.json" "b/VideoRFT-SFT-64/checkpoint-2500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/VideoRFT-SFT-64/checkpoint-2500/trainer_state.json" @@ -0,0 +1,22533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5599032497171614, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006241953731517965, + "grad_norm": 17.25544548034668, + "learning_rate": 9.996e-07, + "loss": 8.5434, + "mean_token_accuracy": 0.273918142542243, + "num_tokens": 68439.0, + "step": 1 + }, + { + "epoch": 0.001248390746303593, + "grad_norm": 358.458984375, + "learning_rate": 9.992e-07, + "loss": 9.5736, + "mean_token_accuracy": 0.23063147906214, + "num_tokens": 138954.0, + "step": 2 + }, + { + "epoch": 0.0018725861194553895, + "grad_norm": 27.68710708618164, + "learning_rate": 9.988e-07, + "loss": 9.2342, + "mean_token_accuracy": 0.24578081723302603, + "num_tokens": 208893.0, + "step": 3 + }, + { + "epoch": 0.002496781492607186, + "grad_norm": 11.739906311035156, + "learning_rate": 9.983999999999998e-07, + "loss": 8.3543, + "mean_token_accuracy": 0.28470986522734165, + "num_tokens": 276500.0, + "step": 4 + }, + { + "epoch": 0.0031209768657589824, + "grad_norm": 14.976438522338867, + "learning_rate": 9.98e-07, + "loss": 9.3069, + "mean_token_accuracy": 0.23937270510941744, + "num_tokens": 348577.0, + "step": 5 + }, + { + "epoch": 0.003745172238910779, + "grad_norm": 34.87214660644531, + "learning_rate": 9.976e-07, + "loss": 8.2966, + "mean_token_accuracy": 0.2748953141272068, + "num_tokens": 418896.0, + "step": 6 + }, + { + "epoch": 0.004369367612062576, + "grad_norm": 18.72233772277832, + "learning_rate": 9.972e-07, + "loss": 8.5032, + "mean_token_accuracy": 0.27094727009534836, + "num_tokens": 489370.0, + "step": 7 + }, + { + "epoch": 0.004993562985214372, + "grad_norm": 11.080031394958496, + "learning_rate": 9.968e-07, + "loss": 9.3235, + "mean_token_accuracy": 0.24496650323271751, + "num_tokens": 562867.0, + "step": 8 + }, + { + "epoch": 0.005617758358366169, + "grad_norm": 90.85437774658203, + "learning_rate": 9.964e-07, + "loss": 8.7871, + "mean_token_accuracy": 0.2528521418571472, + "num_tokens": 634883.0, + "step": 9 + }, + { + "epoch": 0.006241953731517965, + "grad_norm": 30.238351821899414, + "learning_rate": 9.959999999999999e-07, + "loss": 8.3435, + "mean_token_accuracy": 0.2776096425950527, + "num_tokens": 707954.0, + "step": 10 + }, + { + "epoch": 0.006866149104669762, + "grad_norm": 15.585051536560059, + "learning_rate": 9.956e-07, + "loss": 7.7686, + "mean_token_accuracy": 0.3210552539676428, + "num_tokens": 780843.0, + "step": 11 + }, + { + "epoch": 0.007490344477821558, + "grad_norm": 15.990130424499512, + "learning_rate": 9.952e-07, + "loss": 7.8187, + "mean_token_accuracy": 0.3180449502542615, + "num_tokens": 854137.0, + "step": 12 + }, + { + "epoch": 0.008114539850973355, + "grad_norm": 25.397552490234375, + "learning_rate": 9.948e-07, + "loss": 7.9021, + "mean_token_accuracy": 0.2877113036811352, + "num_tokens": 926274.0, + "step": 13 + }, + { + "epoch": 0.008738735224125152, + "grad_norm": 12.293536186218262, + "learning_rate": 9.944e-07, + "loss": 7.169, + "mean_token_accuracy": 0.3381050443276763, + "num_tokens": 996445.0, + "step": 14 + }, + { + "epoch": 0.009362930597276947, + "grad_norm": 45.536712646484375, + "learning_rate": 9.94e-07, + "loss": 7.8367, + "mean_token_accuracy": 0.30633394606411457, + "num_tokens": 1068884.0, + "step": 15 + }, + { + "epoch": 0.009987125970428744, + "grad_norm": 64.1750717163086, + "learning_rate": 9.936e-07, + "loss": 7.5882, + "mean_token_accuracy": 0.31630368810147047, + "num_tokens": 1142700.0, + "step": 16 + }, + { + "epoch": 0.01061132134358054, + "grad_norm": 42.90964889526367, + "learning_rate": 9.931999999999999e-07, + "loss": 7.8268, + "mean_token_accuracy": 0.2754935659468174, + "num_tokens": 1216019.0, + "step": 17 + }, + { + "epoch": 0.011235516716732338, + "grad_norm": 6.927515029907227, + "learning_rate": 9.928e-07, + "loss": 6.9444, + "mean_token_accuracy": 0.33526306599378586, + "num_tokens": 1285731.0, + "step": 18 + }, + { + "epoch": 0.011859712089884135, + "grad_norm": 24.95079231262207, + "learning_rate": 9.923999999999998e-07, + "loss": 6.5419, + "mean_token_accuracy": 0.34128680545836687, + "num_tokens": 1358383.0, + "step": 19 + }, + { + "epoch": 0.01248390746303593, + "grad_norm": 33.162437438964844, + "learning_rate": 9.92e-07, + "loss": 6.5872, + "mean_token_accuracy": 0.3143086237832904, + "num_tokens": 1430552.0, + "step": 20 + }, + { + "epoch": 0.013108102836187727, + "grad_norm": 11.807770729064941, + "learning_rate": 9.916e-07, + "loss": 5.0385, + "mean_token_accuracy": 0.3795516202226281, + "num_tokens": 1494029.0, + "step": 21 + }, + { + "epoch": 0.013732298209339524, + "grad_norm": 27.99307632446289, + "learning_rate": 9.912e-07, + "loss": 6.1081, + "mean_token_accuracy": 0.32700329925864935, + "num_tokens": 1565051.0, + "step": 22 + }, + { + "epoch": 0.01435649358249132, + "grad_norm": 16.961132049560547, + "learning_rate": 9.908e-07, + "loss": 6.285, + "mean_token_accuracy": 0.3062258521094918, + "num_tokens": 1638553.0, + "step": 23 + }, + { + "epoch": 0.014980688955643116, + "grad_norm": 9.736113548278809, + "learning_rate": 9.903999999999999e-07, + "loss": 5.5588, + "mean_token_accuracy": 0.35241850558668375, + "num_tokens": 1708763.0, + "step": 24 + }, + { + "epoch": 0.015604884328794913, + "grad_norm": 11.06151008605957, + "learning_rate": 9.9e-07, + "loss": 6.1869, + "mean_token_accuracy": 0.31777405086904764, + "num_tokens": 1784143.0, + "step": 25 + }, + { + "epoch": 0.01622907970194671, + "grad_norm": 11.496856689453125, + "learning_rate": 9.896e-07, + "loss": 5.58, + "mean_token_accuracy": 0.3433418981730938, + "num_tokens": 1853644.0, + "step": 26 + }, + { + "epoch": 0.016853275075098505, + "grad_norm": 8.099380493164062, + "learning_rate": 9.892e-07, + "loss": 6.3192, + "mean_token_accuracy": 0.3149422388523817, + "num_tokens": 1929869.0, + "step": 27 + }, + { + "epoch": 0.017477470448250303, + "grad_norm": 30.143823623657227, + "learning_rate": 9.888e-07, + "loss": 5.7214, + "mean_token_accuracy": 0.32226298097521067, + "num_tokens": 2001157.0, + "step": 28 + }, + { + "epoch": 0.0181016658214021, + "grad_norm": 6.38374137878418, + "learning_rate": 9.884e-07, + "loss": 5.7185, + "mean_token_accuracy": 0.3307513426989317, + "num_tokens": 2075305.0, + "step": 29 + }, + { + "epoch": 0.018725861194553894, + "grad_norm": 0.8968116044998169, + "learning_rate": 9.88e-07, + "loss": 5.4392, + "mean_token_accuracy": 0.34164623729884624, + "num_tokens": 2146379.0, + "step": 30 + }, + { + "epoch": 0.019350056567705692, + "grad_norm": 13.619659423828125, + "learning_rate": 9.876e-07, + "loss": 5.656, + "mean_token_accuracy": 0.3418971076607704, + "num_tokens": 2221821.0, + "step": 31 + }, + { + "epoch": 0.019974251940857488, + "grad_norm": 4.945206165313721, + "learning_rate": 9.871999999999998e-07, + "loss": 5.0611, + "mean_token_accuracy": 0.35001065488904715, + "num_tokens": 2292070.0, + "step": 32 + }, + { + "epoch": 0.020598447314009286, + "grad_norm": 40.115970611572266, + "learning_rate": 9.868e-07, + "loss": 5.1247, + "mean_token_accuracy": 0.35443792305886745, + "num_tokens": 2364977.0, + "step": 33 + }, + { + "epoch": 0.02122264268716108, + "grad_norm": 13.273247718811035, + "learning_rate": 9.864e-07, + "loss": 4.9631, + "mean_token_accuracy": 0.364372487179935, + "num_tokens": 2435306.0, + "step": 34 + }, + { + "epoch": 0.021846838060312877, + "grad_norm": 7.187255382537842, + "learning_rate": 9.86e-07, + "loss": 4.4443, + "mean_token_accuracy": 0.3889983994886279, + "num_tokens": 2506196.0, + "step": 35 + }, + { + "epoch": 0.022471033433464675, + "grad_norm": 14.940446853637695, + "learning_rate": 9.856e-07, + "loss": 4.8625, + "mean_token_accuracy": 0.3650856511667371, + "num_tokens": 2579194.0, + "step": 36 + }, + { + "epoch": 0.02309522880661647, + "grad_norm": 3.3641610145568848, + "learning_rate": 9.852e-07, + "loss": 5.0014, + "mean_token_accuracy": 0.3314152853563428, + "num_tokens": 2654270.0, + "step": 37 + }, + { + "epoch": 0.02371942417976827, + "grad_norm": 6.470470905303955, + "learning_rate": 9.847999999999999e-07, + "loss": 4.857, + "mean_token_accuracy": 0.3407899560406804, + "num_tokens": 2728760.0, + "step": 38 + }, + { + "epoch": 0.024343619552920064, + "grad_norm": 6.731522083282471, + "learning_rate": 9.844e-07, + "loss": 5.1998, + "mean_token_accuracy": 0.2973237466067076, + "num_tokens": 2805485.0, + "step": 39 + }, + { + "epoch": 0.02496781492607186, + "grad_norm": 2.7483808994293213, + "learning_rate": 9.84e-07, + "loss": 4.0682, + "mean_token_accuracy": 0.39288341347128153, + "num_tokens": 2875491.0, + "step": 40 + }, + { + "epoch": 0.025592010299223658, + "grad_norm": 4.212852954864502, + "learning_rate": 9.836e-07, + "loss": 4.913, + "mean_token_accuracy": 0.3132828688248992, + "num_tokens": 2950304.0, + "step": 41 + }, + { + "epoch": 0.026216205672375453, + "grad_norm": 3.8348255157470703, + "learning_rate": 9.832e-07, + "loss": 4.3569, + "mean_token_accuracy": 0.3750774832442403, + "num_tokens": 3021730.0, + "step": 42 + }, + { + "epoch": 0.02684040104552725, + "grad_norm": 0.8846646547317505, + "learning_rate": 9.828e-07, + "loss": 3.9858, + "mean_token_accuracy": 0.39173205476254225, + "num_tokens": 3091280.0, + "step": 43 + }, + { + "epoch": 0.027464596418679047, + "grad_norm": 6.942370414733887, + "learning_rate": 9.824e-07, + "loss": 4.419, + "mean_token_accuracy": 0.3575789360329509, + "num_tokens": 3165267.0, + "step": 44 + }, + { + "epoch": 0.028088791791830842, + "grad_norm": 2.3689680099487305, + "learning_rate": 9.819999999999999e-07, + "loss": 4.1538, + "mean_token_accuracy": 0.3756133858114481, + "num_tokens": 3237323.0, + "step": 45 + }, + { + "epoch": 0.02871298716498264, + "grad_norm": 1.8894206285476685, + "learning_rate": 9.816e-07, + "loss": 3.9113, + "mean_token_accuracy": 0.4163288287818432, + "num_tokens": 3308609.0, + "step": 46 + }, + { + "epoch": 0.029337182538134436, + "grad_norm": 13.67268180847168, + "learning_rate": 9.811999999999998e-07, + "loss": 3.6531, + "mean_token_accuracy": 0.41791752725839615, + "num_tokens": 3376906.0, + "step": 47 + }, + { + "epoch": 0.02996137791128623, + "grad_norm": 4.209970474243164, + "learning_rate": 9.808e-07, + "loss": 4.5005, + "mean_token_accuracy": 0.34258707892149687, + "num_tokens": 3452278.0, + "step": 48 + }, + { + "epoch": 0.03058557328443803, + "grad_norm": 7.526029109954834, + "learning_rate": 9.804e-07, + "loss": 4.2911, + "mean_token_accuracy": 0.3827690966427326, + "num_tokens": 3527058.0, + "step": 49 + }, + { + "epoch": 0.031209768657589825, + "grad_norm": 2.122880220413208, + "learning_rate": 9.8e-07, + "loss": 4.3308, + "mean_token_accuracy": 0.3736236896365881, + "num_tokens": 3600503.0, + "step": 50 + }, + { + "epoch": 0.031833964030741624, + "grad_norm": 3.8492212295532227, + "learning_rate": 9.796e-07, + "loss": 4.099, + "mean_token_accuracy": 0.3650047332048416, + "num_tokens": 3673199.0, + "step": 51 + }, + { + "epoch": 0.03245815940389342, + "grad_norm": 3.9944045543670654, + "learning_rate": 9.791999999999999e-07, + "loss": 3.1844, + "mean_token_accuracy": 0.4542169217020273, + "num_tokens": 3740412.0, + "step": 52 + }, + { + "epoch": 0.033082354777045214, + "grad_norm": 6.169367790222168, + "learning_rate": 9.788e-07, + "loss": 4.0363, + "mean_token_accuracy": 0.3692982345819473, + "num_tokens": 3815563.0, + "step": 53 + }, + { + "epoch": 0.03370655015019701, + "grad_norm": 2.1248767375946045, + "learning_rate": 9.784e-07, + "loss": 3.5037, + "mean_token_accuracy": 0.4372657844796777, + "num_tokens": 3885917.0, + "step": 54 + }, + { + "epoch": 0.03433074552334881, + "grad_norm": 6.588809013366699, + "learning_rate": 9.78e-07, + "loss": 3.4912, + "mean_token_accuracy": 0.41233247332274914, + "num_tokens": 3956647.0, + "step": 55 + }, + { + "epoch": 0.03495494089650061, + "grad_norm": 4.523096084594727, + "learning_rate": 9.776e-07, + "loss": 3.9076, + "mean_token_accuracy": 0.39284045808017254, + "num_tokens": 4033452.0, + "step": 56 + }, + { + "epoch": 0.0355791362696524, + "grad_norm": 7.290942668914795, + "learning_rate": 9.772e-07, + "loss": 3.5392, + "mean_token_accuracy": 0.427378224208951, + "num_tokens": 4104394.0, + "step": 57 + }, + { + "epoch": 0.0362033316428042, + "grad_norm": 10.499979972839355, + "learning_rate": 9.768e-07, + "loss": 3.8001, + "mean_token_accuracy": 0.39460830576717854, + "num_tokens": 4179110.0, + "step": 58 + }, + { + "epoch": 0.03682752701595599, + "grad_norm": 2.6359169483184814, + "learning_rate": 9.764e-07, + "loss": 3.3732, + "mean_token_accuracy": 0.42214369028806686, + "num_tokens": 4249007.0, + "step": 59 + }, + { + "epoch": 0.03745172238910779, + "grad_norm": 2.7533557415008545, + "learning_rate": 9.759999999999998e-07, + "loss": 3.5268, + "mean_token_accuracy": 0.41251463908702135, + "num_tokens": 4320460.0, + "step": 60 + }, + { + "epoch": 0.03807591776225959, + "grad_norm": 7.864406108856201, + "learning_rate": 9.756e-07, + "loss": 3.4771, + "mean_token_accuracy": 0.4127057120203972, + "num_tokens": 4393683.0, + "step": 61 + }, + { + "epoch": 0.038700113135411385, + "grad_norm": 8.053912162780762, + "learning_rate": 9.752e-07, + "loss": 3.6572, + "mean_token_accuracy": 0.39490477088838816, + "num_tokens": 4469096.0, + "step": 62 + }, + { + "epoch": 0.03932430850856318, + "grad_norm": 5.509406089782715, + "learning_rate": 9.748e-07, + "loss": 3.1147, + "mean_token_accuracy": 0.44453135784715414, + "num_tokens": 4536789.0, + "step": 63 + }, + { + "epoch": 0.039948503881714975, + "grad_norm": 2.907407522201538, + "learning_rate": 9.744e-07, + "loss": 3.191, + "mean_token_accuracy": 0.4393644714727998, + "num_tokens": 4608142.0, + "step": 64 + }, + { + "epoch": 0.04057269925486677, + "grad_norm": 2.617462158203125, + "learning_rate": 9.74e-07, + "loss": 3.4136, + "mean_token_accuracy": 0.3966305162757635, + "num_tokens": 4681844.0, + "step": 65 + }, + { + "epoch": 0.04119689462801857, + "grad_norm": 2.643540143966675, + "learning_rate": 9.735999999999999e-07, + "loss": 3.558, + "mean_token_accuracy": 0.4038737080991268, + "num_tokens": 4757568.0, + "step": 66 + }, + { + "epoch": 0.04182109000117037, + "grad_norm": 0.34584879875183105, + "learning_rate": 9.731999999999998e-07, + "loss": 3.3735, + "mean_token_accuracy": 0.41008515655994415, + "num_tokens": 4830387.0, + "step": 67 + }, + { + "epoch": 0.04244528537432216, + "grad_norm": 8.369765281677246, + "learning_rate": 9.728e-07, + "loss": 3.5372, + "mean_token_accuracy": 0.4134551119059324, + "num_tokens": 4903369.0, + "step": 68 + }, + { + "epoch": 0.04306948074747396, + "grad_norm": 2.8616392612457275, + "learning_rate": 9.724e-07, + "loss": 3.6074, + "mean_token_accuracy": 0.3741075322031975, + "num_tokens": 4979757.0, + "step": 69 + }, + { + "epoch": 0.04369367612062575, + "grad_norm": 8.232832908630371, + "learning_rate": 9.72e-07, + "loss": 3.8138, + "mean_token_accuracy": 0.3565077856183052, + "num_tokens": 5057123.0, + "step": 70 + }, + { + "epoch": 0.044317871493777555, + "grad_norm": 5.5518646240234375, + "learning_rate": 9.716e-07, + "loss": 3.3853, + "mean_token_accuracy": 0.40091484785079956, + "num_tokens": 5131793.0, + "step": 71 + }, + { + "epoch": 0.04494206686692935, + "grad_norm": 7.901788711547852, + "learning_rate": 9.712e-07, + "loss": 3.1943, + "mean_token_accuracy": 0.42189537547528744, + "num_tokens": 5204744.0, + "step": 72 + }, + { + "epoch": 0.045566262240081146, + "grad_norm": 5.840290069580078, + "learning_rate": 9.707999999999999e-07, + "loss": 2.9538, + "mean_token_accuracy": 0.44593478739261627, + "num_tokens": 5274641.0, + "step": 73 + }, + { + "epoch": 0.04619045761323294, + "grad_norm": 5.598299980163574, + "learning_rate": 9.704e-07, + "loss": 2.7585, + "mean_token_accuracy": 0.4696329589933157, + "num_tokens": 5342886.0, + "step": 74 + }, + { + "epoch": 0.046814652986384736, + "grad_norm": 5.895838260650635, + "learning_rate": 9.7e-07, + "loss": 3.0451, + "mean_token_accuracy": 0.43241861648857594, + "num_tokens": 5416456.0, + "step": 75 + }, + { + "epoch": 0.04743884835953654, + "grad_norm": 2.880441665649414, + "learning_rate": 9.696e-07, + "loss": 3.2557, + "mean_token_accuracy": 0.4075364079326391, + "num_tokens": 5490235.0, + "step": 76 + }, + { + "epoch": 0.04806304373268833, + "grad_norm": 0.316191703081131, + "learning_rate": 9.692e-07, + "loss": 2.4266, + "mean_token_accuracy": 0.49917714670300484, + "num_tokens": 5555086.0, + "step": 77 + }, + { + "epoch": 0.04868723910584013, + "grad_norm": 5.317595481872559, + "learning_rate": 9.688e-07, + "loss": 3.3519, + "mean_token_accuracy": 0.3744805287569761, + "num_tokens": 5629905.0, + "step": 78 + }, + { + "epoch": 0.049311434478991924, + "grad_norm": 8.304346084594727, + "learning_rate": 9.684e-07, + "loss": 3.2339, + "mean_token_accuracy": 0.39141651801764965, + "num_tokens": 5702121.0, + "step": 79 + }, + { + "epoch": 0.04993562985214372, + "grad_norm": 8.848092079162598, + "learning_rate": 9.679999999999999e-07, + "loss": 3.4629, + "mean_token_accuracy": 0.3851572498679161, + "num_tokens": 5779350.0, + "step": 80 + }, + { + "epoch": 0.050559825225295514, + "grad_norm": 8.295083045959473, + "learning_rate": 9.676e-07, + "loss": 3.3826, + "mean_token_accuracy": 0.39699453115463257, + "num_tokens": 5856826.0, + "step": 81 + }, + { + "epoch": 0.051184020598447316, + "grad_norm": 10.870695114135742, + "learning_rate": 9.671999999999998e-07, + "loss": 2.8136, + "mean_token_accuracy": 0.47172751650214195, + "num_tokens": 5926579.0, + "step": 82 + }, + { + "epoch": 0.05180821597159911, + "grad_norm": 2.6977903842926025, + "learning_rate": 9.668e-07, + "loss": 2.7124, + "mean_token_accuracy": 0.46093476563692093, + "num_tokens": 5998201.0, + "step": 83 + }, + { + "epoch": 0.05243241134475091, + "grad_norm": 5.437819957733154, + "learning_rate": 9.664e-07, + "loss": 2.6206, + "mean_token_accuracy": 0.4777855705469847, + "num_tokens": 6067593.0, + "step": 84 + }, + { + "epoch": 0.0530566067179027, + "grad_norm": 5.301544189453125, + "learning_rate": 9.66e-07, + "loss": 2.7652, + "mean_token_accuracy": 0.45118373818695545, + "num_tokens": 6137564.0, + "step": 85 + }, + { + "epoch": 0.0536808020910545, + "grad_norm": 8.00975227355957, + "learning_rate": 9.656e-07, + "loss": 3.056, + "mean_token_accuracy": 0.426558505743742, + "num_tokens": 6213150.0, + "step": 86 + }, + { + "epoch": 0.0543049974642063, + "grad_norm": 5.430366039276123, + "learning_rate": 9.651999999999999e-07, + "loss": 2.7748, + "mean_token_accuracy": 0.44263155199587345, + "num_tokens": 6284919.0, + "step": 87 + }, + { + "epoch": 0.054929192837358094, + "grad_norm": 5.378356456756592, + "learning_rate": 9.647999999999999e-07, + "loss": 2.7902, + "mean_token_accuracy": 0.4566228464245796, + "num_tokens": 6357628.0, + "step": 88 + }, + { + "epoch": 0.05555338821050989, + "grad_norm": 5.3619704246521, + "learning_rate": 9.644e-07, + "loss": 3.3253, + "mean_token_accuracy": 0.38746162317693233, + "num_tokens": 6433563.0, + "step": 89 + }, + { + "epoch": 0.056177583583661685, + "grad_norm": 7.832232475280762, + "learning_rate": 9.64e-07, + "loss": 2.7835, + "mean_token_accuracy": 0.4331488534808159, + "num_tokens": 6504577.0, + "step": 90 + }, + { + "epoch": 0.05680177895681348, + "grad_norm": 2.9571073055267334, + "learning_rate": 9.636e-07, + "loss": 2.8488, + "mean_token_accuracy": 0.4346345001831651, + "num_tokens": 6576313.0, + "step": 91 + }, + { + "epoch": 0.05742597432996528, + "grad_norm": 5.558602333068848, + "learning_rate": 9.632e-07, + "loss": 2.6471, + "mean_token_accuracy": 0.4774735514074564, + "num_tokens": 6647849.0, + "step": 92 + }, + { + "epoch": 0.05805016970311708, + "grad_norm": 8.028719902038574, + "learning_rate": 9.628e-07, + "loss": 3.0663, + "mean_token_accuracy": 0.3925786167383194, + "num_tokens": 6722617.0, + "step": 93 + }, + { + "epoch": 0.05867436507626887, + "grad_norm": 5.597121715545654, + "learning_rate": 9.624e-07, + "loss": 2.5673, + "mean_token_accuracy": 0.4711476918309927, + "num_tokens": 6792266.0, + "step": 94 + }, + { + "epoch": 0.05929856044942067, + "grad_norm": 2.907738208770752, + "learning_rate": 9.619999999999999e-07, + "loss": 2.3481, + "mean_token_accuracy": 0.48446944914758205, + "num_tokens": 6861691.0, + "step": 95 + }, + { + "epoch": 0.05992275582257246, + "grad_norm": 7.614640235900879, + "learning_rate": 9.616e-07, + "loss": 2.5295, + "mean_token_accuracy": 0.47646917775273323, + "num_tokens": 6931251.0, + "step": 96 + }, + { + "epoch": 0.060546951195724265, + "grad_norm": 5.174458980560303, + "learning_rate": 9.612e-07, + "loss": 2.9762, + "mean_token_accuracy": 0.39805862680077553, + "num_tokens": 7007101.0, + "step": 97 + }, + { + "epoch": 0.06117114656887606, + "grad_norm": 5.423238277435303, + "learning_rate": 9.608e-07, + "loss": 3.1361, + "mean_token_accuracy": 0.37795576453208923, + "num_tokens": 7082510.0, + "step": 98 + }, + { + "epoch": 0.061795341942027855, + "grad_norm": 5.688440322875977, + "learning_rate": 9.604e-07, + "loss": 2.4505, + "mean_token_accuracy": 0.4660475980490446, + "num_tokens": 7151058.0, + "step": 99 + }, + { + "epoch": 0.06241953731517965, + "grad_norm": 5.158726692199707, + "learning_rate": 9.6e-07, + "loss": 2.7193, + "mean_token_accuracy": 0.43638070672750473, + "num_tokens": 7224260.0, + "step": 100 + }, + { + "epoch": 0.06304373268833145, + "grad_norm": 3.8112547397613525, + "learning_rate": 9.595999999999999e-07, + "loss": 2.3645, + "mean_token_accuracy": 0.4727396834641695, + "num_tokens": 7291378.0, + "step": 101 + }, + { + "epoch": 0.06366792806148325, + "grad_norm": 8.052077293395996, + "learning_rate": 9.592e-07, + "loss": 2.8315, + "mean_token_accuracy": 0.4083132743835449, + "num_tokens": 7364994.0, + "step": 102 + }, + { + "epoch": 0.06429212343463504, + "grad_norm": 10.764423370361328, + "learning_rate": 9.588e-07, + "loss": 2.2744, + "mean_token_accuracy": 0.4706199821084738, + "num_tokens": 7431894.0, + "step": 103 + }, + { + "epoch": 0.06491631880778684, + "grad_norm": 0.34294837713241577, + "learning_rate": 9.584e-07, + "loss": 2.4039, + "mean_token_accuracy": 0.4625617694109678, + "num_tokens": 7500493.0, + "step": 104 + }, + { + "epoch": 0.06554051418093863, + "grad_norm": 8.098499298095703, + "learning_rate": 9.58e-07, + "loss": 2.8511, + "mean_token_accuracy": 0.3948721010237932, + "num_tokens": 7575570.0, + "step": 105 + }, + { + "epoch": 0.06616470955409043, + "grad_norm": 7.982059955596924, + "learning_rate": 9.576e-07, + "loss": 3.1074, + "mean_token_accuracy": 0.3637480493634939, + "num_tokens": 7654100.0, + "step": 106 + }, + { + "epoch": 0.06678890492724222, + "grad_norm": 10.697887420654297, + "learning_rate": 9.572e-07, + "loss": 2.6167, + "mean_token_accuracy": 0.4347928697243333, + "num_tokens": 7727014.0, + "step": 107 + }, + { + "epoch": 0.06741310030039402, + "grad_norm": 10.78205394744873, + "learning_rate": 9.567999999999999e-07, + "loss": 2.493, + "mean_token_accuracy": 0.4426170848309994, + "num_tokens": 7796256.0, + "step": 108 + }, + { + "epoch": 0.06803729567354581, + "grad_norm": 8.350289344787598, + "learning_rate": 9.564e-07, + "loss": 2.5657, + "mean_token_accuracy": 0.4259480107575655, + "num_tokens": 7867745.0, + "step": 109 + }, + { + "epoch": 0.06866149104669762, + "grad_norm": 8.640109062194824, + "learning_rate": 9.559999999999998e-07, + "loss": 2.8211, + "mean_token_accuracy": 0.3914318438619375, + "num_tokens": 7943892.0, + "step": 110 + }, + { + "epoch": 0.06928568641984942, + "grad_norm": 5.193445682525635, + "learning_rate": 9.556e-07, + "loss": 2.5277, + "mean_token_accuracy": 0.43346918001770973, + "num_tokens": 8018450.0, + "step": 111 + }, + { + "epoch": 0.06990988179300121, + "grad_norm": 10.809521675109863, + "learning_rate": 9.552e-07, + "loss": 2.8952, + "mean_token_accuracy": 0.39842038974165916, + "num_tokens": 8095684.0, + "step": 112 + }, + { + "epoch": 0.07053407716615301, + "grad_norm": 5.497768878936768, + "learning_rate": 9.548e-07, + "loss": 2.3967, + "mean_token_accuracy": 0.4447348341345787, + "num_tokens": 8166406.0, + "step": 113 + }, + { + "epoch": 0.0711582725393048, + "grad_norm": 5.638972282409668, + "learning_rate": 9.544e-07, + "loss": 2.7301, + "mean_token_accuracy": 0.3977528251707554, + "num_tokens": 8239771.0, + "step": 114 + }, + { + "epoch": 0.0717824679124566, + "grad_norm": 8.302619934082031, + "learning_rate": 9.539999999999999e-07, + "loss": 2.7009, + "mean_token_accuracy": 0.3963926210999489, + "num_tokens": 8314680.0, + "step": 115 + }, + { + "epoch": 0.0724066632856084, + "grad_norm": 5.551988124847412, + "learning_rate": 9.536e-07, + "loss": 2.2423, + "mean_token_accuracy": 0.4626568406820297, + "num_tokens": 8383016.0, + "step": 116 + }, + { + "epoch": 0.07303085865876019, + "grad_norm": 10.686290740966797, + "learning_rate": 9.532e-07, + "loss": 2.3966, + "mean_token_accuracy": 0.44909630715847015, + "num_tokens": 8454455.0, + "step": 117 + }, + { + "epoch": 0.07365505403191198, + "grad_norm": 8.04770278930664, + "learning_rate": 9.527999999999999e-07, + "loss": 2.3436, + "mean_token_accuracy": 0.4485420174896717, + "num_tokens": 8525729.0, + "step": 118 + }, + { + "epoch": 0.07427924940506378, + "grad_norm": 8.204573631286621, + "learning_rate": 9.524e-07, + "loss": 2.2376, + "mean_token_accuracy": 0.46074257604777813, + "num_tokens": 8594649.0, + "step": 119 + }, + { + "epoch": 0.07490344477821557, + "grad_norm": 2.9314703941345215, + "learning_rate": 9.52e-07, + "loss": 2.25, + "mean_token_accuracy": 0.44587117433547974, + "num_tokens": 8662344.0, + "step": 120 + }, + { + "epoch": 0.07552764015136738, + "grad_norm": 8.148239135742188, + "learning_rate": 9.515999999999999e-07, + "loss": 2.2322, + "mean_token_accuracy": 0.4659658204764128, + "num_tokens": 8731400.0, + "step": 121 + }, + { + "epoch": 0.07615183552451918, + "grad_norm": 8.139657020568848, + "learning_rate": 9.512e-07, + "loss": 2.2556, + "mean_token_accuracy": 0.4854020159691572, + "num_tokens": 8801941.0, + "step": 122 + }, + { + "epoch": 0.07677603089767097, + "grad_norm": 8.320220947265625, + "learning_rate": 9.508e-07, + "loss": 2.3501, + "mean_token_accuracy": 0.5229448135942221, + "num_tokens": 8874600.0, + "step": 123 + }, + { + "epoch": 0.07740022627082277, + "grad_norm": 5.417640209197998, + "learning_rate": 9.503999999999999e-07, + "loss": 2.595, + "mean_token_accuracy": 0.5553014483302832, + "num_tokens": 8950359.0, + "step": 124 + }, + { + "epoch": 0.07802442164397456, + "grad_norm": 11.254721641540527, + "learning_rate": 9.499999999999999e-07, + "loss": 2.4769, + "mean_token_accuracy": 0.5048603918403387, + "num_tokens": 9022501.0, + "step": 125 + }, + { + "epoch": 0.07864861701712636, + "grad_norm": 2.4788269996643066, + "learning_rate": 9.496e-07, + "loss": 2.371, + "mean_token_accuracy": 0.4698783978819847, + "num_tokens": 9095815.0, + "step": 126 + }, + { + "epoch": 0.07927281239027816, + "grad_norm": 8.651468276977539, + "learning_rate": 9.492e-07, + "loss": 2.0845, + "mean_token_accuracy": 0.47476439364254475, + "num_tokens": 9162981.0, + "step": 127 + }, + { + "epoch": 0.07989700776342995, + "grad_norm": 5.616523265838623, + "learning_rate": 9.487999999999999e-07, + "loss": 2.4026, + "mean_token_accuracy": 0.46908529102802277, + "num_tokens": 9235913.0, + "step": 128 + }, + { + "epoch": 0.08052120313658175, + "grad_norm": 5.564061164855957, + "learning_rate": 9.484e-07, + "loss": 2.2018, + "mean_token_accuracy": 0.614772479981184, + "num_tokens": 9307376.0, + "step": 129 + }, + { + "epoch": 0.08114539850973354, + "grad_norm": 5.697883605957031, + "learning_rate": 9.479999999999999e-07, + "loss": 2.2463, + "mean_token_accuracy": 0.6068003140389919, + "num_tokens": 9378080.0, + "step": 130 + }, + { + "epoch": 0.08176959388288535, + "grad_norm": 2.5631017684936523, + "learning_rate": 9.475999999999999e-07, + "loss": 2.4799, + "mean_token_accuracy": 0.4611246697604656, + "num_tokens": 9454418.0, + "step": 131 + }, + { + "epoch": 0.08239378925603714, + "grad_norm": 11.26060962677002, + "learning_rate": 9.472e-07, + "loss": 2.3173, + "mean_token_accuracy": 0.4402575436979532, + "num_tokens": 9528458.0, + "step": 132 + }, + { + "epoch": 0.08301798462918894, + "grad_norm": 8.238165855407715, + "learning_rate": 9.468e-07, + "loss": 2.3181, + "mean_token_accuracy": 0.47400329262018204, + "num_tokens": 9601991.0, + "step": 133 + }, + { + "epoch": 0.08364218000234074, + "grad_norm": 8.445817947387695, + "learning_rate": 9.464e-07, + "loss": 2.0695, + "mean_token_accuracy": 0.7179492376744747, + "num_tokens": 9671423.0, + "step": 134 + }, + { + "epoch": 0.08426637537549253, + "grad_norm": 8.088071823120117, + "learning_rate": 9.459999999999999e-07, + "loss": 2.2359, + "mean_token_accuracy": 0.8247935585677624, + "num_tokens": 9742576.0, + "step": 135 + }, + { + "epoch": 0.08489057074864433, + "grad_norm": 8.635455131530762, + "learning_rate": 9.456e-07, + "loss": 2.0614, + "mean_token_accuracy": 0.653862040489912, + "num_tokens": 9813994.0, + "step": 136 + }, + { + "epoch": 0.08551476612179612, + "grad_norm": 5.3678083419799805, + "learning_rate": 9.452e-07, + "loss": 1.8988, + "mean_token_accuracy": 0.5860353857278824, + "num_tokens": 9881296.0, + "step": 137 + }, + { + "epoch": 0.08613896149494792, + "grad_norm": 8.261592864990234, + "learning_rate": 9.447999999999999e-07, + "loss": 2.1675, + "mean_token_accuracy": 0.5836942959576845, + "num_tokens": 9955180.0, + "step": 138 + }, + { + "epoch": 0.08676315686809971, + "grad_norm": 8.667990684509277, + "learning_rate": 9.444e-07, + "loss": 2.1832, + "mean_token_accuracy": 0.7322559282183647, + "num_tokens": 10027945.0, + "step": 139 + }, + { + "epoch": 0.0873873522412515, + "grad_norm": 5.292821407318115, + "learning_rate": 9.439999999999999e-07, + "loss": 1.8674, + "mean_token_accuracy": 0.8219792768359184, + "num_tokens": 10096359.0, + "step": 140 + }, + { + "epoch": 0.0880115476144033, + "grad_norm": 7.995087623596191, + "learning_rate": 9.436e-07, + "loss": 2.0883, + "mean_token_accuracy": 0.827305443584919, + "num_tokens": 10169962.0, + "step": 141 + }, + { + "epoch": 0.08863574298755511, + "grad_norm": 8.51004409790039, + "learning_rate": 9.432e-07, + "loss": 2.1471, + "mean_token_accuracy": 0.6703935079276562, + "num_tokens": 10242410.0, + "step": 142 + }, + { + "epoch": 0.0892599383607069, + "grad_norm": 4.5640411376953125, + "learning_rate": 9.427999999999999e-07, + "loss": 1.9284, + "mean_token_accuracy": 0.5677116364240646, + "num_tokens": 10311870.0, + "step": 143 + }, + { + "epoch": 0.0898841337338587, + "grad_norm": 8.135300636291504, + "learning_rate": 9.424e-07, + "loss": 1.9048, + "mean_token_accuracy": 0.6515733599662781, + "num_tokens": 10380736.0, + "step": 144 + }, + { + "epoch": 0.0905083291070105, + "grad_norm": 5.441276550292969, + "learning_rate": 9.419999999999999e-07, + "loss": 2.4262, + "mean_token_accuracy": 0.8654657527804375, + "num_tokens": 10458829.0, + "step": 145 + }, + { + "epoch": 0.09113252448016229, + "grad_norm": 5.705361366271973, + "learning_rate": 9.415999999999999e-07, + "loss": 2.2468, + "mean_token_accuracy": 0.9081156849861145, + "num_tokens": 10533998.0, + "step": 146 + }, + { + "epoch": 0.09175671985331409, + "grad_norm": 2.7271947860717773, + "learning_rate": 9.412e-07, + "loss": 1.8659, + "mean_token_accuracy": 0.8830665722489357, + "num_tokens": 10601684.0, + "step": 147 + }, + { + "epoch": 0.09238091522646588, + "grad_norm": 5.58353853225708, + "learning_rate": 9.408e-07, + "loss": 2.0464, + "mean_token_accuracy": 0.7929886244237423, + "num_tokens": 10673767.0, + "step": 148 + }, + { + "epoch": 0.09300511059961768, + "grad_norm": 7.02947473526001, + "learning_rate": 9.403999999999999e-07, + "loss": 1.9452, + "mean_token_accuracy": 0.7648746557533741, + "num_tokens": 10744697.0, + "step": 149 + }, + { + "epoch": 0.09362930597276947, + "grad_norm": 5.471534252166748, + "learning_rate": 9.399999999999999e-07, + "loss": 2.202, + "mean_token_accuracy": 0.7427481934428215, + "num_tokens": 10820687.0, + "step": 150 + }, + { + "epoch": 0.09425350134592127, + "grad_norm": 8.263473510742188, + "learning_rate": 9.396e-07, + "loss": 1.7151, + "mean_token_accuracy": 0.8576823957264423, + "num_tokens": 10887314.0, + "step": 151 + }, + { + "epoch": 0.09487769671907308, + "grad_norm": 10.934205055236816, + "learning_rate": 9.391999999999999e-07, + "loss": 2.1236, + "mean_token_accuracy": 0.9021845832467079, + "num_tokens": 10961236.0, + "step": 152 + }, + { + "epoch": 0.09550189209222487, + "grad_norm": 11.433155059814453, + "learning_rate": 9.387999999999999e-07, + "loss": 2.2159, + "mean_token_accuracy": 0.9015358090400696, + "num_tokens": 11037387.0, + "step": 153 + }, + { + "epoch": 0.09612608746537667, + "grad_norm": 5.680862903594971, + "learning_rate": 9.384e-07, + "loss": 1.5898, + "mean_token_accuracy": 0.8703570403158665, + "num_tokens": 11101996.0, + "step": 154 + }, + { + "epoch": 0.09675028283852846, + "grad_norm": 5.380183219909668, + "learning_rate": 9.379999999999998e-07, + "loss": 1.898, + "mean_token_accuracy": 0.8727101534605026, + "num_tokens": 11174169.0, + "step": 155 + }, + { + "epoch": 0.09737447821168026, + "grad_norm": 8.087199211120605, + "learning_rate": 9.375999999999999e-07, + "loss": 1.8839, + "mean_token_accuracy": 0.8993255347013474, + "num_tokens": 11247143.0, + "step": 156 + }, + { + "epoch": 0.09799867358483205, + "grad_norm": 11.024115562438965, + "learning_rate": 9.372e-07, + "loss": 1.6381, + "mean_token_accuracy": 0.9047844186425209, + "num_tokens": 11315002.0, + "step": 157 + }, + { + "epoch": 0.09862286895798385, + "grad_norm": 8.222757339477539, + "learning_rate": 9.368e-07, + "loss": 1.8106, + "mean_token_accuracy": 0.9168143719434738, + "num_tokens": 11385579.0, + "step": 158 + }, + { + "epoch": 0.09924706433113564, + "grad_norm": 2.9269771575927734, + "learning_rate": 9.363999999999999e-07, + "loss": 1.9285, + "mean_token_accuracy": 0.9164522737264633, + "num_tokens": 11457459.0, + "step": 159 + }, + { + "epoch": 0.09987125970428744, + "grad_norm": 2.9141650199890137, + "learning_rate": 9.36e-07, + "loss": 1.7026, + "mean_token_accuracy": 0.8860682621598244, + "num_tokens": 11526888.0, + "step": 160 + }, + { + "epoch": 0.10049545507743923, + "grad_norm": 8.452841758728027, + "learning_rate": 9.356e-07, + "loss": 1.8318, + "mean_token_accuracy": 0.9092588238418102, + "num_tokens": 11597756.0, + "step": 161 + }, + { + "epoch": 0.10111965045059103, + "grad_norm": 8.58360481262207, + "learning_rate": 9.352e-07, + "loss": 2.085, + "mean_token_accuracy": 0.9195736832916737, + "num_tokens": 11674933.0, + "step": 162 + }, + { + "epoch": 0.10174384582374284, + "grad_norm": 8.074356079101562, + "learning_rate": 9.347999999999999e-07, + "loss": 1.9983, + "mean_token_accuracy": 0.9227902665734291, + "num_tokens": 11750293.0, + "step": 163 + }, + { + "epoch": 0.10236804119689463, + "grad_norm": 10.822897911071777, + "learning_rate": 9.344e-07, + "loss": 2.1212, + "mean_token_accuracy": 0.922254353761673, + "num_tokens": 11830078.0, + "step": 164 + }, + { + "epoch": 0.10299223657004643, + "grad_norm": 8.02038860321045, + "learning_rate": 9.34e-07, + "loss": 1.8677, + "mean_token_accuracy": 0.9150257557630539, + "num_tokens": 11903664.0, + "step": 165 + }, + { + "epoch": 0.10361643194319822, + "grad_norm": 5.587518215179443, + "learning_rate": 9.335999999999999e-07, + "loss": 1.9175, + "mean_token_accuracy": 0.9223049283027649, + "num_tokens": 11977732.0, + "step": 166 + }, + { + "epoch": 0.10424062731635002, + "grad_norm": 11.292394638061523, + "learning_rate": 9.332e-07, + "loss": 1.9164, + "mean_token_accuracy": 0.9137436300516129, + "num_tokens": 12052295.0, + "step": 167 + }, + { + "epoch": 0.10486482268950181, + "grad_norm": 5.468366622924805, + "learning_rate": 9.327999999999999e-07, + "loss": 2.0037, + "mean_token_accuracy": 0.929412417113781, + "num_tokens": 12128204.0, + "step": 168 + }, + { + "epoch": 0.10548901806265361, + "grad_norm": 8.034927368164062, + "learning_rate": 9.324e-07, + "loss": 1.8816, + "mean_token_accuracy": 0.9212962165474892, + "num_tokens": 12203525.0, + "step": 169 + }, + { + "epoch": 0.1061132134358054, + "grad_norm": 2.746997356414795, + "learning_rate": 9.32e-07, + "loss": 1.8241, + "mean_token_accuracy": 0.9208043627440929, + "num_tokens": 12277147.0, + "step": 170 + }, + { + "epoch": 0.1067374088089572, + "grad_norm": 2.918668508529663, + "learning_rate": 9.315999999999999e-07, + "loss": 1.8257, + "mean_token_accuracy": 0.922179501503706, + "num_tokens": 12351158.0, + "step": 171 + }, + { + "epoch": 0.107361604182109, + "grad_norm": 8.092795372009277, + "learning_rate": 9.312e-07, + "loss": 1.6924, + "mean_token_accuracy": 0.9252418130636215, + "num_tokens": 12422954.0, + "step": 172 + }, + { + "epoch": 0.1079857995552608, + "grad_norm": 5.240293502807617, + "learning_rate": 9.307999999999999e-07, + "loss": 1.7143, + "mean_token_accuracy": 0.9194028824567795, + "num_tokens": 12494742.0, + "step": 173 + }, + { + "epoch": 0.1086099949284126, + "grad_norm": 4.985151767730713, + "learning_rate": 9.303999999999999e-07, + "loss": 1.5177, + "mean_token_accuracy": 0.9118992052972317, + "num_tokens": 12563185.0, + "step": 174 + }, + { + "epoch": 0.1092341903015644, + "grad_norm": 8.365934371948242, + "learning_rate": 9.3e-07, + "loss": 1.4329, + "mean_token_accuracy": 0.9112710319459438, + "num_tokens": 12630309.0, + "step": 175 + }, + { + "epoch": 0.10985838567471619, + "grad_norm": 0.28322505950927734, + "learning_rate": 9.296e-07, + "loss": 1.4942, + "mean_token_accuracy": 0.9076949469745159, + "num_tokens": 12698485.0, + "step": 176 + }, + { + "epoch": 0.11048258104786798, + "grad_norm": 7.978460788726807, + "learning_rate": 9.292e-07, + "loss": 1.581, + "mean_token_accuracy": 0.9133112877607346, + "num_tokens": 12768437.0, + "step": 177 + }, + { + "epoch": 0.11110677642101978, + "grad_norm": 5.322417259216309, + "learning_rate": 9.287999999999999e-07, + "loss": 1.5908, + "mean_token_accuracy": 0.9186087660491467, + "num_tokens": 12839515.0, + "step": 178 + }, + { + "epoch": 0.11173097179417157, + "grad_norm": 5.470612525939941, + "learning_rate": 9.284e-07, + "loss": 1.5479, + "mean_token_accuracy": 0.9106381684541702, + "num_tokens": 12910203.0, + "step": 179 + }, + { + "epoch": 0.11235516716732337, + "grad_norm": 10.888998031616211, + "learning_rate": 9.28e-07, + "loss": 1.696, + "mean_token_accuracy": 0.9200823903083801, + "num_tokens": 12984904.0, + "step": 180 + }, + { + "epoch": 0.11297936254047516, + "grad_norm": 2.578615427017212, + "learning_rate": 9.275999999999999e-07, + "loss": 1.6689, + "mean_token_accuracy": 0.9243290685117245, + "num_tokens": 13059950.0, + "step": 181 + }, + { + "epoch": 0.11360355791362696, + "grad_norm": 5.610386848449707, + "learning_rate": 9.272e-07, + "loss": 1.5423, + "mean_token_accuracy": 0.918064784258604, + "num_tokens": 13131118.0, + "step": 182 + }, + { + "epoch": 0.11422775328677875, + "grad_norm": 10.521158218383789, + "learning_rate": 9.268e-07, + "loss": 1.7309, + "mean_token_accuracy": 0.923332441598177, + "num_tokens": 13205962.0, + "step": 183 + }, + { + "epoch": 0.11485194865993056, + "grad_norm": 5.277563571929932, + "learning_rate": 9.263999999999999e-07, + "loss": 1.7071, + "mean_token_accuracy": 0.9258010573685169, + "num_tokens": 13281976.0, + "step": 184 + }, + { + "epoch": 0.11547614403308236, + "grad_norm": 7.757433891296387, + "learning_rate": 9.26e-07, + "loss": 1.4781, + "mean_token_accuracy": 0.9150664880871773, + "num_tokens": 13352745.0, + "step": 185 + }, + { + "epoch": 0.11610033940623415, + "grad_norm": 8.166189193725586, + "learning_rate": 9.256e-07, + "loss": 1.4733, + "mean_token_accuracy": 0.9123788960278034, + "num_tokens": 13422787.0, + "step": 186 + }, + { + "epoch": 0.11672453477938595, + "grad_norm": 10.356062889099121, + "learning_rate": 9.251999999999999e-07, + "loss": 1.5437, + "mean_token_accuracy": 0.9145462922751904, + "num_tokens": 13494241.0, + "step": 187 + }, + { + "epoch": 0.11734873015253774, + "grad_norm": 8.242297172546387, + "learning_rate": 9.247999999999999e-07, + "loss": 1.5485, + "mean_token_accuracy": 0.9178762994706631, + "num_tokens": 13568229.0, + "step": 188 + }, + { + "epoch": 0.11797292552568954, + "grad_norm": 5.212475299835205, + "learning_rate": 9.244e-07, + "loss": 1.4062, + "mean_token_accuracy": 0.9117616638541222, + "num_tokens": 13637973.0, + "step": 189 + }, + { + "epoch": 0.11859712089884134, + "grad_norm": 4.846549987792969, + "learning_rate": 9.24e-07, + "loss": 1.5921, + "mean_token_accuracy": 0.9237067364156246, + "num_tokens": 13714130.0, + "step": 190 + }, + { + "epoch": 0.11922131627199313, + "grad_norm": 5.192752361297607, + "learning_rate": 9.235999999999999e-07, + "loss": 1.5236, + "mean_token_accuracy": 0.9133113771677017, + "num_tokens": 13786160.0, + "step": 191 + }, + { + "epoch": 0.11984551164514493, + "grad_norm": 5.267004489898682, + "learning_rate": 9.232e-07, + "loss": 1.4886, + "mean_token_accuracy": 0.9154262915253639, + "num_tokens": 13858518.0, + "step": 192 + }, + { + "epoch": 0.12046970701829672, + "grad_norm": 10.632615089416504, + "learning_rate": 9.227999999999999e-07, + "loss": 1.4575, + "mean_token_accuracy": 0.9238744340837002, + "num_tokens": 13930328.0, + "step": 193 + }, + { + "epoch": 0.12109390239144853, + "grad_norm": 10.462716102600098, + "learning_rate": 9.224e-07, + "loss": 1.4196, + "mean_token_accuracy": 0.9145567081868649, + "num_tokens": 14001245.0, + "step": 194 + }, + { + "epoch": 0.12171809776460032, + "grad_norm": 5.286381721496582, + "learning_rate": 9.22e-07, + "loss": 1.4195, + "mean_token_accuracy": 0.9208745285868645, + "num_tokens": 14075712.0, + "step": 195 + }, + { + "epoch": 0.12234229313775212, + "grad_norm": 2.423999547958374, + "learning_rate": 9.215999999999999e-07, + "loss": 1.3295, + "mean_token_accuracy": 0.9187790527939796, + "num_tokens": 14146535.0, + "step": 196 + }, + { + "epoch": 0.12296648851090392, + "grad_norm": 7.909485340118408, + "learning_rate": 9.212e-07, + "loss": 1.3757, + "mean_token_accuracy": 0.9210257269442081, + "num_tokens": 14217629.0, + "step": 197 + }, + { + "epoch": 0.12359068388405571, + "grad_norm": 2.5925230979919434, + "learning_rate": 9.207999999999999e-07, + "loss": 1.2879, + "mean_token_accuracy": 0.9173031412065029, + "num_tokens": 14288166.0, + "step": 198 + }, + { + "epoch": 0.1242148792572075, + "grad_norm": 7.866016387939453, + "learning_rate": 9.203999999999999e-07, + "loss": 1.3209, + "mean_token_accuracy": 0.915681928396225, + "num_tokens": 14360576.0, + "step": 199 + }, + { + "epoch": 0.1248390746303593, + "grad_norm": 5.128418445587158, + "learning_rate": 9.2e-07, + "loss": 1.4467, + "mean_token_accuracy": 0.91860106959939, + "num_tokens": 14436107.0, + "step": 200 + }, + { + "epoch": 0.1254632700035111, + "grad_norm": 7.8198933601379395, + "learning_rate": 9.196e-07, + "loss": 1.3385, + "mean_token_accuracy": 0.9225534051656723, + "num_tokens": 14509047.0, + "step": 201 + }, + { + "epoch": 0.1260874653766629, + "grad_norm": 5.133849143981934, + "learning_rate": 9.192e-07, + "loss": 1.1601, + "mean_token_accuracy": 0.9136953912675381, + "num_tokens": 14575534.0, + "step": 202 + }, + { + "epoch": 0.1267116607498147, + "grad_norm": 9.62126636505127, + "learning_rate": 9.187999999999999e-07, + "loss": 1.4671, + "mean_token_accuracy": 0.923810213804245, + "num_tokens": 14652889.0, + "step": 203 + }, + { + "epoch": 0.1273358561229665, + "grad_norm": 7.788510799407959, + "learning_rate": 9.184e-07, + "loss": 1.3339, + "mean_token_accuracy": 0.9229598566889763, + "num_tokens": 14727352.0, + "step": 204 + }, + { + "epoch": 0.12796005149611828, + "grad_norm": 1.0009371042251587, + "learning_rate": 9.18e-07, + "loss": 1.2681, + "mean_token_accuracy": 0.9171731658279896, + "num_tokens": 14799568.0, + "step": 205 + }, + { + "epoch": 0.12858424686927009, + "grad_norm": 7.367977619171143, + "learning_rate": 9.175999999999999e-07, + "loss": 1.2686, + "mean_token_accuracy": 0.9179406464099884, + "num_tokens": 14873349.0, + "step": 206 + }, + { + "epoch": 0.12920844224242187, + "grad_norm": 8.009611129760742, + "learning_rate": 9.172e-07, + "loss": 1.2355, + "mean_token_accuracy": 0.9173550568521023, + "num_tokens": 14941765.0, + "step": 207 + }, + { + "epoch": 0.12983263761557368, + "grad_norm": 0.28489747643470764, + "learning_rate": 9.168e-07, + "loss": 1.1213, + "mean_token_accuracy": 0.9089673720300198, + "num_tokens": 15010336.0, + "step": 208 + }, + { + "epoch": 0.13045683298872546, + "grad_norm": 5.03115177154541, + "learning_rate": 9.163999999999999e-07, + "loss": 1.195, + "mean_token_accuracy": 0.917554147541523, + "num_tokens": 15077965.0, + "step": 209 + }, + { + "epoch": 0.13108102836187727, + "grad_norm": 5.094935894012451, + "learning_rate": 9.16e-07, + "loss": 1.2757, + "mean_token_accuracy": 0.9190593808889389, + "num_tokens": 15151565.0, + "step": 210 + }, + { + "epoch": 0.13170522373502908, + "grad_norm": 2.76515531539917, + "learning_rate": 9.156e-07, + "loss": 1.1963, + "mean_token_accuracy": 0.9166045114398003, + "num_tokens": 15221738.0, + "step": 211 + }, + { + "epoch": 0.13232941910818086, + "grad_norm": 10.110687255859375, + "learning_rate": 9.151999999999999e-07, + "loss": 1.2716, + "mean_token_accuracy": 0.9204342179000378, + "num_tokens": 15295424.0, + "step": 212 + }, + { + "epoch": 0.13295361448133267, + "grad_norm": 5.2001261711120605, + "learning_rate": 9.147999999999999e-07, + "loss": 1.2542, + "mean_token_accuracy": 0.9178045317530632, + "num_tokens": 15368527.0, + "step": 213 + }, + { + "epoch": 0.13357780985448445, + "grad_norm": 5.049051761627197, + "learning_rate": 9.144e-07, + "loss": 1.2213, + "mean_token_accuracy": 0.9194723777472973, + "num_tokens": 15443255.0, + "step": 214 + }, + { + "epoch": 0.13420200522763626, + "grad_norm": 7.801861763000488, + "learning_rate": 9.14e-07, + "loss": 1.3295, + "mean_token_accuracy": 0.9262109063565731, + "num_tokens": 15519506.0, + "step": 215 + }, + { + "epoch": 0.13482620060078804, + "grad_norm": 7.463664531707764, + "learning_rate": 9.135999999999999e-07, + "loss": 1.1187, + "mean_token_accuracy": 0.918974194675684, + "num_tokens": 15587685.0, + "step": 216 + }, + { + "epoch": 0.13545039597393985, + "grad_norm": 7.7051568031311035, + "learning_rate": 9.132e-07, + "loss": 1.1218, + "mean_token_accuracy": 0.9162348955869675, + "num_tokens": 15658562.0, + "step": 217 + }, + { + "epoch": 0.13607459134709163, + "grad_norm": 4.836665630340576, + "learning_rate": 9.127999999999999e-07, + "loss": 0.9558, + "mean_token_accuracy": 0.9027752466499805, + "num_tokens": 15722096.0, + "step": 218 + }, + { + "epoch": 0.13669878672024344, + "grad_norm": 10.028327941894531, + "learning_rate": 9.123999999999999e-07, + "loss": 1.2193, + "mean_token_accuracy": 0.9179589003324509, + "num_tokens": 15797169.0, + "step": 219 + }, + { + "epoch": 0.13732298209339525, + "grad_norm": 6.999484062194824, + "learning_rate": 9.12e-07, + "loss": 1.0519, + "mean_token_accuracy": 0.909615945070982, + "num_tokens": 15866145.0, + "step": 220 + }, + { + "epoch": 0.13794717746654703, + "grad_norm": 2.3297009468078613, + "learning_rate": 9.115999999999999e-07, + "loss": 1.0906, + "mean_token_accuracy": 0.9236158840358257, + "num_tokens": 15935429.0, + "step": 221 + }, + { + "epoch": 0.13857137283969884, + "grad_norm": 7.299313068389893, + "learning_rate": 9.112e-07, + "loss": 1.099, + "mean_token_accuracy": 0.9201101921498775, + "num_tokens": 16008366.0, + "step": 222 + }, + { + "epoch": 0.13919556821285062, + "grad_norm": 4.8594746589660645, + "learning_rate": 9.108e-07, + "loss": 1.0927, + "mean_token_accuracy": 0.91213034465909, + "num_tokens": 16078781.0, + "step": 223 + }, + { + "epoch": 0.13981976358600243, + "grad_norm": 9.167181015014648, + "learning_rate": 9.103999999999999e-07, + "loss": 1.1846, + "mean_token_accuracy": 0.9285093583166599, + "num_tokens": 16157560.0, + "step": 224 + }, + { + "epoch": 0.1404439589591542, + "grad_norm": 7.562880992889404, + "learning_rate": 9.1e-07, + "loss": 1.0718, + "mean_token_accuracy": 0.9226764664053917, + "num_tokens": 16229833.0, + "step": 225 + }, + { + "epoch": 0.14106815433230602, + "grad_norm": 7.0906500816345215, + "learning_rate": 9.095999999999999e-07, + "loss": 1.0585, + "mean_token_accuracy": 0.915476281195879, + "num_tokens": 16300263.0, + "step": 226 + }, + { + "epoch": 0.1416923497054578, + "grad_norm": 2.385265588760376, + "learning_rate": 9.092e-07, + "loss": 1.0082, + "mean_token_accuracy": 0.9205562435090542, + "num_tokens": 16369556.0, + "step": 227 + }, + { + "epoch": 0.1423165450786096, + "grad_norm": 6.672084331512451, + "learning_rate": 9.088e-07, + "loss": 1.079, + "mean_token_accuracy": 0.9254900999367237, + "num_tokens": 16441489.0, + "step": 228 + }, + { + "epoch": 0.1429407404517614, + "grad_norm": 4.2808518409729, + "learning_rate": 9.084e-07, + "loss": 0.9808, + "mean_token_accuracy": 0.9142305105924606, + "num_tokens": 16510117.0, + "step": 229 + }, + { + "epoch": 0.1435649358249132, + "grad_norm": 7.262923717498779, + "learning_rate": 9.08e-07, + "loss": 1.1066, + "mean_token_accuracy": 0.9294875040650368, + "num_tokens": 16586050.0, + "step": 230 + }, + { + "epoch": 0.144189131198065, + "grad_norm": 2.315251350402832, + "learning_rate": 9.075999999999999e-07, + "loss": 1.0402, + "mean_token_accuracy": 0.9211455695331097, + "num_tokens": 16658076.0, + "step": 231 + }, + { + "epoch": 0.1448133265712168, + "grad_norm": 4.604584217071533, + "learning_rate": 9.072e-07, + "loss": 0.8917, + "mean_token_accuracy": 0.9137470684945583, + "num_tokens": 16725015.0, + "step": 232 + }, + { + "epoch": 0.1454375219443686, + "grad_norm": 6.627901554107666, + "learning_rate": 9.068e-07, + "loss": 1.0017, + "mean_token_accuracy": 0.9179323613643646, + "num_tokens": 16796477.0, + "step": 233 + }, + { + "epoch": 0.14606171731752038, + "grad_norm": 7.2127227783203125, + "learning_rate": 9.063999999999999e-07, + "loss": 1.0481, + "mean_token_accuracy": 0.9238373003900051, + "num_tokens": 16873535.0, + "step": 234 + }, + { + "epoch": 0.1466859126906722, + "grad_norm": 6.750350475311279, + "learning_rate": 9.06e-07, + "loss": 1.0132, + "mean_token_accuracy": 0.9286116436123848, + "num_tokens": 16946255.0, + "step": 235 + }, + { + "epoch": 0.14731010806382397, + "grad_norm": 6.974132061004639, + "learning_rate": 9.056e-07, + "loss": 0.985, + "mean_token_accuracy": 0.9234875813126564, + "num_tokens": 17017960.0, + "step": 236 + }, + { + "epoch": 0.14793430343697578, + "grad_norm": 6.718789100646973, + "learning_rate": 9.051999999999999e-07, + "loss": 1.0101, + "mean_token_accuracy": 0.9212662689387798, + "num_tokens": 17092320.0, + "step": 237 + }, + { + "epoch": 0.14855849881012756, + "grad_norm": 2.276811361312866, + "learning_rate": 9.048e-07, + "loss": 0.8876, + "mean_token_accuracy": 0.9173206947743893, + "num_tokens": 17161200.0, + "step": 238 + }, + { + "epoch": 0.14918269418327937, + "grad_norm": 4.366613864898682, + "learning_rate": 9.044e-07, + "loss": 0.8801, + "mean_token_accuracy": 0.9057007618248463, + "num_tokens": 17227939.0, + "step": 239 + }, + { + "epoch": 0.14980688955643115, + "grad_norm": 6.607369422912598, + "learning_rate": 9.039999999999999e-07, + "loss": 1.0189, + "mean_token_accuracy": 0.9282168634235859, + "num_tokens": 17306041.0, + "step": 240 + }, + { + "epoch": 0.15043108492958296, + "grad_norm": 4.232856273651123, + "learning_rate": 9.035999999999999e-07, + "loss": 0.8925, + "mean_token_accuracy": 0.9180495738983154, + "num_tokens": 17375044.0, + "step": 241 + }, + { + "epoch": 0.15105528030273477, + "grad_norm": 4.2526044845581055, + "learning_rate": 9.032e-07, + "loss": 0.9413, + "mean_token_accuracy": 0.920606717467308, + "num_tokens": 17447118.0, + "step": 242 + }, + { + "epoch": 0.15167947567588655, + "grad_norm": 2.206423044204712, + "learning_rate": 9.028e-07, + "loss": 0.9194, + "mean_token_accuracy": 0.9231000654399395, + "num_tokens": 17520541.0, + "step": 243 + }, + { + "epoch": 0.15230367104903836, + "grad_norm": 6.550907135009766, + "learning_rate": 9.023999999999999e-07, + "loss": 0.8887, + "mean_token_accuracy": 0.9143834039568901, + "num_tokens": 17593508.0, + "step": 244 + }, + { + "epoch": 0.15292786642219014, + "grad_norm": 4.179169654846191, + "learning_rate": 9.02e-07, + "loss": 0.9158, + "mean_token_accuracy": 0.925802793353796, + "num_tokens": 17669670.0, + "step": 245 + }, + { + "epoch": 0.15355206179534195, + "grad_norm": 2.0171589851379395, + "learning_rate": 9.015999999999999e-07, + "loss": 0.8846, + "mean_token_accuracy": 0.9246592856943607, + "num_tokens": 17743445.0, + "step": 246 + }, + { + "epoch": 0.15417625716849373, + "grad_norm": 6.173559665679932, + "learning_rate": 9.011999999999999e-07, + "loss": 0.8851, + "mean_token_accuracy": 0.9320320673286915, + "num_tokens": 17818727.0, + "step": 247 + }, + { + "epoch": 0.15480045254164554, + "grad_norm": 6.248245716094971, + "learning_rate": 9.008e-07, + "loss": 0.7844, + "mean_token_accuracy": 0.912699606269598, + "num_tokens": 17885175.0, + "step": 248 + }, + { + "epoch": 0.15542464791479732, + "grad_norm": 2.034191131591797, + "learning_rate": 9.004e-07, + "loss": 0.8611, + "mean_token_accuracy": 0.9232048504054546, + "num_tokens": 17958917.0, + "step": 249 + }, + { + "epoch": 0.15604884328794913, + "grad_norm": 4.073791027069092, + "learning_rate": 9e-07, + "loss": 0.8231, + "mean_token_accuracy": 0.9232704900205135, + "num_tokens": 18031523.0, + "step": 250 + }, + { + "epoch": 0.15667303866110094, + "grad_norm": 6.003040313720703, + "learning_rate": 8.995999999999999e-07, + "loss": 0.8051, + "mean_token_accuracy": 0.9214721880853176, + "num_tokens": 18101157.0, + "step": 251 + }, + { + "epoch": 0.15729723403425272, + "grad_norm": 7.990306854248047, + "learning_rate": 8.992e-07, + "loss": 0.8663, + "mean_token_accuracy": 0.9240248017013073, + "num_tokens": 18178470.0, + "step": 252 + }, + { + "epoch": 0.15792142940740453, + "grad_norm": 7.7937116622924805, + "learning_rate": 8.988e-07, + "loss": 0.8144, + "mean_token_accuracy": 0.9162204191088676, + "num_tokens": 18251554.0, + "step": 253 + }, + { + "epoch": 0.1585456247805563, + "grad_norm": 4.021476745605469, + "learning_rate": 8.983999999999999e-07, + "loss": 0.7996, + "mean_token_accuracy": 0.917805690318346, + "num_tokens": 18322539.0, + "step": 254 + }, + { + "epoch": 0.15916982015370812, + "grad_norm": 8.010640144348145, + "learning_rate": 8.98e-07, + "loss": 0.7645, + "mean_token_accuracy": 0.9174820892512798, + "num_tokens": 18391936.0, + "step": 255 + }, + { + "epoch": 0.1597940155268599, + "grad_norm": 4.005990028381348, + "learning_rate": 8.975999999999999e-07, + "loss": 0.8062, + "mean_token_accuracy": 0.9221107959747314, + "num_tokens": 18463522.0, + "step": 256 + }, + { + "epoch": 0.1604182109000117, + "grad_norm": 3.8430492877960205, + "learning_rate": 8.972e-07, + "loss": 0.7543, + "mean_token_accuracy": 0.9232521802186966, + "num_tokens": 18535494.0, + "step": 257 + }, + { + "epoch": 0.1610424062731635, + "grad_norm": 5.738049507141113, + "learning_rate": 8.968e-07, + "loss": 0.7607, + "mean_token_accuracy": 0.9231390170753002, + "num_tokens": 18606448.0, + "step": 258 + }, + { + "epoch": 0.1616666016463153, + "grad_norm": 3.8291401863098145, + "learning_rate": 8.963999999999999e-07, + "loss": 0.782, + "mean_token_accuracy": 0.9246168956160545, + "num_tokens": 18678690.0, + "step": 259 + }, + { + "epoch": 0.16229079701946708, + "grad_norm": 3.7632787227630615, + "learning_rate": 8.96e-07, + "loss": 0.8008, + "mean_token_accuracy": 0.9147759675979614, + "num_tokens": 18750838.0, + "step": 260 + }, + { + "epoch": 0.1629149923926189, + "grad_norm": 2.0309298038482666, + "learning_rate": 8.955999999999999e-07, + "loss": 0.7, + "mean_token_accuracy": 0.906768687069416, + "num_tokens": 18816296.0, + "step": 261 + }, + { + "epoch": 0.1635391877657707, + "grad_norm": 3.681788921356201, + "learning_rate": 8.951999999999999e-07, + "loss": 0.7258, + "mean_token_accuracy": 0.9221327230334282, + "num_tokens": 18886104.0, + "step": 262 + }, + { + "epoch": 0.16416338313892248, + "grad_norm": 5.3626837730407715, + "learning_rate": 8.948e-07, + "loss": 0.6993, + "mean_token_accuracy": 0.9221790917217731, + "num_tokens": 18955722.0, + "step": 263 + }, + { + "epoch": 0.1647875785120743, + "grad_norm": 5.501428604125977, + "learning_rate": 8.944e-07, + "loss": 0.7254, + "mean_token_accuracy": 0.9257684424519539, + "num_tokens": 19029819.0, + "step": 264 + }, + { + "epoch": 0.16541177388522607, + "grad_norm": 3.6627449989318848, + "learning_rate": 8.939999999999999e-07, + "loss": 0.7309, + "mean_token_accuracy": 0.9193191789090633, + "num_tokens": 19102504.0, + "step": 265 + }, + { + "epoch": 0.16603596925837788, + "grad_norm": 0.2712026536464691, + "learning_rate": 8.935999999999999e-07, + "loss": 0.7061, + "mean_token_accuracy": 0.9194264896214008, + "num_tokens": 19173251.0, + "step": 266 + }, + { + "epoch": 0.16666016463152966, + "grad_norm": 5.12305212020874, + "learning_rate": 8.932e-07, + "loss": 0.7359, + "mean_token_accuracy": 0.9240240082144737, + "num_tokens": 19249509.0, + "step": 267 + }, + { + "epoch": 0.16728436000468147, + "grad_norm": 3.4386534690856934, + "learning_rate": 8.928e-07, + "loss": 0.692, + "mean_token_accuracy": 0.9101699814200401, + "num_tokens": 19318156.0, + "step": 268 + }, + { + "epoch": 0.16790855537783325, + "grad_norm": 5.161988735198975, + "learning_rate": 8.923999999999999e-07, + "loss": 0.7102, + "mean_token_accuracy": 0.9245063103735447, + "num_tokens": 19391686.0, + "step": 269 + }, + { + "epoch": 0.16853275075098506, + "grad_norm": 3.3804566860198975, + "learning_rate": 8.92e-07, + "loss": 0.6881, + "mean_token_accuracy": 0.9200237616896629, + "num_tokens": 19466118.0, + "step": 270 + }, + { + "epoch": 0.16915694612413684, + "grad_norm": 3.553159475326538, + "learning_rate": 8.915999999999999e-07, + "loss": 0.6936, + "mean_token_accuracy": 0.9296778216958046, + "num_tokens": 19541364.0, + "step": 271 + }, + { + "epoch": 0.16978114149728865, + "grad_norm": 1.7626819610595703, + "learning_rate": 8.911999999999999e-07, + "loss": 0.6934, + "mean_token_accuracy": 0.9221810065209866, + "num_tokens": 19613848.0, + "step": 272 + }, + { + "epoch": 0.17040533687044046, + "grad_norm": 1.6031099557876587, + "learning_rate": 8.908e-07, + "loss": 0.686, + "mean_token_accuracy": 0.9166489988565445, + "num_tokens": 19685477.0, + "step": 273 + }, + { + "epoch": 0.17102953224359224, + "grad_norm": 3.2471511363983154, + "learning_rate": 8.904e-07, + "loss": 0.6502, + "mean_token_accuracy": 0.9268861003220081, + "num_tokens": 19756640.0, + "step": 274 + }, + { + "epoch": 0.17165372761674405, + "grad_norm": 6.707427501678467, + "learning_rate": 8.9e-07, + "loss": 0.6561, + "mean_token_accuracy": 0.9250268638134003, + "num_tokens": 19828741.0, + "step": 275 + }, + { + "epoch": 0.17227792298989583, + "grad_norm": 3.2816786766052246, + "learning_rate": 8.895999999999999e-07, + "loss": 0.6568, + "mean_token_accuracy": 0.9165336675941944, + "num_tokens": 19900364.0, + "step": 276 + }, + { + "epoch": 0.17290211836304764, + "grad_norm": 6.137754440307617, + "learning_rate": 8.892e-07, + "loss": 0.6575, + "mean_token_accuracy": 0.9207403063774109, + "num_tokens": 19972956.0, + "step": 277 + }, + { + "epoch": 0.17352631373619942, + "grad_norm": 4.800975799560547, + "learning_rate": 8.888e-07, + "loss": 0.6715, + "mean_token_accuracy": 0.9269882887601852, + "num_tokens": 20048223.0, + "step": 278 + }, + { + "epoch": 0.17415050910935123, + "grad_norm": 3.0919814109802246, + "learning_rate": 8.883999999999999e-07, + "loss": 0.6271, + "mean_token_accuracy": 0.9199992828071117, + "num_tokens": 20121461.0, + "step": 279 + }, + { + "epoch": 0.174774704482503, + "grad_norm": 4.76736307144165, + "learning_rate": 8.88e-07, + "loss": 0.6244, + "mean_token_accuracy": 0.9144281633198261, + "num_tokens": 20189927.0, + "step": 280 + }, + { + "epoch": 0.17539889985565482, + "grad_norm": 2.987694263458252, + "learning_rate": 8.875999999999999e-07, + "loss": 0.6483, + "mean_token_accuracy": 0.9280169196426868, + "num_tokens": 20266049.0, + "step": 281 + }, + { + "epoch": 0.1760230952288066, + "grad_norm": 4.727084636688232, + "learning_rate": 8.872e-07, + "loss": 0.6123, + "mean_token_accuracy": 0.9181976951658726, + "num_tokens": 20334358.0, + "step": 282 + }, + { + "epoch": 0.1766472906019584, + "grad_norm": 5.995877742767334, + "learning_rate": 8.868e-07, + "loss": 0.6263, + "mean_token_accuracy": 0.9246489144861698, + "num_tokens": 20407439.0, + "step": 283 + }, + { + "epoch": 0.17727148597511022, + "grad_norm": 4.42982292175293, + "learning_rate": 8.863999999999999e-07, + "loss": 0.6197, + "mean_token_accuracy": 0.9229095615446568, + "num_tokens": 20479706.0, + "step": 284 + }, + { + "epoch": 0.177895681348262, + "grad_norm": 4.397432804107666, + "learning_rate": 8.86e-07, + "loss": 0.6677, + "mean_token_accuracy": 0.9228343367576599, + "num_tokens": 20556437.0, + "step": 285 + }, + { + "epoch": 0.1785198767214138, + "grad_norm": 4.311978340148926, + "learning_rate": 8.856e-07, + "loss": 0.6148, + "mean_token_accuracy": 0.9273019395768642, + "num_tokens": 20629867.0, + "step": 286 + }, + { + "epoch": 0.1791440720945656, + "grad_norm": 2.751955270767212, + "learning_rate": 8.851999999999999e-07, + "loss": 0.6209, + "mean_token_accuracy": 0.9213513396680355, + "num_tokens": 20703095.0, + "step": 287 + }, + { + "epoch": 0.1797682674677174, + "grad_norm": 4.102765083312988, + "learning_rate": 8.848e-07, + "loss": 0.6343, + "mean_token_accuracy": 0.9197850301861763, + "num_tokens": 20780530.0, + "step": 288 + }, + { + "epoch": 0.18039246284086918, + "grad_norm": 4.114978313446045, + "learning_rate": 8.844e-07, + "loss": 0.6205, + "mean_token_accuracy": 0.9163020439445972, + "num_tokens": 20852090.0, + "step": 289 + }, + { + "epoch": 0.181016658214021, + "grad_norm": 2.690525531768799, + "learning_rate": 8.839999999999999e-07, + "loss": 0.5775, + "mean_token_accuracy": 0.9212459437549114, + "num_tokens": 20924817.0, + "step": 290 + }, + { + "epoch": 0.18164085358717277, + "grad_norm": 2.542531967163086, + "learning_rate": 8.836e-07, + "loss": 0.5937, + "mean_token_accuracy": 0.9168826639652252, + "num_tokens": 20995736.0, + "step": 291 + }, + { + "epoch": 0.18226504896032458, + "grad_norm": 3.8409745693206787, + "learning_rate": 8.832e-07, + "loss": 0.5717, + "mean_token_accuracy": 0.9313341379165649, + "num_tokens": 21072521.0, + "step": 292 + }, + { + "epoch": 0.1828892443334764, + "grad_norm": 2.5837008953094482, + "learning_rate": 8.827999999999999e-07, + "loss": 0.5705, + "mean_token_accuracy": 0.9210564531385899, + "num_tokens": 21144339.0, + "step": 293 + }, + { + "epoch": 0.18351343970662817, + "grad_norm": 1.3285531997680664, + "learning_rate": 8.823999999999999e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.9195240288972855, + "num_tokens": 21214517.0, + "step": 294 + }, + { + "epoch": 0.18413763507977998, + "grad_norm": 2.5066184997558594, + "learning_rate": 8.82e-07, + "loss": 0.5684, + "mean_token_accuracy": 0.9227254241704941, + "num_tokens": 21287375.0, + "step": 295 + }, + { + "epoch": 0.18476183045293176, + "grad_norm": 3.7103333473205566, + "learning_rate": 8.816000000000001e-07, + "loss": 0.5736, + "mean_token_accuracy": 0.9259331077337265, + "num_tokens": 21364221.0, + "step": 296 + }, + { + "epoch": 0.18538602582608357, + "grad_norm": 3.6857786178588867, + "learning_rate": 8.811999999999999e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.9199340753257275, + "num_tokens": 21433773.0, + "step": 297 + }, + { + "epoch": 0.18601022119923535, + "grad_norm": 2.4223549365997314, + "learning_rate": 8.808e-07, + "loss": 0.5617, + "mean_token_accuracy": 0.9241475202143192, + "num_tokens": 21505557.0, + "step": 298 + }, + { + "epoch": 0.18663441657238716, + "grad_norm": 3.512373924255371, + "learning_rate": 8.804e-07, + "loss": 0.5554, + "mean_token_accuracy": 0.9271243251860142, + "num_tokens": 21578997.0, + "step": 299 + }, + { + "epoch": 0.18725861194553894, + "grad_norm": 1.1658852100372314, + "learning_rate": 8.799999999999999e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.91535235196352, + "num_tokens": 21649357.0, + "step": 300 + }, + { + "epoch": 0.18788280731869075, + "grad_norm": 1.2288845777511597, + "learning_rate": 8.796e-07, + "loss": 0.5274, + "mean_token_accuracy": 0.9191324077546597, + "num_tokens": 21718787.0, + "step": 301 + }, + { + "epoch": 0.18850700269184253, + "grad_norm": 1.1937166452407837, + "learning_rate": 8.792e-07, + "loss": 0.5581, + "mean_token_accuracy": 0.9143362790346146, + "num_tokens": 21789343.0, + "step": 302 + }, + { + "epoch": 0.18913119806499434, + "grad_norm": 3.4846014976501465, + "learning_rate": 8.788e-07, + "loss": 0.5822, + "mean_token_accuracy": 0.911719061434269, + "num_tokens": 21860110.0, + "step": 303 + }, + { + "epoch": 0.18975539343814615, + "grad_norm": 4.586836814880371, + "learning_rate": 8.783999999999999e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.9314946755766869, + "num_tokens": 21934618.0, + "step": 304 + }, + { + "epoch": 0.19037958881129793, + "grad_norm": 3.182224750518799, + "learning_rate": 8.78e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.9216264933347702, + "num_tokens": 22010108.0, + "step": 305 + }, + { + "epoch": 0.19100378418444974, + "grad_norm": 3.312130928039551, + "learning_rate": 8.776e-07, + "loss": 0.5419, + "mean_token_accuracy": 0.9131336398422718, + "num_tokens": 22078727.0, + "step": 306 + }, + { + "epoch": 0.19162797955760152, + "grad_norm": 1.0294653177261353, + "learning_rate": 8.771999999999999e-07, + "loss": 0.536, + "mean_token_accuracy": 0.9182093925774097, + "num_tokens": 22149618.0, + "step": 307 + }, + { + "epoch": 0.19225217493075333, + "grad_norm": 2.1827099323272705, + "learning_rate": 8.768e-07, + "loss": 0.528, + "mean_token_accuracy": 0.9168641194701195, + "num_tokens": 22219669.0, + "step": 308 + }, + { + "epoch": 0.19287637030390511, + "grad_norm": 2.200655460357666, + "learning_rate": 8.763999999999999e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.9270654171705246, + "num_tokens": 22291908.0, + "step": 309 + }, + { + "epoch": 0.19350056567705692, + "grad_norm": 4.376758098602295, + "learning_rate": 8.76e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.9147041626274586, + "num_tokens": 22359363.0, + "step": 310 + }, + { + "epoch": 0.1941247610502087, + "grad_norm": 1.9491456747055054, + "learning_rate": 8.756e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.9250685833394527, + "num_tokens": 22432494.0, + "step": 311 + }, + { + "epoch": 0.19474895642336051, + "grad_norm": 0.30999302864074707, + "learning_rate": 8.751999999999999e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.9207328855991364, + "num_tokens": 22503492.0, + "step": 312 + }, + { + "epoch": 0.1953731517965123, + "grad_norm": 1.0560754537582397, + "learning_rate": 8.748e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.9238898158073425, + "num_tokens": 22575416.0, + "step": 313 + }, + { + "epoch": 0.1959973471696641, + "grad_norm": 2.971607208251953, + "learning_rate": 8.743999999999999e-07, + "loss": 0.533, + "mean_token_accuracy": 0.9224493429064751, + "num_tokens": 22649065.0, + "step": 314 + }, + { + "epoch": 0.1966215425428159, + "grad_norm": 1.9408035278320312, + "learning_rate": 8.739999999999999e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.925947979092598, + "num_tokens": 22724399.0, + "step": 315 + }, + { + "epoch": 0.1972457379159677, + "grad_norm": 1.9919980764389038, + "learning_rate": 8.736e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.9135693944990635, + "num_tokens": 22795990.0, + "step": 316 + }, + { + "epoch": 0.1978699332891195, + "grad_norm": 1.16225266456604, + "learning_rate": 8.732e-07, + "loss": 0.515, + "mean_token_accuracy": 0.9201497547328472, + "num_tokens": 22867292.0, + "step": 317 + }, + { + "epoch": 0.19849412866227129, + "grad_norm": 1.9718466997146606, + "learning_rate": 8.728e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.9120406582951546, + "num_tokens": 22934474.0, + "step": 318 + }, + { + "epoch": 0.1991183240354231, + "grad_norm": 1.9588881731033325, + "learning_rate": 8.723999999999999e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.9219380579888821, + "num_tokens": 23007530.0, + "step": 319 + }, + { + "epoch": 0.19974251940857488, + "grad_norm": 0.9395773410797119, + "learning_rate": 8.72e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.9167209789156914, + "num_tokens": 23079046.0, + "step": 320 + }, + { + "epoch": 0.20036671478172668, + "grad_norm": 0.9463289380073547, + "learning_rate": 8.716e-07, + "loss": 0.5155, + "mean_token_accuracy": 0.9193439371883869, + "num_tokens": 23149666.0, + "step": 321 + }, + { + "epoch": 0.20099091015487847, + "grad_norm": 0.8668283224105835, + "learning_rate": 8.711999999999999e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.9267007894814014, + "num_tokens": 23221965.0, + "step": 322 + }, + { + "epoch": 0.20161510552803028, + "grad_norm": 0.9823909401893616, + "learning_rate": 8.708e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.9271320924162865, + "num_tokens": 23294488.0, + "step": 323 + }, + { + "epoch": 0.20223930090118206, + "grad_norm": 0.961146354675293, + "learning_rate": 8.704e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.9141620211303234, + "num_tokens": 23361710.0, + "step": 324 + }, + { + "epoch": 0.20286349627433387, + "grad_norm": 1.7309682369232178, + "learning_rate": 8.699999999999999e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.9189098365604877, + "num_tokens": 23437107.0, + "step": 325 + }, + { + "epoch": 0.20348769164748567, + "grad_norm": 0.912071943283081, + "learning_rate": 8.696e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.9214383848011494, + "num_tokens": 23504787.0, + "step": 326 + }, + { + "epoch": 0.20411188702063746, + "grad_norm": 0.9424958825111389, + "learning_rate": 8.692e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.9286565147340298, + "num_tokens": 23577551.0, + "step": 327 + }, + { + "epoch": 0.20473608239378926, + "grad_norm": 0.9649046063423157, + "learning_rate": 8.687999999999999e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.9243063032627106, + "num_tokens": 23651549.0, + "step": 328 + }, + { + "epoch": 0.20536027776694105, + "grad_norm": 1.6607611179351807, + "learning_rate": 8.683999999999999e-07, + "loss": 0.4811, + "mean_token_accuracy": 0.9211996085941792, + "num_tokens": 23724050.0, + "step": 329 + }, + { + "epoch": 0.20598447314009286, + "grad_norm": 0.9780636429786682, + "learning_rate": 8.68e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.920963428914547, + "num_tokens": 23793356.0, + "step": 330 + }, + { + "epoch": 0.20660866851324464, + "grad_norm": 2.5218260288238525, + "learning_rate": 8.676e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.9156523831188679, + "num_tokens": 23865901.0, + "step": 331 + }, + { + "epoch": 0.20723286388639645, + "grad_norm": 1.7883888483047485, + "learning_rate": 8.671999999999999e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.9203068241477013, + "num_tokens": 23933168.0, + "step": 332 + }, + { + "epoch": 0.20785705925954823, + "grad_norm": 0.8361603021621704, + "learning_rate": 8.668e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.9065203927457333, + "num_tokens": 24002835.0, + "step": 333 + }, + { + "epoch": 0.20848125463270004, + "grad_norm": 0.912175714969635, + "learning_rate": 8.663999999999999e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.923988152295351, + "num_tokens": 24075843.0, + "step": 334 + }, + { + "epoch": 0.20910545000585185, + "grad_norm": 2.5694775581359863, + "learning_rate": 8.659999999999999e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.9196614623069763, + "num_tokens": 24150074.0, + "step": 335 + }, + { + "epoch": 0.20972964537900363, + "grad_norm": 2.5357964038848877, + "learning_rate": 8.656e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.917326096445322, + "num_tokens": 24222073.0, + "step": 336 + }, + { + "epoch": 0.21035384075215544, + "grad_norm": 2.5211615562438965, + "learning_rate": 8.651999999999999e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.9228744432330132, + "num_tokens": 24295899.0, + "step": 337 + }, + { + "epoch": 0.21097803612530722, + "grad_norm": 0.8773064613342285, + "learning_rate": 8.648e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.9232124611735344, + "num_tokens": 24370875.0, + "step": 338 + }, + { + "epoch": 0.21160223149845903, + "grad_norm": 0.7836575508117676, + "learning_rate": 8.643999999999999e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.925804577767849, + "num_tokens": 24446073.0, + "step": 339 + }, + { + "epoch": 0.2122264268716108, + "grad_norm": 2.4918482303619385, + "learning_rate": 8.639999999999999e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.9221850037574768, + "num_tokens": 24519043.0, + "step": 340 + }, + { + "epoch": 0.21285062224476262, + "grad_norm": 0.8078194260597229, + "learning_rate": 8.636e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.9222163297235966, + "num_tokens": 24591558.0, + "step": 341 + }, + { + "epoch": 0.2134748176179144, + "grad_norm": 1.670353889465332, + "learning_rate": 8.632e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.9207929521799088, + "num_tokens": 24661489.0, + "step": 342 + }, + { + "epoch": 0.2140990129910662, + "grad_norm": 1.6045187711715698, + "learning_rate": 8.628e-07, + "loss": 0.4317, + "mean_token_accuracy": 0.9321544617414474, + "num_tokens": 24735383.0, + "step": 343 + }, + { + "epoch": 0.214723208364218, + "grad_norm": 1.4934444427490234, + "learning_rate": 8.624e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.91195372864604, + "num_tokens": 24808092.0, + "step": 344 + }, + { + "epoch": 0.2153474037373698, + "grad_norm": 0.8003431558609009, + "learning_rate": 8.62e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.9115734659135342, + "num_tokens": 24874295.0, + "step": 345 + }, + { + "epoch": 0.2159715991105216, + "grad_norm": 2.375620126724243, + "learning_rate": 8.616e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.9086932577192783, + "num_tokens": 24940399.0, + "step": 346 + }, + { + "epoch": 0.2165957944836734, + "grad_norm": 3.018425703048706, + "learning_rate": 8.611999999999999e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.9146109260618687, + "num_tokens": 25010896.0, + "step": 347 + }, + { + "epoch": 0.2172199898568252, + "grad_norm": 1.399186372756958, + "learning_rate": 8.608e-07, + "loss": 0.4331, + "mean_token_accuracy": 0.9315755292773247, + "num_tokens": 25085233.0, + "step": 348 + }, + { + "epoch": 0.21784418522997698, + "grad_norm": 0.8286827206611633, + "learning_rate": 8.604000000000001e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.9212476573884487, + "num_tokens": 25150827.0, + "step": 349 + }, + { + "epoch": 0.2184683806031288, + "grad_norm": 0.7845478653907776, + "learning_rate": 8.599999999999999e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.9202132783830166, + "num_tokens": 25217061.0, + "step": 350 + }, + { + "epoch": 0.21909257597628057, + "grad_norm": 0.7909486293792725, + "learning_rate": 8.596e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.9279148392379284, + "num_tokens": 25291821.0, + "step": 351 + }, + { + "epoch": 0.21971677134943238, + "grad_norm": 1.524327278137207, + "learning_rate": 8.592e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.9234129972755909, + "num_tokens": 25359049.0, + "step": 352 + }, + { + "epoch": 0.22034096672258416, + "grad_norm": 2.1286449432373047, + "learning_rate": 8.587999999999999e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.925106979906559, + "num_tokens": 25431227.0, + "step": 353 + }, + { + "epoch": 0.22096516209573597, + "grad_norm": 2.2779810428619385, + "learning_rate": 8.584e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.9252244904637337, + "num_tokens": 25506007.0, + "step": 354 + }, + { + "epoch": 0.22158935746888775, + "grad_norm": 1.3562296628952026, + "learning_rate": 8.58e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.9169961102306843, + "num_tokens": 25576911.0, + "step": 355 + }, + { + "epoch": 0.22221355284203956, + "grad_norm": 0.7795654535293579, + "learning_rate": 8.576e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.9218585044145584, + "num_tokens": 25645426.0, + "step": 356 + }, + { + "epoch": 0.22283774821519137, + "grad_norm": 1.5523988008499146, + "learning_rate": 8.571999999999999e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.9274989813566208, + "num_tokens": 25720071.0, + "step": 357 + }, + { + "epoch": 0.22346194358834315, + "grad_norm": 1.2857192754745483, + "learning_rate": 8.568e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.9232259094715118, + "num_tokens": 25795999.0, + "step": 358 + }, + { + "epoch": 0.22408613896149496, + "grad_norm": 2.137576103210449, + "learning_rate": 8.564e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.9243124797940254, + "num_tokens": 25867136.0, + "step": 359 + }, + { + "epoch": 0.22471033433464674, + "grad_norm": 2.458399534225464, + "learning_rate": 8.559999999999999e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.9150168560445309, + "num_tokens": 25935016.0, + "step": 360 + }, + { + "epoch": 0.22533452970779855, + "grad_norm": 1.9607969522476196, + "learning_rate": 8.556e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.923651646822691, + "num_tokens": 26011032.0, + "step": 361 + }, + { + "epoch": 0.22595872508095033, + "grad_norm": 1.3728708028793335, + "learning_rate": 8.551999999999999e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.9165203161537647, + "num_tokens": 26080969.0, + "step": 362 + }, + { + "epoch": 0.22658292045410214, + "grad_norm": 1.392662763595581, + "learning_rate": 8.548e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.9057382196187973, + "num_tokens": 26149313.0, + "step": 363 + }, + { + "epoch": 0.22720711582725392, + "grad_norm": 1.3480995893478394, + "learning_rate": 8.544e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.9191391468048096, + "num_tokens": 26219897.0, + "step": 364 + }, + { + "epoch": 0.22783131120040573, + "grad_norm": 2.04896879196167, + "learning_rate": 8.539999999999999e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.9199820160865784, + "num_tokens": 26288382.0, + "step": 365 + }, + { + "epoch": 0.2284555065735575, + "grad_norm": 1.9956952333450317, + "learning_rate": 8.536e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.9249612577259541, + "num_tokens": 26361670.0, + "step": 366 + }, + { + "epoch": 0.22907970194670932, + "grad_norm": 0.7502142190933228, + "learning_rate": 8.531999999999999e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.9189267456531525, + "num_tokens": 26433413.0, + "step": 367 + }, + { + "epoch": 0.22970389731986113, + "grad_norm": 1.308139443397522, + "learning_rate": 8.528e-07, + "loss": 0.425, + "mean_token_accuracy": 0.926760770380497, + "num_tokens": 26505506.0, + "step": 368 + }, + { + "epoch": 0.2303280926930129, + "grad_norm": 1.296675205230713, + "learning_rate": 8.524e-07, + "loss": 0.4051, + "mean_token_accuracy": 0.9225211814045906, + "num_tokens": 26575815.0, + "step": 369 + }, + { + "epoch": 0.23095228806616472, + "grad_norm": 1.9104191064834595, + "learning_rate": 8.52e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.9201551452279091, + "num_tokens": 26647797.0, + "step": 370 + }, + { + "epoch": 0.2315764834393165, + "grad_norm": 0.8089154362678528, + "learning_rate": 8.516e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.9099730663001537, + "num_tokens": 26717975.0, + "step": 371 + }, + { + "epoch": 0.2322006788124683, + "grad_norm": 1.8594608306884766, + "learning_rate": 8.511999999999999e-07, + "loss": 0.4212, + "mean_token_accuracy": 0.9235678352415562, + "num_tokens": 26792156.0, + "step": 372 + }, + { + "epoch": 0.2328248741856201, + "grad_norm": 1.2001798152923584, + "learning_rate": 8.508e-07, + "loss": 0.4173, + "mean_token_accuracy": 0.9245698302984238, + "num_tokens": 26864277.0, + "step": 373 + }, + { + "epoch": 0.2334490695587719, + "grad_norm": 1.230749249458313, + "learning_rate": 8.504e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.921494010835886, + "num_tokens": 26937479.0, + "step": 374 + }, + { + "epoch": 0.23407326493192368, + "grad_norm": 1.1909148693084717, + "learning_rate": 8.499999999999999e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.9290775023400784, + "num_tokens": 27014918.0, + "step": 375 + }, + { + "epoch": 0.2346974603050755, + "grad_norm": 1.3617513179779053, + "learning_rate": 8.496e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.9312286600470543, + "num_tokens": 27090755.0, + "step": 376 + }, + { + "epoch": 0.2353216556782273, + "grad_norm": 0.6376486420631409, + "learning_rate": 8.492e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.9228227473795414, + "num_tokens": 27162943.0, + "step": 377 + }, + { + "epoch": 0.23594585105137908, + "grad_norm": 1.1157821416854858, + "learning_rate": 8.487999999999999e-07, + "loss": 0.4072, + "mean_token_accuracy": 0.9222900941967964, + "num_tokens": 27233894.0, + "step": 378 + }, + { + "epoch": 0.2365700464245309, + "grad_norm": 0.5952740907669067, + "learning_rate": 8.484e-07, + "loss": 0.4095, + "mean_token_accuracy": 0.9275263771414757, + "num_tokens": 27306114.0, + "step": 379 + }, + { + "epoch": 0.23719424179768267, + "grad_norm": 0.5804332494735718, + "learning_rate": 8.48e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.9191073924303055, + "num_tokens": 27378710.0, + "step": 380 + }, + { + "epoch": 0.23781843717083448, + "grad_norm": 0.6052722930908203, + "learning_rate": 8.475999999999999e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.9200603328645229, + "num_tokens": 27448079.0, + "step": 381 + }, + { + "epoch": 0.23844263254398626, + "grad_norm": 1.1853963136672974, + "learning_rate": 8.471999999999999e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.914018739014864, + "num_tokens": 27518565.0, + "step": 382 + }, + { + "epoch": 0.23906682791713807, + "grad_norm": 0.6546916961669922, + "learning_rate": 8.468e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.9173280820250511, + "num_tokens": 27587846.0, + "step": 383 + }, + { + "epoch": 0.23969102329028985, + "grad_norm": 1.7882285118103027, + "learning_rate": 8.464e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.9251374267041683, + "num_tokens": 27661293.0, + "step": 384 + }, + { + "epoch": 0.24031521866344166, + "grad_norm": 2.310976266860962, + "learning_rate": 8.459999999999999e-07, + "loss": 0.4088, + "mean_token_accuracy": 0.9227562807500362, + "num_tokens": 27733527.0, + "step": 385 + }, + { + "epoch": 0.24093941403659344, + "grad_norm": 0.3753027319908142, + "learning_rate": 8.456e-07, + "loss": 0.406, + "mean_token_accuracy": 0.9240587763488293, + "num_tokens": 27809744.0, + "step": 386 + }, + { + "epoch": 0.24156360940974525, + "grad_norm": 1.1255004405975342, + "learning_rate": 8.451999999999999e-07, + "loss": 0.4027, + "mean_token_accuracy": 0.922537162899971, + "num_tokens": 27879908.0, + "step": 387 + }, + { + "epoch": 0.24218780478289706, + "grad_norm": 1.728273630142212, + "learning_rate": 8.447999999999999e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.9165510907769203, + "num_tokens": 27947722.0, + "step": 388 + }, + { + "epoch": 0.24281200015604884, + "grad_norm": 1.1392329931259155, + "learning_rate": 8.444e-07, + "loss": 0.4137, + "mean_token_accuracy": 0.9213821589946747, + "num_tokens": 28016839.0, + "step": 389 + }, + { + "epoch": 0.24343619552920065, + "grad_norm": 1.1253199577331543, + "learning_rate": 8.439999999999999e-07, + "loss": 0.4112, + "mean_token_accuracy": 0.9206866696476936, + "num_tokens": 28084621.0, + "step": 390 + }, + { + "epoch": 0.24406039090235243, + "grad_norm": 1.1136630773544312, + "learning_rate": 8.436e-07, + "loss": 0.4187, + "mean_token_accuracy": 0.9186587296426296, + "num_tokens": 28157293.0, + "step": 391 + }, + { + "epoch": 0.24468458627550424, + "grad_norm": 0.5871824622154236, + "learning_rate": 8.431999999999999e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.9158234111964703, + "num_tokens": 28227939.0, + "step": 392 + }, + { + "epoch": 0.24530878164865602, + "grad_norm": 2.1546518802642822, + "learning_rate": 8.428e-07, + "loss": 0.3839, + "mean_token_accuracy": 0.928234089165926, + "num_tokens": 28301942.0, + "step": 393 + }, + { + "epoch": 0.24593297702180783, + "grad_norm": 1.6724817752838135, + "learning_rate": 8.424e-07, + "loss": 0.4064, + "mean_token_accuracy": 0.9248273372650146, + "num_tokens": 28376719.0, + "step": 394 + }, + { + "epoch": 0.2465571723949596, + "grad_norm": 1.69247305393219, + "learning_rate": 8.419999999999999e-07, + "loss": 0.4113, + "mean_token_accuracy": 0.9238225184381008, + "num_tokens": 28453154.0, + "step": 395 + }, + { + "epoch": 0.24718136776811142, + "grad_norm": 1.1182528734207153, + "learning_rate": 8.416e-07, + "loss": 0.3865, + "mean_token_accuracy": 0.9262501187622547, + "num_tokens": 28520596.0, + "step": 396 + }, + { + "epoch": 0.2478055631412632, + "grad_norm": 0.5353150963783264, + "learning_rate": 8.411999999999999e-07, + "loss": 0.4092, + "mean_token_accuracy": 0.9224611297249794, + "num_tokens": 28589771.0, + "step": 397 + }, + { + "epoch": 0.248429758514415, + "grad_norm": 1.5561952590942383, + "learning_rate": 8.408e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.9252982772886753, + "num_tokens": 28665975.0, + "step": 398 + }, + { + "epoch": 0.24905395388756682, + "grad_norm": 0.9938348531723022, + "learning_rate": 8.404e-07, + "loss": 0.3826, + "mean_token_accuracy": 0.9210449159145355, + "num_tokens": 28735893.0, + "step": 399 + }, + { + "epoch": 0.2496781492607186, + "grad_norm": 1.2187830209732056, + "learning_rate": 8.399999999999999e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.9271294698119164, + "num_tokens": 28810621.0, + "step": 400 + }, + { + "epoch": 0.2503023446338704, + "grad_norm": 1.3881856203079224, + "learning_rate": 8.396e-07, + "loss": 0.3908, + "mean_token_accuracy": 0.9332247227430344, + "num_tokens": 28888430.0, + "step": 401 + }, + { + "epoch": 0.2509265400070222, + "grad_norm": 1.5399160385131836, + "learning_rate": 8.391999999999999e-07, + "loss": 0.4008, + "mean_token_accuracy": 0.926115620881319, + "num_tokens": 28960778.0, + "step": 402 + }, + { + "epoch": 0.251550735380174, + "grad_norm": 1.5015639066696167, + "learning_rate": 8.387999999999999e-07, + "loss": 0.38, + "mean_token_accuracy": 0.9280642941594124, + "num_tokens": 29032917.0, + "step": 403 + }, + { + "epoch": 0.2521749307533258, + "grad_norm": 0.9755170345306396, + "learning_rate": 8.384e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.9176996536552906, + "num_tokens": 29104383.0, + "step": 404 + }, + { + "epoch": 0.2527991261264776, + "grad_norm": 0.9623498916625977, + "learning_rate": 8.38e-07, + "loss": 0.3931, + "mean_token_accuracy": 0.9232301451265812, + "num_tokens": 29176946.0, + "step": 405 + }, + { + "epoch": 0.2534233214996294, + "grad_norm": 1.4719113111495972, + "learning_rate": 8.375999999999999e-07, + "loss": 0.3994, + "mean_token_accuracy": 0.9208735749125481, + "num_tokens": 29250010.0, + "step": 406 + }, + { + "epoch": 0.25404751687278115, + "grad_norm": 1.420078158378601, + "learning_rate": 8.372e-07, + "loss": 0.3847, + "mean_token_accuracy": 0.9272407628595829, + "num_tokens": 29323717.0, + "step": 407 + }, + { + "epoch": 0.254671712245933, + "grad_norm": 0.936745285987854, + "learning_rate": 8.368e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.9173033535480499, + "num_tokens": 29394139.0, + "step": 408 + }, + { + "epoch": 0.25529590761908477, + "grad_norm": 0.5124381184577942, + "learning_rate": 8.363999999999999e-07, + "loss": 0.3956, + "mean_token_accuracy": 0.9207809679210186, + "num_tokens": 29462291.0, + "step": 409 + }, + { + "epoch": 0.25592010299223655, + "grad_norm": 0.541309118270874, + "learning_rate": 8.359999999999999e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.9308284670114517, + "num_tokens": 29538759.0, + "step": 410 + }, + { + "epoch": 0.2565442983653884, + "grad_norm": 0.9743673205375671, + "learning_rate": 8.356e-07, + "loss": 0.3842, + "mean_token_accuracy": 0.9242149740457535, + "num_tokens": 29612667.0, + "step": 411 + }, + { + "epoch": 0.25716849373854017, + "grad_norm": 0.531112015247345, + "learning_rate": 8.352000000000001e-07, + "loss": 0.371, + "mean_token_accuracy": 0.9262283891439438, + "num_tokens": 29683930.0, + "step": 412 + }, + { + "epoch": 0.25779268911169195, + "grad_norm": 1.4184839725494385, + "learning_rate": 8.347999999999999e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.9264179468154907, + "num_tokens": 29756605.0, + "step": 413 + }, + { + "epoch": 0.25841688448484373, + "grad_norm": 1.4830143451690674, + "learning_rate": 8.344e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.9165854007005692, + "num_tokens": 29827260.0, + "step": 414 + }, + { + "epoch": 0.25904107985799557, + "grad_norm": 1.3852146863937378, + "learning_rate": 8.34e-07, + "loss": 0.3884, + "mean_token_accuracy": 0.9258201904594898, + "num_tokens": 29900579.0, + "step": 415 + }, + { + "epoch": 0.25966527523114735, + "grad_norm": 1.811882734298706, + "learning_rate": 8.335999999999999e-07, + "loss": 0.3795, + "mean_token_accuracy": 0.92490029707551, + "num_tokens": 29975594.0, + "step": 416 + }, + { + "epoch": 0.26028947060429913, + "grad_norm": 1.353249192237854, + "learning_rate": 8.332e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.9173424355685711, + "num_tokens": 30046265.0, + "step": 417 + }, + { + "epoch": 0.2609136659774509, + "grad_norm": 0.9155253171920776, + "learning_rate": 8.328e-07, + "loss": 0.3977, + "mean_token_accuracy": 0.9226347766816616, + "num_tokens": 30121600.0, + "step": 418 + }, + { + "epoch": 0.26153786135060275, + "grad_norm": 1.4468632936477661, + "learning_rate": 8.324e-07, + "loss": 0.3684, + "mean_token_accuracy": 0.9264166504144669, + "num_tokens": 30196202.0, + "step": 419 + }, + { + "epoch": 0.26216205672375453, + "grad_norm": 0.9410948753356934, + "learning_rate": 8.319999999999999e-07, + "loss": 0.4268, + "mean_token_accuracy": 0.9185077361762524, + "num_tokens": 30266252.0, + "step": 420 + }, + { + "epoch": 0.2627862520969063, + "grad_norm": 0.9149848222732544, + "learning_rate": 8.316e-07, + "loss": 0.4048, + "mean_token_accuracy": 0.9243842512369156, + "num_tokens": 30341952.0, + "step": 421 + }, + { + "epoch": 0.26341044747005815, + "grad_norm": 0.8869105577468872, + "learning_rate": 8.312e-07, + "loss": 0.3984, + "mean_token_accuracy": 0.921892773360014, + "num_tokens": 30410852.0, + "step": 422 + }, + { + "epoch": 0.26403464284320993, + "grad_norm": 0.9244428277015686, + "learning_rate": 8.308e-07, + "loss": 0.4001, + "mean_token_accuracy": 0.919981986284256, + "num_tokens": 30480407.0, + "step": 423 + }, + { + "epoch": 0.2646588382163617, + "grad_norm": 1.783027172088623, + "learning_rate": 8.304e-07, + "loss": 0.4101, + "mean_token_accuracy": 0.9190192967653275, + "num_tokens": 30554652.0, + "step": 424 + }, + { + "epoch": 0.2652830335895135, + "grad_norm": 0.8427060842514038, + "learning_rate": 8.299999999999999e-07, + "loss": 0.3868, + "mean_token_accuracy": 0.9234179295599461, + "num_tokens": 30623620.0, + "step": 425 + }, + { + "epoch": 0.26590722896266533, + "grad_norm": 0.24793501198291779, + "learning_rate": 8.296e-07, + "loss": 0.3966, + "mean_token_accuracy": 0.9199277870357037, + "num_tokens": 30697139.0, + "step": 426 + }, + { + "epoch": 0.2665314243358171, + "grad_norm": 0.46406906843185425, + "learning_rate": 8.292e-07, + "loss": 0.3978, + "mean_token_accuracy": 0.9246162809431553, + "num_tokens": 30769639.0, + "step": 427 + }, + { + "epoch": 0.2671556197089689, + "grad_norm": 0.27684247493743896, + "learning_rate": 8.287999999999999e-07, + "loss": 0.3624, + "mean_token_accuracy": 0.9286627471446991, + "num_tokens": 30844746.0, + "step": 428 + }, + { + "epoch": 0.26777981508212073, + "grad_norm": 0.486666202545166, + "learning_rate": 8.284e-07, + "loss": 0.3778, + "mean_token_accuracy": 0.9263641089200974, + "num_tokens": 30921360.0, + "step": 429 + }, + { + "epoch": 0.2684040104552725, + "grad_norm": 0.8701847195625305, + "learning_rate": 8.28e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.9192351438105106, + "num_tokens": 30990600.0, + "step": 430 + }, + { + "epoch": 0.2690282058284243, + "grad_norm": 0.47957533597946167, + "learning_rate": 8.275999999999999e-07, + "loss": 0.3453, + "mean_token_accuracy": 0.9272926487028599, + "num_tokens": 31066108.0, + "step": 431 + }, + { + "epoch": 0.2696524012015761, + "grad_norm": 1.20380437374115, + "learning_rate": 8.272e-07, + "loss": 0.3861, + "mean_token_accuracy": 0.9172127097845078, + "num_tokens": 31133799.0, + "step": 432 + }, + { + "epoch": 0.2702765965747279, + "grad_norm": 1.2132657766342163, + "learning_rate": 8.268e-07, + "loss": 0.3743, + "mean_token_accuracy": 0.9244321323931217, + "num_tokens": 31205749.0, + "step": 433 + }, + { + "epoch": 0.2709007919478797, + "grad_norm": 0.5054382681846619, + "learning_rate": 8.263999999999999e-07, + "loss": 0.3527, + "mean_token_accuracy": 0.9261834844946861, + "num_tokens": 31274405.0, + "step": 434 + }, + { + "epoch": 0.2715249873210315, + "grad_norm": 1.2120206356048584, + "learning_rate": 8.259999999999999e-07, + "loss": 0.3817, + "mean_token_accuracy": 0.9236193560063839, + "num_tokens": 31346190.0, + "step": 435 + }, + { + "epoch": 0.27214918269418326, + "grad_norm": 1.2147562503814697, + "learning_rate": 8.256e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.9178253710269928, + "num_tokens": 31417066.0, + "step": 436 + }, + { + "epoch": 0.2727733780673351, + "grad_norm": 0.8387507200241089, + "learning_rate": 8.252000000000001e-07, + "loss": 0.4085, + "mean_token_accuracy": 0.9183575063943863, + "num_tokens": 31487638.0, + "step": 437 + }, + { + "epoch": 0.2733975734404869, + "grad_norm": 0.457234650850296, + "learning_rate": 8.247999999999999e-07, + "loss": 0.3921, + "mean_token_accuracy": 0.920664619654417, + "num_tokens": 31555726.0, + "step": 438 + }, + { + "epoch": 0.27402176881363866, + "grad_norm": 1.218550205230713, + "learning_rate": 8.244e-07, + "loss": 0.3972, + "mean_token_accuracy": 0.9214368127286434, + "num_tokens": 31626798.0, + "step": 439 + }, + { + "epoch": 0.2746459641867905, + "grad_norm": 1.2510223388671875, + "learning_rate": 8.24e-07, + "loss": 0.368, + "mean_token_accuracy": 0.9258331805467606, + "num_tokens": 31698223.0, + "step": 440 + }, + { + "epoch": 0.2752701595599423, + "grad_norm": 1.1848870515823364, + "learning_rate": 8.235999999999999e-07, + "loss": 0.4065, + "mean_token_accuracy": 0.9150502532720566, + "num_tokens": 31770218.0, + "step": 441 + }, + { + "epoch": 0.27589435493309405, + "grad_norm": 0.8210954666137695, + "learning_rate": 8.232e-07, + "loss": 0.3879, + "mean_token_accuracy": 0.9278232716023922, + "num_tokens": 31845687.0, + "step": 442 + }, + { + "epoch": 0.27651855030624584, + "grad_norm": 0.8770108819007874, + "learning_rate": 8.228e-07, + "loss": 0.399, + "mean_token_accuracy": 0.9185272864997387, + "num_tokens": 31916228.0, + "step": 443 + }, + { + "epoch": 0.2771427456793977, + "grad_norm": 1.1238223314285278, + "learning_rate": 8.224e-07, + "loss": 0.3823, + "mean_token_accuracy": 0.9234112687408924, + "num_tokens": 31986380.0, + "step": 444 + }, + { + "epoch": 0.27776694105254945, + "grad_norm": 0.8185886144638062, + "learning_rate": 8.219999999999999e-07, + "loss": 0.3675, + "mean_token_accuracy": 0.9260559901595116, + "num_tokens": 32057315.0, + "step": 445 + }, + { + "epoch": 0.27839113642570124, + "grad_norm": 1.1564223766326904, + "learning_rate": 8.216e-07, + "loss": 0.3536, + "mean_token_accuracy": 0.9315725937485695, + "num_tokens": 32134655.0, + "step": 446 + }, + { + "epoch": 0.279015331798853, + "grad_norm": 1.0922423601150513, + "learning_rate": 8.212e-07, + "loss": 0.3777, + "mean_token_accuracy": 0.919820062816143, + "num_tokens": 32206037.0, + "step": 447 + }, + { + "epoch": 0.27963952717200485, + "grad_norm": 0.8300904631614685, + "learning_rate": 8.207999999999999e-07, + "loss": 0.3843, + "mean_token_accuracy": 0.9201874099671841, + "num_tokens": 32271782.0, + "step": 448 + }, + { + "epoch": 0.28026372254515663, + "grad_norm": 1.0689630508422852, + "learning_rate": 8.204e-07, + "loss": 0.4132, + "mean_token_accuracy": 0.9163887202739716, + "num_tokens": 32341728.0, + "step": 449 + }, + { + "epoch": 0.2808879179183084, + "grad_norm": 0.7719376683235168, + "learning_rate": 8.199999999999999e-07, + "loss": 0.4028, + "mean_token_accuracy": 0.9178178422152996, + "num_tokens": 32413746.0, + "step": 450 + }, + { + "epoch": 0.28151211329146025, + "grad_norm": 0.8189422488212585, + "learning_rate": 8.196e-07, + "loss": 0.3718, + "mean_token_accuracy": 0.9275276958942413, + "num_tokens": 32489023.0, + "step": 451 + }, + { + "epoch": 0.28213630866461203, + "grad_norm": 0.7677391171455383, + "learning_rate": 8.192e-07, + "loss": 0.3993, + "mean_token_accuracy": 0.9175617136061192, + "num_tokens": 32557619.0, + "step": 452 + }, + { + "epoch": 0.2827605040377638, + "grad_norm": 0.7534482479095459, + "learning_rate": 8.187999999999999e-07, + "loss": 0.3715, + "mean_token_accuracy": 0.9263954050838947, + "num_tokens": 32629108.0, + "step": 453 + }, + { + "epoch": 0.2833846994109156, + "grad_norm": 1.0587571859359741, + "learning_rate": 8.184e-07, + "loss": 0.393, + "mean_token_accuracy": 0.9122283458709717, + "num_tokens": 32699246.0, + "step": 454 + }, + { + "epoch": 0.28400889478406743, + "grad_norm": 1.0744235515594482, + "learning_rate": 8.179999999999999e-07, + "loss": 0.3799, + "mean_token_accuracy": 0.9200029857456684, + "num_tokens": 32771788.0, + "step": 455 + }, + { + "epoch": 0.2846330901572192, + "grad_norm": 0.5133286714553833, + "learning_rate": 8.175999999999999e-07, + "loss": 0.4008, + "mean_token_accuracy": 0.9154097177088261, + "num_tokens": 32844323.0, + "step": 456 + }, + { + "epoch": 0.285257285530371, + "grad_norm": 0.4051693081855774, + "learning_rate": 8.172e-07, + "loss": 0.3743, + "mean_token_accuracy": 0.9187863431870937, + "num_tokens": 32911195.0, + "step": 457 + }, + { + "epoch": 0.2858814809035228, + "grad_norm": 0.43125689029693604, + "learning_rate": 8.168e-07, + "loss": 0.3903, + "mean_token_accuracy": 0.9152910970151424, + "num_tokens": 32983744.0, + "step": 458 + }, + { + "epoch": 0.2865056762766746, + "grad_norm": 1.155842900276184, + "learning_rate": 8.163999999999999e-07, + "loss": 0.3766, + "mean_token_accuracy": 0.9185449928045273, + "num_tokens": 33050055.0, + "step": 459 + }, + { + "epoch": 0.2871298716498264, + "grad_norm": 0.6426854133605957, + "learning_rate": 8.159999999999999e-07, + "loss": 0.3952, + "mean_token_accuracy": 0.9194389209151268, + "num_tokens": 33123662.0, + "step": 460 + }, + { + "epoch": 0.2877540670229782, + "grad_norm": 0.3941771388053894, + "learning_rate": 8.156e-07, + "loss": 0.3895, + "mean_token_accuracy": 0.9190852232277393, + "num_tokens": 33195820.0, + "step": 461 + }, + { + "epoch": 0.28837826239613, + "grad_norm": 0.4276549220085144, + "learning_rate": 8.152e-07, + "loss": 0.3632, + "mean_token_accuracy": 0.9277602657675743, + "num_tokens": 33269124.0, + "step": 462 + }, + { + "epoch": 0.2890024577692818, + "grad_norm": 0.4527488946914673, + "learning_rate": 8.147999999999999e-07, + "loss": 0.3597, + "mean_token_accuracy": 0.927538301795721, + "num_tokens": 33338440.0, + "step": 463 + }, + { + "epoch": 0.2896266531424336, + "grad_norm": 0.3035643994808197, + "learning_rate": 8.144e-07, + "loss": 0.3548, + "mean_token_accuracy": 0.9277514107525349, + "num_tokens": 33413283.0, + "step": 464 + }, + { + "epoch": 0.29025084851558536, + "grad_norm": 0.6827325224876404, + "learning_rate": 8.14e-07, + "loss": 0.3905, + "mean_token_accuracy": 0.9223032481968403, + "num_tokens": 33486403.0, + "step": 465 + }, + { + "epoch": 0.2908750438887372, + "grad_norm": 0.3996064364910126, + "learning_rate": 8.135999999999999e-07, + "loss": 0.3597, + "mean_token_accuracy": 0.9243958182632923, + "num_tokens": 33556839.0, + "step": 466 + }, + { + "epoch": 0.291499239261889, + "grad_norm": 1.0095113515853882, + "learning_rate": 8.132e-07, + "loss": 0.3672, + "mean_token_accuracy": 0.9211184307932854, + "num_tokens": 33627939.0, + "step": 467 + }, + { + "epoch": 0.29212343463504076, + "grad_norm": 0.6771048307418823, + "learning_rate": 8.128e-07, + "loss": 0.3493, + "mean_token_accuracy": 0.9311572276055813, + "num_tokens": 33703915.0, + "step": 468 + }, + { + "epoch": 0.29274763000819254, + "grad_norm": 1.3612362146377563, + "learning_rate": 8.123999999999999e-07, + "loss": 0.3898, + "mean_token_accuracy": 0.9170313812792301, + "num_tokens": 33774039.0, + "step": 469 + }, + { + "epoch": 0.2933718253813444, + "grad_norm": 0.9771288633346558, + "learning_rate": 8.12e-07, + "loss": 0.3882, + "mean_token_accuracy": 0.9206005297601223, + "num_tokens": 33849336.0, + "step": 470 + }, + { + "epoch": 0.29399602075449616, + "grad_norm": 1.024867057800293, + "learning_rate": 8.116e-07, + "loss": 0.3446, + "mean_token_accuracy": 0.9304168000817299, + "num_tokens": 33923177.0, + "step": 471 + }, + { + "epoch": 0.29462021612764794, + "grad_norm": 0.6518855690956116, + "learning_rate": 8.112e-07, + "loss": 0.361, + "mean_token_accuracy": 0.9236439689993858, + "num_tokens": 33996124.0, + "step": 472 + }, + { + "epoch": 0.2952444115007998, + "grad_norm": 0.6530463099479675, + "learning_rate": 8.107999999999999e-07, + "loss": 0.3687, + "mean_token_accuracy": 0.9250088781118393, + "num_tokens": 34068623.0, + "step": 473 + }, + { + "epoch": 0.29586860687395156, + "grad_norm": 1.3373897075653076, + "learning_rate": 8.104e-07, + "loss": 0.3572, + "mean_token_accuracy": 0.9255791194736958, + "num_tokens": 34143815.0, + "step": 474 + }, + { + "epoch": 0.29649280224710334, + "grad_norm": 1.049225926399231, + "learning_rate": 8.1e-07, + "loss": 0.3817, + "mean_token_accuracy": 0.9227948747575283, + "num_tokens": 34218652.0, + "step": 475 + }, + { + "epoch": 0.2971169976202551, + "grad_norm": 1.324798345565796, + "learning_rate": 8.095999999999999e-07, + "loss": 0.4043, + "mean_token_accuracy": 0.9184755980968475, + "num_tokens": 34289768.0, + "step": 476 + }, + { + "epoch": 0.29774119299340696, + "grad_norm": 0.23902744054794312, + "learning_rate": 8.092e-07, + "loss": 0.3584, + "mean_token_accuracy": 0.9244324378669262, + "num_tokens": 34361761.0, + "step": 477 + }, + { + "epoch": 0.29836538836655874, + "grad_norm": 0.6264401078224182, + "learning_rate": 8.087999999999999e-07, + "loss": 0.4131, + "mean_token_accuracy": 0.9105452783405781, + "num_tokens": 34433362.0, + "step": 478 + }, + { + "epoch": 0.2989895837397105, + "grad_norm": 0.7376995086669922, + "learning_rate": 8.084e-07, + "loss": 0.3779, + "mean_token_accuracy": 0.9226237758994102, + "num_tokens": 34504708.0, + "step": 479 + }, + { + "epoch": 0.2996137791128623, + "grad_norm": 0.9867478013038635, + "learning_rate": 8.08e-07, + "loss": 0.3878, + "mean_token_accuracy": 0.9225059673190117, + "num_tokens": 34573700.0, + "step": 480 + }, + { + "epoch": 0.30023797448601414, + "grad_norm": 0.996479868888855, + "learning_rate": 8.075999999999999e-07, + "loss": 0.3638, + "mean_token_accuracy": 0.923095341771841, + "num_tokens": 34644923.0, + "step": 481 + }, + { + "epoch": 0.3008621698591659, + "grad_norm": 0.3789809048175812, + "learning_rate": 8.072e-07, + "loss": 0.3836, + "mean_token_accuracy": 0.9172099679708481, + "num_tokens": 34712662.0, + "step": 482 + }, + { + "epoch": 0.3014863652323177, + "grad_norm": 0.5409672856330872, + "learning_rate": 8.067999999999999e-07, + "loss": 0.3438, + "mean_token_accuracy": 0.9296190738677979, + "num_tokens": 34789101.0, + "step": 483 + }, + { + "epoch": 0.30211056060546954, + "grad_norm": 0.6593160629272461, + "learning_rate": 8.064e-07, + "loss": 0.3783, + "mean_token_accuracy": 0.9195466078817844, + "num_tokens": 34861419.0, + "step": 484 + }, + { + "epoch": 0.3027347559786213, + "grad_norm": 0.9712687730789185, + "learning_rate": 8.06e-07, + "loss": 0.3442, + "mean_token_accuracy": 0.9274141415953636, + "num_tokens": 34934817.0, + "step": 485 + }, + { + "epoch": 0.3033589513517731, + "grad_norm": 0.9481886625289917, + "learning_rate": 8.056e-07, + "loss": 0.3613, + "mean_token_accuracy": 0.9272597245872021, + "num_tokens": 35009108.0, + "step": 486 + }, + { + "epoch": 0.3039831467249249, + "grad_norm": 0.6604760885238647, + "learning_rate": 8.052e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.9250000976026058, + "num_tokens": 35084183.0, + "step": 487 + }, + { + "epoch": 0.3046073420980767, + "grad_norm": 0.4148165285587311, + "learning_rate": 8.047999999999999e-07, + "loss": 0.343, + "mean_token_accuracy": 0.9259302839636803, + "num_tokens": 35152335.0, + "step": 488 + }, + { + "epoch": 0.3052315374712285, + "grad_norm": 0.6812533736228943, + "learning_rate": 8.044e-07, + "loss": 0.3648, + "mean_token_accuracy": 0.926187340170145, + "num_tokens": 35226350.0, + "step": 489 + }, + { + "epoch": 0.3058557328443803, + "grad_norm": 0.928580641746521, + "learning_rate": 8.04e-07, + "loss": 0.3541, + "mean_token_accuracy": 0.9230936653912067, + "num_tokens": 35296847.0, + "step": 490 + }, + { + "epoch": 0.30647992821753206, + "grad_norm": 0.9404719471931458, + "learning_rate": 8.035999999999999e-07, + "loss": 0.4057, + "mean_token_accuracy": 0.9147538281977177, + "num_tokens": 35367486.0, + "step": 491 + }, + { + "epoch": 0.3071041235906839, + "grad_norm": 0.8735427856445312, + "learning_rate": 8.032e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.9132251814007759, + "num_tokens": 35439097.0, + "step": 492 + }, + { + "epoch": 0.3077283189638357, + "grad_norm": 0.9538032412528992, + "learning_rate": 8.028e-07, + "loss": 0.4088, + "mean_token_accuracy": 0.9159552976489067, + "num_tokens": 35512252.0, + "step": 493 + }, + { + "epoch": 0.30835251433698746, + "grad_norm": 0.6534470319747925, + "learning_rate": 8.023999999999999e-07, + "loss": 0.3348, + "mean_token_accuracy": 0.9320591278374195, + "num_tokens": 35586964.0, + "step": 494 + }, + { + "epoch": 0.3089767097101393, + "grad_norm": 0.6475063562393188, + "learning_rate": 8.02e-07, + "loss": 0.3771, + "mean_token_accuracy": 0.9201016835868359, + "num_tokens": 35657576.0, + "step": 495 + }, + { + "epoch": 0.3096009050832911, + "grad_norm": 0.35505348443984985, + "learning_rate": 8.016e-07, + "loss": 0.3685, + "mean_token_accuracy": 0.9209739379584789, + "num_tokens": 35724423.0, + "step": 496 + }, + { + "epoch": 0.31022510045644286, + "grad_norm": 0.5871096253395081, + "learning_rate": 8.012e-07, + "loss": 0.3524, + "mean_token_accuracy": 0.9270750507712364, + "num_tokens": 35797645.0, + "step": 497 + }, + { + "epoch": 0.31084929582959464, + "grad_norm": 0.6056950688362122, + "learning_rate": 8.007999999999999e-07, + "loss": 0.3925, + "mean_token_accuracy": 0.9213661998510361, + "num_tokens": 35864538.0, + "step": 498 + }, + { + "epoch": 0.3114734912027465, + "grad_norm": 0.6547621488571167, + "learning_rate": 8.004e-07, + "loss": 0.3887, + "mean_token_accuracy": 0.9117082841694355, + "num_tokens": 35929722.0, + "step": 499 + }, + { + "epoch": 0.31209768657589826, + "grad_norm": 0.9326969385147095, + "learning_rate": 8e-07, + "loss": 0.365, + "mean_token_accuracy": 0.921146672219038, + "num_tokens": 35999033.0, + "step": 500 + }, + { + "epoch": 0.31272188194905004, + "grad_norm": 0.44340744614601135, + "learning_rate": 7.995999999999999e-07, + "loss": 0.3692, + "mean_token_accuracy": 0.9207760877907276, + "num_tokens": 36073494.0, + "step": 501 + }, + { + "epoch": 0.3133460773222019, + "grad_norm": 0.567279040813446, + "learning_rate": 7.992e-07, + "loss": 0.3795, + "mean_token_accuracy": 0.9197339341044426, + "num_tokens": 36146852.0, + "step": 502 + }, + { + "epoch": 0.31397027269535366, + "grad_norm": 0.42524921894073486, + "learning_rate": 7.987999999999999e-07, + "loss": 0.3513, + "mean_token_accuracy": 0.9261737614870071, + "num_tokens": 36218524.0, + "step": 503 + }, + { + "epoch": 0.31459446806850544, + "grad_norm": 0.8648039698600769, + "learning_rate": 7.984e-07, + "loss": 0.3633, + "mean_token_accuracy": 0.9185593500733376, + "num_tokens": 36291663.0, + "step": 504 + }, + { + "epoch": 0.3152186634416572, + "grad_norm": 0.3618353009223938, + "learning_rate": 7.98e-07, + "loss": 0.3621, + "mean_token_accuracy": 0.9248528815805912, + "num_tokens": 36363896.0, + "step": 505 + }, + { + "epoch": 0.31584285881480906, + "grad_norm": 1.1000434160232544, + "learning_rate": 7.975999999999999e-07, + "loss": 0.3067, + "mean_token_accuracy": 0.935703057795763, + "num_tokens": 36442041.0, + "step": 506 + }, + { + "epoch": 0.31646705418796084, + "grad_norm": 0.3486091196537018, + "learning_rate": 7.972e-07, + "loss": 0.3619, + "mean_token_accuracy": 0.9244496859610081, + "num_tokens": 36515156.0, + "step": 507 + }, + { + "epoch": 0.3170912495611126, + "grad_norm": 0.5138978362083435, + "learning_rate": 7.967999999999999e-07, + "loss": 0.3886, + "mean_token_accuracy": 0.9163962192833424, + "num_tokens": 36588071.0, + "step": 508 + }, + { + "epoch": 0.3177154449342644, + "grad_norm": 1.234805941581726, + "learning_rate": 7.964e-07, + "loss": 0.3803, + "mean_token_accuracy": 0.9220647066831589, + "num_tokens": 36657176.0, + "step": 509 + }, + { + "epoch": 0.31833964030741624, + "grad_norm": 0.85943603515625, + "learning_rate": 7.96e-07, + "loss": 0.3618, + "mean_token_accuracy": 0.9248210079967976, + "num_tokens": 36734905.0, + "step": 510 + }, + { + "epoch": 0.318963835680568, + "grad_norm": 0.5815002918243408, + "learning_rate": 7.956e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.9153008237481117, + "num_tokens": 36804876.0, + "step": 511 + }, + { + "epoch": 0.3195880310537198, + "grad_norm": 0.559831976890564, + "learning_rate": 7.952e-07, + "loss": 0.3496, + "mean_token_accuracy": 0.92459636926651, + "num_tokens": 36872556.0, + "step": 512 + }, + { + "epoch": 0.32021222642687164, + "grad_norm": 1.2308224439620972, + "learning_rate": 7.947999999999999e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8950334936380386, + "num_tokens": 36934444.0, + "step": 513 + }, + { + "epoch": 0.3208364218000234, + "grad_norm": 0.8181596994400024, + "learning_rate": 7.944e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9320288971066475, + "num_tokens": 37008882.0, + "step": 514 + }, + { + "epoch": 0.3214606171731752, + "grad_norm": 0.4042363464832306, + "learning_rate": 7.94e-07, + "loss": 0.3836, + "mean_token_accuracy": 0.9188083931803703, + "num_tokens": 37080376.0, + "step": 515 + }, + { + "epoch": 0.322084812546327, + "grad_norm": 1.0137232542037964, + "learning_rate": 7.935999999999999e-07, + "loss": 0.3765, + "mean_token_accuracy": 0.9191593118011951, + "num_tokens": 37153866.0, + "step": 516 + }, + { + "epoch": 0.3227090079194788, + "grad_norm": 0.840237557888031, + "learning_rate": 7.932e-07, + "loss": 0.3859, + "mean_token_accuracy": 0.917050663381815, + "num_tokens": 37228829.0, + "step": 517 + }, + { + "epoch": 0.3233332032926306, + "grad_norm": 1.093410849571228, + "learning_rate": 7.928e-07, + "loss": 0.3661, + "mean_token_accuracy": 0.9273787848651409, + "num_tokens": 37299866.0, + "step": 518 + }, + { + "epoch": 0.3239573986657824, + "grad_norm": 1.0735011100769043, + "learning_rate": 7.923999999999999e-07, + "loss": 0.3451, + "mean_token_accuracy": 0.9202693998813629, + "num_tokens": 37368186.0, + "step": 519 + }, + { + "epoch": 0.32458159403893416, + "grad_norm": 1.009791612625122, + "learning_rate": 7.92e-07, + "loss": 0.3393, + "mean_token_accuracy": 0.9257577694952488, + "num_tokens": 37440532.0, + "step": 520 + }, + { + "epoch": 0.325205789412086, + "grad_norm": 0.5683333873748779, + "learning_rate": 7.916e-07, + "loss": 0.3651, + "mean_token_accuracy": 0.9217745624482632, + "num_tokens": 37511829.0, + "step": 521 + }, + { + "epoch": 0.3258299847852378, + "grad_norm": 0.7119737863540649, + "learning_rate": 7.911999999999999e-07, + "loss": 0.3841, + "mean_token_accuracy": 0.9179428033530712, + "num_tokens": 37584018.0, + "step": 522 + }, + { + "epoch": 0.32645418015838956, + "grad_norm": 0.52000492811203, + "learning_rate": 7.907999999999999e-07, + "loss": 0.3817, + "mean_token_accuracy": 0.9178862608969212, + "num_tokens": 37651312.0, + "step": 523 + }, + { + "epoch": 0.3270783755315414, + "grad_norm": 1.0212749242782593, + "learning_rate": 7.904e-07, + "loss": 0.3721, + "mean_token_accuracy": 0.9203926138579845, + "num_tokens": 37721284.0, + "step": 524 + }, + { + "epoch": 0.3277025709046932, + "grad_norm": 0.30904000997543335, + "learning_rate": 7.9e-07, + "loss": 0.3447, + "mean_token_accuracy": 0.9289012476801872, + "num_tokens": 37795216.0, + "step": 525 + }, + { + "epoch": 0.32832676627784496, + "grad_norm": 1.0068774223327637, + "learning_rate": 7.895999999999999e-07, + "loss": 0.3429, + "mean_token_accuracy": 0.9220792353153229, + "num_tokens": 37869115.0, + "step": 526 + }, + { + "epoch": 0.32895096165099674, + "grad_norm": 0.7811362147331238, + "learning_rate": 7.892e-07, + "loss": 0.3277, + "mean_token_accuracy": 0.9309323355555534, + "num_tokens": 37939342.0, + "step": 527 + }, + { + "epoch": 0.3295751570241486, + "grad_norm": 0.5424177050590515, + "learning_rate": 7.887999999999999e-07, + "loss": 0.3594, + "mean_token_accuracy": 0.9236319363117218, + "num_tokens": 38016632.0, + "step": 528 + }, + { + "epoch": 0.33019935239730036, + "grad_norm": 0.7413662672042847, + "learning_rate": 7.883999999999999e-07, + "loss": 0.3306, + "mean_token_accuracy": 0.931208036839962, + "num_tokens": 38089679.0, + "step": 529 + }, + { + "epoch": 0.33082354777045214, + "grad_norm": 1.0389795303344727, + "learning_rate": 7.88e-07, + "loss": 0.3616, + "mean_token_accuracy": 0.926129661500454, + "num_tokens": 38166813.0, + "step": 530 + }, + { + "epoch": 0.3314477431436039, + "grad_norm": 0.3387523889541626, + "learning_rate": 7.875999999999999e-07, + "loss": 0.3404, + "mean_token_accuracy": 0.9300377070903778, + "num_tokens": 38240133.0, + "step": 531 + }, + { + "epoch": 0.33207193851675576, + "grad_norm": 0.5325695276260376, + "learning_rate": 7.872e-07, + "loss": 0.3512, + "mean_token_accuracy": 0.9276345297694206, + "num_tokens": 38314974.0, + "step": 532 + }, + { + "epoch": 0.33269613388990754, + "grad_norm": 0.984096884727478, + "learning_rate": 7.868e-07, + "loss": 0.3632, + "mean_token_accuracy": 0.9203736409544945, + "num_tokens": 38384547.0, + "step": 533 + }, + { + "epoch": 0.3333203292630593, + "grad_norm": 0.7203853726387024, + "learning_rate": 7.864e-07, + "loss": 0.3421, + "mean_token_accuracy": 0.9251072555780411, + "num_tokens": 38458541.0, + "step": 534 + }, + { + "epoch": 0.33394452463621116, + "grad_norm": 0.5719137787818909, + "learning_rate": 7.86e-07, + "loss": 0.3531, + "mean_token_accuracy": 0.9257007800042629, + "num_tokens": 38531488.0, + "step": 535 + }, + { + "epoch": 0.33456872000936294, + "grad_norm": 0.489790141582489, + "learning_rate": 7.855999999999999e-07, + "loss": 0.3332, + "mean_token_accuracy": 0.9269058927893639, + "num_tokens": 38602621.0, + "step": 536 + }, + { + "epoch": 0.3351929153825147, + "grad_norm": 0.48282408714294434, + "learning_rate": 7.852e-07, + "loss": 0.3778, + "mean_token_accuracy": 0.9227754510939121, + "num_tokens": 38673689.0, + "step": 537 + }, + { + "epoch": 0.3358171107556665, + "grad_norm": 0.7483878135681152, + "learning_rate": 7.848e-07, + "loss": 0.3111, + "mean_token_accuracy": 0.9343430511653423, + "num_tokens": 38750314.0, + "step": 538 + }, + { + "epoch": 0.33644130612881834, + "grad_norm": 0.7510576248168945, + "learning_rate": 7.844e-07, + "loss": 0.3718, + "mean_token_accuracy": 0.9187307097017765, + "num_tokens": 38826470.0, + "step": 539 + }, + { + "epoch": 0.3370655015019701, + "grad_norm": 0.7023730278015137, + "learning_rate": 7.84e-07, + "loss": 0.363, + "mean_token_accuracy": 0.9249767996370792, + "num_tokens": 38895411.0, + "step": 540 + }, + { + "epoch": 0.3376896968751219, + "grad_norm": 0.503331184387207, + "learning_rate": 7.835999999999999e-07, + "loss": 0.3811, + "mean_token_accuracy": 0.9164841137826443, + "num_tokens": 38963944.0, + "step": 541 + }, + { + "epoch": 0.3383138922482737, + "grad_norm": 0.4962184727191925, + "learning_rate": 7.832e-07, + "loss": 0.3871, + "mean_token_accuracy": 0.9121665470302105, + "num_tokens": 39031110.0, + "step": 542 + }, + { + "epoch": 0.3389380876214255, + "grad_norm": 2.1462461948394775, + "learning_rate": 7.828e-07, + "loss": 0.3531, + "mean_token_accuracy": 0.9244196452200413, + "num_tokens": 39103052.0, + "step": 543 + }, + { + "epoch": 0.3395622829945773, + "grad_norm": 0.7186685800552368, + "learning_rate": 7.823999999999999e-07, + "loss": 0.3179, + "mean_token_accuracy": 0.9326954819262028, + "num_tokens": 39183940.0, + "step": 544 + }, + { + "epoch": 0.3401864783677291, + "grad_norm": 0.706167995929718, + "learning_rate": 7.82e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.9131165146827698, + "num_tokens": 39253289.0, + "step": 545 + }, + { + "epoch": 0.3408106737408809, + "grad_norm": 0.33079788088798523, + "learning_rate": 7.816e-07, + "loss": 0.349, + "mean_token_accuracy": 0.9257286265492439, + "num_tokens": 39324763.0, + "step": 546 + }, + { + "epoch": 0.3414348691140327, + "grad_norm": 0.6968596577644348, + "learning_rate": 7.811999999999999e-07, + "loss": 0.3438, + "mean_token_accuracy": 0.9264971278607845, + "num_tokens": 39397582.0, + "step": 547 + }, + { + "epoch": 0.3420590644871845, + "grad_norm": 0.7445778846740723, + "learning_rate": 7.808e-07, + "loss": 0.3838, + "mean_token_accuracy": 0.9126134067773819, + "num_tokens": 39469006.0, + "step": 548 + }, + { + "epoch": 0.34268325986033626, + "grad_norm": 0.482977032661438, + "learning_rate": 7.804e-07, + "loss": 0.3285, + "mean_token_accuracy": 0.931167010217905, + "num_tokens": 39547491.0, + "step": 549 + }, + { + "epoch": 0.3433074552334881, + "grad_norm": 0.7700469493865967, + "learning_rate": 7.799999999999999e-07, + "loss": 0.3661, + "mean_token_accuracy": 0.9219079948961735, + "num_tokens": 39617465.0, + "step": 550 + }, + { + "epoch": 0.3439316506066399, + "grad_norm": 0.9452283978462219, + "learning_rate": 7.795999999999999e-07, + "loss": 0.3585, + "mean_token_accuracy": 0.9159074127674103, + "num_tokens": 39688818.0, + "step": 551 + }, + { + "epoch": 0.34455584597979166, + "grad_norm": 0.6997014284133911, + "learning_rate": 7.792e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9276483282446861, + "num_tokens": 39762354.0, + "step": 552 + }, + { + "epoch": 0.34518004135294345, + "grad_norm": 0.4857024848461151, + "learning_rate": 7.788000000000001e-07, + "loss": 0.3608, + "mean_token_accuracy": 0.9200742207467556, + "num_tokens": 39831316.0, + "step": 553 + }, + { + "epoch": 0.3458042367260953, + "grad_norm": 0.48957309126853943, + "learning_rate": 7.783999999999999e-07, + "loss": 0.3403, + "mean_token_accuracy": 0.9275477230548859, + "num_tokens": 39906027.0, + "step": 554 + }, + { + "epoch": 0.34642843209924706, + "grad_norm": 0.48259392380714417, + "learning_rate": 7.78e-07, + "loss": 0.3677, + "mean_token_accuracy": 0.9230173043906689, + "num_tokens": 39978449.0, + "step": 555 + }, + { + "epoch": 0.34705262747239884, + "grad_norm": 0.645094633102417, + "learning_rate": 7.776e-07, + "loss": 0.3602, + "mean_token_accuracy": 0.92342958599329, + "num_tokens": 40051792.0, + "step": 556 + }, + { + "epoch": 0.3476768228455507, + "grad_norm": 0.936673104763031, + "learning_rate": 7.771999999999999e-07, + "loss": 0.3368, + "mean_token_accuracy": 0.9274178445339203, + "num_tokens": 40125285.0, + "step": 557 + }, + { + "epoch": 0.34830101821870246, + "grad_norm": 0.32944637537002563, + "learning_rate": 7.768e-07, + "loss": 0.377, + "mean_token_accuracy": 0.918434377759695, + "num_tokens": 40196636.0, + "step": 558 + }, + { + "epoch": 0.34892521359185424, + "grad_norm": 0.7557623982429504, + "learning_rate": 7.764e-07, + "loss": 0.3711, + "mean_token_accuracy": 0.9227299764752388, + "num_tokens": 40267280.0, + "step": 559 + }, + { + "epoch": 0.349549408965006, + "grad_norm": 0.46764472126960754, + "learning_rate": 7.76e-07, + "loss": 0.3372, + "mean_token_accuracy": 0.9256122633814812, + "num_tokens": 40341095.0, + "step": 560 + }, + { + "epoch": 0.35017360433815786, + "grad_norm": 0.6640878319740295, + "learning_rate": 7.755999999999999e-07, + "loss": 0.3558, + "mean_token_accuracy": 0.9216841869056225, + "num_tokens": 40411829.0, + "step": 561 + }, + { + "epoch": 0.35079779971130964, + "grad_norm": 0.2680525779724121, + "learning_rate": 7.752e-07, + "loss": 0.347, + "mean_token_accuracy": 0.9253168888390064, + "num_tokens": 40481909.0, + "step": 562 + }, + { + "epoch": 0.3514219950844614, + "grad_norm": 0.9046684503555298, + "learning_rate": 7.748e-07, + "loss": 0.3419, + "mean_token_accuracy": 0.9286962039768696, + "num_tokens": 40557235.0, + "step": 563 + }, + { + "epoch": 0.3520461904576132, + "grad_norm": 0.5573182106018066, + "learning_rate": 7.743999999999999e-07, + "loss": 0.3292, + "mean_token_accuracy": 0.925889540463686, + "num_tokens": 40631577.0, + "step": 564 + }, + { + "epoch": 0.35267038583076504, + "grad_norm": 0.8464686870574951, + "learning_rate": 7.74e-07, + "loss": 0.3286, + "mean_token_accuracy": 0.9284627512097359, + "num_tokens": 40705192.0, + "step": 565 + }, + { + "epoch": 0.3532945812039168, + "grad_norm": 0.4341270923614502, + "learning_rate": 7.735999999999999e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9269193299114704, + "num_tokens": 40776977.0, + "step": 566 + }, + { + "epoch": 0.3539187765770686, + "grad_norm": 0.4663252830505371, + "learning_rate": 7.732e-07, + "loss": 0.3679, + "mean_token_accuracy": 0.9171105623245239, + "num_tokens": 40846053.0, + "step": 567 + }, + { + "epoch": 0.35454297195022044, + "grad_norm": 0.46172523498535156, + "learning_rate": 7.728e-07, + "loss": 0.3374, + "mean_token_accuracy": 0.9275957085192204, + "num_tokens": 40920086.0, + "step": 568 + }, + { + "epoch": 0.3551671673233722, + "grad_norm": 0.2973470389842987, + "learning_rate": 7.723999999999999e-07, + "loss": 0.2961, + "mean_token_accuracy": 0.9342636428773403, + "num_tokens": 40996730.0, + "step": 569 + }, + { + "epoch": 0.355791362696524, + "grad_norm": 0.6722999215126038, + "learning_rate": 7.72e-07, + "loss": 0.3914, + "mean_token_accuracy": 0.9111681878566742, + "num_tokens": 41065835.0, + "step": 570 + }, + { + "epoch": 0.3564155580696758, + "grad_norm": 0.4647853672504425, + "learning_rate": 7.716e-07, + "loss": 0.3628, + "mean_token_accuracy": 0.9266398400068283, + "num_tokens": 41138942.0, + "step": 571 + }, + { + "epoch": 0.3570397534428276, + "grad_norm": 0.7400867938995361, + "learning_rate": 7.711999999999999e-07, + "loss": 0.3641, + "mean_token_accuracy": 0.9188443422317505, + "num_tokens": 41210620.0, + "step": 572 + }, + { + "epoch": 0.3576639488159794, + "grad_norm": 0.35945945978164673, + "learning_rate": 7.708e-07, + "loss": 0.3423, + "mean_token_accuracy": 0.9254635684192181, + "num_tokens": 41283277.0, + "step": 573 + }, + { + "epoch": 0.3582881441891312, + "grad_norm": 0.4682519733905792, + "learning_rate": 7.704e-07, + "loss": 0.3759, + "mean_token_accuracy": 0.9224316813051701, + "num_tokens": 41353184.0, + "step": 574 + }, + { + "epoch": 0.35891233956228297, + "grad_norm": 0.8023708462715149, + "learning_rate": 7.699999999999999e-07, + "loss": 0.3108, + "mean_token_accuracy": 0.9367459304630756, + "num_tokens": 41431034.0, + "step": 575 + }, + { + "epoch": 0.3595365349354348, + "grad_norm": 0.707558810710907, + "learning_rate": 7.695999999999999e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9225289300084114, + "num_tokens": 41504232.0, + "step": 576 + }, + { + "epoch": 0.3601607303085866, + "grad_norm": 1.1275163888931274, + "learning_rate": 7.692e-07, + "loss": 0.331, + "mean_token_accuracy": 0.9252995476126671, + "num_tokens": 41578544.0, + "step": 577 + }, + { + "epoch": 0.36078492568173837, + "grad_norm": 0.42957794666290283, + "learning_rate": 7.688000000000001e-07, + "loss": 0.3578, + "mean_token_accuracy": 0.920187171548605, + "num_tokens": 41649210.0, + "step": 578 + }, + { + "epoch": 0.3614091210548902, + "grad_norm": 0.8529049158096313, + "learning_rate": 7.683999999999999e-07, + "loss": 0.3315, + "mean_token_accuracy": 0.9296321868896484, + "num_tokens": 41728910.0, + "step": 579 + }, + { + "epoch": 0.362033316428042, + "grad_norm": 0.6229732036590576, + "learning_rate": 7.68e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.9329870566725731, + "num_tokens": 41805093.0, + "step": 580 + }, + { + "epoch": 0.36265751180119377, + "grad_norm": 0.7021951079368591, + "learning_rate": 7.676e-07, + "loss": 0.3538, + "mean_token_accuracy": 0.9242266975343227, + "num_tokens": 41873966.0, + "step": 581 + }, + { + "epoch": 0.36328170717434555, + "grad_norm": 0.4560698866844177, + "learning_rate": 7.671999999999999e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.9291101545095444, + "num_tokens": 41949321.0, + "step": 582 + }, + { + "epoch": 0.3639059025474974, + "grad_norm": 0.4377002418041229, + "learning_rate": 7.668e-07, + "loss": 0.3624, + "mean_token_accuracy": 0.9210468083620071, + "num_tokens": 42021246.0, + "step": 583 + }, + { + "epoch": 0.36453009792064917, + "grad_norm": 0.294572651386261, + "learning_rate": 7.664e-07, + "loss": 0.3523, + "mean_token_accuracy": 0.9218520075082779, + "num_tokens": 42092564.0, + "step": 584 + }, + { + "epoch": 0.36515429329380095, + "grad_norm": 0.29711201786994934, + "learning_rate": 7.66e-07, + "loss": 0.3678, + "mean_token_accuracy": 0.9170925952494144, + "num_tokens": 42163349.0, + "step": 585 + }, + { + "epoch": 0.3657784886669528, + "grad_norm": 0.6175806522369385, + "learning_rate": 7.655999999999999e-07, + "loss": 0.3812, + "mean_token_accuracy": 0.9189609922468662, + "num_tokens": 42234589.0, + "step": 586 + }, + { + "epoch": 0.36640268404010456, + "grad_norm": 0.7820612788200378, + "learning_rate": 7.652e-07, + "loss": 0.3493, + "mean_token_accuracy": 0.9235295131802559, + "num_tokens": 42303104.0, + "step": 587 + }, + { + "epoch": 0.36702687941325635, + "grad_norm": 0.48036956787109375, + "learning_rate": 7.648e-07, + "loss": 0.3623, + "mean_token_accuracy": 0.9198840446770191, + "num_tokens": 42374253.0, + "step": 588 + }, + { + "epoch": 0.3676510747864081, + "grad_norm": 0.6162477731704712, + "learning_rate": 7.643999999999999e-07, + "loss": 0.3427, + "mean_token_accuracy": 0.9269851222634315, + "num_tokens": 42451484.0, + "step": 589 + }, + { + "epoch": 0.36827527015955996, + "grad_norm": 0.6017821431159973, + "learning_rate": 7.64e-07, + "loss": 0.3463, + "mean_token_accuracy": 0.9242015965282917, + "num_tokens": 42524055.0, + "step": 590 + }, + { + "epoch": 0.36889946553271175, + "grad_norm": 0.7923152446746826, + "learning_rate": 7.635999999999999e-07, + "loss": 0.3401, + "mean_token_accuracy": 0.9252413921058178, + "num_tokens": 42599454.0, + "step": 591 + }, + { + "epoch": 0.3695236609058635, + "grad_norm": 0.452820360660553, + "learning_rate": 7.632e-07, + "loss": 0.4054, + "mean_token_accuracy": 0.9114637114107609, + "num_tokens": 42666310.0, + "step": 592 + }, + { + "epoch": 0.3701478562790153, + "grad_norm": 0.5999427437782288, + "learning_rate": 7.628e-07, + "loss": 0.3438, + "mean_token_accuracy": 0.9267684370279312, + "num_tokens": 42739825.0, + "step": 593 + }, + { + "epoch": 0.37077205165216714, + "grad_norm": 0.7625322937965393, + "learning_rate": 7.623999999999999e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9250470995903015, + "num_tokens": 42816765.0, + "step": 594 + }, + { + "epoch": 0.3713962470253189, + "grad_norm": 0.6362447738647461, + "learning_rate": 7.62e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9234643131494522, + "num_tokens": 42890591.0, + "step": 595 + }, + { + "epoch": 0.3720204423984707, + "grad_norm": 0.5599761009216309, + "learning_rate": 7.616e-07, + "loss": 0.3412, + "mean_token_accuracy": 0.9247772358357906, + "num_tokens": 42961701.0, + "step": 596 + }, + { + "epoch": 0.37264463777162254, + "grad_norm": 0.5791319012641907, + "learning_rate": 7.611999999999999e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9239939153194427, + "num_tokens": 43033519.0, + "step": 597 + }, + { + "epoch": 0.3732688331447743, + "grad_norm": 0.5624537467956543, + "learning_rate": 7.608e-07, + "loss": 0.3296, + "mean_token_accuracy": 0.9278119280934334, + "num_tokens": 43105774.0, + "step": 598 + }, + { + "epoch": 0.3738930285179261, + "grad_norm": 0.7942206859588623, + "learning_rate": 7.604e-07, + "loss": 0.3631, + "mean_token_accuracy": 0.9214097931981087, + "num_tokens": 43175469.0, + "step": 599 + }, + { + "epoch": 0.3745172238910779, + "grad_norm": 0.34156644344329834, + "learning_rate": 7.599999999999999e-07, + "loss": 0.3381, + "mean_token_accuracy": 0.925388790667057, + "num_tokens": 43244559.0, + "step": 600 + }, + { + "epoch": 0.3751414192642297, + "grad_norm": 0.2829967737197876, + "learning_rate": 7.596e-07, + "loss": 0.3615, + "mean_token_accuracy": 0.9234224781394005, + "num_tokens": 43316161.0, + "step": 601 + }, + { + "epoch": 0.3757656146373815, + "grad_norm": 0.6025108098983765, + "learning_rate": 7.592e-07, + "loss": 0.3705, + "mean_token_accuracy": 0.919626459479332, + "num_tokens": 43386130.0, + "step": 602 + }, + { + "epoch": 0.3763898100105333, + "grad_norm": 0.5837751030921936, + "learning_rate": 7.588e-07, + "loss": 0.3318, + "mean_token_accuracy": 0.9294660799205303, + "num_tokens": 43460187.0, + "step": 603 + }, + { + "epoch": 0.37701400538368507, + "grad_norm": 0.3845606744289398, + "learning_rate": 7.583999999999999e-07, + "loss": 0.341, + "mean_token_accuracy": 0.9264329858124256, + "num_tokens": 43532441.0, + "step": 604 + }, + { + "epoch": 0.3776382007568369, + "grad_norm": 0.40432336926460266, + "learning_rate": 7.58e-07, + "loss": 0.3305, + "mean_token_accuracy": 0.9207708723843098, + "num_tokens": 43603439.0, + "step": 605 + }, + { + "epoch": 0.3782623961299887, + "grad_norm": 0.5441488027572632, + "learning_rate": 7.576000000000001e-07, + "loss": 0.3213, + "mean_token_accuracy": 0.9323575422167778, + "num_tokens": 43678349.0, + "step": 606 + }, + { + "epoch": 0.37888659150314047, + "grad_norm": 0.5424903035163879, + "learning_rate": 7.571999999999999e-07, + "loss": 0.3629, + "mean_token_accuracy": 0.9164081439375877, + "num_tokens": 43748106.0, + "step": 607 + }, + { + "epoch": 0.3795107868762923, + "grad_norm": 0.5946375727653503, + "learning_rate": 7.568e-07, + "loss": 0.3344, + "mean_token_accuracy": 0.9272113926708698, + "num_tokens": 43818438.0, + "step": 608 + }, + { + "epoch": 0.3801349822494441, + "grad_norm": 0.9699196815490723, + "learning_rate": 7.564e-07, + "loss": 0.3225, + "mean_token_accuracy": 0.9256913587450981, + "num_tokens": 43890611.0, + "step": 609 + }, + { + "epoch": 0.38075917762259587, + "grad_norm": 0.4665364921092987, + "learning_rate": 7.559999999999999e-07, + "loss": 0.3366, + "mean_token_accuracy": 0.9285813868045807, + "num_tokens": 43967137.0, + "step": 610 + }, + { + "epoch": 0.38138337299574765, + "grad_norm": 0.5336482524871826, + "learning_rate": 7.556e-07, + "loss": 0.344, + "mean_token_accuracy": 0.9211304225027561, + "num_tokens": 44037936.0, + "step": 611 + }, + { + "epoch": 0.3820075683688995, + "grad_norm": 0.5326005816459656, + "learning_rate": 7.552e-07, + "loss": 0.3637, + "mean_token_accuracy": 0.9215891696512699, + "num_tokens": 44111609.0, + "step": 612 + }, + { + "epoch": 0.38263176374205127, + "grad_norm": 0.27235645055770874, + "learning_rate": 7.548e-07, + "loss": 0.3398, + "mean_token_accuracy": 0.9236892722547054, + "num_tokens": 44185059.0, + "step": 613 + }, + { + "epoch": 0.38325595911520305, + "grad_norm": 0.47155284881591797, + "learning_rate": 7.543999999999999e-07, + "loss": 0.3316, + "mean_token_accuracy": 0.9268237687647343, + "num_tokens": 44258343.0, + "step": 614 + }, + { + "epoch": 0.38388015448835483, + "grad_norm": 0.3805808424949646, + "learning_rate": 7.54e-07, + "loss": 0.3426, + "mean_token_accuracy": 0.9244777448475361, + "num_tokens": 44328611.0, + "step": 615 + }, + { + "epoch": 0.38450434986150667, + "grad_norm": 0.5564144849777222, + "learning_rate": 7.536e-07, + "loss": 0.3464, + "mean_token_accuracy": 0.9197547808289528, + "num_tokens": 44399191.0, + "step": 616 + }, + { + "epoch": 0.38512854523465845, + "grad_norm": 0.5806828737258911, + "learning_rate": 7.531999999999999e-07, + "loss": 0.2991, + "mean_token_accuracy": 0.9331597909331322, + "num_tokens": 44474432.0, + "step": 617 + }, + { + "epoch": 0.38575274060781023, + "grad_norm": 0.5906999707221985, + "learning_rate": 7.528e-07, + "loss": 0.3572, + "mean_token_accuracy": 0.9175950661301613, + "num_tokens": 44543952.0, + "step": 618 + }, + { + "epoch": 0.38637693598096207, + "grad_norm": 0.3685188889503479, + "learning_rate": 7.523999999999999e-07, + "loss": 0.345, + "mean_token_accuracy": 0.9218173548579216, + "num_tokens": 44615331.0, + "step": 619 + }, + { + "epoch": 0.38700113135411385, + "grad_norm": 0.42963331937789917, + "learning_rate": 7.52e-07, + "loss": 0.3824, + "mean_token_accuracy": 0.9150598384439945, + "num_tokens": 44685793.0, + "step": 620 + }, + { + "epoch": 0.38762532672726563, + "grad_norm": 0.7856587171554565, + "learning_rate": 7.516e-07, + "loss": 0.3391, + "mean_token_accuracy": 0.9240389838814735, + "num_tokens": 44760555.0, + "step": 621 + }, + { + "epoch": 0.3882495221004174, + "grad_norm": 0.6671410799026489, + "learning_rate": 7.511999999999999e-07, + "loss": 0.3251, + "mean_token_accuracy": 0.9271936528384686, + "num_tokens": 44830025.0, + "step": 622 + }, + { + "epoch": 0.38887371747356925, + "grad_norm": 0.36151447892189026, + "learning_rate": 7.508e-07, + "loss": 0.3365, + "mean_token_accuracy": 0.9240407310426235, + "num_tokens": 44905764.0, + "step": 623 + }, + { + "epoch": 0.38949791284672103, + "grad_norm": 0.38457614183425903, + "learning_rate": 7.503999999999999e-07, + "loss": 0.3688, + "mean_token_accuracy": 0.9188949130475521, + "num_tokens": 44975012.0, + "step": 624 + }, + { + "epoch": 0.3901221082198728, + "grad_norm": 0.3652454614639282, + "learning_rate": 7.5e-07, + "loss": 0.3142, + "mean_token_accuracy": 0.9310969449579716, + "num_tokens": 45045839.0, + "step": 625 + }, + { + "epoch": 0.3907463035930246, + "grad_norm": 0.5461031794548035, + "learning_rate": 7.496e-07, + "loss": 0.3425, + "mean_token_accuracy": 0.9251462630927563, + "num_tokens": 45117462.0, + "step": 626 + }, + { + "epoch": 0.3913704989661764, + "grad_norm": 0.4545503854751587, + "learning_rate": 7.492e-07, + "loss": 0.383, + "mean_token_accuracy": 0.9178108982741833, + "num_tokens": 45185548.0, + "step": 627 + }, + { + "epoch": 0.3919946943393282, + "grad_norm": 0.38261616230010986, + "learning_rate": 7.488e-07, + "loss": 0.3504, + "mean_token_accuracy": 0.920975849032402, + "num_tokens": 45261338.0, + "step": 628 + }, + { + "epoch": 0.39261888971248, + "grad_norm": 0.2625387907028198, + "learning_rate": 7.483999999999999e-07, + "loss": 0.3511, + "mean_token_accuracy": 0.9234513938426971, + "num_tokens": 45334516.0, + "step": 629 + }, + { + "epoch": 0.3932430850856318, + "grad_norm": 0.48953914642333984, + "learning_rate": 7.48e-07, + "loss": 0.3267, + "mean_token_accuracy": 0.9276548400521278, + "num_tokens": 45408844.0, + "step": 630 + }, + { + "epoch": 0.3938672804587836, + "grad_norm": 0.25507792830467224, + "learning_rate": 7.476e-07, + "loss": 0.3275, + "mean_token_accuracy": 0.9272936768829823, + "num_tokens": 45484129.0, + "step": 631 + }, + { + "epoch": 0.3944914758319354, + "grad_norm": 0.7709338068962097, + "learning_rate": 7.471999999999999e-07, + "loss": 0.3385, + "mean_token_accuracy": 0.9237735457718372, + "num_tokens": 45557220.0, + "step": 632 + }, + { + "epoch": 0.39511567120508717, + "grad_norm": 0.5819105505943298, + "learning_rate": 7.468e-07, + "loss": 0.3112, + "mean_token_accuracy": 0.9251358509063721, + "num_tokens": 45631093.0, + "step": 633 + }, + { + "epoch": 0.395739866578239, + "grad_norm": 0.26736271381378174, + "learning_rate": 7.464e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.926128089427948, + "num_tokens": 45702723.0, + "step": 634 + }, + { + "epoch": 0.3963640619513908, + "grad_norm": 0.4306083917617798, + "learning_rate": 7.459999999999999e-07, + "loss": 0.3154, + "mean_token_accuracy": 0.9313743449747562, + "num_tokens": 45774922.0, + "step": 635 + }, + { + "epoch": 0.39698825732454257, + "grad_norm": 0.40494468808174133, + "learning_rate": 7.456e-07, + "loss": 0.3519, + "mean_token_accuracy": 0.9186866208910942, + "num_tokens": 45846528.0, + "step": 636 + }, + { + "epoch": 0.39761245269769435, + "grad_norm": 0.5655471086502075, + "learning_rate": 7.452e-07, + "loss": 0.3445, + "mean_token_accuracy": 0.9245801381766796, + "num_tokens": 45915531.0, + "step": 637 + }, + { + "epoch": 0.3982366480708462, + "grad_norm": 0.2741180956363678, + "learning_rate": 7.447999999999999e-07, + "loss": 0.3468, + "mean_token_accuracy": 0.9221871383488178, + "num_tokens": 45990719.0, + "step": 638 + }, + { + "epoch": 0.39886084344399797, + "grad_norm": 0.29522502422332764, + "learning_rate": 7.443999999999999e-07, + "loss": 0.3547, + "mean_token_accuracy": 0.9206781312823296, + "num_tokens": 46061240.0, + "step": 639 + }, + { + "epoch": 0.39948503881714975, + "grad_norm": 0.6466435790061951, + "learning_rate": 7.44e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9293878078460693, + "num_tokens": 46135944.0, + "step": 640 + }, + { + "epoch": 0.4001092341903016, + "grad_norm": 0.29942116141319275, + "learning_rate": 7.436e-07, + "loss": 0.3458, + "mean_token_accuracy": 0.925358172506094, + "num_tokens": 46207508.0, + "step": 641 + }, + { + "epoch": 0.40073342956345337, + "grad_norm": 0.6349466443061829, + "learning_rate": 7.431999999999999e-07, + "loss": 0.3321, + "mean_token_accuracy": 0.9232594855129719, + "num_tokens": 46278741.0, + "step": 642 + }, + { + "epoch": 0.40135762493660515, + "grad_norm": 0.48047778010368347, + "learning_rate": 7.428e-07, + "loss": 0.3527, + "mean_token_accuracy": 0.9166908822953701, + "num_tokens": 46347139.0, + "step": 643 + }, + { + "epoch": 0.40198182030975693, + "grad_norm": 0.418552964925766, + "learning_rate": 7.423999999999999e-07, + "loss": 0.3519, + "mean_token_accuracy": 0.9196455329656601, + "num_tokens": 46415780.0, + "step": 644 + }, + { + "epoch": 0.40260601568290877, + "grad_norm": 0.7869686484336853, + "learning_rate": 7.42e-07, + "loss": 0.3725, + "mean_token_accuracy": 0.9172872118651867, + "num_tokens": 46484109.0, + "step": 645 + }, + { + "epoch": 0.40323021105606055, + "grad_norm": 0.3682011663913727, + "learning_rate": 7.416e-07, + "loss": 0.317, + "mean_token_accuracy": 0.9321021102368832, + "num_tokens": 46558889.0, + "step": 646 + }, + { + "epoch": 0.40385440642921233, + "grad_norm": 0.5389231443405151, + "learning_rate": 7.411999999999999e-07, + "loss": 0.3306, + "mean_token_accuracy": 0.9275304302573204, + "num_tokens": 46634325.0, + "step": 647 + }, + { + "epoch": 0.4044786018023641, + "grad_norm": 0.2784671187400818, + "learning_rate": 7.408e-07, + "loss": 0.3292, + "mean_token_accuracy": 0.9264209941029549, + "num_tokens": 46705187.0, + "step": 648 + }, + { + "epoch": 0.40510279717551595, + "grad_norm": 0.6616075038909912, + "learning_rate": 7.403999999999999e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9315904937684536, + "num_tokens": 46782367.0, + "step": 649 + }, + { + "epoch": 0.40572699254866773, + "grad_norm": 0.5459262728691101, + "learning_rate": 7.4e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9320596344769001, + "num_tokens": 46853397.0, + "step": 650 + }, + { + "epoch": 0.4063511879218195, + "grad_norm": 0.37146082520484924, + "learning_rate": 7.396e-07, + "loss": 0.3389, + "mean_token_accuracy": 0.9261991530656815, + "num_tokens": 46920787.0, + "step": 651 + }, + { + "epoch": 0.40697538329497135, + "grad_norm": 0.38181981444358826, + "learning_rate": 7.392e-07, + "loss": 0.3666, + "mean_token_accuracy": 0.9177272506058216, + "num_tokens": 46994472.0, + "step": 652 + }, + { + "epoch": 0.40759957866812313, + "grad_norm": 0.41725292801856995, + "learning_rate": 7.388e-07, + "loss": 0.3277, + "mean_token_accuracy": 0.9293075203895569, + "num_tokens": 47068619.0, + "step": 653 + }, + { + "epoch": 0.4082237740412749, + "grad_norm": 0.4707614481449127, + "learning_rate": 7.383999999999999e-07, + "loss": 0.357, + "mean_token_accuracy": 0.9203765951097012, + "num_tokens": 47136036.0, + "step": 654 + }, + { + "epoch": 0.4088479694144267, + "grad_norm": 0.414760947227478, + "learning_rate": 7.38e-07, + "loss": 0.3512, + "mean_token_accuracy": 0.9205658622086048, + "num_tokens": 47207154.0, + "step": 655 + }, + { + "epoch": 0.40947216478757853, + "grad_norm": 0.6463868021965027, + "learning_rate": 7.376e-07, + "loss": 0.3118, + "mean_token_accuracy": 0.931792251765728, + "num_tokens": 47283959.0, + "step": 656 + }, + { + "epoch": 0.4100963601607303, + "grad_norm": 0.37666013836860657, + "learning_rate": 7.371999999999999e-07, + "loss": 0.3435, + "mean_token_accuracy": 0.9253863766789436, + "num_tokens": 47354576.0, + "step": 657 + }, + { + "epoch": 0.4107205555338821, + "grad_norm": 0.5181844234466553, + "learning_rate": 7.368e-07, + "loss": 0.3291, + "mean_token_accuracy": 0.9307320602238178, + "num_tokens": 47428241.0, + "step": 658 + }, + { + "epoch": 0.41134475090703393, + "grad_norm": 0.37594252824783325, + "learning_rate": 7.364000000000001e-07, + "loss": 0.3175, + "mean_token_accuracy": 0.9273154586553574, + "num_tokens": 47499461.0, + "step": 659 + }, + { + "epoch": 0.4119689462801857, + "grad_norm": 0.5229082703590393, + "learning_rate": 7.359999999999999e-07, + "loss": 0.3476, + "mean_token_accuracy": 0.9239908717572689, + "num_tokens": 47573290.0, + "step": 660 + }, + { + "epoch": 0.4125931416533375, + "grad_norm": 0.36330297589302063, + "learning_rate": 7.356e-07, + "loss": 0.3656, + "mean_token_accuracy": 0.922837421298027, + "num_tokens": 47643897.0, + "step": 661 + }, + { + "epoch": 0.4132173370264893, + "grad_norm": 0.5052156448364258, + "learning_rate": 7.352e-07, + "loss": 0.3318, + "mean_token_accuracy": 0.9263353794813156, + "num_tokens": 47716736.0, + "step": 662 + }, + { + "epoch": 0.4138415323996411, + "grad_norm": 0.2872786819934845, + "learning_rate": 7.347999999999999e-07, + "loss": 0.3451, + "mean_token_accuracy": 0.9226055145263672, + "num_tokens": 47788926.0, + "step": 663 + }, + { + "epoch": 0.4144657277727929, + "grad_norm": 0.4757533669471741, + "learning_rate": 7.344e-07, + "loss": 0.3726, + "mean_token_accuracy": 0.9131042547523975, + "num_tokens": 47855829.0, + "step": 664 + }, + { + "epoch": 0.4150899231459447, + "grad_norm": 0.49452951550483704, + "learning_rate": 7.34e-07, + "loss": 0.3473, + "mean_token_accuracy": 0.9252515956759453, + "num_tokens": 47927776.0, + "step": 665 + }, + { + "epoch": 0.41571411851909645, + "grad_norm": 0.44129571318626404, + "learning_rate": 7.336e-07, + "loss": 0.3153, + "mean_token_accuracy": 0.9331275187432766, + "num_tokens": 47994963.0, + "step": 666 + }, + { + "epoch": 0.4163383138922483, + "grad_norm": 0.4779670238494873, + "learning_rate": 7.331999999999999e-07, + "loss": 0.3205, + "mean_token_accuracy": 0.932083748281002, + "num_tokens": 48067456.0, + "step": 667 + }, + { + "epoch": 0.41696250926540007, + "grad_norm": 0.35190168023109436, + "learning_rate": 7.328e-07, + "loss": 0.3184, + "mean_token_accuracy": 0.9310644865036011, + "num_tokens": 48140224.0, + "step": 668 + }, + { + "epoch": 0.41758670463855185, + "grad_norm": 0.4720968008041382, + "learning_rate": 7.324e-07, + "loss": 0.3711, + "mean_token_accuracy": 0.9152346327900887, + "num_tokens": 48208398.0, + "step": 669 + }, + { + "epoch": 0.4182109000117037, + "grad_norm": 0.5012626051902771, + "learning_rate": 7.319999999999999e-07, + "loss": 0.3405, + "mean_token_accuracy": 0.9250470548868179, + "num_tokens": 48280914.0, + "step": 670 + }, + { + "epoch": 0.41883509538485547, + "grad_norm": 0.6238592863082886, + "learning_rate": 7.316e-07, + "loss": 0.3299, + "mean_token_accuracy": 0.9239915870130062, + "num_tokens": 48352906.0, + "step": 671 + }, + { + "epoch": 0.41945929075800725, + "grad_norm": 0.4548547565937042, + "learning_rate": 7.311999999999999e-07, + "loss": 0.3417, + "mean_token_accuracy": 0.920001145452261, + "num_tokens": 48426029.0, + "step": 672 + }, + { + "epoch": 0.42008348613115903, + "grad_norm": 0.40740537643432617, + "learning_rate": 7.308e-07, + "loss": 0.364, + "mean_token_accuracy": 0.9215504974126816, + "num_tokens": 48497296.0, + "step": 673 + }, + { + "epoch": 0.42070768150431087, + "grad_norm": 0.34685757756233215, + "learning_rate": 7.304e-07, + "loss": 0.32, + "mean_token_accuracy": 0.9303984344005585, + "num_tokens": 48571241.0, + "step": 674 + }, + { + "epoch": 0.42133187687746265, + "grad_norm": 2.8446075916290283, + "learning_rate": 7.3e-07, + "loss": 0.3484, + "mean_token_accuracy": 0.9234076887369156, + "num_tokens": 48644885.0, + "step": 675 + }, + { + "epoch": 0.42195607225061443, + "grad_norm": 0.5381395816802979, + "learning_rate": 7.296e-07, + "loss": 0.344, + "mean_token_accuracy": 0.9226926155388355, + "num_tokens": 48717883.0, + "step": 676 + }, + { + "epoch": 0.4225802676237662, + "grad_norm": 0.3551507294178009, + "learning_rate": 7.291999999999999e-07, + "loss": 0.3499, + "mean_token_accuracy": 0.9218867346644402, + "num_tokens": 48788914.0, + "step": 677 + }, + { + "epoch": 0.42320446299691805, + "grad_norm": 0.5331665873527527, + "learning_rate": 7.288e-07, + "loss": 0.3602, + "mean_token_accuracy": 0.9145861491560936, + "num_tokens": 48859661.0, + "step": 678 + }, + { + "epoch": 0.42382865837006983, + "grad_norm": 0.27044275403022766, + "learning_rate": 7.284e-07, + "loss": 0.3292, + "mean_token_accuracy": 0.9240933321416378, + "num_tokens": 48932733.0, + "step": 679 + }, + { + "epoch": 0.4244528537432216, + "grad_norm": 0.4443718492984772, + "learning_rate": 7.28e-07, + "loss": 0.346, + "mean_token_accuracy": 0.9242341965436935, + "num_tokens": 49008579.0, + "step": 680 + }, + { + "epoch": 0.42507704911637345, + "grad_norm": 0.36047619581222534, + "learning_rate": 7.276e-07, + "loss": 0.3097, + "mean_token_accuracy": 0.9259445257484913, + "num_tokens": 49077572.0, + "step": 681 + }, + { + "epoch": 0.42570124448952523, + "grad_norm": 0.4974146783351898, + "learning_rate": 7.271999999999999e-07, + "loss": 0.3585, + "mean_token_accuracy": 0.9235406182706356, + "num_tokens": 49149258.0, + "step": 682 + }, + { + "epoch": 0.426325439862677, + "grad_norm": 0.34493857622146606, + "learning_rate": 7.268e-07, + "loss": 0.3131, + "mean_token_accuracy": 0.9315989725291729, + "num_tokens": 49221956.0, + "step": 683 + }, + { + "epoch": 0.4269496352358288, + "grad_norm": 0.4466032385826111, + "learning_rate": 7.264e-07, + "loss": 0.3372, + "mean_token_accuracy": 0.9272586852312088, + "num_tokens": 49298313.0, + "step": 684 + }, + { + "epoch": 0.42757383060898063, + "grad_norm": 0.46275943517684937, + "learning_rate": 7.259999999999999e-07, + "loss": 0.3319, + "mean_token_accuracy": 0.927916556596756, + "num_tokens": 49369108.0, + "step": 685 + }, + { + "epoch": 0.4281980259821324, + "grad_norm": 0.27612945437431335, + "learning_rate": 7.256e-07, + "loss": 0.3326, + "mean_token_accuracy": 0.9266219213604927, + "num_tokens": 49443585.0, + "step": 686 + }, + { + "epoch": 0.4288222213552842, + "grad_norm": 0.3892485499382019, + "learning_rate": 7.252e-07, + "loss": 0.3506, + "mean_token_accuracy": 0.9223848171532154, + "num_tokens": 49510772.0, + "step": 687 + }, + { + "epoch": 0.429446416728436, + "grad_norm": 0.35472822189331055, + "learning_rate": 7.247999999999999e-07, + "loss": 0.3339, + "mean_token_accuracy": 0.9262644350528717, + "num_tokens": 49581456.0, + "step": 688 + }, + { + "epoch": 0.4300706121015878, + "grad_norm": 0.5208513736724854, + "learning_rate": 7.244e-07, + "loss": 0.3695, + "mean_token_accuracy": 0.9152507446706295, + "num_tokens": 49645190.0, + "step": 689 + }, + { + "epoch": 0.4306948074747396, + "grad_norm": 0.613875150680542, + "learning_rate": 7.24e-07, + "loss": 0.3322, + "mean_token_accuracy": 0.92716945707798, + "num_tokens": 49718362.0, + "step": 690 + }, + { + "epoch": 0.4313190028478914, + "grad_norm": 0.43399953842163086, + "learning_rate": 7.235999999999999e-07, + "loss": 0.331, + "mean_token_accuracy": 0.9249005541205406, + "num_tokens": 49791382.0, + "step": 691 + }, + { + "epoch": 0.4319431982210432, + "grad_norm": 0.35168054699897766, + "learning_rate": 7.231999999999999e-07, + "loss": 0.3832, + "mean_token_accuracy": 0.911179032176733, + "num_tokens": 49858666.0, + "step": 692 + }, + { + "epoch": 0.432567393594195, + "grad_norm": 0.48481592535972595, + "learning_rate": 7.228e-07, + "loss": 0.3038, + "mean_token_accuracy": 0.9313280507922173, + "num_tokens": 49934095.0, + "step": 693 + }, + { + "epoch": 0.4331915889673468, + "grad_norm": 0.4384796917438507, + "learning_rate": 7.224e-07, + "loss": 0.3042, + "mean_token_accuracy": 0.9318602569401264, + "num_tokens": 50011103.0, + "step": 694 + }, + { + "epoch": 0.43381578434049856, + "grad_norm": 1.4764841794967651, + "learning_rate": 7.219999999999999e-07, + "loss": 0.3374, + "mean_token_accuracy": 0.9262259677052498, + "num_tokens": 50089506.0, + "step": 695 + }, + { + "epoch": 0.4344399797136504, + "grad_norm": 0.4757544994354248, + "learning_rate": 7.216e-07, + "loss": 0.3466, + "mean_token_accuracy": 0.9230071902275085, + "num_tokens": 50158682.0, + "step": 696 + }, + { + "epoch": 0.4350641750868022, + "grad_norm": 0.4512925446033478, + "learning_rate": 7.211999999999999e-07, + "loss": 0.3477, + "mean_token_accuracy": 0.9217991046607494, + "num_tokens": 50230570.0, + "step": 697 + }, + { + "epoch": 0.43568837045995396, + "grad_norm": 0.47074785828590393, + "learning_rate": 7.207999999999999e-07, + "loss": 0.3376, + "mean_token_accuracy": 0.9200443737208843, + "num_tokens": 50298085.0, + "step": 698 + }, + { + "epoch": 0.43631256583310574, + "grad_norm": 0.3686661422252655, + "learning_rate": 7.204e-07, + "loss": 0.3294, + "mean_token_accuracy": 0.9283047094941139, + "num_tokens": 50374486.0, + "step": 699 + }, + { + "epoch": 0.4369367612062576, + "grad_norm": 0.3725312650203705, + "learning_rate": 7.2e-07, + "loss": 0.3618, + "mean_token_accuracy": 0.9213119931519032, + "num_tokens": 50443880.0, + "step": 700 + }, + { + "epoch": 0.43756095657940935, + "grad_norm": 0.2869217097759247, + "learning_rate": 7.196e-07, + "loss": 0.3395, + "mean_token_accuracy": 0.9261060692369938, + "num_tokens": 50513420.0, + "step": 701 + }, + { + "epoch": 0.43818515195256114, + "grad_norm": 0.32761839032173157, + "learning_rate": 7.191999999999999e-07, + "loss": 0.3138, + "mean_token_accuracy": 0.9308922737836838, + "num_tokens": 50590229.0, + "step": 702 + }, + { + "epoch": 0.438809347325713, + "grad_norm": 0.48981109261512756, + "learning_rate": 7.188e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9335628412663937, + "num_tokens": 50667701.0, + "step": 703 + }, + { + "epoch": 0.43943354269886475, + "grad_norm": 0.26082706451416016, + "learning_rate": 7.184e-07, + "loss": 0.3096, + "mean_token_accuracy": 0.9277554415166378, + "num_tokens": 50738076.0, + "step": 704 + }, + { + "epoch": 0.44005773807201654, + "grad_norm": 0.3906603753566742, + "learning_rate": 7.179999999999999e-07, + "loss": 0.3502, + "mean_token_accuracy": 0.921177189797163, + "num_tokens": 50807586.0, + "step": 705 + }, + { + "epoch": 0.4406819334451683, + "grad_norm": 0.3802482485771179, + "learning_rate": 7.176e-07, + "loss": 0.3297, + "mean_token_accuracy": 0.9246837832033634, + "num_tokens": 50876848.0, + "step": 706 + }, + { + "epoch": 0.44130612881832015, + "grad_norm": 0.3646096885204315, + "learning_rate": 7.171999999999999e-07, + "loss": 0.3563, + "mean_token_accuracy": 0.9209400676190853, + "num_tokens": 50947419.0, + "step": 707 + }, + { + "epoch": 0.44193032419147193, + "grad_norm": 0.4562234878540039, + "learning_rate": 7.168e-07, + "loss": 0.349, + "mean_token_accuracy": 0.9228729270398617, + "num_tokens": 51019131.0, + "step": 708 + }, + { + "epoch": 0.4425545195646237, + "grad_norm": 0.44281619787216187, + "learning_rate": 7.164e-07, + "loss": 0.3346, + "mean_token_accuracy": 0.9228230901062489, + "num_tokens": 51085508.0, + "step": 709 + }, + { + "epoch": 0.4431787149377755, + "grad_norm": 0.3311275541782379, + "learning_rate": 7.159999999999999e-07, + "loss": 0.341, + "mean_token_accuracy": 0.921529233455658, + "num_tokens": 51157007.0, + "step": 710 + }, + { + "epoch": 0.44380291031092733, + "grad_norm": 0.3935934603214264, + "learning_rate": 7.156e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9283835887908936, + "num_tokens": 51227104.0, + "step": 711 + }, + { + "epoch": 0.4444271056840791, + "grad_norm": 0.4497720003128052, + "learning_rate": 7.151999999999999e-07, + "loss": 0.3261, + "mean_token_accuracy": 0.9282115884125233, + "num_tokens": 51298974.0, + "step": 712 + }, + { + "epoch": 0.4450513010572309, + "grad_norm": 0.5246007442474365, + "learning_rate": 7.147999999999999e-07, + "loss": 0.3627, + "mean_token_accuracy": 0.9195816032588482, + "num_tokens": 51372135.0, + "step": 713 + }, + { + "epoch": 0.44567549643038273, + "grad_norm": 0.735645592212677, + "learning_rate": 7.144e-07, + "loss": 0.3665, + "mean_token_accuracy": 0.9182613231241703, + "num_tokens": 51439662.0, + "step": 714 + }, + { + "epoch": 0.4462996918035345, + "grad_norm": 0.26671546697616577, + "learning_rate": 7.14e-07, + "loss": 0.3315, + "mean_token_accuracy": 0.9239188209176064, + "num_tokens": 51508083.0, + "step": 715 + }, + { + "epoch": 0.4469238871766863, + "grad_norm": 0.27894970774650574, + "learning_rate": 7.135999999999999e-07, + "loss": 0.3041, + "mean_token_accuracy": 0.9310220927000046, + "num_tokens": 51582486.0, + "step": 716 + }, + { + "epoch": 0.4475480825498381, + "grad_norm": 0.5564171075820923, + "learning_rate": 7.131999999999999e-07, + "loss": 0.3479, + "mean_token_accuracy": 0.9212802983820438, + "num_tokens": 51655807.0, + "step": 717 + }, + { + "epoch": 0.4481722779229899, + "grad_norm": 0.34685245156288147, + "learning_rate": 7.128e-07, + "loss": 0.3568, + "mean_token_accuracy": 0.9221464432775974, + "num_tokens": 51730645.0, + "step": 718 + }, + { + "epoch": 0.4487964732961417, + "grad_norm": 0.6511409878730774, + "learning_rate": 7.124e-07, + "loss": 0.3163, + "mean_token_accuracy": 0.9305948689579964, + "num_tokens": 51807543.0, + "step": 719 + }, + { + "epoch": 0.4494206686692935, + "grad_norm": 0.40851685404777527, + "learning_rate": 7.119999999999999e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9276647195219994, + "num_tokens": 51879153.0, + "step": 720 + }, + { + "epoch": 0.45004486404244526, + "grad_norm": 0.3345511555671692, + "learning_rate": 7.116e-07, + "loss": 0.3031, + "mean_token_accuracy": 0.9311193153262138, + "num_tokens": 51950755.0, + "step": 721 + }, + { + "epoch": 0.4506690594155971, + "grad_norm": 1.2511225938796997, + "learning_rate": 7.112000000000001e-07, + "loss": 0.3638, + "mean_token_accuracy": 0.9171359911561012, + "num_tokens": 52021123.0, + "step": 722 + }, + { + "epoch": 0.4512932547887489, + "grad_norm": 0.5214754939079285, + "learning_rate": 7.107999999999999e-07, + "loss": 0.3163, + "mean_token_accuracy": 0.9283814020454884, + "num_tokens": 52089916.0, + "step": 723 + }, + { + "epoch": 0.45191745016190066, + "grad_norm": 0.26029619574546814, + "learning_rate": 7.104e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9296494275331497, + "num_tokens": 52162690.0, + "step": 724 + }, + { + "epoch": 0.4525416455350525, + "grad_norm": 0.24395406246185303, + "learning_rate": 7.1e-07, + "loss": 0.3419, + "mean_token_accuracy": 0.9235851019620895, + "num_tokens": 52233543.0, + "step": 725 + }, + { + "epoch": 0.4531658409082043, + "grad_norm": 0.39033153653144836, + "learning_rate": 7.096e-07, + "loss": 0.3495, + "mean_token_accuracy": 0.9230191260576248, + "num_tokens": 52300820.0, + "step": 726 + }, + { + "epoch": 0.45379003628135606, + "grad_norm": 0.39378222823143005, + "learning_rate": 7.092e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.9316980689764023, + "num_tokens": 52377765.0, + "step": 727 + }, + { + "epoch": 0.45441423165450784, + "grad_norm": 0.333275705575943, + "learning_rate": 7.088e-07, + "loss": 0.33, + "mean_token_accuracy": 0.9285393580794334, + "num_tokens": 52448172.0, + "step": 728 + }, + { + "epoch": 0.4550384270276597, + "grad_norm": 0.42309725284576416, + "learning_rate": 7.084e-07, + "loss": 0.3302, + "mean_token_accuracy": 0.9275981672108173, + "num_tokens": 52524993.0, + "step": 729 + }, + { + "epoch": 0.45566262240081146, + "grad_norm": 0.5311827063560486, + "learning_rate": 7.079999999999999e-07, + "loss": 0.2751, + "mean_token_accuracy": 0.9398890398442745, + "num_tokens": 52604810.0, + "step": 730 + }, + { + "epoch": 0.45628681777396324, + "grad_norm": 0.404119610786438, + "learning_rate": 7.076e-07, + "loss": 0.3103, + "mean_token_accuracy": 0.9297262318432331, + "num_tokens": 52680013.0, + "step": 731 + }, + { + "epoch": 0.456911013147115, + "grad_norm": 0.2676778733730316, + "learning_rate": 7.072e-07, + "loss": 0.3742, + "mean_token_accuracy": 0.9136869981884956, + "num_tokens": 52748122.0, + "step": 732 + }, + { + "epoch": 0.45753520852026686, + "grad_norm": 0.3234860599040985, + "learning_rate": 7.068e-07, + "loss": 0.3596, + "mean_token_accuracy": 0.9213199652731419, + "num_tokens": 52822157.0, + "step": 733 + }, + { + "epoch": 0.45815940389341864, + "grad_norm": 0.21793296933174133, + "learning_rate": 7.064e-07, + "loss": 0.3601, + "mean_token_accuracy": 0.9182924181222916, + "num_tokens": 52893125.0, + "step": 734 + }, + { + "epoch": 0.4587835992665704, + "grad_norm": 0.45688652992248535, + "learning_rate": 7.059999999999999e-07, + "loss": 0.3333, + "mean_token_accuracy": 0.9230261892080307, + "num_tokens": 52967530.0, + "step": 735 + }, + { + "epoch": 0.45940779463972226, + "grad_norm": 0.5506690740585327, + "learning_rate": 7.056e-07, + "loss": 0.3257, + "mean_token_accuracy": 0.9282068461179733, + "num_tokens": 53038033.0, + "step": 736 + }, + { + "epoch": 0.46003199001287404, + "grad_norm": 0.39481911063194275, + "learning_rate": 7.052e-07, + "loss": 0.386, + "mean_token_accuracy": 0.9134550280869007, + "num_tokens": 53108827.0, + "step": 737 + }, + { + "epoch": 0.4606561853860258, + "grad_norm": 0.39907434582710266, + "learning_rate": 7.047999999999999e-07, + "loss": 0.3391, + "mean_token_accuracy": 0.9239070154726505, + "num_tokens": 53177694.0, + "step": 738 + }, + { + "epoch": 0.4612803807591776, + "grad_norm": 0.5079224705696106, + "learning_rate": 7.044e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.926607720553875, + "num_tokens": 53252577.0, + "step": 739 + }, + { + "epoch": 0.46190457613232944, + "grad_norm": 0.28749263286590576, + "learning_rate": 7.04e-07, + "loss": 0.324, + "mean_token_accuracy": 0.9308020249009132, + "num_tokens": 53323806.0, + "step": 740 + }, + { + "epoch": 0.4625287715054812, + "grad_norm": 0.2666906714439392, + "learning_rate": 7.035999999999999e-07, + "loss": 0.3384, + "mean_token_accuracy": 0.9271521978080273, + "num_tokens": 53398478.0, + "step": 741 + }, + { + "epoch": 0.463152966878633, + "grad_norm": 0.3235752284526825, + "learning_rate": 7.032e-07, + "loss": 0.3692, + "mean_token_accuracy": 0.9176539406180382, + "num_tokens": 53466260.0, + "step": 742 + }, + { + "epoch": 0.46377716225178484, + "grad_norm": 0.424331933259964, + "learning_rate": 7.028e-07, + "loss": 0.3174, + "mean_token_accuracy": 0.9246450252830982, + "num_tokens": 53539239.0, + "step": 743 + }, + { + "epoch": 0.4644013576249366, + "grad_norm": 0.256606787443161, + "learning_rate": 7.024e-07, + "loss": 0.3466, + "mean_token_accuracy": 0.9182081036269665, + "num_tokens": 53608741.0, + "step": 744 + }, + { + "epoch": 0.4650255529980884, + "grad_norm": 0.3169555962085724, + "learning_rate": 7.019999999999999e-07, + "loss": 0.3355, + "mean_token_accuracy": 0.9210515283048153, + "num_tokens": 53677629.0, + "step": 745 + }, + { + "epoch": 0.4656497483712402, + "grad_norm": 0.49976810812950134, + "learning_rate": 7.016e-07, + "loss": 0.3716, + "mean_token_accuracy": 0.9182851128280163, + "num_tokens": 53746893.0, + "step": 746 + }, + { + "epoch": 0.466273943744392, + "grad_norm": 0.48305338621139526, + "learning_rate": 7.012000000000001e-07, + "loss": 0.3783, + "mean_token_accuracy": 0.9136515147984028, + "num_tokens": 53814084.0, + "step": 747 + }, + { + "epoch": 0.4668981391175438, + "grad_norm": 0.3004201352596283, + "learning_rate": 7.007999999999999e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9266849867999554, + "num_tokens": 53888580.0, + "step": 748 + }, + { + "epoch": 0.4675223344906956, + "grad_norm": 0.248160257935524, + "learning_rate": 7.004e-07, + "loss": 0.346, + "mean_token_accuracy": 0.9221676141023636, + "num_tokens": 53961561.0, + "step": 749 + }, + { + "epoch": 0.46814652986384736, + "grad_norm": 0.4498157501220703, + "learning_rate": 7e-07, + "loss": 0.3635, + "mean_token_accuracy": 0.9175557233393192, + "num_tokens": 54030954.0, + "step": 750 + }, + { + "epoch": 0.4687707252369992, + "grad_norm": 0.40121790766716003, + "learning_rate": 6.995999999999999e-07, + "loss": 0.3449, + "mean_token_accuracy": 0.9215624295175076, + "num_tokens": 54106994.0, + "step": 751 + }, + { + "epoch": 0.469394920610151, + "grad_norm": 0.4183201193809509, + "learning_rate": 6.992e-07, + "loss": 0.3065, + "mean_token_accuracy": 0.9317567013204098, + "num_tokens": 54180104.0, + "step": 752 + }, + { + "epoch": 0.47001911598330276, + "grad_norm": 0.3203515410423279, + "learning_rate": 6.988e-07, + "loss": 0.3106, + "mean_token_accuracy": 0.9256347455084324, + "num_tokens": 54252966.0, + "step": 753 + }, + { + "epoch": 0.4706433113564546, + "grad_norm": 0.32995855808258057, + "learning_rate": 6.984e-07, + "loss": 0.3285, + "mean_token_accuracy": 0.9257500730454922, + "num_tokens": 54322889.0, + "step": 754 + }, + { + "epoch": 0.4712675067296064, + "grad_norm": 0.215129554271698, + "learning_rate": 6.979999999999999e-07, + "loss": 0.3506, + "mean_token_accuracy": 0.9233107268810272, + "num_tokens": 54394403.0, + "step": 755 + }, + { + "epoch": 0.47189170210275816, + "grad_norm": 0.4501388370990753, + "learning_rate": 6.976e-07, + "loss": 0.3222, + "mean_token_accuracy": 0.93061588332057, + "num_tokens": 54464671.0, + "step": 756 + }, + { + "epoch": 0.47251589747590994, + "grad_norm": 0.3878070116043091, + "learning_rate": 6.972e-07, + "loss": 0.3284, + "mean_token_accuracy": 0.9270447343587875, + "num_tokens": 54539439.0, + "step": 757 + }, + { + "epoch": 0.4731400928490618, + "grad_norm": 0.3112216889858246, + "learning_rate": 6.967999999999999e-07, + "loss": 0.3086, + "mean_token_accuracy": 0.9334950409829617, + "num_tokens": 54614382.0, + "step": 758 + }, + { + "epoch": 0.47376428822221356, + "grad_norm": 0.37291496992111206, + "learning_rate": 6.964e-07, + "loss": 0.3469, + "mean_token_accuracy": 0.9203985668718815, + "num_tokens": 54681933.0, + "step": 759 + }, + { + "epoch": 0.47438848359536534, + "grad_norm": 1.7890000343322754, + "learning_rate": 6.959999999999999e-07, + "loss": 0.3754, + "mean_token_accuracy": 0.9173572920262814, + "num_tokens": 54752095.0, + "step": 760 + }, + { + "epoch": 0.4750126789685171, + "grad_norm": 0.38452449440956116, + "learning_rate": 6.956e-07, + "loss": 0.3164, + "mean_token_accuracy": 0.9268531575798988, + "num_tokens": 54820617.0, + "step": 761 + }, + { + "epoch": 0.47563687434166896, + "grad_norm": 0.40700098872184753, + "learning_rate": 6.952e-07, + "loss": 0.3562, + "mean_token_accuracy": 0.9231256954371929, + "num_tokens": 54889992.0, + "step": 762 + }, + { + "epoch": 0.47626106971482074, + "grad_norm": 0.313430517911911, + "learning_rate": 6.947999999999999e-07, + "loss": 0.2851, + "mean_token_accuracy": 0.933523815125227, + "num_tokens": 54960485.0, + "step": 763 + }, + { + "epoch": 0.4768852650879725, + "grad_norm": 0.3233875632286072, + "learning_rate": 6.944e-07, + "loss": 0.3637, + "mean_token_accuracy": 0.9248353838920593, + "num_tokens": 55032811.0, + "step": 764 + }, + { + "epoch": 0.47750946046112436, + "grad_norm": 0.3850257396697998, + "learning_rate": 6.939999999999999e-07, + "loss": 0.3072, + "mean_token_accuracy": 0.9323156476020813, + "num_tokens": 55102615.0, + "step": 765 + }, + { + "epoch": 0.47813365583427614, + "grad_norm": 0.30128633975982666, + "learning_rate": 6.935999999999999e-07, + "loss": 0.377, + "mean_token_accuracy": 0.9120356626808643, + "num_tokens": 55174731.0, + "step": 766 + }, + { + "epoch": 0.4787578512074279, + "grad_norm": 0.4391093850135803, + "learning_rate": 6.932e-07, + "loss": 0.3509, + "mean_token_accuracy": 0.9172132723033428, + "num_tokens": 55243975.0, + "step": 767 + }, + { + "epoch": 0.4793820465805797, + "grad_norm": 0.38679075241088867, + "learning_rate": 6.928e-07, + "loss": 0.3403, + "mean_token_accuracy": 0.9184436909854412, + "num_tokens": 55311031.0, + "step": 768 + }, + { + "epoch": 0.48000624195373154, + "grad_norm": 0.29473400115966797, + "learning_rate": 6.924e-07, + "loss": 0.3042, + "mean_token_accuracy": 0.9334207512438297, + "num_tokens": 55382862.0, + "step": 769 + }, + { + "epoch": 0.4806304373268833, + "grad_norm": 0.3953630328178406, + "learning_rate": 6.919999999999999e-07, + "loss": 0.3284, + "mean_token_accuracy": 0.9284822195768356, + "num_tokens": 55459316.0, + "step": 770 + }, + { + "epoch": 0.4812546327000351, + "grad_norm": 0.5145599842071533, + "learning_rate": 6.916e-07, + "loss": 0.3368, + "mean_token_accuracy": 0.9249029085040092, + "num_tokens": 55531290.0, + "step": 771 + }, + { + "epoch": 0.4818788280731869, + "grad_norm": 0.35696113109588623, + "learning_rate": 6.912e-07, + "loss": 0.3194, + "mean_token_accuracy": 0.9284510426223278, + "num_tokens": 55603569.0, + "step": 772 + }, + { + "epoch": 0.4825030234463387, + "grad_norm": 0.4163554012775421, + "learning_rate": 6.907999999999999e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.926842711865902, + "num_tokens": 55672030.0, + "step": 773 + }, + { + "epoch": 0.4831272188194905, + "grad_norm": 0.4742501974105835, + "learning_rate": 6.904e-07, + "loss": 0.3656, + "mean_token_accuracy": 0.9189232848584652, + "num_tokens": 55739752.0, + "step": 774 + }, + { + "epoch": 0.4837514141926423, + "grad_norm": 0.4275151491165161, + "learning_rate": 6.9e-07, + "loss": 0.2974, + "mean_token_accuracy": 0.9297335892915726, + "num_tokens": 55812195.0, + "step": 775 + }, + { + "epoch": 0.4843756095657941, + "grad_norm": 0.309121310710907, + "learning_rate": 6.895999999999999e-07, + "loss": 0.3182, + "mean_token_accuracy": 0.9266856200993061, + "num_tokens": 55886563.0, + "step": 776 + }, + { + "epoch": 0.4849998049389459, + "grad_norm": 0.5211467146873474, + "learning_rate": 6.892e-07, + "loss": 0.3314, + "mean_token_accuracy": 0.9253830797970295, + "num_tokens": 55956897.0, + "step": 777 + }, + { + "epoch": 0.4856240003120977, + "grad_norm": 0.23561716079711914, + "learning_rate": 6.888e-07, + "loss": 0.3245, + "mean_token_accuracy": 0.9261765368282795, + "num_tokens": 56030117.0, + "step": 778 + }, + { + "epoch": 0.48624819568524946, + "grad_norm": 0.3978263735771179, + "learning_rate": 6.883999999999999e-07, + "loss": 0.3051, + "mean_token_accuracy": 0.9322772137820721, + "num_tokens": 56102961.0, + "step": 779 + }, + { + "epoch": 0.4868723910584013, + "grad_norm": 0.3226126432418823, + "learning_rate": 6.879999999999999e-07, + "loss": 0.3491, + "mean_token_accuracy": 0.9202737398445606, + "num_tokens": 56172222.0, + "step": 780 + }, + { + "epoch": 0.4874965864315531, + "grad_norm": 0.3008454740047455, + "learning_rate": 6.876e-07, + "loss": 0.3361, + "mean_token_accuracy": 0.9267997480928898, + "num_tokens": 56245752.0, + "step": 781 + }, + { + "epoch": 0.48812078180470486, + "grad_norm": 0.3068612515926361, + "learning_rate": 6.872e-07, + "loss": 0.3638, + "mean_token_accuracy": 0.9167297668755054, + "num_tokens": 56316235.0, + "step": 782 + }, + { + "epoch": 0.48874497717785664, + "grad_norm": 0.22114723920822144, + "learning_rate": 6.867999999999999e-07, + "loss": 0.3171, + "mean_token_accuracy": 0.9272118136286736, + "num_tokens": 56390998.0, + "step": 783 + }, + { + "epoch": 0.4893691725510085, + "grad_norm": 0.2479737401008606, + "learning_rate": 6.864e-07, + "loss": 0.3324, + "mean_token_accuracy": 0.9246704541146755, + "num_tokens": 56459176.0, + "step": 784 + }, + { + "epoch": 0.48999336792416026, + "grad_norm": 0.38008129596710205, + "learning_rate": 6.86e-07, + "loss": 0.3046, + "mean_token_accuracy": 0.9347673803567886, + "num_tokens": 56535165.0, + "step": 785 + }, + { + "epoch": 0.49061756329731204, + "grad_norm": 0.34412187337875366, + "learning_rate": 6.855999999999999e-07, + "loss": 0.3265, + "mean_token_accuracy": 0.9240523464977741, + "num_tokens": 56605577.0, + "step": 786 + }, + { + "epoch": 0.4912417586704639, + "grad_norm": 0.3644009232521057, + "learning_rate": 6.852e-07, + "loss": 0.2954, + "mean_token_accuracy": 0.9324140585958958, + "num_tokens": 56680245.0, + "step": 787 + }, + { + "epoch": 0.49186595404361566, + "grad_norm": 0.39564791321754456, + "learning_rate": 6.847999999999999e-07, + "loss": 0.3124, + "mean_token_accuracy": 0.9314664751291275, + "num_tokens": 56753379.0, + "step": 788 + }, + { + "epoch": 0.49249014941676744, + "grad_norm": 0.479529470205307, + "learning_rate": 6.844e-07, + "loss": 0.3378, + "mean_token_accuracy": 0.9203203991055489, + "num_tokens": 56829976.0, + "step": 789 + }, + { + "epoch": 0.4931143447899192, + "grad_norm": 0.4279347062110901, + "learning_rate": 6.84e-07, + "loss": 0.296, + "mean_token_accuracy": 0.9355221167206764, + "num_tokens": 56905864.0, + "step": 790 + }, + { + "epoch": 0.49373854016307106, + "grad_norm": 0.34300193190574646, + "learning_rate": 6.836e-07, + "loss": 0.33, + "mean_token_accuracy": 0.9281573966145515, + "num_tokens": 56978229.0, + "step": 791 + }, + { + "epoch": 0.49436273553622284, + "grad_norm": 0.43124455213546753, + "learning_rate": 6.832e-07, + "loss": 0.3322, + "mean_token_accuracy": 0.9186022728681564, + "num_tokens": 57048922.0, + "step": 792 + }, + { + "epoch": 0.4949869309093746, + "grad_norm": 0.35299694538116455, + "learning_rate": 6.827999999999999e-07, + "loss": 0.3436, + "mean_token_accuracy": 0.9200886636972427, + "num_tokens": 57114461.0, + "step": 793 + }, + { + "epoch": 0.4956111262825264, + "grad_norm": 0.27989789843559265, + "learning_rate": 6.824e-07, + "loss": 0.2922, + "mean_token_accuracy": 0.9341375753283501, + "num_tokens": 57189742.0, + "step": 794 + }, + { + "epoch": 0.49623532165567824, + "grad_norm": 0.25702568888664246, + "learning_rate": 6.82e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9275923520326614, + "num_tokens": 57261773.0, + "step": 795 + }, + { + "epoch": 0.49685951702883, + "grad_norm": 0.3535645008087158, + "learning_rate": 6.816e-07, + "loss": 0.3492, + "mean_token_accuracy": 0.9200767949223518, + "num_tokens": 57331145.0, + "step": 796 + }, + { + "epoch": 0.4974837124019818, + "grad_norm": 0.36620262265205383, + "learning_rate": 6.812e-07, + "loss": 0.3161, + "mean_token_accuracy": 0.9303906634449959, + "num_tokens": 57410396.0, + "step": 797 + }, + { + "epoch": 0.49810790777513364, + "grad_norm": 0.35625168681144714, + "learning_rate": 6.807999999999999e-07, + "loss": 0.3612, + "mean_token_accuracy": 0.9239784106612206, + "num_tokens": 57482555.0, + "step": 798 + }, + { + "epoch": 0.4987321031482854, + "grad_norm": 0.42324453592300415, + "learning_rate": 6.804e-07, + "loss": 0.3224, + "mean_token_accuracy": 0.9290669783949852, + "num_tokens": 57557257.0, + "step": 799 + }, + { + "epoch": 0.4993562985214372, + "grad_norm": 0.2871803343296051, + "learning_rate": 6.800000000000001e-07, + "loss": 0.3067, + "mean_token_accuracy": 0.9317543730139732, + "num_tokens": 57633259.0, + "step": 800 + }, + { + "epoch": 0.499980493894589, + "grad_norm": 0.4968184232711792, + "learning_rate": 6.795999999999999e-07, + "loss": 0.3442, + "mean_token_accuracy": 0.9231974147260189, + "num_tokens": 57707507.0, + "step": 801 + }, + { + "epoch": 0.5006046892677408, + "grad_norm": 0.30473563075065613, + "learning_rate": 6.792e-07, + "loss": 0.3311, + "mean_token_accuracy": 0.9224537685513496, + "num_tokens": 57776950.0, + "step": 802 + }, + { + "epoch": 0.5012288846408925, + "grad_norm": 0.3754504323005676, + "learning_rate": 6.788e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9335438162088394, + "num_tokens": 57849072.0, + "step": 803 + }, + { + "epoch": 0.5018530800140444, + "grad_norm": 0.29765474796295166, + "learning_rate": 6.783999999999999e-07, + "loss": 0.3271, + "mean_token_accuracy": 0.9281776584684849, + "num_tokens": 57925007.0, + "step": 804 + }, + { + "epoch": 0.5024772753871962, + "grad_norm": 0.5371522903442383, + "learning_rate": 6.78e-07, + "loss": 0.3261, + "mean_token_accuracy": 0.9277910217642784, + "num_tokens": 57996838.0, + "step": 805 + }, + { + "epoch": 0.503101470760348, + "grad_norm": 0.46128857135772705, + "learning_rate": 6.776e-07, + "loss": 0.355, + "mean_token_accuracy": 0.9118218421936035, + "num_tokens": 58068029.0, + "step": 806 + }, + { + "epoch": 0.5037256661334998, + "grad_norm": 1.187224268913269, + "learning_rate": 6.772e-07, + "loss": 0.3481, + "mean_token_accuracy": 0.9187640734016895, + "num_tokens": 58139184.0, + "step": 807 + }, + { + "epoch": 0.5043498615066516, + "grad_norm": 0.3919733166694641, + "learning_rate": 6.767999999999999e-07, + "loss": 0.3575, + "mean_token_accuracy": 0.9180425144731998, + "num_tokens": 58203177.0, + "step": 808 + }, + { + "epoch": 0.5049740568798033, + "grad_norm": 0.33362945914268494, + "learning_rate": 6.764e-07, + "loss": 0.3329, + "mean_token_accuracy": 0.9237672202289104, + "num_tokens": 58277291.0, + "step": 809 + }, + { + "epoch": 0.5055982522529552, + "grad_norm": 0.2621035873889923, + "learning_rate": 6.76e-07, + "loss": 0.3141, + "mean_token_accuracy": 0.9307037070393562, + "num_tokens": 58348287.0, + "step": 810 + }, + { + "epoch": 0.506222447626107, + "grad_norm": 0.23340308666229248, + "learning_rate": 6.755999999999999e-07, + "loss": 0.3461, + "mean_token_accuracy": 0.921839028596878, + "num_tokens": 58415089.0, + "step": 811 + }, + { + "epoch": 0.5068466429992587, + "grad_norm": 0.3458343744277954, + "learning_rate": 6.752e-07, + "loss": 0.327, + "mean_token_accuracy": 0.9286703877151012, + "num_tokens": 58489140.0, + "step": 812 + }, + { + "epoch": 0.5074708383724106, + "grad_norm": 0.43826496601104736, + "learning_rate": 6.747999999999999e-07, + "loss": 0.3141, + "mean_token_accuracy": 0.929670188575983, + "num_tokens": 58563568.0, + "step": 813 + }, + { + "epoch": 0.5080950337455623, + "grad_norm": 0.8658429980278015, + "learning_rate": 6.744e-07, + "loss": 0.3165, + "mean_token_accuracy": 0.930111650377512, + "num_tokens": 58636169.0, + "step": 814 + }, + { + "epoch": 0.5087192291187141, + "grad_norm": 0.5724182724952698, + "learning_rate": 6.74e-07, + "loss": 0.2598, + "mean_token_accuracy": 0.9401777200400829, + "num_tokens": 58711570.0, + "step": 815 + }, + { + "epoch": 0.509343424491866, + "grad_norm": 0.28098592162132263, + "learning_rate": 6.736e-07, + "loss": 0.3574, + "mean_token_accuracy": 0.918064646422863, + "num_tokens": 58781110.0, + "step": 816 + }, + { + "epoch": 0.5099676198650177, + "grad_norm": 0.3825387954711914, + "learning_rate": 6.732e-07, + "loss": 0.2951, + "mean_token_accuracy": 0.9338416680693626, + "num_tokens": 58857532.0, + "step": 817 + }, + { + "epoch": 0.5105918152381695, + "grad_norm": 0.2271539717912674, + "learning_rate": 6.727999999999999e-07, + "loss": 0.3392, + "mean_token_accuracy": 0.9180256314575672, + "num_tokens": 58929618.0, + "step": 818 + }, + { + "epoch": 0.5112160106113214, + "grad_norm": 1.0451000928878784, + "learning_rate": 6.724e-07, + "loss": 0.3521, + "mean_token_accuracy": 0.9188212901353836, + "num_tokens": 58999419.0, + "step": 819 + }, + { + "epoch": 0.5118402059844731, + "grad_norm": 0.4600159525871277, + "learning_rate": 6.72e-07, + "loss": 0.3371, + "mean_token_accuracy": 0.9256699904799461, + "num_tokens": 59069688.0, + "step": 820 + }, + { + "epoch": 0.5124644013576249, + "grad_norm": 0.37027785181999207, + "learning_rate": 6.716e-07, + "loss": 0.2953, + "mean_token_accuracy": 0.9317917823791504, + "num_tokens": 59143221.0, + "step": 821 + }, + { + "epoch": 0.5130885967307768, + "grad_norm": 0.39364737272262573, + "learning_rate": 6.712e-07, + "loss": 0.3326, + "mean_token_accuracy": 0.9256380274891853, + "num_tokens": 59215897.0, + "step": 822 + }, + { + "epoch": 0.5137127921039285, + "grad_norm": 0.2712680995464325, + "learning_rate": 6.707999999999999e-07, + "loss": 0.3472, + "mean_token_accuracy": 0.9214113615453243, + "num_tokens": 59286560.0, + "step": 823 + }, + { + "epoch": 0.5143369874770803, + "grad_norm": 0.45734336972236633, + "learning_rate": 6.704e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9196154400706291, + "num_tokens": 59357481.0, + "step": 824 + }, + { + "epoch": 0.5149611828502321, + "grad_norm": 0.37052568793296814, + "learning_rate": 6.7e-07, + "loss": 0.3118, + "mean_token_accuracy": 0.9303006753325462, + "num_tokens": 59430046.0, + "step": 825 + }, + { + "epoch": 0.5155853782233839, + "grad_norm": 0.35565704107284546, + "learning_rate": 6.695999999999999e-07, + "loss": 0.3276, + "mean_token_accuracy": 0.927744098007679, + "num_tokens": 59504861.0, + "step": 826 + }, + { + "epoch": 0.5162095735965357, + "grad_norm": 0.3806411027908325, + "learning_rate": 6.692e-07, + "loss": 0.3101, + "mean_token_accuracy": 0.9285195842385292, + "num_tokens": 59576408.0, + "step": 827 + }, + { + "epoch": 0.5168337689696875, + "grad_norm": 0.2509946823120117, + "learning_rate": 6.688e-07, + "loss": 0.3039, + "mean_token_accuracy": 0.9321120083332062, + "num_tokens": 59652561.0, + "step": 828 + }, + { + "epoch": 0.5174579643428393, + "grad_norm": 0.30902498960494995, + "learning_rate": 6.683999999999999e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.9265851452946663, + "num_tokens": 59727436.0, + "step": 829 + }, + { + "epoch": 0.5180821597159911, + "grad_norm": 0.34848296642303467, + "learning_rate": 6.68e-07, + "loss": 0.2895, + "mean_token_accuracy": 0.9355588145554066, + "num_tokens": 59801993.0, + "step": 830 + }, + { + "epoch": 0.5187063550891429, + "grad_norm": 0.8798127174377441, + "learning_rate": 6.676e-07, + "loss": 0.3075, + "mean_token_accuracy": 0.9326577596366405, + "num_tokens": 59872428.0, + "step": 831 + }, + { + "epoch": 0.5193305504622947, + "grad_norm": 0.42768946290016174, + "learning_rate": 6.671999999999999e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9277102574706078, + "num_tokens": 59948764.0, + "step": 832 + }, + { + "epoch": 0.5199547458354465, + "grad_norm": 0.27936241030693054, + "learning_rate": 6.667999999999999e-07, + "loss": 0.3675, + "mean_token_accuracy": 0.9130092337727547, + "num_tokens": 60014163.0, + "step": 833 + }, + { + "epoch": 0.5205789412085983, + "grad_norm": 0.4408262073993683, + "learning_rate": 6.664e-07, + "loss": 0.3086, + "mean_token_accuracy": 0.9310822710394859, + "num_tokens": 60089115.0, + "step": 834 + }, + { + "epoch": 0.5212031365817501, + "grad_norm": 0.3274775743484497, + "learning_rate": 6.66e-07, + "loss": 0.342, + "mean_token_accuracy": 0.9205053001642227, + "num_tokens": 60160550.0, + "step": 835 + }, + { + "epoch": 0.5218273319549018, + "grad_norm": 1.1035127639770508, + "learning_rate": 6.655999999999999e-07, + "loss": 0.3431, + "mean_token_accuracy": 0.9214097373187542, + "num_tokens": 60230895.0, + "step": 836 + }, + { + "epoch": 0.5224515273280537, + "grad_norm": 0.2535402476787567, + "learning_rate": 6.652e-07, + "loss": 0.3285, + "mean_token_accuracy": 0.9241763278841972, + "num_tokens": 60301917.0, + "step": 837 + }, + { + "epoch": 0.5230757227012055, + "grad_norm": 0.3422260582447052, + "learning_rate": 6.647999999999999e-07, + "loss": 0.314, + "mean_token_accuracy": 0.9301337525248528, + "num_tokens": 60376748.0, + "step": 838 + }, + { + "epoch": 0.5236999180743572, + "grad_norm": 0.3284092843532562, + "learning_rate": 6.643999999999999e-07, + "loss": 0.3368, + "mean_token_accuracy": 0.9259582236409187, + "num_tokens": 60447900.0, + "step": 839 + }, + { + "epoch": 0.5243241134475091, + "grad_norm": 0.4033794701099396, + "learning_rate": 6.64e-07, + "loss": 0.3109, + "mean_token_accuracy": 0.929807037115097, + "num_tokens": 60518915.0, + "step": 840 + }, + { + "epoch": 0.5249483088206609, + "grad_norm": 0.6306251287460327, + "learning_rate": 6.636e-07, + "loss": 0.3108, + "mean_token_accuracy": 0.9308784641325474, + "num_tokens": 60593012.0, + "step": 841 + }, + { + "epoch": 0.5255725041938126, + "grad_norm": 0.3373713791370392, + "learning_rate": 6.632e-07, + "loss": 0.3523, + "mean_token_accuracy": 0.9216680601239204, + "num_tokens": 60660953.0, + "step": 842 + }, + { + "epoch": 0.5261966995669645, + "grad_norm": 0.3094328045845032, + "learning_rate": 6.627999999999999e-07, + "loss": 0.3497, + "mean_token_accuracy": 0.9154267460107803, + "num_tokens": 60727607.0, + "step": 843 + }, + { + "epoch": 0.5268208949401163, + "grad_norm": 0.2698313891887665, + "learning_rate": 6.624e-07, + "loss": 0.3177, + "mean_token_accuracy": 0.9287923537194729, + "num_tokens": 60801976.0, + "step": 844 + }, + { + "epoch": 0.527445090313268, + "grad_norm": 0.27001866698265076, + "learning_rate": 6.62e-07, + "loss": 0.316, + "mean_token_accuracy": 0.9283559136092663, + "num_tokens": 60871857.0, + "step": 845 + }, + { + "epoch": 0.5280692856864199, + "grad_norm": 0.2707677185535431, + "learning_rate": 6.615999999999999e-07, + "loss": 0.3185, + "mean_token_accuracy": 0.9285137169063091, + "num_tokens": 60945519.0, + "step": 846 + }, + { + "epoch": 0.5286934810595716, + "grad_norm": 0.2894807457923889, + "learning_rate": 6.612e-07, + "loss": 0.3154, + "mean_token_accuracy": 0.9292592667043209, + "num_tokens": 61019928.0, + "step": 847 + }, + { + "epoch": 0.5293176764327234, + "grad_norm": 0.25697293877601624, + "learning_rate": 6.608e-07, + "loss": 0.3282, + "mean_token_accuracy": 0.9247947856783867, + "num_tokens": 61090697.0, + "step": 848 + }, + { + "epoch": 0.5299418718058753, + "grad_norm": 0.35483691096305847, + "learning_rate": 6.604e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9281399734318256, + "num_tokens": 61163683.0, + "step": 849 + }, + { + "epoch": 0.530566067179027, + "grad_norm": 0.8732873797416687, + "learning_rate": 6.6e-07, + "loss": 0.3137, + "mean_token_accuracy": 0.927576832473278, + "num_tokens": 61240425.0, + "step": 850 + }, + { + "epoch": 0.5311902625521788, + "grad_norm": 0.33668267726898193, + "learning_rate": 6.595999999999999e-07, + "loss": 0.2974, + "mean_token_accuracy": 0.9318785108625889, + "num_tokens": 61314112.0, + "step": 851 + }, + { + "epoch": 0.5318144579253307, + "grad_norm": 0.2964807450771332, + "learning_rate": 6.592e-07, + "loss": 0.2961, + "mean_token_accuracy": 0.9362316727638245, + "num_tokens": 61387594.0, + "step": 852 + }, + { + "epoch": 0.5324386532984824, + "grad_norm": 0.41130802035331726, + "learning_rate": 6.588e-07, + "loss": 0.2981, + "mean_token_accuracy": 0.9318909607827663, + "num_tokens": 61459533.0, + "step": 853 + }, + { + "epoch": 0.5330628486716342, + "grad_norm": 0.8601356148719788, + "learning_rate": 6.583999999999999e-07, + "loss": 0.3589, + "mean_token_accuracy": 0.9185883365571499, + "num_tokens": 61527652.0, + "step": 854 + }, + { + "epoch": 0.5336870440447861, + "grad_norm": 0.32772937417030334, + "learning_rate": 6.58e-07, + "loss": 0.2959, + "mean_token_accuracy": 0.9342739060521126, + "num_tokens": 61602399.0, + "step": 855 + }, + { + "epoch": 0.5343112394179378, + "grad_norm": 0.2073204219341278, + "learning_rate": 6.576e-07, + "loss": 0.3444, + "mean_token_accuracy": 0.9244417250156403, + "num_tokens": 61670724.0, + "step": 856 + }, + { + "epoch": 0.5349354347910896, + "grad_norm": 0.31500622630119324, + "learning_rate": 6.571999999999999e-07, + "loss": 0.3192, + "mean_token_accuracy": 0.9282494559884071, + "num_tokens": 61742959.0, + "step": 857 + }, + { + "epoch": 0.5355596301642415, + "grad_norm": 0.3423157334327698, + "learning_rate": 6.568e-07, + "loss": 0.3548, + "mean_token_accuracy": 0.9158097580075264, + "num_tokens": 61811114.0, + "step": 858 + }, + { + "epoch": 0.5361838255373932, + "grad_norm": 0.2900506854057312, + "learning_rate": 6.564e-07, + "loss": 0.3265, + "mean_token_accuracy": 0.9255715534090996, + "num_tokens": 61882969.0, + "step": 859 + }, + { + "epoch": 0.536808020910545, + "grad_norm": 0.2774682641029358, + "learning_rate": 6.56e-07, + "loss": 0.3436, + "mean_token_accuracy": 0.9228880815207958, + "num_tokens": 61952163.0, + "step": 860 + }, + { + "epoch": 0.5374322162836968, + "grad_norm": 0.3559592068195343, + "learning_rate": 6.555999999999999e-07, + "loss": 0.3688, + "mean_token_accuracy": 0.9127374067902565, + "num_tokens": 62020369.0, + "step": 861 + }, + { + "epoch": 0.5380564116568486, + "grad_norm": 0.3357591927051544, + "learning_rate": 6.552e-07, + "loss": 0.3395, + "mean_token_accuracy": 0.923944428563118, + "num_tokens": 62090634.0, + "step": 862 + }, + { + "epoch": 0.5386806070300004, + "grad_norm": 0.3759511113166809, + "learning_rate": 6.548000000000001e-07, + "loss": 0.3447, + "mean_token_accuracy": 0.9209614582359791, + "num_tokens": 62161178.0, + "step": 863 + }, + { + "epoch": 0.5393048024031522, + "grad_norm": 0.5431933999061584, + "learning_rate": 6.543999999999999e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9273117445409298, + "num_tokens": 62228117.0, + "step": 864 + }, + { + "epoch": 0.539928997776304, + "grad_norm": 0.28718411922454834, + "learning_rate": 6.54e-07, + "loss": 0.3088, + "mean_token_accuracy": 0.9304384291172028, + "num_tokens": 62297179.0, + "step": 865 + }, + { + "epoch": 0.5405531931494558, + "grad_norm": 0.24944348633289337, + "learning_rate": 6.536e-07, + "loss": 0.331, + "mean_token_accuracy": 0.9251144230365753, + "num_tokens": 62366224.0, + "step": 866 + }, + { + "epoch": 0.5411773885226075, + "grad_norm": 0.3461836874485016, + "learning_rate": 6.531999999999999e-07, + "loss": 0.377, + "mean_token_accuracy": 0.9160482995212078, + "num_tokens": 62439463.0, + "step": 867 + }, + { + "epoch": 0.5418015838957594, + "grad_norm": 0.3286309540271759, + "learning_rate": 6.528e-07, + "loss": 0.3369, + "mean_token_accuracy": 0.925285592675209, + "num_tokens": 62511189.0, + "step": 868 + }, + { + "epoch": 0.5424257792689112, + "grad_norm": 0.2973572313785553, + "learning_rate": 6.524e-07, + "loss": 0.3499, + "mean_token_accuracy": 0.9185475669801235, + "num_tokens": 62587568.0, + "step": 869 + }, + { + "epoch": 0.543049974642063, + "grad_norm": 0.25069576501846313, + "learning_rate": 6.52e-07, + "loss": 0.3018, + "mean_token_accuracy": 0.9320270493626595, + "num_tokens": 62661423.0, + "step": 870 + }, + { + "epoch": 0.5436741700152148, + "grad_norm": 0.2799758017063141, + "learning_rate": 6.515999999999999e-07, + "loss": 0.359, + "mean_token_accuracy": 0.9199363514780998, + "num_tokens": 62733682.0, + "step": 871 + }, + { + "epoch": 0.5442983653883665, + "grad_norm": 0.3064819574356079, + "learning_rate": 6.512e-07, + "loss": 0.3139, + "mean_token_accuracy": 0.9285257048904896, + "num_tokens": 62808212.0, + "step": 872 + }, + { + "epoch": 0.5449225607615183, + "grad_norm": 0.3458581864833832, + "learning_rate": 6.508e-07, + "loss": 0.3426, + "mean_token_accuracy": 0.9253249615430832, + "num_tokens": 62874463.0, + "step": 873 + }, + { + "epoch": 0.5455467561346702, + "grad_norm": 0.34328359365463257, + "learning_rate": 6.504e-07, + "loss": 0.3623, + "mean_token_accuracy": 0.9138705618679523, + "num_tokens": 62939303.0, + "step": 874 + }, + { + "epoch": 0.5461709515078219, + "grad_norm": 2.6398191452026367, + "learning_rate": 6.5e-07, + "loss": 0.3286, + "mean_token_accuracy": 0.9278340861201286, + "num_tokens": 63013503.0, + "step": 875 + }, + { + "epoch": 0.5467951468809737, + "grad_norm": 2.7471883296966553, + "learning_rate": 6.495999999999999e-07, + "loss": 0.3513, + "mean_token_accuracy": 0.9204366207122803, + "num_tokens": 63085928.0, + "step": 876 + }, + { + "epoch": 0.5474193422541256, + "grad_norm": 0.36083847284317017, + "learning_rate": 6.492e-07, + "loss": 0.3329, + "mean_token_accuracy": 0.9260806180536747, + "num_tokens": 63154631.0, + "step": 877 + }, + { + "epoch": 0.5480435376272773, + "grad_norm": 2.3419203758239746, + "learning_rate": 6.488e-07, + "loss": 0.341, + "mean_token_accuracy": 0.9239551052451134, + "num_tokens": 63223905.0, + "step": 878 + }, + { + "epoch": 0.5486677330004291, + "grad_norm": 0.2830072045326233, + "learning_rate": 6.483999999999999e-07, + "loss": 0.3442, + "mean_token_accuracy": 0.9240602850914001, + "num_tokens": 63295303.0, + "step": 879 + }, + { + "epoch": 0.549291928373581, + "grad_norm": 0.3667299449443817, + "learning_rate": 6.48e-07, + "loss": 0.3014, + "mean_token_accuracy": 0.9317955821752548, + "num_tokens": 63369491.0, + "step": 880 + }, + { + "epoch": 0.5499161237467327, + "grad_norm": 0.24864889681339264, + "learning_rate": 6.476e-07, + "loss": 0.3529, + "mean_token_accuracy": 0.9187034443020821, + "num_tokens": 63436479.0, + "step": 881 + }, + { + "epoch": 0.5505403191198845, + "grad_norm": 0.5485105514526367, + "learning_rate": 6.471999999999999e-07, + "loss": 0.3057, + "mean_token_accuracy": 0.9314697347581387, + "num_tokens": 63513212.0, + "step": 882 + }, + { + "epoch": 0.5511645144930363, + "grad_norm": 0.268966406583786, + "learning_rate": 6.468e-07, + "loss": 0.2654, + "mean_token_accuracy": 0.9404886960983276, + "num_tokens": 63590432.0, + "step": 883 + }, + { + "epoch": 0.5517887098661881, + "grad_norm": 0.2550604045391083, + "learning_rate": 6.464e-07, + "loss": 0.3201, + "mean_token_accuracy": 0.9265033677220345, + "num_tokens": 63658317.0, + "step": 884 + }, + { + "epoch": 0.55241290523934, + "grad_norm": 0.32950738072395325, + "learning_rate": 6.46e-07, + "loss": 0.2993, + "mean_token_accuracy": 0.9328174777328968, + "num_tokens": 63735198.0, + "step": 885 + }, + { + "epoch": 0.5530371006124917, + "grad_norm": 0.357653945684433, + "learning_rate": 6.455999999999999e-07, + "loss": 0.3537, + "mean_token_accuracy": 0.9245644435286522, + "num_tokens": 63804965.0, + "step": 886 + }, + { + "epoch": 0.5536612959856435, + "grad_norm": 0.26075589656829834, + "learning_rate": 6.452e-07, + "loss": 0.3624, + "mean_token_accuracy": 0.9179616272449493, + "num_tokens": 63873341.0, + "step": 887 + }, + { + "epoch": 0.5542854913587953, + "grad_norm": 0.5456115007400513, + "learning_rate": 6.448000000000001e-07, + "loss": 0.3286, + "mean_token_accuracy": 0.9267865121364594, + "num_tokens": 63942132.0, + "step": 888 + }, + { + "epoch": 0.5549096867319471, + "grad_norm": 0.2823208272457123, + "learning_rate": 6.443999999999999e-07, + "loss": 0.3452, + "mean_token_accuracy": 0.9206357337534428, + "num_tokens": 64009816.0, + "step": 889 + }, + { + "epoch": 0.5555338821050989, + "grad_norm": 0.29662907123565674, + "learning_rate": 6.44e-07, + "loss": 0.2953, + "mean_token_accuracy": 0.934375673532486, + "num_tokens": 64088166.0, + "step": 890 + }, + { + "epoch": 0.5561580774782507, + "grad_norm": 0.22591635584831238, + "learning_rate": 6.436e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.9257152192294598, + "num_tokens": 64159560.0, + "step": 891 + }, + { + "epoch": 0.5567822728514025, + "grad_norm": 0.26975518465042114, + "learning_rate": 6.431999999999999e-07, + "loss": 0.3039, + "mean_token_accuracy": 0.9288149103522301, + "num_tokens": 64234794.0, + "step": 892 + }, + { + "epoch": 0.5574064682245543, + "grad_norm": 0.35066017508506775, + "learning_rate": 6.428e-07, + "loss": 0.2818, + "mean_token_accuracy": 0.9383094571530819, + "num_tokens": 64313405.0, + "step": 893 + }, + { + "epoch": 0.558030663597706, + "grad_norm": 0.3492513597011566, + "learning_rate": 6.424e-07, + "loss": 0.3084, + "mean_token_accuracy": 0.9289005398750305, + "num_tokens": 64384097.0, + "step": 894 + }, + { + "epoch": 0.5586548589708579, + "grad_norm": 0.56803959608078, + "learning_rate": 6.42e-07, + "loss": 0.3498, + "mean_token_accuracy": 0.9189568646252155, + "num_tokens": 64454292.0, + "step": 895 + }, + { + "epoch": 0.5592790543440097, + "grad_norm": 0.3062562942504883, + "learning_rate": 6.415999999999999e-07, + "loss": 0.2761, + "mean_token_accuracy": 0.935680590569973, + "num_tokens": 64532399.0, + "step": 896 + }, + { + "epoch": 0.5599032497171614, + "grad_norm": 0.3139180839061737, + "learning_rate": 6.412e-07, + "loss": 0.358, + "mean_token_accuracy": 0.915798969566822, + "num_tokens": 64602024.0, + "step": 897 + }, + { + "epoch": 0.5605274450903133, + "grad_norm": 0.47270873188972473, + "learning_rate": 6.408e-07, + "loss": 0.3262, + "mean_token_accuracy": 0.9262346103787422, + "num_tokens": 64671684.0, + "step": 898 + }, + { + "epoch": 0.5611516404634651, + "grad_norm": 0.2556508779525757, + "learning_rate": 6.403999999999999e-07, + "loss": 0.3196, + "mean_token_accuracy": 0.9279545471072197, + "num_tokens": 64742119.0, + "step": 899 + }, + { + "epoch": 0.5617758358366168, + "grad_norm": 0.3059193193912506, + "learning_rate": 6.4e-07, + "loss": 0.3151, + "mean_token_accuracy": 0.9242472685873508, + "num_tokens": 64813055.0, + "step": 900 + }, + { + "epoch": 0.5624000312097687, + "grad_norm": 0.3003333508968353, + "learning_rate": 6.395999999999999e-07, + "loss": 0.2785, + "mean_token_accuracy": 0.934153325855732, + "num_tokens": 64882919.0, + "step": 901 + }, + { + "epoch": 0.5630242265829205, + "grad_norm": 0.5348864793777466, + "learning_rate": 6.392e-07, + "loss": 0.3132, + "mean_token_accuracy": 0.9258575029671192, + "num_tokens": 64955311.0, + "step": 902 + }, + { + "epoch": 0.5636484219560722, + "grad_norm": 0.23971465229988098, + "learning_rate": 6.388e-07, + "loss": 0.3103, + "mean_token_accuracy": 0.9243427775800228, + "num_tokens": 65026770.0, + "step": 903 + }, + { + "epoch": 0.5642726173292241, + "grad_norm": 0.29327601194381714, + "learning_rate": 6.383999999999999e-07, + "loss": 0.3362, + "mean_token_accuracy": 0.9241050370037556, + "num_tokens": 65097009.0, + "step": 904 + }, + { + "epoch": 0.5648968127023758, + "grad_norm": 0.2987630367279053, + "learning_rate": 6.38e-07, + "loss": 0.302, + "mean_token_accuracy": 0.9335595779120922, + "num_tokens": 65171547.0, + "step": 905 + }, + { + "epoch": 0.5655210080755276, + "grad_norm": 0.3643862009048462, + "learning_rate": 6.375999999999999e-07, + "loss": 0.3376, + "mean_token_accuracy": 0.9240440875291824, + "num_tokens": 65240974.0, + "step": 906 + }, + { + "epoch": 0.5661452034486795, + "grad_norm": 0.21183830499649048, + "learning_rate": 6.371999999999999e-07, + "loss": 0.3524, + "mean_token_accuracy": 0.9210678488016129, + "num_tokens": 65310494.0, + "step": 907 + }, + { + "epoch": 0.5667693988218312, + "grad_norm": 0.2144000381231308, + "learning_rate": 6.368e-07, + "loss": 0.341, + "mean_token_accuracy": 0.9211500398814678, + "num_tokens": 65383354.0, + "step": 908 + }, + { + "epoch": 0.567393594194983, + "grad_norm": 0.281815767288208, + "learning_rate": 6.364e-07, + "loss": 0.3551, + "mean_token_accuracy": 0.9175384566187859, + "num_tokens": 65455327.0, + "step": 909 + }, + { + "epoch": 0.5680177895681349, + "grad_norm": 0.3019249439239502, + "learning_rate": 6.36e-07, + "loss": 0.3203, + "mean_token_accuracy": 0.9279223792254925, + "num_tokens": 65529219.0, + "step": 910 + }, + { + "epoch": 0.5686419849412866, + "grad_norm": 0.3000909686088562, + "learning_rate": 6.356e-07, + "loss": 0.3433, + "mean_token_accuracy": 0.923235822468996, + "num_tokens": 65602479.0, + "step": 911 + }, + { + "epoch": 0.5692661803144384, + "grad_norm": 0.5725570321083069, + "learning_rate": 6.352e-07, + "loss": 0.3533, + "mean_token_accuracy": 0.9162441082298756, + "num_tokens": 65670664.0, + "step": 912 + }, + { + "epoch": 0.5698903756875903, + "grad_norm": 0.3796912133693695, + "learning_rate": 6.348e-07, + "loss": 0.3258, + "mean_token_accuracy": 0.9275628663599491, + "num_tokens": 65739173.0, + "step": 913 + }, + { + "epoch": 0.570514571060742, + "grad_norm": 0.389944463968277, + "learning_rate": 6.343999999999999e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.9257393516600132, + "num_tokens": 65812369.0, + "step": 914 + }, + { + "epoch": 0.5711387664338938, + "grad_norm": 0.29456764459609985, + "learning_rate": 6.34e-07, + "loss": 0.3573, + "mean_token_accuracy": 0.915925107896328, + "num_tokens": 65878979.0, + "step": 915 + }, + { + "epoch": 0.5717629618070456, + "grad_norm": 0.46188029646873474, + "learning_rate": 6.336000000000001e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.930741872638464, + "num_tokens": 65956913.0, + "step": 916 + }, + { + "epoch": 0.5723871571801974, + "grad_norm": 0.33244258165359497, + "learning_rate": 6.331999999999999e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9271979779005051, + "num_tokens": 66029823.0, + "step": 917 + }, + { + "epoch": 0.5730113525533492, + "grad_norm": 0.2609564960002899, + "learning_rate": 6.328e-07, + "loss": 0.3108, + "mean_token_accuracy": 0.9277559220790863, + "num_tokens": 66102502.0, + "step": 918 + }, + { + "epoch": 0.573635547926501, + "grad_norm": 0.323757141828537, + "learning_rate": 6.324e-07, + "loss": 0.3423, + "mean_token_accuracy": 0.9249501451849937, + "num_tokens": 66173455.0, + "step": 919 + }, + { + "epoch": 0.5742597432996528, + "grad_norm": 0.27009880542755127, + "learning_rate": 6.319999999999999e-07, + "loss": 0.3456, + "mean_token_accuracy": 0.9221528805792332, + "num_tokens": 66243623.0, + "step": 920 + }, + { + "epoch": 0.5748839386728046, + "grad_norm": 0.48479601740837097, + "learning_rate": 6.316e-07, + "loss": 0.3722, + "mean_token_accuracy": 0.9155812859535217, + "num_tokens": 66312071.0, + "step": 921 + }, + { + "epoch": 0.5755081340459564, + "grad_norm": 0.3155643045902252, + "learning_rate": 6.312e-07, + "loss": 0.2919, + "mean_token_accuracy": 0.9334325976669788, + "num_tokens": 66387975.0, + "step": 922 + }, + { + "epoch": 0.5761323294191082, + "grad_norm": 0.36289772391319275, + "learning_rate": 6.308e-07, + "loss": 0.3549, + "mean_token_accuracy": 0.9194199480116367, + "num_tokens": 66458828.0, + "step": 923 + }, + { + "epoch": 0.57675652479226, + "grad_norm": 0.2849428355693817, + "learning_rate": 6.303999999999999e-07, + "loss": 0.3582, + "mean_token_accuracy": 0.9200663976371288, + "num_tokens": 66528694.0, + "step": 924 + }, + { + "epoch": 0.5773807201654118, + "grad_norm": 0.3699878752231598, + "learning_rate": 6.3e-07, + "loss": 0.2834, + "mean_token_accuracy": 0.9354452006518841, + "num_tokens": 66603601.0, + "step": 925 + }, + { + "epoch": 0.5780049155385636, + "grad_norm": 0.2520447075366974, + "learning_rate": 6.296e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.9290561005473137, + "num_tokens": 66675899.0, + "step": 926 + }, + { + "epoch": 0.5786291109117153, + "grad_norm": 0.29595306515693665, + "learning_rate": 6.291999999999999e-07, + "loss": 0.3209, + "mean_token_accuracy": 0.9225086234509945, + "num_tokens": 66746314.0, + "step": 927 + }, + { + "epoch": 0.5792533062848672, + "grad_norm": 0.28613483905792236, + "learning_rate": 6.288e-07, + "loss": 0.2797, + "mean_token_accuracy": 0.9364576525986195, + "num_tokens": 66825002.0, + "step": 928 + }, + { + "epoch": 0.579877501658019, + "grad_norm": 0.39827844500541687, + "learning_rate": 6.283999999999999e-07, + "loss": 0.3445, + "mean_token_accuracy": 0.9226041659712791, + "num_tokens": 66896340.0, + "step": 929 + }, + { + "epoch": 0.5805016970311707, + "grad_norm": 0.3121720850467682, + "learning_rate": 6.28e-07, + "loss": 0.3474, + "mean_token_accuracy": 0.9231056421995163, + "num_tokens": 66968295.0, + "step": 930 + }, + { + "epoch": 0.5811258924043226, + "grad_norm": 0.3524979054927826, + "learning_rate": 6.276e-07, + "loss": 0.3225, + "mean_token_accuracy": 0.9291897267103195, + "num_tokens": 67043688.0, + "step": 931 + }, + { + "epoch": 0.5817500877774744, + "grad_norm": 0.3090422451496124, + "learning_rate": 6.271999999999999e-07, + "loss": 0.328, + "mean_token_accuracy": 0.9241249933838844, + "num_tokens": 67115363.0, + "step": 932 + }, + { + "epoch": 0.5823742831506261, + "grad_norm": 0.41576865315437317, + "learning_rate": 6.268e-07, + "loss": 0.3369, + "mean_token_accuracy": 0.921410359442234, + "num_tokens": 67187338.0, + "step": 933 + }, + { + "epoch": 0.582998478523778, + "grad_norm": 0.37004759907722473, + "learning_rate": 6.263999999999999e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.9293888881802559, + "num_tokens": 67265983.0, + "step": 934 + }, + { + "epoch": 0.5836226738969298, + "grad_norm": 0.3313625752925873, + "learning_rate": 6.26e-07, + "loss": 0.3145, + "mean_token_accuracy": 0.9283946938812733, + "num_tokens": 67340892.0, + "step": 935 + }, + { + "epoch": 0.5842468692700815, + "grad_norm": 0.7900646328926086, + "learning_rate": 6.256e-07, + "loss": 0.3379, + "mean_token_accuracy": 0.9216341227293015, + "num_tokens": 67412448.0, + "step": 936 + }, + { + "epoch": 0.5848710646432334, + "grad_norm": 0.2533631920814514, + "learning_rate": 6.252e-07, + "loss": 0.368, + "mean_token_accuracy": 0.914864793419838, + "num_tokens": 67480650.0, + "step": 937 + }, + { + "epoch": 0.5854952600163851, + "grad_norm": 0.2965437173843384, + "learning_rate": 6.248e-07, + "loss": 0.2901, + "mean_token_accuracy": 0.9302844144403934, + "num_tokens": 67554150.0, + "step": 938 + }, + { + "epoch": 0.5861194553895369, + "grad_norm": 0.5340675115585327, + "learning_rate": 6.243999999999999e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9331514127552509, + "num_tokens": 67629794.0, + "step": 939 + }, + { + "epoch": 0.5867436507626888, + "grad_norm": 0.37367090582847595, + "learning_rate": 6.24e-07, + "loss": 0.3012, + "mean_token_accuracy": 0.9326327741146088, + "num_tokens": 67704124.0, + "step": 940 + }, + { + "epoch": 0.5873678461358405, + "grad_norm": 0.3025112748146057, + "learning_rate": 6.236e-07, + "loss": 0.325, + "mean_token_accuracy": 0.9248803555965424, + "num_tokens": 67778035.0, + "step": 941 + }, + { + "epoch": 0.5879920415089923, + "grad_norm": 0.2649754285812378, + "learning_rate": 6.231999999999999e-07, + "loss": 0.3219, + "mean_token_accuracy": 0.9248533174395561, + "num_tokens": 67853540.0, + "step": 942 + }, + { + "epoch": 0.5886162368821442, + "grad_norm": 0.2228899896144867, + "learning_rate": 6.228e-07, + "loss": 0.3183, + "mean_token_accuracy": 0.9270938411355019, + "num_tokens": 67926136.0, + "step": 943 + }, + { + "epoch": 0.5892404322552959, + "grad_norm": 0.3619498908519745, + "learning_rate": 6.224e-07, + "loss": 0.3212, + "mean_token_accuracy": 0.9239177778363228, + "num_tokens": 68000306.0, + "step": 944 + }, + { + "epoch": 0.5898646276284477, + "grad_norm": 0.3066573143005371, + "learning_rate": 6.219999999999999e-07, + "loss": 0.2775, + "mean_token_accuracy": 0.9364175796508789, + "num_tokens": 68080885.0, + "step": 945 + }, + { + "epoch": 0.5904888230015995, + "grad_norm": 0.3840203881263733, + "learning_rate": 6.216e-07, + "loss": 0.2915, + "mean_token_accuracy": 0.9344231821596622, + "num_tokens": 68159099.0, + "step": 946 + }, + { + "epoch": 0.5911130183747513, + "grad_norm": 0.2889833152294159, + "learning_rate": 6.212e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9206769168376923, + "num_tokens": 68229668.0, + "step": 947 + }, + { + "epoch": 0.5917372137479031, + "grad_norm": 0.30188289284706116, + "learning_rate": 6.208e-07, + "loss": 0.3534, + "mean_token_accuracy": 0.9204833246767521, + "num_tokens": 68296711.0, + "step": 948 + }, + { + "epoch": 0.5923614091210548, + "grad_norm": 0.4030374586582184, + "learning_rate": 6.203999999999999e-07, + "loss": 0.3452, + "mean_token_accuracy": 0.9226979650557041, + "num_tokens": 68369658.0, + "step": 949 + }, + { + "epoch": 0.5929856044942067, + "grad_norm": 0.22480404376983643, + "learning_rate": 6.2e-07, + "loss": 0.3053, + "mean_token_accuracy": 0.931752871721983, + "num_tokens": 68443758.0, + "step": 950 + }, + { + "epoch": 0.5936097998673585, + "grad_norm": 0.23992112278938293, + "learning_rate": 6.196e-07, + "loss": 0.3217, + "mean_token_accuracy": 0.9227739870548248, + "num_tokens": 68516928.0, + "step": 951 + }, + { + "epoch": 0.5942339952405102, + "grad_norm": 0.2592463195323944, + "learning_rate": 6.191999999999999e-07, + "loss": 0.3082, + "mean_token_accuracy": 0.9312540851533413, + "num_tokens": 68591076.0, + "step": 952 + }, + { + "epoch": 0.5948581906136621, + "grad_norm": 0.25232332944869995, + "learning_rate": 6.188e-07, + "loss": 0.3046, + "mean_token_accuracy": 0.9280905947089195, + "num_tokens": 68664847.0, + "step": 953 + }, + { + "epoch": 0.5954823859868139, + "grad_norm": 0.3403299152851105, + "learning_rate": 6.183999999999999e-07, + "loss": 0.3178, + "mean_token_accuracy": 0.930928997695446, + "num_tokens": 68740824.0, + "step": 954 + }, + { + "epoch": 0.5961065813599656, + "grad_norm": 0.4414551556110382, + "learning_rate": 6.18e-07, + "loss": 0.346, + "mean_token_accuracy": 0.9227063581347466, + "num_tokens": 68812612.0, + "step": 955 + }, + { + "epoch": 0.5967307767331175, + "grad_norm": 0.24249251186847687, + "learning_rate": 6.176e-07, + "loss": 0.2788, + "mean_token_accuracy": 0.9356615990400314, + "num_tokens": 68887988.0, + "step": 956 + }, + { + "epoch": 0.5973549721062693, + "grad_norm": 0.286842405796051, + "learning_rate": 6.172e-07, + "loss": 0.3207, + "mean_token_accuracy": 0.9255156442523003, + "num_tokens": 68958825.0, + "step": 957 + }, + { + "epoch": 0.597979167479421, + "grad_norm": 0.25298529863357544, + "learning_rate": 6.168e-07, + "loss": 0.3157, + "mean_token_accuracy": 0.9285575449466705, + "num_tokens": 69030300.0, + "step": 958 + }, + { + "epoch": 0.5986033628525729, + "grad_norm": 0.24993418157100677, + "learning_rate": 6.163999999999999e-07, + "loss": 0.3193, + "mean_token_accuracy": 0.9286920800805092, + "num_tokens": 69104377.0, + "step": 959 + }, + { + "epoch": 0.5992275582257246, + "grad_norm": 0.3230576813220978, + "learning_rate": 6.16e-07, + "loss": 0.3252, + "mean_token_accuracy": 0.9245075061917305, + "num_tokens": 69175213.0, + "step": 960 + }, + { + "epoch": 0.5998517535988764, + "grad_norm": 0.3114889860153198, + "learning_rate": 6.156e-07, + "loss": 0.3357, + "mean_token_accuracy": 0.9220588020980358, + "num_tokens": 69244002.0, + "step": 961 + }, + { + "epoch": 0.6004759489720283, + "grad_norm": 0.7709341645240784, + "learning_rate": 6.152e-07, + "loss": 0.332, + "mean_token_accuracy": 0.9231755025684834, + "num_tokens": 69313761.0, + "step": 962 + }, + { + "epoch": 0.60110014434518, + "grad_norm": 0.2480955421924591, + "learning_rate": 6.148e-07, + "loss": 0.2954, + "mean_token_accuracy": 0.930020809173584, + "num_tokens": 69387535.0, + "step": 963 + }, + { + "epoch": 0.6017243397183318, + "grad_norm": 0.2863242030143738, + "learning_rate": 6.143999999999999e-07, + "loss": 0.3176, + "mean_token_accuracy": 0.9253921695053577, + "num_tokens": 69461407.0, + "step": 964 + }, + { + "epoch": 0.6023485350914837, + "grad_norm": 0.27440109848976135, + "learning_rate": 6.14e-07, + "loss": 0.3052, + "mean_token_accuracy": 0.9294677600264549, + "num_tokens": 69535781.0, + "step": 965 + }, + { + "epoch": 0.6029727304646354, + "grad_norm": 0.2807350158691406, + "learning_rate": 6.136e-07, + "loss": 0.3112, + "mean_token_accuracy": 0.9273786917328835, + "num_tokens": 69608015.0, + "step": 966 + }, + { + "epoch": 0.6035969258377872, + "grad_norm": 0.25397181510925293, + "learning_rate": 6.131999999999999e-07, + "loss": 0.2883, + "mean_token_accuracy": 0.9340161979198456, + "num_tokens": 69686884.0, + "step": 967 + }, + { + "epoch": 0.6042211212109391, + "grad_norm": 0.28548285365104675, + "learning_rate": 6.128e-07, + "loss": 0.3666, + "mean_token_accuracy": 0.9168828167021275, + "num_tokens": 69756019.0, + "step": 968 + }, + { + "epoch": 0.6048453165840908, + "grad_norm": 0.3107945919036865, + "learning_rate": 6.124000000000001e-07, + "loss": 0.2937, + "mean_token_accuracy": 0.931410439312458, + "num_tokens": 69831995.0, + "step": 969 + }, + { + "epoch": 0.6054695119572426, + "grad_norm": 0.2681730091571808, + "learning_rate": 6.119999999999999e-07, + "loss": 0.2895, + "mean_token_accuracy": 0.932623777538538, + "num_tokens": 69908398.0, + "step": 970 + }, + { + "epoch": 0.6060937073303944, + "grad_norm": 0.3379480242729187, + "learning_rate": 6.116e-07, + "loss": 0.3231, + "mean_token_accuracy": 0.9255988374352455, + "num_tokens": 69980869.0, + "step": 971 + }, + { + "epoch": 0.6067179027035462, + "grad_norm": 0.38758599758148193, + "learning_rate": 6.112e-07, + "loss": 0.2985, + "mean_token_accuracy": 0.9341311641037464, + "num_tokens": 70051853.0, + "step": 972 + }, + { + "epoch": 0.607342098076698, + "grad_norm": 0.3435940146446228, + "learning_rate": 6.107999999999999e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.9263659529387951, + "num_tokens": 70121755.0, + "step": 973 + }, + { + "epoch": 0.6079662934498498, + "grad_norm": 0.2881772816181183, + "learning_rate": 6.104e-07, + "loss": 0.313, + "mean_token_accuracy": 0.9251901470124722, + "num_tokens": 70196903.0, + "step": 974 + }, + { + "epoch": 0.6085904888230016, + "grad_norm": 0.2645215690135956, + "learning_rate": 6.1e-07, + "loss": 0.326, + "mean_token_accuracy": 0.9231912940740585, + "num_tokens": 70265776.0, + "step": 975 + }, + { + "epoch": 0.6092146841961534, + "grad_norm": 0.3434924781322479, + "learning_rate": 6.096e-07, + "loss": 0.2883, + "mean_token_accuracy": 0.9334718286991119, + "num_tokens": 70343965.0, + "step": 976 + }, + { + "epoch": 0.6098388795693052, + "grad_norm": 0.35382527112960815, + "learning_rate": 6.091999999999999e-07, + "loss": 0.3257, + "mean_token_accuracy": 0.9259385503828526, + "num_tokens": 70415493.0, + "step": 977 + }, + { + "epoch": 0.610463074942457, + "grad_norm": 0.34119918942451477, + "learning_rate": 6.088e-07, + "loss": 0.3338, + "mean_token_accuracy": 0.922543577849865, + "num_tokens": 70486317.0, + "step": 978 + }, + { + "epoch": 0.6110872703156088, + "grad_norm": 0.26998358964920044, + "learning_rate": 6.084000000000001e-07, + "loss": 0.3708, + "mean_token_accuracy": 0.9130705781280994, + "num_tokens": 70553635.0, + "step": 979 + }, + { + "epoch": 0.6117114656887606, + "grad_norm": 0.2922830581665039, + "learning_rate": 6.079999999999999e-07, + "loss": 0.2852, + "mean_token_accuracy": 0.9325008504092693, + "num_tokens": 70630489.0, + "step": 980 + }, + { + "epoch": 0.6123356610619124, + "grad_norm": 0.5830912590026855, + "learning_rate": 6.076e-07, + "loss": 0.3162, + "mean_token_accuracy": 0.9273079708218575, + "num_tokens": 70704467.0, + "step": 981 + }, + { + "epoch": 0.6129598564350641, + "grad_norm": 0.2736106514930725, + "learning_rate": 6.072e-07, + "loss": 0.3606, + "mean_token_accuracy": 0.91494956985116, + "num_tokens": 70772618.0, + "step": 982 + }, + { + "epoch": 0.613584051808216, + "grad_norm": 0.4670395255088806, + "learning_rate": 6.068e-07, + "loss": 0.3554, + "mean_token_accuracy": 0.919471625238657, + "num_tokens": 70842812.0, + "step": 983 + }, + { + "epoch": 0.6142082471813678, + "grad_norm": 1.4782307147979736, + "learning_rate": 6.064e-07, + "loss": 0.3009, + "mean_token_accuracy": 0.9287406280636787, + "num_tokens": 70915951.0, + "step": 984 + }, + { + "epoch": 0.6148324425545195, + "grad_norm": 0.26075005531311035, + "learning_rate": 6.06e-07, + "loss": 0.3239, + "mean_token_accuracy": 0.9193221107125282, + "num_tokens": 70987901.0, + "step": 985 + }, + { + "epoch": 0.6154566379276714, + "grad_norm": 0.4419979453086853, + "learning_rate": 6.056e-07, + "loss": 0.282, + "mean_token_accuracy": 0.932734627276659, + "num_tokens": 71067581.0, + "step": 986 + }, + { + "epoch": 0.6160808333008232, + "grad_norm": 0.25967687368392944, + "learning_rate": 6.051999999999999e-07, + "loss": 0.2964, + "mean_token_accuracy": 0.9326891787350178, + "num_tokens": 71138064.0, + "step": 987 + }, + { + "epoch": 0.6167050286739749, + "grad_norm": 0.2729571461677551, + "learning_rate": 6.048e-07, + "loss": 0.3181, + "mean_token_accuracy": 0.9252961538732052, + "num_tokens": 71212646.0, + "step": 988 + }, + { + "epoch": 0.6173292240471268, + "grad_norm": 0.6203498244285583, + "learning_rate": 6.044e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9260466694831848, + "num_tokens": 71283979.0, + "step": 989 + }, + { + "epoch": 0.6179534194202786, + "grad_norm": 0.23159927129745483, + "learning_rate": 6.04e-07, + "loss": 0.3351, + "mean_token_accuracy": 0.922436885535717, + "num_tokens": 71352496.0, + "step": 990 + }, + { + "epoch": 0.6185776147934303, + "grad_norm": 1.0212888717651367, + "learning_rate": 6.036e-07, + "loss": 0.3335, + "mean_token_accuracy": 0.9248460121452808, + "num_tokens": 71425071.0, + "step": 991 + }, + { + "epoch": 0.6192018101665822, + "grad_norm": 0.8426920771598816, + "learning_rate": 6.031999999999999e-07, + "loss": 0.3457, + "mean_token_accuracy": 0.9211708642542362, + "num_tokens": 71495542.0, + "step": 992 + }, + { + "epoch": 0.6198260055397339, + "grad_norm": 0.2713734209537506, + "learning_rate": 6.028e-07, + "loss": 0.2941, + "mean_token_accuracy": 0.9296718016266823, + "num_tokens": 71574519.0, + "step": 993 + }, + { + "epoch": 0.6204502009128857, + "grad_norm": 0.3907274603843689, + "learning_rate": 6.024e-07, + "loss": 0.3075, + "mean_token_accuracy": 0.9300686977803707, + "num_tokens": 71651496.0, + "step": 994 + }, + { + "epoch": 0.6210743962860376, + "grad_norm": 0.5442532300949097, + "learning_rate": 6.019999999999999e-07, + "loss": 0.3043, + "mean_token_accuracy": 0.929480466991663, + "num_tokens": 71724206.0, + "step": 995 + }, + { + "epoch": 0.6216985916591893, + "grad_norm": 0.1812916249036789, + "learning_rate": 6.016e-07, + "loss": 0.3389, + "mean_token_accuracy": 0.9178191274404526, + "num_tokens": 71792233.0, + "step": 996 + }, + { + "epoch": 0.6223227870323411, + "grad_norm": 0.3315819799900055, + "learning_rate": 6.012e-07, + "loss": 0.2923, + "mean_token_accuracy": 0.9316144585609436, + "num_tokens": 71865719.0, + "step": 997 + }, + { + "epoch": 0.622946982405493, + "grad_norm": 0.3334830701351166, + "learning_rate": 6.007999999999999e-07, + "loss": 0.3289, + "mean_token_accuracy": 0.9240243583917618, + "num_tokens": 71936941.0, + "step": 998 + }, + { + "epoch": 0.6235711777786447, + "grad_norm": 0.31655794382095337, + "learning_rate": 6.004e-07, + "loss": 0.3194, + "mean_token_accuracy": 0.9260966926813126, + "num_tokens": 72010180.0, + "step": 999 + }, + { + "epoch": 0.6241953731517965, + "grad_norm": 0.2527737319469452, + "learning_rate": 6e-07, + "loss": 0.3314, + "mean_token_accuracy": 0.924817081540823, + "num_tokens": 72081845.0, + "step": 1000 + }, + { + "epoch": 0.6248195685249484, + "grad_norm": 0.30754899978637695, + "learning_rate": 5.995999999999999e-07, + "loss": 0.3307, + "mean_token_accuracy": 0.9243885576725006, + "num_tokens": 72151266.0, + "step": 1001 + }, + { + "epoch": 0.6254437638981001, + "grad_norm": 0.2923766076564789, + "learning_rate": 5.991999999999999e-07, + "loss": 0.349, + "mean_token_accuracy": 0.9197619557380676, + "num_tokens": 72222505.0, + "step": 1002 + }, + { + "epoch": 0.6260679592712519, + "grad_norm": 0.3103010654449463, + "learning_rate": 5.988e-07, + "loss": 0.2924, + "mean_token_accuracy": 0.9275066144764423, + "num_tokens": 72292613.0, + "step": 1003 + }, + { + "epoch": 0.6266921546444038, + "grad_norm": 0.26842477917671204, + "learning_rate": 5.984000000000001e-07, + "loss": 0.2873, + "mean_token_accuracy": 0.9319330155849457, + "num_tokens": 72368511.0, + "step": 1004 + }, + { + "epoch": 0.6273163500175555, + "grad_norm": 0.3967761993408203, + "learning_rate": 5.979999999999999e-07, + "loss": 0.3416, + "mean_token_accuracy": 0.9200648814439774, + "num_tokens": 72438269.0, + "step": 1005 + }, + { + "epoch": 0.6279405453907073, + "grad_norm": 0.2200348675251007, + "learning_rate": 5.976e-07, + "loss": 0.3398, + "mean_token_accuracy": 0.922091256827116, + "num_tokens": 72510715.0, + "step": 1006 + }, + { + "epoch": 0.628564740763859, + "grad_norm": 0.25439026951789856, + "learning_rate": 5.972e-07, + "loss": 0.3071, + "mean_token_accuracy": 0.9271485432982445, + "num_tokens": 72586474.0, + "step": 1007 + }, + { + "epoch": 0.6291889361370109, + "grad_norm": 0.2740319073200226, + "learning_rate": 5.967999999999999e-07, + "loss": 0.3162, + "mean_token_accuracy": 0.9284652359783649, + "num_tokens": 72660656.0, + "step": 1008 + }, + { + "epoch": 0.6298131315101627, + "grad_norm": 0.31265708804130554, + "learning_rate": 5.964e-07, + "loss": 0.3203, + "mean_token_accuracy": 0.9259002022445202, + "num_tokens": 72732417.0, + "step": 1009 + }, + { + "epoch": 0.6304373268833144, + "grad_norm": 0.30185502767562866, + "learning_rate": 5.96e-07, + "loss": 0.2965, + "mean_token_accuracy": 0.9297071471810341, + "num_tokens": 72808141.0, + "step": 1010 + }, + { + "epoch": 0.6310615222564663, + "grad_norm": 0.28273630142211914, + "learning_rate": 5.956e-07, + "loss": 0.2955, + "mean_token_accuracy": 0.9329323172569275, + "num_tokens": 72880603.0, + "step": 1011 + }, + { + "epoch": 0.6316857176296181, + "grad_norm": 0.2927215099334717, + "learning_rate": 5.951999999999999e-07, + "loss": 0.3222, + "mean_token_accuracy": 0.9178882613778114, + "num_tokens": 72947944.0, + "step": 1012 + }, + { + "epoch": 0.6323099130027698, + "grad_norm": 0.21952751278877258, + "learning_rate": 5.948e-07, + "loss": 0.324, + "mean_token_accuracy": 0.9255367890000343, + "num_tokens": 73016582.0, + "step": 1013 + }, + { + "epoch": 0.6329341083759217, + "grad_norm": 0.33792880177497864, + "learning_rate": 5.944e-07, + "loss": 0.272, + "mean_token_accuracy": 0.9355400018393993, + "num_tokens": 73094918.0, + "step": 1014 + }, + { + "epoch": 0.6335583037490735, + "grad_norm": 0.3434586226940155, + "learning_rate": 5.939999999999999e-07, + "loss": 0.317, + "mean_token_accuracy": 0.9279661066830158, + "num_tokens": 73168919.0, + "step": 1015 + }, + { + "epoch": 0.6341824991222252, + "grad_norm": 0.2629319727420807, + "learning_rate": 5.936e-07, + "loss": 0.3263, + "mean_token_accuracy": 0.9267872013151646, + "num_tokens": 73243089.0, + "step": 1016 + }, + { + "epoch": 0.6348066944953771, + "grad_norm": 0.39811626076698303, + "learning_rate": 5.931999999999999e-07, + "loss": 0.3217, + "mean_token_accuracy": 0.9223091788589954, + "num_tokens": 73311867.0, + "step": 1017 + }, + { + "epoch": 0.6354308898685288, + "grad_norm": 0.2064669281244278, + "learning_rate": 5.928e-07, + "loss": 0.3174, + "mean_token_accuracy": 0.9268036745488644, + "num_tokens": 73384845.0, + "step": 1018 + }, + { + "epoch": 0.6360550852416806, + "grad_norm": 0.24848033487796783, + "learning_rate": 5.924e-07, + "loss": 0.2927, + "mean_token_accuracy": 0.9327281787991524, + "num_tokens": 73459495.0, + "step": 1019 + }, + { + "epoch": 0.6366792806148325, + "grad_norm": 0.41608864068984985, + "learning_rate": 5.919999999999999e-07, + "loss": 0.3265, + "mean_token_accuracy": 0.9255826324224472, + "num_tokens": 73533553.0, + "step": 1020 + }, + { + "epoch": 0.6373034759879842, + "grad_norm": 0.24101965129375458, + "learning_rate": 5.916e-07, + "loss": 0.3322, + "mean_token_accuracy": 0.9248477481305599, + "num_tokens": 73607376.0, + "step": 1021 + }, + { + "epoch": 0.637927671361136, + "grad_norm": 0.3014102876186371, + "learning_rate": 5.911999999999999e-07, + "loss": 0.3089, + "mean_token_accuracy": 0.930167380720377, + "num_tokens": 73685049.0, + "step": 1022 + }, + { + "epoch": 0.6385518667342879, + "grad_norm": 0.26585087180137634, + "learning_rate": 5.907999999999999e-07, + "loss": 0.3189, + "mean_token_accuracy": 0.9292494580149651, + "num_tokens": 73759412.0, + "step": 1023 + }, + { + "epoch": 0.6391760621074396, + "grad_norm": 0.36861222982406616, + "learning_rate": 5.904e-07, + "loss": 0.2837, + "mean_token_accuracy": 0.9366088695824146, + "num_tokens": 73835545.0, + "step": 1024 + }, + { + "epoch": 0.6398002574805914, + "grad_norm": 0.34126123785972595, + "learning_rate": 5.9e-07, + "loss": 0.3399, + "mean_token_accuracy": 0.9171537421643734, + "num_tokens": 73906259.0, + "step": 1025 + }, + { + "epoch": 0.6404244528537433, + "grad_norm": 1.1243757009506226, + "learning_rate": 5.896e-07, + "loss": 0.3573, + "mean_token_accuracy": 0.9172572605311871, + "num_tokens": 73974608.0, + "step": 1026 + }, + { + "epoch": 0.641048648226895, + "grad_norm": 0.2576483488082886, + "learning_rate": 5.891999999999999e-07, + "loss": 0.3107, + "mean_token_accuracy": 0.9273215867578983, + "num_tokens": 74046357.0, + "step": 1027 + }, + { + "epoch": 0.6416728436000468, + "grad_norm": 0.28565216064453125, + "learning_rate": 5.888e-07, + "loss": 0.3474, + "mean_token_accuracy": 0.9194425828754902, + "num_tokens": 74114802.0, + "step": 1028 + }, + { + "epoch": 0.6422970389731986, + "grad_norm": 0.289669007062912, + "learning_rate": 5.884000000000001e-07, + "loss": 0.3147, + "mean_token_accuracy": 0.9276023507118225, + "num_tokens": 74187273.0, + "step": 1029 + }, + { + "epoch": 0.6429212343463504, + "grad_norm": 0.48835429549217224, + "learning_rate": 5.879999999999999e-07, + "loss": 0.3128, + "mean_token_accuracy": 0.9267511554062366, + "num_tokens": 74260128.0, + "step": 1030 + }, + { + "epoch": 0.6435454297195022, + "grad_norm": 0.2122006118297577, + "learning_rate": 5.876e-07, + "loss": 0.3367, + "mean_token_accuracy": 0.9258723147213459, + "num_tokens": 74329431.0, + "step": 1031 + }, + { + "epoch": 0.644169625092654, + "grad_norm": 0.24304164946079254, + "learning_rate": 5.872000000000001e-07, + "loss": 0.2769, + "mean_token_accuracy": 0.9348785765469074, + "num_tokens": 74401332.0, + "step": 1032 + }, + { + "epoch": 0.6447938204658058, + "grad_norm": 0.27673792839050293, + "learning_rate": 5.867999999999999e-07, + "loss": 0.2916, + "mean_token_accuracy": 0.9338854737579823, + "num_tokens": 74477389.0, + "step": 1033 + }, + { + "epoch": 0.6454180158389576, + "grad_norm": 0.25692564249038696, + "learning_rate": 5.864e-07, + "loss": 0.3018, + "mean_token_accuracy": 0.9289196506142616, + "num_tokens": 74551014.0, + "step": 1034 + }, + { + "epoch": 0.6460422112121094, + "grad_norm": 0.25651422142982483, + "learning_rate": 5.86e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9200252331793308, + "num_tokens": 74619147.0, + "step": 1035 + }, + { + "epoch": 0.6466664065852612, + "grad_norm": 0.21372392773628235, + "learning_rate": 5.856e-07, + "loss": 0.3166, + "mean_token_accuracy": 0.923603281378746, + "num_tokens": 74687814.0, + "step": 1036 + }, + { + "epoch": 0.647290601958413, + "grad_norm": 0.3708471357822418, + "learning_rate": 5.852e-07, + "loss": 0.2857, + "mean_token_accuracy": 0.9354647286236286, + "num_tokens": 74760333.0, + "step": 1037 + }, + { + "epoch": 0.6479147973315648, + "grad_norm": 0.32684215903282166, + "learning_rate": 5.848e-07, + "loss": 0.3186, + "mean_token_accuracy": 0.9277516454458237, + "num_tokens": 74831104.0, + "step": 1038 + }, + { + "epoch": 0.6485389927047166, + "grad_norm": 0.23108205199241638, + "learning_rate": 5.844e-07, + "loss": 0.3274, + "mean_token_accuracy": 0.9262121766805649, + "num_tokens": 74901043.0, + "step": 1039 + }, + { + "epoch": 0.6491631880778683, + "grad_norm": 0.24368025362491608, + "learning_rate": 5.839999999999999e-07, + "loss": 0.2833, + "mean_token_accuracy": 0.9331041537225246, + "num_tokens": 74978316.0, + "step": 1040 + }, + { + "epoch": 0.6497873834510202, + "grad_norm": 0.3697032332420349, + "learning_rate": 5.836e-07, + "loss": 0.3397, + "mean_token_accuracy": 0.9175243563950062, + "num_tokens": 75050407.0, + "step": 1041 + }, + { + "epoch": 0.650411578824172, + "grad_norm": 0.23026074469089508, + "learning_rate": 5.832e-07, + "loss": 0.2913, + "mean_token_accuracy": 0.9329636693000793, + "num_tokens": 75127801.0, + "step": 1042 + }, + { + "epoch": 0.6510357741973237, + "grad_norm": 0.296919584274292, + "learning_rate": 5.828e-07, + "loss": 0.2904, + "mean_token_accuracy": 0.9306536242365837, + "num_tokens": 75200690.0, + "step": 1043 + }, + { + "epoch": 0.6516599695704756, + "grad_norm": 0.30488839745521545, + "learning_rate": 5.824e-07, + "loss": 0.3192, + "mean_token_accuracy": 0.9311292394995689, + "num_tokens": 75270050.0, + "step": 1044 + }, + { + "epoch": 0.6522841649436274, + "grad_norm": 0.2895083427429199, + "learning_rate": 5.819999999999999e-07, + "loss": 0.3043, + "mean_token_accuracy": 0.9296497106552124, + "num_tokens": 75346293.0, + "step": 1045 + }, + { + "epoch": 0.6529083603167791, + "grad_norm": 0.3535846471786499, + "learning_rate": 5.816e-07, + "loss": 0.3011, + "mean_token_accuracy": 0.9290899373590946, + "num_tokens": 75421019.0, + "step": 1046 + }, + { + "epoch": 0.653532555689931, + "grad_norm": 0.32460275292396545, + "learning_rate": 5.812e-07, + "loss": 0.3067, + "mean_token_accuracy": 0.9282597526907921, + "num_tokens": 75490525.0, + "step": 1047 + }, + { + "epoch": 0.6541567510630828, + "grad_norm": 0.26844221353530884, + "learning_rate": 5.807999999999999e-07, + "loss": 0.2916, + "mean_token_accuracy": 0.9347451776266098, + "num_tokens": 75566775.0, + "step": 1048 + }, + { + "epoch": 0.6547809464362345, + "grad_norm": 0.565582811832428, + "learning_rate": 5.804e-07, + "loss": 0.276, + "mean_token_accuracy": 0.9326979443430901, + "num_tokens": 75643347.0, + "step": 1049 + }, + { + "epoch": 0.6554051418093864, + "grad_norm": 0.2550245225429535, + "learning_rate": 5.8e-07, + "loss": 0.2962, + "mean_token_accuracy": 0.9318430200219154, + "num_tokens": 75721639.0, + "step": 1050 + }, + { + "epoch": 0.6560293371825381, + "grad_norm": 0.3265411853790283, + "learning_rate": 5.796e-07, + "loss": 0.3214, + "mean_token_accuracy": 0.9311969913542271, + "num_tokens": 75796918.0, + "step": 1051 + }, + { + "epoch": 0.6566535325556899, + "grad_norm": 0.2856011688709259, + "learning_rate": 5.792e-07, + "loss": 0.3369, + "mean_token_accuracy": 0.9250219725072384, + "num_tokens": 75868320.0, + "step": 1052 + }, + { + "epoch": 0.6572777279288418, + "grad_norm": 0.458871066570282, + "learning_rate": 5.788e-07, + "loss": 0.3384, + "mean_token_accuracy": 0.9227413944900036, + "num_tokens": 75936361.0, + "step": 1053 + }, + { + "epoch": 0.6579019233019935, + "grad_norm": 3.0488734245300293, + "learning_rate": 5.784e-07, + "loss": 0.301, + "mean_token_accuracy": 0.9288420751690865, + "num_tokens": 76009502.0, + "step": 1054 + }, + { + "epoch": 0.6585261186751453, + "grad_norm": 0.25852179527282715, + "learning_rate": 5.779999999999999e-07, + "loss": 0.3319, + "mean_token_accuracy": 0.9180583171546459, + "num_tokens": 76078165.0, + "step": 1055 + }, + { + "epoch": 0.6591503140482972, + "grad_norm": 0.27918270230293274, + "learning_rate": 5.776e-07, + "loss": 0.3577, + "mean_token_accuracy": 0.9163014180958271, + "num_tokens": 76147797.0, + "step": 1056 + }, + { + "epoch": 0.6597745094214489, + "grad_norm": 0.31718724966049194, + "learning_rate": 5.772000000000001e-07, + "loss": 0.2993, + "mean_token_accuracy": 0.9288002364337444, + "num_tokens": 76220576.0, + "step": 1057 + }, + { + "epoch": 0.6603987047946007, + "grad_norm": 0.2849857807159424, + "learning_rate": 5.767999999999999e-07, + "loss": 0.3427, + "mean_token_accuracy": 0.9198508150875568, + "num_tokens": 76288257.0, + "step": 1058 + }, + { + "epoch": 0.6610229001677526, + "grad_norm": 0.29711484909057617, + "learning_rate": 5.764e-07, + "loss": 0.3125, + "mean_token_accuracy": 0.9275774285197258, + "num_tokens": 76359535.0, + "step": 1059 + }, + { + "epoch": 0.6616470955409043, + "grad_norm": 0.293529212474823, + "learning_rate": 5.76e-07, + "loss": 0.2938, + "mean_token_accuracy": 0.9301333874464035, + "num_tokens": 76429062.0, + "step": 1060 + }, + { + "epoch": 0.6622712909140561, + "grad_norm": 0.49867019057273865, + "learning_rate": 5.755999999999999e-07, + "loss": 0.3377, + "mean_token_accuracy": 0.9208524413406849, + "num_tokens": 76499771.0, + "step": 1061 + }, + { + "epoch": 0.6628954862872078, + "grad_norm": 0.24410377442836761, + "learning_rate": 5.752e-07, + "loss": 0.2721, + "mean_token_accuracy": 0.935693085193634, + "num_tokens": 76573548.0, + "step": 1062 + }, + { + "epoch": 0.6635196816603597, + "grad_norm": 1.2921710014343262, + "learning_rate": 5.748e-07, + "loss": 0.2675, + "mean_token_accuracy": 0.937473576515913, + "num_tokens": 76650362.0, + "step": 1063 + }, + { + "epoch": 0.6641438770335115, + "grad_norm": 0.35362690687179565, + "learning_rate": 5.744e-07, + "loss": 0.3431, + "mean_token_accuracy": 0.9210461117327213, + "num_tokens": 76723965.0, + "step": 1064 + }, + { + "epoch": 0.6647680724066632, + "grad_norm": 0.35221511125564575, + "learning_rate": 5.739999999999999e-07, + "loss": 0.3355, + "mean_token_accuracy": 0.920368205755949, + "num_tokens": 76793553.0, + "step": 1065 + }, + { + "epoch": 0.6653922677798151, + "grad_norm": 0.24570664763450623, + "learning_rate": 5.736e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.928727813065052, + "num_tokens": 76869383.0, + "step": 1066 + }, + { + "epoch": 0.6660164631529669, + "grad_norm": 0.5916453003883362, + "learning_rate": 5.732e-07, + "loss": 0.3181, + "mean_token_accuracy": 0.9273256398737431, + "num_tokens": 76942498.0, + "step": 1067 + }, + { + "epoch": 0.6666406585261186, + "grad_norm": 0.26609939336776733, + "learning_rate": 5.727999999999999e-07, + "loss": 0.3391, + "mean_token_accuracy": 0.9205876365303993, + "num_tokens": 77017377.0, + "step": 1068 + }, + { + "epoch": 0.6672648538992705, + "grad_norm": 0.27983129024505615, + "learning_rate": 5.724e-07, + "loss": 0.3346, + "mean_token_accuracy": 0.9251751713454723, + "num_tokens": 77086766.0, + "step": 1069 + }, + { + "epoch": 0.6678890492724223, + "grad_norm": 0.3421202003955841, + "learning_rate": 5.719999999999999e-07, + "loss": 0.3085, + "mean_token_accuracy": 0.9284405633807182, + "num_tokens": 77161434.0, + "step": 1070 + }, + { + "epoch": 0.668513244645574, + "grad_norm": 0.6074190139770508, + "learning_rate": 5.716e-07, + "loss": 0.3001, + "mean_token_accuracy": 0.9273298345506191, + "num_tokens": 77236232.0, + "step": 1071 + }, + { + "epoch": 0.6691374400187259, + "grad_norm": 0.257010817527771, + "learning_rate": 5.712e-07, + "loss": 0.3288, + "mean_token_accuracy": 0.923653569072485, + "num_tokens": 77307546.0, + "step": 1072 + }, + { + "epoch": 0.6697616353918776, + "grad_norm": 0.36772382259368896, + "learning_rate": 5.707999999999999e-07, + "loss": 0.3324, + "mean_token_accuracy": 0.9224100597202778, + "num_tokens": 77380075.0, + "step": 1073 + }, + { + "epoch": 0.6703858307650294, + "grad_norm": 0.2195662409067154, + "learning_rate": 5.704e-07, + "loss": 0.3668, + "mean_token_accuracy": 0.9118894971907139, + "num_tokens": 77445973.0, + "step": 1074 + }, + { + "epoch": 0.6710100261381813, + "grad_norm": 0.3883196711540222, + "learning_rate": 5.699999999999999e-07, + "loss": 0.2905, + "mean_token_accuracy": 0.9322254471480846, + "num_tokens": 77521744.0, + "step": 1075 + }, + { + "epoch": 0.671634221511333, + "grad_norm": 0.4641028642654419, + "learning_rate": 5.696e-07, + "loss": 0.3361, + "mean_token_accuracy": 0.9230054169893265, + "num_tokens": 77595339.0, + "step": 1076 + }, + { + "epoch": 0.6722584168844848, + "grad_norm": 0.25512754917144775, + "learning_rate": 5.692e-07, + "loss": 0.2806, + "mean_token_accuracy": 0.935399379581213, + "num_tokens": 77672262.0, + "step": 1077 + }, + { + "epoch": 0.6728826122576367, + "grad_norm": 0.29242661595344543, + "learning_rate": 5.688e-07, + "loss": 0.3395, + "mean_token_accuracy": 0.9219987019896507, + "num_tokens": 77742048.0, + "step": 1078 + }, + { + "epoch": 0.6735068076307884, + "grad_norm": 0.2278558909893036, + "learning_rate": 5.684e-07, + "loss": 0.3342, + "mean_token_accuracy": 0.9236059114336967, + "num_tokens": 77810832.0, + "step": 1079 + }, + { + "epoch": 0.6741310030039402, + "grad_norm": 0.25265419483184814, + "learning_rate": 5.679999999999999e-07, + "loss": 0.3517, + "mean_token_accuracy": 0.9187329299747944, + "num_tokens": 77876778.0, + "step": 1080 + }, + { + "epoch": 0.6747551983770921, + "grad_norm": 0.3071189522743225, + "learning_rate": 5.676e-07, + "loss": 0.3291, + "mean_token_accuracy": 0.9230103567242622, + "num_tokens": 77945933.0, + "step": 1081 + }, + { + "epoch": 0.6753793937502438, + "grad_norm": 1.4307942390441895, + "learning_rate": 5.672e-07, + "loss": 0.2853, + "mean_token_accuracy": 0.9343712665140629, + "num_tokens": 78023992.0, + "step": 1082 + }, + { + "epoch": 0.6760035891233956, + "grad_norm": 0.27039507031440735, + "learning_rate": 5.667999999999999e-07, + "loss": 0.3433, + "mean_token_accuracy": 0.9170473702251911, + "num_tokens": 78091941.0, + "step": 1083 + }, + { + "epoch": 0.6766277844965474, + "grad_norm": 0.2514006197452545, + "learning_rate": 5.664e-07, + "loss": 0.3201, + "mean_token_accuracy": 0.9210469797253609, + "num_tokens": 78161079.0, + "step": 1084 + }, + { + "epoch": 0.6772519798696992, + "grad_norm": 0.2774260342121124, + "learning_rate": 5.66e-07, + "loss": 0.3479, + "mean_token_accuracy": 0.9168211035430431, + "num_tokens": 78229850.0, + "step": 1085 + }, + { + "epoch": 0.677876175242851, + "grad_norm": 0.280916690826416, + "learning_rate": 5.655999999999999e-07, + "loss": 0.3515, + "mean_token_accuracy": 0.9191113077104092, + "num_tokens": 78299351.0, + "step": 1086 + }, + { + "epoch": 0.6785003706160028, + "grad_norm": 0.26132720708847046, + "learning_rate": 5.652e-07, + "loss": 0.3528, + "mean_token_accuracy": 0.9195957481861115, + "num_tokens": 78367562.0, + "step": 1087 + }, + { + "epoch": 0.6791245659891546, + "grad_norm": 0.24149930477142334, + "learning_rate": 5.648e-07, + "loss": 0.2939, + "mean_token_accuracy": 0.9329637065529823, + "num_tokens": 78442243.0, + "step": 1088 + }, + { + "epoch": 0.6797487613623064, + "grad_norm": 0.2841884195804596, + "learning_rate": 5.643999999999999e-07, + "loss": 0.3225, + "mean_token_accuracy": 0.9268374778330326, + "num_tokens": 78511595.0, + "step": 1089 + }, + { + "epoch": 0.6803729567354582, + "grad_norm": 0.27396097779273987, + "learning_rate": 5.639999999999999e-07, + "loss": 0.345, + "mean_token_accuracy": 0.9210294783115387, + "num_tokens": 78581324.0, + "step": 1090 + }, + { + "epoch": 0.68099715210861, + "grad_norm": 0.24793218076229095, + "learning_rate": 5.636e-07, + "loss": 0.32, + "mean_token_accuracy": 0.9221481382846832, + "num_tokens": 78653509.0, + "step": 1091 + }, + { + "epoch": 0.6816213474817618, + "grad_norm": 0.25061824917793274, + "learning_rate": 5.632e-07, + "loss": 0.3437, + "mean_token_accuracy": 0.9142379276454449, + "num_tokens": 78722934.0, + "step": 1092 + }, + { + "epoch": 0.6822455428549136, + "grad_norm": 0.26395317912101746, + "learning_rate": 5.627999999999999e-07, + "loss": 0.2726, + "mean_token_accuracy": 0.9340312480926514, + "num_tokens": 78800071.0, + "step": 1093 + }, + { + "epoch": 0.6828697382280654, + "grad_norm": 0.31192225217819214, + "learning_rate": 5.624e-07, + "loss": 0.3165, + "mean_token_accuracy": 0.9267790764570236, + "num_tokens": 78871684.0, + "step": 1094 + }, + { + "epoch": 0.6834939336012171, + "grad_norm": 0.23404188454151154, + "learning_rate": 5.620000000000001e-07, + "loss": 0.3035, + "mean_token_accuracy": 0.9288293160498142, + "num_tokens": 78942589.0, + "step": 1095 + }, + { + "epoch": 0.684118128974369, + "grad_norm": 0.25686171650886536, + "learning_rate": 5.615999999999999e-07, + "loss": 0.3283, + "mean_token_accuracy": 0.924301490187645, + "num_tokens": 79010073.0, + "step": 1096 + }, + { + "epoch": 0.6847423243475208, + "grad_norm": 0.2640036642551422, + "learning_rate": 5.612e-07, + "loss": 0.3115, + "mean_token_accuracy": 0.9242626279592514, + "num_tokens": 79078968.0, + "step": 1097 + }, + { + "epoch": 0.6853665197206725, + "grad_norm": 0.32023507356643677, + "learning_rate": 5.608e-07, + "loss": 0.3469, + "mean_token_accuracy": 0.9191759489476681, + "num_tokens": 79149171.0, + "step": 1098 + }, + { + "epoch": 0.6859907150938244, + "grad_norm": 1.2757829427719116, + "learning_rate": 5.604e-07, + "loss": 0.3449, + "mean_token_accuracy": 0.920503307133913, + "num_tokens": 79219641.0, + "step": 1099 + }, + { + "epoch": 0.6866149104669762, + "grad_norm": 0.8865218162536621, + "learning_rate": 5.6e-07, + "loss": 0.3353, + "mean_token_accuracy": 0.9223778620362282, + "num_tokens": 79287719.0, + "step": 1100 + }, + { + "epoch": 0.6872391058401279, + "grad_norm": 0.24560661613941193, + "learning_rate": 5.596e-07, + "loss": 0.3115, + "mean_token_accuracy": 0.9249746948480606, + "num_tokens": 79359804.0, + "step": 1101 + }, + { + "epoch": 0.6878633012132798, + "grad_norm": 0.4992161989212036, + "learning_rate": 5.592e-07, + "loss": 0.2959, + "mean_token_accuracy": 0.9307670183479786, + "num_tokens": 79432330.0, + "step": 1102 + }, + { + "epoch": 0.6884874965864316, + "grad_norm": 0.3264545500278473, + "learning_rate": 5.588e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.9285875260829926, + "num_tokens": 79506482.0, + "step": 1103 + }, + { + "epoch": 0.6891116919595833, + "grad_norm": 0.2558642625808716, + "learning_rate": 5.584e-07, + "loss": 0.3149, + "mean_token_accuracy": 0.9253707379102707, + "num_tokens": 79578062.0, + "step": 1104 + }, + { + "epoch": 0.6897358873327352, + "grad_norm": 0.285108745098114, + "learning_rate": 5.58e-07, + "loss": 0.3172, + "mean_token_accuracy": 0.9280072525143623, + "num_tokens": 79650779.0, + "step": 1105 + }, + { + "epoch": 0.6903600827058869, + "grad_norm": 0.3656577467918396, + "learning_rate": 5.576e-07, + "loss": 0.3228, + "mean_token_accuracy": 0.9236637875437737, + "num_tokens": 79724011.0, + "step": 1106 + }, + { + "epoch": 0.6909842780790387, + "grad_norm": 0.44640854001045227, + "learning_rate": 5.572e-07, + "loss": 0.3345, + "mean_token_accuracy": 0.9220766797661781, + "num_tokens": 79792376.0, + "step": 1107 + }, + { + "epoch": 0.6916084734521906, + "grad_norm": 0.3463594317436218, + "learning_rate": 5.567999999999999e-07, + "loss": 0.3171, + "mean_token_accuracy": 0.9280374012887478, + "num_tokens": 79863450.0, + "step": 1108 + }, + { + "epoch": 0.6922326688253423, + "grad_norm": 0.3193429708480835, + "learning_rate": 5.564e-07, + "loss": 0.2916, + "mean_token_accuracy": 0.9300437718629837, + "num_tokens": 79934997.0, + "step": 1109 + }, + { + "epoch": 0.6928568641984941, + "grad_norm": 0.29075735807418823, + "learning_rate": 5.560000000000001e-07, + "loss": 0.3029, + "mean_token_accuracy": 0.9275107569992542, + "num_tokens": 80013261.0, + "step": 1110 + }, + { + "epoch": 0.693481059571646, + "grad_norm": 0.20447741448879242, + "learning_rate": 5.555999999999999e-07, + "loss": 0.3127, + "mean_token_accuracy": 0.9260498695075512, + "num_tokens": 80084041.0, + "step": 1111 + }, + { + "epoch": 0.6941052549447977, + "grad_norm": 0.24571886658668518, + "learning_rate": 5.552e-07, + "loss": 0.3316, + "mean_token_accuracy": 0.9213022030889988, + "num_tokens": 80159547.0, + "step": 1112 + }, + { + "epoch": 0.6947294503179495, + "grad_norm": 0.21463543176651, + "learning_rate": 5.548e-07, + "loss": 0.3097, + "mean_token_accuracy": 0.9271739348769188, + "num_tokens": 80232018.0, + "step": 1113 + }, + { + "epoch": 0.6953536456911014, + "grad_norm": 0.4201563596725464, + "learning_rate": 5.543999999999999e-07, + "loss": 0.333, + "mean_token_accuracy": 0.9247965849936008, + "num_tokens": 80303994.0, + "step": 1114 + }, + { + "epoch": 0.6959778410642531, + "grad_norm": 0.25205832719802856, + "learning_rate": 5.54e-07, + "loss": 0.307, + "mean_token_accuracy": 0.9304822348058224, + "num_tokens": 80374654.0, + "step": 1115 + }, + { + "epoch": 0.6966020364374049, + "grad_norm": 0.2772039771080017, + "learning_rate": 5.536e-07, + "loss": 0.3348, + "mean_token_accuracy": 0.9231976084411144, + "num_tokens": 80446684.0, + "step": 1116 + }, + { + "epoch": 0.6972262318105567, + "grad_norm": 0.22733274102210999, + "learning_rate": 5.532e-07, + "loss": 0.3483, + "mean_token_accuracy": 0.9190201461315155, + "num_tokens": 80515726.0, + "step": 1117 + }, + { + "epoch": 0.6978504271837085, + "grad_norm": 0.7022393345832825, + "learning_rate": 5.527999999999999e-07, + "loss": 0.3097, + "mean_token_accuracy": 0.9300166890025139, + "num_tokens": 80588345.0, + "step": 1118 + }, + { + "epoch": 0.6984746225568603, + "grad_norm": 0.20528490841388702, + "learning_rate": 5.524e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9315690137445927, + "num_tokens": 80660671.0, + "step": 1119 + }, + { + "epoch": 0.699098817930012, + "grad_norm": 0.4754799008369446, + "learning_rate": 5.520000000000001e-07, + "loss": 0.3211, + "mean_token_accuracy": 0.9300379939377308, + "num_tokens": 80730140.0, + "step": 1120 + }, + { + "epoch": 0.6997230133031639, + "grad_norm": 0.2463442087173462, + "learning_rate": 5.515999999999999e-07, + "loss": 0.3016, + "mean_token_accuracy": 0.9275045394897461, + "num_tokens": 80804348.0, + "step": 1121 + }, + { + "epoch": 0.7003472086763157, + "grad_norm": 0.1898731291294098, + "learning_rate": 5.512e-07, + "loss": 0.2825, + "mean_token_accuracy": 0.9353449232876301, + "num_tokens": 80879574.0, + "step": 1122 + }, + { + "epoch": 0.7009714040494675, + "grad_norm": 0.6623883843421936, + "learning_rate": 5.508e-07, + "loss": 0.3276, + "mean_token_accuracy": 0.9281446784734726, + "num_tokens": 80950691.0, + "step": 1123 + }, + { + "epoch": 0.7015955994226193, + "grad_norm": 0.23336370289325714, + "learning_rate": 5.504e-07, + "loss": 0.3177, + "mean_token_accuracy": 0.9276253059506416, + "num_tokens": 81020557.0, + "step": 1124 + }, + { + "epoch": 0.7022197947957711, + "grad_norm": 0.26593703031539917, + "learning_rate": 5.5e-07, + "loss": 0.3109, + "mean_token_accuracy": 0.9245895631611347, + "num_tokens": 81089783.0, + "step": 1125 + }, + { + "epoch": 0.7028439901689228, + "grad_norm": 0.2431308627128601, + "learning_rate": 5.496e-07, + "loss": 0.3145, + "mean_token_accuracy": 0.9274233393371105, + "num_tokens": 81165690.0, + "step": 1126 + }, + { + "epoch": 0.7034681855420747, + "grad_norm": 0.3533809185028076, + "learning_rate": 5.492e-07, + "loss": 0.2732, + "mean_token_accuracy": 0.9331811591982841, + "num_tokens": 81238025.0, + "step": 1127 + }, + { + "epoch": 0.7040923809152264, + "grad_norm": 1.7881593704223633, + "learning_rate": 5.487999999999999e-07, + "loss": 0.3463, + "mean_token_accuracy": 0.9221058301627636, + "num_tokens": 81313155.0, + "step": 1128 + }, + { + "epoch": 0.7047165762883782, + "grad_norm": 0.6505405902862549, + "learning_rate": 5.484e-07, + "loss": 0.3013, + "mean_token_accuracy": 0.931287981569767, + "num_tokens": 81390850.0, + "step": 1129 + }, + { + "epoch": 0.7053407716615301, + "grad_norm": 0.29145315289497375, + "learning_rate": 5.48e-07, + "loss": 0.3341, + "mean_token_accuracy": 0.9197108075022697, + "num_tokens": 81465105.0, + "step": 1130 + }, + { + "epoch": 0.7059649670346818, + "grad_norm": 0.23154234886169434, + "learning_rate": 5.476e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9168270416557789, + "num_tokens": 81534899.0, + "step": 1131 + }, + { + "epoch": 0.7065891624078336, + "grad_norm": 0.27271410822868347, + "learning_rate": 5.472e-07, + "loss": 0.2887, + "mean_token_accuracy": 0.9312710873782635, + "num_tokens": 81610957.0, + "step": 1132 + }, + { + "epoch": 0.7072133577809855, + "grad_norm": 0.30233708024024963, + "learning_rate": 5.467999999999999e-07, + "loss": 0.3306, + "mean_token_accuracy": 0.9224446229636669, + "num_tokens": 81684003.0, + "step": 1133 + }, + { + "epoch": 0.7078375531541372, + "grad_norm": 0.33163005113601685, + "learning_rate": 5.464e-07, + "loss": 0.2599, + "mean_token_accuracy": 0.9362748488783836, + "num_tokens": 81764118.0, + "step": 1134 + }, + { + "epoch": 0.708461748527289, + "grad_norm": 0.25369229912757874, + "learning_rate": 5.46e-07, + "loss": 0.2854, + "mean_token_accuracy": 0.9339959435164928, + "num_tokens": 81840295.0, + "step": 1135 + }, + { + "epoch": 0.7090859439004409, + "grad_norm": 0.23439450562000275, + "learning_rate": 5.455999999999999e-07, + "loss": 0.3107, + "mean_token_accuracy": 0.9274135306477547, + "num_tokens": 81913040.0, + "step": 1136 + }, + { + "epoch": 0.7097101392735926, + "grad_norm": 0.6647812128067017, + "learning_rate": 5.452e-07, + "loss": 0.3429, + "mean_token_accuracy": 0.9202546887099743, + "num_tokens": 81981209.0, + "step": 1137 + }, + { + "epoch": 0.7103343346467444, + "grad_norm": 0.5711204409599304, + "learning_rate": 5.448e-07, + "loss": 0.3147, + "mean_token_accuracy": 0.9273529760539532, + "num_tokens": 82057622.0, + "step": 1138 + }, + { + "epoch": 0.7109585300198962, + "grad_norm": 0.3273007869720459, + "learning_rate": 5.443999999999999e-07, + "loss": 0.3228, + "mean_token_accuracy": 0.9227526932954788, + "num_tokens": 82125387.0, + "step": 1139 + }, + { + "epoch": 0.711582725393048, + "grad_norm": 0.29881981015205383, + "learning_rate": 5.44e-07, + "loss": 0.296, + "mean_token_accuracy": 0.9305686764419079, + "num_tokens": 82196718.0, + "step": 1140 + }, + { + "epoch": 0.7122069207661998, + "grad_norm": 0.30085140466690063, + "learning_rate": 5.436e-07, + "loss": 0.3443, + "mean_token_accuracy": 0.9201326221227646, + "num_tokens": 82265601.0, + "step": 1141 + }, + { + "epoch": 0.7128311161393516, + "grad_norm": 0.31395289301872253, + "learning_rate": 5.431999999999999e-07, + "loss": 0.2956, + "mean_token_accuracy": 0.9267284162342548, + "num_tokens": 82336337.0, + "step": 1142 + }, + { + "epoch": 0.7134553115125034, + "grad_norm": 0.331177294254303, + "learning_rate": 5.427999999999999e-07, + "loss": 0.3357, + "mean_token_accuracy": 0.9213302060961723, + "num_tokens": 82409148.0, + "step": 1143 + }, + { + "epoch": 0.7140795068856552, + "grad_norm": 0.260649710893631, + "learning_rate": 5.424e-07, + "loss": 0.2915, + "mean_token_accuracy": 0.9330309964716434, + "num_tokens": 82486527.0, + "step": 1144 + }, + { + "epoch": 0.714703702258807, + "grad_norm": 0.23713193833827972, + "learning_rate": 5.420000000000001e-07, + "loss": 0.2992, + "mean_token_accuracy": 0.9204347282648087, + "num_tokens": 82558425.0, + "step": 1145 + }, + { + "epoch": 0.7153278976319588, + "grad_norm": 0.23159871995449066, + "learning_rate": 5.415999999999999e-07, + "loss": 0.3102, + "mean_token_accuracy": 0.9262193664908409, + "num_tokens": 82627489.0, + "step": 1146 + }, + { + "epoch": 0.7159520930051106, + "grad_norm": 0.33924880623817444, + "learning_rate": 5.412e-07, + "loss": 0.2991, + "mean_token_accuracy": 0.9295236617326736, + "num_tokens": 82704922.0, + "step": 1147 + }, + { + "epoch": 0.7165762883782624, + "grad_norm": 0.2937605082988739, + "learning_rate": 5.408e-07, + "loss": 0.2889, + "mean_token_accuracy": 0.93614861369133, + "num_tokens": 82777901.0, + "step": 1148 + }, + { + "epoch": 0.7172004837514142, + "grad_norm": 0.42500919103622437, + "learning_rate": 5.403999999999999e-07, + "loss": 0.3072, + "mean_token_accuracy": 0.9222142100334167, + "num_tokens": 82852325.0, + "step": 1149 + }, + { + "epoch": 0.7178246791245659, + "grad_norm": 0.27352285385131836, + "learning_rate": 5.4e-07, + "loss": 0.332, + "mean_token_accuracy": 0.9211334697902203, + "num_tokens": 82918865.0, + "step": 1150 + }, + { + "epoch": 0.7184488744977178, + "grad_norm": 0.5938454866409302, + "learning_rate": 5.396e-07, + "loss": 0.3432, + "mean_token_accuracy": 0.9211635179817677, + "num_tokens": 82982463.0, + "step": 1151 + }, + { + "epoch": 0.7190730698708696, + "grad_norm": 0.5616174936294556, + "learning_rate": 5.392e-07, + "loss": 0.3094, + "mean_token_accuracy": 0.9271268770098686, + "num_tokens": 83060368.0, + "step": 1152 + }, + { + "epoch": 0.7196972652440213, + "grad_norm": 0.3479961156845093, + "learning_rate": 5.387999999999999e-07, + "loss": 0.304, + "mean_token_accuracy": 0.9277598224580288, + "num_tokens": 83134323.0, + "step": 1153 + }, + { + "epoch": 0.7203214606171732, + "grad_norm": 0.33551540970802307, + "learning_rate": 5.384e-07, + "loss": 0.3181, + "mean_token_accuracy": 0.9247422218322754, + "num_tokens": 83208428.0, + "step": 1154 + }, + { + "epoch": 0.720945655990325, + "grad_norm": 0.432501882314682, + "learning_rate": 5.38e-07, + "loss": 0.3246, + "mean_token_accuracy": 0.9283667728304863, + "num_tokens": 83278841.0, + "step": 1155 + }, + { + "epoch": 0.7215698513634767, + "grad_norm": 0.3050890266895294, + "learning_rate": 5.375999999999999e-07, + "loss": 0.2936, + "mean_token_accuracy": 0.9275626316666603, + "num_tokens": 83353424.0, + "step": 1156 + }, + { + "epoch": 0.7221940467366286, + "grad_norm": 0.2365804761648178, + "learning_rate": 5.372e-07, + "loss": 0.3129, + "mean_token_accuracy": 0.9263198338449001, + "num_tokens": 83427209.0, + "step": 1157 + }, + { + "epoch": 0.7228182421097804, + "grad_norm": 0.34807029366493225, + "learning_rate": 5.368e-07, + "loss": 0.3214, + "mean_token_accuracy": 0.9230219200253487, + "num_tokens": 83498121.0, + "step": 1158 + }, + { + "epoch": 0.7234424374829321, + "grad_norm": 0.5189346671104431, + "learning_rate": 5.364e-07, + "loss": 0.3402, + "mean_token_accuracy": 0.9178859665989876, + "num_tokens": 83567511.0, + "step": 1159 + }, + { + "epoch": 0.724066632856084, + "grad_norm": 1.2859077453613281, + "learning_rate": 5.36e-07, + "loss": 0.3609, + "mean_token_accuracy": 0.9151715524494648, + "num_tokens": 83636597.0, + "step": 1160 + }, + { + "epoch": 0.7246908282292358, + "grad_norm": 0.2486700564622879, + "learning_rate": 5.355999999999999e-07, + "loss": 0.3143, + "mean_token_accuracy": 0.9226592965424061, + "num_tokens": 83706101.0, + "step": 1161 + }, + { + "epoch": 0.7253150236023875, + "grad_norm": 0.6346601843833923, + "learning_rate": 5.352e-07, + "loss": 0.3216, + "mean_token_accuracy": 0.9234752170741558, + "num_tokens": 83773773.0, + "step": 1162 + }, + { + "epoch": 0.7259392189755394, + "grad_norm": 0.30137914419174194, + "learning_rate": 5.348e-07, + "loss": 0.3529, + "mean_token_accuracy": 0.9158134274184704, + "num_tokens": 83844567.0, + "step": 1163 + }, + { + "epoch": 0.7265634143486911, + "grad_norm": 0.3054022192955017, + "learning_rate": 5.343999999999999e-07, + "loss": 0.3218, + "mean_token_accuracy": 0.9281169660389423, + "num_tokens": 83921226.0, + "step": 1164 + }, + { + "epoch": 0.7271876097218429, + "grad_norm": 0.29389411211013794, + "learning_rate": 5.34e-07, + "loss": 0.3242, + "mean_token_accuracy": 0.9247043579816818, + "num_tokens": 83992987.0, + "step": 1165 + }, + { + "epoch": 0.7278118050949948, + "grad_norm": 0.3375873565673828, + "learning_rate": 5.336e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9222918935120106, + "num_tokens": 84062828.0, + "step": 1166 + }, + { + "epoch": 0.7284360004681465, + "grad_norm": 0.38642024993896484, + "learning_rate": 5.331999999999999e-07, + "loss": 0.3307, + "mean_token_accuracy": 0.9208414405584335, + "num_tokens": 84129467.0, + "step": 1167 + }, + { + "epoch": 0.7290601958412983, + "grad_norm": 0.2882246673107147, + "learning_rate": 5.328e-07, + "loss": 0.3277, + "mean_token_accuracy": 0.919327512383461, + "num_tokens": 84202183.0, + "step": 1168 + }, + { + "epoch": 0.7296843912144502, + "grad_norm": 0.2627275586128235, + "learning_rate": 5.324e-07, + "loss": 0.317, + "mean_token_accuracy": 0.9249442145228386, + "num_tokens": 84274417.0, + "step": 1169 + }, + { + "epoch": 0.7303085865876019, + "grad_norm": 0.33341243863105774, + "learning_rate": 5.32e-07, + "loss": 0.3535, + "mean_token_accuracy": 0.9194446913897991, + "num_tokens": 84345278.0, + "step": 1170 + }, + { + "epoch": 0.7309327819607537, + "grad_norm": 0.2293102741241455, + "learning_rate": 5.315999999999999e-07, + "loss": 0.2889, + "mean_token_accuracy": 0.9320733360946178, + "num_tokens": 84419922.0, + "step": 1171 + }, + { + "epoch": 0.7315569773339056, + "grad_norm": 0.3681367039680481, + "learning_rate": 5.312e-07, + "loss": 0.3627, + "mean_token_accuracy": 0.9147992692887783, + "num_tokens": 84490845.0, + "step": 1172 + }, + { + "epoch": 0.7321811727070573, + "grad_norm": 0.21113243699073792, + "learning_rate": 5.308000000000001e-07, + "loss": 0.2969, + "mean_token_accuracy": 0.9292296580970287, + "num_tokens": 84564201.0, + "step": 1173 + }, + { + "epoch": 0.7328053680802091, + "grad_norm": 0.2981301248073578, + "learning_rate": 5.303999999999999e-07, + "loss": 0.3172, + "mean_token_accuracy": 0.9242628067731857, + "num_tokens": 84634849.0, + "step": 1174 + }, + { + "epoch": 0.7334295634533609, + "grad_norm": 0.2273181974887848, + "learning_rate": 5.3e-07, + "loss": 0.3217, + "mean_token_accuracy": 0.9239397868514061, + "num_tokens": 84704525.0, + "step": 1175 + }, + { + "epoch": 0.7340537588265127, + "grad_norm": 0.2427506148815155, + "learning_rate": 5.296e-07, + "loss": 0.3178, + "mean_token_accuracy": 0.9293803833425045, + "num_tokens": 84776812.0, + "step": 1176 + }, + { + "epoch": 0.7346779541996645, + "grad_norm": 0.5828445553779602, + "learning_rate": 5.292e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9298559501767159, + "num_tokens": 84851607.0, + "step": 1177 + }, + { + "epoch": 0.7353021495728163, + "grad_norm": 0.24520109593868256, + "learning_rate": 5.288e-07, + "loss": 0.3415, + "mean_token_accuracy": 0.9191625975072384, + "num_tokens": 84921086.0, + "step": 1178 + }, + { + "epoch": 0.7359263449459681, + "grad_norm": 0.3361842930316925, + "learning_rate": 5.284e-07, + "loss": 0.2918, + "mean_token_accuracy": 0.9332484044134617, + "num_tokens": 84996753.0, + "step": 1179 + }, + { + "epoch": 0.7365505403191199, + "grad_norm": 0.3142967224121094, + "learning_rate": 5.28e-07, + "loss": 0.3281, + "mean_token_accuracy": 0.925333172082901, + "num_tokens": 85071866.0, + "step": 1180 + }, + { + "epoch": 0.7371747356922717, + "grad_norm": 0.3148558735847473, + "learning_rate": 5.275999999999999e-07, + "loss": 0.2923, + "mean_token_accuracy": 0.9332444071769714, + "num_tokens": 85143995.0, + "step": 1181 + }, + { + "epoch": 0.7377989310654235, + "grad_norm": 0.6554597020149231, + "learning_rate": 5.272e-07, + "loss": 0.3287, + "mean_token_accuracy": 0.9221116863191128, + "num_tokens": 85219760.0, + "step": 1182 + }, + { + "epoch": 0.7384231264385753, + "grad_norm": 0.2787609398365021, + "learning_rate": 5.268e-07, + "loss": 0.325, + "mean_token_accuracy": 0.9222319461405277, + "num_tokens": 85289073.0, + "step": 1183 + }, + { + "epoch": 0.739047321811727, + "grad_norm": 0.5081881284713745, + "learning_rate": 5.264e-07, + "loss": 0.2489, + "mean_token_accuracy": 0.9398611001670361, + "num_tokens": 85367172.0, + "step": 1184 + }, + { + "epoch": 0.7396715171848789, + "grad_norm": 0.3034314513206482, + "learning_rate": 5.26e-07, + "loss": 0.3035, + "mean_token_accuracy": 0.927377812564373, + "num_tokens": 85444017.0, + "step": 1185 + }, + { + "epoch": 0.7402957125580306, + "grad_norm": 0.31223875284194946, + "learning_rate": 5.255999999999999e-07, + "loss": 0.2963, + "mean_token_accuracy": 0.9290304630994797, + "num_tokens": 85514464.0, + "step": 1186 + }, + { + "epoch": 0.7409199079311825, + "grad_norm": 0.23523546755313873, + "learning_rate": 5.252e-07, + "loss": 0.308, + "mean_token_accuracy": 0.9295033700764179, + "num_tokens": 85587447.0, + "step": 1187 + }, + { + "epoch": 0.7415441033043343, + "grad_norm": 1.1564338207244873, + "learning_rate": 5.248e-07, + "loss": 0.3048, + "mean_token_accuracy": 0.9319812916219234, + "num_tokens": 85662773.0, + "step": 1188 + }, + { + "epoch": 0.742168298677486, + "grad_norm": 0.4510135352611542, + "learning_rate": 5.243999999999999e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.9292332455515862, + "num_tokens": 85737489.0, + "step": 1189 + }, + { + "epoch": 0.7427924940506379, + "grad_norm": 0.27003124356269836, + "learning_rate": 5.24e-07, + "loss": 0.297, + "mean_token_accuracy": 0.9289721548557281, + "num_tokens": 85808985.0, + "step": 1190 + }, + { + "epoch": 0.7434166894237897, + "grad_norm": 0.22788093984127045, + "learning_rate": 5.236e-07, + "loss": 0.333, + "mean_token_accuracy": 0.9168750904500484, + "num_tokens": 85878979.0, + "step": 1191 + }, + { + "epoch": 0.7440408847969414, + "grad_norm": 0.32437437772750854, + "learning_rate": 5.232e-07, + "loss": 0.3146, + "mean_token_accuracy": 0.9285647124052048, + "num_tokens": 85948383.0, + "step": 1192 + }, + { + "epoch": 0.7446650801700933, + "grad_norm": 0.2690334916114807, + "learning_rate": 5.228e-07, + "loss": 0.3173, + "mean_token_accuracy": 0.9249335415661335, + "num_tokens": 86020000.0, + "step": 1193 + }, + { + "epoch": 0.7452892755432451, + "grad_norm": 0.22597944736480713, + "learning_rate": 5.224e-07, + "loss": 0.3412, + "mean_token_accuracy": 0.9151249043643475, + "num_tokens": 86087800.0, + "step": 1194 + }, + { + "epoch": 0.7459134709163968, + "grad_norm": 0.260163813829422, + "learning_rate": 5.22e-07, + "loss": 0.3179, + "mean_token_accuracy": 0.922410398721695, + "num_tokens": 86156314.0, + "step": 1195 + }, + { + "epoch": 0.7465376662895487, + "grad_norm": 0.22290292382240295, + "learning_rate": 5.215999999999999e-07, + "loss": 0.2994, + "mean_token_accuracy": 0.9332176260650158, + "num_tokens": 86228506.0, + "step": 1196 + }, + { + "epoch": 0.7471618616627004, + "grad_norm": 0.34301167726516724, + "learning_rate": 5.212e-07, + "loss": 0.3299, + "mean_token_accuracy": 0.9203734695911407, + "num_tokens": 86299097.0, + "step": 1197 + }, + { + "epoch": 0.7477860570358522, + "grad_norm": 0.3207155466079712, + "learning_rate": 5.208000000000001e-07, + "loss": 0.34, + "mean_token_accuracy": 0.9195326045155525, + "num_tokens": 86370154.0, + "step": 1198 + }, + { + "epoch": 0.748410252409004, + "grad_norm": 0.26653173565864563, + "learning_rate": 5.203999999999999e-07, + "loss": 0.3209, + "mean_token_accuracy": 0.9236043132841587, + "num_tokens": 86442656.0, + "step": 1199 + }, + { + "epoch": 0.7490344477821558, + "grad_norm": 0.4515193700790405, + "learning_rate": 5.2e-07, + "loss": 0.3026, + "mean_token_accuracy": 0.9314410462975502, + "num_tokens": 86519386.0, + "step": 1200 + }, + { + "epoch": 0.7496586431553076, + "grad_norm": 0.21775180101394653, + "learning_rate": 5.196e-07, + "loss": 0.3087, + "mean_token_accuracy": 0.9295671209692955, + "num_tokens": 86589521.0, + "step": 1201 + }, + { + "epoch": 0.7502828385284595, + "grad_norm": 0.22986090183258057, + "learning_rate": 5.191999999999999e-07, + "loss": 0.2933, + "mean_token_accuracy": 0.9292001985013485, + "num_tokens": 86662844.0, + "step": 1202 + }, + { + "epoch": 0.7509070339016112, + "grad_norm": 0.4639571011066437, + "learning_rate": 5.188e-07, + "loss": 0.3248, + "mean_token_accuracy": 0.9245342165231705, + "num_tokens": 86733766.0, + "step": 1203 + }, + { + "epoch": 0.751531229274763, + "grad_norm": 0.7723350524902344, + "learning_rate": 5.184e-07, + "loss": 0.3451, + "mean_token_accuracy": 0.92121996358037, + "num_tokens": 86805070.0, + "step": 1204 + }, + { + "epoch": 0.7521554246479148, + "grad_norm": 0.5926011204719543, + "learning_rate": 5.18e-07, + "loss": 0.2912, + "mean_token_accuracy": 0.9321501739323139, + "num_tokens": 86880174.0, + "step": 1205 + }, + { + "epoch": 0.7527796200210666, + "grad_norm": 0.5139439702033997, + "learning_rate": 5.175999999999999e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.932935394346714, + "num_tokens": 86954183.0, + "step": 1206 + }, + { + "epoch": 0.7534038153942184, + "grad_norm": 0.24623094499111176, + "learning_rate": 5.172e-07, + "loss": 0.3229, + "mean_token_accuracy": 0.9215483628213406, + "num_tokens": 87025854.0, + "step": 1207 + }, + { + "epoch": 0.7540280107673701, + "grad_norm": 0.2586802542209625, + "learning_rate": 5.168e-07, + "loss": 0.3025, + "mean_token_accuracy": 0.9273506328463554, + "num_tokens": 87098981.0, + "step": 1208 + }, + { + "epoch": 0.754652206140522, + "grad_norm": 0.2703830301761627, + "learning_rate": 5.163999999999999e-07, + "loss": 0.3028, + "mean_token_accuracy": 0.9293551780283451, + "num_tokens": 87172199.0, + "step": 1209 + }, + { + "epoch": 0.7552764015136738, + "grad_norm": 0.23472580313682556, + "learning_rate": 5.16e-07, + "loss": 0.3058, + "mean_token_accuracy": 0.9292072243988514, + "num_tokens": 87245738.0, + "step": 1210 + }, + { + "epoch": 0.7559005968868255, + "grad_norm": 0.23420746624469757, + "learning_rate": 5.155999999999999e-07, + "loss": 0.3251, + "mean_token_accuracy": 0.9235892593860626, + "num_tokens": 87321348.0, + "step": 1211 + }, + { + "epoch": 0.7565247922599774, + "grad_norm": 0.9890261292457581, + "learning_rate": 5.152e-07, + "loss": 0.3409, + "mean_token_accuracy": 0.9217673353850842, + "num_tokens": 87395879.0, + "step": 1212 + }, + { + "epoch": 0.7571489876331292, + "grad_norm": 0.2969365119934082, + "learning_rate": 5.148e-07, + "loss": 0.3267, + "mean_token_accuracy": 0.9257328025996685, + "num_tokens": 87468843.0, + "step": 1213 + }, + { + "epoch": 0.7577731830062809, + "grad_norm": 0.2662661373615265, + "learning_rate": 5.143999999999999e-07, + "loss": 0.3087, + "mean_token_accuracy": 0.9247562512755394, + "num_tokens": 87541321.0, + "step": 1214 + }, + { + "epoch": 0.7583973783794328, + "grad_norm": 0.23543278872966766, + "learning_rate": 5.14e-07, + "loss": 0.3107, + "mean_token_accuracy": 0.9278211444616318, + "num_tokens": 87610447.0, + "step": 1215 + }, + { + "epoch": 0.7590215737525846, + "grad_norm": 0.27834662795066833, + "learning_rate": 5.135999999999999e-07, + "loss": 0.3578, + "mean_token_accuracy": 0.9170341938734055, + "num_tokens": 87676501.0, + "step": 1216 + }, + { + "epoch": 0.7596457691257363, + "grad_norm": 0.613370418548584, + "learning_rate": 5.132e-07, + "loss": 0.3049, + "mean_token_accuracy": 0.9297184832394123, + "num_tokens": 87751856.0, + "step": 1217 + }, + { + "epoch": 0.7602699644988882, + "grad_norm": 0.4066840708255768, + "learning_rate": 5.128e-07, + "loss": 0.3136, + "mean_token_accuracy": 0.9266688153147697, + "num_tokens": 87823039.0, + "step": 1218 + }, + { + "epoch": 0.7608941598720399, + "grad_norm": 0.3204127550125122, + "learning_rate": 5.124e-07, + "loss": 0.3537, + "mean_token_accuracy": 0.9189915433526039, + "num_tokens": 87895424.0, + "step": 1219 + }, + { + "epoch": 0.7615183552451917, + "grad_norm": 0.3272000253200531, + "learning_rate": 5.12e-07, + "loss": 0.2976, + "mean_token_accuracy": 0.9300715290009975, + "num_tokens": 87967854.0, + "step": 1220 + }, + { + "epoch": 0.7621425506183436, + "grad_norm": 0.22308675944805145, + "learning_rate": 5.116e-07, + "loss": 0.3279, + "mean_token_accuracy": 0.927094504237175, + "num_tokens": 88040428.0, + "step": 1221 + }, + { + "epoch": 0.7627667459914953, + "grad_norm": 0.28196635842323303, + "learning_rate": 5.112e-07, + "loss": 0.3211, + "mean_token_accuracy": 0.9223690517246723, + "num_tokens": 88115272.0, + "step": 1222 + }, + { + "epoch": 0.7633909413646471, + "grad_norm": 0.3407323658466339, + "learning_rate": 5.108e-07, + "loss": 0.3531, + "mean_token_accuracy": 0.9218944385647774, + "num_tokens": 88185466.0, + "step": 1223 + }, + { + "epoch": 0.764015136737799, + "grad_norm": 0.24066978693008423, + "learning_rate": 5.103999999999999e-07, + "loss": 0.3089, + "mean_token_accuracy": 0.9254930429160595, + "num_tokens": 88259742.0, + "step": 1224 + }, + { + "epoch": 0.7646393321109507, + "grad_norm": 0.2670535743236542, + "learning_rate": 5.1e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.929579958319664, + "num_tokens": 88336244.0, + "step": 1225 + }, + { + "epoch": 0.7652635274841025, + "grad_norm": 0.24806809425354004, + "learning_rate": 5.096000000000001e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9274754598736763, + "num_tokens": 88409633.0, + "step": 1226 + }, + { + "epoch": 0.7658877228572544, + "grad_norm": 0.20797449350357056, + "learning_rate": 5.091999999999999e-07, + "loss": 0.3174, + "mean_token_accuracy": 0.9261702336370945, + "num_tokens": 88481721.0, + "step": 1227 + }, + { + "epoch": 0.7665119182304061, + "grad_norm": 0.37408581376075745, + "learning_rate": 5.088e-07, + "loss": 0.326, + "mean_token_accuracy": 0.9201435819268227, + "num_tokens": 88553270.0, + "step": 1228 + }, + { + "epoch": 0.7671361136035579, + "grad_norm": 0.3834930956363678, + "learning_rate": 5.084e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.9237430617213249, + "num_tokens": 88623105.0, + "step": 1229 + }, + { + "epoch": 0.7677603089767097, + "grad_norm": 0.6836141347885132, + "learning_rate": 5.079999999999999e-07, + "loss": 0.2964, + "mean_token_accuracy": 0.9251777045428753, + "num_tokens": 88693057.0, + "step": 1230 + }, + { + "epoch": 0.7683845043498615, + "grad_norm": 0.5935520529747009, + "learning_rate": 5.076e-07, + "loss": 0.3282, + "mean_token_accuracy": 0.9198493212461472, + "num_tokens": 88765019.0, + "step": 1231 + }, + { + "epoch": 0.7690086997230133, + "grad_norm": 0.257715106010437, + "learning_rate": 5.072e-07, + "loss": 0.2855, + "mean_token_accuracy": 0.934584129601717, + "num_tokens": 88843299.0, + "step": 1232 + }, + { + "epoch": 0.7696328950961651, + "grad_norm": 0.29923513531684875, + "learning_rate": 5.068e-07, + "loss": 0.313, + "mean_token_accuracy": 0.9283731169998646, + "num_tokens": 88917004.0, + "step": 1233 + }, + { + "epoch": 0.7702570904693169, + "grad_norm": 0.23763053119182587, + "learning_rate": 5.063999999999999e-07, + "loss": 0.3113, + "mean_token_accuracy": 0.9276343882083893, + "num_tokens": 88990640.0, + "step": 1234 + }, + { + "epoch": 0.7708812858424687, + "grad_norm": 0.38054537773132324, + "learning_rate": 5.06e-07, + "loss": 0.3302, + "mean_token_accuracy": 0.9211019538342953, + "num_tokens": 89059217.0, + "step": 1235 + }, + { + "epoch": 0.7715054812156205, + "grad_norm": 0.40893635153770447, + "learning_rate": 5.056e-07, + "loss": 0.3318, + "mean_token_accuracy": 0.917583804577589, + "num_tokens": 89128130.0, + "step": 1236 + }, + { + "epoch": 0.7721296765887723, + "grad_norm": 0.43935009837150574, + "learning_rate": 5.051999999999999e-07, + "loss": 0.2785, + "mean_token_accuracy": 0.9330038614571095, + "num_tokens": 89205327.0, + "step": 1237 + }, + { + "epoch": 0.7727538719619241, + "grad_norm": 0.2686617076396942, + "learning_rate": 5.048e-07, + "loss": 0.3264, + "mean_token_accuracy": 0.922701720148325, + "num_tokens": 89274451.0, + "step": 1238 + }, + { + "epoch": 0.7733780673350759, + "grad_norm": 1.4218528270721436, + "learning_rate": 5.043999999999999e-07, + "loss": 0.3326, + "mean_token_accuracy": 0.9182117246091366, + "num_tokens": 89346701.0, + "step": 1239 + }, + { + "epoch": 0.7740022627082277, + "grad_norm": 0.26049456000328064, + "learning_rate": 5.04e-07, + "loss": 0.2884, + "mean_token_accuracy": 0.930869460105896, + "num_tokens": 89419351.0, + "step": 1240 + }, + { + "epoch": 0.7746264580813794, + "grad_norm": 0.25617948174476624, + "learning_rate": 5.036e-07, + "loss": 0.3097, + "mean_token_accuracy": 0.9273911379277706, + "num_tokens": 89491198.0, + "step": 1241 + }, + { + "epoch": 0.7752506534545313, + "grad_norm": 0.34554216265678406, + "learning_rate": 5.032e-07, + "loss": 0.3022, + "mean_token_accuracy": 0.9329226016998291, + "num_tokens": 89569543.0, + "step": 1242 + }, + { + "epoch": 0.7758748488276831, + "grad_norm": 0.27730220556259155, + "learning_rate": 5.028e-07, + "loss": 0.3184, + "mean_token_accuracy": 0.9232516996562481, + "num_tokens": 89635675.0, + "step": 1243 + }, + { + "epoch": 0.7764990442008348, + "grad_norm": 0.2507041096687317, + "learning_rate": 5.023999999999999e-07, + "loss": 0.2976, + "mean_token_accuracy": 0.9293660223484039, + "num_tokens": 89705614.0, + "step": 1244 + }, + { + "epoch": 0.7771232395739867, + "grad_norm": 0.40535369515419006, + "learning_rate": 5.02e-07, + "loss": 0.3158, + "mean_token_accuracy": 0.9269846193492413, + "num_tokens": 89779776.0, + "step": 1245 + }, + { + "epoch": 0.7777474349471385, + "grad_norm": 0.33309099078178406, + "learning_rate": 5.016e-07, + "loss": 0.3473, + "mean_token_accuracy": 0.9171466380357742, + "num_tokens": 89845209.0, + "step": 1246 + }, + { + "epoch": 0.7783716303202902, + "grad_norm": 0.32663217186927795, + "learning_rate": 5.012e-07, + "loss": 0.2771, + "mean_token_accuracy": 0.9322833232581615, + "num_tokens": 89917102.0, + "step": 1247 + }, + { + "epoch": 0.7789958256934421, + "grad_norm": 0.25488653779029846, + "learning_rate": 5.008e-07, + "loss": 0.315, + "mean_token_accuracy": 0.9274520874023438, + "num_tokens": 89987492.0, + "step": 1248 + }, + { + "epoch": 0.7796200210665939, + "grad_norm": 0.26127007603645325, + "learning_rate": 5.003999999999999e-07, + "loss": 0.3, + "mean_token_accuracy": 0.9299812465906143, + "num_tokens": 90062908.0, + "step": 1249 + }, + { + "epoch": 0.7802442164397456, + "grad_norm": 0.29464784264564514, + "learning_rate": 5e-07, + "loss": 0.3145, + "mean_token_accuracy": 0.921767208725214, + "num_tokens": 90135224.0, + "step": 1250 + }, + { + "epoch": 0.7808684118128975, + "grad_norm": 0.23986802995204926, + "learning_rate": 4.996e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9265477173030376, + "num_tokens": 90210241.0, + "step": 1251 + }, + { + "epoch": 0.7814926071860492, + "grad_norm": 0.2759888470172882, + "learning_rate": 4.991999999999999e-07, + "loss": 0.3022, + "mean_token_accuracy": 0.9267982058227062, + "num_tokens": 90278532.0, + "step": 1252 + }, + { + "epoch": 0.782116802559201, + "grad_norm": 0.34974971413612366, + "learning_rate": 4.988e-07, + "loss": 0.2942, + "mean_token_accuracy": 0.9275704920291901, + "num_tokens": 90351306.0, + "step": 1253 + }, + { + "epoch": 0.7827409979323529, + "grad_norm": 0.18729032576084137, + "learning_rate": 4.984e-07, + "loss": 0.3061, + "mean_token_accuracy": 0.9274096041917801, + "num_tokens": 90421546.0, + "step": 1254 + }, + { + "epoch": 0.7833651933055046, + "grad_norm": 0.21039742231369019, + "learning_rate": 4.979999999999999e-07, + "loss": 0.2807, + "mean_token_accuracy": 0.9336643777787685, + "num_tokens": 90496212.0, + "step": 1255 + }, + { + "epoch": 0.7839893886786564, + "grad_norm": 0.33042389154434204, + "learning_rate": 4.976e-07, + "loss": 0.3455, + "mean_token_accuracy": 0.9200369268655777, + "num_tokens": 90566066.0, + "step": 1256 + }, + { + "epoch": 0.7846135840518083, + "grad_norm": 0.3471817970275879, + "learning_rate": 4.972e-07, + "loss": 0.3461, + "mean_token_accuracy": 0.9155357219278812, + "num_tokens": 90635111.0, + "step": 1257 + }, + { + "epoch": 0.78523777942496, + "grad_norm": 0.2426455169916153, + "learning_rate": 4.968e-07, + "loss": 0.3069, + "mean_token_accuracy": 0.9306614771485329, + "num_tokens": 90705560.0, + "step": 1258 + }, + { + "epoch": 0.7858619747981118, + "grad_norm": 0.7047716379165649, + "learning_rate": 4.964e-07, + "loss": 0.2685, + "mean_token_accuracy": 0.9356939382851124, + "num_tokens": 90778806.0, + "step": 1259 + }, + { + "epoch": 0.7864861701712637, + "grad_norm": 0.20655910670757294, + "learning_rate": 4.96e-07, + "loss": 0.3185, + "mean_token_accuracy": 0.9284607172012329, + "num_tokens": 90847505.0, + "step": 1260 + }, + { + "epoch": 0.7871103655444154, + "grad_norm": 0.20230181515216827, + "learning_rate": 4.956e-07, + "loss": 0.3194, + "mean_token_accuracy": 0.92804254591465, + "num_tokens": 90919860.0, + "step": 1261 + }, + { + "epoch": 0.7877345609175672, + "grad_norm": 0.23924511671066284, + "learning_rate": 4.951999999999999e-07, + "loss": 0.2946, + "mean_token_accuracy": 0.932462140917778, + "num_tokens": 90990149.0, + "step": 1262 + }, + { + "epoch": 0.7883587562907189, + "grad_norm": 0.28403759002685547, + "learning_rate": 4.948e-07, + "loss": 0.3086, + "mean_token_accuracy": 0.9299705550074577, + "num_tokens": 91064553.0, + "step": 1263 + }, + { + "epoch": 0.7889829516638708, + "grad_norm": 0.20765992999076843, + "learning_rate": 4.944e-07, + "loss": 0.3398, + "mean_token_accuracy": 0.919780395925045, + "num_tokens": 91132858.0, + "step": 1264 + }, + { + "epoch": 0.7896071470370226, + "grad_norm": 0.34249264001846313, + "learning_rate": 4.94e-07, + "loss": 0.3036, + "mean_token_accuracy": 0.9278276972472668, + "num_tokens": 91205356.0, + "step": 1265 + }, + { + "epoch": 0.7902313424101743, + "grad_norm": 10.69211483001709, + "learning_rate": 4.935999999999999e-07, + "loss": 0.3065, + "mean_token_accuracy": 0.9281813502311707, + "num_tokens": 91280057.0, + "step": 1266 + }, + { + "epoch": 0.7908555377833262, + "grad_norm": 0.477251261472702, + "learning_rate": 4.932e-07, + "loss": 0.3076, + "mean_token_accuracy": 0.9287058897316456, + "num_tokens": 91350340.0, + "step": 1267 + }, + { + "epoch": 0.791479733156478, + "grad_norm": 0.20825354754924774, + "learning_rate": 4.928e-07, + "loss": 0.3526, + "mean_token_accuracy": 0.9174499101936817, + "num_tokens": 91417542.0, + "step": 1268 + }, + { + "epoch": 0.7921039285296297, + "grad_norm": 0.5579161047935486, + "learning_rate": 4.923999999999999e-07, + "loss": 0.2943, + "mean_token_accuracy": 0.9296446330845356, + "num_tokens": 91486793.0, + "step": 1269 + }, + { + "epoch": 0.7927281239027816, + "grad_norm": 0.2773629426956177, + "learning_rate": 4.92e-07, + "loss": 0.3288, + "mean_token_accuracy": 0.9213387742638588, + "num_tokens": 91550108.0, + "step": 1270 + }, + { + "epoch": 0.7933523192759334, + "grad_norm": 0.24674123525619507, + "learning_rate": 4.916e-07, + "loss": 0.3171, + "mean_token_accuracy": 0.9214802496135235, + "num_tokens": 91620076.0, + "step": 1271 + }, + { + "epoch": 0.7939765146490851, + "grad_norm": 0.2021162360906601, + "learning_rate": 4.912e-07, + "loss": 0.2948, + "mean_token_accuracy": 0.9299475438892841, + "num_tokens": 91693579.0, + "step": 1272 + }, + { + "epoch": 0.794600710022237, + "grad_norm": 0.18599866330623627, + "learning_rate": 4.908e-07, + "loss": 0.3087, + "mean_token_accuracy": 0.9270325936377048, + "num_tokens": 91763206.0, + "step": 1273 + }, + { + "epoch": 0.7952249053953887, + "grad_norm": 0.20272447168827057, + "learning_rate": 4.904e-07, + "loss": 0.3128, + "mean_token_accuracy": 0.929167240858078, + "num_tokens": 91833058.0, + "step": 1274 + }, + { + "epoch": 0.7958491007685405, + "grad_norm": 0.5321136713027954, + "learning_rate": 4.9e-07, + "loss": 0.3495, + "mean_token_accuracy": 0.9181141182780266, + "num_tokens": 91903975.0, + "step": 1275 + }, + { + "epoch": 0.7964732961416924, + "grad_norm": 0.2597091794013977, + "learning_rate": 4.895999999999999e-07, + "loss": 0.3388, + "mean_token_accuracy": 0.9192993640899658, + "num_tokens": 91971472.0, + "step": 1276 + }, + { + "epoch": 0.7970974915148441, + "grad_norm": 0.2528326213359833, + "learning_rate": 4.892e-07, + "loss": 0.2829, + "mean_token_accuracy": 0.9339222349226475, + "num_tokens": 92047948.0, + "step": 1277 + }, + { + "epoch": 0.7977216868879959, + "grad_norm": 0.25510677695274353, + "learning_rate": 4.888e-07, + "loss": 0.2891, + "mean_token_accuracy": 0.926655575633049, + "num_tokens": 92120637.0, + "step": 1278 + }, + { + "epoch": 0.7983458822611478, + "grad_norm": 0.2698904871940613, + "learning_rate": 4.884e-07, + "loss": 0.3114, + "mean_token_accuracy": 0.9267336428165436, + "num_tokens": 92192752.0, + "step": 1279 + }, + { + "epoch": 0.7989700776342995, + "grad_norm": 0.27269768714904785, + "learning_rate": 4.879999999999999e-07, + "loss": 0.3171, + "mean_token_accuracy": 0.9252177216112614, + "num_tokens": 92263547.0, + "step": 1280 + }, + { + "epoch": 0.7995942730074513, + "grad_norm": 0.6485013961791992, + "learning_rate": 4.876e-07, + "loss": 0.2888, + "mean_token_accuracy": 0.9319899305701256, + "num_tokens": 92336479.0, + "step": 1281 + }, + { + "epoch": 0.8002184683806032, + "grad_norm": 0.35811445116996765, + "learning_rate": 4.872e-07, + "loss": 0.2867, + "mean_token_accuracy": 0.932494405657053, + "num_tokens": 92408118.0, + "step": 1282 + }, + { + "epoch": 0.8008426637537549, + "grad_norm": 0.23816686868667603, + "learning_rate": 4.867999999999999e-07, + "loss": 0.3433, + "mean_token_accuracy": 0.9195086918771267, + "num_tokens": 92474570.0, + "step": 1283 + }, + { + "epoch": 0.8014668591269067, + "grad_norm": 0.3411473333835602, + "learning_rate": 4.864e-07, + "loss": 0.301, + "mean_token_accuracy": 0.9325961694121361, + "num_tokens": 92546231.0, + "step": 1284 + }, + { + "epoch": 0.8020910545000585, + "grad_norm": 0.291053831577301, + "learning_rate": 4.86e-07, + "loss": 0.3089, + "mean_token_accuracy": 0.9278082400560379, + "num_tokens": 92615770.0, + "step": 1285 + }, + { + "epoch": 0.8027152498732103, + "grad_norm": 0.4437994360923767, + "learning_rate": 4.856e-07, + "loss": 0.3023, + "mean_token_accuracy": 0.9311306588351727, + "num_tokens": 92690445.0, + "step": 1286 + }, + { + "epoch": 0.8033394452463621, + "grad_norm": 0.2321535348892212, + "learning_rate": 4.852e-07, + "loss": 0.3274, + "mean_token_accuracy": 0.9223106428980827, + "num_tokens": 92762502.0, + "step": 1287 + }, + { + "epoch": 0.8039636406195139, + "grad_norm": 0.2755758762359619, + "learning_rate": 4.848e-07, + "loss": 0.3341, + "mean_token_accuracy": 0.919795598834753, + "num_tokens": 92829814.0, + "step": 1288 + }, + { + "epoch": 0.8045878359926657, + "grad_norm": 0.26743367314338684, + "learning_rate": 4.844e-07, + "loss": 0.3231, + "mean_token_accuracy": 0.9239004850387573, + "num_tokens": 92902090.0, + "step": 1289 + }, + { + "epoch": 0.8052120313658175, + "grad_norm": 0.23846283555030823, + "learning_rate": 4.839999999999999e-07, + "loss": 0.317, + "mean_token_accuracy": 0.9247562661767006, + "num_tokens": 92974895.0, + "step": 1290 + }, + { + "epoch": 0.8058362267389693, + "grad_norm": 0.33014076948165894, + "learning_rate": 4.835999999999999e-07, + "loss": 0.32, + "mean_token_accuracy": 0.9264410585165024, + "num_tokens": 93046137.0, + "step": 1291 + }, + { + "epoch": 0.8064604221121211, + "grad_norm": 0.3013243079185486, + "learning_rate": 4.832e-07, + "loss": 0.3124, + "mean_token_accuracy": 0.9225173331797123, + "num_tokens": 93121241.0, + "step": 1292 + }, + { + "epoch": 0.8070846174852729, + "grad_norm": 0.21322762966156006, + "learning_rate": 4.828e-07, + "loss": 0.3213, + "mean_token_accuracy": 0.9241376481950283, + "num_tokens": 93188920.0, + "step": 1293 + }, + { + "epoch": 0.8077088128584247, + "grad_norm": 0.22367824614048004, + "learning_rate": 4.823999999999999e-07, + "loss": 0.3102, + "mean_token_accuracy": 0.9244293458759785, + "num_tokens": 93262538.0, + "step": 1294 + }, + { + "epoch": 0.8083330082315765, + "grad_norm": 0.8112219572067261, + "learning_rate": 4.82e-07, + "loss": 0.2611, + "mean_token_accuracy": 0.9359804950654507, + "num_tokens": 93340991.0, + "step": 1295 + }, + { + "epoch": 0.8089572036047282, + "grad_norm": 0.2527698874473572, + "learning_rate": 4.816e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9236507900059223, + "num_tokens": 93413956.0, + "step": 1296 + }, + { + "epoch": 0.8095813989778801, + "grad_norm": 0.2761595845222473, + "learning_rate": 4.812e-07, + "loss": 0.277, + "mean_token_accuracy": 0.9318163432180882, + "num_tokens": 93488473.0, + "step": 1297 + }, + { + "epoch": 0.8102055943510319, + "grad_norm": 3.999704599380493, + "learning_rate": 4.808e-07, + "loss": 0.2928, + "mean_token_accuracy": 0.9319278188049793, + "num_tokens": 93561013.0, + "step": 1298 + }, + { + "epoch": 0.8108297897241836, + "grad_norm": 0.30497217178344727, + "learning_rate": 4.804e-07, + "loss": 0.3146, + "mean_token_accuracy": 0.9276038073003292, + "num_tokens": 93639820.0, + "step": 1299 + }, + { + "epoch": 0.8114539850973355, + "grad_norm": 0.28263401985168457, + "learning_rate": 4.8e-07, + "loss": 0.2804, + "mean_token_accuracy": 0.935274563729763, + "num_tokens": 93714643.0, + "step": 1300 + }, + { + "epoch": 0.8120781804704873, + "grad_norm": 0.3211728036403656, + "learning_rate": 4.796e-07, + "loss": 0.3262, + "mean_token_accuracy": 0.9193594940006733, + "num_tokens": 93783667.0, + "step": 1301 + }, + { + "epoch": 0.812702375843639, + "grad_norm": 0.2646855115890503, + "learning_rate": 4.792e-07, + "loss": 0.2932, + "mean_token_accuracy": 0.9281762428581715, + "num_tokens": 93852903.0, + "step": 1302 + }, + { + "epoch": 0.8133265712167909, + "grad_norm": 0.18507516384124756, + "learning_rate": 4.788e-07, + "loss": 0.3073, + "mean_token_accuracy": 0.9257412739098072, + "num_tokens": 93923637.0, + "step": 1303 + }, + { + "epoch": 0.8139507665899427, + "grad_norm": 0.25278258323669434, + "learning_rate": 4.783999999999999e-07, + "loss": 0.2905, + "mean_token_accuracy": 0.9301873818039894, + "num_tokens": 93992000.0, + "step": 1304 + }, + { + "epoch": 0.8145749619630944, + "grad_norm": 0.255441278219223, + "learning_rate": 4.779999999999999e-07, + "loss": 0.29, + "mean_token_accuracy": 0.928561769425869, + "num_tokens": 94066016.0, + "step": 1305 + }, + { + "epoch": 0.8151991573362463, + "grad_norm": 0.200369194149971, + "learning_rate": 4.776e-07, + "loss": 0.3158, + "mean_token_accuracy": 0.9237971864640713, + "num_tokens": 94139713.0, + "step": 1306 + }, + { + "epoch": 0.815823352709398, + "grad_norm": 0.2338852435350418, + "learning_rate": 4.772e-07, + "loss": 0.3251, + "mean_token_accuracy": 0.9274236895143986, + "num_tokens": 94211988.0, + "step": 1307 + }, + { + "epoch": 0.8164475480825498, + "grad_norm": 0.3038715422153473, + "learning_rate": 4.768e-07, + "loss": 0.3077, + "mean_token_accuracy": 0.9269857928156853, + "num_tokens": 94281630.0, + "step": 1308 + }, + { + "epoch": 0.8170717434557017, + "grad_norm": 0.22201649844646454, + "learning_rate": 4.7639999999999995e-07, + "loss": 0.3341, + "mean_token_accuracy": 0.9229528941214085, + "num_tokens": 94349527.0, + "step": 1309 + }, + { + "epoch": 0.8176959388288534, + "grad_norm": 0.21482908725738525, + "learning_rate": 4.76e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.9271098747849464, + "num_tokens": 94419931.0, + "step": 1310 + }, + { + "epoch": 0.8183201342020052, + "grad_norm": 0.29071325063705444, + "learning_rate": 4.756e-07, + "loss": 0.2832, + "mean_token_accuracy": 0.9303508922457695, + "num_tokens": 94495756.0, + "step": 1311 + }, + { + "epoch": 0.8189443295751571, + "grad_norm": 0.27348169684410095, + "learning_rate": 4.7519999999999997e-07, + "loss": 0.2952, + "mean_token_accuracy": 0.9314991869032383, + "num_tokens": 94570837.0, + "step": 1312 + }, + { + "epoch": 0.8195685249483088, + "grad_norm": 0.21364399790763855, + "learning_rate": 4.748e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.9218700304627419, + "num_tokens": 94639221.0, + "step": 1313 + }, + { + "epoch": 0.8201927203214606, + "grad_norm": 0.41188061237335205, + "learning_rate": 4.7439999999999996e-07, + "loss": 0.2823, + "mean_token_accuracy": 0.9337622039020061, + "num_tokens": 94713477.0, + "step": 1314 + }, + { + "epoch": 0.8208169156946125, + "grad_norm": 0.22730779647827148, + "learning_rate": 4.7399999999999993e-07, + "loss": 0.2965, + "mean_token_accuracy": 0.9278580583631992, + "num_tokens": 94788128.0, + "step": 1315 + }, + { + "epoch": 0.8214411110677642, + "grad_norm": 0.24205677211284637, + "learning_rate": 4.736e-07, + "loss": 0.3131, + "mean_token_accuracy": 0.9245219118893147, + "num_tokens": 94859328.0, + "step": 1316 + }, + { + "epoch": 0.822065306440916, + "grad_norm": 0.2844335436820984, + "learning_rate": 4.732e-07, + "loss": 0.2891, + "mean_token_accuracy": 0.9307031407952309, + "num_tokens": 94932896.0, + "step": 1317 + }, + { + "epoch": 0.8226895018140679, + "grad_norm": 0.2957679033279419, + "learning_rate": 4.728e-07, + "loss": 0.333, + "mean_token_accuracy": 0.9239684976637363, + "num_tokens": 95002115.0, + "step": 1318 + }, + { + "epoch": 0.8233136971872196, + "grad_norm": 0.26508551836013794, + "learning_rate": 4.7239999999999997e-07, + "loss": 0.3236, + "mean_token_accuracy": 0.9238818138837814, + "num_tokens": 95071027.0, + "step": 1319 + }, + { + "epoch": 0.8239378925603714, + "grad_norm": 0.29709479212760925, + "learning_rate": 4.7199999999999994e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9285967573523521, + "num_tokens": 95144110.0, + "step": 1320 + }, + { + "epoch": 0.8245620879335231, + "grad_norm": 0.26764973998069763, + "learning_rate": 4.716e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.9208036176860332, + "num_tokens": 95213379.0, + "step": 1321 + }, + { + "epoch": 0.825186283306675, + "grad_norm": 0.21977125108242035, + "learning_rate": 4.712e-07, + "loss": 0.3207, + "mean_token_accuracy": 0.9255136772990227, + "num_tokens": 95286062.0, + "step": 1322 + }, + { + "epoch": 0.8258104786798268, + "grad_norm": 0.2531127333641052, + "learning_rate": 4.7079999999999995e-07, + "loss": 0.3229, + "mean_token_accuracy": 0.9271750636398792, + "num_tokens": 95358613.0, + "step": 1323 + }, + { + "epoch": 0.8264346740529785, + "grad_norm": 0.38851967453956604, + "learning_rate": 4.704e-07, + "loss": 0.2917, + "mean_token_accuracy": 0.931358240544796, + "num_tokens": 95433776.0, + "step": 1324 + }, + { + "epoch": 0.8270588694261304, + "grad_norm": 0.22600294649600983, + "learning_rate": 4.6999999999999995e-07, + "loss": 0.3129, + "mean_token_accuracy": 0.9251480400562286, + "num_tokens": 95505093.0, + "step": 1325 + }, + { + "epoch": 0.8276830647992822, + "grad_norm": 0.25390228629112244, + "learning_rate": 4.6959999999999997e-07, + "loss": 0.2856, + "mean_token_accuracy": 0.92989457026124, + "num_tokens": 95579056.0, + "step": 1326 + }, + { + "epoch": 0.828307260172434, + "grad_norm": 0.19728393852710724, + "learning_rate": 4.692e-07, + "loss": 0.2873, + "mean_token_accuracy": 0.9317186623811722, + "num_tokens": 95653372.0, + "step": 1327 + }, + { + "epoch": 0.8289314555455858, + "grad_norm": 0.2803840935230255, + "learning_rate": 4.6879999999999996e-07, + "loss": 0.3435, + "mean_token_accuracy": 0.9185946397483349, + "num_tokens": 95722546.0, + "step": 1328 + }, + { + "epoch": 0.8295556509187376, + "grad_norm": 0.24696868658065796, + "learning_rate": 4.684e-07, + "loss": 0.3207, + "mean_token_accuracy": 0.9244390986859798, + "num_tokens": 95797004.0, + "step": 1329 + }, + { + "epoch": 0.8301798462918893, + "grad_norm": 0.27567198872566223, + "learning_rate": 4.68e-07, + "loss": 0.3274, + "mean_token_accuracy": 0.9217639081180096, + "num_tokens": 95864448.0, + "step": 1330 + }, + { + "epoch": 0.8308040416650412, + "grad_norm": 0.24500620365142822, + "learning_rate": 4.676e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.9213833995163441, + "num_tokens": 95936013.0, + "step": 1331 + }, + { + "epoch": 0.8314282370381929, + "grad_norm": 0.21822120249271393, + "learning_rate": 4.672e-07, + "loss": 0.3056, + "mean_token_accuracy": 0.9267791993916035, + "num_tokens": 96003492.0, + "step": 1332 + }, + { + "epoch": 0.8320524324113447, + "grad_norm": 0.19485080242156982, + "learning_rate": 4.6679999999999997e-07, + "loss": 0.2679, + "mean_token_accuracy": 0.9334764927625656, + "num_tokens": 96079905.0, + "step": 1333 + }, + { + "epoch": 0.8326766277844966, + "grad_norm": 0.21034878492355347, + "learning_rate": 4.6639999999999994e-07, + "loss": 0.2794, + "mean_token_accuracy": 0.93420784547925, + "num_tokens": 96151373.0, + "step": 1334 + }, + { + "epoch": 0.8333008231576483, + "grad_norm": 0.2630111575126648, + "learning_rate": 4.66e-07, + "loss": 0.332, + "mean_token_accuracy": 0.9200627654790878, + "num_tokens": 96218141.0, + "step": 1335 + }, + { + "epoch": 0.8339250185308001, + "grad_norm": 1.8334243297576904, + "learning_rate": 4.656e-07, + "loss": 0.2804, + "mean_token_accuracy": 0.935204952955246, + "num_tokens": 96295610.0, + "step": 1336 + }, + { + "epoch": 0.834549213903952, + "grad_norm": 0.23512110114097595, + "learning_rate": 4.6519999999999996e-07, + "loss": 0.2859, + "mean_token_accuracy": 0.9350117519497871, + "num_tokens": 96373622.0, + "step": 1337 + }, + { + "epoch": 0.8351734092771037, + "grad_norm": 0.26322272419929504, + "learning_rate": 4.648e-07, + "loss": 0.3134, + "mean_token_accuracy": 0.9295671358704567, + "num_tokens": 96446928.0, + "step": 1338 + }, + { + "epoch": 0.8357976046502555, + "grad_norm": 0.3047458529472351, + "learning_rate": 4.6439999999999995e-07, + "loss": 0.3198, + "mean_token_accuracy": 0.9296431466937065, + "num_tokens": 96516000.0, + "step": 1339 + }, + { + "epoch": 0.8364218000234074, + "grad_norm": 0.21884992718696594, + "learning_rate": 4.64e-07, + "loss": 0.301, + "mean_token_accuracy": 0.9305364191532135, + "num_tokens": 96587520.0, + "step": 1340 + }, + { + "epoch": 0.8370459953965591, + "grad_norm": 0.30071261525154114, + "learning_rate": 4.636e-07, + "loss": 0.3421, + "mean_token_accuracy": 0.9214305095374584, + "num_tokens": 96657840.0, + "step": 1341 + }, + { + "epoch": 0.8376701907697109, + "grad_norm": 0.340962290763855, + "learning_rate": 4.6319999999999997e-07, + "loss": 0.2716, + "mean_token_accuracy": 0.9349562972784042, + "num_tokens": 96732879.0, + "step": 1342 + }, + { + "epoch": 0.8382943861428627, + "grad_norm": 1.995600700378418, + "learning_rate": 4.628e-07, + "loss": 0.3148, + "mean_token_accuracy": 0.9268901497125626, + "num_tokens": 96805142.0, + "step": 1343 + }, + { + "epoch": 0.8389185815160145, + "grad_norm": 0.4358956217765808, + "learning_rate": 4.6239999999999996e-07, + "loss": 0.3311, + "mean_token_accuracy": 0.922329980880022, + "num_tokens": 96875819.0, + "step": 1344 + }, + { + "epoch": 0.8395427768891663, + "grad_norm": 0.2580212950706482, + "learning_rate": 4.62e-07, + "loss": 0.3307, + "mean_token_accuracy": 0.919509019702673, + "num_tokens": 96945399.0, + "step": 1345 + }, + { + "epoch": 0.8401669722623181, + "grad_norm": 0.32801616191864014, + "learning_rate": 4.616e-07, + "loss": 0.3405, + "mean_token_accuracy": 0.9138049185276031, + "num_tokens": 97012117.0, + "step": 1346 + }, + { + "epoch": 0.8407911676354699, + "grad_norm": 0.21839004755020142, + "learning_rate": 4.612e-07, + "loss": 0.2981, + "mean_token_accuracy": 0.9301725700497627, + "num_tokens": 97084070.0, + "step": 1347 + }, + { + "epoch": 0.8414153630086217, + "grad_norm": 0.21233205497264862, + "learning_rate": 4.6079999999999994e-07, + "loss": 0.3126, + "mean_token_accuracy": 0.9267206750810146, + "num_tokens": 97154284.0, + "step": 1348 + }, + { + "epoch": 0.8420395583817735, + "grad_norm": 0.22935333847999573, + "learning_rate": 4.6039999999999997e-07, + "loss": 0.2785, + "mean_token_accuracy": 0.9326333068311214, + "num_tokens": 97230248.0, + "step": 1349 + }, + { + "epoch": 0.8426637537549253, + "grad_norm": 0.231289342045784, + "learning_rate": 4.6e-07, + "loss": 0.3201, + "mean_token_accuracy": 0.9250080622732639, + "num_tokens": 97303340.0, + "step": 1350 + }, + { + "epoch": 0.8432879491280771, + "grad_norm": 0.2201935201883316, + "learning_rate": 4.596e-07, + "loss": 0.3349, + "mean_token_accuracy": 0.9149127639830112, + "num_tokens": 97367919.0, + "step": 1351 + }, + { + "epoch": 0.8439121445012289, + "grad_norm": 0.6288694739341736, + "learning_rate": 4.592e-07, + "loss": 0.288, + "mean_token_accuracy": 0.9301778487861156, + "num_tokens": 97442195.0, + "step": 1352 + }, + { + "epoch": 0.8445363398743807, + "grad_norm": 0.2003668248653412, + "learning_rate": 4.5879999999999995e-07, + "loss": 0.3429, + "mean_token_accuracy": 0.9173059873282909, + "num_tokens": 97512100.0, + "step": 1353 + }, + { + "epoch": 0.8451605352475324, + "grad_norm": 0.29848381876945496, + "learning_rate": 4.584e-07, + "loss": 0.3217, + "mean_token_accuracy": 0.9245139323174953, + "num_tokens": 97581778.0, + "step": 1354 + }, + { + "epoch": 0.8457847306206843, + "grad_norm": 0.24591217935085297, + "learning_rate": 4.58e-07, + "loss": 0.308, + "mean_token_accuracy": 0.9272121377289295, + "num_tokens": 97653464.0, + "step": 1355 + }, + { + "epoch": 0.8464089259938361, + "grad_norm": 0.3415531516075134, + "learning_rate": 4.5759999999999997e-07, + "loss": 0.2886, + "mean_token_accuracy": 0.9268341548740864, + "num_tokens": 97726958.0, + "step": 1356 + }, + { + "epoch": 0.8470331213669878, + "grad_norm": 0.6231511235237122, + "learning_rate": 4.572e-07, + "loss": 0.3178, + "mean_token_accuracy": 0.9277990385890007, + "num_tokens": 97800498.0, + "step": 1357 + }, + { + "epoch": 0.8476573167401397, + "grad_norm": 11.715060234069824, + "learning_rate": 4.5679999999999996e-07, + "loss": 0.3374, + "mean_token_accuracy": 0.9202205948531628, + "num_tokens": 97866560.0, + "step": 1358 + }, + { + "epoch": 0.8482815121132915, + "grad_norm": 0.2158426195383072, + "learning_rate": 4.5639999999999993e-07, + "loss": 0.3312, + "mean_token_accuracy": 0.9219558350741863, + "num_tokens": 97935921.0, + "step": 1359 + }, + { + "epoch": 0.8489057074864432, + "grad_norm": 0.2638317048549652, + "learning_rate": 4.56e-07, + "loss": 0.2991, + "mean_token_accuracy": 0.9287921078503132, + "num_tokens": 98009650.0, + "step": 1360 + }, + { + "epoch": 0.8495299028595951, + "grad_norm": 0.31106385588645935, + "learning_rate": 4.556e-07, + "loss": 0.3266, + "mean_token_accuracy": 0.9210994280874729, + "num_tokens": 98081643.0, + "step": 1361 + }, + { + "epoch": 0.8501540982327469, + "grad_norm": 0.268308162689209, + "learning_rate": 4.5519999999999995e-07, + "loss": 0.3344, + "mean_token_accuracy": 0.9242012612521648, + "num_tokens": 98153341.0, + "step": 1362 + }, + { + "epoch": 0.8507782936058986, + "grad_norm": 0.211997389793396, + "learning_rate": 4.5479999999999997e-07, + "loss": 0.3254, + "mean_token_accuracy": 0.9263162985444069, + "num_tokens": 98222274.0, + "step": 1363 + }, + { + "epoch": 0.8514024889790505, + "grad_norm": 0.2908545732498169, + "learning_rate": 4.544e-07, + "loss": 0.3157, + "mean_token_accuracy": 0.9264709055423737, + "num_tokens": 98294356.0, + "step": 1364 + }, + { + "epoch": 0.8520266843522022, + "grad_norm": 0.24270673096179962, + "learning_rate": 4.54e-07, + "loss": 0.3037, + "mean_token_accuracy": 0.9292899370193481, + "num_tokens": 98366972.0, + "step": 1365 + }, + { + "epoch": 0.852650879725354, + "grad_norm": 0.20563329756259918, + "learning_rate": 4.536e-07, + "loss": 0.3074, + "mean_token_accuracy": 0.9256131201982498, + "num_tokens": 98437505.0, + "step": 1366 + }, + { + "epoch": 0.8532750750985059, + "grad_norm": 0.21996480226516724, + "learning_rate": 4.5319999999999996e-07, + "loss": 0.2969, + "mean_token_accuracy": 0.92835683375597, + "num_tokens": 98505700.0, + "step": 1367 + }, + { + "epoch": 0.8538992704716576, + "grad_norm": 0.23851625621318817, + "learning_rate": 4.528e-07, + "loss": 0.2939, + "mean_token_accuracy": 0.9284695833921432, + "num_tokens": 98575294.0, + "step": 1368 + }, + { + "epoch": 0.8545234658448094, + "grad_norm": 0.21504953503608704, + "learning_rate": 4.524e-07, + "loss": 0.3144, + "mean_token_accuracy": 0.9241252578794956, + "num_tokens": 98646427.0, + "step": 1369 + }, + { + "epoch": 0.8551476612179613, + "grad_norm": 0.31458064913749695, + "learning_rate": 4.5199999999999997e-07, + "loss": 0.2992, + "mean_token_accuracy": 0.9277759417891502, + "num_tokens": 98714527.0, + "step": 1370 + }, + { + "epoch": 0.855771856591113, + "grad_norm": 0.26895374059677124, + "learning_rate": 4.516e-07, + "loss": 0.3295, + "mean_token_accuracy": 0.9242042638361454, + "num_tokens": 98787894.0, + "step": 1371 + }, + { + "epoch": 0.8563960519642648, + "grad_norm": 0.2581736147403717, + "learning_rate": 4.5119999999999996e-07, + "loss": 0.3006, + "mean_token_accuracy": 0.9252099432051182, + "num_tokens": 98859480.0, + "step": 1372 + }, + { + "epoch": 0.8570202473374167, + "grad_norm": 0.32175466418266296, + "learning_rate": 4.5079999999999993e-07, + "loss": 0.3483, + "mean_token_accuracy": 0.9202001355588436, + "num_tokens": 98928888.0, + "step": 1373 + }, + { + "epoch": 0.8576444427105684, + "grad_norm": 0.19613511860370636, + "learning_rate": 4.504e-07, + "loss": 0.2987, + "mean_token_accuracy": 0.9282330349087715, + "num_tokens": 99005440.0, + "step": 1374 + }, + { + "epoch": 0.8582686380837202, + "grad_norm": 0.25721731781959534, + "learning_rate": 4.5e-07, + "loss": 0.278, + "mean_token_accuracy": 0.9355393275618553, + "num_tokens": 99084471.0, + "step": 1375 + }, + { + "epoch": 0.858892833456872, + "grad_norm": 0.2900245189666748, + "learning_rate": 4.496e-07, + "loss": 0.3226, + "mean_token_accuracy": 0.9237043261528015, + "num_tokens": 99153906.0, + "step": 1376 + }, + { + "epoch": 0.8595170288300238, + "grad_norm": 0.22954224050045013, + "learning_rate": 4.4919999999999997e-07, + "loss": 0.3042, + "mean_token_accuracy": 0.9272525571286678, + "num_tokens": 99225010.0, + "step": 1377 + }, + { + "epoch": 0.8601412242031756, + "grad_norm": 0.37201324105262756, + "learning_rate": 4.4879999999999994e-07, + "loss": 0.3037, + "mean_token_accuracy": 0.9287581406533718, + "num_tokens": 99293817.0, + "step": 1378 + }, + { + "epoch": 0.8607654195763274, + "grad_norm": 0.2971428334712982, + "learning_rate": 4.484e-07, + "loss": 0.3179, + "mean_token_accuracy": 0.9237765520811081, + "num_tokens": 99370017.0, + "step": 1379 + }, + { + "epoch": 0.8613896149494792, + "grad_norm": 0.21377050876617432, + "learning_rate": 4.48e-07, + "loss": 0.3438, + "mean_token_accuracy": 0.9182898364961147, + "num_tokens": 99440160.0, + "step": 1380 + }, + { + "epoch": 0.862013810322631, + "grad_norm": 0.23021329939365387, + "learning_rate": 4.4759999999999996e-07, + "loss": 0.299, + "mean_token_accuracy": 0.9298000708222389, + "num_tokens": 99515734.0, + "step": 1381 + }, + { + "epoch": 0.8626380056957828, + "grad_norm": 0.24587848782539368, + "learning_rate": 4.472e-07, + "loss": 0.3065, + "mean_token_accuracy": 0.928221371024847, + "num_tokens": 99587853.0, + "step": 1382 + }, + { + "epoch": 0.8632622010689346, + "grad_norm": 0.23137114942073822, + "learning_rate": 4.4679999999999995e-07, + "loss": 0.2739, + "mean_token_accuracy": 0.934906892478466, + "num_tokens": 99662274.0, + "step": 1383 + }, + { + "epoch": 0.8638863964420864, + "grad_norm": 2.6374948024749756, + "learning_rate": 4.464e-07, + "loss": 0.3815, + "mean_token_accuracy": 0.9097710661590099, + "num_tokens": 99727041.0, + "step": 1384 + }, + { + "epoch": 0.8645105918152381, + "grad_norm": 0.2806600332260132, + "learning_rate": 4.46e-07, + "loss": 0.3436, + "mean_token_accuracy": 0.9207728356122971, + "num_tokens": 99790537.0, + "step": 1385 + }, + { + "epoch": 0.86513478718839, + "grad_norm": 0.287019819021225, + "learning_rate": 4.4559999999999997e-07, + "loss": 0.3519, + "mean_token_accuracy": 0.9174740798771381, + "num_tokens": 99863910.0, + "step": 1386 + }, + { + "epoch": 0.8657589825615417, + "grad_norm": 0.40255093574523926, + "learning_rate": 4.452e-07, + "loss": 0.3113, + "mean_token_accuracy": 0.9263402558863163, + "num_tokens": 99936083.0, + "step": 1387 + }, + { + "epoch": 0.8663831779346935, + "grad_norm": 0.4824659526348114, + "learning_rate": 4.4479999999999996e-07, + "loss": 0.3005, + "mean_token_accuracy": 0.9276278205215931, + "num_tokens": 100008984.0, + "step": 1388 + }, + { + "epoch": 0.8670073733078454, + "grad_norm": 0.18550696969032288, + "learning_rate": 4.444e-07, + "loss": 0.282, + "mean_token_accuracy": 0.9362342655658722, + "num_tokens": 100088029.0, + "step": 1389 + }, + { + "epoch": 0.8676315686809971, + "grad_norm": 0.5796900391578674, + "learning_rate": 4.44e-07, + "loss": 0.3148, + "mean_token_accuracy": 0.9273844659328461, + "num_tokens": 100159145.0, + "step": 1390 + }, + { + "epoch": 0.868255764054149, + "grad_norm": 0.7479981184005737, + "learning_rate": 4.436e-07, + "loss": 0.2838, + "mean_token_accuracy": 0.9326974488794804, + "num_tokens": 100232806.0, + "step": 1391 + }, + { + "epoch": 0.8688799594273008, + "grad_norm": 0.23599204421043396, + "learning_rate": 4.4319999999999995e-07, + "loss": 0.3222, + "mean_token_accuracy": 0.9228789396584034, + "num_tokens": 100303694.0, + "step": 1392 + }, + { + "epoch": 0.8695041548004525, + "grad_norm": 0.22743694484233856, + "learning_rate": 4.428e-07, + "loss": 0.2875, + "mean_token_accuracy": 0.9321911185979843, + "num_tokens": 100376146.0, + "step": 1393 + }, + { + "epoch": 0.8701283501736043, + "grad_norm": 0.28809449076652527, + "learning_rate": 4.424e-07, + "loss": 0.3159, + "mean_token_accuracy": 0.9242226481437683, + "num_tokens": 100446161.0, + "step": 1394 + }, + { + "epoch": 0.8707525455467562, + "grad_norm": 0.313827246427536, + "learning_rate": 4.4199999999999996e-07, + "loss": 0.2992, + "mean_token_accuracy": 0.9315675050020218, + "num_tokens": 100515496.0, + "step": 1395 + }, + { + "epoch": 0.8713767409199079, + "grad_norm": 0.2260451465845108, + "learning_rate": 4.416e-07, + "loss": 0.3015, + "mean_token_accuracy": 0.9290505237877369, + "num_tokens": 100591276.0, + "step": 1396 + }, + { + "epoch": 0.8720009362930597, + "grad_norm": 0.21511425077915192, + "learning_rate": 4.4119999999999995e-07, + "loss": 0.2966, + "mean_token_accuracy": 0.9306513480842113, + "num_tokens": 100663382.0, + "step": 1397 + }, + { + "epoch": 0.8726251316662115, + "grad_norm": 0.2949669659137726, + "learning_rate": 4.4080000000000003e-07, + "loss": 0.3359, + "mean_token_accuracy": 0.9198646135628223, + "num_tokens": 100733059.0, + "step": 1398 + }, + { + "epoch": 0.8732493270393633, + "grad_norm": 0.2607325315475464, + "learning_rate": 4.404e-07, + "loss": 0.2886, + "mean_token_accuracy": 0.9322608485817909, + "num_tokens": 100806703.0, + "step": 1399 + }, + { + "epoch": 0.8738735224125151, + "grad_norm": 0.22420014441013336, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.3393, + "mean_token_accuracy": 0.9173343293368816, + "num_tokens": 100876502.0, + "step": 1400 + }, + { + "epoch": 0.8744977177856669, + "grad_norm": 0.16812561452388763, + "learning_rate": 4.396e-07, + "loss": 0.3368, + "mean_token_accuracy": 0.9184993170201778, + "num_tokens": 100950001.0, + "step": 1401 + }, + { + "epoch": 0.8751219131588187, + "grad_norm": 0.6224853992462158, + "learning_rate": 4.3919999999999996e-07, + "loss": 0.3289, + "mean_token_accuracy": 0.9201460294425488, + "num_tokens": 101021512.0, + "step": 1402 + }, + { + "epoch": 0.8757461085319705, + "grad_norm": 0.2706308662891388, + "learning_rate": 4.388e-07, + "loss": 0.2898, + "mean_token_accuracy": 0.9291175492107868, + "num_tokens": 101096199.0, + "step": 1403 + }, + { + "epoch": 0.8763703039051223, + "grad_norm": 0.2599756419658661, + "learning_rate": 4.384e-07, + "loss": 0.3142, + "mean_token_accuracy": 0.9272696562111378, + "num_tokens": 101169652.0, + "step": 1404 + }, + { + "epoch": 0.8769944992782741, + "grad_norm": 0.30266013741493225, + "learning_rate": 4.38e-07, + "loss": 0.3077, + "mean_token_accuracy": 0.9269637912511826, + "num_tokens": 101241177.0, + "step": 1405 + }, + { + "epoch": 0.877618694651426, + "grad_norm": 0.2618788480758667, + "learning_rate": 4.3759999999999995e-07, + "loss": 0.2855, + "mean_token_accuracy": 0.9335198998451233, + "num_tokens": 101319861.0, + "step": 1406 + }, + { + "epoch": 0.8782428900245777, + "grad_norm": 0.2289922684431076, + "learning_rate": 4.3719999999999997e-07, + "loss": 0.2932, + "mean_token_accuracy": 0.9308609887957573, + "num_tokens": 101396430.0, + "step": 1407 + }, + { + "epoch": 0.8788670853977295, + "grad_norm": 0.22140169143676758, + "learning_rate": 4.368e-07, + "loss": 0.3269, + "mean_token_accuracy": 0.9229143671691418, + "num_tokens": 101462051.0, + "step": 1408 + }, + { + "epoch": 0.8794912807708812, + "grad_norm": 0.22949433326721191, + "learning_rate": 4.364e-07, + "loss": 0.2947, + "mean_token_accuracy": 0.9306843653321266, + "num_tokens": 101534992.0, + "step": 1409 + }, + { + "epoch": 0.8801154761440331, + "grad_norm": 0.2620709240436554, + "learning_rate": 4.36e-07, + "loss": 0.3094, + "mean_token_accuracy": 0.9236539453268051, + "num_tokens": 101605922.0, + "step": 1410 + }, + { + "epoch": 0.8807396715171849, + "grad_norm": 0.29271620512008667, + "learning_rate": 4.3559999999999996e-07, + "loss": 0.3023, + "mean_token_accuracy": 0.9288443773984909, + "num_tokens": 101681737.0, + "step": 1411 + }, + { + "epoch": 0.8813638668903366, + "grad_norm": 0.27722692489624023, + "learning_rate": 4.352e-07, + "loss": 0.3099, + "mean_token_accuracy": 0.9280101731419563, + "num_tokens": 101756188.0, + "step": 1412 + }, + { + "epoch": 0.8819880622634885, + "grad_norm": 0.3389802873134613, + "learning_rate": 4.348e-07, + "loss": 0.3064, + "mean_token_accuracy": 0.9282784648239613, + "num_tokens": 101829093.0, + "step": 1413 + }, + { + "epoch": 0.8826122576366403, + "grad_norm": 0.2262304425239563, + "learning_rate": 4.3439999999999997e-07, + "loss": 0.3333, + "mean_token_accuracy": 0.9141416884958744, + "num_tokens": 101897604.0, + "step": 1414 + }, + { + "epoch": 0.883236453009792, + "grad_norm": 0.19665999710559845, + "learning_rate": 4.34e-07, + "loss": 0.3028, + "mean_token_accuracy": 0.9291608929634094, + "num_tokens": 101972774.0, + "step": 1415 + }, + { + "epoch": 0.8838606483829439, + "grad_norm": 0.18623080849647522, + "learning_rate": 4.3359999999999997e-07, + "loss": 0.327, + "mean_token_accuracy": 0.9257093667984009, + "num_tokens": 102045952.0, + "step": 1416 + }, + { + "epoch": 0.8844848437560957, + "grad_norm": 0.28464755415916443, + "learning_rate": 4.3319999999999994e-07, + "loss": 0.2897, + "mean_token_accuracy": 0.9312638342380524, + "num_tokens": 102115663.0, + "step": 1417 + }, + { + "epoch": 0.8851090391292474, + "grad_norm": 0.5428276062011719, + "learning_rate": 4.328e-07, + "loss": 0.3311, + "mean_token_accuracy": 0.9192406758666039, + "num_tokens": 102188269.0, + "step": 1418 + }, + { + "epoch": 0.8857332345023993, + "grad_norm": 0.22083917260169983, + "learning_rate": 4.324e-07, + "loss": 0.3155, + "mean_token_accuracy": 0.9251565150916576, + "num_tokens": 102259404.0, + "step": 1419 + }, + { + "epoch": 0.886357429875551, + "grad_norm": 0.4886985123157501, + "learning_rate": 4.3199999999999995e-07, + "loss": 0.3606, + "mean_token_accuracy": 0.9148160144686699, + "num_tokens": 102330100.0, + "step": 1420 + }, + { + "epoch": 0.8869816252487028, + "grad_norm": 0.249943807721138, + "learning_rate": 4.316e-07, + "loss": 0.309, + "mean_token_accuracy": 0.9280661717057228, + "num_tokens": 102396640.0, + "step": 1421 + }, + { + "epoch": 0.8876058206218547, + "grad_norm": 0.1967613697052002, + "learning_rate": 4.312e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9206053614616394, + "num_tokens": 102465673.0, + "step": 1422 + }, + { + "epoch": 0.8882300159950064, + "grad_norm": 0.22276544570922852, + "learning_rate": 4.308e-07, + "loss": 0.3603, + "mean_token_accuracy": 0.9112642854452133, + "num_tokens": 102534756.0, + "step": 1423 + }, + { + "epoch": 0.8888542113681582, + "grad_norm": 0.47951334714889526, + "learning_rate": 4.304e-07, + "loss": 0.3234, + "mean_token_accuracy": 0.9244290366768837, + "num_tokens": 102607871.0, + "step": 1424 + }, + { + "epoch": 0.8894784067413101, + "grad_norm": 0.19350938498973846, + "learning_rate": 4.2999999999999996e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.9293696507811546, + "num_tokens": 102681324.0, + "step": 1425 + }, + { + "epoch": 0.8901026021144618, + "grad_norm": 0.23078599572181702, + "learning_rate": 4.296e-07, + "loss": 0.3196, + "mean_token_accuracy": 0.9231344126164913, + "num_tokens": 102755479.0, + "step": 1426 + }, + { + "epoch": 0.8907267974876136, + "grad_norm": 0.20979702472686768, + "learning_rate": 4.292e-07, + "loss": 0.3019, + "mean_token_accuracy": 0.9284933470189571, + "num_tokens": 102832260.0, + "step": 1427 + }, + { + "epoch": 0.8913509928607655, + "grad_norm": 0.3546234667301178, + "learning_rate": 4.288e-07, + "loss": 0.3324, + "mean_token_accuracy": 0.920235637575388, + "num_tokens": 102904427.0, + "step": 1428 + }, + { + "epoch": 0.8919751882339172, + "grad_norm": 0.2953638434410095, + "learning_rate": 4.284e-07, + "loss": 0.3131, + "mean_token_accuracy": 0.9283937886357307, + "num_tokens": 102977692.0, + "step": 1429 + }, + { + "epoch": 0.892599383607069, + "grad_norm": 0.37405651807785034, + "learning_rate": 4.2799999999999997e-07, + "loss": 0.2455, + "mean_token_accuracy": 0.9411461688578129, + "num_tokens": 103055453.0, + "step": 1430 + }, + { + "epoch": 0.8932235789802208, + "grad_norm": 0.2698046863079071, + "learning_rate": 4.2759999999999994e-07, + "loss": 0.2668, + "mean_token_accuracy": 0.935604028403759, + "num_tokens": 103130556.0, + "step": 1431 + }, + { + "epoch": 0.8938477743533726, + "grad_norm": 0.3210386037826538, + "learning_rate": 4.272e-07, + "loss": 0.2907, + "mean_token_accuracy": 0.9334711506962776, + "num_tokens": 103199838.0, + "step": 1432 + }, + { + "epoch": 0.8944719697265244, + "grad_norm": 0.4337041676044464, + "learning_rate": 4.268e-07, + "loss": 0.2956, + "mean_token_accuracy": 0.9307815879583359, + "num_tokens": 103274511.0, + "step": 1433 + }, + { + "epoch": 0.8950961650996762, + "grad_norm": 0.2664394676685333, + "learning_rate": 4.264e-07, + "loss": 0.3405, + "mean_token_accuracy": 0.9198729284107685, + "num_tokens": 103344527.0, + "step": 1434 + }, + { + "epoch": 0.895720360472828, + "grad_norm": 0.19608251750469208, + "learning_rate": 4.26e-07, + "loss": 0.2798, + "mean_token_accuracy": 0.9340275377035141, + "num_tokens": 103415266.0, + "step": 1435 + }, + { + "epoch": 0.8963445558459798, + "grad_norm": 0.32913944125175476, + "learning_rate": 4.2559999999999995e-07, + "loss": 0.2998, + "mean_token_accuracy": 0.9290631003677845, + "num_tokens": 103483417.0, + "step": 1436 + }, + { + "epoch": 0.8969687512191316, + "grad_norm": 0.2211001068353653, + "learning_rate": 4.252e-07, + "loss": 0.2883, + "mean_token_accuracy": 0.9313066974282265, + "num_tokens": 103560007.0, + "step": 1437 + }, + { + "epoch": 0.8975929465922834, + "grad_norm": 0.7351924180984497, + "learning_rate": 4.248e-07, + "loss": 0.3118, + "mean_token_accuracy": 0.9258391559123993, + "num_tokens": 103632692.0, + "step": 1438 + }, + { + "epoch": 0.8982171419654352, + "grad_norm": 0.32769775390625, + "learning_rate": 4.2439999999999996e-07, + "loss": 0.3221, + "mean_token_accuracy": 0.9225554093718529, + "num_tokens": 103704141.0, + "step": 1439 + }, + { + "epoch": 0.898841337338587, + "grad_norm": 0.2612161338329315, + "learning_rate": 4.24e-07, + "loss": 0.2833, + "mean_token_accuracy": 0.9333850555121899, + "num_tokens": 103780364.0, + "step": 1440 + }, + { + "epoch": 0.8994655327117388, + "grad_norm": 0.2581925094127655, + "learning_rate": 4.2359999999999995e-07, + "loss": 0.3342, + "mean_token_accuracy": 0.916926346719265, + "num_tokens": 103851906.0, + "step": 1441 + }, + { + "epoch": 0.9000897280848905, + "grad_norm": 0.3054351806640625, + "learning_rate": 4.232e-07, + "loss": 0.3249, + "mean_token_accuracy": 0.9235624521970749, + "num_tokens": 103924585.0, + "step": 1442 + }, + { + "epoch": 0.9007139234580424, + "grad_norm": 0.2061372697353363, + "learning_rate": 4.228e-07, + "loss": 0.2955, + "mean_token_accuracy": 0.9315860234200954, + "num_tokens": 103997926.0, + "step": 1443 + }, + { + "epoch": 0.9013381188311942, + "grad_norm": 0.2621244490146637, + "learning_rate": 4.2239999999999997e-07, + "loss": 0.2573, + "mean_token_accuracy": 0.938096433877945, + "num_tokens": 104075018.0, + "step": 1444 + }, + { + "epoch": 0.9019623142043459, + "grad_norm": 0.3371485471725464, + "learning_rate": 4.2199999999999994e-07, + "loss": 0.2838, + "mean_token_accuracy": 0.9332910068333149, + "num_tokens": 104149033.0, + "step": 1445 + }, + { + "epoch": 0.9025865095774978, + "grad_norm": 0.44770005345344543, + "learning_rate": 4.2159999999999996e-07, + "loss": 0.3047, + "mean_token_accuracy": 0.9244360588490963, + "num_tokens": 104225899.0, + "step": 1446 + }, + { + "epoch": 0.9032107049506496, + "grad_norm": 0.2618308663368225, + "learning_rate": 4.212e-07, + "loss": 0.3321, + "mean_token_accuracy": 0.9207029901444912, + "num_tokens": 104293916.0, + "step": 1447 + }, + { + "epoch": 0.9038349003238013, + "grad_norm": 0.4455057978630066, + "learning_rate": 4.208e-07, + "loss": 0.297, + "mean_token_accuracy": 0.9305690936744213, + "num_tokens": 104370688.0, + "step": 1448 + }, + { + "epoch": 0.9044590956969532, + "grad_norm": 0.21280695497989655, + "learning_rate": 4.204e-07, + "loss": 0.2673, + "mean_token_accuracy": 0.9366856068372726, + "num_tokens": 104444124.0, + "step": 1449 + }, + { + "epoch": 0.905083291070105, + "grad_norm": 0.26069003343582153, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.2987, + "mean_token_accuracy": 0.9314247816801071, + "num_tokens": 104521766.0, + "step": 1450 + }, + { + "epoch": 0.9057074864432567, + "grad_norm": 0.3021981418132782, + "learning_rate": 4.1959999999999997e-07, + "loss": 0.3232, + "mean_token_accuracy": 0.9226624257862568, + "num_tokens": 104591346.0, + "step": 1451 + }, + { + "epoch": 0.9063316818164086, + "grad_norm": 0.2334258258342743, + "learning_rate": 4.192e-07, + "loss": 0.2904, + "mean_token_accuracy": 0.9314080104231834, + "num_tokens": 104665976.0, + "step": 1452 + }, + { + "epoch": 0.9069558771895603, + "grad_norm": 0.46876949071884155, + "learning_rate": 4.1879999999999996e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9201930426061153, + "num_tokens": 104732932.0, + "step": 1453 + }, + { + "epoch": 0.9075800725627121, + "grad_norm": 0.34948328137397766, + "learning_rate": 4.184e-07, + "loss": 0.3081, + "mean_token_accuracy": 0.9265077896416187, + "num_tokens": 104808229.0, + "step": 1454 + }, + { + "epoch": 0.908204267935864, + "grad_norm": 0.2665959298610687, + "learning_rate": 4.1799999999999996e-07, + "loss": 0.3024, + "mean_token_accuracy": 0.9250709414482117, + "num_tokens": 104877299.0, + "step": 1455 + }, + { + "epoch": 0.9088284633090157, + "grad_norm": 0.23899903893470764, + "learning_rate": 4.1760000000000003e-07, + "loss": 0.3113, + "mean_token_accuracy": 0.9269956983625889, + "num_tokens": 104947146.0, + "step": 1456 + }, + { + "epoch": 0.9094526586821675, + "grad_norm": 0.23126071691513062, + "learning_rate": 4.172e-07, + "loss": 0.3398, + "mean_token_accuracy": 0.9173771739006042, + "num_tokens": 105014125.0, + "step": 1457 + }, + { + "epoch": 0.9100768540553194, + "grad_norm": 0.28226009011268616, + "learning_rate": 4.1679999999999997e-07, + "loss": 0.3168, + "mean_token_accuracy": 0.9257363602519035, + "num_tokens": 105084524.0, + "step": 1458 + }, + { + "epoch": 0.9107010494284711, + "grad_norm": 0.20844340324401855, + "learning_rate": 4.164e-07, + "loss": 0.2958, + "mean_token_accuracy": 0.9263611026108265, + "num_tokens": 105153854.0, + "step": 1459 + }, + { + "epoch": 0.9113252448016229, + "grad_norm": 0.2391640990972519, + "learning_rate": 4.1599999999999997e-07, + "loss": 0.2589, + "mean_token_accuracy": 0.9340651854872704, + "num_tokens": 105227740.0, + "step": 1460 + }, + { + "epoch": 0.9119494401747748, + "grad_norm": 0.19487611949443817, + "learning_rate": 4.156e-07, + "loss": 0.3124, + "mean_token_accuracy": 0.9277138002216816, + "num_tokens": 105299525.0, + "step": 1461 + }, + { + "epoch": 0.9125736355479265, + "grad_norm": 0.2152072638273239, + "learning_rate": 4.152e-07, + "loss": 0.2948, + "mean_token_accuracy": 0.9281187728047371, + "num_tokens": 105374057.0, + "step": 1462 + }, + { + "epoch": 0.9131978309210783, + "grad_norm": 0.47581884264945984, + "learning_rate": 4.148e-07, + "loss": 0.3005, + "mean_token_accuracy": 0.9262504018843174, + "num_tokens": 105446060.0, + "step": 1463 + }, + { + "epoch": 0.91382202629423, + "grad_norm": 0.2584288418292999, + "learning_rate": 4.1439999999999995e-07, + "loss": 0.3074, + "mean_token_accuracy": 0.9254106879234314, + "num_tokens": 105517381.0, + "step": 1464 + }, + { + "epoch": 0.9144462216673819, + "grad_norm": 0.2504914104938507, + "learning_rate": 4.14e-07, + "loss": 0.3345, + "mean_token_accuracy": 0.9195071533322334, + "num_tokens": 105582818.0, + "step": 1465 + }, + { + "epoch": 0.9150704170405337, + "grad_norm": 0.20424772799015045, + "learning_rate": 4.136e-07, + "loss": 0.3054, + "mean_token_accuracy": 0.924877367913723, + "num_tokens": 105656597.0, + "step": 1466 + }, + { + "epoch": 0.9156946124136854, + "grad_norm": 0.2176269143819809, + "learning_rate": 4.1319999999999997e-07, + "loss": 0.3595, + "mean_token_accuracy": 0.9168545268476009, + "num_tokens": 105727300.0, + "step": 1467 + }, + { + "epoch": 0.9163188077868373, + "grad_norm": 0.2896645963191986, + "learning_rate": 4.128e-07, + "loss": 0.3196, + "mean_token_accuracy": 0.9249995909631252, + "num_tokens": 105797858.0, + "step": 1468 + }, + { + "epoch": 0.9169430031599891, + "grad_norm": 0.2890515923500061, + "learning_rate": 4.1239999999999996e-07, + "loss": 0.3081, + "mean_token_accuracy": 0.9298246763646603, + "num_tokens": 105871622.0, + "step": 1469 + }, + { + "epoch": 0.9175671985331408, + "grad_norm": 0.1837138533592224, + "learning_rate": 4.12e-07, + "loss": 0.2939, + "mean_token_accuracy": 0.931295245885849, + "num_tokens": 105945664.0, + "step": 1470 + }, + { + "epoch": 0.9181913939062927, + "grad_norm": 0.23534689843654633, + "learning_rate": 4.116e-07, + "loss": 0.2805, + "mean_token_accuracy": 0.935580987483263, + "num_tokens": 106021039.0, + "step": 1471 + }, + { + "epoch": 0.9188155892794445, + "grad_norm": 0.1975099891424179, + "learning_rate": 4.112e-07, + "loss": 0.2929, + "mean_token_accuracy": 0.9288500659167767, + "num_tokens": 106096903.0, + "step": 1472 + }, + { + "epoch": 0.9194397846525962, + "grad_norm": 0.2099357396364212, + "learning_rate": 4.108e-07, + "loss": 0.3107, + "mean_token_accuracy": 0.9252267740666866, + "num_tokens": 106171723.0, + "step": 1473 + }, + { + "epoch": 0.9200639800257481, + "grad_norm": 0.5031565427780151, + "learning_rate": 4.1039999999999997e-07, + "loss": 0.264, + "mean_token_accuracy": 0.9377866312861443, + "num_tokens": 106248340.0, + "step": 1474 + }, + { + "epoch": 0.9206881753988999, + "grad_norm": 0.22324055433273315, + "learning_rate": 4.0999999999999994e-07, + "loss": 0.3141, + "mean_token_accuracy": 0.9218956641852856, + "num_tokens": 106326037.0, + "step": 1475 + }, + { + "epoch": 0.9213123707720516, + "grad_norm": 0.2573715150356293, + "learning_rate": 4.096e-07, + "loss": 0.2949, + "mean_token_accuracy": 0.9305706806480885, + "num_tokens": 106403257.0, + "step": 1476 + }, + { + "epoch": 0.9219365661452035, + "grad_norm": 0.38446667790412903, + "learning_rate": 4.092e-07, + "loss": 0.2963, + "mean_token_accuracy": 0.9294497668743134, + "num_tokens": 106479043.0, + "step": 1477 + }, + { + "epoch": 0.9225607615183552, + "grad_norm": 0.23064759373664856, + "learning_rate": 4.0879999999999995e-07, + "loss": 0.3021, + "mean_token_accuracy": 0.9309441335499287, + "num_tokens": 106552589.0, + "step": 1478 + }, + { + "epoch": 0.923184956891507, + "grad_norm": 0.2229718118906021, + "learning_rate": 4.084e-07, + "loss": 0.3306, + "mean_token_accuracy": 0.9224933795630932, + "num_tokens": 106624382.0, + "step": 1479 + }, + { + "epoch": 0.9238091522646589, + "grad_norm": 0.37829819321632385, + "learning_rate": 4.0799999999999995e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.929244838654995, + "num_tokens": 106695041.0, + "step": 1480 + }, + { + "epoch": 0.9244333476378106, + "grad_norm": 0.22006119787693024, + "learning_rate": 4.076e-07, + "loss": 0.3203, + "mean_token_accuracy": 0.926427386701107, + "num_tokens": 106771992.0, + "step": 1481 + }, + { + "epoch": 0.9250575430109624, + "grad_norm": 0.2141723781824112, + "learning_rate": 4.072e-07, + "loss": 0.3514, + "mean_token_accuracy": 0.9177560433745384, + "num_tokens": 106840815.0, + "step": 1482 + }, + { + "epoch": 0.9256817383841143, + "grad_norm": 0.23471519351005554, + "learning_rate": 4.0679999999999996e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.9215167723596096, + "num_tokens": 106916179.0, + "step": 1483 + }, + { + "epoch": 0.926305933757266, + "grad_norm": 0.18685398995876312, + "learning_rate": 4.064e-07, + "loss": 0.2892, + "mean_token_accuracy": 0.9317849949002266, + "num_tokens": 106989736.0, + "step": 1484 + }, + { + "epoch": 0.9269301291304178, + "grad_norm": 0.2012556940317154, + "learning_rate": 4.06e-07, + "loss": 0.2922, + "mean_token_accuracy": 0.927622564136982, + "num_tokens": 107063169.0, + "step": 1485 + }, + { + "epoch": 0.9275543245035697, + "grad_norm": 0.28088533878326416, + "learning_rate": 4.056e-07, + "loss": 0.2906, + "mean_token_accuracy": 0.9323969818651676, + "num_tokens": 107139016.0, + "step": 1486 + }, + { + "epoch": 0.9281785198767214, + "grad_norm": 0.18422377109527588, + "learning_rate": 4.052e-07, + "loss": 0.3284, + "mean_token_accuracy": 0.9211296960711479, + "num_tokens": 107210236.0, + "step": 1487 + }, + { + "epoch": 0.9288027152498732, + "grad_norm": 0.28987160325050354, + "learning_rate": 4.0479999999999997e-07, + "loss": 0.3061, + "mean_token_accuracy": 0.9298386536538601, + "num_tokens": 107284865.0, + "step": 1488 + }, + { + "epoch": 0.929426910623025, + "grad_norm": 0.9706665873527527, + "learning_rate": 4.0439999999999994e-07, + "loss": 0.2842, + "mean_token_accuracy": 0.9326089099049568, + "num_tokens": 107354579.0, + "step": 1489 + }, + { + "epoch": 0.9300511059961768, + "grad_norm": 0.2657856047153473, + "learning_rate": 4.04e-07, + "loss": 0.2851, + "mean_token_accuracy": 0.9294800274074078, + "num_tokens": 107428589.0, + "step": 1490 + }, + { + "epoch": 0.9306753013693286, + "grad_norm": 0.3612760007381439, + "learning_rate": 4.036e-07, + "loss": 0.3079, + "mean_token_accuracy": 0.921242218464613, + "num_tokens": 107495710.0, + "step": 1491 + }, + { + "epoch": 0.9312994967424804, + "grad_norm": 0.5441721081733704, + "learning_rate": 4.032e-07, + "loss": 0.298, + "mean_token_accuracy": 0.9271158240735531, + "num_tokens": 107569117.0, + "step": 1492 + }, + { + "epoch": 0.9319236921156322, + "grad_norm": 0.6686025857925415, + "learning_rate": 4.028e-07, + "loss": 0.3205, + "mean_token_accuracy": 0.9263519011437893, + "num_tokens": 107642742.0, + "step": 1493 + }, + { + "epoch": 0.932547887488784, + "grad_norm": 0.2342577576637268, + "learning_rate": 4.0239999999999995e-07, + "loss": 0.2661, + "mean_token_accuracy": 0.9362364374101162, + "num_tokens": 107718542.0, + "step": 1494 + }, + { + "epoch": 0.9331720828619358, + "grad_norm": 0.2320278435945511, + "learning_rate": 4.02e-07, + "loss": 0.3014, + "mean_token_accuracy": 0.927986603230238, + "num_tokens": 107792985.0, + "step": 1495 + }, + { + "epoch": 0.9337962782350876, + "grad_norm": 0.2515869736671448, + "learning_rate": 4.016e-07, + "loss": 0.3356, + "mean_token_accuracy": 0.9195610173046589, + "num_tokens": 107862065.0, + "step": 1496 + }, + { + "epoch": 0.9344204736082394, + "grad_norm": 0.25462406873703003, + "learning_rate": 4.0119999999999997e-07, + "loss": 0.3252, + "mean_token_accuracy": 0.9214262068271637, + "num_tokens": 107935372.0, + "step": 1497 + }, + { + "epoch": 0.9350446689813912, + "grad_norm": 0.21103344857692719, + "learning_rate": 4.008e-07, + "loss": 0.2949, + "mean_token_accuracy": 0.930181510746479, + "num_tokens": 108009349.0, + "step": 1498 + }, + { + "epoch": 0.935668864354543, + "grad_norm": 0.37491458654403687, + "learning_rate": 4.0039999999999996e-07, + "loss": 0.2988, + "mean_token_accuracy": 0.9286434836685658, + "num_tokens": 108082489.0, + "step": 1499 + }, + { + "epoch": 0.9362930597276947, + "grad_norm": 0.2380947470664978, + "learning_rate": 4e-07, + "loss": 0.295, + "mean_token_accuracy": 0.930321242660284, + "num_tokens": 108154270.0, + "step": 1500 + }, + { + "epoch": 0.9369172551008466, + "grad_norm": 0.30462646484375, + "learning_rate": 3.996e-07, + "loss": 0.3522, + "mean_token_accuracy": 0.9162776879966259, + "num_tokens": 108218475.0, + "step": 1501 + }, + { + "epoch": 0.9375414504739984, + "grad_norm": 1.3595620393753052, + "learning_rate": 3.992e-07, + "loss": 0.2639, + "mean_token_accuracy": 0.9372390434145927, + "num_tokens": 108296254.0, + "step": 1502 + }, + { + "epoch": 0.9381656458471501, + "grad_norm": 0.25855323672294617, + "learning_rate": 3.9879999999999994e-07, + "loss": 0.315, + "mean_token_accuracy": 0.9272872768342495, + "num_tokens": 108371905.0, + "step": 1503 + }, + { + "epoch": 0.938789841220302, + "grad_norm": 1.031731128692627, + "learning_rate": 3.9839999999999997e-07, + "loss": 0.2934, + "mean_token_accuracy": 0.930096935480833, + "num_tokens": 108446445.0, + "step": 1504 + }, + { + "epoch": 0.9394140365934538, + "grad_norm": 0.182292640209198, + "learning_rate": 3.98e-07, + "loss": 0.3041, + "mean_token_accuracy": 0.927067868411541, + "num_tokens": 108520578.0, + "step": 1505 + }, + { + "epoch": 0.9400382319666055, + "grad_norm": 0.3520655930042267, + "learning_rate": 3.976e-07, + "loss": 0.3119, + "mean_token_accuracy": 0.9269010163843632, + "num_tokens": 108588593.0, + "step": 1506 + }, + { + "epoch": 0.9406624273397574, + "grad_norm": 0.2086082249879837, + "learning_rate": 3.972e-07, + "loss": 0.3229, + "mean_token_accuracy": 0.9272143393754959, + "num_tokens": 108661555.0, + "step": 1507 + }, + { + "epoch": 0.9412866227129092, + "grad_norm": 1.9862885475158691, + "learning_rate": 3.9679999999999995e-07, + "loss": 0.2684, + "mean_token_accuracy": 0.9317057691514492, + "num_tokens": 108736764.0, + "step": 1508 + }, + { + "epoch": 0.9419108180860609, + "grad_norm": 0.30021563172340393, + "learning_rate": 3.964e-07, + "loss": 0.3262, + "mean_token_accuracy": 0.9201641120016575, + "num_tokens": 108807214.0, + "step": 1509 + }, + { + "epoch": 0.9425350134592128, + "grad_norm": 0.31187909841537476, + "learning_rate": 3.96e-07, + "loss": 0.3468, + "mean_token_accuracy": 0.9177064560353756, + "num_tokens": 108873548.0, + "step": 1510 + }, + { + "epoch": 0.9431592088323645, + "grad_norm": 0.3519929051399231, + "learning_rate": 3.9559999999999997e-07, + "loss": 0.349, + "mean_token_accuracy": 0.9177522212266922, + "num_tokens": 108939876.0, + "step": 1511 + }, + { + "epoch": 0.9437834042055163, + "grad_norm": 0.22746022045612335, + "learning_rate": 3.952e-07, + "loss": 0.3025, + "mean_token_accuracy": 0.9305769875645638, + "num_tokens": 109014399.0, + "step": 1512 + }, + { + "epoch": 0.9444075995786682, + "grad_norm": 0.28738677501678467, + "learning_rate": 3.9479999999999996e-07, + "loss": 0.3031, + "mean_token_accuracy": 0.9284808672964573, + "num_tokens": 109086154.0, + "step": 1513 + }, + { + "epoch": 0.9450317949518199, + "grad_norm": 0.31238189339637756, + "learning_rate": 3.9439999999999993e-07, + "loss": 0.3707, + "mean_token_accuracy": 0.9154385328292847, + "num_tokens": 109157863.0, + "step": 1514 + }, + { + "epoch": 0.9456559903249717, + "grad_norm": 0.27992525696754456, + "learning_rate": 3.94e-07, + "loss": 0.3164, + "mean_token_accuracy": 0.9292497560381889, + "num_tokens": 109226839.0, + "step": 1515 + }, + { + "epoch": 0.9462801856981236, + "grad_norm": 0.1819540560245514, + "learning_rate": 3.936e-07, + "loss": 0.3039, + "mean_token_accuracy": 0.9269283041357994, + "num_tokens": 109298518.0, + "step": 1516 + }, + { + "epoch": 0.9469043810712753, + "grad_norm": 0.4744091033935547, + "learning_rate": 3.932e-07, + "loss": 0.2858, + "mean_token_accuracy": 0.9316937737166882, + "num_tokens": 109370436.0, + "step": 1517 + }, + { + "epoch": 0.9475285764444271, + "grad_norm": 0.18300119042396545, + "learning_rate": 3.9279999999999997e-07, + "loss": 0.2883, + "mean_token_accuracy": 0.9324792064726353, + "num_tokens": 109443402.0, + "step": 1518 + }, + { + "epoch": 0.948152771817579, + "grad_norm": 0.23999902606010437, + "learning_rate": 3.924e-07, + "loss": 0.2837, + "mean_token_accuracy": 0.9304834380745888, + "num_tokens": 109520983.0, + "step": 1519 + }, + { + "epoch": 0.9487769671907307, + "grad_norm": 0.2669299840927124, + "learning_rate": 3.92e-07, + "loss": 0.292, + "mean_token_accuracy": 0.9306202046573162, + "num_tokens": 109595465.0, + "step": 1520 + }, + { + "epoch": 0.9494011625638825, + "grad_norm": 0.17913204431533813, + "learning_rate": 3.916e-07, + "loss": 0.2895, + "mean_token_accuracy": 0.9334805645048618, + "num_tokens": 109671698.0, + "step": 1521 + }, + { + "epoch": 0.9500253579370342, + "grad_norm": 0.22783441841602325, + "learning_rate": 3.9119999999999996e-07, + "loss": 0.2929, + "mean_token_accuracy": 0.9300796091556549, + "num_tokens": 109744027.0, + "step": 1522 + }, + { + "epoch": 0.9506495533101861, + "grad_norm": 0.21075208485126495, + "learning_rate": 3.908e-07, + "loss": 0.2575, + "mean_token_accuracy": 0.937855925410986, + "num_tokens": 109819541.0, + "step": 1523 + }, + { + "epoch": 0.9512737486833379, + "grad_norm": 0.26121848821640015, + "learning_rate": 3.904e-07, + "loss": 0.2823, + "mean_token_accuracy": 0.9332057945430279, + "num_tokens": 109897218.0, + "step": 1524 + }, + { + "epoch": 0.9518979440564896, + "grad_norm": 0.2823277413845062, + "learning_rate": 3.8999999999999997e-07, + "loss": 0.3207, + "mean_token_accuracy": 0.9221915118396282, + "num_tokens": 109972625.0, + "step": 1525 + }, + { + "epoch": 0.9525221394296415, + "grad_norm": 0.2156323939561844, + "learning_rate": 3.896e-07, + "loss": 0.286, + "mean_token_accuracy": 0.9325485862791538, + "num_tokens": 110046216.0, + "step": 1526 + }, + { + "epoch": 0.9531463348027933, + "grad_norm": 0.27934643626213074, + "learning_rate": 3.8919999999999996e-07, + "loss": 0.2482, + "mean_token_accuracy": 0.9399369433522224, + "num_tokens": 110124794.0, + "step": 1527 + }, + { + "epoch": 0.953770530175945, + "grad_norm": 0.28440818190574646, + "learning_rate": 3.888e-07, + "loss": 0.3376, + "mean_token_accuracy": 0.9220961183309555, + "num_tokens": 110195905.0, + "step": 1528 + }, + { + "epoch": 0.9543947255490969, + "grad_norm": 0.2609595060348511, + "learning_rate": 3.884e-07, + "loss": 0.3122, + "mean_token_accuracy": 0.9269909039139748, + "num_tokens": 110267358.0, + "step": 1529 + }, + { + "epoch": 0.9550189209222487, + "grad_norm": 0.32703161239624023, + "learning_rate": 3.88e-07, + "loss": 0.3036, + "mean_token_accuracy": 0.9273074865341187, + "num_tokens": 110340528.0, + "step": 1530 + }, + { + "epoch": 0.9556431162954004, + "grad_norm": 0.2319597452878952, + "learning_rate": 3.876e-07, + "loss": 0.273, + "mean_token_accuracy": 0.9353843182325363, + "num_tokens": 110412222.0, + "step": 1531 + }, + { + "epoch": 0.9562673116685523, + "grad_norm": 0.2587757706642151, + "learning_rate": 3.8719999999999997e-07, + "loss": 0.2976, + "mean_token_accuracy": 0.9331025555729866, + "num_tokens": 110488083.0, + "step": 1532 + }, + { + "epoch": 0.956891507041704, + "grad_norm": 0.33140167593955994, + "learning_rate": 3.8679999999999994e-07, + "loss": 0.2882, + "mean_token_accuracy": 0.9309447258710861, + "num_tokens": 110561742.0, + "step": 1533 + }, + { + "epoch": 0.9575157024148558, + "grad_norm": 0.2194199562072754, + "learning_rate": 3.864e-07, + "loss": 0.3363, + "mean_token_accuracy": 0.9203958660364151, + "num_tokens": 110632121.0, + "step": 1534 + }, + { + "epoch": 0.9581398977880077, + "grad_norm": 0.868539571762085, + "learning_rate": 3.86e-07, + "loss": 0.2923, + "mean_token_accuracy": 0.9303386136889458, + "num_tokens": 110702421.0, + "step": 1535 + }, + { + "epoch": 0.9587640931611594, + "grad_norm": 0.21790970861911774, + "learning_rate": 3.8559999999999996e-07, + "loss": 0.2563, + "mean_token_accuracy": 0.9372724965214729, + "num_tokens": 110781407.0, + "step": 1536 + }, + { + "epoch": 0.9593882885343112, + "grad_norm": 0.20846930146217346, + "learning_rate": 3.852e-07, + "loss": 0.3077, + "mean_token_accuracy": 0.9257776513695717, + "num_tokens": 110851351.0, + "step": 1537 + }, + { + "epoch": 0.9600124839074631, + "grad_norm": 0.4070652425289154, + "learning_rate": 3.8479999999999995e-07, + "loss": 0.2933, + "mean_token_accuracy": 0.9304019808769226, + "num_tokens": 110922845.0, + "step": 1538 + }, + { + "epoch": 0.9606366792806148, + "grad_norm": 1.4363555908203125, + "learning_rate": 3.8440000000000003e-07, + "loss": 0.2808, + "mean_token_accuracy": 0.9342383928596973, + "num_tokens": 110998770.0, + "step": 1539 + }, + { + "epoch": 0.9612608746537666, + "grad_norm": 0.2531324625015259, + "learning_rate": 3.84e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.9302497878670692, + "num_tokens": 111074267.0, + "step": 1540 + }, + { + "epoch": 0.9618850700269185, + "grad_norm": 0.2372715175151825, + "learning_rate": 3.8359999999999997e-07, + "loss": 0.3597, + "mean_token_accuracy": 0.9045576229691505, + "num_tokens": 111137224.0, + "step": 1541 + }, + { + "epoch": 0.9625092654000702, + "grad_norm": 0.2335701882839203, + "learning_rate": 3.832e-07, + "loss": 0.3571, + "mean_token_accuracy": 0.9170449376106262, + "num_tokens": 111205785.0, + "step": 1542 + }, + { + "epoch": 0.963133460773222, + "grad_norm": 0.2605380117893219, + "learning_rate": 3.8279999999999996e-07, + "loss": 0.29, + "mean_token_accuracy": 0.9323593750596046, + "num_tokens": 111279357.0, + "step": 1543 + }, + { + "epoch": 0.9637576561463738, + "grad_norm": 0.20021952688694, + "learning_rate": 3.824e-07, + "loss": 0.3092, + "mean_token_accuracy": 0.9272899143397808, + "num_tokens": 111351222.0, + "step": 1544 + }, + { + "epoch": 0.9643818515195256, + "grad_norm": 0.21225203573703766, + "learning_rate": 3.82e-07, + "loss": 0.2949, + "mean_token_accuracy": 0.9286953136324883, + "num_tokens": 111424591.0, + "step": 1545 + }, + { + "epoch": 0.9650060468926774, + "grad_norm": 0.2778775095939636, + "learning_rate": 3.816e-07, + "loss": 0.2902, + "mean_token_accuracy": 0.9321311637759209, + "num_tokens": 111498342.0, + "step": 1546 + }, + { + "epoch": 0.9656302422658292, + "grad_norm": 0.4023289084434509, + "learning_rate": 3.8119999999999995e-07, + "loss": 0.3119, + "mean_token_accuracy": 0.9269566349685192, + "num_tokens": 111573822.0, + "step": 1547 + }, + { + "epoch": 0.966254437638981, + "grad_norm": 0.39578378200531006, + "learning_rate": 3.808e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.923732940107584, + "num_tokens": 111645221.0, + "step": 1548 + }, + { + "epoch": 0.9668786330121328, + "grad_norm": 0.32403564453125, + "learning_rate": 3.804e-07, + "loss": 0.2979, + "mean_token_accuracy": 0.930999081581831, + "num_tokens": 111718248.0, + "step": 1549 + }, + { + "epoch": 0.9675028283852846, + "grad_norm": 0.43755772709846497, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.2908, + "mean_token_accuracy": 0.9285480976104736, + "num_tokens": 111793552.0, + "step": 1550 + }, + { + "epoch": 0.9681270237584364, + "grad_norm": 0.23464861512184143, + "learning_rate": 3.796e-07, + "loss": 0.3022, + "mean_token_accuracy": 0.9262748286128044, + "num_tokens": 111863279.0, + "step": 1551 + }, + { + "epoch": 0.9687512191315882, + "grad_norm": 0.23094603419303894, + "learning_rate": 3.7919999999999995e-07, + "loss": 0.3012, + "mean_token_accuracy": 0.9299602508544922, + "num_tokens": 111935329.0, + "step": 1552 + }, + { + "epoch": 0.96937541450474, + "grad_norm": 1.310754656791687, + "learning_rate": 3.7880000000000003e-07, + "loss": 0.3321, + "mean_token_accuracy": 0.9238010346889496, + "num_tokens": 112005349.0, + "step": 1553 + }, + { + "epoch": 0.9699996098778918, + "grad_norm": 0.26731136441230774, + "learning_rate": 3.784e-07, + "loss": 0.3185, + "mean_token_accuracy": 0.9252562634646893, + "num_tokens": 112075576.0, + "step": 1554 + }, + { + "epoch": 0.9706238052510435, + "grad_norm": 0.2373322695493698, + "learning_rate": 3.7799999999999997e-07, + "loss": 0.3152, + "mean_token_accuracy": 0.9211914502084255, + "num_tokens": 112145586.0, + "step": 1555 + }, + { + "epoch": 0.9712480006241954, + "grad_norm": 0.31798675656318665, + "learning_rate": 3.776e-07, + "loss": 0.2944, + "mean_token_accuracy": 0.9284104593098164, + "num_tokens": 112218496.0, + "step": 1556 + }, + { + "epoch": 0.9718721959973472, + "grad_norm": 0.7989190816879272, + "learning_rate": 3.7719999999999996e-07, + "loss": 0.2963, + "mean_token_accuracy": 0.9279116652905941, + "num_tokens": 112292072.0, + "step": 1557 + }, + { + "epoch": 0.9724963913704989, + "grad_norm": 0.27608856558799744, + "learning_rate": 3.768e-07, + "loss": 0.3264, + "mean_token_accuracy": 0.9196476824581623, + "num_tokens": 112359587.0, + "step": 1558 + }, + { + "epoch": 0.9731205867436508, + "grad_norm": 0.28271064162254333, + "learning_rate": 3.764e-07, + "loss": 0.2727, + "mean_token_accuracy": 0.9339366815984249, + "num_tokens": 112436108.0, + "step": 1559 + }, + { + "epoch": 0.9737447821168026, + "grad_norm": 0.2214769721031189, + "learning_rate": 3.76e-07, + "loss": 0.3389, + "mean_token_accuracy": 0.923375815153122, + "num_tokens": 112507644.0, + "step": 1560 + }, + { + "epoch": 0.9743689774899543, + "grad_norm": 0.27770498394966125, + "learning_rate": 3.7559999999999995e-07, + "loss": 0.2811, + "mean_token_accuracy": 0.9323306232690811, + "num_tokens": 112581332.0, + "step": 1561 + }, + { + "epoch": 0.9749931728631062, + "grad_norm": 0.286484956741333, + "learning_rate": 3.7519999999999997e-07, + "loss": 0.3333, + "mean_token_accuracy": 0.925186563283205, + "num_tokens": 112649311.0, + "step": 1562 + }, + { + "epoch": 0.975617368236258, + "grad_norm": 0.2229987531900406, + "learning_rate": 3.748e-07, + "loss": 0.2916, + "mean_token_accuracy": 0.9275497980415821, + "num_tokens": 112726077.0, + "step": 1563 + }, + { + "epoch": 0.9762415636094097, + "grad_norm": 0.24206797778606415, + "learning_rate": 3.744e-07, + "loss": 0.3324, + "mean_token_accuracy": 0.9200384020805359, + "num_tokens": 112797770.0, + "step": 1564 + }, + { + "epoch": 0.9768657589825616, + "grad_norm": 0.26477596163749695, + "learning_rate": 3.74e-07, + "loss": 0.3205, + "mean_token_accuracy": 0.9191819168627262, + "num_tokens": 112868340.0, + "step": 1565 + }, + { + "epoch": 0.9774899543557133, + "grad_norm": 0.7016551494598389, + "learning_rate": 3.7359999999999996e-07, + "loss": 0.3097, + "mean_token_accuracy": 0.927630215883255, + "num_tokens": 112942449.0, + "step": 1566 + }, + { + "epoch": 0.9781141497288651, + "grad_norm": 0.21598811447620392, + "learning_rate": 3.732e-07, + "loss": 0.287, + "mean_token_accuracy": 0.9339645206928253, + "num_tokens": 113014029.0, + "step": 1567 + }, + { + "epoch": 0.978738345102017, + "grad_norm": 0.1792646199464798, + "learning_rate": 3.728e-07, + "loss": 0.3022, + "mean_token_accuracy": 0.9261836335062981, + "num_tokens": 113088674.0, + "step": 1568 + }, + { + "epoch": 0.9793625404751687, + "grad_norm": 0.25003379583358765, + "learning_rate": 3.7239999999999997e-07, + "loss": 0.2643, + "mean_token_accuracy": 0.9395896829664707, + "num_tokens": 113162413.0, + "step": 1569 + }, + { + "epoch": 0.9799867358483205, + "grad_norm": 0.24991759657859802, + "learning_rate": 3.72e-07, + "loss": 0.29, + "mean_token_accuracy": 0.9313024692237377, + "num_tokens": 113238956.0, + "step": 1570 + }, + { + "epoch": 0.9806109312214724, + "grad_norm": 0.18492339551448822, + "learning_rate": 3.7159999999999997e-07, + "loss": 0.2936, + "mean_token_accuracy": 0.928056038916111, + "num_tokens": 113310549.0, + "step": 1571 + }, + { + "epoch": 0.9812351265946241, + "grad_norm": 0.2581470310688019, + "learning_rate": 3.7119999999999994e-07, + "loss": 0.3087, + "mean_token_accuracy": 0.9288634769618511, + "num_tokens": 113382881.0, + "step": 1572 + }, + { + "epoch": 0.9818593219677759, + "grad_norm": 0.23586726188659668, + "learning_rate": 3.708e-07, + "loss": 0.2777, + "mean_token_accuracy": 0.9338797517120838, + "num_tokens": 113456259.0, + "step": 1573 + }, + { + "epoch": 0.9824835173409278, + "grad_norm": 0.5645190477371216, + "learning_rate": 3.704e-07, + "loss": 0.3285, + "mean_token_accuracy": 0.9245781414210796, + "num_tokens": 113525856.0, + "step": 1574 + }, + { + "epoch": 0.9831077127140795, + "grad_norm": 0.2574668228626251, + "learning_rate": 3.7e-07, + "loss": 0.2941, + "mean_token_accuracy": 0.9290458224713802, + "num_tokens": 113599029.0, + "step": 1575 + }, + { + "epoch": 0.9837319080872313, + "grad_norm": 1.0085642337799072, + "learning_rate": 3.696e-07, + "loss": 0.2882, + "mean_token_accuracy": 0.9328582510352135, + "num_tokens": 113674990.0, + "step": 1576 + }, + { + "epoch": 0.984356103460383, + "grad_norm": 0.2103048861026764, + "learning_rate": 3.6919999999999994e-07, + "loss": 0.2929, + "mean_token_accuracy": 0.9308940954506397, + "num_tokens": 113750151.0, + "step": 1577 + }, + { + "epoch": 0.9849802988335349, + "grad_norm": 0.2094225138425827, + "learning_rate": 3.688e-07, + "loss": 0.3499, + "mean_token_accuracy": 0.9199853762984276, + "num_tokens": 113818971.0, + "step": 1578 + }, + { + "epoch": 0.9856044942066867, + "grad_norm": 0.23809947073459625, + "learning_rate": 3.684e-07, + "loss": 0.3091, + "mean_token_accuracy": 0.9281027875840664, + "num_tokens": 113890526.0, + "step": 1579 + }, + { + "epoch": 0.9862286895798384, + "grad_norm": 0.2628929913043976, + "learning_rate": 3.6799999999999996e-07, + "loss": 0.3014, + "mean_token_accuracy": 0.9268261343240738, + "num_tokens": 113963906.0, + "step": 1580 + }, + { + "epoch": 0.9868528849529903, + "grad_norm": 0.43035009503364563, + "learning_rate": 3.676e-07, + "loss": 0.3383, + "mean_token_accuracy": 0.9175631441175938, + "num_tokens": 114032000.0, + "step": 1581 + }, + { + "epoch": 0.9874770803261421, + "grad_norm": 1.0745575428009033, + "learning_rate": 3.672e-07, + "loss": 0.3258, + "mean_token_accuracy": 0.9206287935376167, + "num_tokens": 114099138.0, + "step": 1582 + }, + { + "epoch": 0.9881012756992938, + "grad_norm": 0.16735684871673584, + "learning_rate": 3.668e-07, + "loss": 0.3243, + "mean_token_accuracy": 0.920817207545042, + "num_tokens": 114172003.0, + "step": 1583 + }, + { + "epoch": 0.9887254710724457, + "grad_norm": 0.2557661831378937, + "learning_rate": 3.664e-07, + "loss": 0.3405, + "mean_token_accuracy": 0.918581310659647, + "num_tokens": 114243808.0, + "step": 1584 + }, + { + "epoch": 0.9893496664455975, + "grad_norm": 0.28524208068847656, + "learning_rate": 3.6599999999999997e-07, + "loss": 0.324, + "mean_token_accuracy": 0.9222669005393982, + "num_tokens": 114313460.0, + "step": 1585 + }, + { + "epoch": 0.9899738618187492, + "grad_norm": 0.8366701602935791, + "learning_rate": 3.6559999999999994e-07, + "loss": 0.3099, + "mean_token_accuracy": 0.9263188950717449, + "num_tokens": 114386909.0, + "step": 1586 + }, + { + "epoch": 0.9905980571919011, + "grad_norm": 0.27082502841949463, + "learning_rate": 3.652e-07, + "loss": 0.3239, + "mean_token_accuracy": 0.923606026917696, + "num_tokens": 114457001.0, + "step": 1587 + }, + { + "epoch": 0.9912222525650528, + "grad_norm": 0.18909859657287598, + "learning_rate": 3.648e-07, + "loss": 0.323, + "mean_token_accuracy": 0.9222583919763565, + "num_tokens": 114526052.0, + "step": 1588 + }, + { + "epoch": 0.9918464479382046, + "grad_norm": 1.0308641195297241, + "learning_rate": 3.644e-07, + "loss": 0.2968, + "mean_token_accuracy": 0.9295001737773418, + "num_tokens": 114599249.0, + "step": 1589 + }, + { + "epoch": 0.9924706433113565, + "grad_norm": 0.2252720594406128, + "learning_rate": 3.64e-07, + "loss": 0.2803, + "mean_token_accuracy": 0.9300199374556541, + "num_tokens": 114675224.0, + "step": 1590 + }, + { + "epoch": 0.9930948386845082, + "grad_norm": 0.3800940215587616, + "learning_rate": 3.6359999999999995e-07, + "loss": 0.2791, + "mean_token_accuracy": 0.9354421310126781, + "num_tokens": 114750616.0, + "step": 1591 + }, + { + "epoch": 0.99371903405766, + "grad_norm": 0.22756795585155487, + "learning_rate": 3.632e-07, + "loss": 0.2601, + "mean_token_accuracy": 0.9382712170481682, + "num_tokens": 114825081.0, + "step": 1592 + }, + { + "epoch": 0.9943432294308119, + "grad_norm": 0.23344050347805023, + "learning_rate": 3.628e-07, + "loss": 0.2691, + "mean_token_accuracy": 0.9366080015897751, + "num_tokens": 114904135.0, + "step": 1593 + }, + { + "epoch": 0.9949674248039636, + "grad_norm": 0.2361176311969757, + "learning_rate": 3.6239999999999996e-07, + "loss": 0.3222, + "mean_token_accuracy": 0.9253505617380142, + "num_tokens": 114974359.0, + "step": 1594 + }, + { + "epoch": 0.9955916201771154, + "grad_norm": 0.3250555694103241, + "learning_rate": 3.62e-07, + "loss": 0.2891, + "mean_token_accuracy": 0.9304782710969448, + "num_tokens": 115044587.0, + "step": 1595 + }, + { + "epoch": 0.9962158155502673, + "grad_norm": 0.18662358820438385, + "learning_rate": 3.6159999999999996e-07, + "loss": 0.3396, + "mean_token_accuracy": 0.917623694986105, + "num_tokens": 115112206.0, + "step": 1596 + }, + { + "epoch": 0.996840010923419, + "grad_norm": 0.3345811665058136, + "learning_rate": 3.612e-07, + "loss": 0.2856, + "mean_token_accuracy": 0.9328136704862118, + "num_tokens": 115188401.0, + "step": 1597 + }, + { + "epoch": 0.9974642062965708, + "grad_norm": 0.18522153794765472, + "learning_rate": 3.608e-07, + "loss": 0.2991, + "mean_token_accuracy": 0.9287934675812721, + "num_tokens": 115265109.0, + "step": 1598 + }, + { + "epoch": 0.9980884016697226, + "grad_norm": 0.18597567081451416, + "learning_rate": 3.6039999999999997e-07, + "loss": 0.3094, + "mean_token_accuracy": 0.9246964007616043, + "num_tokens": 115332618.0, + "step": 1599 + }, + { + "epoch": 0.9987125970428744, + "grad_norm": 0.32469889521598816, + "learning_rate": 3.6e-07, + "loss": 0.2591, + "mean_token_accuracy": 0.9359429776668549, + "num_tokens": 115406878.0, + "step": 1600 + }, + { + "epoch": 0.9993367924160262, + "grad_norm": 0.15983019769191742, + "learning_rate": 3.5959999999999996e-07, + "loss": 0.2981, + "mean_token_accuracy": 0.9248366616666317, + "num_tokens": 115478211.0, + "step": 1601 + }, + { + "epoch": 0.999960987789178, + "grad_norm": 0.17861220240592957, + "learning_rate": 3.592e-07, + "loss": 0.299, + "mean_token_accuracy": 0.9245053008198738, + "num_tokens": 115550676.0, + "step": 1602 + }, + { + "epoch": 1.0, + "grad_norm": 0.21406081318855286, + "learning_rate": 3.588e-07, + "loss": 0.0195, + "mean_token_accuracy": 0.9274051785469055, + "num_tokens": 115554704.0, + "step": 1603 + }, + { + "epoch": 1.0006241953731518, + "grad_norm": 0.1932128220796585, + "learning_rate": 3.584e-07, + "loss": 0.2932, + "mean_token_accuracy": 0.931299164891243, + "num_tokens": 115629543.0, + "step": 1604 + }, + { + "epoch": 1.0012483907463037, + "grad_norm": 0.38350042700767517, + "learning_rate": 3.5799999999999995e-07, + "loss": 0.3342, + "mean_token_accuracy": 0.9217900969088078, + "num_tokens": 115699814.0, + "step": 1605 + }, + { + "epoch": 1.0018725861194553, + "grad_norm": 0.6531975269317627, + "learning_rate": 3.5759999999999997e-07, + "loss": 0.3055, + "mean_token_accuracy": 0.9274476133286953, + "num_tokens": 115770372.0, + "step": 1606 + }, + { + "epoch": 1.0024967814926071, + "grad_norm": 0.39208337664604187, + "learning_rate": 3.572e-07, + "loss": 0.3113, + "mean_token_accuracy": 0.9228588417172432, + "num_tokens": 115844300.0, + "step": 1607 + }, + { + "epoch": 1.003120976865759, + "grad_norm": 0.2965168356895447, + "learning_rate": 3.5679999999999997e-07, + "loss": 0.3028, + "mean_token_accuracy": 0.9295454099774361, + "num_tokens": 115921833.0, + "step": 1608 + }, + { + "epoch": 1.0037451722389108, + "grad_norm": 0.3321475684642792, + "learning_rate": 3.564e-07, + "loss": 0.3204, + "mean_token_accuracy": 0.9248853847384453, + "num_tokens": 115993774.0, + "step": 1609 + }, + { + "epoch": 1.0043693676120626, + "grad_norm": 0.33594146370887756, + "learning_rate": 3.5599999999999996e-07, + "loss": 0.2804, + "mean_token_accuracy": 0.9334884472191334, + "num_tokens": 116066435.0, + "step": 1610 + }, + { + "epoch": 1.0049935629852145, + "grad_norm": 0.4940372407436371, + "learning_rate": 3.5560000000000003e-07, + "loss": 0.2817, + "mean_token_accuracy": 0.9334036894142628, + "num_tokens": 116142627.0, + "step": 1611 + }, + { + "epoch": 1.005617758358366, + "grad_norm": 0.6876238584518433, + "learning_rate": 3.552e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.919537615031004, + "num_tokens": 116210467.0, + "step": 1612 + }, + { + "epoch": 1.006241953731518, + "grad_norm": 0.4228580594062805, + "learning_rate": 3.548e-07, + "loss": 0.3399, + "mean_token_accuracy": 0.9189438149333, + "num_tokens": 116284293.0, + "step": 1613 + }, + { + "epoch": 1.0068661491046698, + "grad_norm": 0.2676751911640167, + "learning_rate": 3.544e-07, + "loss": 0.3339, + "mean_token_accuracy": 0.9225188121199608, + "num_tokens": 116357382.0, + "step": 1614 + }, + { + "epoch": 1.0074903444778216, + "grad_norm": 0.803371250629425, + "learning_rate": 3.5399999999999997e-07, + "loss": 0.3234, + "mean_token_accuracy": 0.9196959435939789, + "num_tokens": 116427789.0, + "step": 1615 + }, + { + "epoch": 1.0081145398509734, + "grad_norm": 0.2615061402320862, + "learning_rate": 3.536e-07, + "loss": 0.2987, + "mean_token_accuracy": 0.931877925992012, + "num_tokens": 116502948.0, + "step": 1616 + }, + { + "epoch": 1.008738735224125, + "grad_norm": 0.2492731809616089, + "learning_rate": 3.532e-07, + "loss": 0.3161, + "mean_token_accuracy": 0.9250870570540428, + "num_tokens": 116577348.0, + "step": 1617 + }, + { + "epoch": 1.0093629305972769, + "grad_norm": 0.6607505083084106, + "learning_rate": 3.528e-07, + "loss": 0.3246, + "mean_token_accuracy": 0.9235294535756111, + "num_tokens": 116647281.0, + "step": 1618 + }, + { + "epoch": 1.0099871259704287, + "grad_norm": 0.23677097260951996, + "learning_rate": 3.5239999999999995e-07, + "loss": 0.3303, + "mean_token_accuracy": 0.9184599705040455, + "num_tokens": 116721149.0, + "step": 1619 + }, + { + "epoch": 1.0106113213435806, + "grad_norm": 0.26639312505722046, + "learning_rate": 3.52e-07, + "loss": 0.2974, + "mean_token_accuracy": 0.9295490384101868, + "num_tokens": 116792448.0, + "step": 1620 + }, + { + "epoch": 1.0112355167167324, + "grad_norm": 0.23138631880283356, + "learning_rate": 3.516e-07, + "loss": 0.2948, + "mean_token_accuracy": 0.9309183284640312, + "num_tokens": 116866573.0, + "step": 1621 + }, + { + "epoch": 1.0118597120898842, + "grad_norm": 0.22451195120811462, + "learning_rate": 3.512e-07, + "loss": 0.3587, + "mean_token_accuracy": 0.9159196130931377, + "num_tokens": 116936203.0, + "step": 1622 + }, + { + "epoch": 1.0124839074630358, + "grad_norm": 0.25354111194610596, + "learning_rate": 3.508e-07, + "loss": 0.304, + "mean_token_accuracy": 0.9258918017148972, + "num_tokens": 117008314.0, + "step": 1623 + }, + { + "epoch": 1.0131081028361877, + "grad_norm": 0.19401562213897705, + "learning_rate": 3.5039999999999996e-07, + "loss": 0.293, + "mean_token_accuracy": 0.9310904443264008, + "num_tokens": 117080036.0, + "step": 1624 + }, + { + "epoch": 1.0137322982093395, + "grad_norm": 0.33229368925094604, + "learning_rate": 3.5e-07, + "loss": 0.3217, + "mean_token_accuracy": 0.9226459600031376, + "num_tokens": 117149721.0, + "step": 1625 + }, + { + "epoch": 1.0143564935824914, + "grad_norm": 0.2286362200975418, + "learning_rate": 3.496e-07, + "loss": 0.3195, + "mean_token_accuracy": 0.9248562790453434, + "num_tokens": 117217613.0, + "step": 1626 + }, + { + "epoch": 1.0149806889556432, + "grad_norm": 0.22259265184402466, + "learning_rate": 3.492e-07, + "loss": 0.339, + "mean_token_accuracy": 0.9152814075350761, + "num_tokens": 117280604.0, + "step": 1627 + }, + { + "epoch": 1.0156048843287948, + "grad_norm": 0.3311367928981781, + "learning_rate": 3.488e-07, + "loss": 0.3606, + "mean_token_accuracy": 0.9144710674881935, + "num_tokens": 117348258.0, + "step": 1628 + }, + { + "epoch": 1.0162290797019466, + "grad_norm": 0.18232843279838562, + "learning_rate": 3.4839999999999997e-07, + "loss": 0.283, + "mean_token_accuracy": 0.9324621446430683, + "num_tokens": 117423219.0, + "step": 1629 + }, + { + "epoch": 1.0168532750750985, + "grad_norm": 0.25107041001319885, + "learning_rate": 3.4799999999999994e-07, + "loss": 0.3193, + "mean_token_accuracy": 0.9182125739753246, + "num_tokens": 117495025.0, + "step": 1630 + }, + { + "epoch": 1.0174774704482503, + "grad_norm": 0.2481946349143982, + "learning_rate": 3.476e-07, + "loss": 0.3084, + "mean_token_accuracy": 0.9280303232371807, + "num_tokens": 117566026.0, + "step": 1631 + }, + { + "epoch": 1.0181016658214022, + "grad_norm": 0.2603878676891327, + "learning_rate": 3.472e-07, + "loss": 0.3289, + "mean_token_accuracy": 0.9221860133111477, + "num_tokens": 117636183.0, + "step": 1632 + }, + { + "epoch": 1.018725861194554, + "grad_norm": 0.28766804933547974, + "learning_rate": 3.4679999999999996e-07, + "loss": 0.3314, + "mean_token_accuracy": 0.9180422127246857, + "num_tokens": 117708530.0, + "step": 1633 + }, + { + "epoch": 1.0193500565677056, + "grad_norm": 0.2663969099521637, + "learning_rate": 3.464e-07, + "loss": 0.2938, + "mean_token_accuracy": 0.9303982965648174, + "num_tokens": 117778283.0, + "step": 1634 + }, + { + "epoch": 1.0199742519408574, + "grad_norm": 0.2180851399898529, + "learning_rate": 3.4599999999999995e-07, + "loss": 0.3404, + "mean_token_accuracy": 0.9210164174437523, + "num_tokens": 117850137.0, + "step": 1635 + }, + { + "epoch": 1.0205984473140093, + "grad_norm": 0.309143990278244, + "learning_rate": 3.456e-07, + "loss": 0.3346, + "mean_token_accuracy": 0.9202089495956898, + "num_tokens": 117924616.0, + "step": 1636 + }, + { + "epoch": 1.0212226426871611, + "grad_norm": 0.38292497396469116, + "learning_rate": 3.452e-07, + "loss": 0.2571, + "mean_token_accuracy": 0.9372935742139816, + "num_tokens": 117999674.0, + "step": 1637 + }, + { + "epoch": 1.021846838060313, + "grad_norm": 0.19640207290649414, + "learning_rate": 3.4479999999999996e-07, + "loss": 0.3306, + "mean_token_accuracy": 0.917077299207449, + "num_tokens": 118068273.0, + "step": 1638 + }, + { + "epoch": 1.0224710334334646, + "grad_norm": 0.28674083948135376, + "learning_rate": 3.444e-07, + "loss": 0.2825, + "mean_token_accuracy": 0.9323832429945469, + "num_tokens": 118140623.0, + "step": 1639 + }, + { + "epoch": 1.0230952288066164, + "grad_norm": 0.2557254731655121, + "learning_rate": 3.4399999999999996e-07, + "loss": 0.311, + "mean_token_accuracy": 0.9289374426007271, + "num_tokens": 118210409.0, + "step": 1640 + }, + { + "epoch": 1.0237194241797682, + "grad_norm": 0.25651875138282776, + "learning_rate": 3.436e-07, + "loss": 0.3084, + "mean_token_accuracy": 0.9252827316522598, + "num_tokens": 118275511.0, + "step": 1641 + }, + { + "epoch": 1.02434361955292, + "grad_norm": 0.2779063284397125, + "learning_rate": 3.432e-07, + "loss": 0.3069, + "mean_token_accuracy": 0.9300004839897156, + "num_tokens": 118349812.0, + "step": 1642 + }, + { + "epoch": 1.024967814926072, + "grad_norm": 0.33479711413383484, + "learning_rate": 3.4279999999999997e-07, + "loss": 0.2863, + "mean_token_accuracy": 0.9282204061746597, + "num_tokens": 118425899.0, + "step": 1643 + }, + { + "epoch": 1.0255920102992238, + "grad_norm": 0.30559638142585754, + "learning_rate": 3.4239999999999994e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9294515438377857, + "num_tokens": 118497893.0, + "step": 1644 + }, + { + "epoch": 1.0262162056723754, + "grad_norm": 0.2491634488105774, + "learning_rate": 3.42e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.9191662259399891, + "num_tokens": 118564851.0, + "step": 1645 + }, + { + "epoch": 1.0268404010455272, + "grad_norm": 0.23112337291240692, + "learning_rate": 3.416e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9221124276518822, + "num_tokens": 118634298.0, + "step": 1646 + }, + { + "epoch": 1.027464596418679, + "grad_norm": 0.23080191016197205, + "learning_rate": 3.412e-07, + "loss": 0.3247, + "mean_token_accuracy": 0.9221488758921623, + "num_tokens": 118702824.0, + "step": 1647 + }, + { + "epoch": 1.0280887917918309, + "grad_norm": 0.31089046597480774, + "learning_rate": 3.408e-07, + "loss": 0.2978, + "mean_token_accuracy": 0.9307245127856731, + "num_tokens": 118776548.0, + "step": 1648 + }, + { + "epoch": 1.0287129871649827, + "grad_norm": 0.2157994657754898, + "learning_rate": 3.4039999999999995e-07, + "loss": 0.2958, + "mean_token_accuracy": 0.9304368309676647, + "num_tokens": 118849065.0, + "step": 1649 + }, + { + "epoch": 1.0293371825381343, + "grad_norm": 0.2942715287208557, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.3105, + "mean_token_accuracy": 0.9245512895286083, + "num_tokens": 118919855.0, + "step": 1650 + }, + { + "epoch": 1.0299613779112862, + "grad_norm": 0.39231306314468384, + "learning_rate": 3.396e-07, + "loss": 0.3454, + "mean_token_accuracy": 0.9197710901498795, + "num_tokens": 118990180.0, + "step": 1651 + }, + { + "epoch": 1.030585573284438, + "grad_norm": 0.20545826852321625, + "learning_rate": 3.3919999999999997e-07, + "loss": 0.285, + "mean_token_accuracy": 0.9301052279770374, + "num_tokens": 119062537.0, + "step": 1652 + }, + { + "epoch": 1.0312097686575898, + "grad_norm": 0.5343942642211914, + "learning_rate": 3.388e-07, + "loss": 0.2987, + "mean_token_accuracy": 0.9297160319983959, + "num_tokens": 119132521.0, + "step": 1653 + }, + { + "epoch": 1.0318339640307417, + "grad_norm": 0.32938653230667114, + "learning_rate": 3.3839999999999996e-07, + "loss": 0.3297, + "mean_token_accuracy": 0.918176893144846, + "num_tokens": 119199786.0, + "step": 1654 + }, + { + "epoch": 1.0324581594038935, + "grad_norm": 0.34050726890563965, + "learning_rate": 3.38e-07, + "loss": 0.2657, + "mean_token_accuracy": 0.938018050044775, + "num_tokens": 119278460.0, + "step": 1655 + }, + { + "epoch": 1.0330823547770451, + "grad_norm": 0.1936492621898651, + "learning_rate": 3.376e-07, + "loss": 0.3248, + "mean_token_accuracy": 0.9237176179885864, + "num_tokens": 119351892.0, + "step": 1656 + }, + { + "epoch": 1.033706550150197, + "grad_norm": 0.2547052800655365, + "learning_rate": 3.372e-07, + "loss": 0.3013, + "mean_token_accuracy": 0.9312080107629299, + "num_tokens": 119423625.0, + "step": 1657 + }, + { + "epoch": 1.0343307455233488, + "grad_norm": 0.2038017362356186, + "learning_rate": 3.368e-07, + "loss": 0.3307, + "mean_token_accuracy": 0.917379055172205, + "num_tokens": 119492209.0, + "step": 1658 + }, + { + "epoch": 1.0349549408965006, + "grad_norm": 0.3556070923805237, + "learning_rate": 3.3639999999999997e-07, + "loss": 0.3034, + "mean_token_accuracy": 0.9250662997364998, + "num_tokens": 119566810.0, + "step": 1659 + }, + { + "epoch": 1.0355791362696525, + "grad_norm": 0.18778745830059052, + "learning_rate": 3.36e-07, + "loss": 0.3259, + "mean_token_accuracy": 0.924112644046545, + "num_tokens": 119641018.0, + "step": 1660 + }, + { + "epoch": 1.036203331642804, + "grad_norm": 0.20863384008407593, + "learning_rate": 3.356e-07, + "loss": 0.2937, + "mean_token_accuracy": 0.9315247312188148, + "num_tokens": 119714976.0, + "step": 1661 + }, + { + "epoch": 1.036827527015956, + "grad_norm": 0.1898520141839981, + "learning_rate": 3.352e-07, + "loss": 0.2817, + "mean_token_accuracy": 0.9316240064799786, + "num_tokens": 119787674.0, + "step": 1662 + }, + { + "epoch": 1.0374517223891078, + "grad_norm": 0.20981153845787048, + "learning_rate": 3.3479999999999995e-07, + "loss": 0.2825, + "mean_token_accuracy": 0.9336193650960922, + "num_tokens": 119861520.0, + "step": 1663 + }, + { + "epoch": 1.0380759177622596, + "grad_norm": 0.1972300112247467, + "learning_rate": 3.344e-07, + "loss": 0.3237, + "mean_token_accuracy": 0.9242386296391487, + "num_tokens": 119933385.0, + "step": 1664 + }, + { + "epoch": 1.0387001131354114, + "grad_norm": 0.24650917947292328, + "learning_rate": 3.34e-07, + "loss": 0.3056, + "mean_token_accuracy": 0.9223498292267323, + "num_tokens": 120002717.0, + "step": 1665 + }, + { + "epoch": 1.0393243085085633, + "grad_norm": 1.1208730936050415, + "learning_rate": 3.3359999999999997e-07, + "loss": 0.2954, + "mean_token_accuracy": 0.9291603602468967, + "num_tokens": 120075109.0, + "step": 1666 + }, + { + "epoch": 1.039948503881715, + "grad_norm": 0.21498827636241913, + "learning_rate": 3.332e-07, + "loss": 0.2812, + "mean_token_accuracy": 0.9332129769027233, + "num_tokens": 120148245.0, + "step": 1667 + }, + { + "epoch": 1.0405726992548667, + "grad_norm": 0.19925834238529205, + "learning_rate": 3.3279999999999996e-07, + "loss": 0.3305, + "mean_token_accuracy": 0.9193913899362087, + "num_tokens": 120221325.0, + "step": 1668 + }, + { + "epoch": 1.0411968946280186, + "grad_norm": 0.27961692214012146, + "learning_rate": 3.3239999999999993e-07, + "loss": 0.3156, + "mean_token_accuracy": 0.9253822788596153, + "num_tokens": 120295649.0, + "step": 1669 + }, + { + "epoch": 1.0418210900011704, + "grad_norm": 0.5844243764877319, + "learning_rate": 3.32e-07, + "loss": 0.3072, + "mean_token_accuracy": 0.9276253059506416, + "num_tokens": 120365066.0, + "step": 1670 + }, + { + "epoch": 1.0424452853743222, + "grad_norm": 0.23365755379199982, + "learning_rate": 3.316e-07, + "loss": 0.283, + "mean_token_accuracy": 0.9274475127458572, + "num_tokens": 120438639.0, + "step": 1671 + }, + { + "epoch": 1.0430694807474739, + "grad_norm": 0.1777581423521042, + "learning_rate": 3.312e-07, + "loss": 0.3047, + "mean_token_accuracy": 0.9253861084580421, + "num_tokens": 120507266.0, + "step": 1672 + }, + { + "epoch": 1.0436936761206257, + "grad_norm": 0.29000213742256165, + "learning_rate": 3.3079999999999997e-07, + "loss": 0.2925, + "mean_token_accuracy": 0.9297806397080421, + "num_tokens": 120582111.0, + "step": 1673 + }, + { + "epoch": 1.0443178714937775, + "grad_norm": 0.2519325613975525, + "learning_rate": 3.304e-07, + "loss": 0.3634, + "mean_token_accuracy": 0.9164963699877262, + "num_tokens": 120652117.0, + "step": 1674 + }, + { + "epoch": 1.0449420668669294, + "grad_norm": 0.19615894556045532, + "learning_rate": 3.3e-07, + "loss": 0.2555, + "mean_token_accuracy": 0.9374530576169491, + "num_tokens": 120734201.0, + "step": 1675 + }, + { + "epoch": 1.0455662622400812, + "grad_norm": 0.21159622073173523, + "learning_rate": 3.296e-07, + "loss": 0.3205, + "mean_token_accuracy": 0.9206772930920124, + "num_tokens": 120804916.0, + "step": 1676 + }, + { + "epoch": 1.046190457613233, + "grad_norm": 0.24135659635066986, + "learning_rate": 3.2919999999999996e-07, + "loss": 0.2538, + "mean_token_accuracy": 0.9383780732750893, + "num_tokens": 120879524.0, + "step": 1677 + }, + { + "epoch": 1.0468146529863847, + "grad_norm": 0.18967393040657043, + "learning_rate": 3.288e-07, + "loss": 0.3333, + "mean_token_accuracy": 0.9209445491433144, + "num_tokens": 120944587.0, + "step": 1678 + }, + { + "epoch": 1.0474388483595365, + "grad_norm": 0.30667001008987427, + "learning_rate": 3.284e-07, + "loss": 0.3431, + "mean_token_accuracy": 0.919658225029707, + "num_tokens": 121014046.0, + "step": 1679 + }, + { + "epoch": 1.0480630437326883, + "grad_norm": 0.20186658203601837, + "learning_rate": 3.28e-07, + "loss": 0.2749, + "mean_token_accuracy": 0.9354453422129154, + "num_tokens": 121088821.0, + "step": 1680 + }, + { + "epoch": 1.0486872391058402, + "grad_norm": 0.5021734833717346, + "learning_rate": 3.276e-07, + "loss": 0.3169, + "mean_token_accuracy": 0.9239302389323711, + "num_tokens": 121160017.0, + "step": 1681 + }, + { + "epoch": 1.049311434478992, + "grad_norm": 0.9187889695167542, + "learning_rate": 3.2719999999999997e-07, + "loss": 0.303, + "mean_token_accuracy": 0.9272826686501503, + "num_tokens": 121230296.0, + "step": 1682 + }, + { + "epoch": 1.0499356298521436, + "grad_norm": 0.3190925121307373, + "learning_rate": 3.268e-07, + "loss": 0.3076, + "mean_token_accuracy": 0.9248042553663254, + "num_tokens": 121305260.0, + "step": 1683 + }, + { + "epoch": 1.0505598252252955, + "grad_norm": 0.223711296916008, + "learning_rate": 3.264e-07, + "loss": 0.3236, + "mean_token_accuracy": 0.9252514615654945, + "num_tokens": 121378602.0, + "step": 1684 + }, + { + "epoch": 1.0511840205984473, + "grad_norm": 0.24088886380195618, + "learning_rate": 3.26e-07, + "loss": 0.3419, + "mean_token_accuracy": 0.9224191457033157, + "num_tokens": 121451497.0, + "step": 1685 + }, + { + "epoch": 1.0518082159715991, + "grad_norm": 0.470156729221344, + "learning_rate": 3.256e-07, + "loss": 0.3413, + "mean_token_accuracy": 0.9210297800600529, + "num_tokens": 121522979.0, + "step": 1686 + }, + { + "epoch": 1.052432411344751, + "grad_norm": 0.3204937279224396, + "learning_rate": 3.252e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9301727637648582, + "num_tokens": 121594750.0, + "step": 1687 + }, + { + "epoch": 1.0530566067179028, + "grad_norm": 0.20025834441184998, + "learning_rate": 3.2479999999999994e-07, + "loss": 0.3143, + "mean_token_accuracy": 0.9265815317630768, + "num_tokens": 121669791.0, + "step": 1688 + }, + { + "epoch": 1.0536808020910544, + "grad_norm": 0.27092644572257996, + "learning_rate": 3.244e-07, + "loss": 0.2713, + "mean_token_accuracy": 0.9341400675475597, + "num_tokens": 121744168.0, + "step": 1689 + }, + { + "epoch": 1.0543049974642063, + "grad_norm": 0.2896052300930023, + "learning_rate": 3.24e-07, + "loss": 0.293, + "mean_token_accuracy": 0.9298077523708344, + "num_tokens": 121818162.0, + "step": 1690 + }, + { + "epoch": 1.054929192837358, + "grad_norm": 0.1917794793844223, + "learning_rate": 3.2359999999999996e-07, + "loss": 0.3018, + "mean_token_accuracy": 0.9297118373215199, + "num_tokens": 121889343.0, + "step": 1691 + }, + { + "epoch": 1.05555338821051, + "grad_norm": 0.7489281892776489, + "learning_rate": 3.232e-07, + "loss": 0.3237, + "mean_token_accuracy": 0.9276138804852962, + "num_tokens": 121957701.0, + "step": 1692 + }, + { + "epoch": 1.0561775835836618, + "grad_norm": 0.28756335377693176, + "learning_rate": 3.2279999999999995e-07, + "loss": 0.265, + "mean_token_accuracy": 0.9336897656321526, + "num_tokens": 122032727.0, + "step": 1693 + }, + { + "epoch": 1.0568017789568134, + "grad_norm": 0.20763228833675385, + "learning_rate": 3.2240000000000003e-07, + "loss": 0.3058, + "mean_token_accuracy": 0.9265397749841213, + "num_tokens": 122105061.0, + "step": 1694 + }, + { + "epoch": 1.0574259743299652, + "grad_norm": 0.24599571526050568, + "learning_rate": 3.22e-07, + "loss": 0.334, + "mean_token_accuracy": 0.9207298420369625, + "num_tokens": 122177441.0, + "step": 1695 + }, + { + "epoch": 1.058050169703117, + "grad_norm": 0.21456308662891388, + "learning_rate": 3.2159999999999997e-07, + "loss": 0.2774, + "mean_token_accuracy": 0.9344044476747513, + "num_tokens": 122251941.0, + "step": 1696 + }, + { + "epoch": 1.0586743650762689, + "grad_norm": 0.21670344471931458, + "learning_rate": 3.212e-07, + "loss": 0.3128, + "mean_token_accuracy": 0.9210187308490276, + "num_tokens": 122323011.0, + "step": 1697 + }, + { + "epoch": 1.0592985604494207, + "grad_norm": 0.3766954839229584, + "learning_rate": 3.2079999999999996e-07, + "loss": 0.2928, + "mean_token_accuracy": 0.9298055320978165, + "num_tokens": 122395745.0, + "step": 1698 + }, + { + "epoch": 1.0599227558225726, + "grad_norm": 0.2169620543718338, + "learning_rate": 3.204e-07, + "loss": 0.2491, + "mean_token_accuracy": 0.9375383034348488, + "num_tokens": 122474779.0, + "step": 1699 + }, + { + "epoch": 1.0605469511957242, + "grad_norm": 0.35015055537223816, + "learning_rate": 3.2e-07, + "loss": 0.2824, + "mean_token_accuracy": 0.9323598779737949, + "num_tokens": 122549702.0, + "step": 1700 + }, + { + "epoch": 1.061171146568876, + "grad_norm": 0.373374879360199, + "learning_rate": 3.196e-07, + "loss": 0.3095, + "mean_token_accuracy": 0.9285491108894348, + "num_tokens": 122624726.0, + "step": 1701 + }, + { + "epoch": 1.0617953419420278, + "grad_norm": 0.30340754985809326, + "learning_rate": 3.1919999999999995e-07, + "loss": 0.3468, + "mean_token_accuracy": 0.9146638326346874, + "num_tokens": 122691537.0, + "step": 1702 + }, + { + "epoch": 1.0624195373151797, + "grad_norm": 0.23513586819171906, + "learning_rate": 3.1879999999999997e-07, + "loss": 0.2797, + "mean_token_accuracy": 0.9326867535710335, + "num_tokens": 122767129.0, + "step": 1703 + }, + { + "epoch": 1.0630437326883315, + "grad_norm": 0.3706021308898926, + "learning_rate": 3.184e-07, + "loss": 0.2779, + "mean_token_accuracy": 0.9286573939025402, + "num_tokens": 122841846.0, + "step": 1704 + }, + { + "epoch": 1.0636679280614834, + "grad_norm": 0.47585761547088623, + "learning_rate": 3.18e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9307053536176682, + "num_tokens": 122917383.0, + "step": 1705 + }, + { + "epoch": 1.064292123434635, + "grad_norm": 0.22225724160671234, + "learning_rate": 3.176e-07, + "loss": 0.2912, + "mean_token_accuracy": 0.9256017580628395, + "num_tokens": 122988864.0, + "step": 1706 + }, + { + "epoch": 1.0649163188077868, + "grad_norm": 0.35081037878990173, + "learning_rate": 3.1719999999999996e-07, + "loss": 0.3147, + "mean_token_accuracy": 0.9226274192333221, + "num_tokens": 123055828.0, + "step": 1707 + }, + { + "epoch": 1.0655405141809386, + "grad_norm": 0.25680631399154663, + "learning_rate": 3.1680000000000003e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9267714656889439, + "num_tokens": 123129888.0, + "step": 1708 + }, + { + "epoch": 1.0661647095540905, + "grad_norm": 0.8237215876579285, + "learning_rate": 3.164e-07, + "loss": 0.3179, + "mean_token_accuracy": 0.9253286980092525, + "num_tokens": 123199793.0, + "step": 1709 + }, + { + "epoch": 1.0667889049272423, + "grad_norm": 0.2593429684638977, + "learning_rate": 3.1599999999999997e-07, + "loss": 0.3042, + "mean_token_accuracy": 0.9251321293413639, + "num_tokens": 123269405.0, + "step": 1710 + }, + { + "epoch": 1.067413100300394, + "grad_norm": 0.3194665312767029, + "learning_rate": 3.156e-07, + "loss": 0.3274, + "mean_token_accuracy": 0.9202439375221729, + "num_tokens": 123342515.0, + "step": 1711 + }, + { + "epoch": 1.0680372956735458, + "grad_norm": 0.21264804899692535, + "learning_rate": 3.1519999999999996e-07, + "loss": 0.317, + "mean_token_accuracy": 0.922949992120266, + "num_tokens": 123413118.0, + "step": 1712 + }, + { + "epoch": 1.0686614910466976, + "grad_norm": 0.3277319371700287, + "learning_rate": 3.148e-07, + "loss": 0.3016, + "mean_token_accuracy": 0.9298577718436718, + "num_tokens": 123486119.0, + "step": 1713 + }, + { + "epoch": 1.0692856864198494, + "grad_norm": 2.1353938579559326, + "learning_rate": 3.144e-07, + "loss": 0.2969, + "mean_token_accuracy": 0.9313802868127823, + "num_tokens": 123556848.0, + "step": 1714 + }, + { + "epoch": 1.0699098817930013, + "grad_norm": 0.813490092754364, + "learning_rate": 3.14e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.927032820880413, + "num_tokens": 123628283.0, + "step": 1715 + }, + { + "epoch": 1.070534077166153, + "grad_norm": 0.3159823715686798, + "learning_rate": 3.1359999999999995e-07, + "loss": 0.3053, + "mean_token_accuracy": 0.9242353849112988, + "num_tokens": 123701286.0, + "step": 1716 + }, + { + "epoch": 1.0711582725393047, + "grad_norm": 0.3541158437728882, + "learning_rate": 3.1319999999999997e-07, + "loss": 0.3013, + "mean_token_accuracy": 0.9261049143970013, + "num_tokens": 123773087.0, + "step": 1717 + }, + { + "epoch": 1.0717824679124566, + "grad_norm": 0.2857579290866852, + "learning_rate": 3.128e-07, + "loss": 0.2998, + "mean_token_accuracy": 0.9278123453259468, + "num_tokens": 123849690.0, + "step": 1718 + }, + { + "epoch": 1.0724066632856084, + "grad_norm": 0.22061029076576233, + "learning_rate": 3.124e-07, + "loss": 0.2737, + "mean_token_accuracy": 0.9319339953362942, + "num_tokens": 123921812.0, + "step": 1719 + }, + { + "epoch": 1.0730308586587602, + "grad_norm": 1.4682681560516357, + "learning_rate": 3.12e-07, + "loss": 0.2912, + "mean_token_accuracy": 0.929526012390852, + "num_tokens": 123995497.0, + "step": 1720 + }, + { + "epoch": 1.073655054031912, + "grad_norm": 0.2545544505119324, + "learning_rate": 3.1159999999999996e-07, + "loss": 0.3342, + "mean_token_accuracy": 0.9215426780283451, + "num_tokens": 124065183.0, + "step": 1721 + }, + { + "epoch": 1.0742792494050637, + "grad_norm": 0.4985513389110565, + "learning_rate": 3.112e-07, + "loss": 0.3076, + "mean_token_accuracy": 0.9268179088830948, + "num_tokens": 124140655.0, + "step": 1722 + }, + { + "epoch": 1.0749034447782155, + "grad_norm": 0.21433235704898834, + "learning_rate": 3.108e-07, + "loss": 0.2897, + "mean_token_accuracy": 0.9324699901044369, + "num_tokens": 124213648.0, + "step": 1723 + }, + { + "epoch": 1.0755276401513674, + "grad_norm": 0.4376954138278961, + "learning_rate": 3.104e-07, + "loss": 0.2976, + "mean_token_accuracy": 0.9300878085196018, + "num_tokens": 124289210.0, + "step": 1724 + }, + { + "epoch": 1.0761518355245192, + "grad_norm": 0.910935640335083, + "learning_rate": 3.1e-07, + "loss": 0.2841, + "mean_token_accuracy": 0.9342397525906563, + "num_tokens": 124363564.0, + "step": 1725 + }, + { + "epoch": 1.076776030897671, + "grad_norm": 0.28917619585990906, + "learning_rate": 3.0959999999999997e-07, + "loss": 0.2831, + "mean_token_accuracy": 0.9278599992394447, + "num_tokens": 124432894.0, + "step": 1726 + }, + { + "epoch": 1.0774002262708229, + "grad_norm": 0.25585755705833435, + "learning_rate": 3.0919999999999994e-07, + "loss": 0.305, + "mean_token_accuracy": 0.9333919212222099, + "num_tokens": 124506210.0, + "step": 1727 + }, + { + "epoch": 1.0780244216439745, + "grad_norm": 0.2665160894393921, + "learning_rate": 3.088e-07, + "loss": 0.3161, + "mean_token_accuracy": 0.9263594783842564, + "num_tokens": 124578682.0, + "step": 1728 + }, + { + "epoch": 1.0786486170171263, + "grad_norm": 0.17166045308113098, + "learning_rate": 3.084e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9228389635682106, + "num_tokens": 124651849.0, + "step": 1729 + }, + { + "epoch": 1.0792728123902782, + "grad_norm": 0.2234860211610794, + "learning_rate": 3.08e-07, + "loss": 0.3043, + "mean_token_accuracy": 0.9272056445479393, + "num_tokens": 124723598.0, + "step": 1730 + }, + { + "epoch": 1.07989700776343, + "grad_norm": 0.21245871484279633, + "learning_rate": 3.076e-07, + "loss": 0.2995, + "mean_token_accuracy": 0.9266148433089256, + "num_tokens": 124794093.0, + "step": 1731 + }, + { + "epoch": 1.0805212031365818, + "grad_norm": 0.27061209082603455, + "learning_rate": 3.0719999999999995e-07, + "loss": 0.3237, + "mean_token_accuracy": 0.9217249415814877, + "num_tokens": 124861275.0, + "step": 1732 + }, + { + "epoch": 1.0811453985097335, + "grad_norm": 0.5388357639312744, + "learning_rate": 3.068e-07, + "loss": 0.2803, + "mean_token_accuracy": 0.9316447749733925, + "num_tokens": 124933281.0, + "step": 1733 + }, + { + "epoch": 1.0817695938828853, + "grad_norm": 0.2067924588918686, + "learning_rate": 3.064e-07, + "loss": 0.2979, + "mean_token_accuracy": 0.926850039511919, + "num_tokens": 125001694.0, + "step": 1734 + }, + { + "epoch": 1.0823937892560371, + "grad_norm": 0.2736384868621826, + "learning_rate": 3.0599999999999996e-07, + "loss": 0.2996, + "mean_token_accuracy": 0.9287301003932953, + "num_tokens": 125075725.0, + "step": 1735 + }, + { + "epoch": 1.083017984629189, + "grad_norm": 0.24116985499858856, + "learning_rate": 3.056e-07, + "loss": 0.3119, + "mean_token_accuracy": 0.9256196171045303, + "num_tokens": 125144594.0, + "step": 1736 + }, + { + "epoch": 1.0836421800023408, + "grad_norm": 0.21359170973300934, + "learning_rate": 3.052e-07, + "loss": 0.2879, + "mean_token_accuracy": 0.9316459558904171, + "num_tokens": 125218795.0, + "step": 1737 + }, + { + "epoch": 1.0842663753754924, + "grad_norm": 0.2494240552186966, + "learning_rate": 3.048e-07, + "loss": 0.3318, + "mean_token_accuracy": 0.9167494550347328, + "num_tokens": 125287741.0, + "step": 1738 + }, + { + "epoch": 1.0848905707486443, + "grad_norm": 0.3069607615470886, + "learning_rate": 3.044e-07, + "loss": 0.2788, + "mean_token_accuracy": 0.9359465390443802, + "num_tokens": 125362797.0, + "step": 1739 + }, + { + "epoch": 1.085514766121796, + "grad_norm": 0.2588959038257599, + "learning_rate": 3.0399999999999997e-07, + "loss": 0.3161, + "mean_token_accuracy": 0.9266990460455418, + "num_tokens": 125434659.0, + "step": 1740 + }, + { + "epoch": 1.086138961494948, + "grad_norm": 0.1978151947259903, + "learning_rate": 3.036e-07, + "loss": 0.3184, + "mean_token_accuracy": 0.9218169339001179, + "num_tokens": 125507248.0, + "step": 1741 + }, + { + "epoch": 1.0867631568680998, + "grad_norm": 0.3481571078300476, + "learning_rate": 3.032e-07, + "loss": 0.3034, + "mean_token_accuracy": 0.9269473142921925, + "num_tokens": 125576237.0, + "step": 1742 + }, + { + "epoch": 1.0873873522412516, + "grad_norm": 0.19139768183231354, + "learning_rate": 3.028e-07, + "loss": 0.3415, + "mean_token_accuracy": 0.916395116597414, + "num_tokens": 125646256.0, + "step": 1743 + }, + { + "epoch": 1.0880115476144032, + "grad_norm": 0.21481899917125702, + "learning_rate": 3.024e-07, + "loss": 0.2716, + "mean_token_accuracy": 0.9357072673738003, + "num_tokens": 125718249.0, + "step": 1744 + }, + { + "epoch": 1.088635742987555, + "grad_norm": 0.2420273721218109, + "learning_rate": 3.02e-07, + "loss": 0.2927, + "mean_token_accuracy": 0.9292151406407356, + "num_tokens": 125795595.0, + "step": 1745 + }, + { + "epoch": 1.089259938360707, + "grad_norm": 2.0011966228485107, + "learning_rate": 3.0159999999999995e-07, + "loss": 0.3032, + "mean_token_accuracy": 0.9290259294211864, + "num_tokens": 125867425.0, + "step": 1746 + }, + { + "epoch": 1.0898841337338587, + "grad_norm": 0.21148931980133057, + "learning_rate": 3.012e-07, + "loss": 0.3128, + "mean_token_accuracy": 0.926516305655241, + "num_tokens": 125940765.0, + "step": 1747 + }, + { + "epoch": 1.0905083291070106, + "grad_norm": 0.22971081733703613, + "learning_rate": 3.008e-07, + "loss": 0.2548, + "mean_token_accuracy": 0.9389605522155762, + "num_tokens": 126020037.0, + "step": 1748 + }, + { + "epoch": 1.0911325244801624, + "grad_norm": 0.20571602880954742, + "learning_rate": 3.0039999999999996e-07, + "loss": 0.3248, + "mean_token_accuracy": 0.9218075796961784, + "num_tokens": 126091322.0, + "step": 1749 + }, + { + "epoch": 1.091756719853314, + "grad_norm": 0.3938194215297699, + "learning_rate": 3e-07, + "loss": 0.2667, + "mean_token_accuracy": 0.9369597919285297, + "num_tokens": 126168689.0, + "step": 1750 + }, + { + "epoch": 1.0923809152264659, + "grad_norm": 0.19667664170265198, + "learning_rate": 2.9959999999999996e-07, + "loss": 0.2815, + "mean_token_accuracy": 0.9315198622643948, + "num_tokens": 126242242.0, + "step": 1751 + }, + { + "epoch": 1.0930051105996177, + "grad_norm": 0.19590339064598083, + "learning_rate": 2.9920000000000003e-07, + "loss": 0.259, + "mean_token_accuracy": 0.9365727752447128, + "num_tokens": 126317680.0, + "step": 1752 + }, + { + "epoch": 1.0936293059727695, + "grad_norm": 0.263824462890625, + "learning_rate": 2.988e-07, + "loss": 0.289, + "mean_token_accuracy": 0.9279736876487732, + "num_tokens": 126388428.0, + "step": 1753 + }, + { + "epoch": 1.0942535013459214, + "grad_norm": 0.2211267054080963, + "learning_rate": 2.9839999999999997e-07, + "loss": 0.2946, + "mean_token_accuracy": 0.9267088025808334, + "num_tokens": 126457585.0, + "step": 1754 + }, + { + "epoch": 1.094877696719073, + "grad_norm": 0.19925707578659058, + "learning_rate": 2.98e-07, + "loss": 0.3265, + "mean_token_accuracy": 0.9196182824671268, + "num_tokens": 126526377.0, + "step": 1755 + }, + { + "epoch": 1.0955018920922248, + "grad_norm": 0.2817637622356415, + "learning_rate": 2.9759999999999996e-07, + "loss": 0.2649, + "mean_token_accuracy": 0.9379608407616615, + "num_tokens": 126599730.0, + "step": 1756 + }, + { + "epoch": 1.0961260874653767, + "grad_norm": 1.557137370109558, + "learning_rate": 2.972e-07, + "loss": 0.3093, + "mean_token_accuracy": 0.926731813699007, + "num_tokens": 126675846.0, + "step": 1757 + }, + { + "epoch": 1.0967502828385285, + "grad_norm": 0.23826085031032562, + "learning_rate": 2.968e-07, + "loss": 0.2976, + "mean_token_accuracy": 0.9266126155853271, + "num_tokens": 126745326.0, + "step": 1758 + }, + { + "epoch": 1.0973744782116803, + "grad_norm": 0.5650027990341187, + "learning_rate": 2.964e-07, + "loss": 0.3383, + "mean_token_accuracy": 0.9220468513667583, + "num_tokens": 126814067.0, + "step": 1759 + }, + { + "epoch": 1.097998673584832, + "grad_norm": 0.44480961561203003, + "learning_rate": 2.9599999999999995e-07, + "loss": 0.2896, + "mean_token_accuracy": 0.9340163506567478, + "num_tokens": 126889928.0, + "step": 1760 + }, + { + "epoch": 1.0986228689579838, + "grad_norm": 0.20387758314609528, + "learning_rate": 2.9559999999999997e-07, + "loss": 0.2978, + "mean_token_accuracy": 0.9282066188752651, + "num_tokens": 126963387.0, + "step": 1761 + }, + { + "epoch": 1.0992470643311356, + "grad_norm": 0.23117607831954956, + "learning_rate": 2.952e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9271444529294968, + "num_tokens": 127034288.0, + "step": 1762 + }, + { + "epoch": 1.0998712597042875, + "grad_norm": 0.1723998636007309, + "learning_rate": 2.948e-07, + "loss": 0.2781, + "mean_token_accuracy": 0.9293723665177822, + "num_tokens": 127113845.0, + "step": 1763 + }, + { + "epoch": 1.1004954550774393, + "grad_norm": 0.35138267278671265, + "learning_rate": 2.944e-07, + "loss": 0.307, + "mean_token_accuracy": 0.9217171892523766, + "num_tokens": 127184299.0, + "step": 1764 + }, + { + "epoch": 1.1011196504505911, + "grad_norm": 0.2981915771961212, + "learning_rate": 2.9399999999999996e-07, + "loss": 0.3241, + "mean_token_accuracy": 0.9226033426821232, + "num_tokens": 127254392.0, + "step": 1765 + }, + { + "epoch": 1.1017438458237427, + "grad_norm": 0.3267624080181122, + "learning_rate": 2.9360000000000003e-07, + "loss": 0.2995, + "mean_token_accuracy": 0.9290510676801205, + "num_tokens": 127328436.0, + "step": 1766 + }, + { + "epoch": 1.1023680411968946, + "grad_norm": 0.2883751392364502, + "learning_rate": 2.932e-07, + "loss": 0.2994, + "mean_token_accuracy": 0.9295095764100552, + "num_tokens": 127402731.0, + "step": 1767 + }, + { + "epoch": 1.1029922365700464, + "grad_norm": 0.21919399499893188, + "learning_rate": 2.928e-07, + "loss": 0.2867, + "mean_token_accuracy": 0.9286461621522903, + "num_tokens": 127473231.0, + "step": 1768 + }, + { + "epoch": 1.1036164319431983, + "grad_norm": 0.2171352356672287, + "learning_rate": 2.924e-07, + "loss": 0.2713, + "mean_token_accuracy": 0.9339998848736286, + "num_tokens": 127544993.0, + "step": 1769 + }, + { + "epoch": 1.10424062731635, + "grad_norm": 0.44201549887657166, + "learning_rate": 2.9199999999999997e-07, + "loss": 0.2642, + "mean_token_accuracy": 0.9347834549844265, + "num_tokens": 127622302.0, + "step": 1770 + }, + { + "epoch": 1.104864822689502, + "grad_norm": 0.31703656911849976, + "learning_rate": 2.916e-07, + "loss": 0.3128, + "mean_token_accuracy": 0.924625538289547, + "num_tokens": 127694523.0, + "step": 1771 + }, + { + "epoch": 1.1054890180626535, + "grad_norm": 0.4020998179912567, + "learning_rate": 2.912e-07, + "loss": 0.2779, + "mean_token_accuracy": 0.9329924695193768, + "num_tokens": 127767960.0, + "step": 1772 + }, + { + "epoch": 1.1061132134358054, + "grad_norm": 0.4652239978313446, + "learning_rate": 2.908e-07, + "loss": 0.3439, + "mean_token_accuracy": 0.9155332818627357, + "num_tokens": 127837780.0, + "step": 1773 + }, + { + "epoch": 1.1067374088089572, + "grad_norm": 0.21311232447624207, + "learning_rate": 2.9039999999999995e-07, + "loss": 0.2963, + "mean_token_accuracy": 0.9291681200265884, + "num_tokens": 127909906.0, + "step": 1774 + }, + { + "epoch": 1.107361604182109, + "grad_norm": 0.3185514807701111, + "learning_rate": 2.9e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.9222244806587696, + "num_tokens": 127983878.0, + "step": 1775 + }, + { + "epoch": 1.1079857995552609, + "grad_norm": 0.3171515464782715, + "learning_rate": 2.896e-07, + "loss": 0.2849, + "mean_token_accuracy": 0.9316718392074108, + "num_tokens": 128056358.0, + "step": 1776 + }, + { + "epoch": 1.1086099949284125, + "grad_norm": 0.21025316417217255, + "learning_rate": 2.892e-07, + "loss": 0.3147, + "mean_token_accuracy": 0.9289660938084126, + "num_tokens": 128127520.0, + "step": 1777 + }, + { + "epoch": 1.1092341903015643, + "grad_norm": 0.27649930119514465, + "learning_rate": 2.888e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9280391745269299, + "num_tokens": 128199549.0, + "step": 1778 + }, + { + "epoch": 1.1098583856747162, + "grad_norm": 0.3147198557853699, + "learning_rate": 2.8839999999999996e-07, + "loss": 0.3366, + "mean_token_accuracy": 0.9205878898501396, + "num_tokens": 128269024.0, + "step": 1779 + }, + { + "epoch": 1.110482581047868, + "grad_norm": 0.2495032250881195, + "learning_rate": 2.88e-07, + "loss": 0.2986, + "mean_token_accuracy": 0.9252235107123852, + "num_tokens": 128338119.0, + "step": 1780 + }, + { + "epoch": 1.1111067764210198, + "grad_norm": 0.24749630689620972, + "learning_rate": 2.876e-07, + "loss": 0.3061, + "mean_token_accuracy": 0.9256924614310265, + "num_tokens": 128409444.0, + "step": 1781 + }, + { + "epoch": 1.1117309717941715, + "grad_norm": 0.22386480867862701, + "learning_rate": 2.872e-07, + "loss": 0.2993, + "mean_token_accuracy": 0.9266157075762749, + "num_tokens": 128484469.0, + "step": 1782 + }, + { + "epoch": 1.1123551671673233, + "grad_norm": 0.8728765249252319, + "learning_rate": 2.868e-07, + "loss": 0.294, + "mean_token_accuracy": 0.9282168969511986, + "num_tokens": 128556904.0, + "step": 1783 + }, + { + "epoch": 1.1129793625404751, + "grad_norm": 0.27757230401039124, + "learning_rate": 2.8639999999999997e-07, + "loss": 0.2922, + "mean_token_accuracy": 0.9269283674657345, + "num_tokens": 128628549.0, + "step": 1784 + }, + { + "epoch": 1.113603557913627, + "grad_norm": 0.22034601867198944, + "learning_rate": 2.8599999999999994e-07, + "loss": 0.2879, + "mean_token_accuracy": 0.9330173693597317, + "num_tokens": 128701644.0, + "step": 1785 + }, + { + "epoch": 1.1142277532867788, + "grad_norm": 0.19902749359607697, + "learning_rate": 2.856e-07, + "loss": 0.3067, + "mean_token_accuracy": 0.9243056252598763, + "num_tokens": 128773917.0, + "step": 1786 + }, + { + "epoch": 1.1148519486599306, + "grad_norm": 0.37048718333244324, + "learning_rate": 2.852e-07, + "loss": 0.3352, + "mean_token_accuracy": 0.9219833053648472, + "num_tokens": 128843355.0, + "step": 1787 + }, + { + "epoch": 1.1154761440330823, + "grad_norm": 0.2669626772403717, + "learning_rate": 2.848e-07, + "loss": 0.3147, + "mean_token_accuracy": 0.9261269830167294, + "num_tokens": 128911507.0, + "step": 1788 + }, + { + "epoch": 1.116100339406234, + "grad_norm": 0.3155808448791504, + "learning_rate": 2.844e-07, + "loss": 0.2941, + "mean_token_accuracy": 0.9300140291452408, + "num_tokens": 128979162.0, + "step": 1789 + }, + { + "epoch": 1.116724534779386, + "grad_norm": 0.3582897484302521, + "learning_rate": 2.8399999999999995e-07, + "loss": 0.2991, + "mean_token_accuracy": 0.9302190206944942, + "num_tokens": 129053511.0, + "step": 1790 + }, + { + "epoch": 1.1173487301525378, + "grad_norm": 0.2463095337152481, + "learning_rate": 2.836e-07, + "loss": 0.3046, + "mean_token_accuracy": 0.9281173199415207, + "num_tokens": 129123440.0, + "step": 1791 + }, + { + "epoch": 1.1179729255256896, + "grad_norm": 0.20070959627628326, + "learning_rate": 2.832e-07, + "loss": 0.32, + "mean_token_accuracy": 0.921328492462635, + "num_tokens": 129191027.0, + "step": 1792 + }, + { + "epoch": 1.1185971208988414, + "grad_norm": 0.20829720795154572, + "learning_rate": 2.8279999999999996e-07, + "loss": 0.3162, + "mean_token_accuracy": 0.9224825501441956, + "num_tokens": 129262682.0, + "step": 1793 + }, + { + "epoch": 1.119221316271993, + "grad_norm": 0.2078937590122223, + "learning_rate": 2.824e-07, + "loss": 0.3205, + "mean_token_accuracy": 0.9259395375847816, + "num_tokens": 129334446.0, + "step": 1794 + }, + { + "epoch": 1.119845511645145, + "grad_norm": 0.8026764988899231, + "learning_rate": 2.8199999999999996e-07, + "loss": 0.3274, + "mean_token_accuracy": 0.921380702406168, + "num_tokens": 129404674.0, + "step": 1795 + }, + { + "epoch": 1.1204697070182967, + "grad_norm": 0.878032386302948, + "learning_rate": 2.816e-07, + "loss": 0.3286, + "mean_token_accuracy": 0.9226947911083698, + "num_tokens": 129472312.0, + "step": 1796 + }, + { + "epoch": 1.1210939023914486, + "grad_norm": 0.236044779419899, + "learning_rate": 2.812e-07, + "loss": 0.3155, + "mean_token_accuracy": 0.9229910857975483, + "num_tokens": 129543929.0, + "step": 1797 + }, + { + "epoch": 1.1217180977646004, + "grad_norm": 0.24110299348831177, + "learning_rate": 2.8079999999999997e-07, + "loss": 0.2972, + "mean_token_accuracy": 0.9291915595531464, + "num_tokens": 129614452.0, + "step": 1798 + }, + { + "epoch": 1.122342293137752, + "grad_norm": 0.20963750779628754, + "learning_rate": 2.804e-07, + "loss": 0.2666, + "mean_token_accuracy": 0.935639638453722, + "num_tokens": 129689138.0, + "step": 1799 + }, + { + "epoch": 1.1229664885109039, + "grad_norm": 0.22237470746040344, + "learning_rate": 2.8e-07, + "loss": 0.2671, + "mean_token_accuracy": 0.9358622543513775, + "num_tokens": 129764566.0, + "step": 1800 + }, + { + "epoch": 1.1235906838840557, + "grad_norm": 0.24199751019477844, + "learning_rate": 2.796e-07, + "loss": 0.3144, + "mean_token_accuracy": 0.9241049475967884, + "num_tokens": 129830533.0, + "step": 1801 + }, + { + "epoch": 1.1242148792572075, + "grad_norm": 0.19200876355171204, + "learning_rate": 2.792e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.925644475966692, + "num_tokens": 129902294.0, + "step": 1802 + }, + { + "epoch": 1.1248390746303594, + "grad_norm": 0.21629273891448975, + "learning_rate": 2.788e-07, + "loss": 0.2963, + "mean_token_accuracy": 0.9291038624942303, + "num_tokens": 129971415.0, + "step": 1803 + }, + { + "epoch": 1.125463270003511, + "grad_norm": 0.23822374641895294, + "learning_rate": 2.7839999999999995e-07, + "loss": 0.3162, + "mean_token_accuracy": 0.9257005117833614, + "num_tokens": 130041079.0, + "step": 1804 + }, + { + "epoch": 1.1260874653766628, + "grad_norm": 0.26798659563064575, + "learning_rate": 2.7800000000000003e-07, + "loss": 0.27, + "mean_token_accuracy": 0.934216570109129, + "num_tokens": 130114219.0, + "step": 1805 + }, + { + "epoch": 1.1267116607498147, + "grad_norm": 0.2028493881225586, + "learning_rate": 2.776e-07, + "loss": 0.2923, + "mean_token_accuracy": 0.9314211085438728, + "num_tokens": 130184815.0, + "step": 1806 + }, + { + "epoch": 1.1273358561229665, + "grad_norm": 0.21209344267845154, + "learning_rate": 2.7719999999999997e-07, + "loss": 0.2633, + "mean_token_accuracy": 0.9350503049790859, + "num_tokens": 130260521.0, + "step": 1807 + }, + { + "epoch": 1.1279600514961183, + "grad_norm": 0.2654914855957031, + "learning_rate": 2.768e-07, + "loss": 0.3179, + "mean_token_accuracy": 0.9189272932708263, + "num_tokens": 130329323.0, + "step": 1808 + }, + { + "epoch": 1.1285842468692702, + "grad_norm": 0.22222192585468292, + "learning_rate": 2.7639999999999996e-07, + "loss": 0.3308, + "mean_token_accuracy": 0.9152827970683575, + "num_tokens": 130395793.0, + "step": 1809 + }, + { + "epoch": 1.1292084422424218, + "grad_norm": 0.7466000914573669, + "learning_rate": 2.7600000000000004e-07, + "loss": 0.312, + "mean_token_accuracy": 0.9242708086967468, + "num_tokens": 130463435.0, + "step": 1810 + }, + { + "epoch": 1.1298326376155736, + "grad_norm": 0.6550260782241821, + "learning_rate": 2.756e-07, + "loss": 0.2662, + "mean_token_accuracy": 0.9367930218577385, + "num_tokens": 130539506.0, + "step": 1811 + }, + { + "epoch": 1.1304568329887255, + "grad_norm": 0.1838720738887787, + "learning_rate": 2.752e-07, + "loss": 0.3253, + "mean_token_accuracy": 0.924756396561861, + "num_tokens": 130613320.0, + "step": 1812 + }, + { + "epoch": 1.1310810283618773, + "grad_norm": 0.22249767184257507, + "learning_rate": 2.748e-07, + "loss": 0.287, + "mean_token_accuracy": 0.9310209937393665, + "num_tokens": 130684636.0, + "step": 1813 + }, + { + "epoch": 1.1317052237350291, + "grad_norm": 0.2860652804374695, + "learning_rate": 2.7439999999999997e-07, + "loss": 0.2995, + "mean_token_accuracy": 0.9270205423235893, + "num_tokens": 130760707.0, + "step": 1814 + }, + { + "epoch": 1.132329419108181, + "grad_norm": 0.18419241905212402, + "learning_rate": 2.74e-07, + "loss": 0.2976, + "mean_token_accuracy": 0.9259940758347511, + "num_tokens": 130832240.0, + "step": 1815 + }, + { + "epoch": 1.1329536144813326, + "grad_norm": 0.27973178029060364, + "learning_rate": 2.736e-07, + "loss": 0.3092, + "mean_token_accuracy": 0.9277214109897614, + "num_tokens": 130904258.0, + "step": 1816 + }, + { + "epoch": 1.1335778098544844, + "grad_norm": 0.25905564427375793, + "learning_rate": 2.732e-07, + "loss": 0.3025, + "mean_token_accuracy": 0.9278492033481598, + "num_tokens": 130978820.0, + "step": 1817 + }, + { + "epoch": 1.1342020052276363, + "grad_norm": 0.36149126291275024, + "learning_rate": 2.7279999999999995e-07, + "loss": 0.3224, + "mean_token_accuracy": 0.9216597154736519, + "num_tokens": 131048320.0, + "step": 1818 + }, + { + "epoch": 1.134826200600788, + "grad_norm": 0.6759888529777527, + "learning_rate": 2.724e-07, + "loss": 0.3123, + "mean_token_accuracy": 0.9254679456353188, + "num_tokens": 131118199.0, + "step": 1819 + }, + { + "epoch": 1.13545039597394, + "grad_norm": 0.24249523878097534, + "learning_rate": 2.72e-07, + "loss": 0.2839, + "mean_token_accuracy": 0.932914275676012, + "num_tokens": 131192594.0, + "step": 1820 + }, + { + "epoch": 1.1360745913470915, + "grad_norm": 0.18677254021167755, + "learning_rate": 2.7159999999999997e-07, + "loss": 0.335, + "mean_token_accuracy": 0.9248083718121052, + "num_tokens": 131263001.0, + "step": 1821 + }, + { + "epoch": 1.1366987867202434, + "grad_norm": 0.3140883147716522, + "learning_rate": 2.712e-07, + "loss": 0.3013, + "mean_token_accuracy": 0.9304417856037617, + "num_tokens": 131339775.0, + "step": 1822 + }, + { + "epoch": 1.1373229820933952, + "grad_norm": 0.29275473952293396, + "learning_rate": 2.7079999999999996e-07, + "loss": 0.2912, + "mean_token_accuracy": 0.9277708381414413, + "num_tokens": 131411043.0, + "step": 1823 + }, + { + "epoch": 1.137947177466547, + "grad_norm": 0.41218438744544983, + "learning_rate": 2.704e-07, + "loss": 0.3053, + "mean_token_accuracy": 0.9323998130857944, + "num_tokens": 131483582.0, + "step": 1824 + }, + { + "epoch": 1.138571372839699, + "grad_norm": 0.37196704745292664, + "learning_rate": 2.7e-07, + "loss": 0.2742, + "mean_token_accuracy": 0.9367691725492477, + "num_tokens": 131557859.0, + "step": 1825 + }, + { + "epoch": 1.1391955682128505, + "grad_norm": 0.22672373056411743, + "learning_rate": 2.696e-07, + "loss": 0.3358, + "mean_token_accuracy": 0.9178244844079018, + "num_tokens": 131624570.0, + "step": 1826 + }, + { + "epoch": 1.1398197635860023, + "grad_norm": 0.43422314524650574, + "learning_rate": 2.692e-07, + "loss": 0.3009, + "mean_token_accuracy": 0.9280944280326366, + "num_tokens": 131703663.0, + "step": 1827 + }, + { + "epoch": 1.1404439589591542, + "grad_norm": 0.20988324284553528, + "learning_rate": 2.6879999999999997e-07, + "loss": 0.2946, + "mean_token_accuracy": 0.9268174730241299, + "num_tokens": 131778327.0, + "step": 1828 + }, + { + "epoch": 1.141068154332306, + "grad_norm": 0.22549273073673248, + "learning_rate": 2.684e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9286224097013474, + "num_tokens": 131852111.0, + "step": 1829 + }, + { + "epoch": 1.1416923497054579, + "grad_norm": 0.24371758103370667, + "learning_rate": 2.68e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9222479052841663, + "num_tokens": 131925100.0, + "step": 1830 + }, + { + "epoch": 1.1423165450786097, + "grad_norm": 0.17896609008312225, + "learning_rate": 2.676e-07, + "loss": 0.2909, + "mean_token_accuracy": 0.928986020386219, + "num_tokens": 131999822.0, + "step": 1831 + }, + { + "epoch": 1.1429407404517613, + "grad_norm": 0.33197128772735596, + "learning_rate": 2.6719999999999996e-07, + "loss": 0.2972, + "mean_token_accuracy": 0.9308925606310368, + "num_tokens": 132072334.0, + "step": 1832 + }, + { + "epoch": 1.1435649358249131, + "grad_norm": 0.22474373877048492, + "learning_rate": 2.668e-07, + "loss": 0.3048, + "mean_token_accuracy": 0.9309780970215797, + "num_tokens": 132142837.0, + "step": 1833 + }, + { + "epoch": 1.144189131198065, + "grad_norm": 0.19098779559135437, + "learning_rate": 2.664e-07, + "loss": 0.3148, + "mean_token_accuracy": 0.9202582724392414, + "num_tokens": 132212765.0, + "step": 1834 + }, + { + "epoch": 1.1448133265712168, + "grad_norm": 0.7869068384170532, + "learning_rate": 2.66e-07, + "loss": 0.3079, + "mean_token_accuracy": 0.9237210862338543, + "num_tokens": 132286804.0, + "step": 1835 + }, + { + "epoch": 1.1454375219443687, + "grad_norm": 0.3246939182281494, + "learning_rate": 2.656e-07, + "loss": 0.3163, + "mean_token_accuracy": 0.9224759265780449, + "num_tokens": 132357747.0, + "step": 1836 + }, + { + "epoch": 1.1460617173175205, + "grad_norm": 0.21558405458927155, + "learning_rate": 2.6519999999999997e-07, + "loss": 0.2839, + "mean_token_accuracy": 0.9333493672311306, + "num_tokens": 132428828.0, + "step": 1837 + }, + { + "epoch": 1.146685912690672, + "grad_norm": 0.32068946957588196, + "learning_rate": 2.648e-07, + "loss": 0.3369, + "mean_token_accuracy": 0.920955915004015, + "num_tokens": 132499839.0, + "step": 1838 + }, + { + "epoch": 1.147310108063824, + "grad_norm": 0.19209101796150208, + "learning_rate": 2.644e-07, + "loss": 0.3299, + "mean_token_accuracy": 0.9184231348335743, + "num_tokens": 132572079.0, + "step": 1839 + }, + { + "epoch": 1.1479343034369758, + "grad_norm": 0.21535316109657288, + "learning_rate": 2.64e-07, + "loss": 0.2873, + "mean_token_accuracy": 0.931008767336607, + "num_tokens": 132641508.0, + "step": 1840 + }, + { + "epoch": 1.1485584988101276, + "grad_norm": 0.17875225841999054, + "learning_rate": 2.636e-07, + "loss": 0.2963, + "mean_token_accuracy": 0.9300899244844913, + "num_tokens": 132713819.0, + "step": 1841 + }, + { + "epoch": 1.1491826941832795, + "grad_norm": 0.19520516693592072, + "learning_rate": 2.632e-07, + "loss": 0.2964, + "mean_token_accuracy": 0.9299054443836212, + "num_tokens": 132789221.0, + "step": 1842 + }, + { + "epoch": 1.149806889556431, + "grad_norm": 0.5471992492675781, + "learning_rate": 2.6279999999999994e-07, + "loss": 0.3114, + "mean_token_accuracy": 0.924699567258358, + "num_tokens": 132863925.0, + "step": 1843 + }, + { + "epoch": 1.150431084929583, + "grad_norm": 0.33980801701545715, + "learning_rate": 2.624e-07, + "loss": 0.3136, + "mean_token_accuracy": 0.9276256710290909, + "num_tokens": 132934644.0, + "step": 1844 + }, + { + "epoch": 1.1510552803027347, + "grad_norm": 0.24986259639263153, + "learning_rate": 2.62e-07, + "loss": 0.3001, + "mean_token_accuracy": 0.9270990453660488, + "num_tokens": 133006263.0, + "step": 1845 + }, + { + "epoch": 1.1516794756758866, + "grad_norm": 2.2196643352508545, + "learning_rate": 2.616e-07, + "loss": 0.3378, + "mean_token_accuracy": 0.9157010801136494, + "num_tokens": 133073911.0, + "step": 1846 + }, + { + "epoch": 1.1523036710490384, + "grad_norm": 0.24839279055595398, + "learning_rate": 2.612e-07, + "loss": 0.3257, + "mean_token_accuracy": 0.9250072650611401, + "num_tokens": 133145387.0, + "step": 1847 + }, + { + "epoch": 1.15292786642219, + "grad_norm": 0.24738164246082306, + "learning_rate": 2.6079999999999995e-07, + "loss": 0.315, + "mean_token_accuracy": 0.922322116792202, + "num_tokens": 133215736.0, + "step": 1848 + }, + { + "epoch": 1.1535520617953419, + "grad_norm": 0.34858354926109314, + "learning_rate": 2.6040000000000003e-07, + "loss": 0.2926, + "mean_token_accuracy": 0.9282812289893627, + "num_tokens": 133286608.0, + "step": 1849 + }, + { + "epoch": 1.1541762571684937, + "grad_norm": 0.24858058989048004, + "learning_rate": 2.6e-07, + "loss": 0.3062, + "mean_token_accuracy": 0.9257618598639965, + "num_tokens": 133357241.0, + "step": 1850 + }, + { + "epoch": 1.1548004525416455, + "grad_norm": 0.21907417476177216, + "learning_rate": 2.5959999999999997e-07, + "loss": 0.2968, + "mean_token_accuracy": 0.9295230209827423, + "num_tokens": 133426380.0, + "step": 1851 + }, + { + "epoch": 1.1554246479147974, + "grad_norm": 1.3821501731872559, + "learning_rate": 2.592e-07, + "loss": 0.2783, + "mean_token_accuracy": 0.9311175085604191, + "num_tokens": 133501950.0, + "step": 1852 + }, + { + "epoch": 1.1560488432879492, + "grad_norm": 0.16924183070659637, + "learning_rate": 2.5879999999999996e-07, + "loss": 0.2904, + "mean_token_accuracy": 0.9292899444699287, + "num_tokens": 133573045.0, + "step": 1853 + }, + { + "epoch": 1.156673038661101, + "grad_norm": 0.27745601534843445, + "learning_rate": 2.584e-07, + "loss": 0.2977, + "mean_token_accuracy": 0.9259501472115517, + "num_tokens": 133645808.0, + "step": 1854 + }, + { + "epoch": 1.1572972340342527, + "grad_norm": 0.2569347620010376, + "learning_rate": 2.58e-07, + "loss": 0.3133, + "mean_token_accuracy": 0.924938652664423, + "num_tokens": 133714543.0, + "step": 1855 + }, + { + "epoch": 1.1579214294074045, + "grad_norm": 0.2657967507839203, + "learning_rate": 2.576e-07, + "loss": 0.2852, + "mean_token_accuracy": 0.9319658391177654, + "num_tokens": 133791155.0, + "step": 1856 + }, + { + "epoch": 1.1585456247805563, + "grad_norm": 0.24488013982772827, + "learning_rate": 2.5719999999999995e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9235825389623642, + "num_tokens": 133856407.0, + "step": 1857 + }, + { + "epoch": 1.1591698201537082, + "grad_norm": 0.4416581988334656, + "learning_rate": 2.5679999999999997e-07, + "loss": 0.2578, + "mean_token_accuracy": 0.9360976293683052, + "num_tokens": 133930345.0, + "step": 1858 + }, + { + "epoch": 1.15979401552686, + "grad_norm": 0.3488563299179077, + "learning_rate": 2.564e-07, + "loss": 0.3352, + "mean_token_accuracy": 0.9190180860459805, + "num_tokens": 133997866.0, + "step": 1859 + }, + { + "epoch": 1.1604182109000116, + "grad_norm": 0.17035602033138275, + "learning_rate": 2.56e-07, + "loss": 0.308, + "mean_token_accuracy": 0.9266541711986065, + "num_tokens": 134066075.0, + "step": 1860 + }, + { + "epoch": 1.1610424062731635, + "grad_norm": 0.1885630190372467, + "learning_rate": 2.556e-07, + "loss": 0.2818, + "mean_token_accuracy": 0.9332041442394257, + "num_tokens": 134143740.0, + "step": 1861 + }, + { + "epoch": 1.1616666016463153, + "grad_norm": 0.5389247536659241, + "learning_rate": 2.5519999999999996e-07, + "loss": 0.3002, + "mean_token_accuracy": 0.9315211027860641, + "num_tokens": 134214077.0, + "step": 1862 + }, + { + "epoch": 1.1622907970194671, + "grad_norm": 0.2879144847393036, + "learning_rate": 2.5480000000000003e-07, + "loss": 0.294, + "mean_token_accuracy": 0.9316158406436443, + "num_tokens": 134282196.0, + "step": 1863 + }, + { + "epoch": 1.162914992392619, + "grad_norm": 0.5415794253349304, + "learning_rate": 2.544e-07, + "loss": 0.2966, + "mean_token_accuracy": 0.9295268543064594, + "num_tokens": 134356823.0, + "step": 1864 + }, + { + "epoch": 1.1635391877657706, + "grad_norm": 0.26620304584503174, + "learning_rate": 2.5399999999999997e-07, + "loss": 0.3123, + "mean_token_accuracy": 0.9269912056624889, + "num_tokens": 134424492.0, + "step": 1865 + }, + { + "epoch": 1.1641633831389224, + "grad_norm": 0.31247904896736145, + "learning_rate": 2.536e-07, + "loss": 0.3174, + "mean_token_accuracy": 0.925802081823349, + "num_tokens": 134495698.0, + "step": 1866 + }, + { + "epoch": 1.1647875785120743, + "grad_norm": 0.16684845089912415, + "learning_rate": 2.5319999999999996e-07, + "loss": 0.3054, + "mean_token_accuracy": 0.9268181398510933, + "num_tokens": 134572397.0, + "step": 1867 + }, + { + "epoch": 1.165411773885226, + "grad_norm": 0.1696167141199112, + "learning_rate": 2.528e-07, + "loss": 0.2716, + "mean_token_accuracy": 0.9315634407103062, + "num_tokens": 134647126.0, + "step": 1868 + }, + { + "epoch": 1.166035969258378, + "grad_norm": 0.2812148332595825, + "learning_rate": 2.524e-07, + "loss": 0.298, + "mean_token_accuracy": 0.9283396787941456, + "num_tokens": 134718192.0, + "step": 1869 + }, + { + "epoch": 1.1666601646315296, + "grad_norm": 0.2474401444196701, + "learning_rate": 2.52e-07, + "loss": 0.2968, + "mean_token_accuracy": 0.9312821514904499, + "num_tokens": 134793965.0, + "step": 1870 + }, + { + "epoch": 1.1672843600046814, + "grad_norm": 0.23733125627040863, + "learning_rate": 2.516e-07, + "loss": 0.2667, + "mean_token_accuracy": 0.9370587877929211, + "num_tokens": 134868125.0, + "step": 1871 + }, + { + "epoch": 1.1679085553778332, + "grad_norm": 0.2144850492477417, + "learning_rate": 2.5119999999999997e-07, + "loss": 0.3091, + "mean_token_accuracy": 0.9290624037384987, + "num_tokens": 134939268.0, + "step": 1872 + }, + { + "epoch": 1.168532750750985, + "grad_norm": 0.3344308137893677, + "learning_rate": 2.508e-07, + "loss": 0.291, + "mean_token_accuracy": 0.9301040731370449, + "num_tokens": 135014858.0, + "step": 1873 + }, + { + "epoch": 1.169156946124137, + "grad_norm": 0.34858083724975586, + "learning_rate": 2.504e-07, + "loss": 0.2978, + "mean_token_accuracy": 0.9308509156107903, + "num_tokens": 135089523.0, + "step": 1874 + }, + { + "epoch": 1.1697811414972887, + "grad_norm": 0.36934223771095276, + "learning_rate": 2.5e-07, + "loss": 0.2672, + "mean_token_accuracy": 0.9343902766704559, + "num_tokens": 135166513.0, + "step": 1875 + }, + { + "epoch": 1.1704053368704406, + "grad_norm": 0.35646188259124756, + "learning_rate": 2.4959999999999996e-07, + "loss": 0.2733, + "mean_token_accuracy": 0.9356859847903252, + "num_tokens": 135242067.0, + "step": 1876 + }, + { + "epoch": 1.1710295322435922, + "grad_norm": 0.2494499236345291, + "learning_rate": 2.492e-07, + "loss": 0.2523, + "mean_token_accuracy": 0.9377260804176331, + "num_tokens": 135319315.0, + "step": 1877 + }, + { + "epoch": 1.171653727616744, + "grad_norm": 0.4050208032131195, + "learning_rate": 2.488e-07, + "loss": 0.3393, + "mean_token_accuracy": 0.91460732370615, + "num_tokens": 135385756.0, + "step": 1878 + }, + { + "epoch": 1.1722779229898959, + "grad_norm": 0.4595908522605896, + "learning_rate": 2.484e-07, + "loss": 0.2984, + "mean_token_accuracy": 0.9306458607316017, + "num_tokens": 135461999.0, + "step": 1879 + }, + { + "epoch": 1.1729021183630477, + "grad_norm": 0.34942948818206787, + "learning_rate": 2.48e-07, + "loss": 0.2546, + "mean_token_accuracy": 0.9366017803549767, + "num_tokens": 135540371.0, + "step": 1880 + }, + { + "epoch": 1.1735263137361995, + "grad_norm": 0.19202062487602234, + "learning_rate": 2.4759999999999997e-07, + "loss": 0.3216, + "mean_token_accuracy": 0.9231920093297958, + "num_tokens": 135611265.0, + "step": 1881 + }, + { + "epoch": 1.1741505091093511, + "grad_norm": 0.22570616006851196, + "learning_rate": 2.472e-07, + "loss": 0.2821, + "mean_token_accuracy": 0.9327188841998577, + "num_tokens": 135685494.0, + "step": 1882 + }, + { + "epoch": 1.174774704482503, + "grad_norm": 0.20979247987270355, + "learning_rate": 2.4679999999999996e-07, + "loss": 0.311, + "mean_token_accuracy": 0.9240441806614399, + "num_tokens": 135755396.0, + "step": 1883 + }, + { + "epoch": 1.1753988998556548, + "grad_norm": 0.2375631481409073, + "learning_rate": 2.464e-07, + "loss": 0.3099, + "mean_token_accuracy": 0.9290084987878799, + "num_tokens": 135829082.0, + "step": 1884 + }, + { + "epoch": 1.1760230952288067, + "grad_norm": 0.19450223445892334, + "learning_rate": 2.46e-07, + "loss": 0.3052, + "mean_token_accuracy": 0.9239453449845314, + "num_tokens": 135899787.0, + "step": 1885 + }, + { + "epoch": 1.1766472906019585, + "grad_norm": 0.25800907611846924, + "learning_rate": 2.456e-07, + "loss": 0.2729, + "mean_token_accuracy": 0.9359027035534382, + "num_tokens": 135977967.0, + "step": 1886 + }, + { + "epoch": 1.17727148597511, + "grad_norm": 0.42402902245521545, + "learning_rate": 2.452e-07, + "loss": 0.2942, + "mean_token_accuracy": 0.9300507567822933, + "num_tokens": 136052124.0, + "step": 1887 + }, + { + "epoch": 1.177895681348262, + "grad_norm": 0.18239305913448334, + "learning_rate": 2.4479999999999997e-07, + "loss": 0.2869, + "mean_token_accuracy": 0.9308939017355442, + "num_tokens": 136123309.0, + "step": 1888 + }, + { + "epoch": 1.1785198767214138, + "grad_norm": 0.23847521841526031, + "learning_rate": 2.444e-07, + "loss": 0.2625, + "mean_token_accuracy": 0.9351480379700661, + "num_tokens": 136198358.0, + "step": 1889 + }, + { + "epoch": 1.1791440720945656, + "grad_norm": 0.2570955455303192, + "learning_rate": 2.4399999999999996e-07, + "loss": 0.2956, + "mean_token_accuracy": 0.9294303208589554, + "num_tokens": 136271208.0, + "step": 1890 + }, + { + "epoch": 1.1797682674677175, + "grad_norm": 0.21417833864688873, + "learning_rate": 2.436e-07, + "loss": 0.3143, + "mean_token_accuracy": 0.9235550835728645, + "num_tokens": 136340233.0, + "step": 1891 + }, + { + "epoch": 1.180392462840869, + "grad_norm": 0.23997154831886292, + "learning_rate": 2.432e-07, + "loss": 0.3289, + "mean_token_accuracy": 0.9248317442834377, + "num_tokens": 136410134.0, + "step": 1892 + }, + { + "epoch": 1.181016658214021, + "grad_norm": 0.19718722999095917, + "learning_rate": 2.428e-07, + "loss": 0.2911, + "mean_token_accuracy": 0.9323147349059582, + "num_tokens": 136483660.0, + "step": 1893 + }, + { + "epoch": 1.1816408535871727, + "grad_norm": 0.19593633711338043, + "learning_rate": 2.424e-07, + "loss": 0.2538, + "mean_token_accuracy": 0.9371818900108337, + "num_tokens": 136561973.0, + "step": 1894 + }, + { + "epoch": 1.1822650489603246, + "grad_norm": 0.820732831954956, + "learning_rate": 2.4199999999999997e-07, + "loss": 0.2989, + "mean_token_accuracy": 0.9258870854973793, + "num_tokens": 136632603.0, + "step": 1895 + }, + { + "epoch": 1.1828892443334764, + "grad_norm": 0.1931726038455963, + "learning_rate": 2.416e-07, + "loss": 0.3095, + "mean_token_accuracy": 0.9264886453747749, + "num_tokens": 136703942.0, + "step": 1896 + }, + { + "epoch": 1.1835134397066283, + "grad_norm": 0.2562076449394226, + "learning_rate": 2.4119999999999996e-07, + "loss": 0.3415, + "mean_token_accuracy": 0.9191934876143932, + "num_tokens": 136768459.0, + "step": 1897 + }, + { + "epoch": 1.18413763507978, + "grad_norm": 1.5060997009277344, + "learning_rate": 2.408e-07, + "loss": 0.3047, + "mean_token_accuracy": 0.9286314845085144, + "num_tokens": 136843647.0, + "step": 1898 + }, + { + "epoch": 1.1847618304529317, + "grad_norm": 0.7664989829063416, + "learning_rate": 2.404e-07, + "loss": 0.2887, + "mean_token_accuracy": 0.9316050447523594, + "num_tokens": 136921135.0, + "step": 1899 + }, + { + "epoch": 1.1853860258260835, + "grad_norm": 0.17486004531383514, + "learning_rate": 2.4e-07, + "loss": 0.2909, + "mean_token_accuracy": 0.9280970953404903, + "num_tokens": 136997809.0, + "step": 1900 + }, + { + "epoch": 1.1860102211992354, + "grad_norm": 0.6958367824554443, + "learning_rate": 2.396e-07, + "loss": 0.3104, + "mean_token_accuracy": 0.9216005057096481, + "num_tokens": 137069641.0, + "step": 1901 + }, + { + "epoch": 1.1866344165723872, + "grad_norm": 0.2238064855337143, + "learning_rate": 2.3919999999999997e-07, + "loss": 0.2641, + "mean_token_accuracy": 0.9349865131080151, + "num_tokens": 137150242.0, + "step": 1902 + }, + { + "epoch": 1.187258611945539, + "grad_norm": 0.23215816915035248, + "learning_rate": 2.388e-07, + "loss": 0.3025, + "mean_token_accuracy": 0.9272582344710827, + "num_tokens": 137225737.0, + "step": 1903 + }, + { + "epoch": 1.1878828073186907, + "grad_norm": 0.5705999135971069, + "learning_rate": 2.384e-07, + "loss": 0.2751, + "mean_token_accuracy": 0.9303819313645363, + "num_tokens": 137300297.0, + "step": 1904 + }, + { + "epoch": 1.1885070026918425, + "grad_norm": 0.25309160351753235, + "learning_rate": 2.38e-07, + "loss": 0.3459, + "mean_token_accuracy": 0.9212737530469894, + "num_tokens": 137364391.0, + "step": 1905 + }, + { + "epoch": 1.1891311980649943, + "grad_norm": 0.28164875507354736, + "learning_rate": 2.3759999999999998e-07, + "loss": 0.3051, + "mean_token_accuracy": 0.9257899895310402, + "num_tokens": 137437385.0, + "step": 1906 + }, + { + "epoch": 1.1897553934381462, + "grad_norm": 0.2582246661186218, + "learning_rate": 2.3719999999999998e-07, + "loss": 0.2996, + "mean_token_accuracy": 0.9279305823147297, + "num_tokens": 137508114.0, + "step": 1907 + }, + { + "epoch": 1.190379588811298, + "grad_norm": 0.28349068760871887, + "learning_rate": 2.368e-07, + "loss": 0.2981, + "mean_token_accuracy": 0.9257392808794975, + "num_tokens": 137578728.0, + "step": 1908 + }, + { + "epoch": 1.1910037841844496, + "grad_norm": 0.20615063607692719, + "learning_rate": 2.364e-07, + "loss": 0.2649, + "mean_token_accuracy": 0.9353856556117535, + "num_tokens": 137655032.0, + "step": 1909 + }, + { + "epoch": 1.1916279795576015, + "grad_norm": 0.3541504740715027, + "learning_rate": 2.3599999999999997e-07, + "loss": 0.2567, + "mean_token_accuracy": 0.9382404088973999, + "num_tokens": 137729288.0, + "step": 1910 + }, + { + "epoch": 1.1922521749307533, + "grad_norm": 0.32833313941955566, + "learning_rate": 2.356e-07, + "loss": 0.2818, + "mean_token_accuracy": 0.9336875565350056, + "num_tokens": 137804311.0, + "step": 1911 + }, + { + "epoch": 1.1928763703039051, + "grad_norm": 0.1803574413061142, + "learning_rate": 2.352e-07, + "loss": 0.2965, + "mean_token_accuracy": 0.92704052105546, + "num_tokens": 137873456.0, + "step": 1912 + }, + { + "epoch": 1.193500565677057, + "grad_norm": 0.19915112853050232, + "learning_rate": 2.3479999999999998e-07, + "loss": 0.2957, + "mean_token_accuracy": 0.9266354441642761, + "num_tokens": 137943188.0, + "step": 1913 + }, + { + "epoch": 1.1941247610502086, + "grad_norm": 0.5460178256034851, + "learning_rate": 2.3439999999999998e-07, + "loss": 0.3291, + "mean_token_accuracy": 0.9158940352499485, + "num_tokens": 138009654.0, + "step": 1914 + }, + { + "epoch": 1.1947489564233604, + "grad_norm": 0.21439026296138763, + "learning_rate": 2.34e-07, + "loss": 0.2713, + "mean_token_accuracy": 0.9356495849788189, + "num_tokens": 138085666.0, + "step": 1915 + }, + { + "epoch": 1.1953731517965123, + "grad_norm": 0.2986992299556732, + "learning_rate": 2.336e-07, + "loss": 0.279, + "mean_token_accuracy": 0.9336680062115192, + "num_tokens": 138160896.0, + "step": 1916 + }, + { + "epoch": 1.195997347169664, + "grad_norm": 0.2106047123670578, + "learning_rate": 2.3319999999999997e-07, + "loss": 0.2771, + "mean_token_accuracy": 0.93426463752985, + "num_tokens": 138233950.0, + "step": 1917 + }, + { + "epoch": 1.196621542542816, + "grad_norm": 0.41356176137924194, + "learning_rate": 2.328e-07, + "loss": 0.2898, + "mean_token_accuracy": 0.930479820817709, + "num_tokens": 138304810.0, + "step": 1918 + }, + { + "epoch": 1.1972457379159678, + "grad_norm": 0.17104043066501617, + "learning_rate": 2.324e-07, + "loss": 0.2765, + "mean_token_accuracy": 0.9311543405056, + "num_tokens": 138375722.0, + "step": 1919 + }, + { + "epoch": 1.1978699332891196, + "grad_norm": 0.25905850529670715, + "learning_rate": 2.32e-07, + "loss": 0.2707, + "mean_token_accuracy": 0.9316887222230434, + "num_tokens": 138453394.0, + "step": 1920 + }, + { + "epoch": 1.1984941286622712, + "grad_norm": 0.20590656995773315, + "learning_rate": 2.3159999999999998e-07, + "loss": 0.3018, + "mean_token_accuracy": 0.929004792124033, + "num_tokens": 138527995.0, + "step": 1921 + }, + { + "epoch": 1.199118324035423, + "grad_norm": 0.19631439447402954, + "learning_rate": 2.3119999999999998e-07, + "loss": 0.2582, + "mean_token_accuracy": 0.936049472540617, + "num_tokens": 138602253.0, + "step": 1922 + }, + { + "epoch": 1.199742519408575, + "grad_norm": 0.4334622919559479, + "learning_rate": 2.308e-07, + "loss": 0.2878, + "mean_token_accuracy": 0.9309201501309872, + "num_tokens": 138674623.0, + "step": 1923 + }, + { + "epoch": 1.2003667147817267, + "grad_norm": 0.3317464292049408, + "learning_rate": 2.3039999999999997e-07, + "loss": 0.2605, + "mean_token_accuracy": 0.9359247721731663, + "num_tokens": 138747524.0, + "step": 1924 + }, + { + "epoch": 1.2009909101548786, + "grad_norm": 0.17119291424751282, + "learning_rate": 2.3e-07, + "loss": 0.297, + "mean_token_accuracy": 0.9294658228754997, + "num_tokens": 138820385.0, + "step": 1925 + }, + { + "epoch": 1.2016151055280302, + "grad_norm": 0.9727433323860168, + "learning_rate": 2.296e-07, + "loss": 0.282, + "mean_token_accuracy": 0.9341945722699165, + "num_tokens": 138893145.0, + "step": 1926 + }, + { + "epoch": 1.202239300901182, + "grad_norm": 0.22031718492507935, + "learning_rate": 2.292e-07, + "loss": 0.3142, + "mean_token_accuracy": 0.9245168454945087, + "num_tokens": 138964736.0, + "step": 1927 + }, + { + "epoch": 1.2028634962743339, + "grad_norm": 0.4367298185825348, + "learning_rate": 2.2879999999999998e-07, + "loss": 0.3005, + "mean_token_accuracy": 0.9281546659767628, + "num_tokens": 139037258.0, + "step": 1928 + }, + { + "epoch": 1.2034876916474857, + "grad_norm": 0.24401399493217468, + "learning_rate": 2.2839999999999998e-07, + "loss": 0.2726, + "mean_token_accuracy": 0.9330070577561855, + "num_tokens": 139112366.0, + "step": 1929 + }, + { + "epoch": 1.2041118870206375, + "grad_norm": 0.33276596665382385, + "learning_rate": 2.28e-07, + "loss": 0.3412, + "mean_token_accuracy": 0.9193909876048565, + "num_tokens": 139183428.0, + "step": 1930 + }, + { + "epoch": 1.2047360823937892, + "grad_norm": 0.29439109563827515, + "learning_rate": 2.2759999999999997e-07, + "loss": 0.2943, + "mean_token_accuracy": 0.9299340173602104, + "num_tokens": 139258350.0, + "step": 1931 + }, + { + "epoch": 1.205360277766941, + "grad_norm": 0.583676815032959, + "learning_rate": 2.272e-07, + "loss": 0.2765, + "mean_token_accuracy": 0.9333144538104534, + "num_tokens": 139334586.0, + "step": 1932 + }, + { + "epoch": 1.2059844731400928, + "grad_norm": 0.26143327355384827, + "learning_rate": 2.268e-07, + "loss": 0.2949, + "mean_token_accuracy": 0.9322762191295624, + "num_tokens": 139410907.0, + "step": 1933 + }, + { + "epoch": 1.2066086685132447, + "grad_norm": 0.39439812302589417, + "learning_rate": 2.264e-07, + "loss": 0.2742, + "mean_token_accuracy": 0.9372350350022316, + "num_tokens": 139487672.0, + "step": 1934 + }, + { + "epoch": 1.2072328638863965, + "grad_norm": 0.26654186844825745, + "learning_rate": 2.2599999999999999e-07, + "loss": 0.2545, + "mean_token_accuracy": 0.937914039939642, + "num_tokens": 139567578.0, + "step": 1935 + }, + { + "epoch": 1.2078570592595481, + "grad_norm": 0.23210839927196503, + "learning_rate": 2.2559999999999998e-07, + "loss": 0.2678, + "mean_token_accuracy": 0.9339726194739342, + "num_tokens": 139640290.0, + "step": 1936 + }, + { + "epoch": 1.2084812546327, + "grad_norm": 0.31822264194488525, + "learning_rate": 2.252e-07, + "loss": 0.3201, + "mean_token_accuracy": 0.9255199208855629, + "num_tokens": 139713591.0, + "step": 1937 + }, + { + "epoch": 1.2091054500058518, + "grad_norm": 0.193868488073349, + "learning_rate": 2.248e-07, + "loss": 0.326, + "mean_token_accuracy": 0.923390332609415, + "num_tokens": 139782037.0, + "step": 1938 + }, + { + "epoch": 1.2097296453790036, + "grad_norm": 0.2125038504600525, + "learning_rate": 2.2439999999999997e-07, + "loss": 0.3038, + "mean_token_accuracy": 0.9275647886097431, + "num_tokens": 139852227.0, + "step": 1939 + }, + { + "epoch": 1.2103538407521555, + "grad_norm": 0.17288953065872192, + "learning_rate": 2.24e-07, + "loss": 0.2778, + "mean_token_accuracy": 0.9302581809461117, + "num_tokens": 139928261.0, + "step": 1940 + }, + { + "epoch": 1.2109780361253073, + "grad_norm": 0.24702724814414978, + "learning_rate": 2.236e-07, + "loss": 0.3128, + "mean_token_accuracy": 0.9271270148456097, + "num_tokens": 140000693.0, + "step": 1941 + }, + { + "epoch": 1.2116022314984591, + "grad_norm": 0.2686416804790497, + "learning_rate": 2.232e-07, + "loss": 0.3009, + "mean_token_accuracy": 0.9261568896472454, + "num_tokens": 140071720.0, + "step": 1942 + }, + { + "epoch": 1.2122264268716108, + "grad_norm": 0.25429093837738037, + "learning_rate": 2.2279999999999998e-07, + "loss": 0.3269, + "mean_token_accuracy": 0.9195707850158215, + "num_tokens": 140140175.0, + "step": 1943 + }, + { + "epoch": 1.2128506222447626, + "grad_norm": 0.22192858159542084, + "learning_rate": 2.2239999999999998e-07, + "loss": 0.3152, + "mean_token_accuracy": 0.9278103448450565, + "num_tokens": 140210746.0, + "step": 1944 + }, + { + "epoch": 1.2134748176179144, + "grad_norm": 0.2396640032529831, + "learning_rate": 2.22e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.9242516867816448, + "num_tokens": 140282795.0, + "step": 1945 + }, + { + "epoch": 1.2140990129910663, + "grad_norm": 0.1565372794866562, + "learning_rate": 2.2159999999999997e-07, + "loss": 0.3174, + "mean_token_accuracy": 0.9228121414780617, + "num_tokens": 140353058.0, + "step": 1946 + }, + { + "epoch": 1.214723208364218, + "grad_norm": 0.35994771122932434, + "learning_rate": 2.212e-07, + "loss": 0.2955, + "mean_token_accuracy": 0.9281670711934566, + "num_tokens": 140429619.0, + "step": 1947 + }, + { + "epoch": 1.2153474037373697, + "grad_norm": 0.6001183986663818, + "learning_rate": 2.208e-07, + "loss": 0.3027, + "mean_token_accuracy": 0.9287135601043701, + "num_tokens": 140502302.0, + "step": 1948 + }, + { + "epoch": 1.2159715991105216, + "grad_norm": 1.0995194911956787, + "learning_rate": 2.2040000000000001e-07, + "loss": 0.3084, + "mean_token_accuracy": 0.9279600083827972, + "num_tokens": 140575880.0, + "step": 1949 + }, + { + "epoch": 1.2165957944836734, + "grad_norm": 0.5478682518005371, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.2833, + "mean_token_accuracy": 0.9330231547355652, + "num_tokens": 140652338.0, + "step": 1950 + }, + { + "epoch": 1.2172199898568252, + "grad_norm": 0.2642333209514618, + "learning_rate": 2.1959999999999998e-07, + "loss": 0.3075, + "mean_token_accuracy": 0.9248400889337063, + "num_tokens": 140723999.0, + "step": 1951 + }, + { + "epoch": 1.217844185229977, + "grad_norm": 3.0133700370788574, + "learning_rate": 2.192e-07, + "loss": 0.3061, + "mean_token_accuracy": 0.9269981160759926, + "num_tokens": 140795048.0, + "step": 1952 + }, + { + "epoch": 1.2184683806031287, + "grad_norm": 1.782266616821289, + "learning_rate": 2.1879999999999997e-07, + "loss": 0.2692, + "mean_token_accuracy": 0.9353551715612411, + "num_tokens": 140869302.0, + "step": 1953 + }, + { + "epoch": 1.2190925759762805, + "grad_norm": 0.48682355880737305, + "learning_rate": 2.184e-07, + "loss": 0.3172, + "mean_token_accuracy": 0.925975177437067, + "num_tokens": 140943635.0, + "step": 1954 + }, + { + "epoch": 1.2197167713494323, + "grad_norm": 0.21742987632751465, + "learning_rate": 2.18e-07, + "loss": 0.3033, + "mean_token_accuracy": 0.9312651455402374, + "num_tokens": 141014795.0, + "step": 1955 + }, + { + "epoch": 1.2203409667225842, + "grad_norm": 0.3188525140285492, + "learning_rate": 2.176e-07, + "loss": 0.2973, + "mean_token_accuracy": 0.9293990321457386, + "num_tokens": 141089186.0, + "step": 1956 + }, + { + "epoch": 1.220965162095736, + "grad_norm": 0.38894155621528625, + "learning_rate": 2.1719999999999999e-07, + "loss": 0.2803, + "mean_token_accuracy": 0.934301171451807, + "num_tokens": 141160217.0, + "step": 1957 + }, + { + "epoch": 1.2215893574688876, + "grad_norm": 0.2257828712463379, + "learning_rate": 2.1679999999999998e-07, + "loss": 0.2965, + "mean_token_accuracy": 0.9308297634124756, + "num_tokens": 141230200.0, + "step": 1958 + }, + { + "epoch": 1.2222135528420395, + "grad_norm": 0.19858938455581665, + "learning_rate": 2.164e-07, + "loss": 0.2942, + "mean_token_accuracy": 0.9300949797034264, + "num_tokens": 141302317.0, + "step": 1959 + }, + { + "epoch": 1.2228377482151913, + "grad_norm": 0.26591336727142334, + "learning_rate": 2.1599999999999998e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.9268428906798363, + "num_tokens": 141374895.0, + "step": 1960 + }, + { + "epoch": 1.2234619435883431, + "grad_norm": 0.2200537621974945, + "learning_rate": 2.156e-07, + "loss": 0.3007, + "mean_token_accuracy": 0.9278355240821838, + "num_tokens": 141448735.0, + "step": 1961 + }, + { + "epoch": 1.224086138961495, + "grad_norm": 0.22200044989585876, + "learning_rate": 2.152e-07, + "loss": 0.2773, + "mean_token_accuracy": 0.9348450526595116, + "num_tokens": 141526475.0, + "step": 1962 + }, + { + "epoch": 1.2247103343346468, + "grad_norm": 4.756194591522217, + "learning_rate": 2.148e-07, + "loss": 0.3198, + "mean_token_accuracy": 0.9263877421617508, + "num_tokens": 141596791.0, + "step": 1963 + }, + { + "epoch": 1.2253345297077987, + "grad_norm": 0.2623351812362671, + "learning_rate": 2.144e-07, + "loss": 0.2681, + "mean_token_accuracy": 0.9355500973761082, + "num_tokens": 141666177.0, + "step": 1964 + }, + { + "epoch": 1.2259587250809503, + "grad_norm": 0.6401394009590149, + "learning_rate": 2.1399999999999998e-07, + "loss": 0.3322, + "mean_token_accuracy": 0.9183893129229546, + "num_tokens": 141735335.0, + "step": 1965 + }, + { + "epoch": 1.226582920454102, + "grad_norm": 0.4154477119445801, + "learning_rate": 2.136e-07, + "loss": 0.3126, + "mean_token_accuracy": 0.9231975711882114, + "num_tokens": 141807016.0, + "step": 1966 + }, + { + "epoch": 1.227207115827254, + "grad_norm": 0.2412223070859909, + "learning_rate": 2.132e-07, + "loss": 0.3121, + "mean_token_accuracy": 0.9236135333776474, + "num_tokens": 141877571.0, + "step": 1967 + }, + { + "epoch": 1.2278313112004058, + "grad_norm": 0.24652279913425446, + "learning_rate": 2.1279999999999997e-07, + "loss": 0.3392, + "mean_token_accuracy": 0.9202834665775299, + "num_tokens": 141949761.0, + "step": 1968 + }, + { + "epoch": 1.2284555065735576, + "grad_norm": 0.2173452526330948, + "learning_rate": 2.124e-07, + "loss": 0.295, + "mean_token_accuracy": 0.9269723780453205, + "num_tokens": 142022821.0, + "step": 1969 + }, + { + "epoch": 1.2290797019467092, + "grad_norm": 0.6362020373344421, + "learning_rate": 2.12e-07, + "loss": 0.3371, + "mean_token_accuracy": 0.9173320271074772, + "num_tokens": 142095248.0, + "step": 1970 + }, + { + "epoch": 1.229703897319861, + "grad_norm": 0.2940523624420166, + "learning_rate": 2.116e-07, + "loss": 0.3408, + "mean_token_accuracy": 0.9135356210172176, + "num_tokens": 142158336.0, + "step": 1971 + }, + { + "epoch": 1.230328092693013, + "grad_norm": 0.44975677132606506, + "learning_rate": 2.1119999999999999e-07, + "loss": 0.3337, + "mean_token_accuracy": 0.9185782335698605, + "num_tokens": 142227207.0, + "step": 1972 + }, + { + "epoch": 1.2309522880661647, + "grad_norm": 0.2639687657356262, + "learning_rate": 2.1079999999999998e-07, + "loss": 0.3492, + "mean_token_accuracy": 0.9194901958107948, + "num_tokens": 142293910.0, + "step": 1973 + }, + { + "epoch": 1.2315764834393166, + "grad_norm": 0.21394580602645874, + "learning_rate": 2.104e-07, + "loss": 0.2908, + "mean_token_accuracy": 0.9302883297204971, + "num_tokens": 142366122.0, + "step": 1974 + }, + { + "epoch": 1.2322006788124682, + "grad_norm": 0.29126736521720886, + "learning_rate": 2.0999999999999997e-07, + "loss": 0.2944, + "mean_token_accuracy": 0.9283175393939018, + "num_tokens": 142438606.0, + "step": 1975 + }, + { + "epoch": 1.23282487418562, + "grad_norm": 0.19498232007026672, + "learning_rate": 2.096e-07, + "loss": 0.3004, + "mean_token_accuracy": 0.9242088571190834, + "num_tokens": 142509755.0, + "step": 1976 + }, + { + "epoch": 1.2334490695587719, + "grad_norm": 0.2363796979188919, + "learning_rate": 2.092e-07, + "loss": 0.3232, + "mean_token_accuracy": 0.9266494549810886, + "num_tokens": 142577245.0, + "step": 1977 + }, + { + "epoch": 1.2340732649319237, + "grad_norm": 0.1697138249874115, + "learning_rate": 2.0880000000000002e-07, + "loss": 0.2743, + "mean_token_accuracy": 0.9335421025753021, + "num_tokens": 142655668.0, + "step": 1978 + }, + { + "epoch": 1.2346974603050755, + "grad_norm": 0.1703985035419464, + "learning_rate": 2.0839999999999999e-07, + "loss": 0.2629, + "mean_token_accuracy": 0.9358954504132271, + "num_tokens": 142730795.0, + "step": 1979 + }, + { + "epoch": 1.2353216556782274, + "grad_norm": 0.2172739952802658, + "learning_rate": 2.0799999999999998e-07, + "loss": 0.3511, + "mean_token_accuracy": 0.9146418832242489, + "num_tokens": 142799929.0, + "step": 1980 + }, + { + "epoch": 1.235945851051379, + "grad_norm": 0.5477745532989502, + "learning_rate": 2.076e-07, + "loss": 0.3121, + "mean_token_accuracy": 0.9271867647767067, + "num_tokens": 142874257.0, + "step": 1981 + }, + { + "epoch": 1.2365700464245308, + "grad_norm": 0.29602423310279846, + "learning_rate": 2.0719999999999998e-07, + "loss": 0.3245, + "mean_token_accuracy": 0.9187286868691444, + "num_tokens": 142943622.0, + "step": 1982 + }, + { + "epoch": 1.2371942417976827, + "grad_norm": 0.15999367833137512, + "learning_rate": 2.068e-07, + "loss": 0.3002, + "mean_token_accuracy": 0.9291433840990067, + "num_tokens": 143016493.0, + "step": 1983 + }, + { + "epoch": 1.2378184371708345, + "grad_norm": 0.27512940764427185, + "learning_rate": 2.064e-07, + "loss": 0.2882, + "mean_token_accuracy": 0.9291428551077843, + "num_tokens": 143089576.0, + "step": 1984 + }, + { + "epoch": 1.2384426325439863, + "grad_norm": 0.20762284100055695, + "learning_rate": 2.06e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.9242803901433945, + "num_tokens": 143159642.0, + "step": 1985 + }, + { + "epoch": 1.2390668279171382, + "grad_norm": 0.27707576751708984, + "learning_rate": 2.056e-07, + "loss": 0.3108, + "mean_token_accuracy": 0.9236514419317245, + "num_tokens": 143233084.0, + "step": 1986 + }, + { + "epoch": 1.2396910232902898, + "grad_norm": 0.22370785474777222, + "learning_rate": 2.0519999999999998e-07, + "loss": 0.305, + "mean_token_accuracy": 0.9289496727287769, + "num_tokens": 143301832.0, + "step": 1987 + }, + { + "epoch": 1.2403152186634416, + "grad_norm": 0.7158634662628174, + "learning_rate": 2.048e-07, + "loss": 0.2975, + "mean_token_accuracy": 0.9266036041080952, + "num_tokens": 143372592.0, + "step": 1988 + }, + { + "epoch": 1.2409394140365935, + "grad_norm": 0.3471851944923401, + "learning_rate": 2.0439999999999998e-07, + "loss": 0.3149, + "mean_token_accuracy": 0.918112013489008, + "num_tokens": 143441046.0, + "step": 1989 + }, + { + "epoch": 1.2415636094097453, + "grad_norm": 0.4205593764781952, + "learning_rate": 2.0399999999999997e-07, + "loss": 0.3641, + "mean_token_accuracy": 0.9148489758372307, + "num_tokens": 143508370.0, + "step": 1990 + }, + { + "epoch": 1.2421878047828971, + "grad_norm": 0.20967119932174683, + "learning_rate": 2.036e-07, + "loss": 0.2898, + "mean_token_accuracy": 0.9309439212083817, + "num_tokens": 143579685.0, + "step": 1991 + }, + { + "epoch": 1.2428120001560488, + "grad_norm": 0.2066921591758728, + "learning_rate": 2.032e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.9237515032291412, + "num_tokens": 143653790.0, + "step": 1992 + }, + { + "epoch": 1.2434361955292006, + "grad_norm": 0.3598202168941498, + "learning_rate": 2.028e-07, + "loss": 0.2799, + "mean_token_accuracy": 0.9304822869598866, + "num_tokens": 143723786.0, + "step": 1993 + }, + { + "epoch": 1.2440603909023524, + "grad_norm": 0.24529330432415009, + "learning_rate": 2.0239999999999999e-07, + "loss": 0.2949, + "mean_token_accuracy": 0.9302321821451187, + "num_tokens": 143793311.0, + "step": 1994 + }, + { + "epoch": 1.2446845862755043, + "grad_norm": 0.47127363085746765, + "learning_rate": 2.02e-07, + "loss": 0.2732, + "mean_token_accuracy": 0.9321400262415409, + "num_tokens": 143866404.0, + "step": 1995 + }, + { + "epoch": 1.245308781648656, + "grad_norm": 0.23904411494731903, + "learning_rate": 2.016e-07, + "loss": 0.2629, + "mean_token_accuracy": 0.9344308935105801, + "num_tokens": 143943062.0, + "step": 1996 + }, + { + "epoch": 1.2459329770218077, + "grad_norm": 0.20707716047763824, + "learning_rate": 2.0119999999999998e-07, + "loss": 0.2989, + "mean_token_accuracy": 0.931650672107935, + "num_tokens": 144014597.0, + "step": 1997 + }, + { + "epoch": 1.2465571723949596, + "grad_norm": 0.20972010493278503, + "learning_rate": 2.008e-07, + "loss": 0.3366, + "mean_token_accuracy": 0.9222993403673172, + "num_tokens": 144084304.0, + "step": 1998 + }, + { + "epoch": 1.2471813677681114, + "grad_norm": 0.1885141283273697, + "learning_rate": 2.004e-07, + "loss": 0.2985, + "mean_token_accuracy": 0.9274529851973057, + "num_tokens": 144156548.0, + "step": 1999 + }, + { + "epoch": 1.2478055631412632, + "grad_norm": 0.21138960123062134, + "learning_rate": 2e-07, + "loss": 0.2696, + "mean_token_accuracy": 0.9368129335343838, + "num_tokens": 144232112.0, + "step": 2000 + }, + { + "epoch": 1.248429758514415, + "grad_norm": 0.24056430160999298, + "learning_rate": 1.996e-07, + "loss": 0.2926, + "mean_token_accuracy": 0.9288466386497021, + "num_tokens": 144303735.0, + "step": 2001 + }, + { + "epoch": 1.249053953887567, + "grad_norm": 0.3529468774795532, + "learning_rate": 1.9919999999999998e-07, + "loss": 0.3034, + "mean_token_accuracy": 0.9274241030216217, + "num_tokens": 144379073.0, + "step": 2002 + }, + { + "epoch": 1.2496781492607185, + "grad_norm": 0.25440001487731934, + "learning_rate": 1.988e-07, + "loss": 0.3217, + "mean_token_accuracy": 0.9236328490078449, + "num_tokens": 144453027.0, + "step": 2003 + }, + { + "epoch": 1.2503023446338704, + "grad_norm": 0.27205395698547363, + "learning_rate": 1.9839999999999998e-07, + "loss": 0.314, + "mean_token_accuracy": 0.9258622750639915, + "num_tokens": 144522418.0, + "step": 2004 + }, + { + "epoch": 1.2509265400070222, + "grad_norm": 0.2388935536146164, + "learning_rate": 1.98e-07, + "loss": 0.3085, + "mean_token_accuracy": 0.9241451658308506, + "num_tokens": 144594501.0, + "step": 2005 + }, + { + "epoch": 1.251550735380174, + "grad_norm": 0.1971643567085266, + "learning_rate": 1.976e-07, + "loss": 0.3086, + "mean_token_accuracy": 0.925218727439642, + "num_tokens": 144668804.0, + "step": 2006 + }, + { + "epoch": 1.2521749307533259, + "grad_norm": 0.44967588782310486, + "learning_rate": 1.9719999999999997e-07, + "loss": 0.2868, + "mean_token_accuracy": 0.9300103224813938, + "num_tokens": 144738026.0, + "step": 2007 + }, + { + "epoch": 1.2527991261264777, + "grad_norm": 0.17146360874176025, + "learning_rate": 1.968e-07, + "loss": 0.2973, + "mean_token_accuracy": 0.9273517169058323, + "num_tokens": 144807283.0, + "step": 2008 + }, + { + "epoch": 1.2534233214996293, + "grad_norm": 0.43303409218788147, + "learning_rate": 1.9639999999999999e-07, + "loss": 0.3012, + "mean_token_accuracy": 0.9253453388810158, + "num_tokens": 144879393.0, + "step": 2009 + }, + { + "epoch": 1.2540475168727812, + "grad_norm": 0.2855178415775299, + "learning_rate": 1.96e-07, + "loss": 0.3168, + "mean_token_accuracy": 0.9221930205821991, + "num_tokens": 144949551.0, + "step": 2010 + }, + { + "epoch": 1.254671712245933, + "grad_norm": 0.23984257876873016, + "learning_rate": 1.9559999999999998e-07, + "loss": 0.298, + "mean_token_accuracy": 0.9309961311519146, + "num_tokens": 145019397.0, + "step": 2011 + }, + { + "epoch": 1.2552959076190848, + "grad_norm": 0.3261055648326874, + "learning_rate": 1.952e-07, + "loss": 0.2695, + "mean_token_accuracy": 0.9332415536046028, + "num_tokens": 145093129.0, + "step": 2012 + }, + { + "epoch": 1.2559201029922367, + "grad_norm": 0.49767160415649414, + "learning_rate": 1.948e-07, + "loss": 0.2754, + "mean_token_accuracy": 0.9331739097833633, + "num_tokens": 145170444.0, + "step": 2013 + }, + { + "epoch": 1.2565442983653883, + "grad_norm": 0.202736496925354, + "learning_rate": 1.944e-07, + "loss": 0.2927, + "mean_token_accuracy": 0.9300548434257507, + "num_tokens": 145244165.0, + "step": 2014 + }, + { + "epoch": 1.2571684937385401, + "grad_norm": 0.2687183618545532, + "learning_rate": 1.94e-07, + "loss": 0.2923, + "mean_token_accuracy": 0.9298434071242809, + "num_tokens": 145318529.0, + "step": 2015 + }, + { + "epoch": 1.257792689111692, + "grad_norm": 0.4045410454273224, + "learning_rate": 1.9359999999999999e-07, + "loss": 0.2837, + "mean_token_accuracy": 0.9343992620706558, + "num_tokens": 145395130.0, + "step": 2016 + }, + { + "epoch": 1.2584168844848438, + "grad_norm": 0.9221665263175964, + "learning_rate": 1.932e-07, + "loss": 0.3278, + "mean_token_accuracy": 0.9179613329470158, + "num_tokens": 145465174.0, + "step": 2017 + }, + { + "epoch": 1.2590410798579956, + "grad_norm": 0.1706632822751999, + "learning_rate": 1.9279999999999998e-07, + "loss": 0.2953, + "mean_token_accuracy": 0.9288019239902496, + "num_tokens": 145541736.0, + "step": 2018 + }, + { + "epoch": 1.2596652752311472, + "grad_norm": 1.1524754762649536, + "learning_rate": 1.9239999999999998e-07, + "loss": 0.294, + "mean_token_accuracy": 0.9304312244057655, + "num_tokens": 145616491.0, + "step": 2019 + }, + { + "epoch": 1.260289470604299, + "grad_norm": 0.5474220514297485, + "learning_rate": 1.92e-07, + "loss": 0.3086, + "mean_token_accuracy": 0.9286486282944679, + "num_tokens": 145689253.0, + "step": 2020 + }, + { + "epoch": 1.260913665977451, + "grad_norm": 3.031770706176758, + "learning_rate": 1.916e-07, + "loss": 0.3253, + "mean_token_accuracy": 0.9263739176094532, + "num_tokens": 145762147.0, + "step": 2021 + }, + { + "epoch": 1.2615378613506028, + "grad_norm": 0.2456066608428955, + "learning_rate": 1.912e-07, + "loss": 0.323, + "mean_token_accuracy": 0.921351570636034, + "num_tokens": 145827731.0, + "step": 2022 + }, + { + "epoch": 1.2621620567237546, + "grad_norm": 0.26813745498657227, + "learning_rate": 1.908e-07, + "loss": 0.2903, + "mean_token_accuracy": 0.9293633736670017, + "num_tokens": 145901826.0, + "step": 2023 + }, + { + "epoch": 1.2627862520969062, + "grad_norm": 0.1728161722421646, + "learning_rate": 1.904e-07, + "loss": 0.3431, + "mean_token_accuracy": 0.9204093813896179, + "num_tokens": 145971982.0, + "step": 2024 + }, + { + "epoch": 1.2634104474700583, + "grad_norm": 0.2152828425168991, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.292, + "mean_token_accuracy": 0.9303814843297005, + "num_tokens": 146042777.0, + "step": 2025 + }, + { + "epoch": 1.2640346428432099, + "grad_norm": 0.36796197295188904, + "learning_rate": 1.8959999999999998e-07, + "loss": 0.3039, + "mean_token_accuracy": 0.9295073561370373, + "num_tokens": 146112631.0, + "step": 2026 + }, + { + "epoch": 1.2646588382163617, + "grad_norm": 0.28349900245666504, + "learning_rate": 1.892e-07, + "loss": 0.2944, + "mean_token_accuracy": 0.9303562492132187, + "num_tokens": 146188319.0, + "step": 2027 + }, + { + "epoch": 1.2652830335895136, + "grad_norm": 0.35517793893814087, + "learning_rate": 1.888e-07, + "loss": 0.3127, + "mean_token_accuracy": 0.9229304194450378, + "num_tokens": 146261892.0, + "step": 2028 + }, + { + "epoch": 1.2659072289626654, + "grad_norm": 0.20355266332626343, + "learning_rate": 1.884e-07, + "loss": 0.2992, + "mean_token_accuracy": 0.9290516339242458, + "num_tokens": 146333604.0, + "step": 2029 + }, + { + "epoch": 1.2665314243358172, + "grad_norm": 0.25064101815223694, + "learning_rate": 1.88e-07, + "loss": 0.3072, + "mean_token_accuracy": 0.9287717677652836, + "num_tokens": 146409562.0, + "step": 2030 + }, + { + "epoch": 1.2671556197089688, + "grad_norm": 0.24189861118793488, + "learning_rate": 1.8759999999999999e-07, + "loss": 0.3426, + "mean_token_accuracy": 0.9180783703923225, + "num_tokens": 146478099.0, + "step": 2031 + }, + { + "epoch": 1.2677798150821207, + "grad_norm": 0.22071579098701477, + "learning_rate": 1.872e-07, + "loss": 0.305, + "mean_token_accuracy": 0.9228972792625427, + "num_tokens": 146548131.0, + "step": 2032 + }, + { + "epoch": 1.2684040104552725, + "grad_norm": 0.3475932776927948, + "learning_rate": 1.8679999999999998e-07, + "loss": 0.3038, + "mean_token_accuracy": 0.9273154400289059, + "num_tokens": 146618925.0, + "step": 2033 + }, + { + "epoch": 1.2690282058284243, + "grad_norm": 0.21673567593097687, + "learning_rate": 1.864e-07, + "loss": 0.3061, + "mean_token_accuracy": 0.9230624437332153, + "num_tokens": 146688808.0, + "step": 2034 + }, + { + "epoch": 1.2696524012015762, + "grad_norm": 0.2747207283973694, + "learning_rate": 1.86e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9272730797529221, + "num_tokens": 146758771.0, + "step": 2035 + }, + { + "epoch": 1.2702765965747278, + "grad_norm": 0.1846000850200653, + "learning_rate": 1.8559999999999997e-07, + "loss": 0.3137, + "mean_token_accuracy": 0.9251486249268055, + "num_tokens": 146830841.0, + "step": 2036 + }, + { + "epoch": 1.2709007919478796, + "grad_norm": 0.5002678036689758, + "learning_rate": 1.852e-07, + "loss": 0.281, + "mean_token_accuracy": 0.9297677241265774, + "num_tokens": 146901274.0, + "step": 2037 + }, + { + "epoch": 1.2715249873210315, + "grad_norm": 0.21767351031303406, + "learning_rate": 1.848e-07, + "loss": 0.3165, + "mean_token_accuracy": 0.9284128174185753, + "num_tokens": 146974074.0, + "step": 2038 + }, + { + "epoch": 1.2721491826941833, + "grad_norm": 0.21886971592903137, + "learning_rate": 1.844e-07, + "loss": 0.3077, + "mean_token_accuracy": 0.9282158277928829, + "num_tokens": 147041748.0, + "step": 2039 + }, + { + "epoch": 1.2727733780673351, + "grad_norm": 0.16090872883796692, + "learning_rate": 1.8399999999999998e-07, + "loss": 0.2777, + "mean_token_accuracy": 0.9322857297956944, + "num_tokens": 147113296.0, + "step": 2040 + }, + { + "epoch": 1.2733975734404868, + "grad_norm": 0.21585586667060852, + "learning_rate": 1.836e-07, + "loss": 0.3115, + "mean_token_accuracy": 0.9247662089765072, + "num_tokens": 147183407.0, + "step": 2041 + }, + { + "epoch": 1.2740217688136386, + "grad_norm": 0.22781971096992493, + "learning_rate": 1.832e-07, + "loss": 0.2816, + "mean_token_accuracy": 0.9332077316939831, + "num_tokens": 147258240.0, + "step": 2042 + }, + { + "epoch": 1.2746459641867904, + "grad_norm": 0.18347255885601044, + "learning_rate": 1.8279999999999997e-07, + "loss": 0.3066, + "mean_token_accuracy": 0.9244193732738495, + "num_tokens": 147328937.0, + "step": 2043 + }, + { + "epoch": 1.2752701595599423, + "grad_norm": 0.45873525738716125, + "learning_rate": 1.824e-07, + "loss": 0.294, + "mean_token_accuracy": 0.9291373752057552, + "num_tokens": 147400468.0, + "step": 2044 + }, + { + "epoch": 1.275894354933094, + "grad_norm": 0.22074832022190094, + "learning_rate": 1.82e-07, + "loss": 0.3145, + "mean_token_accuracy": 0.9228701442480087, + "num_tokens": 147469997.0, + "step": 2045 + }, + { + "epoch": 1.2765185503062457, + "grad_norm": 0.2615092992782593, + "learning_rate": 1.816e-07, + "loss": 0.2847, + "mean_token_accuracy": 0.9327182918787003, + "num_tokens": 147546041.0, + "step": 2046 + }, + { + "epoch": 1.2771427456793978, + "grad_norm": 0.3230634927749634, + "learning_rate": 1.8119999999999998e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.9214317575097084, + "num_tokens": 147614151.0, + "step": 2047 + }, + { + "epoch": 1.2777669410525494, + "grad_norm": 0.18710081279277802, + "learning_rate": 1.8079999999999998e-07, + "loss": 0.3048, + "mean_token_accuracy": 0.9230762906372547, + "num_tokens": 147687634.0, + "step": 2048 + }, + { + "epoch": 1.2783911364257012, + "grad_norm": 0.22710640728473663, + "learning_rate": 1.804e-07, + "loss": 0.2887, + "mean_token_accuracy": 0.9298210553824902, + "num_tokens": 147761758.0, + "step": 2049 + }, + { + "epoch": 1.279015331798853, + "grad_norm": 0.22693189978599548, + "learning_rate": 1.8e-07, + "loss": 0.2654, + "mean_token_accuracy": 0.9371523857116699, + "num_tokens": 147834065.0, + "step": 2050 + }, + { + "epoch": 1.279639527172005, + "grad_norm": 0.8788622617721558, + "learning_rate": 1.796e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9172456115484238, + "num_tokens": 147900054.0, + "step": 2051 + }, + { + "epoch": 1.2802637225451567, + "grad_norm": 0.24822445213794708, + "learning_rate": 1.792e-07, + "loss": 0.2871, + "mean_token_accuracy": 0.9277159534394741, + "num_tokens": 147966766.0, + "step": 2052 + }, + { + "epoch": 1.2808879179183084, + "grad_norm": 0.20177479088306427, + "learning_rate": 1.7879999999999999e-07, + "loss": 0.3395, + "mean_token_accuracy": 0.9181550033390522, + "num_tokens": 148036707.0, + "step": 2053 + }, + { + "epoch": 1.2815121132914602, + "grad_norm": 0.19135023653507233, + "learning_rate": 1.7839999999999998e-07, + "loss": 0.3228, + "mean_token_accuracy": 0.9183575175702572, + "num_tokens": 148105710.0, + "step": 2054 + }, + { + "epoch": 1.282136308664612, + "grad_norm": 0.1912209540605545, + "learning_rate": 1.7799999999999998e-07, + "loss": 0.2887, + "mean_token_accuracy": 0.9284338913857937, + "num_tokens": 148179697.0, + "step": 2055 + }, + { + "epoch": 1.2827605040377639, + "grad_norm": 0.21858327090740204, + "learning_rate": 1.776e-07, + "loss": 0.3033, + "mean_token_accuracy": 0.9243548139929771, + "num_tokens": 148249285.0, + "step": 2056 + }, + { + "epoch": 1.2833846994109157, + "grad_norm": 0.32965803146362305, + "learning_rate": 1.772e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9270556978881359, + "num_tokens": 148320397.0, + "step": 2057 + }, + { + "epoch": 1.2840088947840673, + "grad_norm": 0.19635984301567078, + "learning_rate": 1.768e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.9253975935280323, + "num_tokens": 148390378.0, + "step": 2058 + }, + { + "epoch": 1.2846330901572192, + "grad_norm": 0.20031481981277466, + "learning_rate": 1.764e-07, + "loss": 0.3138, + "mean_token_accuracy": 0.9247081764042377, + "num_tokens": 148465313.0, + "step": 2059 + }, + { + "epoch": 1.285257285530371, + "grad_norm": 0.3442493677139282, + "learning_rate": 1.76e-07, + "loss": 0.3255, + "mean_token_accuracy": 0.9157334379851818, + "num_tokens": 148535458.0, + "step": 2060 + }, + { + "epoch": 1.2858814809035228, + "grad_norm": 0.5641414523124695, + "learning_rate": 1.756e-07, + "loss": 0.3287, + "mean_token_accuracy": 0.9200550131499767, + "num_tokens": 148603394.0, + "step": 2061 + }, + { + "epoch": 1.2865056762766747, + "grad_norm": 0.31180840730667114, + "learning_rate": 1.7519999999999998e-07, + "loss": 0.3053, + "mean_token_accuracy": 0.9276639446616173, + "num_tokens": 148675445.0, + "step": 2062 + }, + { + "epoch": 1.2871298716498263, + "grad_norm": 0.27868202328681946, + "learning_rate": 1.748e-07, + "loss": 0.3262, + "mean_token_accuracy": 0.9205468781292439, + "num_tokens": 148746050.0, + "step": 2063 + }, + { + "epoch": 1.2877540670229781, + "grad_norm": 0.208776593208313, + "learning_rate": 1.744e-07, + "loss": 0.3213, + "mean_token_accuracy": 0.9230259507894516, + "num_tokens": 148816741.0, + "step": 2064 + }, + { + "epoch": 1.28837826239613, + "grad_norm": 0.2511376142501831, + "learning_rate": 1.7399999999999997e-07, + "loss": 0.3186, + "mean_token_accuracy": 0.9262756109237671, + "num_tokens": 148885477.0, + "step": 2065 + }, + { + "epoch": 1.2890024577692818, + "grad_norm": 0.19144724309444427, + "learning_rate": 1.736e-07, + "loss": 0.2884, + "mean_token_accuracy": 0.9278649464249611, + "num_tokens": 148957885.0, + "step": 2066 + }, + { + "epoch": 1.2896266531424336, + "grad_norm": 0.564781665802002, + "learning_rate": 1.732e-07, + "loss": 0.3061, + "mean_token_accuracy": 0.9259409867227077, + "num_tokens": 149033346.0, + "step": 2067 + }, + { + "epoch": 1.2902508485155852, + "grad_norm": 0.15809546411037445, + "learning_rate": 1.728e-07, + "loss": 0.289, + "mean_token_accuracy": 0.9320048168301582, + "num_tokens": 149103734.0, + "step": 2068 + }, + { + "epoch": 1.2908750438887373, + "grad_norm": 0.18684732913970947, + "learning_rate": 1.7239999999999998e-07, + "loss": 0.3046, + "mean_token_accuracy": 0.9255008287727833, + "num_tokens": 149178158.0, + "step": 2069 + }, + { + "epoch": 1.291499239261889, + "grad_norm": 0.3680233359336853, + "learning_rate": 1.7199999999999998e-07, + "loss": 0.2818, + "mean_token_accuracy": 0.9323186725378036, + "num_tokens": 149251568.0, + "step": 2070 + }, + { + "epoch": 1.2921234346350408, + "grad_norm": 1.9578653573989868, + "learning_rate": 1.716e-07, + "loss": 0.3028, + "mean_token_accuracy": 0.9281080961227417, + "num_tokens": 149328843.0, + "step": 2071 + }, + { + "epoch": 1.2927476300081926, + "grad_norm": 0.25955650210380554, + "learning_rate": 1.7119999999999997e-07, + "loss": 0.3156, + "mean_token_accuracy": 0.9275790825486183, + "num_tokens": 149402238.0, + "step": 2072 + }, + { + "epoch": 1.2933718253813444, + "grad_norm": 0.2280360609292984, + "learning_rate": 1.708e-07, + "loss": 0.2854, + "mean_token_accuracy": 0.9318882897496223, + "num_tokens": 149475284.0, + "step": 2073 + }, + { + "epoch": 1.2939960207544963, + "grad_norm": 0.9366874098777771, + "learning_rate": 1.704e-07, + "loss": 0.2957, + "mean_token_accuracy": 0.9322010166943073, + "num_tokens": 149547402.0, + "step": 2074 + }, + { + "epoch": 1.2946202161276479, + "grad_norm": 0.43038493394851685, + "learning_rate": 1.7000000000000001e-07, + "loss": 0.3171, + "mean_token_accuracy": 0.9240015558898449, + "num_tokens": 149613379.0, + "step": 2075 + }, + { + "epoch": 1.2952444115007997, + "grad_norm": 0.1956682801246643, + "learning_rate": 1.6959999999999998e-07, + "loss": 0.3038, + "mean_token_accuracy": 0.9293730519711971, + "num_tokens": 149685654.0, + "step": 2076 + }, + { + "epoch": 1.2958686068739516, + "grad_norm": 0.23349297046661377, + "learning_rate": 1.6919999999999998e-07, + "loss": 0.2685, + "mean_token_accuracy": 0.9319667965173721, + "num_tokens": 149760178.0, + "step": 2077 + }, + { + "epoch": 1.2964928022471034, + "grad_norm": 0.20997607707977295, + "learning_rate": 1.688e-07, + "loss": 0.27, + "mean_token_accuracy": 0.9343189746141434, + "num_tokens": 149834767.0, + "step": 2078 + }, + { + "epoch": 1.2971169976202552, + "grad_norm": 0.19894346594810486, + "learning_rate": 1.684e-07, + "loss": 0.3058, + "mean_token_accuracy": 0.9312025010585785, + "num_tokens": 149905314.0, + "step": 2079 + }, + { + "epoch": 1.2977411929934068, + "grad_norm": 0.2741308808326721, + "learning_rate": 1.68e-07, + "loss": 0.3156, + "mean_token_accuracy": 0.9254831857979298, + "num_tokens": 149976112.0, + "step": 2080 + }, + { + "epoch": 1.2983653883665587, + "grad_norm": 0.1983446180820465, + "learning_rate": 1.676e-07, + "loss": 0.298, + "mean_token_accuracy": 0.9264061711728573, + "num_tokens": 150047558.0, + "step": 2081 + }, + { + "epoch": 1.2989895837397105, + "grad_norm": 0.22013665735721588, + "learning_rate": 1.672e-07, + "loss": 0.2831, + "mean_token_accuracy": 0.9325968362390995, + "num_tokens": 150120660.0, + "step": 2082 + }, + { + "epoch": 1.2996137791128624, + "grad_norm": 0.2125433087348938, + "learning_rate": 1.6679999999999998e-07, + "loss": 0.3302, + "mean_token_accuracy": 0.9210727885365486, + "num_tokens": 150192781.0, + "step": 2083 + }, + { + "epoch": 1.3002379744860142, + "grad_norm": 0.29780057072639465, + "learning_rate": 1.6639999999999998e-07, + "loss": 0.3163, + "mean_token_accuracy": 0.9246529005467892, + "num_tokens": 150263914.0, + "step": 2084 + }, + { + "epoch": 1.3008621698591658, + "grad_norm": 0.26026323437690735, + "learning_rate": 1.66e-07, + "loss": 0.2827, + "mean_token_accuracy": 0.9344554543495178, + "num_tokens": 150338026.0, + "step": 2085 + }, + { + "epoch": 1.3014863652323176, + "grad_norm": 0.22109267115592957, + "learning_rate": 1.656e-07, + "loss": 0.292, + "mean_token_accuracy": 0.9248981326818466, + "num_tokens": 150409994.0, + "step": 2086 + }, + { + "epoch": 1.3021105606054695, + "grad_norm": 0.4244014322757721, + "learning_rate": 1.652e-07, + "loss": 0.2963, + "mean_token_accuracy": 0.9298700839281082, + "num_tokens": 150482844.0, + "step": 2087 + }, + { + "epoch": 1.3027347559786213, + "grad_norm": 0.3859269917011261, + "learning_rate": 1.648e-07, + "loss": 0.3255, + "mean_token_accuracy": 0.9226672649383545, + "num_tokens": 150553021.0, + "step": 2088 + }, + { + "epoch": 1.3033589513517732, + "grad_norm": 0.3091139793395996, + "learning_rate": 1.644e-07, + "loss": 0.3021, + "mean_token_accuracy": 0.9287936054170132, + "num_tokens": 150621826.0, + "step": 2089 + }, + { + "epoch": 1.3039831467249248, + "grad_norm": 0.15909993648529053, + "learning_rate": 1.64e-07, + "loss": 0.264, + "mean_token_accuracy": 0.9350255839526653, + "num_tokens": 150697923.0, + "step": 2090 + }, + { + "epoch": 1.3046073420980768, + "grad_norm": 0.5248836874961853, + "learning_rate": 1.6359999999999998e-07, + "loss": 0.3156, + "mean_token_accuracy": 0.9264041520655155, + "num_tokens": 150771254.0, + "step": 2091 + }, + { + "epoch": 1.3052315374712284, + "grad_norm": 0.17442071437835693, + "learning_rate": 1.632e-07, + "loss": 0.3246, + "mean_token_accuracy": 0.9226789958775043, + "num_tokens": 150844942.0, + "step": 2092 + }, + { + "epoch": 1.3058557328443803, + "grad_norm": 0.16829051077365875, + "learning_rate": 1.628e-07, + "loss": 0.2916, + "mean_token_accuracy": 0.9325590804219246, + "num_tokens": 150915971.0, + "step": 2093 + }, + { + "epoch": 1.3064799282175321, + "grad_norm": 0.26937010884284973, + "learning_rate": 1.6239999999999997e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9227982014417648, + "num_tokens": 150987450.0, + "step": 2094 + }, + { + "epoch": 1.307104123590684, + "grad_norm": 0.22856605052947998, + "learning_rate": 1.62e-07, + "loss": 0.2986, + "mean_token_accuracy": 0.928221482783556, + "num_tokens": 151058819.0, + "step": 2095 + }, + { + "epoch": 1.3077283189638358, + "grad_norm": 0.2924377918243408, + "learning_rate": 1.616e-07, + "loss": 0.3291, + "mean_token_accuracy": 0.9177773930132389, + "num_tokens": 151124946.0, + "step": 2096 + }, + { + "epoch": 1.3083525143369874, + "grad_norm": 0.4520706534385681, + "learning_rate": 1.6120000000000001e-07, + "loss": 0.2952, + "mean_token_accuracy": 0.9258535131812096, + "num_tokens": 151198914.0, + "step": 2097 + }, + { + "epoch": 1.3089767097101392, + "grad_norm": 0.17133979499340057, + "learning_rate": 1.6079999999999998e-07, + "loss": 0.2903, + "mean_token_accuracy": 0.9308430776000023, + "num_tokens": 151270779.0, + "step": 2098 + }, + { + "epoch": 1.309600905083291, + "grad_norm": 0.9098308682441711, + "learning_rate": 1.6039999999999998e-07, + "loss": 0.2827, + "mean_token_accuracy": 0.9270777814090252, + "num_tokens": 151345514.0, + "step": 2099 + }, + { + "epoch": 1.310225100456443, + "grad_norm": 0.6785410046577454, + "learning_rate": 1.6e-07, + "loss": 0.3119, + "mean_token_accuracy": 0.9248600862920284, + "num_tokens": 151417278.0, + "step": 2100 + }, + { + "epoch": 1.3108492958295948, + "grad_norm": 0.21963003277778625, + "learning_rate": 1.5959999999999997e-07, + "loss": 0.2785, + "mean_token_accuracy": 0.9353148974478245, + "num_tokens": 151491368.0, + "step": 2101 + }, + { + "epoch": 1.3114734912027464, + "grad_norm": 0.23689569532871246, + "learning_rate": 1.592e-07, + "loss": 0.2774, + "mean_token_accuracy": 0.9305559992790222, + "num_tokens": 151569985.0, + "step": 2102 + }, + { + "epoch": 1.3120976865758982, + "grad_norm": 0.17777898907661438, + "learning_rate": 1.588e-07, + "loss": 0.2929, + "mean_token_accuracy": 0.9286807589232922, + "num_tokens": 151642920.0, + "step": 2103 + }, + { + "epoch": 1.31272188194905, + "grad_norm": 0.21397370100021362, + "learning_rate": 1.5840000000000002e-07, + "loss": 0.2914, + "mean_token_accuracy": 0.9290468208491802, + "num_tokens": 151717576.0, + "step": 2104 + }, + { + "epoch": 1.3133460773222019, + "grad_norm": 1.015663504600525, + "learning_rate": 1.5799999999999999e-07, + "loss": 0.2816, + "mean_token_accuracy": 0.9312173835933208, + "num_tokens": 151791052.0, + "step": 2105 + }, + { + "epoch": 1.3139702726953537, + "grad_norm": 0.34604233503341675, + "learning_rate": 1.5759999999999998e-07, + "loss": 0.2804, + "mean_token_accuracy": 0.9319759644567966, + "num_tokens": 151863510.0, + "step": 2106 + }, + { + "epoch": 1.3145944680685053, + "grad_norm": 0.18621015548706055, + "learning_rate": 1.572e-07, + "loss": 0.3091, + "mean_token_accuracy": 0.9237174615263939, + "num_tokens": 151933389.0, + "step": 2107 + }, + { + "epoch": 1.3152186634416572, + "grad_norm": 0.22556157410144806, + "learning_rate": 1.5679999999999997e-07, + "loss": 0.2952, + "mean_token_accuracy": 0.9274221397936344, + "num_tokens": 152006030.0, + "step": 2108 + }, + { + "epoch": 1.315842858814809, + "grad_norm": 0.2149975448846817, + "learning_rate": 1.564e-07, + "loss": 0.3354, + "mean_token_accuracy": 0.919729620218277, + "num_tokens": 152077215.0, + "step": 2109 + }, + { + "epoch": 1.3164670541879608, + "grad_norm": 9.403190612792969, + "learning_rate": 1.56e-07, + "loss": 0.3718, + "mean_token_accuracy": 0.9092171378433704, + "num_tokens": 152141108.0, + "step": 2110 + }, + { + "epoch": 1.3170912495611127, + "grad_norm": 0.263650506734848, + "learning_rate": 1.556e-07, + "loss": 0.3308, + "mean_token_accuracy": 0.9200950637459755, + "num_tokens": 152210735.0, + "step": 2111 + }, + { + "epoch": 1.3177154449342643, + "grad_norm": 0.22155408561229706, + "learning_rate": 1.552e-07, + "loss": 0.2762, + "mean_token_accuracy": 0.9350767470896244, + "num_tokens": 152287520.0, + "step": 2112 + }, + { + "epoch": 1.3183396403074163, + "grad_norm": 0.2379574179649353, + "learning_rate": 1.5479999999999998e-07, + "loss": 0.2939, + "mean_token_accuracy": 0.9297707825899124, + "num_tokens": 152358540.0, + "step": 2113 + }, + { + "epoch": 1.318963835680568, + "grad_norm": 0.5658234357833862, + "learning_rate": 1.544e-07, + "loss": 0.2916, + "mean_token_accuracy": 0.9281753525137901, + "num_tokens": 152431041.0, + "step": 2114 + }, + { + "epoch": 1.3195880310537198, + "grad_norm": 0.20936092734336853, + "learning_rate": 1.54e-07, + "loss": 0.2668, + "mean_token_accuracy": 0.9345216117799282, + "num_tokens": 152506308.0, + "step": 2115 + }, + { + "epoch": 1.3202122264268716, + "grad_norm": 0.4310249388217926, + "learning_rate": 1.5359999999999997e-07, + "loss": 0.2799, + "mean_token_accuracy": 0.9288446493446827, + "num_tokens": 152576048.0, + "step": 2116 + }, + { + "epoch": 1.3208364218000235, + "grad_norm": 0.24247108399868011, + "learning_rate": 1.532e-07, + "loss": 0.2869, + "mean_token_accuracy": 0.932581938803196, + "num_tokens": 152649600.0, + "step": 2117 + }, + { + "epoch": 1.3214606171731753, + "grad_norm": 0.2325311154127121, + "learning_rate": 1.528e-07, + "loss": 0.3076, + "mean_token_accuracy": 0.9238797873258591, + "num_tokens": 152718345.0, + "step": 2118 + }, + { + "epoch": 1.322084812546327, + "grad_norm": 0.21164049208164215, + "learning_rate": 1.524e-07, + "loss": 0.3037, + "mean_token_accuracy": 0.9276653975248337, + "num_tokens": 152785577.0, + "step": 2119 + }, + { + "epoch": 1.3227090079194788, + "grad_norm": 0.8483963012695312, + "learning_rate": 1.5199999999999998e-07, + "loss": 0.3031, + "mean_token_accuracy": 0.926462609320879, + "num_tokens": 152856019.0, + "step": 2120 + }, + { + "epoch": 1.3233332032926306, + "grad_norm": 0.5104178786277771, + "learning_rate": 1.516e-07, + "loss": 0.2984, + "mean_token_accuracy": 0.9252760335803032, + "num_tokens": 152929873.0, + "step": 2121 + }, + { + "epoch": 1.3239573986657824, + "grad_norm": 0.35099661350250244, + "learning_rate": 1.512e-07, + "loss": 0.3032, + "mean_token_accuracy": 0.9267092943191528, + "num_tokens": 153006529.0, + "step": 2122 + }, + { + "epoch": 1.3245815940389343, + "grad_norm": 1.313974142074585, + "learning_rate": 1.5079999999999997e-07, + "loss": 0.2758, + "mean_token_accuracy": 0.9307156130671501, + "num_tokens": 153081089.0, + "step": 2123 + }, + { + "epoch": 1.3252057894120859, + "grad_norm": 0.2720471918582916, + "learning_rate": 1.504e-07, + "loss": 0.3299, + "mean_token_accuracy": 0.9203598834574223, + "num_tokens": 153150829.0, + "step": 2124 + }, + { + "epoch": 1.3258299847852377, + "grad_norm": 0.21299681067466736, + "learning_rate": 1.5e-07, + "loss": 0.2883, + "mean_token_accuracy": 0.9276672825217247, + "num_tokens": 153221559.0, + "step": 2125 + }, + { + "epoch": 1.3264541801583896, + "grad_norm": 0.28736424446105957, + "learning_rate": 1.4960000000000002e-07, + "loss": 0.2991, + "mean_token_accuracy": 0.9271920546889305, + "num_tokens": 153292424.0, + "step": 2126 + }, + { + "epoch": 1.3270783755315414, + "grad_norm": 0.22134265303611755, + "learning_rate": 1.4919999999999999e-07, + "loss": 0.3239, + "mean_token_accuracy": 0.9231026321649551, + "num_tokens": 153361028.0, + "step": 2127 + }, + { + "epoch": 1.3277025709046932, + "grad_norm": 0.19726863503456116, + "learning_rate": 1.4879999999999998e-07, + "loss": 0.2965, + "mean_token_accuracy": 0.9272997081279755, + "num_tokens": 153433283.0, + "step": 2128 + }, + { + "epoch": 1.3283267662778449, + "grad_norm": 0.31667250394821167, + "learning_rate": 1.484e-07, + "loss": 0.2778, + "mean_token_accuracy": 0.9333846494555473, + "num_tokens": 153510819.0, + "step": 2129 + }, + { + "epoch": 1.3289509616509967, + "grad_norm": 0.2478756308555603, + "learning_rate": 1.4799999999999998e-07, + "loss": 0.3141, + "mean_token_accuracy": 0.9258810244500637, + "num_tokens": 153583482.0, + "step": 2130 + }, + { + "epoch": 1.3295751570241485, + "grad_norm": 0.21664753556251526, + "learning_rate": 1.476e-07, + "loss": 0.3326, + "mean_token_accuracy": 0.9223082773387432, + "num_tokens": 153655763.0, + "step": 2131 + }, + { + "epoch": 1.3301993523973004, + "grad_norm": 0.2687392830848694, + "learning_rate": 1.472e-07, + "loss": 0.2614, + "mean_token_accuracy": 0.9370056428015232, + "num_tokens": 153730517.0, + "step": 2132 + }, + { + "epoch": 1.3308235477704522, + "grad_norm": 0.1961941123008728, + "learning_rate": 1.4680000000000002e-07, + "loss": 0.3082, + "mean_token_accuracy": 0.9235725738108158, + "num_tokens": 153800526.0, + "step": 2133 + }, + { + "epoch": 1.3314477431436038, + "grad_norm": 0.7838155031204224, + "learning_rate": 1.464e-07, + "loss": 0.3278, + "mean_token_accuracy": 0.9198356866836548, + "num_tokens": 153866543.0, + "step": 2134 + }, + { + "epoch": 1.3320719385167559, + "grad_norm": 0.18363460898399353, + "learning_rate": 1.4599999999999998e-07, + "loss": 0.3088, + "mean_token_accuracy": 0.9227837659418583, + "num_tokens": 153934025.0, + "step": 2135 + }, + { + "epoch": 1.3326961338899075, + "grad_norm": 0.6120743155479431, + "learning_rate": 1.456e-07, + "loss": 0.2919, + "mean_token_accuracy": 0.9271253868937492, + "num_tokens": 154009908.0, + "step": 2136 + }, + { + "epoch": 1.3333203292630593, + "grad_norm": 0.1929367184638977, + "learning_rate": 1.4519999999999998e-07, + "loss": 0.2925, + "mean_token_accuracy": 0.9294635653495789, + "num_tokens": 154077630.0, + "step": 2137 + }, + { + "epoch": 1.3339445246362112, + "grad_norm": 0.2548283338546753, + "learning_rate": 1.448e-07, + "loss": 0.3142, + "mean_token_accuracy": 0.926267359405756, + "num_tokens": 154149510.0, + "step": 2138 + }, + { + "epoch": 1.334568720009363, + "grad_norm": 0.22463463246822357, + "learning_rate": 1.444e-07, + "loss": 0.2968, + "mean_token_accuracy": 0.9292173869907856, + "num_tokens": 154219937.0, + "step": 2139 + }, + { + "epoch": 1.3351929153825148, + "grad_norm": 0.45183709263801575, + "learning_rate": 1.44e-07, + "loss": 0.2755, + "mean_token_accuracy": 0.9320598132908344, + "num_tokens": 154291302.0, + "step": 2140 + }, + { + "epoch": 1.3358171107556664, + "grad_norm": 0.29061204195022583, + "learning_rate": 1.436e-07, + "loss": 0.304, + "mean_token_accuracy": 0.9259826615452766, + "num_tokens": 154362433.0, + "step": 2141 + }, + { + "epoch": 1.3364413061288183, + "grad_norm": 0.22951878607273102, + "learning_rate": 1.4319999999999999e-07, + "loss": 0.2662, + "mean_token_accuracy": 0.9369789212942123, + "num_tokens": 154436702.0, + "step": 2142 + }, + { + "epoch": 1.3370655015019701, + "grad_norm": 0.22993721067905426, + "learning_rate": 1.428e-07, + "loss": 0.3212, + "mean_token_accuracy": 0.9234988242387772, + "num_tokens": 154506922.0, + "step": 2143 + }, + { + "epoch": 1.337689696875122, + "grad_norm": 0.2498965859413147, + "learning_rate": 1.424e-07, + "loss": 0.3374, + "mean_token_accuracy": 0.9187369234859943, + "num_tokens": 154574233.0, + "step": 2144 + }, + { + "epoch": 1.3383138922482738, + "grad_norm": 0.24555444717407227, + "learning_rate": 1.4199999999999997e-07, + "loss": 0.277, + "mean_token_accuracy": 0.9320419579744339, + "num_tokens": 154647799.0, + "step": 2145 + }, + { + "epoch": 1.3389380876214254, + "grad_norm": 0.2576783001422882, + "learning_rate": 1.416e-07, + "loss": 0.3079, + "mean_token_accuracy": 0.9255846589803696, + "num_tokens": 154713489.0, + "step": 2146 + }, + { + "epoch": 1.3395622829945772, + "grad_norm": 0.2272663563489914, + "learning_rate": 1.412e-07, + "loss": 0.3194, + "mean_token_accuracy": 0.925260879099369, + "num_tokens": 154785444.0, + "step": 2147 + }, + { + "epoch": 1.340186478367729, + "grad_norm": 0.2670719623565674, + "learning_rate": 1.408e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9282893985509872, + "num_tokens": 154858448.0, + "step": 2148 + }, + { + "epoch": 1.340810673740881, + "grad_norm": 0.15122443437576294, + "learning_rate": 1.4039999999999999e-07, + "loss": 0.2951, + "mean_token_accuracy": 0.9218646474182606, + "num_tokens": 154929368.0, + "step": 2149 + }, + { + "epoch": 1.3414348691140328, + "grad_norm": 0.21860376000404358, + "learning_rate": 1.4e-07, + "loss": 0.2894, + "mean_token_accuracy": 0.9332394003868103, + "num_tokens": 154999396.0, + "step": 2150 + }, + { + "epoch": 1.3420590644871844, + "grad_norm": 0.4769876003265381, + "learning_rate": 1.396e-07, + "loss": 0.2692, + "mean_token_accuracy": 0.9352481439709663, + "num_tokens": 155073574.0, + "step": 2151 + }, + { + "epoch": 1.3426832598603362, + "grad_norm": 0.23993012309074402, + "learning_rate": 1.3919999999999998e-07, + "loss": 0.2681, + "mean_token_accuracy": 0.9331515170633793, + "num_tokens": 155146166.0, + "step": 2152 + }, + { + "epoch": 1.343307455233488, + "grad_norm": 0.2893688380718231, + "learning_rate": 1.388e-07, + "loss": 0.2902, + "mean_token_accuracy": 0.9312236532568932, + "num_tokens": 155222517.0, + "step": 2153 + }, + { + "epoch": 1.3439316506066399, + "grad_norm": 0.21530096232891083, + "learning_rate": 1.384e-07, + "loss": 0.3127, + "mean_token_accuracy": 0.9234160892665386, + "num_tokens": 155292557.0, + "step": 2154 + }, + { + "epoch": 1.3445558459797917, + "grad_norm": 0.19764351844787598, + "learning_rate": 1.3800000000000002e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.9180204123258591, + "num_tokens": 155360700.0, + "step": 2155 + }, + { + "epoch": 1.3451800413529433, + "grad_norm": 0.5522955656051636, + "learning_rate": 1.376e-07, + "loss": 0.3048, + "mean_token_accuracy": 0.9280840419232845, + "num_tokens": 155434999.0, + "step": 2156 + }, + { + "epoch": 1.3458042367260954, + "grad_norm": 0.42036890983581543, + "learning_rate": 1.3719999999999998e-07, + "loss": 0.2765, + "mean_token_accuracy": 0.9324880056083202, + "num_tokens": 155511607.0, + "step": 2157 + }, + { + "epoch": 1.346428432099247, + "grad_norm": 0.24666228890419006, + "learning_rate": 1.368e-07, + "loss": 0.2789, + "mean_token_accuracy": 0.9359610788524151, + "num_tokens": 155586696.0, + "step": 2158 + }, + { + "epoch": 1.3470526274723988, + "grad_norm": 0.26400744915008545, + "learning_rate": 1.3639999999999998e-07, + "loss": 0.3111, + "mean_token_accuracy": 0.9264621920883656, + "num_tokens": 155657130.0, + "step": 2159 + }, + { + "epoch": 1.3476768228455507, + "grad_norm": 0.3401220142841339, + "learning_rate": 1.36e-07, + "loss": 0.2809, + "mean_token_accuracy": 0.9319204911589622, + "num_tokens": 155732560.0, + "step": 2160 + }, + { + "epoch": 1.3483010182187025, + "grad_norm": 0.37774530053138733, + "learning_rate": 1.356e-07, + "loss": 0.2834, + "mean_token_accuracy": 0.9271791093051434, + "num_tokens": 155809781.0, + "step": 2161 + }, + { + "epoch": 1.3489252135918544, + "grad_norm": 0.30649739503860474, + "learning_rate": 1.352e-07, + "loss": 0.2832, + "mean_token_accuracy": 0.9330406151711941, + "num_tokens": 155882837.0, + "step": 2162 + }, + { + "epoch": 1.349549408965006, + "grad_norm": 0.25023749470710754, + "learning_rate": 1.348e-07, + "loss": 0.303, + "mean_token_accuracy": 0.9288054890930653, + "num_tokens": 155956459.0, + "step": 2163 + }, + { + "epoch": 1.3501736043381578, + "grad_norm": 0.19213595986366272, + "learning_rate": 1.3439999999999999e-07, + "loss": 0.2661, + "mean_token_accuracy": 0.9344171173870564, + "num_tokens": 156033376.0, + "step": 2164 + }, + { + "epoch": 1.3507977997113096, + "grad_norm": 0.24009476602077484, + "learning_rate": 1.34e-07, + "loss": 0.3165, + "mean_token_accuracy": 0.9263790436089039, + "num_tokens": 156104628.0, + "step": 2165 + }, + { + "epoch": 1.3514219950844615, + "grad_norm": 0.22480788826942444, + "learning_rate": 1.3359999999999998e-07, + "loss": 0.2977, + "mean_token_accuracy": 0.9272490292787552, + "num_tokens": 156175308.0, + "step": 2166 + }, + { + "epoch": 1.3520461904576133, + "grad_norm": 0.2095908224582672, + "learning_rate": 1.332e-07, + "loss": 0.3119, + "mean_token_accuracy": 0.9243741296231747, + "num_tokens": 156248213.0, + "step": 2167 + }, + { + "epoch": 1.352670385830765, + "grad_norm": 0.2671331465244293, + "learning_rate": 1.328e-07, + "loss": 0.3124, + "mean_token_accuracy": 0.9283413030207157, + "num_tokens": 156320905.0, + "step": 2168 + }, + { + "epoch": 1.3532945812039168, + "grad_norm": 0.26369500160217285, + "learning_rate": 1.324e-07, + "loss": 0.3858, + "mean_token_accuracy": 0.9072173349559307, + "num_tokens": 156386383.0, + "step": 2169 + }, + { + "epoch": 1.3539187765770686, + "grad_norm": 0.2236408293247223, + "learning_rate": 1.32e-07, + "loss": 0.3279, + "mean_token_accuracy": 0.9234435707330704, + "num_tokens": 156455347.0, + "step": 2170 + }, + { + "epoch": 1.3545429719502204, + "grad_norm": 0.7530428171157837, + "learning_rate": 1.316e-07, + "loss": 0.3071, + "mean_token_accuracy": 0.9265705198049545, + "num_tokens": 156525479.0, + "step": 2171 + }, + { + "epoch": 1.3551671673233723, + "grad_norm": 0.37952491641044617, + "learning_rate": 1.312e-07, + "loss": 0.3016, + "mean_token_accuracy": 0.9313018061220646, + "num_tokens": 156600345.0, + "step": 2172 + }, + { + "epoch": 1.355791362696524, + "grad_norm": 0.23359838128089905, + "learning_rate": 1.308e-07, + "loss": 0.2675, + "mean_token_accuracy": 0.9341041371226311, + "num_tokens": 156672809.0, + "step": 2173 + }, + { + "epoch": 1.3564155580696757, + "grad_norm": 0.2945937514305115, + "learning_rate": 1.3039999999999998e-07, + "loss": 0.2992, + "mean_token_accuracy": 0.9272616133093834, + "num_tokens": 156749100.0, + "step": 2174 + }, + { + "epoch": 1.3570397534428276, + "grad_norm": 0.20929881930351257, + "learning_rate": 1.3e-07, + "loss": 0.3034, + "mean_token_accuracy": 0.926336020231247, + "num_tokens": 156818508.0, + "step": 2175 + }, + { + "epoch": 1.3576639488159794, + "grad_norm": 0.3215138018131256, + "learning_rate": 1.296e-07, + "loss": 0.3013, + "mean_token_accuracy": 0.9266573041677475, + "num_tokens": 156890526.0, + "step": 2176 + }, + { + "epoch": 1.3582881441891312, + "grad_norm": 0.24577008187770844, + "learning_rate": 1.292e-07, + "loss": 0.294, + "mean_token_accuracy": 0.9303817674517632, + "num_tokens": 156963687.0, + "step": 2177 + }, + { + "epoch": 1.3589123395622829, + "grad_norm": 0.2476331740617752, + "learning_rate": 1.288e-07, + "loss": 0.2935, + "mean_token_accuracy": 0.9325810559093952, + "num_tokens": 157038870.0, + "step": 2178 + }, + { + "epoch": 1.359536534935435, + "grad_norm": 0.21037732064723969, + "learning_rate": 1.2839999999999999e-07, + "loss": 0.3493, + "mean_token_accuracy": 0.9125947952270508, + "num_tokens": 157103839.0, + "step": 2179 + }, + { + "epoch": 1.3601607303085865, + "grad_norm": 0.24752464890480042, + "learning_rate": 1.28e-07, + "loss": 0.2735, + "mean_token_accuracy": 0.9328849799931049, + "num_tokens": 157181538.0, + "step": 2180 + }, + { + "epoch": 1.3607849256817384, + "grad_norm": 0.29659977555274963, + "learning_rate": 1.2759999999999998e-07, + "loss": 0.3065, + "mean_token_accuracy": 0.9245304353535175, + "num_tokens": 157252546.0, + "step": 2181 + }, + { + "epoch": 1.3614091210548902, + "grad_norm": 0.18353208899497986, + "learning_rate": 1.272e-07, + "loss": 0.2911, + "mean_token_accuracy": 0.9291892163455486, + "num_tokens": 157322881.0, + "step": 2182 + }, + { + "epoch": 1.362033316428042, + "grad_norm": 0.26438790559768677, + "learning_rate": 1.268e-07, + "loss": 0.2773, + "mean_token_accuracy": 0.9319265112280846, + "num_tokens": 157398144.0, + "step": 2183 + }, + { + "epoch": 1.3626575118011939, + "grad_norm": 0.2609936594963074, + "learning_rate": 1.264e-07, + "loss": 0.264, + "mean_token_accuracy": 0.9359213188290596, + "num_tokens": 157470683.0, + "step": 2184 + }, + { + "epoch": 1.3632817071743455, + "grad_norm": 0.29164737462997437, + "learning_rate": 1.26e-07, + "loss": 0.3337, + "mean_token_accuracy": 0.9173795767128468, + "num_tokens": 157542270.0, + "step": 2185 + }, + { + "epoch": 1.3639059025474973, + "grad_norm": 0.16872219741344452, + "learning_rate": 1.2559999999999999e-07, + "loss": 0.2923, + "mean_token_accuracy": 0.9292659275233746, + "num_tokens": 157621134.0, + "step": 2186 + }, + { + "epoch": 1.3645300979206492, + "grad_norm": 0.2842298150062561, + "learning_rate": 1.252e-07, + "loss": 0.2972, + "mean_token_accuracy": 0.9280747696757317, + "num_tokens": 157695025.0, + "step": 2187 + }, + { + "epoch": 1.365154293293801, + "grad_norm": 0.2848183810710907, + "learning_rate": 1.2479999999999998e-07, + "loss": 0.3429, + "mean_token_accuracy": 0.9200753830373287, + "num_tokens": 157763808.0, + "step": 2188 + }, + { + "epoch": 1.3657784886669528, + "grad_norm": 0.25700223445892334, + "learning_rate": 1.244e-07, + "loss": 0.3357, + "mean_token_accuracy": 0.9219502955675125, + "num_tokens": 157833020.0, + "step": 2189 + }, + { + "epoch": 1.3664026840401045, + "grad_norm": 0.5977392792701721, + "learning_rate": 1.24e-07, + "loss": 0.3131, + "mean_token_accuracy": 0.9267587289214134, + "num_tokens": 157905332.0, + "step": 2190 + }, + { + "epoch": 1.3670268794132563, + "grad_norm": 0.16885489225387573, + "learning_rate": 1.236e-07, + "loss": 0.3057, + "mean_token_accuracy": 0.9278435558080673, + "num_tokens": 157977246.0, + "step": 2191 + }, + { + "epoch": 1.3676510747864081, + "grad_norm": 0.19773730635643005, + "learning_rate": 1.232e-07, + "loss": 0.2988, + "mean_token_accuracy": 0.9293907172977924, + "num_tokens": 158049594.0, + "step": 2192 + }, + { + "epoch": 1.36827527015956, + "grad_norm": 0.2348225712776184, + "learning_rate": 1.228e-07, + "loss": 0.2573, + "mean_token_accuracy": 0.9369558803737164, + "num_tokens": 158123723.0, + "step": 2193 + }, + { + "epoch": 1.3688994655327118, + "grad_norm": 0.17237122356891632, + "learning_rate": 1.2239999999999998e-07, + "loss": 0.2874, + "mean_token_accuracy": 0.9313949383795261, + "num_tokens": 158194467.0, + "step": 2194 + }, + { + "epoch": 1.3695236609058634, + "grad_norm": 0.18933625519275665, + "learning_rate": 1.2199999999999998e-07, + "loss": 0.2949, + "mean_token_accuracy": 0.9312510304152966, + "num_tokens": 158265526.0, + "step": 2195 + }, + { + "epoch": 1.3701478562790153, + "grad_norm": 0.5985440611839294, + "learning_rate": 1.216e-07, + "loss": 0.3264, + "mean_token_accuracy": 0.9151214361190796, + "num_tokens": 158335125.0, + "step": 2196 + }, + { + "epoch": 1.370772051652167, + "grad_norm": 0.24212469160556793, + "learning_rate": 1.212e-07, + "loss": 0.327, + "mean_token_accuracy": 0.9190125912427902, + "num_tokens": 158406468.0, + "step": 2197 + }, + { + "epoch": 1.371396247025319, + "grad_norm": 0.44599848985671997, + "learning_rate": 1.208e-07, + "loss": 0.2902, + "mean_token_accuracy": 0.9317015297710896, + "num_tokens": 158478646.0, + "step": 2198 + }, + { + "epoch": 1.3720204423984708, + "grad_norm": 0.5853514671325684, + "learning_rate": 1.204e-07, + "loss": 0.2615, + "mean_token_accuracy": 0.9359362572431564, + "num_tokens": 158555932.0, + "step": 2199 + }, + { + "epoch": 1.3726446377716226, + "grad_norm": 0.23210901021957397, + "learning_rate": 1.2e-07, + "loss": 0.2875, + "mean_token_accuracy": 0.9335306882858276, + "num_tokens": 158632868.0, + "step": 2200 + }, + { + "epoch": 1.3732688331447744, + "grad_norm": 0.25775545835494995, + "learning_rate": 1.1959999999999999e-07, + "loss": 0.2915, + "mean_token_accuracy": 0.931880209594965, + "num_tokens": 158704854.0, + "step": 2201 + }, + { + "epoch": 1.373893028517926, + "grad_norm": 0.27723443508148193, + "learning_rate": 1.192e-07, + "loss": 0.325, + "mean_token_accuracy": 0.9215355105698109, + "num_tokens": 158773856.0, + "step": 2202 + }, + { + "epoch": 1.3745172238910779, + "grad_norm": 0.17581932246685028, + "learning_rate": 1.1879999999999999e-07, + "loss": 0.2889, + "mean_token_accuracy": 0.9344906844198704, + "num_tokens": 158847710.0, + "step": 2203 + }, + { + "epoch": 1.3751414192642297, + "grad_norm": 0.26737263798713684, + "learning_rate": 1.184e-07, + "loss": 0.315, + "mean_token_accuracy": 0.9213084168732166, + "num_tokens": 158916705.0, + "step": 2204 + }, + { + "epoch": 1.3757656146373816, + "grad_norm": 0.33590206503868103, + "learning_rate": 1.1799999999999998e-07, + "loss": 0.2809, + "mean_token_accuracy": 0.9333944730460644, + "num_tokens": 158990112.0, + "step": 2205 + }, + { + "epoch": 1.3763898100105334, + "grad_norm": 0.2264554500579834, + "learning_rate": 1.176e-07, + "loss": 0.2773, + "mean_token_accuracy": 0.9314668700098991, + "num_tokens": 159059560.0, + "step": 2206 + }, + { + "epoch": 1.377014005383685, + "grad_norm": 0.22964994609355927, + "learning_rate": 1.1719999999999999e-07, + "loss": 0.2986, + "mean_token_accuracy": 0.9274906925857067, + "num_tokens": 159128889.0, + "step": 2207 + }, + { + "epoch": 1.3776382007568369, + "grad_norm": 0.17764867842197418, + "learning_rate": 1.168e-07, + "loss": 0.3372, + "mean_token_accuracy": 0.9147690981626511, + "num_tokens": 159194234.0, + "step": 2208 + }, + { + "epoch": 1.3782623961299887, + "grad_norm": 0.2533983290195465, + "learning_rate": 1.164e-07, + "loss": 0.2702, + "mean_token_accuracy": 0.9339640624821186, + "num_tokens": 159270910.0, + "step": 2209 + }, + { + "epoch": 1.3788865915031405, + "grad_norm": 0.4699341058731079, + "learning_rate": 1.16e-07, + "loss": 0.2704, + "mean_token_accuracy": 0.933040477335453, + "num_tokens": 159343382.0, + "step": 2210 + }, + { + "epoch": 1.3795107868762924, + "grad_norm": 0.23780974745750427, + "learning_rate": 1.1559999999999999e-07, + "loss": 0.3114, + "mean_token_accuracy": 0.9252216182649136, + "num_tokens": 159412549.0, + "step": 2211 + }, + { + "epoch": 1.380134982249444, + "grad_norm": 0.2815774083137512, + "learning_rate": 1.1519999999999999e-07, + "loss": 0.3176, + "mean_token_accuracy": 0.924490712583065, + "num_tokens": 159482707.0, + "step": 2212 + }, + { + "epoch": 1.3807591776225958, + "grad_norm": 0.3909592628479004, + "learning_rate": 1.148e-07, + "loss": 0.288, + "mean_token_accuracy": 0.9303807690739632, + "num_tokens": 159550771.0, + "step": 2213 + }, + { + "epoch": 1.3813833729957476, + "grad_norm": 0.6003786325454712, + "learning_rate": 1.1439999999999999e-07, + "loss": 0.3073, + "mean_token_accuracy": 0.9280496649444103, + "num_tokens": 159621298.0, + "step": 2214 + }, + { + "epoch": 1.3820075683688995, + "grad_norm": 0.23835577070713043, + "learning_rate": 1.14e-07, + "loss": 0.3094, + "mean_token_accuracy": 0.9234599806368351, + "num_tokens": 159690541.0, + "step": 2215 + }, + { + "epoch": 1.3826317637420513, + "grad_norm": 0.21615497767925262, + "learning_rate": 1.136e-07, + "loss": 0.2833, + "mean_token_accuracy": 0.9297896847128868, + "num_tokens": 159761960.0, + "step": 2216 + }, + { + "epoch": 1.383255959115203, + "grad_norm": 0.31490159034729004, + "learning_rate": 1.132e-07, + "loss": 0.2437, + "mean_token_accuracy": 0.9410693719983101, + "num_tokens": 159835282.0, + "step": 2217 + }, + { + "epoch": 1.3838801544883548, + "grad_norm": 0.18832997977733612, + "learning_rate": 1.1279999999999999e-07, + "loss": 0.2697, + "mean_token_accuracy": 0.9355428852140903, + "num_tokens": 159909642.0, + "step": 2218 + }, + { + "epoch": 1.3845043498615066, + "grad_norm": 0.21366973221302032, + "learning_rate": 1.124e-07, + "loss": 0.2769, + "mean_token_accuracy": 0.9334382563829422, + "num_tokens": 159979315.0, + "step": 2219 + }, + { + "epoch": 1.3851285452346584, + "grad_norm": 0.3905908465385437, + "learning_rate": 1.12e-07, + "loss": 0.3064, + "mean_token_accuracy": 0.9230459295213223, + "num_tokens": 160049273.0, + "step": 2220 + }, + { + "epoch": 1.3857527406078103, + "grad_norm": 0.25763818621635437, + "learning_rate": 1.116e-07, + "loss": 0.3061, + "mean_token_accuracy": 0.9226940423250198, + "num_tokens": 160120480.0, + "step": 2221 + }, + { + "epoch": 1.3863769359809621, + "grad_norm": 0.8387280702590942, + "learning_rate": 1.1119999999999999e-07, + "loss": 0.2855, + "mean_token_accuracy": 0.9295088350772858, + "num_tokens": 160194569.0, + "step": 2222 + }, + { + "epoch": 1.387001131354114, + "grad_norm": 0.25023144483566284, + "learning_rate": 1.1079999999999999e-07, + "loss": 0.3058, + "mean_token_accuracy": 0.9281865321099758, + "num_tokens": 160265174.0, + "step": 2223 + }, + { + "epoch": 1.3876253267272656, + "grad_norm": 0.26264578104019165, + "learning_rate": 1.104e-07, + "loss": 0.2936, + "mean_token_accuracy": 0.93183284252882, + "num_tokens": 160341162.0, + "step": 2224 + }, + { + "epoch": 1.3882495221004174, + "grad_norm": 0.30447623133659363, + "learning_rate": 1.0999999999999999e-07, + "loss": 0.3131, + "mean_token_accuracy": 0.9271162450313568, + "num_tokens": 160414124.0, + "step": 2225 + }, + { + "epoch": 1.3888737174735692, + "grad_norm": 0.3614172637462616, + "learning_rate": 1.096e-07, + "loss": 0.2695, + "mean_token_accuracy": 0.9353512935340405, + "num_tokens": 160487914.0, + "step": 2226 + }, + { + "epoch": 1.389497912846721, + "grad_norm": 0.31334587931632996, + "learning_rate": 1.092e-07, + "loss": 0.3037, + "mean_token_accuracy": 0.926281027495861, + "num_tokens": 160556228.0, + "step": 2227 + }, + { + "epoch": 1.390122108219873, + "grad_norm": 0.4954676926136017, + "learning_rate": 1.088e-07, + "loss": 0.3177, + "mean_token_accuracy": 0.9216667301952839, + "num_tokens": 160623318.0, + "step": 2228 + }, + { + "epoch": 1.3907463035930245, + "grad_norm": 0.2650703489780426, + "learning_rate": 1.0839999999999999e-07, + "loss": 0.2895, + "mean_token_accuracy": 0.9298974573612213, + "num_tokens": 160697409.0, + "step": 2229 + }, + { + "epoch": 1.3913704989661764, + "grad_norm": 0.2347315549850464, + "learning_rate": 1.0799999999999999e-07, + "loss": 0.3005, + "mean_token_accuracy": 0.9274422004818916, + "num_tokens": 160770842.0, + "step": 2230 + }, + { + "epoch": 1.3919946943393282, + "grad_norm": 2.1604340076446533, + "learning_rate": 1.076e-07, + "loss": 0.2977, + "mean_token_accuracy": 0.9286009259521961, + "num_tokens": 160844333.0, + "step": 2231 + }, + { + "epoch": 1.39261888971248, + "grad_norm": 0.24379242956638336, + "learning_rate": 1.072e-07, + "loss": 0.3289, + "mean_token_accuracy": 0.9247971698641777, + "num_tokens": 160913330.0, + "step": 2232 + }, + { + "epoch": 1.3932430850856319, + "grad_norm": 0.2135346531867981, + "learning_rate": 1.068e-07, + "loss": 0.343, + "mean_token_accuracy": 0.9153024964034557, + "num_tokens": 160981585.0, + "step": 2233 + }, + { + "epoch": 1.3938672804587835, + "grad_norm": 0.24270333349704742, + "learning_rate": 1.0639999999999999e-07, + "loss": 0.3295, + "mean_token_accuracy": 0.9248667247593403, + "num_tokens": 161051338.0, + "step": 2234 + }, + { + "epoch": 1.3944914758319353, + "grad_norm": 0.31979045271873474, + "learning_rate": 1.06e-07, + "loss": 0.3113, + "mean_token_accuracy": 0.926321592181921, + "num_tokens": 161122842.0, + "step": 2235 + }, + { + "epoch": 1.3951156712050872, + "grad_norm": 0.23423852026462555, + "learning_rate": 1.0559999999999999e-07, + "loss": 0.2696, + "mean_token_accuracy": 0.9335817359387875, + "num_tokens": 161197261.0, + "step": 2236 + }, + { + "epoch": 1.395739866578239, + "grad_norm": 0.2837078869342804, + "learning_rate": 1.052e-07, + "loss": 0.3247, + "mean_token_accuracy": 0.9247360117733479, + "num_tokens": 161270132.0, + "step": 2237 + }, + { + "epoch": 1.3963640619513908, + "grad_norm": 0.25982680916786194, + "learning_rate": 1.048e-07, + "loss": 0.3088, + "mean_token_accuracy": 0.920888677239418, + "num_tokens": 161336941.0, + "step": 2238 + }, + { + "epoch": 1.3969882573245425, + "grad_norm": 0.22519288957118988, + "learning_rate": 1.0440000000000001e-07, + "loss": 0.3053, + "mean_token_accuracy": 0.9284292832016945, + "num_tokens": 161413657.0, + "step": 2239 + }, + { + "epoch": 1.3976124526976943, + "grad_norm": 0.4808672368526459, + "learning_rate": 1.0399999999999999e-07, + "loss": 0.2946, + "mean_token_accuracy": 0.9285720139741898, + "num_tokens": 161487573.0, + "step": 2240 + }, + { + "epoch": 1.3982366480708461, + "grad_norm": 0.23378510773181915, + "learning_rate": 1.0359999999999999e-07, + "loss": 0.3253, + "mean_token_accuracy": 0.9235528893768787, + "num_tokens": 161560301.0, + "step": 2241 + }, + { + "epoch": 1.398860843443998, + "grad_norm": 0.22970958054065704, + "learning_rate": 1.032e-07, + "loss": 0.3421, + "mean_token_accuracy": 0.9142438881099224, + "num_tokens": 161625830.0, + "step": 2242 + }, + { + "epoch": 1.3994850388171498, + "grad_norm": 0.24135981500148773, + "learning_rate": 1.028e-07, + "loss": 0.2887, + "mean_token_accuracy": 0.9304102212190628, + "num_tokens": 161700668.0, + "step": 2243 + }, + { + "epoch": 1.4001092341903016, + "grad_norm": 2.1651358604431152, + "learning_rate": 1.024e-07, + "loss": 0.2964, + "mean_token_accuracy": 0.9286670237779617, + "num_tokens": 161773413.0, + "step": 2244 + }, + { + "epoch": 1.4007334295634535, + "grad_norm": 0.26837843656539917, + "learning_rate": 1.0199999999999999e-07, + "loss": 0.2907, + "mean_token_accuracy": 0.9339475519955158, + "num_tokens": 161847519.0, + "step": 2245 + }, + { + "epoch": 1.401357624936605, + "grad_norm": 0.2809571623802185, + "learning_rate": 1.016e-07, + "loss": 0.3377, + "mean_token_accuracy": 0.9195237569510937, + "num_tokens": 161912854.0, + "step": 2246 + }, + { + "epoch": 1.401981820309757, + "grad_norm": 0.47276565432548523, + "learning_rate": 1.0119999999999999e-07, + "loss": 0.3079, + "mean_token_accuracy": 0.9253488928079605, + "num_tokens": 161982073.0, + "step": 2247 + }, + { + "epoch": 1.4026060156829088, + "grad_norm": 0.3777218163013458, + "learning_rate": 1.008e-07, + "loss": 0.3204, + "mean_token_accuracy": 0.9238595068454742, + "num_tokens": 162052697.0, + "step": 2248 + }, + { + "epoch": 1.4032302110560606, + "grad_norm": 0.2259320616722107, + "learning_rate": 1.004e-07, + "loss": 0.3008, + "mean_token_accuracy": 0.9274771548807621, + "num_tokens": 162125074.0, + "step": 2249 + }, + { + "epoch": 1.4038544064292124, + "grad_norm": 0.25571826100349426, + "learning_rate": 1e-07, + "loss": 0.3105, + "mean_token_accuracy": 0.9243918471038342, + "num_tokens": 162193238.0, + "step": 2250 + }, + { + "epoch": 1.404478601802364, + "grad_norm": 0.19016645848751068, + "learning_rate": 9.959999999999999e-08, + "loss": 0.3056, + "mean_token_accuracy": 0.932238981127739, + "num_tokens": 162269095.0, + "step": 2251 + }, + { + "epoch": 1.405102797175516, + "grad_norm": 0.37692883610725403, + "learning_rate": 9.919999999999999e-08, + "loss": 0.2819, + "mean_token_accuracy": 0.9353825226426125, + "num_tokens": 162347012.0, + "step": 2252 + }, + { + "epoch": 1.4057269925486677, + "grad_norm": 0.29082798957824707, + "learning_rate": 9.88e-08, + "loss": 0.285, + "mean_token_accuracy": 0.930580522865057, + "num_tokens": 162420744.0, + "step": 2253 + }, + { + "epoch": 1.4063511879218196, + "grad_norm": 0.2945311963558197, + "learning_rate": 9.84e-08, + "loss": 0.2864, + "mean_token_accuracy": 0.9310416653752327, + "num_tokens": 162495060.0, + "step": 2254 + }, + { + "epoch": 1.4069753832949714, + "grad_norm": 0.8768035173416138, + "learning_rate": 9.8e-08, + "loss": 0.3131, + "mean_token_accuracy": 0.924415148794651, + "num_tokens": 162565502.0, + "step": 2255 + }, + { + "epoch": 1.407599578668123, + "grad_norm": 0.2143184244632721, + "learning_rate": 9.76e-08, + "loss": 0.2759, + "mean_token_accuracy": 0.9329290017485619, + "num_tokens": 162641488.0, + "step": 2256 + }, + { + "epoch": 1.4082237740412749, + "grad_norm": 0.5433815121650696, + "learning_rate": 9.72e-08, + "loss": 0.3157, + "mean_token_accuracy": 0.9261355847120285, + "num_tokens": 162713405.0, + "step": 2257 + }, + { + "epoch": 1.4088479694144267, + "grad_norm": 0.2665337920188904, + "learning_rate": 9.679999999999999e-08, + "loss": 0.2981, + "mean_token_accuracy": 0.9278672412037849, + "num_tokens": 162787952.0, + "step": 2258 + }, + { + "epoch": 1.4094721647875785, + "grad_norm": 0.21975503861904144, + "learning_rate": 9.639999999999999e-08, + "loss": 0.3474, + "mean_token_accuracy": 0.9180901125073433, + "num_tokens": 162856972.0, + "step": 2259 + }, + { + "epoch": 1.4100963601607304, + "grad_norm": 0.7483108639717102, + "learning_rate": 9.6e-08, + "loss": 0.3223, + "mean_token_accuracy": 0.9256804324686527, + "num_tokens": 162928386.0, + "step": 2260 + }, + { + "epoch": 1.410720555533882, + "grad_norm": 0.850912868976593, + "learning_rate": 9.56e-08, + "loss": 0.2972, + "mean_token_accuracy": 0.9276558347046375, + "num_tokens": 162999392.0, + "step": 2261 + }, + { + "epoch": 1.411344750907034, + "grad_norm": 0.33881205320358276, + "learning_rate": 9.52e-08, + "loss": 0.3171, + "mean_token_accuracy": 0.9255057610571384, + "num_tokens": 163070926.0, + "step": 2262 + }, + { + "epoch": 1.4119689462801857, + "grad_norm": 0.23362453281879425, + "learning_rate": 9.479999999999999e-08, + "loss": 0.2757, + "mean_token_accuracy": 0.9354800023138523, + "num_tokens": 163142390.0, + "step": 2263 + }, + { + "epoch": 1.4125931416533375, + "grad_norm": 0.529930591583252, + "learning_rate": 9.44e-08, + "loss": 0.3561, + "mean_token_accuracy": 0.9140395484864712, + "num_tokens": 163208883.0, + "step": 2264 + }, + { + "epoch": 1.4132173370264893, + "grad_norm": 0.17117589712142944, + "learning_rate": 9.4e-08, + "loss": 0.2952, + "mean_token_accuracy": 0.9288080744445324, + "num_tokens": 163283127.0, + "step": 2265 + }, + { + "epoch": 1.4138415323996412, + "grad_norm": 0.28553953766822815, + "learning_rate": 9.36e-08, + "loss": 0.279, + "mean_token_accuracy": 0.934183269739151, + "num_tokens": 163357346.0, + "step": 2266 + }, + { + "epoch": 1.414465727772793, + "grad_norm": 0.18048705160617828, + "learning_rate": 9.32e-08, + "loss": 0.2699, + "mean_token_accuracy": 0.9325662553310394, + "num_tokens": 163427940.0, + "step": 2267 + }, + { + "epoch": 1.4150899231459446, + "grad_norm": 0.2798011898994446, + "learning_rate": 9.279999999999998e-08, + "loss": 0.2831, + "mean_token_accuracy": 0.9322300851345062, + "num_tokens": 163499403.0, + "step": 2268 + }, + { + "epoch": 1.4157141185190965, + "grad_norm": 0.26309141516685486, + "learning_rate": 9.24e-08, + "loss": 0.2975, + "mean_token_accuracy": 0.9321208000183105, + "num_tokens": 163572102.0, + "step": 2269 + }, + { + "epoch": 1.4163383138922483, + "grad_norm": 0.17928718030452728, + "learning_rate": 9.199999999999999e-08, + "loss": 0.3038, + "mean_token_accuracy": 0.9269499219954014, + "num_tokens": 163640392.0, + "step": 2270 + }, + { + "epoch": 1.4169625092654001, + "grad_norm": 0.6378262042999268, + "learning_rate": 9.16e-08, + "loss": 0.2712, + "mean_token_accuracy": 0.9326031059026718, + "num_tokens": 163717993.0, + "step": 2271 + }, + { + "epoch": 1.417586704638552, + "grad_norm": 0.2337879240512848, + "learning_rate": 9.12e-08, + "loss": 0.3235, + "mean_token_accuracy": 0.9210788868367672, + "num_tokens": 163784194.0, + "step": 2272 + }, + { + "epoch": 1.4182109000117036, + "grad_norm": 1.3958945274353027, + "learning_rate": 9.08e-08, + "loss": 0.3336, + "mean_token_accuracy": 0.9189723543822765, + "num_tokens": 163851760.0, + "step": 2273 + }, + { + "epoch": 1.4188350953848554, + "grad_norm": 0.3061986565589905, + "learning_rate": 9.039999999999999e-08, + "loss": 0.2793, + "mean_token_accuracy": 0.9325354509055614, + "num_tokens": 163929012.0, + "step": 2274 + }, + { + "epoch": 1.4194592907580073, + "grad_norm": 0.3791041970252991, + "learning_rate": 9e-08, + "loss": 0.2737, + "mean_token_accuracy": 0.9322048053145409, + "num_tokens": 163999887.0, + "step": 2275 + }, + { + "epoch": 1.420083486131159, + "grad_norm": 0.5793964862823486, + "learning_rate": 8.96e-08, + "loss": 0.2788, + "mean_token_accuracy": 0.9343210831284523, + "num_tokens": 164073797.0, + "step": 2276 + }, + { + "epoch": 1.420707681504311, + "grad_norm": 0.3610789477825165, + "learning_rate": 8.919999999999999e-08, + "loss": 0.2642, + "mean_token_accuracy": 0.9388095363974571, + "num_tokens": 164144028.0, + "step": 2277 + }, + { + "epoch": 1.4213318768774625, + "grad_norm": 0.22820943593978882, + "learning_rate": 8.88e-08, + "loss": 0.3106, + "mean_token_accuracy": 0.9257412068545818, + "num_tokens": 164214210.0, + "step": 2278 + }, + { + "epoch": 1.4219560722506144, + "grad_norm": 0.18349969387054443, + "learning_rate": 8.84e-08, + "loss": 0.29, + "mean_token_accuracy": 0.9289681166410446, + "num_tokens": 164288166.0, + "step": 2279 + }, + { + "epoch": 1.4225802676237662, + "grad_norm": 0.40295836329460144, + "learning_rate": 8.8e-08, + "loss": 0.2641, + "mean_token_accuracy": 0.9377103708684444, + "num_tokens": 164368039.0, + "step": 2280 + }, + { + "epoch": 1.423204462996918, + "grad_norm": 0.2283107340335846, + "learning_rate": 8.759999999999999e-08, + "loss": 0.3038, + "mean_token_accuracy": 0.9263504967093468, + "num_tokens": 164438205.0, + "step": 2281 + }, + { + "epoch": 1.4238286583700699, + "grad_norm": 0.21077032387256622, + "learning_rate": 8.72e-08, + "loss": 0.2816, + "mean_token_accuracy": 0.9303104840219021, + "num_tokens": 164509113.0, + "step": 2282 + }, + { + "epoch": 1.4244528537432215, + "grad_norm": 0.4684450626373291, + "learning_rate": 8.68e-08, + "loss": 0.2896, + "mean_token_accuracy": 0.9304402768611908, + "num_tokens": 164578846.0, + "step": 2283 + }, + { + "epoch": 1.4250770491163736, + "grad_norm": 0.22609148919582367, + "learning_rate": 8.64e-08, + "loss": 0.2903, + "mean_token_accuracy": 0.9323207437992096, + "num_tokens": 164651397.0, + "step": 2284 + }, + { + "epoch": 1.4257012444895252, + "grad_norm": 0.16950254142284393, + "learning_rate": 8.599999999999999e-08, + "loss": 0.3075, + "mean_token_accuracy": 0.9278007335960865, + "num_tokens": 164723584.0, + "step": 2285 + }, + { + "epoch": 1.426325439862677, + "grad_norm": 0.17741648852825165, + "learning_rate": 8.559999999999999e-08, + "loss": 0.2798, + "mean_token_accuracy": 0.9312286861240864, + "num_tokens": 164801992.0, + "step": 2286 + }, + { + "epoch": 1.4269496352358289, + "grad_norm": 0.22767587006092072, + "learning_rate": 8.52e-08, + "loss": 0.2792, + "mean_token_accuracy": 0.9281168058514595, + "num_tokens": 164874009.0, + "step": 2287 + }, + { + "epoch": 1.4275738306089807, + "grad_norm": 0.4446609616279602, + "learning_rate": 8.479999999999999e-08, + "loss": 0.3168, + "mean_token_accuracy": 0.9239183776080608, + "num_tokens": 164951508.0, + "step": 2288 + }, + { + "epoch": 1.4281980259821325, + "grad_norm": 0.41796547174453735, + "learning_rate": 8.44e-08, + "loss": 0.3013, + "mean_token_accuracy": 0.9294224828481674, + "num_tokens": 165025810.0, + "step": 2289 + }, + { + "epoch": 1.4288222213552841, + "grad_norm": 0.31657424569129944, + "learning_rate": 8.4e-08, + "loss": 0.2834, + "mean_token_accuracy": 0.9294983707368374, + "num_tokens": 165096717.0, + "step": 2290 + }, + { + "epoch": 1.429446416728436, + "grad_norm": 0.34868133068084717, + "learning_rate": 8.36e-08, + "loss": 0.293, + "mean_token_accuracy": 0.9345362931489944, + "num_tokens": 165170150.0, + "step": 2291 + }, + { + "epoch": 1.4300706121015878, + "grad_norm": 0.5973358154296875, + "learning_rate": 8.319999999999999e-08, + "loss": 0.3246, + "mean_token_accuracy": 0.9256414771080017, + "num_tokens": 165245493.0, + "step": 2292 + }, + { + "epoch": 1.4306948074747396, + "grad_norm": 0.3267904818058014, + "learning_rate": 8.28e-08, + "loss": 0.2849, + "mean_token_accuracy": 0.9302861653268337, + "num_tokens": 165320238.0, + "step": 2293 + }, + { + "epoch": 1.4313190028478915, + "grad_norm": 0.21055717766284943, + "learning_rate": 8.24e-08, + "loss": 0.2643, + "mean_token_accuracy": 0.9373898841440678, + "num_tokens": 165396857.0, + "step": 2294 + }, + { + "epoch": 1.431943198221043, + "grad_norm": 0.20468543469905853, + "learning_rate": 8.2e-08, + "loss": 0.2922, + "mean_token_accuracy": 0.9289688766002655, + "num_tokens": 165466336.0, + "step": 2295 + }, + { + "epoch": 1.432567393594195, + "grad_norm": 0.18894900381565094, + "learning_rate": 8.16e-08, + "loss": 0.3266, + "mean_token_accuracy": 0.9181978553533554, + "num_tokens": 165535985.0, + "step": 2296 + }, + { + "epoch": 1.4331915889673468, + "grad_norm": 0.3723616898059845, + "learning_rate": 8.119999999999999e-08, + "loss": 0.2875, + "mean_token_accuracy": 0.9318655617535114, + "num_tokens": 165608255.0, + "step": 2297 + }, + { + "epoch": 1.4338157843404986, + "grad_norm": 0.21552817523479462, + "learning_rate": 8.08e-08, + "loss": 0.2757, + "mean_token_accuracy": 0.9325765930116177, + "num_tokens": 165683266.0, + "step": 2298 + }, + { + "epoch": 1.4344399797136504, + "grad_norm": 0.3438664674758911, + "learning_rate": 8.039999999999999e-08, + "loss": 0.2856, + "mean_token_accuracy": 0.9289220981299877, + "num_tokens": 165753179.0, + "step": 2299 + }, + { + "epoch": 1.435064175086802, + "grad_norm": 0.22233910858631134, + "learning_rate": 8e-08, + "loss": 0.316, + "mean_token_accuracy": 0.9223077781498432, + "num_tokens": 165822562.0, + "step": 2300 + }, + { + "epoch": 1.435688370459954, + "grad_norm": 0.24199515581130981, + "learning_rate": 7.96e-08, + "loss": 0.3401, + "mean_token_accuracy": 0.9195297248661518, + "num_tokens": 165894126.0, + "step": 2301 + }, + { + "epoch": 1.4363125658331057, + "grad_norm": 0.24746735394001007, + "learning_rate": 7.920000000000001e-08, + "loss": 0.329, + "mean_token_accuracy": 0.9237656183540821, + "num_tokens": 165968735.0, + "step": 2302 + }, + { + "epoch": 1.4369367612062576, + "grad_norm": 0.33511480689048767, + "learning_rate": 7.879999999999999e-08, + "loss": 0.3465, + "mean_token_accuracy": 0.9161080569028854, + "num_tokens": 166033248.0, + "step": 2303 + }, + { + "epoch": 1.4375609565794094, + "grad_norm": 0.37997451424598694, + "learning_rate": 7.839999999999999e-08, + "loss": 0.3346, + "mean_token_accuracy": 0.9193439595401287, + "num_tokens": 166100933.0, + "step": 2304 + }, + { + "epoch": 1.438185151952561, + "grad_norm": 0.752350389957428, + "learning_rate": 7.8e-08, + "loss": 0.3289, + "mean_token_accuracy": 0.9239380285143852, + "num_tokens": 166167868.0, + "step": 2305 + }, + { + "epoch": 1.438809347325713, + "grad_norm": 0.21845866739749908, + "learning_rate": 7.76e-08, + "loss": 0.2843, + "mean_token_accuracy": 0.9316939078271389, + "num_tokens": 166239840.0, + "step": 2306 + }, + { + "epoch": 1.4394335426988647, + "grad_norm": 0.19468358159065247, + "learning_rate": 7.72e-08, + "loss": 0.3043, + "mean_token_accuracy": 0.925888154655695, + "num_tokens": 166312235.0, + "step": 2307 + }, + { + "epoch": 1.4400577380720165, + "grad_norm": 0.2861173748970032, + "learning_rate": 7.679999999999999e-08, + "loss": 0.3039, + "mean_token_accuracy": 0.9297107644379139, + "num_tokens": 166383810.0, + "step": 2308 + }, + { + "epoch": 1.4406819334451684, + "grad_norm": 0.1964455395936966, + "learning_rate": 7.64e-08, + "loss": 0.2888, + "mean_token_accuracy": 0.928243238478899, + "num_tokens": 166456701.0, + "step": 2309 + }, + { + "epoch": 1.4413061288183202, + "grad_norm": 0.9433870911598206, + "learning_rate": 7.599999999999999e-08, + "loss": 0.2889, + "mean_token_accuracy": 0.9303351379930973, + "num_tokens": 166528725.0, + "step": 2310 + }, + { + "epoch": 1.441930324191472, + "grad_norm": 0.33527013659477234, + "learning_rate": 7.56e-08, + "loss": 0.3025, + "mean_token_accuracy": 0.9257965460419655, + "num_tokens": 166599787.0, + "step": 2311 + }, + { + "epoch": 1.4425545195646237, + "grad_norm": 0.38248270750045776, + "learning_rate": 7.52e-08, + "loss": 0.3048, + "mean_token_accuracy": 0.9286483749747276, + "num_tokens": 166675011.0, + "step": 2312 + }, + { + "epoch": 1.4431787149377755, + "grad_norm": 0.41258928179740906, + "learning_rate": 7.480000000000001e-08, + "loss": 0.3314, + "mean_token_accuracy": 0.9200765267014503, + "num_tokens": 166741181.0, + "step": 2313 + }, + { + "epoch": 1.4438029103109273, + "grad_norm": 0.2248321920633316, + "learning_rate": 7.439999999999999e-08, + "loss": 0.3545, + "mean_token_accuracy": 0.9140613488852978, + "num_tokens": 166809219.0, + "step": 2314 + }, + { + "epoch": 1.4444271056840792, + "grad_norm": 0.21146978437900543, + "learning_rate": 7.399999999999999e-08, + "loss": 0.2999, + "mean_token_accuracy": 0.9247135445475578, + "num_tokens": 166883815.0, + "step": 2315 + }, + { + "epoch": 1.445051301057231, + "grad_norm": 0.27269965410232544, + "learning_rate": 7.36e-08, + "loss": 0.3196, + "mean_token_accuracy": 0.9260952286422253, + "num_tokens": 166951070.0, + "step": 2316 + }, + { + "epoch": 1.4456754964303826, + "grad_norm": 0.6741915345191956, + "learning_rate": 7.32e-08, + "loss": 0.2985, + "mean_token_accuracy": 0.9282975867390633, + "num_tokens": 167027740.0, + "step": 2317 + }, + { + "epoch": 1.4462996918035345, + "grad_norm": 0.27990198135375977, + "learning_rate": 7.28e-08, + "loss": 0.3174, + "mean_token_accuracy": 0.9270673915743828, + "num_tokens": 167099298.0, + "step": 2318 + }, + { + "epoch": 1.4469238871766863, + "grad_norm": 0.19164586067199707, + "learning_rate": 7.24e-08, + "loss": 0.3507, + "mean_token_accuracy": 0.9100921154022217, + "num_tokens": 167165847.0, + "step": 2319 + }, + { + "epoch": 1.4475480825498381, + "grad_norm": 1.0741263628005981, + "learning_rate": 7.2e-08, + "loss": 0.3193, + "mean_token_accuracy": 0.926135491579771, + "num_tokens": 167232130.0, + "step": 2320 + }, + { + "epoch": 1.44817227792299, + "grad_norm": 0.28014421463012695, + "learning_rate": 7.159999999999999e-08, + "loss": 0.2949, + "mean_token_accuracy": 0.9304358623921871, + "num_tokens": 167304004.0, + "step": 2321 + }, + { + "epoch": 1.4487964732961416, + "grad_norm": 0.1848498284816742, + "learning_rate": 7.12e-08, + "loss": 0.3223, + "mean_token_accuracy": 0.9268829338252544, + "num_tokens": 167374298.0, + "step": 2322 + }, + { + "epoch": 1.4494206686692934, + "grad_norm": 0.28508517146110535, + "learning_rate": 7.08e-08, + "loss": 0.3313, + "mean_token_accuracy": 0.9203253574669361, + "num_tokens": 167439856.0, + "step": 2323 + }, + { + "epoch": 1.4500448640424453, + "grad_norm": 0.26912015676498413, + "learning_rate": 7.04e-08, + "loss": 0.3159, + "mean_token_accuracy": 0.9261527955532074, + "num_tokens": 167510384.0, + "step": 2324 + }, + { + "epoch": 1.450669059415597, + "grad_norm": 0.2879539430141449, + "learning_rate": 7e-08, + "loss": 0.3091, + "mean_token_accuracy": 0.9265532828867435, + "num_tokens": 167581619.0, + "step": 2325 + }, + { + "epoch": 1.451293254788749, + "grad_norm": 0.21311315894126892, + "learning_rate": 6.959999999999999e-08, + "loss": 0.3226, + "mean_token_accuracy": 0.9216817058622837, + "num_tokens": 167651166.0, + "step": 2326 + }, + { + "epoch": 1.4519174501619005, + "grad_norm": 0.4190974533557892, + "learning_rate": 6.92e-08, + "loss": 0.2531, + "mean_token_accuracy": 0.937009934335947, + "num_tokens": 167729636.0, + "step": 2327 + }, + { + "epoch": 1.4525416455350526, + "grad_norm": 0.20628924667835236, + "learning_rate": 6.88e-08, + "loss": 0.3313, + "mean_token_accuracy": 0.9208254627883434, + "num_tokens": 167799603.0, + "step": 2328 + }, + { + "epoch": 1.4531658409082042, + "grad_norm": 0.24314343929290771, + "learning_rate": 6.84e-08, + "loss": 0.281, + "mean_token_accuracy": 0.9326652102172375, + "num_tokens": 167873866.0, + "step": 2329 + }, + { + "epoch": 1.453790036281356, + "grad_norm": 0.24209488928318024, + "learning_rate": 6.8e-08, + "loss": 0.2689, + "mean_token_accuracy": 0.9326137155294418, + "num_tokens": 167948178.0, + "step": 2330 + }, + { + "epoch": 1.454414231654508, + "grad_norm": 0.2450747936964035, + "learning_rate": 6.76e-08, + "loss": 0.2842, + "mean_token_accuracy": 0.9302664548158646, + "num_tokens": 168020846.0, + "step": 2331 + }, + { + "epoch": 1.4550384270276597, + "grad_norm": 0.8675456643104553, + "learning_rate": 6.719999999999999e-08, + "loss": 0.2903, + "mean_token_accuracy": 0.9304787628352642, + "num_tokens": 168096756.0, + "step": 2332 + }, + { + "epoch": 1.4556626224008116, + "grad_norm": 0.22853794693946838, + "learning_rate": 6.679999999999999e-08, + "loss": 0.2919, + "mean_token_accuracy": 0.9285392984747887, + "num_tokens": 168166856.0, + "step": 2333 + }, + { + "epoch": 1.4562868177739632, + "grad_norm": 0.40912437438964844, + "learning_rate": 6.64e-08, + "loss": 0.3087, + "mean_token_accuracy": 0.9275959134101868, + "num_tokens": 168241242.0, + "step": 2334 + }, + { + "epoch": 1.456911013147115, + "grad_norm": 2.038982629776001, + "learning_rate": 6.6e-08, + "loss": 0.2998, + "mean_token_accuracy": 0.9270787611603737, + "num_tokens": 168315726.0, + "step": 2335 + }, + { + "epoch": 1.4575352085202669, + "grad_norm": 0.21409299969673157, + "learning_rate": 6.56e-08, + "loss": 0.2965, + "mean_token_accuracy": 0.9309280253946781, + "num_tokens": 168387670.0, + "step": 2336 + }, + { + "epoch": 1.4581594038934187, + "grad_norm": 0.2544315755367279, + "learning_rate": 6.519999999999999e-08, + "loss": 0.3235, + "mean_token_accuracy": 0.9164434857666492, + "num_tokens": 168458898.0, + "step": 2337 + }, + { + "epoch": 1.4587835992665705, + "grad_norm": 0.30257412791252136, + "learning_rate": 6.48e-08, + "loss": 0.2775, + "mean_token_accuracy": 0.9305780977010727, + "num_tokens": 168533185.0, + "step": 2338 + }, + { + "epoch": 1.4594077946397221, + "grad_norm": 0.20508061349391937, + "learning_rate": 6.44e-08, + "loss": 0.2942, + "mean_token_accuracy": 0.9306410737335682, + "num_tokens": 168606906.0, + "step": 2339 + }, + { + "epoch": 1.460031990012874, + "grad_norm": 0.28449806571006775, + "learning_rate": 6.4e-08, + "loss": 0.2845, + "mean_token_accuracy": 0.9272063337266445, + "num_tokens": 168678717.0, + "step": 2340 + }, + { + "epoch": 1.4606561853860258, + "grad_norm": 0.2490503489971161, + "learning_rate": 6.36e-08, + "loss": 0.2904, + "mean_token_accuracy": 0.9290844835340977, + "num_tokens": 168749137.0, + "step": 2341 + }, + { + "epoch": 1.4612803807591777, + "grad_norm": 0.18828070163726807, + "learning_rate": 6.32e-08, + "loss": 0.2649, + "mean_token_accuracy": 0.9349101819097996, + "num_tokens": 168824663.0, + "step": 2342 + }, + { + "epoch": 1.4619045761323295, + "grad_norm": 0.29763421416282654, + "learning_rate": 6.279999999999999e-08, + "loss": 0.263, + "mean_token_accuracy": 0.9329787604510784, + "num_tokens": 168897959.0, + "step": 2343 + }, + { + "epoch": 1.462528771505481, + "grad_norm": 0.2850882411003113, + "learning_rate": 6.239999999999999e-08, + "loss": 0.2821, + "mean_token_accuracy": 0.9315609969198704, + "num_tokens": 168972493.0, + "step": 2344 + }, + { + "epoch": 1.463152966878633, + "grad_norm": 0.2244882583618164, + "learning_rate": 6.2e-08, + "loss": 0.3193, + "mean_token_accuracy": 0.9191980697214603, + "num_tokens": 169041056.0, + "step": 2345 + }, + { + "epoch": 1.4637771622517848, + "grad_norm": 0.18382996320724487, + "learning_rate": 6.16e-08, + "loss": 0.2791, + "mean_token_accuracy": 0.935517281293869, + "num_tokens": 169113312.0, + "step": 2346 + }, + { + "epoch": 1.4644013576249366, + "grad_norm": 0.3573710024356842, + "learning_rate": 6.119999999999999e-08, + "loss": 0.2704, + "mean_token_accuracy": 0.9328648634254932, + "num_tokens": 169188900.0, + "step": 2347 + }, + { + "epoch": 1.4650255529980885, + "grad_norm": 0.28366324305534363, + "learning_rate": 6.08e-08, + "loss": 0.2767, + "mean_token_accuracy": 0.9338286258280277, + "num_tokens": 169265472.0, + "step": 2348 + }, + { + "epoch": 1.46564974837124, + "grad_norm": 0.30091527104377747, + "learning_rate": 6.04e-08, + "loss": 0.306, + "mean_token_accuracy": 0.928806833922863, + "num_tokens": 169336863.0, + "step": 2349 + }, + { + "epoch": 1.4662739437443921, + "grad_norm": 0.5103444457054138, + "learning_rate": 6e-08, + "loss": 0.2936, + "mean_token_accuracy": 0.9309053346514702, + "num_tokens": 169409592.0, + "step": 2350 + }, + { + "epoch": 1.4668981391175437, + "grad_norm": 0.933778703212738, + "learning_rate": 5.96e-08, + "loss": 0.2921, + "mean_token_accuracy": 0.930558905005455, + "num_tokens": 169485918.0, + "step": 2351 + }, + { + "epoch": 1.4675223344906956, + "grad_norm": 0.30135542154312134, + "learning_rate": 5.92e-08, + "loss": 0.3011, + "mean_token_accuracy": 0.9276548065245152, + "num_tokens": 169559047.0, + "step": 2352 + }, + { + "epoch": 1.4681465298638474, + "grad_norm": 0.2249504029750824, + "learning_rate": 5.88e-08, + "loss": 0.3032, + "mean_token_accuracy": 0.9236448258161545, + "num_tokens": 169625554.0, + "step": 2353 + }, + { + "epoch": 1.4687707252369993, + "grad_norm": 0.6283105611801147, + "learning_rate": 5.84e-08, + "loss": 0.2761, + "mean_token_accuracy": 0.9331337995827198, + "num_tokens": 169702641.0, + "step": 2354 + }, + { + "epoch": 1.469394920610151, + "grad_norm": 0.23857049643993378, + "learning_rate": 5.8e-08, + "loss": 0.298, + "mean_token_accuracy": 0.929531067609787, + "num_tokens": 169771841.0, + "step": 2355 + }, + { + "epoch": 1.4700191159833027, + "grad_norm": 0.35011720657348633, + "learning_rate": 5.759999999999999e-08, + "loss": 0.3, + "mean_token_accuracy": 0.9271261915564537, + "num_tokens": 169842820.0, + "step": 2356 + }, + { + "epoch": 1.4706433113564545, + "grad_norm": 0.21875707805156708, + "learning_rate": 5.7199999999999996e-08, + "loss": 0.2981, + "mean_token_accuracy": 0.9307105056941509, + "num_tokens": 169916784.0, + "step": 2357 + }, + { + "epoch": 1.4712675067296064, + "grad_norm": 0.28787335753440857, + "learning_rate": 5.68e-08, + "loss": 0.3002, + "mean_token_accuracy": 0.9292552620172501, + "num_tokens": 169986088.0, + "step": 2358 + }, + { + "epoch": 1.4718917021027582, + "grad_norm": 0.20266355574131012, + "learning_rate": 5.6399999999999995e-08, + "loss": 0.3305, + "mean_token_accuracy": 0.9174962528049946, + "num_tokens": 170051797.0, + "step": 2359 + }, + { + "epoch": 1.47251589747591, + "grad_norm": 0.30074775218963623, + "learning_rate": 5.6e-08, + "loss": 0.332, + "mean_token_accuracy": 0.9221298880875111, + "num_tokens": 170124059.0, + "step": 2360 + }, + { + "epoch": 1.4731400928490617, + "grad_norm": 0.26626116037368774, + "learning_rate": 5.5599999999999995e-08, + "loss": 0.3071, + "mean_token_accuracy": 0.9280472360551357, + "num_tokens": 170197031.0, + "step": 2361 + }, + { + "epoch": 1.4737642882222135, + "grad_norm": 0.27912622690200806, + "learning_rate": 5.52e-08, + "loss": 0.3037, + "mean_token_accuracy": 0.9304065108299255, + "num_tokens": 170270376.0, + "step": 2362 + }, + { + "epoch": 1.4743884835953653, + "grad_norm": 0.24152587354183197, + "learning_rate": 5.48e-08, + "loss": 0.3031, + "mean_token_accuracy": 0.9272795952856541, + "num_tokens": 170338995.0, + "step": 2363 + }, + { + "epoch": 1.4750126789685172, + "grad_norm": 0.3269670605659485, + "learning_rate": 5.44e-08, + "loss": 0.3161, + "mean_token_accuracy": 0.9226135462522507, + "num_tokens": 170409832.0, + "step": 2364 + }, + { + "epoch": 1.475636874341669, + "grad_norm": 0.6743448376655579, + "learning_rate": 5.3999999999999994e-08, + "loss": 0.2988, + "mean_token_accuracy": 0.9292638972401619, + "num_tokens": 170485958.0, + "step": 2365 + }, + { + "epoch": 1.4762610697148206, + "grad_norm": 0.6958937048912048, + "learning_rate": 5.36e-08, + "loss": 0.3159, + "mean_token_accuracy": 0.9274067915976048, + "num_tokens": 170558753.0, + "step": 2366 + }, + { + "epoch": 1.4768852650879725, + "grad_norm": 0.1586783230304718, + "learning_rate": 5.319999999999999e-08, + "loss": 0.2757, + "mean_token_accuracy": 0.9316280744969845, + "num_tokens": 170634891.0, + "step": 2367 + }, + { + "epoch": 1.4775094604611243, + "grad_norm": 0.18796971440315247, + "learning_rate": 5.2799999999999996e-08, + "loss": 0.3112, + "mean_token_accuracy": 0.9276409335434437, + "num_tokens": 170705962.0, + "step": 2368 + }, + { + "epoch": 1.4781336558342761, + "grad_norm": 0.2396310567855835, + "learning_rate": 5.24e-08, + "loss": 0.308, + "mean_token_accuracy": 0.9255803562700748, + "num_tokens": 170778878.0, + "step": 2369 + }, + { + "epoch": 1.478757851207428, + "grad_norm": 0.17760396003723145, + "learning_rate": 5.1999999999999996e-08, + "loss": 0.2917, + "mean_token_accuracy": 0.9306518025696278, + "num_tokens": 170856619.0, + "step": 2370 + }, + { + "epoch": 1.4793820465805796, + "grad_norm": 0.26160937547683716, + "learning_rate": 5.16e-08, + "loss": 0.3089, + "mean_token_accuracy": 0.9266801215708256, + "num_tokens": 170924903.0, + "step": 2371 + }, + { + "epoch": 1.4800062419537316, + "grad_norm": 0.23355920612812042, + "learning_rate": 5.12e-08, + "loss": 0.3234, + "mean_token_accuracy": 0.9241162501275539, + "num_tokens": 170994739.0, + "step": 2372 + }, + { + "epoch": 1.4806304373268833, + "grad_norm": 0.2905130684375763, + "learning_rate": 5.08e-08, + "loss": 0.286, + "mean_token_accuracy": 0.9316273890435696, + "num_tokens": 171066343.0, + "step": 2373 + }, + { + "epoch": 1.481254632700035, + "grad_norm": 0.24926261603832245, + "learning_rate": 5.04e-08, + "loss": 0.3086, + "mean_token_accuracy": 0.9216854646801949, + "num_tokens": 171138217.0, + "step": 2374 + }, + { + "epoch": 1.481878828073187, + "grad_norm": 0.3028871715068817, + "learning_rate": 5e-08, + "loss": 0.2938, + "mean_token_accuracy": 0.9303243793547153, + "num_tokens": 171211601.0, + "step": 2375 + }, + { + "epoch": 1.4825030234463388, + "grad_norm": 0.2247331291437149, + "learning_rate": 4.9599999999999994e-08, + "loss": 0.3383, + "mean_token_accuracy": 0.9154311679303646, + "num_tokens": 171283722.0, + "step": 2376 + }, + { + "epoch": 1.4831272188194906, + "grad_norm": 0.17281556129455566, + "learning_rate": 4.92e-08, + "loss": 0.2716, + "mean_token_accuracy": 0.93617357686162, + "num_tokens": 171360210.0, + "step": 2377 + }, + { + "epoch": 1.4837514141926422, + "grad_norm": 0.42635077238082886, + "learning_rate": 4.88e-08, + "loss": 0.281, + "mean_token_accuracy": 0.932771623134613, + "num_tokens": 171434460.0, + "step": 2378 + }, + { + "epoch": 1.484375609565794, + "grad_norm": 0.2523311674594879, + "learning_rate": 4.8399999999999997e-08, + "loss": 0.3143, + "mean_token_accuracy": 0.9203478060662746, + "num_tokens": 171506463.0, + "step": 2379 + }, + { + "epoch": 1.484999804938946, + "grad_norm": 0.22192850708961487, + "learning_rate": 4.8e-08, + "loss": 0.3025, + "mean_token_accuracy": 0.928737461566925, + "num_tokens": 171577569.0, + "step": 2380 + }, + { + "epoch": 1.4856240003120977, + "grad_norm": 0.8303393721580505, + "learning_rate": 4.76e-08, + "loss": 0.3008, + "mean_token_accuracy": 0.9269648566842079, + "num_tokens": 171647592.0, + "step": 2381 + }, + { + "epoch": 1.4862481956852496, + "grad_norm": 0.24506014585494995, + "learning_rate": 4.72e-08, + "loss": 0.3022, + "mean_token_accuracy": 0.9264957867562771, + "num_tokens": 171720244.0, + "step": 2382 + }, + { + "epoch": 1.4868723910584012, + "grad_norm": 0.17005586624145508, + "learning_rate": 4.68e-08, + "loss": 0.2773, + "mean_token_accuracy": 0.9342509880661964, + "num_tokens": 171791481.0, + "step": 2383 + }, + { + "epoch": 1.487496586431553, + "grad_norm": 0.24636325240135193, + "learning_rate": 4.639999999999999e-08, + "loss": 0.3052, + "mean_token_accuracy": 0.925624955445528, + "num_tokens": 171866354.0, + "step": 2384 + }, + { + "epoch": 1.4881207818047049, + "grad_norm": 0.26109248399734497, + "learning_rate": 4.5999999999999995e-08, + "loss": 0.3002, + "mean_token_accuracy": 0.9277599714696407, + "num_tokens": 171936904.0, + "step": 2385 + }, + { + "epoch": 1.4887449771778567, + "grad_norm": 0.3165014386177063, + "learning_rate": 4.56e-08, + "loss": 0.314, + "mean_token_accuracy": 0.9241543896496296, + "num_tokens": 172008027.0, + "step": 2386 + }, + { + "epoch": 1.4893691725510085, + "grad_norm": 0.25920820236206055, + "learning_rate": 4.5199999999999994e-08, + "loss": 0.3057, + "mean_token_accuracy": 0.9261327907443047, + "num_tokens": 172077278.0, + "step": 2387 + }, + { + "epoch": 1.4899933679241602, + "grad_norm": 0.2768191993236542, + "learning_rate": 4.48e-08, + "loss": 0.3316, + "mean_token_accuracy": 0.9186080284416676, + "num_tokens": 172150448.0, + "step": 2388 + }, + { + "epoch": 1.490617563297312, + "grad_norm": 0.3389921486377716, + "learning_rate": 4.44e-08, + "loss": 0.294, + "mean_token_accuracy": 0.9298953115940094, + "num_tokens": 172223199.0, + "step": 2389 + }, + { + "epoch": 1.4912417586704638, + "grad_norm": 0.2478993833065033, + "learning_rate": 4.4e-08, + "loss": 0.3499, + "mean_token_accuracy": 0.9165273420512676, + "num_tokens": 172291771.0, + "step": 2390 + }, + { + "epoch": 1.4918659540436157, + "grad_norm": 0.16161221265792847, + "learning_rate": 4.36e-08, + "loss": 0.2669, + "mean_token_accuracy": 0.9336376525461674, + "num_tokens": 172367420.0, + "step": 2391 + }, + { + "epoch": 1.4924901494167675, + "grad_norm": 0.38500404357910156, + "learning_rate": 4.32e-08, + "loss": 0.3343, + "mean_token_accuracy": 0.9166819714009762, + "num_tokens": 172434076.0, + "step": 2392 + }, + { + "epoch": 1.4931143447899191, + "grad_norm": 0.28801801800727844, + "learning_rate": 4.279999999999999e-08, + "loss": 0.2958, + "mean_token_accuracy": 0.9302184134721756, + "num_tokens": 172507238.0, + "step": 2393 + }, + { + "epoch": 1.4937385401630712, + "grad_norm": 0.2409009486436844, + "learning_rate": 4.2399999999999996e-08, + "loss": 0.2927, + "mean_token_accuracy": 0.9308332018554211, + "num_tokens": 172585358.0, + "step": 2394 + }, + { + "epoch": 1.4943627355362228, + "grad_norm": 0.3008463382720947, + "learning_rate": 4.2e-08, + "loss": 0.3204, + "mean_token_accuracy": 0.9247551113367081, + "num_tokens": 172653259.0, + "step": 2395 + }, + { + "epoch": 1.4949869309093746, + "grad_norm": 0.22220267355442047, + "learning_rate": 4.1599999999999995e-08, + "loss": 0.3242, + "mean_token_accuracy": 0.9236058667302132, + "num_tokens": 172723767.0, + "step": 2396 + }, + { + "epoch": 1.4956111262825265, + "grad_norm": 0.21777011454105377, + "learning_rate": 4.12e-08, + "loss": 0.325, + "mean_token_accuracy": 0.9199660420417786, + "num_tokens": 172788498.0, + "step": 2397 + }, + { + "epoch": 1.4962353216556783, + "grad_norm": 0.2659027874469757, + "learning_rate": 4.08e-08, + "loss": 0.3115, + "mean_token_accuracy": 0.9272710122168064, + "num_tokens": 172861438.0, + "step": 2398 + }, + { + "epoch": 1.4968595170288301, + "grad_norm": 0.3286789357662201, + "learning_rate": 4.04e-08, + "loss": 0.2893, + "mean_token_accuracy": 0.9338060617446899, + "num_tokens": 172935326.0, + "step": 2399 + }, + { + "epoch": 1.4974837124019817, + "grad_norm": 0.1823185533285141, + "learning_rate": 4e-08, + "loss": 0.2691, + "mean_token_accuracy": 0.9342191405594349, + "num_tokens": 173006918.0, + "step": 2400 + }, + { + "epoch": 1.4981079077751336, + "grad_norm": 0.26332369446754456, + "learning_rate": 3.9600000000000004e-08, + "loss": 0.3046, + "mean_token_accuracy": 0.9227634444832802, + "num_tokens": 173076645.0, + "step": 2401 + }, + { + "epoch": 1.4987321031482854, + "grad_norm": 0.18826189637184143, + "learning_rate": 3.9199999999999994e-08, + "loss": 0.3139, + "mean_token_accuracy": 0.9254961647093296, + "num_tokens": 173148109.0, + "step": 2402 + }, + { + "epoch": 1.4993562985214373, + "grad_norm": 0.1605939120054245, + "learning_rate": 3.88e-08, + "loss": 0.2934, + "mean_token_accuracy": 0.9301251545548439, + "num_tokens": 173220469.0, + "step": 2403 + }, + { + "epoch": 1.499980493894589, + "grad_norm": 0.7790800929069519, + "learning_rate": 3.839999999999999e-08, + "loss": 0.3361, + "mean_token_accuracy": 0.9204860515892506, + "num_tokens": 173289502.0, + "step": 2404 + }, + { + "epoch": 1.5006046892677407, + "grad_norm": 0.18504062294960022, + "learning_rate": 3.7999999999999996e-08, + "loss": 0.2766, + "mean_token_accuracy": 0.9339430145919323, + "num_tokens": 173365952.0, + "step": 2405 + }, + { + "epoch": 1.5012288846408925, + "grad_norm": 0.3589896857738495, + "learning_rate": 3.76e-08, + "loss": 0.2939, + "mean_token_accuracy": 0.9300957098603249, + "num_tokens": 173440281.0, + "step": 2406 + }, + { + "epoch": 1.5018530800140444, + "grad_norm": 0.20058850944042206, + "learning_rate": 3.7199999999999996e-08, + "loss": 0.2748, + "mean_token_accuracy": 0.9334316179156303, + "num_tokens": 173514173.0, + "step": 2407 + }, + { + "epoch": 1.5024772753871962, + "grad_norm": 0.2873539626598358, + "learning_rate": 3.68e-08, + "loss": 0.3536, + "mean_token_accuracy": 0.9099237695336342, + "num_tokens": 173575653.0, + "step": 2408 + }, + { + "epoch": 1.503101470760348, + "grad_norm": 0.2543013095855713, + "learning_rate": 3.64e-08, + "loss": 0.3202, + "mean_token_accuracy": 0.9250536262989044, + "num_tokens": 173650028.0, + "step": 2409 + }, + { + "epoch": 1.5037256661334997, + "grad_norm": 0.21940435469150543, + "learning_rate": 3.6e-08, + "loss": 0.3128, + "mean_token_accuracy": 0.9230681583285332, + "num_tokens": 173722070.0, + "step": 2410 + }, + { + "epoch": 1.5043498615066517, + "grad_norm": 0.15130607783794403, + "learning_rate": 3.56e-08, + "loss": 0.3311, + "mean_token_accuracy": 0.9210895411670208, + "num_tokens": 173794512.0, + "step": 2411 + }, + { + "epoch": 1.5049740568798033, + "grad_norm": 0.2173030972480774, + "learning_rate": 3.52e-08, + "loss": 0.2716, + "mean_token_accuracy": 0.9346710741519928, + "num_tokens": 173867923.0, + "step": 2412 + }, + { + "epoch": 1.5055982522529552, + "grad_norm": 0.23842328786849976, + "learning_rate": 3.4799999999999994e-08, + "loss": 0.2933, + "mean_token_accuracy": 0.9278075955808163, + "num_tokens": 173936518.0, + "step": 2413 + }, + { + "epoch": 1.506222447626107, + "grad_norm": 0.23444490134716034, + "learning_rate": 3.44e-08, + "loss": 0.2921, + "mean_token_accuracy": 0.9273904040455818, + "num_tokens": 174009620.0, + "step": 2414 + }, + { + "epoch": 1.5068466429992586, + "grad_norm": 0.20182958245277405, + "learning_rate": 3.4e-08, + "loss": 0.2979, + "mean_token_accuracy": 0.9272923544049263, + "num_tokens": 174085213.0, + "step": 2415 + }, + { + "epoch": 1.5074708383724107, + "grad_norm": 0.22688433527946472, + "learning_rate": 3.3599999999999996e-08, + "loss": 0.2996, + "mean_token_accuracy": 0.9280979670584202, + "num_tokens": 174158935.0, + "step": 2416 + }, + { + "epoch": 1.5080950337455623, + "grad_norm": 0.30959248542785645, + "learning_rate": 3.32e-08, + "loss": 0.3149, + "mean_token_accuracy": 0.9221625439822674, + "num_tokens": 174227285.0, + "step": 2417 + }, + { + "epoch": 1.5087192291187141, + "grad_norm": 0.23859451711177826, + "learning_rate": 3.28e-08, + "loss": 0.3261, + "mean_token_accuracy": 0.9202194847166538, + "num_tokens": 174294107.0, + "step": 2418 + }, + { + "epoch": 1.509343424491866, + "grad_norm": 0.8122697472572327, + "learning_rate": 3.24e-08, + "loss": 0.2988, + "mean_token_accuracy": 0.9295856952667236, + "num_tokens": 174364725.0, + "step": 2419 + }, + { + "epoch": 1.5099676198650176, + "grad_norm": 0.2208295315504074, + "learning_rate": 3.2e-08, + "loss": 0.3294, + "mean_token_accuracy": 0.9222045503556728, + "num_tokens": 174436239.0, + "step": 2420 + }, + { + "epoch": 1.5105918152381697, + "grad_norm": 0.32371705770492554, + "learning_rate": 3.16e-08, + "loss": 0.2925, + "mean_token_accuracy": 0.9317890256643295, + "num_tokens": 174508993.0, + "step": 2421 + }, + { + "epoch": 1.5112160106113213, + "grad_norm": 0.25620484352111816, + "learning_rate": 3.1199999999999995e-08, + "loss": 0.2876, + "mean_token_accuracy": 0.929258044809103, + "num_tokens": 174585399.0, + "step": 2422 + }, + { + "epoch": 1.511840205984473, + "grad_norm": 0.2838667631149292, + "learning_rate": 3.08e-08, + "loss": 0.3219, + "mean_token_accuracy": 0.918518178164959, + "num_tokens": 174655093.0, + "step": 2423 + }, + { + "epoch": 1.512464401357625, + "grad_norm": 0.2580810785293579, + "learning_rate": 3.04e-08, + "loss": 0.3205, + "mean_token_accuracy": 0.9246489964425564, + "num_tokens": 174724904.0, + "step": 2424 + }, + { + "epoch": 1.5130885967307768, + "grad_norm": 0.36080506443977356, + "learning_rate": 3e-08, + "loss": 0.3025, + "mean_token_accuracy": 0.9265349246561527, + "num_tokens": 174795015.0, + "step": 2425 + }, + { + "epoch": 1.5137127921039286, + "grad_norm": 0.341679185628891, + "learning_rate": 2.96e-08, + "loss": 0.2983, + "mean_token_accuracy": 0.9288870580494404, + "num_tokens": 174868126.0, + "step": 2426 + }, + { + "epoch": 1.5143369874770802, + "grad_norm": 0.1804443746805191, + "learning_rate": 2.92e-08, + "loss": 0.2918, + "mean_token_accuracy": 0.9325439631938934, + "num_tokens": 174940723.0, + "step": 2427 + }, + { + "epoch": 1.514961182850232, + "grad_norm": 0.17018955945968628, + "learning_rate": 2.8799999999999996e-08, + "loss": 0.2979, + "mean_token_accuracy": 0.9298811852931976, + "num_tokens": 175016453.0, + "step": 2428 + }, + { + "epoch": 1.515585378223384, + "grad_norm": 0.23251865804195404, + "learning_rate": 2.84e-08, + "loss": 0.2914, + "mean_token_accuracy": 0.9285848736763, + "num_tokens": 175092027.0, + "step": 2429 + }, + { + "epoch": 1.5162095735965357, + "grad_norm": 0.23907387256622314, + "learning_rate": 2.8e-08, + "loss": 0.2907, + "mean_token_accuracy": 0.9297003597021103, + "num_tokens": 175165495.0, + "step": 2430 + }, + { + "epoch": 1.5168337689696876, + "grad_norm": 0.597614049911499, + "learning_rate": 2.76e-08, + "loss": 0.3041, + "mean_token_accuracy": 0.9300348162651062, + "num_tokens": 175233622.0, + "step": 2431 + }, + { + "epoch": 1.5174579643428392, + "grad_norm": 0.2612685561180115, + "learning_rate": 2.72e-08, + "loss": 0.3073, + "mean_token_accuracy": 0.9293926730751991, + "num_tokens": 175306990.0, + "step": 2432 + }, + { + "epoch": 1.5180821597159913, + "grad_norm": 0.17203544080257416, + "learning_rate": 2.68e-08, + "loss": 0.2899, + "mean_token_accuracy": 0.9303685240447521, + "num_tokens": 175377904.0, + "step": 2433 + }, + { + "epoch": 1.5187063550891429, + "grad_norm": 0.21997766196727753, + "learning_rate": 2.6399999999999998e-08, + "loss": 0.3245, + "mean_token_accuracy": 0.9193855300545692, + "num_tokens": 175447671.0, + "step": 2434 + }, + { + "epoch": 1.5193305504622947, + "grad_norm": 0.2176414132118225, + "learning_rate": 2.5999999999999998e-08, + "loss": 0.295, + "mean_token_accuracy": 0.9287814721465111, + "num_tokens": 175524035.0, + "step": 2435 + }, + { + "epoch": 1.5199547458354465, + "grad_norm": 0.2559150159358978, + "learning_rate": 2.56e-08, + "loss": 0.2748, + "mean_token_accuracy": 0.9298008233308792, + "num_tokens": 175601294.0, + "step": 2436 + }, + { + "epoch": 1.5205789412085982, + "grad_norm": 0.2823890447616577, + "learning_rate": 2.52e-08, + "loss": 0.288, + "mean_token_accuracy": 0.9279176630079746, + "num_tokens": 175673065.0, + "step": 2437 + }, + { + "epoch": 1.5212031365817502, + "grad_norm": 0.9031587243080139, + "learning_rate": 2.4799999999999997e-08, + "loss": 0.25, + "mean_token_accuracy": 0.9376320205628872, + "num_tokens": 175747401.0, + "step": 2438 + }, + { + "epoch": 1.5218273319549018, + "grad_norm": 0.3653998076915741, + "learning_rate": 2.44e-08, + "loss": 0.2855, + "mean_token_accuracy": 0.9287306852638721, + "num_tokens": 175819332.0, + "step": 2439 + }, + { + "epoch": 1.5224515273280537, + "grad_norm": 0.2079857438802719, + "learning_rate": 2.4e-08, + "loss": 0.3146, + "mean_token_accuracy": 0.9204912595450878, + "num_tokens": 175887924.0, + "step": 2440 + }, + { + "epoch": 1.5230757227012055, + "grad_norm": 0.1775536984205246, + "learning_rate": 2.36e-08, + "loss": 0.2526, + "mean_token_accuracy": 0.9371605664491653, + "num_tokens": 175967922.0, + "step": 2441 + }, + { + "epoch": 1.5236999180743571, + "grad_norm": 0.2285599559545517, + "learning_rate": 2.3199999999999996e-08, + "loss": 0.2987, + "mean_token_accuracy": 0.9285437278449535, + "num_tokens": 176037121.0, + "step": 2442 + }, + { + "epoch": 1.5243241134475092, + "grad_norm": 0.19697079062461853, + "learning_rate": 2.28e-08, + "loss": 0.2698, + "mean_token_accuracy": 0.9340740703046322, + "num_tokens": 176110400.0, + "step": 2443 + }, + { + "epoch": 1.5249483088206608, + "grad_norm": 0.2030375748872757, + "learning_rate": 2.24e-08, + "loss": 0.2824, + "mean_token_accuracy": 0.9322567656636238, + "num_tokens": 176183732.0, + "step": 2444 + }, + { + "epoch": 1.5255725041938126, + "grad_norm": 0.27508923411369324, + "learning_rate": 2.2e-08, + "loss": 0.2728, + "mean_token_accuracy": 0.9347629100084305, + "num_tokens": 176257597.0, + "step": 2445 + }, + { + "epoch": 1.5261966995669645, + "grad_norm": 0.19993537664413452, + "learning_rate": 2.16e-08, + "loss": 0.3395, + "mean_token_accuracy": 0.9181911870837212, + "num_tokens": 176326865.0, + "step": 2446 + }, + { + "epoch": 1.5268208949401163, + "grad_norm": 0.39371258020401, + "learning_rate": 2.1199999999999998e-08, + "loss": 0.2607, + "mean_token_accuracy": 0.934710681438446, + "num_tokens": 176401918.0, + "step": 2447 + }, + { + "epoch": 1.5274450903132681, + "grad_norm": 0.2682410478591919, + "learning_rate": 2.0799999999999998e-08, + "loss": 0.3084, + "mean_token_accuracy": 0.9250551909208298, + "num_tokens": 176469312.0, + "step": 2448 + }, + { + "epoch": 1.5280692856864198, + "grad_norm": 0.2546030580997467, + "learning_rate": 2.04e-08, + "loss": 0.3158, + "mean_token_accuracy": 0.9197724387049675, + "num_tokens": 176539822.0, + "step": 2449 + }, + { + "epoch": 1.5286934810595716, + "grad_norm": 0.3962733745574951, + "learning_rate": 2e-08, + "loss": 0.3187, + "mean_token_accuracy": 0.9263073056936264, + "num_tokens": 176613029.0, + "step": 2450 + }, + { + "epoch": 1.5293176764327234, + "grad_norm": 0.2193756252527237, + "learning_rate": 1.9599999999999997e-08, + "loss": 0.2985, + "mean_token_accuracy": 0.9305267296731472, + "num_tokens": 176682447.0, + "step": 2451 + }, + { + "epoch": 1.5299418718058753, + "grad_norm": 0.29642799496650696, + "learning_rate": 1.9199999999999997e-08, + "loss": 0.3591, + "mean_token_accuracy": 0.9083001539111137, + "num_tokens": 176748982.0, + "step": 2452 + }, + { + "epoch": 1.530566067179027, + "grad_norm": 0.18559861183166504, + "learning_rate": 1.88e-08, + "loss": 0.3119, + "mean_token_accuracy": 0.9241405017673969, + "num_tokens": 176819569.0, + "step": 2453 + }, + { + "epoch": 1.5311902625521787, + "grad_norm": 0.18608522415161133, + "learning_rate": 1.84e-08, + "loss": 0.2846, + "mean_token_accuracy": 0.9279976300895214, + "num_tokens": 176891184.0, + "step": 2454 + }, + { + "epoch": 1.5318144579253308, + "grad_norm": 0.3439576327800751, + "learning_rate": 1.8e-08, + "loss": 0.2883, + "mean_token_accuracy": 0.9310185834765434, + "num_tokens": 176964207.0, + "step": 2455 + }, + { + "epoch": 1.5324386532984824, + "grad_norm": 0.47678637504577637, + "learning_rate": 1.76e-08, + "loss": 0.282, + "mean_token_accuracy": 0.9319860935211182, + "num_tokens": 177038922.0, + "step": 2456 + }, + { + "epoch": 1.5330628486716342, + "grad_norm": 0.31708285212516785, + "learning_rate": 1.72e-08, + "loss": 0.3125, + "mean_token_accuracy": 0.9229981936514378, + "num_tokens": 177110322.0, + "step": 2457 + }, + { + "epoch": 1.533687044044786, + "grad_norm": 0.17785820364952087, + "learning_rate": 1.6799999999999998e-08, + "loss": 0.2514, + "mean_token_accuracy": 0.9372963719069958, + "num_tokens": 177190385.0, + "step": 2458 + }, + { + "epoch": 1.5343112394179377, + "grad_norm": 0.21619783341884613, + "learning_rate": 1.64e-08, + "loss": 0.3256, + "mean_token_accuracy": 0.9189807176589966, + "num_tokens": 177260535.0, + "step": 2459 + }, + { + "epoch": 1.5349354347910897, + "grad_norm": 0.19008275866508484, + "learning_rate": 1.6e-08, + "loss": 0.3116, + "mean_token_accuracy": 0.9292586594820023, + "num_tokens": 177327965.0, + "step": 2460 + }, + { + "epoch": 1.5355596301642414, + "grad_norm": 0.28387829661369324, + "learning_rate": 1.5599999999999997e-08, + "loss": 0.3016, + "mean_token_accuracy": 0.9234476201236248, + "num_tokens": 177400483.0, + "step": 2461 + }, + { + "epoch": 1.5361838255373932, + "grad_norm": 0.21607163548469543, + "learning_rate": 1.52e-08, + "loss": 0.2785, + "mean_token_accuracy": 0.9331503808498383, + "num_tokens": 177473686.0, + "step": 2462 + }, + { + "epoch": 1.536808020910545, + "grad_norm": 0.2702539563179016, + "learning_rate": 1.48e-08, + "loss": 0.2897, + "mean_token_accuracy": 0.9292602054774761, + "num_tokens": 177544463.0, + "step": 2463 + }, + { + "epoch": 1.5374322162836966, + "grad_norm": 1.6630282402038574, + "learning_rate": 1.4399999999999998e-08, + "loss": 0.3187, + "mean_token_accuracy": 0.9269638992846012, + "num_tokens": 177616168.0, + "step": 2464 + }, + { + "epoch": 1.5380564116568487, + "grad_norm": 0.22292675077915192, + "learning_rate": 1.4e-08, + "loss": 0.2824, + "mean_token_accuracy": 0.931102842092514, + "num_tokens": 177690061.0, + "step": 2465 + }, + { + "epoch": 1.5386806070300003, + "grad_norm": 0.267769455909729, + "learning_rate": 1.36e-08, + "loss": 0.3273, + "mean_token_accuracy": 0.9238092638552189, + "num_tokens": 177763714.0, + "step": 2466 + }, + { + "epoch": 1.5393048024031522, + "grad_norm": 0.22737376391887665, + "learning_rate": 1.3199999999999999e-08, + "loss": 0.2527, + "mean_token_accuracy": 0.940272618085146, + "num_tokens": 177841611.0, + "step": 2467 + }, + { + "epoch": 1.539928997776304, + "grad_norm": 0.36249616742134094, + "learning_rate": 1.28e-08, + "loss": 0.3063, + "mean_token_accuracy": 0.9260867349803448, + "num_tokens": 177914474.0, + "step": 2468 + }, + { + "epoch": 1.5405531931494558, + "grad_norm": 0.20127016305923462, + "learning_rate": 1.2399999999999999e-08, + "loss": 0.2996, + "mean_token_accuracy": 0.9277869760990143, + "num_tokens": 177984412.0, + "step": 2469 + }, + { + "epoch": 1.5411773885226077, + "grad_norm": 0.21077390015125275, + "learning_rate": 1.2e-08, + "loss": 0.2795, + "mean_token_accuracy": 0.9321720525622368, + "num_tokens": 178056134.0, + "step": 2470 + }, + { + "epoch": 1.5418015838957593, + "grad_norm": 0.26560312509536743, + "learning_rate": 1.1599999999999998e-08, + "loss": 0.2899, + "mean_token_accuracy": 0.9305733181536198, + "num_tokens": 178130433.0, + "step": 2471 + }, + { + "epoch": 1.5424257792689113, + "grad_norm": 0.27589020133018494, + "learning_rate": 1.12e-08, + "loss": 0.2868, + "mean_token_accuracy": 0.9286252744495869, + "num_tokens": 178202756.0, + "step": 2472 + }, + { + "epoch": 1.543049974642063, + "grad_norm": 0.2312583476305008, + "learning_rate": 1.08e-08, + "loss": 0.2789, + "mean_token_accuracy": 0.9339400827884674, + "num_tokens": 178274385.0, + "step": 2473 + }, + { + "epoch": 1.5436741700152148, + "grad_norm": 0.1686672419309616, + "learning_rate": 1.0399999999999999e-08, + "loss": 0.3442, + "mean_token_accuracy": 0.9188282713294029, + "num_tokens": 178345929.0, + "step": 2474 + }, + { + "epoch": 1.5442983653883666, + "grad_norm": 0.3751900792121887, + "learning_rate": 1e-08, + "loss": 0.3204, + "mean_token_accuracy": 0.9246494099497795, + "num_tokens": 178418872.0, + "step": 2475 + }, + { + "epoch": 1.5449225607615182, + "grad_norm": 0.777836799621582, + "learning_rate": 9.599999999999998e-09, + "loss": 0.2988, + "mean_token_accuracy": 0.9264791235327721, + "num_tokens": 178487973.0, + "step": 2476 + }, + { + "epoch": 1.5455467561346703, + "grad_norm": 0.30609917640686035, + "learning_rate": 9.2e-09, + "loss": 0.3312, + "mean_token_accuracy": 0.9222712330520153, + "num_tokens": 178558519.0, + "step": 2477 + }, + { + "epoch": 1.546170951507822, + "grad_norm": 0.2074725180864334, + "learning_rate": 8.8e-09, + "loss": 0.3023, + "mean_token_accuracy": 0.9257556162774563, + "num_tokens": 178631185.0, + "step": 2478 + }, + { + "epoch": 1.5467951468809737, + "grad_norm": 0.2055860161781311, + "learning_rate": 8.399999999999999e-09, + "loss": 0.3142, + "mean_token_accuracy": 0.9245745874941349, + "num_tokens": 178703422.0, + "step": 2479 + }, + { + "epoch": 1.5474193422541256, + "grad_norm": 0.31035709381103516, + "learning_rate": 8e-09, + "loss": 0.3032, + "mean_token_accuracy": 0.923737071454525, + "num_tokens": 178774415.0, + "step": 2480 + }, + { + "epoch": 1.5480435376272772, + "grad_norm": 0.6624286770820618, + "learning_rate": 7.6e-09, + "loss": 0.2938, + "mean_token_accuracy": 0.9283818006515503, + "num_tokens": 178844576.0, + "step": 2481 + }, + { + "epoch": 1.5486677330004293, + "grad_norm": 0.21301789581775665, + "learning_rate": 7.199999999999999e-09, + "loss": 0.3144, + "mean_token_accuracy": 0.9260341264307499, + "num_tokens": 178914795.0, + "step": 2482 + }, + { + "epoch": 1.5492919283735809, + "grad_norm": 0.2399834841489792, + "learning_rate": 6.8e-09, + "loss": 0.3035, + "mean_token_accuracy": 0.9283800907433033, + "num_tokens": 178986497.0, + "step": 2483 + }, + { + "epoch": 1.5499161237467327, + "grad_norm": 0.24645648896694183, + "learning_rate": 6.4e-09, + "loss": 0.2915, + "mean_token_accuracy": 0.92583042755723, + "num_tokens": 179059115.0, + "step": 2484 + }, + { + "epoch": 1.5505403191198845, + "grad_norm": 0.29368671774864197, + "learning_rate": 6e-09, + "loss": 0.2824, + "mean_token_accuracy": 0.9307620488107204, + "num_tokens": 179130137.0, + "step": 2485 + }, + { + "epoch": 1.5511645144930362, + "grad_norm": 0.3135833442211151, + "learning_rate": 5.6e-09, + "loss": 0.3431, + "mean_token_accuracy": 0.9151646941900253, + "num_tokens": 179198475.0, + "step": 2486 + }, + { + "epoch": 1.5517887098661882, + "grad_norm": 0.21225892007350922, + "learning_rate": 5.1999999999999994e-09, + "loss": 0.3109, + "mean_token_accuracy": 0.924727950245142, + "num_tokens": 179269427.0, + "step": 2487 + }, + { + "epoch": 1.5524129052393398, + "grad_norm": 0.28182631731033325, + "learning_rate": 4.799999999999999e-09, + "loss": 0.2792, + "mean_token_accuracy": 0.9313678964972496, + "num_tokens": 179340818.0, + "step": 2488 + }, + { + "epoch": 1.5530371006124917, + "grad_norm": 0.2660740613937378, + "learning_rate": 4.4e-09, + "loss": 0.3081, + "mean_token_accuracy": 0.9287791773676872, + "num_tokens": 179411491.0, + "step": 2489 + }, + { + "epoch": 1.5536612959856435, + "grad_norm": 0.20930573344230652, + "learning_rate": 4e-09, + "loss": 0.2381, + "mean_token_accuracy": 0.9401608891785145, + "num_tokens": 179488473.0, + "step": 2490 + }, + { + "epoch": 1.5542854913587953, + "grad_norm": 0.2976415157318115, + "learning_rate": 3.5999999999999996e-09, + "loss": 0.3018, + "mean_token_accuracy": 0.9268345572054386, + "num_tokens": 179557967.0, + "step": 2491 + }, + { + "epoch": 1.5549096867319472, + "grad_norm": 0.25529032945632935, + "learning_rate": 3.2e-09, + "loss": 0.2817, + "mean_token_accuracy": 0.9312438927590847, + "num_tokens": 179635599.0, + "step": 2492 + }, + { + "epoch": 1.5555338821050988, + "grad_norm": 0.1818600296974182, + "learning_rate": 2.8e-09, + "loss": 0.2593, + "mean_token_accuracy": 0.9348039664328098, + "num_tokens": 179708615.0, + "step": 2493 + }, + { + "epoch": 1.5561580774782509, + "grad_norm": 0.4674505591392517, + "learning_rate": 2.3999999999999996e-09, + "loss": 0.3026, + "mean_token_accuracy": 0.9244759269058704, + "num_tokens": 179780398.0, + "step": 2494 + }, + { + "epoch": 1.5567822728514025, + "grad_norm": 0.24638880789279938, + "learning_rate": 2e-09, + "loss": 0.3331, + "mean_token_accuracy": 0.9198136553168297, + "num_tokens": 179851949.0, + "step": 2495 + }, + { + "epoch": 1.5574064682245543, + "grad_norm": 0.2822139859199524, + "learning_rate": 1.6e-09, + "loss": 0.3275, + "mean_token_accuracy": 0.9228845089673996, + "num_tokens": 179922013.0, + "step": 2496 + }, + { + "epoch": 1.5580306635977061, + "grad_norm": 0.22678102552890778, + "learning_rate": 1.1999999999999998e-09, + "loss": 0.3029, + "mean_token_accuracy": 0.9253904782235622, + "num_tokens": 179992297.0, + "step": 2497 + }, + { + "epoch": 1.5586548589708578, + "grad_norm": 0.25330451130867004, + "learning_rate": 8e-10, + "loss": 0.3074, + "mean_token_accuracy": 0.9272580109536648, + "num_tokens": 180063275.0, + "step": 2498 + }, + { + "epoch": 1.5592790543440098, + "grad_norm": 2.065382480621338, + "learning_rate": 4e-10, + "loss": 0.2888, + "mean_token_accuracy": 0.9283899217844009, + "num_tokens": 180136961.0, + "step": 2499 + }, + { + "epoch": 1.5599032497171614, + "grad_norm": 0.18339255452156067, + "learning_rate": 0.0, + "loss": 0.2981, + "mean_token_accuracy": 0.9290702827274799, + "num_tokens": 180206849.0, + "step": 2500 + } + ], + "logging_steps": 1.0, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.376557795327082e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}