diff --git "a/VideoRFT-SFT-64/checkpoint-1000/trainer_state.json" "b/VideoRFT-SFT-64/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/VideoRFT-SFT-64/checkpoint-1000/trainer_state.json" @@ -0,0 +1,9033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6241953731517965, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006241953731517965, + "grad_norm": 17.25544548034668, + "learning_rate": 9.996e-07, + "loss": 8.5434, + "mean_token_accuracy": 0.273918142542243, + "num_tokens": 68439.0, + "step": 1 + }, + { + "epoch": 0.001248390746303593, + "grad_norm": 358.458984375, + "learning_rate": 9.992e-07, + "loss": 9.5736, + "mean_token_accuracy": 0.23063147906214, + "num_tokens": 138954.0, + "step": 2 + }, + { + "epoch": 0.0018725861194553895, + "grad_norm": 27.68710708618164, + "learning_rate": 9.988e-07, + "loss": 9.2342, + "mean_token_accuracy": 0.24578081723302603, + "num_tokens": 208893.0, + "step": 3 + }, + { + "epoch": 0.002496781492607186, + "grad_norm": 11.739906311035156, + "learning_rate": 9.983999999999998e-07, + "loss": 8.3543, + "mean_token_accuracy": 0.28470986522734165, + "num_tokens": 276500.0, + "step": 4 + }, + { + "epoch": 0.0031209768657589824, + "grad_norm": 14.976438522338867, + "learning_rate": 9.98e-07, + "loss": 9.3069, + "mean_token_accuracy": 0.23937270510941744, + "num_tokens": 348577.0, + "step": 5 + }, + { + "epoch": 0.003745172238910779, + "grad_norm": 34.87214660644531, + "learning_rate": 9.976e-07, + "loss": 8.2966, + "mean_token_accuracy": 0.2748953141272068, + "num_tokens": 418896.0, + "step": 6 + }, + { + "epoch": 0.004369367612062576, + "grad_norm": 18.72233772277832, + "learning_rate": 9.972e-07, + "loss": 8.5032, + "mean_token_accuracy": 0.27094727009534836, + "num_tokens": 489370.0, + "step": 7 + }, + { + "epoch": 0.004993562985214372, + "grad_norm": 11.080031394958496, + "learning_rate": 9.968e-07, + "loss": 9.3235, + "mean_token_accuracy": 0.24496650323271751, + "num_tokens": 562867.0, + "step": 8 + }, + { + "epoch": 0.005617758358366169, + "grad_norm": 90.85437774658203, + "learning_rate": 9.964e-07, + "loss": 8.7871, + "mean_token_accuracy": 0.2528521418571472, + "num_tokens": 634883.0, + "step": 9 + }, + { + "epoch": 0.006241953731517965, + "grad_norm": 30.238351821899414, + "learning_rate": 9.959999999999999e-07, + "loss": 8.3435, + "mean_token_accuracy": 0.2776096425950527, + "num_tokens": 707954.0, + "step": 10 + }, + { + "epoch": 0.006866149104669762, + "grad_norm": 15.585051536560059, + "learning_rate": 9.956e-07, + "loss": 7.7686, + "mean_token_accuracy": 0.3210552539676428, + "num_tokens": 780843.0, + "step": 11 + }, + { + "epoch": 0.007490344477821558, + "grad_norm": 15.990130424499512, + "learning_rate": 9.952e-07, + "loss": 7.8187, + "mean_token_accuracy": 0.3180449502542615, + "num_tokens": 854137.0, + "step": 12 + }, + { + "epoch": 0.008114539850973355, + "grad_norm": 25.397552490234375, + "learning_rate": 9.948e-07, + "loss": 7.9021, + "mean_token_accuracy": 0.2877113036811352, + "num_tokens": 926274.0, + "step": 13 + }, + { + "epoch": 0.008738735224125152, + "grad_norm": 12.293536186218262, + "learning_rate": 9.944e-07, + "loss": 7.169, + "mean_token_accuracy": 0.3381050443276763, + "num_tokens": 996445.0, + "step": 14 + }, + { + "epoch": 0.009362930597276947, + "grad_norm": 45.536712646484375, + "learning_rate": 9.94e-07, + "loss": 7.8367, + "mean_token_accuracy": 0.30633394606411457, + "num_tokens": 1068884.0, + "step": 15 + }, + { + "epoch": 0.009987125970428744, + "grad_norm": 64.1750717163086, + "learning_rate": 9.936e-07, + "loss": 7.5882, + "mean_token_accuracy": 0.31630368810147047, + "num_tokens": 1142700.0, + "step": 16 + }, + { + "epoch": 0.01061132134358054, + "grad_norm": 42.90964889526367, + "learning_rate": 9.931999999999999e-07, + "loss": 7.8268, + "mean_token_accuracy": 0.2754935659468174, + "num_tokens": 1216019.0, + "step": 17 + }, + { + "epoch": 0.011235516716732338, + "grad_norm": 6.927515029907227, + "learning_rate": 9.928e-07, + "loss": 6.9444, + "mean_token_accuracy": 0.33526306599378586, + "num_tokens": 1285731.0, + "step": 18 + }, + { + "epoch": 0.011859712089884135, + "grad_norm": 24.95079231262207, + "learning_rate": 9.923999999999998e-07, + "loss": 6.5419, + "mean_token_accuracy": 0.34128680545836687, + "num_tokens": 1358383.0, + "step": 19 + }, + { + "epoch": 0.01248390746303593, + "grad_norm": 33.162437438964844, + "learning_rate": 9.92e-07, + "loss": 6.5872, + "mean_token_accuracy": 0.3143086237832904, + "num_tokens": 1430552.0, + "step": 20 + }, + { + "epoch": 0.013108102836187727, + "grad_norm": 11.807770729064941, + "learning_rate": 9.916e-07, + "loss": 5.0385, + "mean_token_accuracy": 0.3795516202226281, + "num_tokens": 1494029.0, + "step": 21 + }, + { + "epoch": 0.013732298209339524, + "grad_norm": 27.99307632446289, + "learning_rate": 9.912e-07, + "loss": 6.1081, + "mean_token_accuracy": 0.32700329925864935, + "num_tokens": 1565051.0, + "step": 22 + }, + { + "epoch": 0.01435649358249132, + "grad_norm": 16.961132049560547, + "learning_rate": 9.908e-07, + "loss": 6.285, + "mean_token_accuracy": 0.3062258521094918, + "num_tokens": 1638553.0, + "step": 23 + }, + { + "epoch": 0.014980688955643116, + "grad_norm": 9.736113548278809, + "learning_rate": 9.903999999999999e-07, + "loss": 5.5588, + "mean_token_accuracy": 0.35241850558668375, + "num_tokens": 1708763.0, + "step": 24 + }, + { + "epoch": 0.015604884328794913, + "grad_norm": 11.06151008605957, + "learning_rate": 9.9e-07, + "loss": 6.1869, + "mean_token_accuracy": 0.31777405086904764, + "num_tokens": 1784143.0, + "step": 25 + }, + { + "epoch": 0.01622907970194671, + "grad_norm": 11.496856689453125, + "learning_rate": 9.896e-07, + "loss": 5.58, + "mean_token_accuracy": 0.3433418981730938, + "num_tokens": 1853644.0, + "step": 26 + }, + { + "epoch": 0.016853275075098505, + "grad_norm": 8.099380493164062, + "learning_rate": 9.892e-07, + "loss": 6.3192, + "mean_token_accuracy": 0.3149422388523817, + "num_tokens": 1929869.0, + "step": 27 + }, + { + "epoch": 0.017477470448250303, + "grad_norm": 30.143823623657227, + "learning_rate": 9.888e-07, + "loss": 5.7214, + "mean_token_accuracy": 0.32226298097521067, + "num_tokens": 2001157.0, + "step": 28 + }, + { + "epoch": 0.0181016658214021, + "grad_norm": 6.38374137878418, + "learning_rate": 9.884e-07, + "loss": 5.7185, + "mean_token_accuracy": 0.3307513426989317, + "num_tokens": 2075305.0, + "step": 29 + }, + { + "epoch": 0.018725861194553894, + "grad_norm": 0.8968116044998169, + "learning_rate": 9.88e-07, + "loss": 5.4392, + "mean_token_accuracy": 0.34164623729884624, + "num_tokens": 2146379.0, + "step": 30 + }, + { + "epoch": 0.019350056567705692, + "grad_norm": 13.619659423828125, + "learning_rate": 9.876e-07, + "loss": 5.656, + "mean_token_accuracy": 0.3418971076607704, + "num_tokens": 2221821.0, + "step": 31 + }, + { + "epoch": 0.019974251940857488, + "grad_norm": 4.945206165313721, + "learning_rate": 9.871999999999998e-07, + "loss": 5.0611, + "mean_token_accuracy": 0.35001065488904715, + "num_tokens": 2292070.0, + "step": 32 + }, + { + "epoch": 0.020598447314009286, + "grad_norm": 40.115970611572266, + "learning_rate": 9.868e-07, + "loss": 5.1247, + "mean_token_accuracy": 0.35443792305886745, + "num_tokens": 2364977.0, + "step": 33 + }, + { + "epoch": 0.02122264268716108, + "grad_norm": 13.273247718811035, + "learning_rate": 9.864e-07, + "loss": 4.9631, + "mean_token_accuracy": 0.364372487179935, + "num_tokens": 2435306.0, + "step": 34 + }, + { + "epoch": 0.021846838060312877, + "grad_norm": 7.187255382537842, + "learning_rate": 9.86e-07, + "loss": 4.4443, + "mean_token_accuracy": 0.3889983994886279, + "num_tokens": 2506196.0, + "step": 35 + }, + { + "epoch": 0.022471033433464675, + "grad_norm": 14.940446853637695, + "learning_rate": 9.856e-07, + "loss": 4.8625, + "mean_token_accuracy": 0.3650856511667371, + "num_tokens": 2579194.0, + "step": 36 + }, + { + "epoch": 0.02309522880661647, + "grad_norm": 3.3641610145568848, + "learning_rate": 9.852e-07, + "loss": 5.0014, + "mean_token_accuracy": 0.3314152853563428, + "num_tokens": 2654270.0, + "step": 37 + }, + { + "epoch": 0.02371942417976827, + "grad_norm": 6.470470905303955, + "learning_rate": 9.847999999999999e-07, + "loss": 4.857, + "mean_token_accuracy": 0.3407899560406804, + "num_tokens": 2728760.0, + "step": 38 + }, + { + "epoch": 0.024343619552920064, + "grad_norm": 6.731522083282471, + "learning_rate": 9.844e-07, + "loss": 5.1998, + "mean_token_accuracy": 0.2973237466067076, + "num_tokens": 2805485.0, + "step": 39 + }, + { + "epoch": 0.02496781492607186, + "grad_norm": 2.7483808994293213, + "learning_rate": 9.84e-07, + "loss": 4.0682, + "mean_token_accuracy": 0.39288341347128153, + "num_tokens": 2875491.0, + "step": 40 + }, + { + "epoch": 0.025592010299223658, + "grad_norm": 4.212852954864502, + "learning_rate": 9.836e-07, + "loss": 4.913, + "mean_token_accuracy": 0.3132828688248992, + "num_tokens": 2950304.0, + "step": 41 + }, + { + "epoch": 0.026216205672375453, + "grad_norm": 3.8348255157470703, + "learning_rate": 9.832e-07, + "loss": 4.3569, + "mean_token_accuracy": 0.3750774832442403, + "num_tokens": 3021730.0, + "step": 42 + }, + { + "epoch": 0.02684040104552725, + "grad_norm": 0.8846646547317505, + "learning_rate": 9.828e-07, + "loss": 3.9858, + "mean_token_accuracy": 0.39173205476254225, + "num_tokens": 3091280.0, + "step": 43 + }, + { + "epoch": 0.027464596418679047, + "grad_norm": 6.942370414733887, + "learning_rate": 9.824e-07, + "loss": 4.419, + "mean_token_accuracy": 0.3575789360329509, + "num_tokens": 3165267.0, + "step": 44 + }, + { + "epoch": 0.028088791791830842, + "grad_norm": 2.3689680099487305, + "learning_rate": 9.819999999999999e-07, + "loss": 4.1538, + "mean_token_accuracy": 0.3756133858114481, + "num_tokens": 3237323.0, + "step": 45 + }, + { + "epoch": 0.02871298716498264, + "grad_norm": 1.8894206285476685, + "learning_rate": 9.816e-07, + "loss": 3.9113, + "mean_token_accuracy": 0.4163288287818432, + "num_tokens": 3308609.0, + "step": 46 + }, + { + "epoch": 0.029337182538134436, + "grad_norm": 13.67268180847168, + "learning_rate": 9.811999999999998e-07, + "loss": 3.6531, + "mean_token_accuracy": 0.41791752725839615, + "num_tokens": 3376906.0, + "step": 47 + }, + { + "epoch": 0.02996137791128623, + "grad_norm": 4.209970474243164, + "learning_rate": 9.808e-07, + "loss": 4.5005, + "mean_token_accuracy": 0.34258707892149687, + "num_tokens": 3452278.0, + "step": 48 + }, + { + "epoch": 0.03058557328443803, + "grad_norm": 7.526029109954834, + "learning_rate": 9.804e-07, + "loss": 4.2911, + "mean_token_accuracy": 0.3827690966427326, + "num_tokens": 3527058.0, + "step": 49 + }, + { + "epoch": 0.031209768657589825, + "grad_norm": 2.122880220413208, + "learning_rate": 9.8e-07, + "loss": 4.3308, + "mean_token_accuracy": 0.3736236896365881, + "num_tokens": 3600503.0, + "step": 50 + }, + { + "epoch": 0.031833964030741624, + "grad_norm": 3.8492212295532227, + "learning_rate": 9.796e-07, + "loss": 4.099, + "mean_token_accuracy": 0.3650047332048416, + "num_tokens": 3673199.0, + "step": 51 + }, + { + "epoch": 0.03245815940389342, + "grad_norm": 3.9944045543670654, + "learning_rate": 9.791999999999999e-07, + "loss": 3.1844, + "mean_token_accuracy": 0.4542169217020273, + "num_tokens": 3740412.0, + "step": 52 + }, + { + "epoch": 0.033082354777045214, + "grad_norm": 6.169367790222168, + "learning_rate": 9.788e-07, + "loss": 4.0363, + "mean_token_accuracy": 0.3692982345819473, + "num_tokens": 3815563.0, + "step": 53 + }, + { + "epoch": 0.03370655015019701, + "grad_norm": 2.1248767375946045, + "learning_rate": 9.784e-07, + "loss": 3.5037, + "mean_token_accuracy": 0.4372657844796777, + "num_tokens": 3885917.0, + "step": 54 + }, + { + "epoch": 0.03433074552334881, + "grad_norm": 6.588809013366699, + "learning_rate": 9.78e-07, + "loss": 3.4912, + "mean_token_accuracy": 0.41233247332274914, + "num_tokens": 3956647.0, + "step": 55 + }, + { + "epoch": 0.03495494089650061, + "grad_norm": 4.523096084594727, + "learning_rate": 9.776e-07, + "loss": 3.9076, + "mean_token_accuracy": 0.39284045808017254, + "num_tokens": 4033452.0, + "step": 56 + }, + { + "epoch": 0.0355791362696524, + "grad_norm": 7.290942668914795, + "learning_rate": 9.772e-07, + "loss": 3.5392, + "mean_token_accuracy": 0.427378224208951, + "num_tokens": 4104394.0, + "step": 57 + }, + { + "epoch": 0.0362033316428042, + "grad_norm": 10.499979972839355, + "learning_rate": 9.768e-07, + "loss": 3.8001, + "mean_token_accuracy": 0.39460830576717854, + "num_tokens": 4179110.0, + "step": 58 + }, + { + "epoch": 0.03682752701595599, + "grad_norm": 2.6359169483184814, + "learning_rate": 9.764e-07, + "loss": 3.3732, + "mean_token_accuracy": 0.42214369028806686, + "num_tokens": 4249007.0, + "step": 59 + }, + { + "epoch": 0.03745172238910779, + "grad_norm": 2.7533557415008545, + "learning_rate": 9.759999999999998e-07, + "loss": 3.5268, + "mean_token_accuracy": 0.41251463908702135, + "num_tokens": 4320460.0, + "step": 60 + }, + { + "epoch": 0.03807591776225959, + "grad_norm": 7.864406108856201, + "learning_rate": 9.756e-07, + "loss": 3.4771, + "mean_token_accuracy": 0.4127057120203972, + "num_tokens": 4393683.0, + "step": 61 + }, + { + "epoch": 0.038700113135411385, + "grad_norm": 8.053912162780762, + "learning_rate": 9.752e-07, + "loss": 3.6572, + "mean_token_accuracy": 0.39490477088838816, + "num_tokens": 4469096.0, + "step": 62 + }, + { + "epoch": 0.03932430850856318, + "grad_norm": 5.509406089782715, + "learning_rate": 9.748e-07, + "loss": 3.1147, + "mean_token_accuracy": 0.44453135784715414, + "num_tokens": 4536789.0, + "step": 63 + }, + { + "epoch": 0.039948503881714975, + "grad_norm": 2.907407522201538, + "learning_rate": 9.744e-07, + "loss": 3.191, + "mean_token_accuracy": 0.4393644714727998, + "num_tokens": 4608142.0, + "step": 64 + }, + { + "epoch": 0.04057269925486677, + "grad_norm": 2.617462158203125, + "learning_rate": 9.74e-07, + "loss": 3.4136, + "mean_token_accuracy": 0.3966305162757635, + "num_tokens": 4681844.0, + "step": 65 + }, + { + "epoch": 0.04119689462801857, + "grad_norm": 2.643540143966675, + "learning_rate": 9.735999999999999e-07, + "loss": 3.558, + "mean_token_accuracy": 0.4038737080991268, + "num_tokens": 4757568.0, + "step": 66 + }, + { + "epoch": 0.04182109000117037, + "grad_norm": 0.34584879875183105, + "learning_rate": 9.731999999999998e-07, + "loss": 3.3735, + "mean_token_accuracy": 0.41008515655994415, + "num_tokens": 4830387.0, + "step": 67 + }, + { + "epoch": 0.04244528537432216, + "grad_norm": 8.369765281677246, + "learning_rate": 9.728e-07, + "loss": 3.5372, + "mean_token_accuracy": 0.4134551119059324, + "num_tokens": 4903369.0, + "step": 68 + }, + { + "epoch": 0.04306948074747396, + "grad_norm": 2.8616392612457275, + "learning_rate": 9.724e-07, + "loss": 3.6074, + "mean_token_accuracy": 0.3741075322031975, + "num_tokens": 4979757.0, + "step": 69 + }, + { + "epoch": 0.04369367612062575, + "grad_norm": 8.232832908630371, + "learning_rate": 9.72e-07, + "loss": 3.8138, + "mean_token_accuracy": 0.3565077856183052, + "num_tokens": 5057123.0, + "step": 70 + }, + { + "epoch": 0.044317871493777555, + "grad_norm": 5.5518646240234375, + "learning_rate": 9.716e-07, + "loss": 3.3853, + "mean_token_accuracy": 0.40091484785079956, + "num_tokens": 5131793.0, + "step": 71 + }, + { + "epoch": 0.04494206686692935, + "grad_norm": 7.901788711547852, + "learning_rate": 9.712e-07, + "loss": 3.1943, + "mean_token_accuracy": 0.42189537547528744, + "num_tokens": 5204744.0, + "step": 72 + }, + { + "epoch": 0.045566262240081146, + "grad_norm": 5.840290069580078, + "learning_rate": 9.707999999999999e-07, + "loss": 2.9538, + "mean_token_accuracy": 0.44593478739261627, + "num_tokens": 5274641.0, + "step": 73 + }, + { + "epoch": 0.04619045761323294, + "grad_norm": 5.598299980163574, + "learning_rate": 9.704e-07, + "loss": 2.7585, + "mean_token_accuracy": 0.4696329589933157, + "num_tokens": 5342886.0, + "step": 74 + }, + { + "epoch": 0.046814652986384736, + "grad_norm": 5.895838260650635, + "learning_rate": 9.7e-07, + "loss": 3.0451, + "mean_token_accuracy": 0.43241861648857594, + "num_tokens": 5416456.0, + "step": 75 + }, + { + "epoch": 0.04743884835953654, + "grad_norm": 2.880441665649414, + "learning_rate": 9.696e-07, + "loss": 3.2557, + "mean_token_accuracy": 0.4075364079326391, + "num_tokens": 5490235.0, + "step": 76 + }, + { + "epoch": 0.04806304373268833, + "grad_norm": 0.316191703081131, + "learning_rate": 9.692e-07, + "loss": 2.4266, + "mean_token_accuracy": 0.49917714670300484, + "num_tokens": 5555086.0, + "step": 77 + }, + { + "epoch": 0.04868723910584013, + "grad_norm": 5.317595481872559, + "learning_rate": 9.688e-07, + "loss": 3.3519, + "mean_token_accuracy": 0.3744805287569761, + "num_tokens": 5629905.0, + "step": 78 + }, + { + "epoch": 0.049311434478991924, + "grad_norm": 8.304346084594727, + "learning_rate": 9.684e-07, + "loss": 3.2339, + "mean_token_accuracy": 0.39141651801764965, + "num_tokens": 5702121.0, + "step": 79 + }, + { + "epoch": 0.04993562985214372, + "grad_norm": 8.848092079162598, + "learning_rate": 9.679999999999999e-07, + "loss": 3.4629, + "mean_token_accuracy": 0.3851572498679161, + "num_tokens": 5779350.0, + "step": 80 + }, + { + "epoch": 0.050559825225295514, + "grad_norm": 8.295083045959473, + "learning_rate": 9.676e-07, + "loss": 3.3826, + "mean_token_accuracy": 0.39699453115463257, + "num_tokens": 5856826.0, + "step": 81 + }, + { + "epoch": 0.051184020598447316, + "grad_norm": 10.870695114135742, + "learning_rate": 9.671999999999998e-07, + "loss": 2.8136, + "mean_token_accuracy": 0.47172751650214195, + "num_tokens": 5926579.0, + "step": 82 + }, + { + "epoch": 0.05180821597159911, + "grad_norm": 2.6977903842926025, + "learning_rate": 9.668e-07, + "loss": 2.7124, + "mean_token_accuracy": 0.46093476563692093, + "num_tokens": 5998201.0, + "step": 83 + }, + { + "epoch": 0.05243241134475091, + "grad_norm": 5.437819957733154, + "learning_rate": 9.664e-07, + "loss": 2.6206, + "mean_token_accuracy": 0.4777855705469847, + "num_tokens": 6067593.0, + "step": 84 + }, + { + "epoch": 0.0530566067179027, + "grad_norm": 5.301544189453125, + "learning_rate": 9.66e-07, + "loss": 2.7652, + "mean_token_accuracy": 0.45118373818695545, + "num_tokens": 6137564.0, + "step": 85 + }, + { + "epoch": 0.0536808020910545, + "grad_norm": 8.00975227355957, + "learning_rate": 9.656e-07, + "loss": 3.056, + "mean_token_accuracy": 0.426558505743742, + "num_tokens": 6213150.0, + "step": 86 + }, + { + "epoch": 0.0543049974642063, + "grad_norm": 5.430366039276123, + "learning_rate": 9.651999999999999e-07, + "loss": 2.7748, + "mean_token_accuracy": 0.44263155199587345, + "num_tokens": 6284919.0, + "step": 87 + }, + { + "epoch": 0.054929192837358094, + "grad_norm": 5.378356456756592, + "learning_rate": 9.647999999999999e-07, + "loss": 2.7902, + "mean_token_accuracy": 0.4566228464245796, + "num_tokens": 6357628.0, + "step": 88 + }, + { + "epoch": 0.05555338821050989, + "grad_norm": 5.3619704246521, + "learning_rate": 9.644e-07, + "loss": 3.3253, + "mean_token_accuracy": 0.38746162317693233, + "num_tokens": 6433563.0, + "step": 89 + }, + { + "epoch": 0.056177583583661685, + "grad_norm": 7.832232475280762, + "learning_rate": 9.64e-07, + "loss": 2.7835, + "mean_token_accuracy": 0.4331488534808159, + "num_tokens": 6504577.0, + "step": 90 + }, + { + "epoch": 0.05680177895681348, + "grad_norm": 2.9571073055267334, + "learning_rate": 9.636e-07, + "loss": 2.8488, + "mean_token_accuracy": 0.4346345001831651, + "num_tokens": 6576313.0, + "step": 91 + }, + { + "epoch": 0.05742597432996528, + "grad_norm": 5.558602333068848, + "learning_rate": 9.632e-07, + "loss": 2.6471, + "mean_token_accuracy": 0.4774735514074564, + "num_tokens": 6647849.0, + "step": 92 + }, + { + "epoch": 0.05805016970311708, + "grad_norm": 8.028719902038574, + "learning_rate": 9.628e-07, + "loss": 3.0663, + "mean_token_accuracy": 0.3925786167383194, + "num_tokens": 6722617.0, + "step": 93 + }, + { + "epoch": 0.05867436507626887, + "grad_norm": 5.597121715545654, + "learning_rate": 9.624e-07, + "loss": 2.5673, + "mean_token_accuracy": 0.4711476918309927, + "num_tokens": 6792266.0, + "step": 94 + }, + { + "epoch": 0.05929856044942067, + "grad_norm": 2.907738208770752, + "learning_rate": 9.619999999999999e-07, + "loss": 2.3481, + "mean_token_accuracy": 0.48446944914758205, + "num_tokens": 6861691.0, + "step": 95 + }, + { + "epoch": 0.05992275582257246, + "grad_norm": 7.614640235900879, + "learning_rate": 9.616e-07, + "loss": 2.5295, + "mean_token_accuracy": 0.47646917775273323, + "num_tokens": 6931251.0, + "step": 96 + }, + { + "epoch": 0.060546951195724265, + "grad_norm": 5.174458980560303, + "learning_rate": 9.612e-07, + "loss": 2.9762, + "mean_token_accuracy": 0.39805862680077553, + "num_tokens": 7007101.0, + "step": 97 + }, + { + "epoch": 0.06117114656887606, + "grad_norm": 5.423238277435303, + "learning_rate": 9.608e-07, + "loss": 3.1361, + "mean_token_accuracy": 0.37795576453208923, + "num_tokens": 7082510.0, + "step": 98 + }, + { + "epoch": 0.061795341942027855, + "grad_norm": 5.688440322875977, + "learning_rate": 9.604e-07, + "loss": 2.4505, + "mean_token_accuracy": 0.4660475980490446, + "num_tokens": 7151058.0, + "step": 99 + }, + { + "epoch": 0.06241953731517965, + "grad_norm": 5.158726692199707, + "learning_rate": 9.6e-07, + "loss": 2.7193, + "mean_token_accuracy": 0.43638070672750473, + "num_tokens": 7224260.0, + "step": 100 + }, + { + "epoch": 0.06304373268833145, + "grad_norm": 3.8112547397613525, + "learning_rate": 9.595999999999999e-07, + "loss": 2.3645, + "mean_token_accuracy": 0.4727396834641695, + "num_tokens": 7291378.0, + "step": 101 + }, + { + "epoch": 0.06366792806148325, + "grad_norm": 8.052077293395996, + "learning_rate": 9.592e-07, + "loss": 2.8315, + "mean_token_accuracy": 0.4083132743835449, + "num_tokens": 7364994.0, + "step": 102 + }, + { + "epoch": 0.06429212343463504, + "grad_norm": 10.764423370361328, + "learning_rate": 9.588e-07, + "loss": 2.2744, + "mean_token_accuracy": 0.4706199821084738, + "num_tokens": 7431894.0, + "step": 103 + }, + { + "epoch": 0.06491631880778684, + "grad_norm": 0.34294837713241577, + "learning_rate": 9.584e-07, + "loss": 2.4039, + "mean_token_accuracy": 0.4625617694109678, + "num_tokens": 7500493.0, + "step": 104 + }, + { + "epoch": 0.06554051418093863, + "grad_norm": 8.098499298095703, + "learning_rate": 9.58e-07, + "loss": 2.8511, + "mean_token_accuracy": 0.3948721010237932, + "num_tokens": 7575570.0, + "step": 105 + }, + { + "epoch": 0.06616470955409043, + "grad_norm": 7.982059955596924, + "learning_rate": 9.576e-07, + "loss": 3.1074, + "mean_token_accuracy": 0.3637480493634939, + "num_tokens": 7654100.0, + "step": 106 + }, + { + "epoch": 0.06678890492724222, + "grad_norm": 10.697887420654297, + "learning_rate": 9.572e-07, + "loss": 2.6167, + "mean_token_accuracy": 0.4347928697243333, + "num_tokens": 7727014.0, + "step": 107 + }, + { + "epoch": 0.06741310030039402, + "grad_norm": 10.78205394744873, + "learning_rate": 9.567999999999999e-07, + "loss": 2.493, + "mean_token_accuracy": 0.4426170848309994, + "num_tokens": 7796256.0, + "step": 108 + }, + { + "epoch": 0.06803729567354581, + "grad_norm": 8.350289344787598, + "learning_rate": 9.564e-07, + "loss": 2.5657, + "mean_token_accuracy": 0.4259480107575655, + "num_tokens": 7867745.0, + "step": 109 + }, + { + "epoch": 0.06866149104669762, + "grad_norm": 8.640109062194824, + "learning_rate": 9.559999999999998e-07, + "loss": 2.8211, + "mean_token_accuracy": 0.3914318438619375, + "num_tokens": 7943892.0, + "step": 110 + }, + { + "epoch": 0.06928568641984942, + "grad_norm": 5.193445682525635, + "learning_rate": 9.556e-07, + "loss": 2.5277, + "mean_token_accuracy": 0.43346918001770973, + "num_tokens": 8018450.0, + "step": 111 + }, + { + "epoch": 0.06990988179300121, + "grad_norm": 10.809521675109863, + "learning_rate": 9.552e-07, + "loss": 2.8952, + "mean_token_accuracy": 0.39842038974165916, + "num_tokens": 8095684.0, + "step": 112 + }, + { + "epoch": 0.07053407716615301, + "grad_norm": 5.497768878936768, + "learning_rate": 9.548e-07, + "loss": 2.3967, + "mean_token_accuracy": 0.4447348341345787, + "num_tokens": 8166406.0, + "step": 113 + }, + { + "epoch": 0.0711582725393048, + "grad_norm": 5.638972282409668, + "learning_rate": 9.544e-07, + "loss": 2.7301, + "mean_token_accuracy": 0.3977528251707554, + "num_tokens": 8239771.0, + "step": 114 + }, + { + "epoch": 0.0717824679124566, + "grad_norm": 8.302619934082031, + "learning_rate": 9.539999999999999e-07, + "loss": 2.7009, + "mean_token_accuracy": 0.3963926210999489, + "num_tokens": 8314680.0, + "step": 115 + }, + { + "epoch": 0.0724066632856084, + "grad_norm": 5.551988124847412, + "learning_rate": 9.536e-07, + "loss": 2.2423, + "mean_token_accuracy": 0.4626568406820297, + "num_tokens": 8383016.0, + "step": 116 + }, + { + "epoch": 0.07303085865876019, + "grad_norm": 10.686290740966797, + "learning_rate": 9.532e-07, + "loss": 2.3966, + "mean_token_accuracy": 0.44909630715847015, + "num_tokens": 8454455.0, + "step": 117 + }, + { + "epoch": 0.07365505403191198, + "grad_norm": 8.04770278930664, + "learning_rate": 9.527999999999999e-07, + "loss": 2.3436, + "mean_token_accuracy": 0.4485420174896717, + "num_tokens": 8525729.0, + "step": 118 + }, + { + "epoch": 0.07427924940506378, + "grad_norm": 8.204573631286621, + "learning_rate": 9.524e-07, + "loss": 2.2376, + "mean_token_accuracy": 0.46074257604777813, + "num_tokens": 8594649.0, + "step": 119 + }, + { + "epoch": 0.07490344477821557, + "grad_norm": 2.9314703941345215, + "learning_rate": 9.52e-07, + "loss": 2.25, + "mean_token_accuracy": 0.44587117433547974, + "num_tokens": 8662344.0, + "step": 120 + }, + { + "epoch": 0.07552764015136738, + "grad_norm": 8.148239135742188, + "learning_rate": 9.515999999999999e-07, + "loss": 2.2322, + "mean_token_accuracy": 0.4659658204764128, + "num_tokens": 8731400.0, + "step": 121 + }, + { + "epoch": 0.07615183552451918, + "grad_norm": 8.139657020568848, + "learning_rate": 9.512e-07, + "loss": 2.2556, + "mean_token_accuracy": 0.4854020159691572, + "num_tokens": 8801941.0, + "step": 122 + }, + { + "epoch": 0.07677603089767097, + "grad_norm": 8.320220947265625, + "learning_rate": 9.508e-07, + "loss": 2.3501, + "mean_token_accuracy": 0.5229448135942221, + "num_tokens": 8874600.0, + "step": 123 + }, + { + "epoch": 0.07740022627082277, + "grad_norm": 5.417640209197998, + "learning_rate": 9.503999999999999e-07, + "loss": 2.595, + "mean_token_accuracy": 0.5553014483302832, + "num_tokens": 8950359.0, + "step": 124 + }, + { + "epoch": 0.07802442164397456, + "grad_norm": 11.254721641540527, + "learning_rate": 9.499999999999999e-07, + "loss": 2.4769, + "mean_token_accuracy": 0.5048603918403387, + "num_tokens": 9022501.0, + "step": 125 + }, + { + "epoch": 0.07864861701712636, + "grad_norm": 2.4788269996643066, + "learning_rate": 9.496e-07, + "loss": 2.371, + "mean_token_accuracy": 0.4698783978819847, + "num_tokens": 9095815.0, + "step": 126 + }, + { + "epoch": 0.07927281239027816, + "grad_norm": 8.651468276977539, + "learning_rate": 9.492e-07, + "loss": 2.0845, + "mean_token_accuracy": 0.47476439364254475, + "num_tokens": 9162981.0, + "step": 127 + }, + { + "epoch": 0.07989700776342995, + "grad_norm": 5.616523265838623, + "learning_rate": 9.487999999999999e-07, + "loss": 2.4026, + "mean_token_accuracy": 0.46908529102802277, + "num_tokens": 9235913.0, + "step": 128 + }, + { + "epoch": 0.08052120313658175, + "grad_norm": 5.564061164855957, + "learning_rate": 9.484e-07, + "loss": 2.2018, + "mean_token_accuracy": 0.614772479981184, + "num_tokens": 9307376.0, + "step": 129 + }, + { + "epoch": 0.08114539850973354, + "grad_norm": 5.697883605957031, + "learning_rate": 9.479999999999999e-07, + "loss": 2.2463, + "mean_token_accuracy": 0.6068003140389919, + "num_tokens": 9378080.0, + "step": 130 + }, + { + "epoch": 0.08176959388288535, + "grad_norm": 2.5631017684936523, + "learning_rate": 9.475999999999999e-07, + "loss": 2.4799, + "mean_token_accuracy": 0.4611246697604656, + "num_tokens": 9454418.0, + "step": 131 + }, + { + "epoch": 0.08239378925603714, + "grad_norm": 11.26060962677002, + "learning_rate": 9.472e-07, + "loss": 2.3173, + "mean_token_accuracy": 0.4402575436979532, + "num_tokens": 9528458.0, + "step": 132 + }, + { + "epoch": 0.08301798462918894, + "grad_norm": 8.238165855407715, + "learning_rate": 9.468e-07, + "loss": 2.3181, + "mean_token_accuracy": 0.47400329262018204, + "num_tokens": 9601991.0, + "step": 133 + }, + { + "epoch": 0.08364218000234074, + "grad_norm": 8.445817947387695, + "learning_rate": 9.464e-07, + "loss": 2.0695, + "mean_token_accuracy": 0.7179492376744747, + "num_tokens": 9671423.0, + "step": 134 + }, + { + "epoch": 0.08426637537549253, + "grad_norm": 8.088071823120117, + "learning_rate": 9.459999999999999e-07, + "loss": 2.2359, + "mean_token_accuracy": 0.8247935585677624, + "num_tokens": 9742576.0, + "step": 135 + }, + { + "epoch": 0.08489057074864433, + "grad_norm": 8.635455131530762, + "learning_rate": 9.456e-07, + "loss": 2.0614, + "mean_token_accuracy": 0.653862040489912, + "num_tokens": 9813994.0, + "step": 136 + }, + { + "epoch": 0.08551476612179612, + "grad_norm": 5.3678083419799805, + "learning_rate": 9.452e-07, + "loss": 1.8988, + "mean_token_accuracy": 0.5860353857278824, + "num_tokens": 9881296.0, + "step": 137 + }, + { + "epoch": 0.08613896149494792, + "grad_norm": 8.261592864990234, + "learning_rate": 9.447999999999999e-07, + "loss": 2.1675, + "mean_token_accuracy": 0.5836942959576845, + "num_tokens": 9955180.0, + "step": 138 + }, + { + "epoch": 0.08676315686809971, + "grad_norm": 8.667990684509277, + "learning_rate": 9.444e-07, + "loss": 2.1832, + "mean_token_accuracy": 0.7322559282183647, + "num_tokens": 10027945.0, + "step": 139 + }, + { + "epoch": 0.0873873522412515, + "grad_norm": 5.292821407318115, + "learning_rate": 9.439999999999999e-07, + "loss": 1.8674, + "mean_token_accuracy": 0.8219792768359184, + "num_tokens": 10096359.0, + "step": 140 + }, + { + "epoch": 0.0880115476144033, + "grad_norm": 7.995087623596191, + "learning_rate": 9.436e-07, + "loss": 2.0883, + "mean_token_accuracy": 0.827305443584919, + "num_tokens": 10169962.0, + "step": 141 + }, + { + "epoch": 0.08863574298755511, + "grad_norm": 8.51004409790039, + "learning_rate": 9.432e-07, + "loss": 2.1471, + "mean_token_accuracy": 0.6703935079276562, + "num_tokens": 10242410.0, + "step": 142 + }, + { + "epoch": 0.0892599383607069, + "grad_norm": 4.5640411376953125, + "learning_rate": 9.427999999999999e-07, + "loss": 1.9284, + "mean_token_accuracy": 0.5677116364240646, + "num_tokens": 10311870.0, + "step": 143 + }, + { + "epoch": 0.0898841337338587, + "grad_norm": 8.135300636291504, + "learning_rate": 9.424e-07, + "loss": 1.9048, + "mean_token_accuracy": 0.6515733599662781, + "num_tokens": 10380736.0, + "step": 144 + }, + { + "epoch": 0.0905083291070105, + "grad_norm": 5.441276550292969, + "learning_rate": 9.419999999999999e-07, + "loss": 2.4262, + "mean_token_accuracy": 0.8654657527804375, + "num_tokens": 10458829.0, + "step": 145 + }, + { + "epoch": 0.09113252448016229, + "grad_norm": 5.705361366271973, + "learning_rate": 9.415999999999999e-07, + "loss": 2.2468, + "mean_token_accuracy": 0.9081156849861145, + "num_tokens": 10533998.0, + "step": 146 + }, + { + "epoch": 0.09175671985331409, + "grad_norm": 2.7271947860717773, + "learning_rate": 9.412e-07, + "loss": 1.8659, + "mean_token_accuracy": 0.8830665722489357, + "num_tokens": 10601684.0, + "step": 147 + }, + { + "epoch": 0.09238091522646588, + "grad_norm": 5.58353853225708, + "learning_rate": 9.408e-07, + "loss": 2.0464, + "mean_token_accuracy": 0.7929886244237423, + "num_tokens": 10673767.0, + "step": 148 + }, + { + "epoch": 0.09300511059961768, + "grad_norm": 7.02947473526001, + "learning_rate": 9.403999999999999e-07, + "loss": 1.9452, + "mean_token_accuracy": 0.7648746557533741, + "num_tokens": 10744697.0, + "step": 149 + }, + { + "epoch": 0.09362930597276947, + "grad_norm": 5.471534252166748, + "learning_rate": 9.399999999999999e-07, + "loss": 2.202, + "mean_token_accuracy": 0.7427481934428215, + "num_tokens": 10820687.0, + "step": 150 + }, + { + "epoch": 0.09425350134592127, + "grad_norm": 8.263473510742188, + "learning_rate": 9.396e-07, + "loss": 1.7151, + "mean_token_accuracy": 0.8576823957264423, + "num_tokens": 10887314.0, + "step": 151 + }, + { + "epoch": 0.09487769671907308, + "grad_norm": 10.934205055236816, + "learning_rate": 9.391999999999999e-07, + "loss": 2.1236, + "mean_token_accuracy": 0.9021845832467079, + "num_tokens": 10961236.0, + "step": 152 + }, + { + "epoch": 0.09550189209222487, + "grad_norm": 11.433155059814453, + "learning_rate": 9.387999999999999e-07, + "loss": 2.2159, + "mean_token_accuracy": 0.9015358090400696, + "num_tokens": 11037387.0, + "step": 153 + }, + { + "epoch": 0.09612608746537667, + "grad_norm": 5.680862903594971, + "learning_rate": 9.384e-07, + "loss": 1.5898, + "mean_token_accuracy": 0.8703570403158665, + "num_tokens": 11101996.0, + "step": 154 + }, + { + "epoch": 0.09675028283852846, + "grad_norm": 5.380183219909668, + "learning_rate": 9.379999999999998e-07, + "loss": 1.898, + "mean_token_accuracy": 0.8727101534605026, + "num_tokens": 11174169.0, + "step": 155 + }, + { + "epoch": 0.09737447821168026, + "grad_norm": 8.087199211120605, + "learning_rate": 9.375999999999999e-07, + "loss": 1.8839, + "mean_token_accuracy": 0.8993255347013474, + "num_tokens": 11247143.0, + "step": 156 + }, + { + "epoch": 0.09799867358483205, + "grad_norm": 11.024115562438965, + "learning_rate": 9.372e-07, + "loss": 1.6381, + "mean_token_accuracy": 0.9047844186425209, + "num_tokens": 11315002.0, + "step": 157 + }, + { + "epoch": 0.09862286895798385, + "grad_norm": 8.222757339477539, + "learning_rate": 9.368e-07, + "loss": 1.8106, + "mean_token_accuracy": 0.9168143719434738, + "num_tokens": 11385579.0, + "step": 158 + }, + { + "epoch": 0.09924706433113564, + "grad_norm": 2.9269771575927734, + "learning_rate": 9.363999999999999e-07, + "loss": 1.9285, + "mean_token_accuracy": 0.9164522737264633, + "num_tokens": 11457459.0, + "step": 159 + }, + { + "epoch": 0.09987125970428744, + "grad_norm": 2.9141650199890137, + "learning_rate": 9.36e-07, + "loss": 1.7026, + "mean_token_accuracy": 0.8860682621598244, + "num_tokens": 11526888.0, + "step": 160 + }, + { + "epoch": 0.10049545507743923, + "grad_norm": 8.452841758728027, + "learning_rate": 9.356e-07, + "loss": 1.8318, + "mean_token_accuracy": 0.9092588238418102, + "num_tokens": 11597756.0, + "step": 161 + }, + { + "epoch": 0.10111965045059103, + "grad_norm": 8.58360481262207, + "learning_rate": 9.352e-07, + "loss": 2.085, + "mean_token_accuracy": 0.9195736832916737, + "num_tokens": 11674933.0, + "step": 162 + }, + { + "epoch": 0.10174384582374284, + "grad_norm": 8.074356079101562, + "learning_rate": 9.347999999999999e-07, + "loss": 1.9983, + "mean_token_accuracy": 0.9227902665734291, + "num_tokens": 11750293.0, + "step": 163 + }, + { + "epoch": 0.10236804119689463, + "grad_norm": 10.822897911071777, + "learning_rate": 9.344e-07, + "loss": 2.1212, + "mean_token_accuracy": 0.922254353761673, + "num_tokens": 11830078.0, + "step": 164 + }, + { + "epoch": 0.10299223657004643, + "grad_norm": 8.02038860321045, + "learning_rate": 9.34e-07, + "loss": 1.8677, + "mean_token_accuracy": 0.9150257557630539, + "num_tokens": 11903664.0, + "step": 165 + }, + { + "epoch": 0.10361643194319822, + "grad_norm": 5.587518215179443, + "learning_rate": 9.335999999999999e-07, + "loss": 1.9175, + "mean_token_accuracy": 0.9223049283027649, + "num_tokens": 11977732.0, + "step": 166 + }, + { + "epoch": 0.10424062731635002, + "grad_norm": 11.292394638061523, + "learning_rate": 9.332e-07, + "loss": 1.9164, + "mean_token_accuracy": 0.9137436300516129, + "num_tokens": 12052295.0, + "step": 167 + }, + { + "epoch": 0.10486482268950181, + "grad_norm": 5.468366622924805, + "learning_rate": 9.327999999999999e-07, + "loss": 2.0037, + "mean_token_accuracy": 0.929412417113781, + "num_tokens": 12128204.0, + "step": 168 + }, + { + "epoch": 0.10548901806265361, + "grad_norm": 8.034927368164062, + "learning_rate": 9.324e-07, + "loss": 1.8816, + "mean_token_accuracy": 0.9212962165474892, + "num_tokens": 12203525.0, + "step": 169 + }, + { + "epoch": 0.1061132134358054, + "grad_norm": 2.746997356414795, + "learning_rate": 9.32e-07, + "loss": 1.8241, + "mean_token_accuracy": 0.9208043627440929, + "num_tokens": 12277147.0, + "step": 170 + }, + { + "epoch": 0.1067374088089572, + "grad_norm": 2.918668508529663, + "learning_rate": 9.315999999999999e-07, + "loss": 1.8257, + "mean_token_accuracy": 0.922179501503706, + "num_tokens": 12351158.0, + "step": 171 + }, + { + "epoch": 0.107361604182109, + "grad_norm": 8.092795372009277, + "learning_rate": 9.312e-07, + "loss": 1.6924, + "mean_token_accuracy": 0.9252418130636215, + "num_tokens": 12422954.0, + "step": 172 + }, + { + "epoch": 0.1079857995552608, + "grad_norm": 5.240293502807617, + "learning_rate": 9.307999999999999e-07, + "loss": 1.7143, + "mean_token_accuracy": 0.9194028824567795, + "num_tokens": 12494742.0, + "step": 173 + }, + { + "epoch": 0.1086099949284126, + "grad_norm": 4.985151767730713, + "learning_rate": 9.303999999999999e-07, + "loss": 1.5177, + "mean_token_accuracy": 0.9118992052972317, + "num_tokens": 12563185.0, + "step": 174 + }, + { + "epoch": 0.1092341903015644, + "grad_norm": 8.365934371948242, + "learning_rate": 9.3e-07, + "loss": 1.4329, + "mean_token_accuracy": 0.9112710319459438, + "num_tokens": 12630309.0, + "step": 175 + }, + { + "epoch": 0.10985838567471619, + "grad_norm": 0.28322505950927734, + "learning_rate": 9.296e-07, + "loss": 1.4942, + "mean_token_accuracy": 0.9076949469745159, + "num_tokens": 12698485.0, + "step": 176 + }, + { + "epoch": 0.11048258104786798, + "grad_norm": 7.978460788726807, + "learning_rate": 9.292e-07, + "loss": 1.581, + "mean_token_accuracy": 0.9133112877607346, + "num_tokens": 12768437.0, + "step": 177 + }, + { + "epoch": 0.11110677642101978, + "grad_norm": 5.322417259216309, + "learning_rate": 9.287999999999999e-07, + "loss": 1.5908, + "mean_token_accuracy": 0.9186087660491467, + "num_tokens": 12839515.0, + "step": 178 + }, + { + "epoch": 0.11173097179417157, + "grad_norm": 5.470612525939941, + "learning_rate": 9.284e-07, + "loss": 1.5479, + "mean_token_accuracy": 0.9106381684541702, + "num_tokens": 12910203.0, + "step": 179 + }, + { + "epoch": 0.11235516716732337, + "grad_norm": 10.888998031616211, + "learning_rate": 9.28e-07, + "loss": 1.696, + "mean_token_accuracy": 0.9200823903083801, + "num_tokens": 12984904.0, + "step": 180 + }, + { + "epoch": 0.11297936254047516, + "grad_norm": 2.578615427017212, + "learning_rate": 9.275999999999999e-07, + "loss": 1.6689, + "mean_token_accuracy": 0.9243290685117245, + "num_tokens": 13059950.0, + "step": 181 + }, + { + "epoch": 0.11360355791362696, + "grad_norm": 5.610386848449707, + "learning_rate": 9.272e-07, + "loss": 1.5423, + "mean_token_accuracy": 0.918064784258604, + "num_tokens": 13131118.0, + "step": 182 + }, + { + "epoch": 0.11422775328677875, + "grad_norm": 10.521158218383789, + "learning_rate": 9.268e-07, + "loss": 1.7309, + "mean_token_accuracy": 0.923332441598177, + "num_tokens": 13205962.0, + "step": 183 + }, + { + "epoch": 0.11485194865993056, + "grad_norm": 5.277563571929932, + "learning_rate": 9.263999999999999e-07, + "loss": 1.7071, + "mean_token_accuracy": 0.9258010573685169, + "num_tokens": 13281976.0, + "step": 184 + }, + { + "epoch": 0.11547614403308236, + "grad_norm": 7.757433891296387, + "learning_rate": 9.26e-07, + "loss": 1.4781, + "mean_token_accuracy": 0.9150664880871773, + "num_tokens": 13352745.0, + "step": 185 + }, + { + "epoch": 0.11610033940623415, + "grad_norm": 8.166189193725586, + "learning_rate": 9.256e-07, + "loss": 1.4733, + "mean_token_accuracy": 0.9123788960278034, + "num_tokens": 13422787.0, + "step": 186 + }, + { + "epoch": 0.11672453477938595, + "grad_norm": 10.356062889099121, + "learning_rate": 9.251999999999999e-07, + "loss": 1.5437, + "mean_token_accuracy": 0.9145462922751904, + "num_tokens": 13494241.0, + "step": 187 + }, + { + "epoch": 0.11734873015253774, + "grad_norm": 8.242297172546387, + "learning_rate": 9.247999999999999e-07, + "loss": 1.5485, + "mean_token_accuracy": 0.9178762994706631, + "num_tokens": 13568229.0, + "step": 188 + }, + { + "epoch": 0.11797292552568954, + "grad_norm": 5.212475299835205, + "learning_rate": 9.244e-07, + "loss": 1.4062, + "mean_token_accuracy": 0.9117616638541222, + "num_tokens": 13637973.0, + "step": 189 + }, + { + "epoch": 0.11859712089884134, + "grad_norm": 4.846549987792969, + "learning_rate": 9.24e-07, + "loss": 1.5921, + "mean_token_accuracy": 0.9237067364156246, + "num_tokens": 13714130.0, + "step": 190 + }, + { + "epoch": 0.11922131627199313, + "grad_norm": 5.192752361297607, + "learning_rate": 9.235999999999999e-07, + "loss": 1.5236, + "mean_token_accuracy": 0.9133113771677017, + "num_tokens": 13786160.0, + "step": 191 + }, + { + "epoch": 0.11984551164514493, + "grad_norm": 5.267004489898682, + "learning_rate": 9.232e-07, + "loss": 1.4886, + "mean_token_accuracy": 0.9154262915253639, + "num_tokens": 13858518.0, + "step": 192 + }, + { + "epoch": 0.12046970701829672, + "grad_norm": 10.632615089416504, + "learning_rate": 9.227999999999999e-07, + "loss": 1.4575, + "mean_token_accuracy": 0.9238744340837002, + "num_tokens": 13930328.0, + "step": 193 + }, + { + "epoch": 0.12109390239144853, + "grad_norm": 10.462716102600098, + "learning_rate": 9.224e-07, + "loss": 1.4196, + "mean_token_accuracy": 0.9145567081868649, + "num_tokens": 14001245.0, + "step": 194 + }, + { + "epoch": 0.12171809776460032, + "grad_norm": 5.286381721496582, + "learning_rate": 9.22e-07, + "loss": 1.4195, + "mean_token_accuracy": 0.9208745285868645, + "num_tokens": 14075712.0, + "step": 195 + }, + { + "epoch": 0.12234229313775212, + "grad_norm": 2.423999547958374, + "learning_rate": 9.215999999999999e-07, + "loss": 1.3295, + "mean_token_accuracy": 0.9187790527939796, + "num_tokens": 14146535.0, + "step": 196 + }, + { + "epoch": 0.12296648851090392, + "grad_norm": 7.909485340118408, + "learning_rate": 9.212e-07, + "loss": 1.3757, + "mean_token_accuracy": 0.9210257269442081, + "num_tokens": 14217629.0, + "step": 197 + }, + { + "epoch": 0.12359068388405571, + "grad_norm": 2.5925230979919434, + "learning_rate": 9.207999999999999e-07, + "loss": 1.2879, + "mean_token_accuracy": 0.9173031412065029, + "num_tokens": 14288166.0, + "step": 198 + }, + { + "epoch": 0.1242148792572075, + "grad_norm": 7.866016387939453, + "learning_rate": 9.203999999999999e-07, + "loss": 1.3209, + "mean_token_accuracy": 0.915681928396225, + "num_tokens": 14360576.0, + "step": 199 + }, + { + "epoch": 0.1248390746303593, + "grad_norm": 5.128418445587158, + "learning_rate": 9.2e-07, + "loss": 1.4467, + "mean_token_accuracy": 0.91860106959939, + "num_tokens": 14436107.0, + "step": 200 + }, + { + "epoch": 0.1254632700035111, + "grad_norm": 7.8198933601379395, + "learning_rate": 9.196e-07, + "loss": 1.3385, + "mean_token_accuracy": 0.9225534051656723, + "num_tokens": 14509047.0, + "step": 201 + }, + { + "epoch": 0.1260874653766629, + "grad_norm": 5.133849143981934, + "learning_rate": 9.192e-07, + "loss": 1.1601, + "mean_token_accuracy": 0.9136953912675381, + "num_tokens": 14575534.0, + "step": 202 + }, + { + "epoch": 0.1267116607498147, + "grad_norm": 9.62126636505127, + "learning_rate": 9.187999999999999e-07, + "loss": 1.4671, + "mean_token_accuracy": 0.923810213804245, + "num_tokens": 14652889.0, + "step": 203 + }, + { + "epoch": 0.1273358561229665, + "grad_norm": 7.788510799407959, + "learning_rate": 9.184e-07, + "loss": 1.3339, + "mean_token_accuracy": 0.9229598566889763, + "num_tokens": 14727352.0, + "step": 204 + }, + { + "epoch": 0.12796005149611828, + "grad_norm": 1.0009371042251587, + "learning_rate": 9.18e-07, + "loss": 1.2681, + "mean_token_accuracy": 0.9171731658279896, + "num_tokens": 14799568.0, + "step": 205 + }, + { + "epoch": 0.12858424686927009, + "grad_norm": 7.367977619171143, + "learning_rate": 9.175999999999999e-07, + "loss": 1.2686, + "mean_token_accuracy": 0.9179406464099884, + "num_tokens": 14873349.0, + "step": 206 + }, + { + "epoch": 0.12920844224242187, + "grad_norm": 8.009611129760742, + "learning_rate": 9.172e-07, + "loss": 1.2355, + "mean_token_accuracy": 0.9173550568521023, + "num_tokens": 14941765.0, + "step": 207 + }, + { + "epoch": 0.12983263761557368, + "grad_norm": 0.28489747643470764, + "learning_rate": 9.168e-07, + "loss": 1.1213, + "mean_token_accuracy": 0.9089673720300198, + "num_tokens": 15010336.0, + "step": 208 + }, + { + "epoch": 0.13045683298872546, + "grad_norm": 5.03115177154541, + "learning_rate": 9.163999999999999e-07, + "loss": 1.195, + "mean_token_accuracy": 0.917554147541523, + "num_tokens": 15077965.0, + "step": 209 + }, + { + "epoch": 0.13108102836187727, + "grad_norm": 5.094935894012451, + "learning_rate": 9.16e-07, + "loss": 1.2757, + "mean_token_accuracy": 0.9190593808889389, + "num_tokens": 15151565.0, + "step": 210 + }, + { + "epoch": 0.13170522373502908, + "grad_norm": 2.76515531539917, + "learning_rate": 9.156e-07, + "loss": 1.1963, + "mean_token_accuracy": 0.9166045114398003, + "num_tokens": 15221738.0, + "step": 211 + }, + { + "epoch": 0.13232941910818086, + "grad_norm": 10.110687255859375, + "learning_rate": 9.151999999999999e-07, + "loss": 1.2716, + "mean_token_accuracy": 0.9204342179000378, + "num_tokens": 15295424.0, + "step": 212 + }, + { + "epoch": 0.13295361448133267, + "grad_norm": 5.2001261711120605, + "learning_rate": 9.147999999999999e-07, + "loss": 1.2542, + "mean_token_accuracy": 0.9178045317530632, + "num_tokens": 15368527.0, + "step": 213 + }, + { + "epoch": 0.13357780985448445, + "grad_norm": 5.049051761627197, + "learning_rate": 9.144e-07, + "loss": 1.2213, + "mean_token_accuracy": 0.9194723777472973, + "num_tokens": 15443255.0, + "step": 214 + }, + { + "epoch": 0.13420200522763626, + "grad_norm": 7.801861763000488, + "learning_rate": 9.14e-07, + "loss": 1.3295, + "mean_token_accuracy": 0.9262109063565731, + "num_tokens": 15519506.0, + "step": 215 + }, + { + "epoch": 0.13482620060078804, + "grad_norm": 7.463664531707764, + "learning_rate": 9.135999999999999e-07, + "loss": 1.1187, + "mean_token_accuracy": 0.918974194675684, + "num_tokens": 15587685.0, + "step": 216 + }, + { + "epoch": 0.13545039597393985, + "grad_norm": 7.7051568031311035, + "learning_rate": 9.132e-07, + "loss": 1.1218, + "mean_token_accuracy": 0.9162348955869675, + "num_tokens": 15658562.0, + "step": 217 + }, + { + "epoch": 0.13607459134709163, + "grad_norm": 4.836665630340576, + "learning_rate": 9.127999999999999e-07, + "loss": 0.9558, + "mean_token_accuracy": 0.9027752466499805, + "num_tokens": 15722096.0, + "step": 218 + }, + { + "epoch": 0.13669878672024344, + "grad_norm": 10.028327941894531, + "learning_rate": 9.123999999999999e-07, + "loss": 1.2193, + "mean_token_accuracy": 0.9179589003324509, + "num_tokens": 15797169.0, + "step": 219 + }, + { + "epoch": 0.13732298209339525, + "grad_norm": 6.999484062194824, + "learning_rate": 9.12e-07, + "loss": 1.0519, + "mean_token_accuracy": 0.909615945070982, + "num_tokens": 15866145.0, + "step": 220 + }, + { + "epoch": 0.13794717746654703, + "grad_norm": 2.3297009468078613, + "learning_rate": 9.115999999999999e-07, + "loss": 1.0906, + "mean_token_accuracy": 0.9236158840358257, + "num_tokens": 15935429.0, + "step": 221 + }, + { + "epoch": 0.13857137283969884, + "grad_norm": 7.299313068389893, + "learning_rate": 9.112e-07, + "loss": 1.099, + "mean_token_accuracy": 0.9201101921498775, + "num_tokens": 16008366.0, + "step": 222 + }, + { + "epoch": 0.13919556821285062, + "grad_norm": 4.8594746589660645, + "learning_rate": 9.108e-07, + "loss": 1.0927, + "mean_token_accuracy": 0.91213034465909, + "num_tokens": 16078781.0, + "step": 223 + }, + { + "epoch": 0.13981976358600243, + "grad_norm": 9.167181015014648, + "learning_rate": 9.103999999999999e-07, + "loss": 1.1846, + "mean_token_accuracy": 0.9285093583166599, + "num_tokens": 16157560.0, + "step": 224 + }, + { + "epoch": 0.1404439589591542, + "grad_norm": 7.562880992889404, + "learning_rate": 9.1e-07, + "loss": 1.0718, + "mean_token_accuracy": 0.9226764664053917, + "num_tokens": 16229833.0, + "step": 225 + }, + { + "epoch": 0.14106815433230602, + "grad_norm": 7.0906500816345215, + "learning_rate": 9.095999999999999e-07, + "loss": 1.0585, + "mean_token_accuracy": 0.915476281195879, + "num_tokens": 16300263.0, + "step": 226 + }, + { + "epoch": 0.1416923497054578, + "grad_norm": 2.385265588760376, + "learning_rate": 9.092e-07, + "loss": 1.0082, + "mean_token_accuracy": 0.9205562435090542, + "num_tokens": 16369556.0, + "step": 227 + }, + { + "epoch": 0.1423165450786096, + "grad_norm": 6.672084331512451, + "learning_rate": 9.088e-07, + "loss": 1.079, + "mean_token_accuracy": 0.9254900999367237, + "num_tokens": 16441489.0, + "step": 228 + }, + { + "epoch": 0.1429407404517614, + "grad_norm": 4.2808518409729, + "learning_rate": 9.084e-07, + "loss": 0.9808, + "mean_token_accuracy": 0.9142305105924606, + "num_tokens": 16510117.0, + "step": 229 + }, + { + "epoch": 0.1435649358249132, + "grad_norm": 7.262923717498779, + "learning_rate": 9.08e-07, + "loss": 1.1066, + "mean_token_accuracy": 0.9294875040650368, + "num_tokens": 16586050.0, + "step": 230 + }, + { + "epoch": 0.144189131198065, + "grad_norm": 2.315251350402832, + "learning_rate": 9.075999999999999e-07, + "loss": 1.0402, + "mean_token_accuracy": 0.9211455695331097, + "num_tokens": 16658076.0, + "step": 231 + }, + { + "epoch": 0.1448133265712168, + "grad_norm": 4.604584217071533, + "learning_rate": 9.072e-07, + "loss": 0.8917, + "mean_token_accuracy": 0.9137470684945583, + "num_tokens": 16725015.0, + "step": 232 + }, + { + "epoch": 0.1454375219443686, + "grad_norm": 6.627901554107666, + "learning_rate": 9.068e-07, + "loss": 1.0017, + "mean_token_accuracy": 0.9179323613643646, + "num_tokens": 16796477.0, + "step": 233 + }, + { + "epoch": 0.14606171731752038, + "grad_norm": 7.2127227783203125, + "learning_rate": 9.063999999999999e-07, + "loss": 1.0481, + "mean_token_accuracy": 0.9238373003900051, + "num_tokens": 16873535.0, + "step": 234 + }, + { + "epoch": 0.1466859126906722, + "grad_norm": 6.750350475311279, + "learning_rate": 9.06e-07, + "loss": 1.0132, + "mean_token_accuracy": 0.9286116436123848, + "num_tokens": 16946255.0, + "step": 235 + }, + { + "epoch": 0.14731010806382397, + "grad_norm": 6.974132061004639, + "learning_rate": 9.056e-07, + "loss": 0.985, + "mean_token_accuracy": 0.9234875813126564, + "num_tokens": 17017960.0, + "step": 236 + }, + { + "epoch": 0.14793430343697578, + "grad_norm": 6.718789100646973, + "learning_rate": 9.051999999999999e-07, + "loss": 1.0101, + "mean_token_accuracy": 0.9212662689387798, + "num_tokens": 17092320.0, + "step": 237 + }, + { + "epoch": 0.14855849881012756, + "grad_norm": 2.276811361312866, + "learning_rate": 9.048e-07, + "loss": 0.8876, + "mean_token_accuracy": 0.9173206947743893, + "num_tokens": 17161200.0, + "step": 238 + }, + { + "epoch": 0.14918269418327937, + "grad_norm": 4.366613864898682, + "learning_rate": 9.044e-07, + "loss": 0.8801, + "mean_token_accuracy": 0.9057007618248463, + "num_tokens": 17227939.0, + "step": 239 + }, + { + "epoch": 0.14980688955643115, + "grad_norm": 6.607369422912598, + "learning_rate": 9.039999999999999e-07, + "loss": 1.0189, + "mean_token_accuracy": 0.9282168634235859, + "num_tokens": 17306041.0, + "step": 240 + }, + { + "epoch": 0.15043108492958296, + "grad_norm": 4.232856273651123, + "learning_rate": 9.035999999999999e-07, + "loss": 0.8925, + "mean_token_accuracy": 0.9180495738983154, + "num_tokens": 17375044.0, + "step": 241 + }, + { + "epoch": 0.15105528030273477, + "grad_norm": 4.2526044845581055, + "learning_rate": 9.032e-07, + "loss": 0.9413, + "mean_token_accuracy": 0.920606717467308, + "num_tokens": 17447118.0, + "step": 242 + }, + { + "epoch": 0.15167947567588655, + "grad_norm": 2.206423044204712, + "learning_rate": 9.028e-07, + "loss": 0.9194, + "mean_token_accuracy": 0.9231000654399395, + "num_tokens": 17520541.0, + "step": 243 + }, + { + "epoch": 0.15230367104903836, + "grad_norm": 6.550907135009766, + "learning_rate": 9.023999999999999e-07, + "loss": 0.8887, + "mean_token_accuracy": 0.9143834039568901, + "num_tokens": 17593508.0, + "step": 244 + }, + { + "epoch": 0.15292786642219014, + "grad_norm": 4.179169654846191, + "learning_rate": 9.02e-07, + "loss": 0.9158, + "mean_token_accuracy": 0.925802793353796, + "num_tokens": 17669670.0, + "step": 245 + }, + { + "epoch": 0.15355206179534195, + "grad_norm": 2.0171589851379395, + "learning_rate": 9.015999999999999e-07, + "loss": 0.8846, + "mean_token_accuracy": 0.9246592856943607, + "num_tokens": 17743445.0, + "step": 246 + }, + { + "epoch": 0.15417625716849373, + "grad_norm": 6.173559665679932, + "learning_rate": 9.011999999999999e-07, + "loss": 0.8851, + "mean_token_accuracy": 0.9320320673286915, + "num_tokens": 17818727.0, + "step": 247 + }, + { + "epoch": 0.15480045254164554, + "grad_norm": 6.248245716094971, + "learning_rate": 9.008e-07, + "loss": 0.7844, + "mean_token_accuracy": 0.912699606269598, + "num_tokens": 17885175.0, + "step": 248 + }, + { + "epoch": 0.15542464791479732, + "grad_norm": 2.034191131591797, + "learning_rate": 9.004e-07, + "loss": 0.8611, + "mean_token_accuracy": 0.9232048504054546, + "num_tokens": 17958917.0, + "step": 249 + }, + { + "epoch": 0.15604884328794913, + "grad_norm": 4.073791027069092, + "learning_rate": 9e-07, + "loss": 0.8231, + "mean_token_accuracy": 0.9232704900205135, + "num_tokens": 18031523.0, + "step": 250 + }, + { + "epoch": 0.15667303866110094, + "grad_norm": 6.003040313720703, + "learning_rate": 8.995999999999999e-07, + "loss": 0.8051, + "mean_token_accuracy": 0.9214721880853176, + "num_tokens": 18101157.0, + "step": 251 + }, + { + "epoch": 0.15729723403425272, + "grad_norm": 7.990306854248047, + "learning_rate": 8.992e-07, + "loss": 0.8663, + "mean_token_accuracy": 0.9240248017013073, + "num_tokens": 18178470.0, + "step": 252 + }, + { + "epoch": 0.15792142940740453, + "grad_norm": 7.7937116622924805, + "learning_rate": 8.988e-07, + "loss": 0.8144, + "mean_token_accuracy": 0.9162204191088676, + "num_tokens": 18251554.0, + "step": 253 + }, + { + "epoch": 0.1585456247805563, + "grad_norm": 4.021476745605469, + "learning_rate": 8.983999999999999e-07, + "loss": 0.7996, + "mean_token_accuracy": 0.917805690318346, + "num_tokens": 18322539.0, + "step": 254 + }, + { + "epoch": 0.15916982015370812, + "grad_norm": 8.010640144348145, + "learning_rate": 8.98e-07, + "loss": 0.7645, + "mean_token_accuracy": 0.9174820892512798, + "num_tokens": 18391936.0, + "step": 255 + }, + { + "epoch": 0.1597940155268599, + "grad_norm": 4.005990028381348, + "learning_rate": 8.975999999999999e-07, + "loss": 0.8062, + "mean_token_accuracy": 0.9221107959747314, + "num_tokens": 18463522.0, + "step": 256 + }, + { + "epoch": 0.1604182109000117, + "grad_norm": 3.8430492877960205, + "learning_rate": 8.972e-07, + "loss": 0.7543, + "mean_token_accuracy": 0.9232521802186966, + "num_tokens": 18535494.0, + "step": 257 + }, + { + "epoch": 0.1610424062731635, + "grad_norm": 5.738049507141113, + "learning_rate": 8.968e-07, + "loss": 0.7607, + "mean_token_accuracy": 0.9231390170753002, + "num_tokens": 18606448.0, + "step": 258 + }, + { + "epoch": 0.1616666016463153, + "grad_norm": 3.8291401863098145, + "learning_rate": 8.963999999999999e-07, + "loss": 0.782, + "mean_token_accuracy": 0.9246168956160545, + "num_tokens": 18678690.0, + "step": 259 + }, + { + "epoch": 0.16229079701946708, + "grad_norm": 3.7632787227630615, + "learning_rate": 8.96e-07, + "loss": 0.8008, + "mean_token_accuracy": 0.9147759675979614, + "num_tokens": 18750838.0, + "step": 260 + }, + { + "epoch": 0.1629149923926189, + "grad_norm": 2.0309298038482666, + "learning_rate": 8.955999999999999e-07, + "loss": 0.7, + "mean_token_accuracy": 0.906768687069416, + "num_tokens": 18816296.0, + "step": 261 + }, + { + "epoch": 0.1635391877657707, + "grad_norm": 3.681788921356201, + "learning_rate": 8.951999999999999e-07, + "loss": 0.7258, + "mean_token_accuracy": 0.9221327230334282, + "num_tokens": 18886104.0, + "step": 262 + }, + { + "epoch": 0.16416338313892248, + "grad_norm": 5.3626837730407715, + "learning_rate": 8.948e-07, + "loss": 0.6993, + "mean_token_accuracy": 0.9221790917217731, + "num_tokens": 18955722.0, + "step": 263 + }, + { + "epoch": 0.1647875785120743, + "grad_norm": 5.501428604125977, + "learning_rate": 8.944e-07, + "loss": 0.7254, + "mean_token_accuracy": 0.9257684424519539, + "num_tokens": 19029819.0, + "step": 264 + }, + { + "epoch": 0.16541177388522607, + "grad_norm": 3.6627449989318848, + "learning_rate": 8.939999999999999e-07, + "loss": 0.7309, + "mean_token_accuracy": 0.9193191789090633, + "num_tokens": 19102504.0, + "step": 265 + }, + { + "epoch": 0.16603596925837788, + "grad_norm": 0.2712026536464691, + "learning_rate": 8.935999999999999e-07, + "loss": 0.7061, + "mean_token_accuracy": 0.9194264896214008, + "num_tokens": 19173251.0, + "step": 266 + }, + { + "epoch": 0.16666016463152966, + "grad_norm": 5.12305212020874, + "learning_rate": 8.932e-07, + "loss": 0.7359, + "mean_token_accuracy": 0.9240240082144737, + "num_tokens": 19249509.0, + "step": 267 + }, + { + "epoch": 0.16728436000468147, + "grad_norm": 3.4386534690856934, + "learning_rate": 8.928e-07, + "loss": 0.692, + "mean_token_accuracy": 0.9101699814200401, + "num_tokens": 19318156.0, + "step": 268 + }, + { + "epoch": 0.16790855537783325, + "grad_norm": 5.161988735198975, + "learning_rate": 8.923999999999999e-07, + "loss": 0.7102, + "mean_token_accuracy": 0.9245063103735447, + "num_tokens": 19391686.0, + "step": 269 + }, + { + "epoch": 0.16853275075098506, + "grad_norm": 3.3804566860198975, + "learning_rate": 8.92e-07, + "loss": 0.6881, + "mean_token_accuracy": 0.9200237616896629, + "num_tokens": 19466118.0, + "step": 270 + }, + { + "epoch": 0.16915694612413684, + "grad_norm": 3.553159475326538, + "learning_rate": 8.915999999999999e-07, + "loss": 0.6936, + "mean_token_accuracy": 0.9296778216958046, + "num_tokens": 19541364.0, + "step": 271 + }, + { + "epoch": 0.16978114149728865, + "grad_norm": 1.7626819610595703, + "learning_rate": 8.911999999999999e-07, + "loss": 0.6934, + "mean_token_accuracy": 0.9221810065209866, + "num_tokens": 19613848.0, + "step": 272 + }, + { + "epoch": 0.17040533687044046, + "grad_norm": 1.6031099557876587, + "learning_rate": 8.908e-07, + "loss": 0.686, + "mean_token_accuracy": 0.9166489988565445, + "num_tokens": 19685477.0, + "step": 273 + }, + { + "epoch": 0.17102953224359224, + "grad_norm": 3.2471511363983154, + "learning_rate": 8.904e-07, + "loss": 0.6502, + "mean_token_accuracy": 0.9268861003220081, + "num_tokens": 19756640.0, + "step": 274 + }, + { + "epoch": 0.17165372761674405, + "grad_norm": 6.707427501678467, + "learning_rate": 8.9e-07, + "loss": 0.6561, + "mean_token_accuracy": 0.9250268638134003, + "num_tokens": 19828741.0, + "step": 275 + }, + { + "epoch": 0.17227792298989583, + "grad_norm": 3.2816786766052246, + "learning_rate": 8.895999999999999e-07, + "loss": 0.6568, + "mean_token_accuracy": 0.9165336675941944, + "num_tokens": 19900364.0, + "step": 276 + }, + { + "epoch": 0.17290211836304764, + "grad_norm": 6.137754440307617, + "learning_rate": 8.892e-07, + "loss": 0.6575, + "mean_token_accuracy": 0.9207403063774109, + "num_tokens": 19972956.0, + "step": 277 + }, + { + "epoch": 0.17352631373619942, + "grad_norm": 4.800975799560547, + "learning_rate": 8.888e-07, + "loss": 0.6715, + "mean_token_accuracy": 0.9269882887601852, + "num_tokens": 20048223.0, + "step": 278 + }, + { + "epoch": 0.17415050910935123, + "grad_norm": 3.0919814109802246, + "learning_rate": 8.883999999999999e-07, + "loss": 0.6271, + "mean_token_accuracy": 0.9199992828071117, + "num_tokens": 20121461.0, + "step": 279 + }, + { + "epoch": 0.174774704482503, + "grad_norm": 4.76736307144165, + "learning_rate": 8.88e-07, + "loss": 0.6244, + "mean_token_accuracy": 0.9144281633198261, + "num_tokens": 20189927.0, + "step": 280 + }, + { + "epoch": 0.17539889985565482, + "grad_norm": 2.987694263458252, + "learning_rate": 8.875999999999999e-07, + "loss": 0.6483, + "mean_token_accuracy": 0.9280169196426868, + "num_tokens": 20266049.0, + "step": 281 + }, + { + "epoch": 0.1760230952288066, + "grad_norm": 4.727084636688232, + "learning_rate": 8.872e-07, + "loss": 0.6123, + "mean_token_accuracy": 0.9181976951658726, + "num_tokens": 20334358.0, + "step": 282 + }, + { + "epoch": 0.1766472906019584, + "grad_norm": 5.995877742767334, + "learning_rate": 8.868e-07, + "loss": 0.6263, + "mean_token_accuracy": 0.9246489144861698, + "num_tokens": 20407439.0, + "step": 283 + }, + { + "epoch": 0.17727148597511022, + "grad_norm": 4.42982292175293, + "learning_rate": 8.863999999999999e-07, + "loss": 0.6197, + "mean_token_accuracy": 0.9229095615446568, + "num_tokens": 20479706.0, + "step": 284 + }, + { + "epoch": 0.177895681348262, + "grad_norm": 4.397432804107666, + "learning_rate": 8.86e-07, + "loss": 0.6677, + "mean_token_accuracy": 0.9228343367576599, + "num_tokens": 20556437.0, + "step": 285 + }, + { + "epoch": 0.1785198767214138, + "grad_norm": 4.311978340148926, + "learning_rate": 8.856e-07, + "loss": 0.6148, + "mean_token_accuracy": 0.9273019395768642, + "num_tokens": 20629867.0, + "step": 286 + }, + { + "epoch": 0.1791440720945656, + "grad_norm": 2.751955270767212, + "learning_rate": 8.851999999999999e-07, + "loss": 0.6209, + "mean_token_accuracy": 0.9213513396680355, + "num_tokens": 20703095.0, + "step": 287 + }, + { + "epoch": 0.1797682674677174, + "grad_norm": 4.102765083312988, + "learning_rate": 8.848e-07, + "loss": 0.6343, + "mean_token_accuracy": 0.9197850301861763, + "num_tokens": 20780530.0, + "step": 288 + }, + { + "epoch": 0.18039246284086918, + "grad_norm": 4.114978313446045, + "learning_rate": 8.844e-07, + "loss": 0.6205, + "mean_token_accuracy": 0.9163020439445972, + "num_tokens": 20852090.0, + "step": 289 + }, + { + "epoch": 0.181016658214021, + "grad_norm": 2.690525531768799, + "learning_rate": 8.839999999999999e-07, + "loss": 0.5775, + "mean_token_accuracy": 0.9212459437549114, + "num_tokens": 20924817.0, + "step": 290 + }, + { + "epoch": 0.18164085358717277, + "grad_norm": 2.542531967163086, + "learning_rate": 8.836e-07, + "loss": 0.5937, + "mean_token_accuracy": 0.9168826639652252, + "num_tokens": 20995736.0, + "step": 291 + }, + { + "epoch": 0.18226504896032458, + "grad_norm": 3.8409745693206787, + "learning_rate": 8.832e-07, + "loss": 0.5717, + "mean_token_accuracy": 0.9313341379165649, + "num_tokens": 21072521.0, + "step": 292 + }, + { + "epoch": 0.1828892443334764, + "grad_norm": 2.5837008953094482, + "learning_rate": 8.827999999999999e-07, + "loss": 0.5705, + "mean_token_accuracy": 0.9210564531385899, + "num_tokens": 21144339.0, + "step": 293 + }, + { + "epoch": 0.18351343970662817, + "grad_norm": 1.3285531997680664, + "learning_rate": 8.823999999999999e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.9195240288972855, + "num_tokens": 21214517.0, + "step": 294 + }, + { + "epoch": 0.18413763507977998, + "grad_norm": 2.5066184997558594, + "learning_rate": 8.82e-07, + "loss": 0.5684, + "mean_token_accuracy": 0.9227254241704941, + "num_tokens": 21287375.0, + "step": 295 + }, + { + "epoch": 0.18476183045293176, + "grad_norm": 3.7103333473205566, + "learning_rate": 8.816000000000001e-07, + "loss": 0.5736, + "mean_token_accuracy": 0.9259331077337265, + "num_tokens": 21364221.0, + "step": 296 + }, + { + "epoch": 0.18538602582608357, + "grad_norm": 3.6857786178588867, + "learning_rate": 8.811999999999999e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.9199340753257275, + "num_tokens": 21433773.0, + "step": 297 + }, + { + "epoch": 0.18601022119923535, + "grad_norm": 2.4223549365997314, + "learning_rate": 8.808e-07, + "loss": 0.5617, + "mean_token_accuracy": 0.9241475202143192, + "num_tokens": 21505557.0, + "step": 298 + }, + { + "epoch": 0.18663441657238716, + "grad_norm": 3.512373924255371, + "learning_rate": 8.804e-07, + "loss": 0.5554, + "mean_token_accuracy": 0.9271243251860142, + "num_tokens": 21578997.0, + "step": 299 + }, + { + "epoch": 0.18725861194553894, + "grad_norm": 1.1658852100372314, + "learning_rate": 8.799999999999999e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.91535235196352, + "num_tokens": 21649357.0, + "step": 300 + }, + { + "epoch": 0.18788280731869075, + "grad_norm": 1.2288845777511597, + "learning_rate": 8.796e-07, + "loss": 0.5274, + "mean_token_accuracy": 0.9191324077546597, + "num_tokens": 21718787.0, + "step": 301 + }, + { + "epoch": 0.18850700269184253, + "grad_norm": 1.1937166452407837, + "learning_rate": 8.792e-07, + "loss": 0.5581, + "mean_token_accuracy": 0.9143362790346146, + "num_tokens": 21789343.0, + "step": 302 + }, + { + "epoch": 0.18913119806499434, + "grad_norm": 3.4846014976501465, + "learning_rate": 8.788e-07, + "loss": 0.5822, + "mean_token_accuracy": 0.911719061434269, + "num_tokens": 21860110.0, + "step": 303 + }, + { + "epoch": 0.18975539343814615, + "grad_norm": 4.586836814880371, + "learning_rate": 8.783999999999999e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.9314946755766869, + "num_tokens": 21934618.0, + "step": 304 + }, + { + "epoch": 0.19037958881129793, + "grad_norm": 3.182224750518799, + "learning_rate": 8.78e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.9216264933347702, + "num_tokens": 22010108.0, + "step": 305 + }, + { + "epoch": 0.19100378418444974, + "grad_norm": 3.312130928039551, + "learning_rate": 8.776e-07, + "loss": 0.5419, + "mean_token_accuracy": 0.9131336398422718, + "num_tokens": 22078727.0, + "step": 306 + }, + { + "epoch": 0.19162797955760152, + "grad_norm": 1.0294653177261353, + "learning_rate": 8.771999999999999e-07, + "loss": 0.536, + "mean_token_accuracy": 0.9182093925774097, + "num_tokens": 22149618.0, + "step": 307 + }, + { + "epoch": 0.19225217493075333, + "grad_norm": 2.1827099323272705, + "learning_rate": 8.768e-07, + "loss": 0.528, + "mean_token_accuracy": 0.9168641194701195, + "num_tokens": 22219669.0, + "step": 308 + }, + { + "epoch": 0.19287637030390511, + "grad_norm": 2.200655460357666, + "learning_rate": 8.763999999999999e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.9270654171705246, + "num_tokens": 22291908.0, + "step": 309 + }, + { + "epoch": 0.19350056567705692, + "grad_norm": 4.376758098602295, + "learning_rate": 8.76e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.9147041626274586, + "num_tokens": 22359363.0, + "step": 310 + }, + { + "epoch": 0.1941247610502087, + "grad_norm": 1.9491456747055054, + "learning_rate": 8.756e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.9250685833394527, + "num_tokens": 22432494.0, + "step": 311 + }, + { + "epoch": 0.19474895642336051, + "grad_norm": 0.30999302864074707, + "learning_rate": 8.751999999999999e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.9207328855991364, + "num_tokens": 22503492.0, + "step": 312 + }, + { + "epoch": 0.1953731517965123, + "grad_norm": 1.0560754537582397, + "learning_rate": 8.748e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.9238898158073425, + "num_tokens": 22575416.0, + "step": 313 + }, + { + "epoch": 0.1959973471696641, + "grad_norm": 2.971607208251953, + "learning_rate": 8.743999999999999e-07, + "loss": 0.533, + "mean_token_accuracy": 0.9224493429064751, + "num_tokens": 22649065.0, + "step": 314 + }, + { + "epoch": 0.1966215425428159, + "grad_norm": 1.9408035278320312, + "learning_rate": 8.739999999999999e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.925947979092598, + "num_tokens": 22724399.0, + "step": 315 + }, + { + "epoch": 0.1972457379159677, + "grad_norm": 1.9919980764389038, + "learning_rate": 8.736e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.9135693944990635, + "num_tokens": 22795990.0, + "step": 316 + }, + { + "epoch": 0.1978699332891195, + "grad_norm": 1.16225266456604, + "learning_rate": 8.732e-07, + "loss": 0.515, + "mean_token_accuracy": 0.9201497547328472, + "num_tokens": 22867292.0, + "step": 317 + }, + { + "epoch": 0.19849412866227129, + "grad_norm": 1.9718466997146606, + "learning_rate": 8.728e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.9120406582951546, + "num_tokens": 22934474.0, + "step": 318 + }, + { + "epoch": 0.1991183240354231, + "grad_norm": 1.9588881731033325, + "learning_rate": 8.723999999999999e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.9219380579888821, + "num_tokens": 23007530.0, + "step": 319 + }, + { + "epoch": 0.19974251940857488, + "grad_norm": 0.9395773410797119, + "learning_rate": 8.72e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.9167209789156914, + "num_tokens": 23079046.0, + "step": 320 + }, + { + "epoch": 0.20036671478172668, + "grad_norm": 0.9463289380073547, + "learning_rate": 8.716e-07, + "loss": 0.5155, + "mean_token_accuracy": 0.9193439371883869, + "num_tokens": 23149666.0, + "step": 321 + }, + { + "epoch": 0.20099091015487847, + "grad_norm": 0.8668283224105835, + "learning_rate": 8.711999999999999e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.9267007894814014, + "num_tokens": 23221965.0, + "step": 322 + }, + { + "epoch": 0.20161510552803028, + "grad_norm": 0.9823909401893616, + "learning_rate": 8.708e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.9271320924162865, + "num_tokens": 23294488.0, + "step": 323 + }, + { + "epoch": 0.20223930090118206, + "grad_norm": 0.961146354675293, + "learning_rate": 8.704e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.9141620211303234, + "num_tokens": 23361710.0, + "step": 324 + }, + { + "epoch": 0.20286349627433387, + "grad_norm": 1.7309682369232178, + "learning_rate": 8.699999999999999e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.9189098365604877, + "num_tokens": 23437107.0, + "step": 325 + }, + { + "epoch": 0.20348769164748567, + "grad_norm": 0.912071943283081, + "learning_rate": 8.696e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.9214383848011494, + "num_tokens": 23504787.0, + "step": 326 + }, + { + "epoch": 0.20411188702063746, + "grad_norm": 0.9424958825111389, + "learning_rate": 8.692e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.9286565147340298, + "num_tokens": 23577551.0, + "step": 327 + }, + { + "epoch": 0.20473608239378926, + "grad_norm": 0.9649046063423157, + "learning_rate": 8.687999999999999e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.9243063032627106, + "num_tokens": 23651549.0, + "step": 328 + }, + { + "epoch": 0.20536027776694105, + "grad_norm": 1.6607611179351807, + "learning_rate": 8.683999999999999e-07, + "loss": 0.4811, + "mean_token_accuracy": 0.9211996085941792, + "num_tokens": 23724050.0, + "step": 329 + }, + { + "epoch": 0.20598447314009286, + "grad_norm": 0.9780636429786682, + "learning_rate": 8.68e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.920963428914547, + "num_tokens": 23793356.0, + "step": 330 + }, + { + "epoch": 0.20660866851324464, + "grad_norm": 2.5218260288238525, + "learning_rate": 8.676e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.9156523831188679, + "num_tokens": 23865901.0, + "step": 331 + }, + { + "epoch": 0.20723286388639645, + "grad_norm": 1.7883888483047485, + "learning_rate": 8.671999999999999e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.9203068241477013, + "num_tokens": 23933168.0, + "step": 332 + }, + { + "epoch": 0.20785705925954823, + "grad_norm": 0.8361603021621704, + "learning_rate": 8.668e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.9065203927457333, + "num_tokens": 24002835.0, + "step": 333 + }, + { + "epoch": 0.20848125463270004, + "grad_norm": 0.912175714969635, + "learning_rate": 8.663999999999999e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.923988152295351, + "num_tokens": 24075843.0, + "step": 334 + }, + { + "epoch": 0.20910545000585185, + "grad_norm": 2.5694775581359863, + "learning_rate": 8.659999999999999e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.9196614623069763, + "num_tokens": 24150074.0, + "step": 335 + }, + { + "epoch": 0.20972964537900363, + "grad_norm": 2.5357964038848877, + "learning_rate": 8.656e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.917326096445322, + "num_tokens": 24222073.0, + "step": 336 + }, + { + "epoch": 0.21035384075215544, + "grad_norm": 2.5211615562438965, + "learning_rate": 8.651999999999999e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.9228744432330132, + "num_tokens": 24295899.0, + "step": 337 + }, + { + "epoch": 0.21097803612530722, + "grad_norm": 0.8773064613342285, + "learning_rate": 8.648e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.9232124611735344, + "num_tokens": 24370875.0, + "step": 338 + }, + { + "epoch": 0.21160223149845903, + "grad_norm": 0.7836575508117676, + "learning_rate": 8.643999999999999e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.925804577767849, + "num_tokens": 24446073.0, + "step": 339 + }, + { + "epoch": 0.2122264268716108, + "grad_norm": 2.4918482303619385, + "learning_rate": 8.639999999999999e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.9221850037574768, + "num_tokens": 24519043.0, + "step": 340 + }, + { + "epoch": 0.21285062224476262, + "grad_norm": 0.8078194260597229, + "learning_rate": 8.636e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.9222163297235966, + "num_tokens": 24591558.0, + "step": 341 + }, + { + "epoch": 0.2134748176179144, + "grad_norm": 1.670353889465332, + "learning_rate": 8.632e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.9207929521799088, + "num_tokens": 24661489.0, + "step": 342 + }, + { + "epoch": 0.2140990129910662, + "grad_norm": 1.6045187711715698, + "learning_rate": 8.628e-07, + "loss": 0.4317, + "mean_token_accuracy": 0.9321544617414474, + "num_tokens": 24735383.0, + "step": 343 + }, + { + "epoch": 0.214723208364218, + "grad_norm": 1.4934444427490234, + "learning_rate": 8.624e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.91195372864604, + "num_tokens": 24808092.0, + "step": 344 + }, + { + "epoch": 0.2153474037373698, + "grad_norm": 0.8003431558609009, + "learning_rate": 8.62e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.9115734659135342, + "num_tokens": 24874295.0, + "step": 345 + }, + { + "epoch": 0.2159715991105216, + "grad_norm": 2.375620126724243, + "learning_rate": 8.616e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.9086932577192783, + "num_tokens": 24940399.0, + "step": 346 + }, + { + "epoch": 0.2165957944836734, + "grad_norm": 3.018425703048706, + "learning_rate": 8.611999999999999e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.9146109260618687, + "num_tokens": 25010896.0, + "step": 347 + }, + { + "epoch": 0.2172199898568252, + "grad_norm": 1.399186372756958, + "learning_rate": 8.608e-07, + "loss": 0.4331, + "mean_token_accuracy": 0.9315755292773247, + "num_tokens": 25085233.0, + "step": 348 + }, + { + "epoch": 0.21784418522997698, + "grad_norm": 0.8286827206611633, + "learning_rate": 8.604000000000001e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.9212476573884487, + "num_tokens": 25150827.0, + "step": 349 + }, + { + "epoch": 0.2184683806031288, + "grad_norm": 0.7845478653907776, + "learning_rate": 8.599999999999999e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.9202132783830166, + "num_tokens": 25217061.0, + "step": 350 + }, + { + "epoch": 0.21909257597628057, + "grad_norm": 0.7909486293792725, + "learning_rate": 8.596e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.9279148392379284, + "num_tokens": 25291821.0, + "step": 351 + }, + { + "epoch": 0.21971677134943238, + "grad_norm": 1.524327278137207, + "learning_rate": 8.592e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.9234129972755909, + "num_tokens": 25359049.0, + "step": 352 + }, + { + "epoch": 0.22034096672258416, + "grad_norm": 2.1286449432373047, + "learning_rate": 8.587999999999999e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.925106979906559, + "num_tokens": 25431227.0, + "step": 353 + }, + { + "epoch": 0.22096516209573597, + "grad_norm": 2.2779810428619385, + "learning_rate": 8.584e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.9252244904637337, + "num_tokens": 25506007.0, + "step": 354 + }, + { + "epoch": 0.22158935746888775, + "grad_norm": 1.3562296628952026, + "learning_rate": 8.58e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.9169961102306843, + "num_tokens": 25576911.0, + "step": 355 + }, + { + "epoch": 0.22221355284203956, + "grad_norm": 0.7795654535293579, + "learning_rate": 8.576e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.9218585044145584, + "num_tokens": 25645426.0, + "step": 356 + }, + { + "epoch": 0.22283774821519137, + "grad_norm": 1.5523988008499146, + "learning_rate": 8.571999999999999e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.9274989813566208, + "num_tokens": 25720071.0, + "step": 357 + }, + { + "epoch": 0.22346194358834315, + "grad_norm": 1.2857192754745483, + "learning_rate": 8.568e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.9232259094715118, + "num_tokens": 25795999.0, + "step": 358 + }, + { + "epoch": 0.22408613896149496, + "grad_norm": 2.137576103210449, + "learning_rate": 8.564e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.9243124797940254, + "num_tokens": 25867136.0, + "step": 359 + }, + { + "epoch": 0.22471033433464674, + "grad_norm": 2.458399534225464, + "learning_rate": 8.559999999999999e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.9150168560445309, + "num_tokens": 25935016.0, + "step": 360 + }, + { + "epoch": 0.22533452970779855, + "grad_norm": 1.9607969522476196, + "learning_rate": 8.556e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.923651646822691, + "num_tokens": 26011032.0, + "step": 361 + }, + { + "epoch": 0.22595872508095033, + "grad_norm": 1.3728708028793335, + "learning_rate": 8.551999999999999e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.9165203161537647, + "num_tokens": 26080969.0, + "step": 362 + }, + { + "epoch": 0.22658292045410214, + "grad_norm": 1.392662763595581, + "learning_rate": 8.548e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.9057382196187973, + "num_tokens": 26149313.0, + "step": 363 + }, + { + "epoch": 0.22720711582725392, + "grad_norm": 1.3480995893478394, + "learning_rate": 8.544e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.9191391468048096, + "num_tokens": 26219897.0, + "step": 364 + }, + { + "epoch": 0.22783131120040573, + "grad_norm": 2.04896879196167, + "learning_rate": 8.539999999999999e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.9199820160865784, + "num_tokens": 26288382.0, + "step": 365 + }, + { + "epoch": 0.2284555065735575, + "grad_norm": 1.9956952333450317, + "learning_rate": 8.536e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.9249612577259541, + "num_tokens": 26361670.0, + "step": 366 + }, + { + "epoch": 0.22907970194670932, + "grad_norm": 0.7502142190933228, + "learning_rate": 8.531999999999999e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.9189267456531525, + "num_tokens": 26433413.0, + "step": 367 + }, + { + "epoch": 0.22970389731986113, + "grad_norm": 1.308139443397522, + "learning_rate": 8.528e-07, + "loss": 0.425, + "mean_token_accuracy": 0.926760770380497, + "num_tokens": 26505506.0, + "step": 368 + }, + { + "epoch": 0.2303280926930129, + "grad_norm": 1.296675205230713, + "learning_rate": 8.524e-07, + "loss": 0.4051, + "mean_token_accuracy": 0.9225211814045906, + "num_tokens": 26575815.0, + "step": 369 + }, + { + "epoch": 0.23095228806616472, + "grad_norm": 1.9104191064834595, + "learning_rate": 8.52e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.9201551452279091, + "num_tokens": 26647797.0, + "step": 370 + }, + { + "epoch": 0.2315764834393165, + "grad_norm": 0.8089154362678528, + "learning_rate": 8.516e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.9099730663001537, + "num_tokens": 26717975.0, + "step": 371 + }, + { + "epoch": 0.2322006788124683, + "grad_norm": 1.8594608306884766, + "learning_rate": 8.511999999999999e-07, + "loss": 0.4212, + "mean_token_accuracy": 0.9235678352415562, + "num_tokens": 26792156.0, + "step": 372 + }, + { + "epoch": 0.2328248741856201, + "grad_norm": 1.2001798152923584, + "learning_rate": 8.508e-07, + "loss": 0.4173, + "mean_token_accuracy": 0.9245698302984238, + "num_tokens": 26864277.0, + "step": 373 + }, + { + "epoch": 0.2334490695587719, + "grad_norm": 1.230749249458313, + "learning_rate": 8.504e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.921494010835886, + "num_tokens": 26937479.0, + "step": 374 + }, + { + "epoch": 0.23407326493192368, + "grad_norm": 1.1909148693084717, + "learning_rate": 8.499999999999999e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.9290775023400784, + "num_tokens": 27014918.0, + "step": 375 + }, + { + "epoch": 0.2346974603050755, + "grad_norm": 1.3617513179779053, + "learning_rate": 8.496e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.9312286600470543, + "num_tokens": 27090755.0, + "step": 376 + }, + { + "epoch": 0.2353216556782273, + "grad_norm": 0.6376486420631409, + "learning_rate": 8.492e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.9228227473795414, + "num_tokens": 27162943.0, + "step": 377 + }, + { + "epoch": 0.23594585105137908, + "grad_norm": 1.1157821416854858, + "learning_rate": 8.487999999999999e-07, + "loss": 0.4072, + "mean_token_accuracy": 0.9222900941967964, + "num_tokens": 27233894.0, + "step": 378 + }, + { + "epoch": 0.2365700464245309, + "grad_norm": 0.5952740907669067, + "learning_rate": 8.484e-07, + "loss": 0.4095, + "mean_token_accuracy": 0.9275263771414757, + "num_tokens": 27306114.0, + "step": 379 + }, + { + "epoch": 0.23719424179768267, + "grad_norm": 0.5804332494735718, + "learning_rate": 8.48e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.9191073924303055, + "num_tokens": 27378710.0, + "step": 380 + }, + { + "epoch": 0.23781843717083448, + "grad_norm": 0.6052722930908203, + "learning_rate": 8.475999999999999e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.9200603328645229, + "num_tokens": 27448079.0, + "step": 381 + }, + { + "epoch": 0.23844263254398626, + "grad_norm": 1.1853963136672974, + "learning_rate": 8.471999999999999e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.914018739014864, + "num_tokens": 27518565.0, + "step": 382 + }, + { + "epoch": 0.23906682791713807, + "grad_norm": 0.6546916961669922, + "learning_rate": 8.468e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.9173280820250511, + "num_tokens": 27587846.0, + "step": 383 + }, + { + "epoch": 0.23969102329028985, + "grad_norm": 1.7882285118103027, + "learning_rate": 8.464e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.9251374267041683, + "num_tokens": 27661293.0, + "step": 384 + }, + { + "epoch": 0.24031521866344166, + "grad_norm": 2.310976266860962, + "learning_rate": 8.459999999999999e-07, + "loss": 0.4088, + "mean_token_accuracy": 0.9227562807500362, + "num_tokens": 27733527.0, + "step": 385 + }, + { + "epoch": 0.24093941403659344, + "grad_norm": 0.3753027319908142, + "learning_rate": 8.456e-07, + "loss": 0.406, + "mean_token_accuracy": 0.9240587763488293, + "num_tokens": 27809744.0, + "step": 386 + }, + { + "epoch": 0.24156360940974525, + "grad_norm": 1.1255004405975342, + "learning_rate": 8.451999999999999e-07, + "loss": 0.4027, + "mean_token_accuracy": 0.922537162899971, + "num_tokens": 27879908.0, + "step": 387 + }, + { + "epoch": 0.24218780478289706, + "grad_norm": 1.728273630142212, + "learning_rate": 8.447999999999999e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.9165510907769203, + "num_tokens": 27947722.0, + "step": 388 + }, + { + "epoch": 0.24281200015604884, + "grad_norm": 1.1392329931259155, + "learning_rate": 8.444e-07, + "loss": 0.4137, + "mean_token_accuracy": 0.9213821589946747, + "num_tokens": 28016839.0, + "step": 389 + }, + { + "epoch": 0.24343619552920065, + "grad_norm": 1.1253199577331543, + "learning_rate": 8.439999999999999e-07, + "loss": 0.4112, + "mean_token_accuracy": 0.9206866696476936, + "num_tokens": 28084621.0, + "step": 390 + }, + { + "epoch": 0.24406039090235243, + "grad_norm": 1.1136630773544312, + "learning_rate": 8.436e-07, + "loss": 0.4187, + "mean_token_accuracy": 0.9186587296426296, + "num_tokens": 28157293.0, + "step": 391 + }, + { + "epoch": 0.24468458627550424, + "grad_norm": 0.5871824622154236, + "learning_rate": 8.431999999999999e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.9158234111964703, + "num_tokens": 28227939.0, + "step": 392 + }, + { + "epoch": 0.24530878164865602, + "grad_norm": 2.1546518802642822, + "learning_rate": 8.428e-07, + "loss": 0.3839, + "mean_token_accuracy": 0.928234089165926, + "num_tokens": 28301942.0, + "step": 393 + }, + { + "epoch": 0.24593297702180783, + "grad_norm": 1.6724817752838135, + "learning_rate": 8.424e-07, + "loss": 0.4064, + "mean_token_accuracy": 0.9248273372650146, + "num_tokens": 28376719.0, + "step": 394 + }, + { + "epoch": 0.2465571723949596, + "grad_norm": 1.69247305393219, + "learning_rate": 8.419999999999999e-07, + "loss": 0.4113, + "mean_token_accuracy": 0.9238225184381008, + "num_tokens": 28453154.0, + "step": 395 + }, + { + "epoch": 0.24718136776811142, + "grad_norm": 1.1182528734207153, + "learning_rate": 8.416e-07, + "loss": 0.3865, + "mean_token_accuracy": 0.9262501187622547, + "num_tokens": 28520596.0, + "step": 396 + }, + { + "epoch": 0.2478055631412632, + "grad_norm": 0.5353150963783264, + "learning_rate": 8.411999999999999e-07, + "loss": 0.4092, + "mean_token_accuracy": 0.9224611297249794, + "num_tokens": 28589771.0, + "step": 397 + }, + { + "epoch": 0.248429758514415, + "grad_norm": 1.5561952590942383, + "learning_rate": 8.408e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.9252982772886753, + "num_tokens": 28665975.0, + "step": 398 + }, + { + "epoch": 0.24905395388756682, + "grad_norm": 0.9938348531723022, + "learning_rate": 8.404e-07, + "loss": 0.3826, + "mean_token_accuracy": 0.9210449159145355, + "num_tokens": 28735893.0, + "step": 399 + }, + { + "epoch": 0.2496781492607186, + "grad_norm": 1.2187830209732056, + "learning_rate": 8.399999999999999e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.9271294698119164, + "num_tokens": 28810621.0, + "step": 400 + }, + { + "epoch": 0.2503023446338704, + "grad_norm": 1.3881856203079224, + "learning_rate": 8.396e-07, + "loss": 0.3908, + "mean_token_accuracy": 0.9332247227430344, + "num_tokens": 28888430.0, + "step": 401 + }, + { + "epoch": 0.2509265400070222, + "grad_norm": 1.5399160385131836, + "learning_rate": 8.391999999999999e-07, + "loss": 0.4008, + "mean_token_accuracy": 0.926115620881319, + "num_tokens": 28960778.0, + "step": 402 + }, + { + "epoch": 0.251550735380174, + "grad_norm": 1.5015639066696167, + "learning_rate": 8.387999999999999e-07, + "loss": 0.38, + "mean_token_accuracy": 0.9280642941594124, + "num_tokens": 29032917.0, + "step": 403 + }, + { + "epoch": 0.2521749307533258, + "grad_norm": 0.9755170345306396, + "learning_rate": 8.384e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.9176996536552906, + "num_tokens": 29104383.0, + "step": 404 + }, + { + "epoch": 0.2527991261264776, + "grad_norm": 0.9623498916625977, + "learning_rate": 8.38e-07, + "loss": 0.3931, + "mean_token_accuracy": 0.9232301451265812, + "num_tokens": 29176946.0, + "step": 405 + }, + { + "epoch": 0.2534233214996294, + "grad_norm": 1.4719113111495972, + "learning_rate": 8.375999999999999e-07, + "loss": 0.3994, + "mean_token_accuracy": 0.9208735749125481, + "num_tokens": 29250010.0, + "step": 406 + }, + { + "epoch": 0.25404751687278115, + "grad_norm": 1.420078158378601, + "learning_rate": 8.372e-07, + "loss": 0.3847, + "mean_token_accuracy": 0.9272407628595829, + "num_tokens": 29323717.0, + "step": 407 + }, + { + "epoch": 0.254671712245933, + "grad_norm": 0.936745285987854, + "learning_rate": 8.368e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.9173033535480499, + "num_tokens": 29394139.0, + "step": 408 + }, + { + "epoch": 0.25529590761908477, + "grad_norm": 0.5124381184577942, + "learning_rate": 8.363999999999999e-07, + "loss": 0.3956, + "mean_token_accuracy": 0.9207809679210186, + "num_tokens": 29462291.0, + "step": 409 + }, + { + "epoch": 0.25592010299223655, + "grad_norm": 0.541309118270874, + "learning_rate": 8.359999999999999e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.9308284670114517, + "num_tokens": 29538759.0, + "step": 410 + }, + { + "epoch": 0.2565442983653884, + "grad_norm": 0.9743673205375671, + "learning_rate": 8.356e-07, + "loss": 0.3842, + "mean_token_accuracy": 0.9242149740457535, + "num_tokens": 29612667.0, + "step": 411 + }, + { + "epoch": 0.25716849373854017, + "grad_norm": 0.531112015247345, + "learning_rate": 8.352000000000001e-07, + "loss": 0.371, + "mean_token_accuracy": 0.9262283891439438, + "num_tokens": 29683930.0, + "step": 412 + }, + { + "epoch": 0.25779268911169195, + "grad_norm": 1.4184839725494385, + "learning_rate": 8.347999999999999e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.9264179468154907, + "num_tokens": 29756605.0, + "step": 413 + }, + { + "epoch": 0.25841688448484373, + "grad_norm": 1.4830143451690674, + "learning_rate": 8.344e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.9165854007005692, + "num_tokens": 29827260.0, + "step": 414 + }, + { + "epoch": 0.25904107985799557, + "grad_norm": 1.3852146863937378, + "learning_rate": 8.34e-07, + "loss": 0.3884, + "mean_token_accuracy": 0.9258201904594898, + "num_tokens": 29900579.0, + "step": 415 + }, + { + "epoch": 0.25966527523114735, + "grad_norm": 1.811882734298706, + "learning_rate": 8.335999999999999e-07, + "loss": 0.3795, + "mean_token_accuracy": 0.92490029707551, + "num_tokens": 29975594.0, + "step": 416 + }, + { + "epoch": 0.26028947060429913, + "grad_norm": 1.353249192237854, + "learning_rate": 8.332e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.9173424355685711, + "num_tokens": 30046265.0, + "step": 417 + }, + { + "epoch": 0.2609136659774509, + "grad_norm": 0.9155253171920776, + "learning_rate": 8.328e-07, + "loss": 0.3977, + "mean_token_accuracy": 0.9226347766816616, + "num_tokens": 30121600.0, + "step": 418 + }, + { + "epoch": 0.26153786135060275, + "grad_norm": 1.4468632936477661, + "learning_rate": 8.324e-07, + "loss": 0.3684, + "mean_token_accuracy": 0.9264166504144669, + "num_tokens": 30196202.0, + "step": 419 + }, + { + "epoch": 0.26216205672375453, + "grad_norm": 0.9410948753356934, + "learning_rate": 8.319999999999999e-07, + "loss": 0.4268, + "mean_token_accuracy": 0.9185077361762524, + "num_tokens": 30266252.0, + "step": 420 + }, + { + "epoch": 0.2627862520969063, + "grad_norm": 0.9149848222732544, + "learning_rate": 8.316e-07, + "loss": 0.4048, + "mean_token_accuracy": 0.9243842512369156, + "num_tokens": 30341952.0, + "step": 421 + }, + { + "epoch": 0.26341044747005815, + "grad_norm": 0.8869105577468872, + "learning_rate": 8.312e-07, + "loss": 0.3984, + "mean_token_accuracy": 0.921892773360014, + "num_tokens": 30410852.0, + "step": 422 + }, + { + "epoch": 0.26403464284320993, + "grad_norm": 0.9244428277015686, + "learning_rate": 8.308e-07, + "loss": 0.4001, + "mean_token_accuracy": 0.919981986284256, + "num_tokens": 30480407.0, + "step": 423 + }, + { + "epoch": 0.2646588382163617, + "grad_norm": 1.783027172088623, + "learning_rate": 8.304e-07, + "loss": 0.4101, + "mean_token_accuracy": 0.9190192967653275, + "num_tokens": 30554652.0, + "step": 424 + }, + { + "epoch": 0.2652830335895135, + "grad_norm": 0.8427060842514038, + "learning_rate": 8.299999999999999e-07, + "loss": 0.3868, + "mean_token_accuracy": 0.9234179295599461, + "num_tokens": 30623620.0, + "step": 425 + }, + { + "epoch": 0.26590722896266533, + "grad_norm": 0.24793501198291779, + "learning_rate": 8.296e-07, + "loss": 0.3966, + "mean_token_accuracy": 0.9199277870357037, + "num_tokens": 30697139.0, + "step": 426 + }, + { + "epoch": 0.2665314243358171, + "grad_norm": 0.46406906843185425, + "learning_rate": 8.292e-07, + "loss": 0.3978, + "mean_token_accuracy": 0.9246162809431553, + "num_tokens": 30769639.0, + "step": 427 + }, + { + "epoch": 0.2671556197089689, + "grad_norm": 0.27684247493743896, + "learning_rate": 8.287999999999999e-07, + "loss": 0.3624, + "mean_token_accuracy": 0.9286627471446991, + "num_tokens": 30844746.0, + "step": 428 + }, + { + "epoch": 0.26777981508212073, + "grad_norm": 0.486666202545166, + "learning_rate": 8.284e-07, + "loss": 0.3778, + "mean_token_accuracy": 0.9263641089200974, + "num_tokens": 30921360.0, + "step": 429 + }, + { + "epoch": 0.2684040104552725, + "grad_norm": 0.8701847195625305, + "learning_rate": 8.28e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.9192351438105106, + "num_tokens": 30990600.0, + "step": 430 + }, + { + "epoch": 0.2690282058284243, + "grad_norm": 0.47957533597946167, + "learning_rate": 8.275999999999999e-07, + "loss": 0.3453, + "mean_token_accuracy": 0.9272926487028599, + "num_tokens": 31066108.0, + "step": 431 + }, + { + "epoch": 0.2696524012015761, + "grad_norm": 1.20380437374115, + "learning_rate": 8.272e-07, + "loss": 0.3861, + "mean_token_accuracy": 0.9172127097845078, + "num_tokens": 31133799.0, + "step": 432 + }, + { + "epoch": 0.2702765965747279, + "grad_norm": 1.2132657766342163, + "learning_rate": 8.268e-07, + "loss": 0.3743, + "mean_token_accuracy": 0.9244321323931217, + "num_tokens": 31205749.0, + "step": 433 + }, + { + "epoch": 0.2709007919478797, + "grad_norm": 0.5054382681846619, + "learning_rate": 8.263999999999999e-07, + "loss": 0.3527, + "mean_token_accuracy": 0.9261834844946861, + "num_tokens": 31274405.0, + "step": 434 + }, + { + "epoch": 0.2715249873210315, + "grad_norm": 1.2120206356048584, + "learning_rate": 8.259999999999999e-07, + "loss": 0.3817, + "mean_token_accuracy": 0.9236193560063839, + "num_tokens": 31346190.0, + "step": 435 + }, + { + "epoch": 0.27214918269418326, + "grad_norm": 1.2147562503814697, + "learning_rate": 8.256e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.9178253710269928, + "num_tokens": 31417066.0, + "step": 436 + }, + { + "epoch": 0.2727733780673351, + "grad_norm": 0.8387507200241089, + "learning_rate": 8.252000000000001e-07, + "loss": 0.4085, + "mean_token_accuracy": 0.9183575063943863, + "num_tokens": 31487638.0, + "step": 437 + }, + { + "epoch": 0.2733975734404869, + "grad_norm": 0.457234650850296, + "learning_rate": 8.247999999999999e-07, + "loss": 0.3921, + "mean_token_accuracy": 0.920664619654417, + "num_tokens": 31555726.0, + "step": 438 + }, + { + "epoch": 0.27402176881363866, + "grad_norm": 1.218550205230713, + "learning_rate": 8.244e-07, + "loss": 0.3972, + "mean_token_accuracy": 0.9214368127286434, + "num_tokens": 31626798.0, + "step": 439 + }, + { + "epoch": 0.2746459641867905, + "grad_norm": 1.2510223388671875, + "learning_rate": 8.24e-07, + "loss": 0.368, + "mean_token_accuracy": 0.9258331805467606, + "num_tokens": 31698223.0, + "step": 440 + }, + { + "epoch": 0.2752701595599423, + "grad_norm": 1.1848870515823364, + "learning_rate": 8.235999999999999e-07, + "loss": 0.4065, + "mean_token_accuracy": 0.9150502532720566, + "num_tokens": 31770218.0, + "step": 441 + }, + { + "epoch": 0.27589435493309405, + "grad_norm": 0.8210954666137695, + "learning_rate": 8.232e-07, + "loss": 0.3879, + "mean_token_accuracy": 0.9278232716023922, + "num_tokens": 31845687.0, + "step": 442 + }, + { + "epoch": 0.27651855030624584, + "grad_norm": 0.8770108819007874, + "learning_rate": 8.228e-07, + "loss": 0.399, + "mean_token_accuracy": 0.9185272864997387, + "num_tokens": 31916228.0, + "step": 443 + }, + { + "epoch": 0.2771427456793977, + "grad_norm": 1.1238223314285278, + "learning_rate": 8.224e-07, + "loss": 0.3823, + "mean_token_accuracy": 0.9234112687408924, + "num_tokens": 31986380.0, + "step": 444 + }, + { + "epoch": 0.27776694105254945, + "grad_norm": 0.8185886144638062, + "learning_rate": 8.219999999999999e-07, + "loss": 0.3675, + "mean_token_accuracy": 0.9260559901595116, + "num_tokens": 32057315.0, + "step": 445 + }, + { + "epoch": 0.27839113642570124, + "grad_norm": 1.1564223766326904, + "learning_rate": 8.216e-07, + "loss": 0.3536, + "mean_token_accuracy": 0.9315725937485695, + "num_tokens": 32134655.0, + "step": 446 + }, + { + "epoch": 0.279015331798853, + "grad_norm": 1.0922423601150513, + "learning_rate": 8.212e-07, + "loss": 0.3777, + "mean_token_accuracy": 0.919820062816143, + "num_tokens": 32206037.0, + "step": 447 + }, + { + "epoch": 0.27963952717200485, + "grad_norm": 0.8300904631614685, + "learning_rate": 8.207999999999999e-07, + "loss": 0.3843, + "mean_token_accuracy": 0.9201874099671841, + "num_tokens": 32271782.0, + "step": 448 + }, + { + "epoch": 0.28026372254515663, + "grad_norm": 1.0689630508422852, + "learning_rate": 8.204e-07, + "loss": 0.4132, + "mean_token_accuracy": 0.9163887202739716, + "num_tokens": 32341728.0, + "step": 449 + }, + { + "epoch": 0.2808879179183084, + "grad_norm": 0.7719376683235168, + "learning_rate": 8.199999999999999e-07, + "loss": 0.4028, + "mean_token_accuracy": 0.9178178422152996, + "num_tokens": 32413746.0, + "step": 450 + }, + { + "epoch": 0.28151211329146025, + "grad_norm": 0.8189422488212585, + "learning_rate": 8.196e-07, + "loss": 0.3718, + "mean_token_accuracy": 0.9275276958942413, + "num_tokens": 32489023.0, + "step": 451 + }, + { + "epoch": 0.28213630866461203, + "grad_norm": 0.7677391171455383, + "learning_rate": 8.192e-07, + "loss": 0.3993, + "mean_token_accuracy": 0.9175617136061192, + "num_tokens": 32557619.0, + "step": 452 + }, + { + "epoch": 0.2827605040377638, + "grad_norm": 0.7534482479095459, + "learning_rate": 8.187999999999999e-07, + "loss": 0.3715, + "mean_token_accuracy": 0.9263954050838947, + "num_tokens": 32629108.0, + "step": 453 + }, + { + "epoch": 0.2833846994109156, + "grad_norm": 1.0587571859359741, + "learning_rate": 8.184e-07, + "loss": 0.393, + "mean_token_accuracy": 0.9122283458709717, + "num_tokens": 32699246.0, + "step": 454 + }, + { + "epoch": 0.28400889478406743, + "grad_norm": 1.0744235515594482, + "learning_rate": 8.179999999999999e-07, + "loss": 0.3799, + "mean_token_accuracy": 0.9200029857456684, + "num_tokens": 32771788.0, + "step": 455 + }, + { + "epoch": 0.2846330901572192, + "grad_norm": 0.5133286714553833, + "learning_rate": 8.175999999999999e-07, + "loss": 0.4008, + "mean_token_accuracy": 0.9154097177088261, + "num_tokens": 32844323.0, + "step": 456 + }, + { + "epoch": 0.285257285530371, + "grad_norm": 0.4051693081855774, + "learning_rate": 8.172e-07, + "loss": 0.3743, + "mean_token_accuracy": 0.9187863431870937, + "num_tokens": 32911195.0, + "step": 457 + }, + { + "epoch": 0.2858814809035228, + "grad_norm": 0.43125689029693604, + "learning_rate": 8.168e-07, + "loss": 0.3903, + "mean_token_accuracy": 0.9152910970151424, + "num_tokens": 32983744.0, + "step": 458 + }, + { + "epoch": 0.2865056762766746, + "grad_norm": 1.155842900276184, + "learning_rate": 8.163999999999999e-07, + "loss": 0.3766, + "mean_token_accuracy": 0.9185449928045273, + "num_tokens": 33050055.0, + "step": 459 + }, + { + "epoch": 0.2871298716498264, + "grad_norm": 0.6426854133605957, + "learning_rate": 8.159999999999999e-07, + "loss": 0.3952, + "mean_token_accuracy": 0.9194389209151268, + "num_tokens": 33123662.0, + "step": 460 + }, + { + "epoch": 0.2877540670229782, + "grad_norm": 0.3941771388053894, + "learning_rate": 8.156e-07, + "loss": 0.3895, + "mean_token_accuracy": 0.9190852232277393, + "num_tokens": 33195820.0, + "step": 461 + }, + { + "epoch": 0.28837826239613, + "grad_norm": 0.4276549220085144, + "learning_rate": 8.152e-07, + "loss": 0.3632, + "mean_token_accuracy": 0.9277602657675743, + "num_tokens": 33269124.0, + "step": 462 + }, + { + "epoch": 0.2890024577692818, + "grad_norm": 0.4527488946914673, + "learning_rate": 8.147999999999999e-07, + "loss": 0.3597, + "mean_token_accuracy": 0.927538301795721, + "num_tokens": 33338440.0, + "step": 463 + }, + { + "epoch": 0.2896266531424336, + "grad_norm": 0.3035643994808197, + "learning_rate": 8.144e-07, + "loss": 0.3548, + "mean_token_accuracy": 0.9277514107525349, + "num_tokens": 33413283.0, + "step": 464 + }, + { + "epoch": 0.29025084851558536, + "grad_norm": 0.6827325224876404, + "learning_rate": 8.14e-07, + "loss": 0.3905, + "mean_token_accuracy": 0.9223032481968403, + "num_tokens": 33486403.0, + "step": 465 + }, + { + "epoch": 0.2908750438887372, + "grad_norm": 0.3996064364910126, + "learning_rate": 8.135999999999999e-07, + "loss": 0.3597, + "mean_token_accuracy": 0.9243958182632923, + "num_tokens": 33556839.0, + "step": 466 + }, + { + "epoch": 0.291499239261889, + "grad_norm": 1.0095113515853882, + "learning_rate": 8.132e-07, + "loss": 0.3672, + "mean_token_accuracy": 0.9211184307932854, + "num_tokens": 33627939.0, + "step": 467 + }, + { + "epoch": 0.29212343463504076, + "grad_norm": 0.6771048307418823, + "learning_rate": 8.128e-07, + "loss": 0.3493, + "mean_token_accuracy": 0.9311572276055813, + "num_tokens": 33703915.0, + "step": 468 + }, + { + "epoch": 0.29274763000819254, + "grad_norm": 1.3612362146377563, + "learning_rate": 8.123999999999999e-07, + "loss": 0.3898, + "mean_token_accuracy": 0.9170313812792301, + "num_tokens": 33774039.0, + "step": 469 + }, + { + "epoch": 0.2933718253813444, + "grad_norm": 0.9771288633346558, + "learning_rate": 8.12e-07, + "loss": 0.3882, + "mean_token_accuracy": 0.9206005297601223, + "num_tokens": 33849336.0, + "step": 470 + }, + { + "epoch": 0.29399602075449616, + "grad_norm": 1.024867057800293, + "learning_rate": 8.116e-07, + "loss": 0.3446, + "mean_token_accuracy": 0.9304168000817299, + "num_tokens": 33923177.0, + "step": 471 + }, + { + "epoch": 0.29462021612764794, + "grad_norm": 0.6518855690956116, + "learning_rate": 8.112e-07, + "loss": 0.361, + "mean_token_accuracy": 0.9236439689993858, + "num_tokens": 33996124.0, + "step": 472 + }, + { + "epoch": 0.2952444115007998, + "grad_norm": 0.6530463099479675, + "learning_rate": 8.107999999999999e-07, + "loss": 0.3687, + "mean_token_accuracy": 0.9250088781118393, + "num_tokens": 34068623.0, + "step": 473 + }, + { + "epoch": 0.29586860687395156, + "grad_norm": 1.3373897075653076, + "learning_rate": 8.104e-07, + "loss": 0.3572, + "mean_token_accuracy": 0.9255791194736958, + "num_tokens": 34143815.0, + "step": 474 + }, + { + "epoch": 0.29649280224710334, + "grad_norm": 1.049225926399231, + "learning_rate": 8.1e-07, + "loss": 0.3817, + "mean_token_accuracy": 0.9227948747575283, + "num_tokens": 34218652.0, + "step": 475 + }, + { + "epoch": 0.2971169976202551, + "grad_norm": 1.324798345565796, + "learning_rate": 8.095999999999999e-07, + "loss": 0.4043, + "mean_token_accuracy": 0.9184755980968475, + "num_tokens": 34289768.0, + "step": 476 + }, + { + "epoch": 0.29774119299340696, + "grad_norm": 0.23902744054794312, + "learning_rate": 8.092e-07, + "loss": 0.3584, + "mean_token_accuracy": 0.9244324378669262, + "num_tokens": 34361761.0, + "step": 477 + }, + { + "epoch": 0.29836538836655874, + "grad_norm": 0.6264401078224182, + "learning_rate": 8.087999999999999e-07, + "loss": 0.4131, + "mean_token_accuracy": 0.9105452783405781, + "num_tokens": 34433362.0, + "step": 478 + }, + { + "epoch": 0.2989895837397105, + "grad_norm": 0.7376995086669922, + "learning_rate": 8.084e-07, + "loss": 0.3779, + "mean_token_accuracy": 0.9226237758994102, + "num_tokens": 34504708.0, + "step": 479 + }, + { + "epoch": 0.2996137791128623, + "grad_norm": 0.9867478013038635, + "learning_rate": 8.08e-07, + "loss": 0.3878, + "mean_token_accuracy": 0.9225059673190117, + "num_tokens": 34573700.0, + "step": 480 + }, + { + "epoch": 0.30023797448601414, + "grad_norm": 0.996479868888855, + "learning_rate": 8.075999999999999e-07, + "loss": 0.3638, + "mean_token_accuracy": 0.923095341771841, + "num_tokens": 34644923.0, + "step": 481 + }, + { + "epoch": 0.3008621698591659, + "grad_norm": 0.3789809048175812, + "learning_rate": 8.072e-07, + "loss": 0.3836, + "mean_token_accuracy": 0.9172099679708481, + "num_tokens": 34712662.0, + "step": 482 + }, + { + "epoch": 0.3014863652323177, + "grad_norm": 0.5409672856330872, + "learning_rate": 8.067999999999999e-07, + "loss": 0.3438, + "mean_token_accuracy": 0.9296190738677979, + "num_tokens": 34789101.0, + "step": 483 + }, + { + "epoch": 0.30211056060546954, + "grad_norm": 0.6593160629272461, + "learning_rate": 8.064e-07, + "loss": 0.3783, + "mean_token_accuracy": 0.9195466078817844, + "num_tokens": 34861419.0, + "step": 484 + }, + { + "epoch": 0.3027347559786213, + "grad_norm": 0.9712687730789185, + "learning_rate": 8.06e-07, + "loss": 0.3442, + "mean_token_accuracy": 0.9274141415953636, + "num_tokens": 34934817.0, + "step": 485 + }, + { + "epoch": 0.3033589513517731, + "grad_norm": 0.9481886625289917, + "learning_rate": 8.056e-07, + "loss": 0.3613, + "mean_token_accuracy": 0.9272597245872021, + "num_tokens": 35009108.0, + "step": 486 + }, + { + "epoch": 0.3039831467249249, + "grad_norm": 0.6604760885238647, + "learning_rate": 8.052e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.9250000976026058, + "num_tokens": 35084183.0, + "step": 487 + }, + { + "epoch": 0.3046073420980767, + "grad_norm": 0.4148165285587311, + "learning_rate": 8.047999999999999e-07, + "loss": 0.343, + "mean_token_accuracy": 0.9259302839636803, + "num_tokens": 35152335.0, + "step": 488 + }, + { + "epoch": 0.3052315374712285, + "grad_norm": 0.6812533736228943, + "learning_rate": 8.044e-07, + "loss": 0.3648, + "mean_token_accuracy": 0.926187340170145, + "num_tokens": 35226350.0, + "step": 489 + }, + { + "epoch": 0.3058557328443803, + "grad_norm": 0.928580641746521, + "learning_rate": 8.04e-07, + "loss": 0.3541, + "mean_token_accuracy": 0.9230936653912067, + "num_tokens": 35296847.0, + "step": 490 + }, + { + "epoch": 0.30647992821753206, + "grad_norm": 0.9404719471931458, + "learning_rate": 8.035999999999999e-07, + "loss": 0.4057, + "mean_token_accuracy": 0.9147538281977177, + "num_tokens": 35367486.0, + "step": 491 + }, + { + "epoch": 0.3071041235906839, + "grad_norm": 0.8735427856445312, + "learning_rate": 8.032e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.9132251814007759, + "num_tokens": 35439097.0, + "step": 492 + }, + { + "epoch": 0.3077283189638357, + "grad_norm": 0.9538032412528992, + "learning_rate": 8.028e-07, + "loss": 0.4088, + "mean_token_accuracy": 0.9159552976489067, + "num_tokens": 35512252.0, + "step": 493 + }, + { + "epoch": 0.30835251433698746, + "grad_norm": 0.6534470319747925, + "learning_rate": 8.023999999999999e-07, + "loss": 0.3348, + "mean_token_accuracy": 0.9320591278374195, + "num_tokens": 35586964.0, + "step": 494 + }, + { + "epoch": 0.3089767097101393, + "grad_norm": 0.6475063562393188, + "learning_rate": 8.02e-07, + "loss": 0.3771, + "mean_token_accuracy": 0.9201016835868359, + "num_tokens": 35657576.0, + "step": 495 + }, + { + "epoch": 0.3096009050832911, + "grad_norm": 0.35505348443984985, + "learning_rate": 8.016e-07, + "loss": 0.3685, + "mean_token_accuracy": 0.9209739379584789, + "num_tokens": 35724423.0, + "step": 496 + }, + { + "epoch": 0.31022510045644286, + "grad_norm": 0.5871096253395081, + "learning_rate": 8.012e-07, + "loss": 0.3524, + "mean_token_accuracy": 0.9270750507712364, + "num_tokens": 35797645.0, + "step": 497 + }, + { + "epoch": 0.31084929582959464, + "grad_norm": 0.6056950688362122, + "learning_rate": 8.007999999999999e-07, + "loss": 0.3925, + "mean_token_accuracy": 0.9213661998510361, + "num_tokens": 35864538.0, + "step": 498 + }, + { + "epoch": 0.3114734912027465, + "grad_norm": 0.6547621488571167, + "learning_rate": 8.004e-07, + "loss": 0.3887, + "mean_token_accuracy": 0.9117082841694355, + "num_tokens": 35929722.0, + "step": 499 + }, + { + "epoch": 0.31209768657589826, + "grad_norm": 0.9326969385147095, + "learning_rate": 8e-07, + "loss": 0.365, + "mean_token_accuracy": 0.921146672219038, + "num_tokens": 35999033.0, + "step": 500 + }, + { + "epoch": 0.31272188194905004, + "grad_norm": 0.44340744614601135, + "learning_rate": 7.995999999999999e-07, + "loss": 0.3692, + "mean_token_accuracy": 0.9207760877907276, + "num_tokens": 36073494.0, + "step": 501 + }, + { + "epoch": 0.3133460773222019, + "grad_norm": 0.567279040813446, + "learning_rate": 7.992e-07, + "loss": 0.3795, + "mean_token_accuracy": 0.9197339341044426, + "num_tokens": 36146852.0, + "step": 502 + }, + { + "epoch": 0.31397027269535366, + "grad_norm": 0.42524921894073486, + "learning_rate": 7.987999999999999e-07, + "loss": 0.3513, + "mean_token_accuracy": 0.9261737614870071, + "num_tokens": 36218524.0, + "step": 503 + }, + { + "epoch": 0.31459446806850544, + "grad_norm": 0.8648039698600769, + "learning_rate": 7.984e-07, + "loss": 0.3633, + "mean_token_accuracy": 0.9185593500733376, + "num_tokens": 36291663.0, + "step": 504 + }, + { + "epoch": 0.3152186634416572, + "grad_norm": 0.3618353009223938, + "learning_rate": 7.98e-07, + "loss": 0.3621, + "mean_token_accuracy": 0.9248528815805912, + "num_tokens": 36363896.0, + "step": 505 + }, + { + "epoch": 0.31584285881480906, + "grad_norm": 1.1000434160232544, + "learning_rate": 7.975999999999999e-07, + "loss": 0.3067, + "mean_token_accuracy": 0.935703057795763, + "num_tokens": 36442041.0, + "step": 506 + }, + { + "epoch": 0.31646705418796084, + "grad_norm": 0.3486091196537018, + "learning_rate": 7.972e-07, + "loss": 0.3619, + "mean_token_accuracy": 0.9244496859610081, + "num_tokens": 36515156.0, + "step": 507 + }, + { + "epoch": 0.3170912495611126, + "grad_norm": 0.5138978362083435, + "learning_rate": 7.967999999999999e-07, + "loss": 0.3886, + "mean_token_accuracy": 0.9163962192833424, + "num_tokens": 36588071.0, + "step": 508 + }, + { + "epoch": 0.3177154449342644, + "grad_norm": 1.234805941581726, + "learning_rate": 7.964e-07, + "loss": 0.3803, + "mean_token_accuracy": 0.9220647066831589, + "num_tokens": 36657176.0, + "step": 509 + }, + { + "epoch": 0.31833964030741624, + "grad_norm": 0.85943603515625, + "learning_rate": 7.96e-07, + "loss": 0.3618, + "mean_token_accuracy": 0.9248210079967976, + "num_tokens": 36734905.0, + "step": 510 + }, + { + "epoch": 0.318963835680568, + "grad_norm": 0.5815002918243408, + "learning_rate": 7.956e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.9153008237481117, + "num_tokens": 36804876.0, + "step": 511 + }, + { + "epoch": 0.3195880310537198, + "grad_norm": 0.559831976890564, + "learning_rate": 7.952e-07, + "loss": 0.3496, + "mean_token_accuracy": 0.92459636926651, + "num_tokens": 36872556.0, + "step": 512 + }, + { + "epoch": 0.32021222642687164, + "grad_norm": 1.2308224439620972, + "learning_rate": 7.947999999999999e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8950334936380386, + "num_tokens": 36934444.0, + "step": 513 + }, + { + "epoch": 0.3208364218000234, + "grad_norm": 0.8181596994400024, + "learning_rate": 7.944e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9320288971066475, + "num_tokens": 37008882.0, + "step": 514 + }, + { + "epoch": 0.3214606171731752, + "grad_norm": 0.4042363464832306, + "learning_rate": 7.94e-07, + "loss": 0.3836, + "mean_token_accuracy": 0.9188083931803703, + "num_tokens": 37080376.0, + "step": 515 + }, + { + "epoch": 0.322084812546327, + "grad_norm": 1.0137232542037964, + "learning_rate": 7.935999999999999e-07, + "loss": 0.3765, + "mean_token_accuracy": 0.9191593118011951, + "num_tokens": 37153866.0, + "step": 516 + }, + { + "epoch": 0.3227090079194788, + "grad_norm": 0.840237557888031, + "learning_rate": 7.932e-07, + "loss": 0.3859, + "mean_token_accuracy": 0.917050663381815, + "num_tokens": 37228829.0, + "step": 517 + }, + { + "epoch": 0.3233332032926306, + "grad_norm": 1.093410849571228, + "learning_rate": 7.928e-07, + "loss": 0.3661, + "mean_token_accuracy": 0.9273787848651409, + "num_tokens": 37299866.0, + "step": 518 + }, + { + "epoch": 0.3239573986657824, + "grad_norm": 1.0735011100769043, + "learning_rate": 7.923999999999999e-07, + "loss": 0.3451, + "mean_token_accuracy": 0.9202693998813629, + "num_tokens": 37368186.0, + "step": 519 + }, + { + "epoch": 0.32458159403893416, + "grad_norm": 1.009791612625122, + "learning_rate": 7.92e-07, + "loss": 0.3393, + "mean_token_accuracy": 0.9257577694952488, + "num_tokens": 37440532.0, + "step": 520 + }, + { + "epoch": 0.325205789412086, + "grad_norm": 0.5683333873748779, + "learning_rate": 7.916e-07, + "loss": 0.3651, + "mean_token_accuracy": 0.9217745624482632, + "num_tokens": 37511829.0, + "step": 521 + }, + { + "epoch": 0.3258299847852378, + "grad_norm": 0.7119737863540649, + "learning_rate": 7.911999999999999e-07, + "loss": 0.3841, + "mean_token_accuracy": 0.9179428033530712, + "num_tokens": 37584018.0, + "step": 522 + }, + { + "epoch": 0.32645418015838956, + "grad_norm": 0.52000492811203, + "learning_rate": 7.907999999999999e-07, + "loss": 0.3817, + "mean_token_accuracy": 0.9178862608969212, + "num_tokens": 37651312.0, + "step": 523 + }, + { + "epoch": 0.3270783755315414, + "grad_norm": 1.0212749242782593, + "learning_rate": 7.904e-07, + "loss": 0.3721, + "mean_token_accuracy": 0.9203926138579845, + "num_tokens": 37721284.0, + "step": 524 + }, + { + "epoch": 0.3277025709046932, + "grad_norm": 0.30904000997543335, + "learning_rate": 7.9e-07, + "loss": 0.3447, + "mean_token_accuracy": 0.9289012476801872, + "num_tokens": 37795216.0, + "step": 525 + }, + { + "epoch": 0.32832676627784496, + "grad_norm": 1.0068774223327637, + "learning_rate": 7.895999999999999e-07, + "loss": 0.3429, + "mean_token_accuracy": 0.9220792353153229, + "num_tokens": 37869115.0, + "step": 526 + }, + { + "epoch": 0.32895096165099674, + "grad_norm": 0.7811362147331238, + "learning_rate": 7.892e-07, + "loss": 0.3277, + "mean_token_accuracy": 0.9309323355555534, + "num_tokens": 37939342.0, + "step": 527 + }, + { + "epoch": 0.3295751570241486, + "grad_norm": 0.5424177050590515, + "learning_rate": 7.887999999999999e-07, + "loss": 0.3594, + "mean_token_accuracy": 0.9236319363117218, + "num_tokens": 38016632.0, + "step": 528 + }, + { + "epoch": 0.33019935239730036, + "grad_norm": 0.7413662672042847, + "learning_rate": 7.883999999999999e-07, + "loss": 0.3306, + "mean_token_accuracy": 0.931208036839962, + "num_tokens": 38089679.0, + "step": 529 + }, + { + "epoch": 0.33082354777045214, + "grad_norm": 1.0389795303344727, + "learning_rate": 7.88e-07, + "loss": 0.3616, + "mean_token_accuracy": 0.926129661500454, + "num_tokens": 38166813.0, + "step": 530 + }, + { + "epoch": 0.3314477431436039, + "grad_norm": 0.3387523889541626, + "learning_rate": 7.875999999999999e-07, + "loss": 0.3404, + "mean_token_accuracy": 0.9300377070903778, + "num_tokens": 38240133.0, + "step": 531 + }, + { + "epoch": 0.33207193851675576, + "grad_norm": 0.5325695276260376, + "learning_rate": 7.872e-07, + "loss": 0.3512, + "mean_token_accuracy": 0.9276345297694206, + "num_tokens": 38314974.0, + "step": 532 + }, + { + "epoch": 0.33269613388990754, + "grad_norm": 0.984096884727478, + "learning_rate": 7.868e-07, + "loss": 0.3632, + "mean_token_accuracy": 0.9203736409544945, + "num_tokens": 38384547.0, + "step": 533 + }, + { + "epoch": 0.3333203292630593, + "grad_norm": 0.7203853726387024, + "learning_rate": 7.864e-07, + "loss": 0.3421, + "mean_token_accuracy": 0.9251072555780411, + "num_tokens": 38458541.0, + "step": 534 + }, + { + "epoch": 0.33394452463621116, + "grad_norm": 0.5719137787818909, + "learning_rate": 7.86e-07, + "loss": 0.3531, + "mean_token_accuracy": 0.9257007800042629, + "num_tokens": 38531488.0, + "step": 535 + }, + { + "epoch": 0.33456872000936294, + "grad_norm": 0.489790141582489, + "learning_rate": 7.855999999999999e-07, + "loss": 0.3332, + "mean_token_accuracy": 0.9269058927893639, + "num_tokens": 38602621.0, + "step": 536 + }, + { + "epoch": 0.3351929153825147, + "grad_norm": 0.48282408714294434, + "learning_rate": 7.852e-07, + "loss": 0.3778, + "mean_token_accuracy": 0.9227754510939121, + "num_tokens": 38673689.0, + "step": 537 + }, + { + "epoch": 0.3358171107556665, + "grad_norm": 0.7483878135681152, + "learning_rate": 7.848e-07, + "loss": 0.3111, + "mean_token_accuracy": 0.9343430511653423, + "num_tokens": 38750314.0, + "step": 538 + }, + { + "epoch": 0.33644130612881834, + "grad_norm": 0.7510576248168945, + "learning_rate": 7.844e-07, + "loss": 0.3718, + "mean_token_accuracy": 0.9187307097017765, + "num_tokens": 38826470.0, + "step": 539 + }, + { + "epoch": 0.3370655015019701, + "grad_norm": 0.7023730278015137, + "learning_rate": 7.84e-07, + "loss": 0.363, + "mean_token_accuracy": 0.9249767996370792, + "num_tokens": 38895411.0, + "step": 540 + }, + { + "epoch": 0.3376896968751219, + "grad_norm": 0.503331184387207, + "learning_rate": 7.835999999999999e-07, + "loss": 0.3811, + "mean_token_accuracy": 0.9164841137826443, + "num_tokens": 38963944.0, + "step": 541 + }, + { + "epoch": 0.3383138922482737, + "grad_norm": 0.4962184727191925, + "learning_rate": 7.832e-07, + "loss": 0.3871, + "mean_token_accuracy": 0.9121665470302105, + "num_tokens": 39031110.0, + "step": 542 + }, + { + "epoch": 0.3389380876214255, + "grad_norm": 2.1462461948394775, + "learning_rate": 7.828e-07, + "loss": 0.3531, + "mean_token_accuracy": 0.9244196452200413, + "num_tokens": 39103052.0, + "step": 543 + }, + { + "epoch": 0.3395622829945773, + "grad_norm": 0.7186685800552368, + "learning_rate": 7.823999999999999e-07, + "loss": 0.3179, + "mean_token_accuracy": 0.9326954819262028, + "num_tokens": 39183940.0, + "step": 544 + }, + { + "epoch": 0.3401864783677291, + "grad_norm": 0.706167995929718, + "learning_rate": 7.82e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.9131165146827698, + "num_tokens": 39253289.0, + "step": 545 + }, + { + "epoch": 0.3408106737408809, + "grad_norm": 0.33079788088798523, + "learning_rate": 7.816e-07, + "loss": 0.349, + "mean_token_accuracy": 0.9257286265492439, + "num_tokens": 39324763.0, + "step": 546 + }, + { + "epoch": 0.3414348691140327, + "grad_norm": 0.6968596577644348, + "learning_rate": 7.811999999999999e-07, + "loss": 0.3438, + "mean_token_accuracy": 0.9264971278607845, + "num_tokens": 39397582.0, + "step": 547 + }, + { + "epoch": 0.3420590644871845, + "grad_norm": 0.7445778846740723, + "learning_rate": 7.808e-07, + "loss": 0.3838, + "mean_token_accuracy": 0.9126134067773819, + "num_tokens": 39469006.0, + "step": 548 + }, + { + "epoch": 0.34268325986033626, + "grad_norm": 0.482977032661438, + "learning_rate": 7.804e-07, + "loss": 0.3285, + "mean_token_accuracy": 0.931167010217905, + "num_tokens": 39547491.0, + "step": 549 + }, + { + "epoch": 0.3433074552334881, + "grad_norm": 0.7700469493865967, + "learning_rate": 7.799999999999999e-07, + "loss": 0.3661, + "mean_token_accuracy": 0.9219079948961735, + "num_tokens": 39617465.0, + "step": 550 + }, + { + "epoch": 0.3439316506066399, + "grad_norm": 0.9452283978462219, + "learning_rate": 7.795999999999999e-07, + "loss": 0.3585, + "mean_token_accuracy": 0.9159074127674103, + "num_tokens": 39688818.0, + "step": 551 + }, + { + "epoch": 0.34455584597979166, + "grad_norm": 0.6997014284133911, + "learning_rate": 7.792e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9276483282446861, + "num_tokens": 39762354.0, + "step": 552 + }, + { + "epoch": 0.34518004135294345, + "grad_norm": 0.4857024848461151, + "learning_rate": 7.788000000000001e-07, + "loss": 0.3608, + "mean_token_accuracy": 0.9200742207467556, + "num_tokens": 39831316.0, + "step": 553 + }, + { + "epoch": 0.3458042367260953, + "grad_norm": 0.48957309126853943, + "learning_rate": 7.783999999999999e-07, + "loss": 0.3403, + "mean_token_accuracy": 0.9275477230548859, + "num_tokens": 39906027.0, + "step": 554 + }, + { + "epoch": 0.34642843209924706, + "grad_norm": 0.48259392380714417, + "learning_rate": 7.78e-07, + "loss": 0.3677, + "mean_token_accuracy": 0.9230173043906689, + "num_tokens": 39978449.0, + "step": 555 + }, + { + "epoch": 0.34705262747239884, + "grad_norm": 0.645094633102417, + "learning_rate": 7.776e-07, + "loss": 0.3602, + "mean_token_accuracy": 0.92342958599329, + "num_tokens": 40051792.0, + "step": 556 + }, + { + "epoch": 0.3476768228455507, + "grad_norm": 0.936673104763031, + "learning_rate": 7.771999999999999e-07, + "loss": 0.3368, + "mean_token_accuracy": 0.9274178445339203, + "num_tokens": 40125285.0, + "step": 557 + }, + { + "epoch": 0.34830101821870246, + "grad_norm": 0.32944637537002563, + "learning_rate": 7.768e-07, + "loss": 0.377, + "mean_token_accuracy": 0.918434377759695, + "num_tokens": 40196636.0, + "step": 558 + }, + { + "epoch": 0.34892521359185424, + "grad_norm": 0.7557623982429504, + "learning_rate": 7.764e-07, + "loss": 0.3711, + "mean_token_accuracy": 0.9227299764752388, + "num_tokens": 40267280.0, + "step": 559 + }, + { + "epoch": 0.349549408965006, + "grad_norm": 0.46764472126960754, + "learning_rate": 7.76e-07, + "loss": 0.3372, + "mean_token_accuracy": 0.9256122633814812, + "num_tokens": 40341095.0, + "step": 560 + }, + { + "epoch": 0.35017360433815786, + "grad_norm": 0.6640878319740295, + "learning_rate": 7.755999999999999e-07, + "loss": 0.3558, + "mean_token_accuracy": 0.9216841869056225, + "num_tokens": 40411829.0, + "step": 561 + }, + { + "epoch": 0.35079779971130964, + "grad_norm": 0.2680525779724121, + "learning_rate": 7.752e-07, + "loss": 0.347, + "mean_token_accuracy": 0.9253168888390064, + "num_tokens": 40481909.0, + "step": 562 + }, + { + "epoch": 0.3514219950844614, + "grad_norm": 0.9046684503555298, + "learning_rate": 7.748e-07, + "loss": 0.3419, + "mean_token_accuracy": 0.9286962039768696, + "num_tokens": 40557235.0, + "step": 563 + }, + { + "epoch": 0.3520461904576132, + "grad_norm": 0.5573182106018066, + "learning_rate": 7.743999999999999e-07, + "loss": 0.3292, + "mean_token_accuracy": 0.925889540463686, + "num_tokens": 40631577.0, + "step": 564 + }, + { + "epoch": 0.35267038583076504, + "grad_norm": 0.8464686870574951, + "learning_rate": 7.74e-07, + "loss": 0.3286, + "mean_token_accuracy": 0.9284627512097359, + "num_tokens": 40705192.0, + "step": 565 + }, + { + "epoch": 0.3532945812039168, + "grad_norm": 0.4341270923614502, + "learning_rate": 7.735999999999999e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9269193299114704, + "num_tokens": 40776977.0, + "step": 566 + }, + { + "epoch": 0.3539187765770686, + "grad_norm": 0.4663252830505371, + "learning_rate": 7.732e-07, + "loss": 0.3679, + "mean_token_accuracy": 0.9171105623245239, + "num_tokens": 40846053.0, + "step": 567 + }, + { + "epoch": 0.35454297195022044, + "grad_norm": 0.46172523498535156, + "learning_rate": 7.728e-07, + "loss": 0.3374, + "mean_token_accuracy": 0.9275957085192204, + "num_tokens": 40920086.0, + "step": 568 + }, + { + "epoch": 0.3551671673233722, + "grad_norm": 0.2973470389842987, + "learning_rate": 7.723999999999999e-07, + "loss": 0.2961, + "mean_token_accuracy": 0.9342636428773403, + "num_tokens": 40996730.0, + "step": 569 + }, + { + "epoch": 0.355791362696524, + "grad_norm": 0.6722999215126038, + "learning_rate": 7.72e-07, + "loss": 0.3914, + "mean_token_accuracy": 0.9111681878566742, + "num_tokens": 41065835.0, + "step": 570 + }, + { + "epoch": 0.3564155580696758, + "grad_norm": 0.4647853672504425, + "learning_rate": 7.716e-07, + "loss": 0.3628, + "mean_token_accuracy": 0.9266398400068283, + "num_tokens": 41138942.0, + "step": 571 + }, + { + "epoch": 0.3570397534428276, + "grad_norm": 0.7400867938995361, + "learning_rate": 7.711999999999999e-07, + "loss": 0.3641, + "mean_token_accuracy": 0.9188443422317505, + "num_tokens": 41210620.0, + "step": 572 + }, + { + "epoch": 0.3576639488159794, + "grad_norm": 0.35945945978164673, + "learning_rate": 7.708e-07, + "loss": 0.3423, + "mean_token_accuracy": 0.9254635684192181, + "num_tokens": 41283277.0, + "step": 573 + }, + { + "epoch": 0.3582881441891312, + "grad_norm": 0.4682519733905792, + "learning_rate": 7.704e-07, + "loss": 0.3759, + "mean_token_accuracy": 0.9224316813051701, + "num_tokens": 41353184.0, + "step": 574 + }, + { + "epoch": 0.35891233956228297, + "grad_norm": 0.8023708462715149, + "learning_rate": 7.699999999999999e-07, + "loss": 0.3108, + "mean_token_accuracy": 0.9367459304630756, + "num_tokens": 41431034.0, + "step": 575 + }, + { + "epoch": 0.3595365349354348, + "grad_norm": 0.707558810710907, + "learning_rate": 7.695999999999999e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9225289300084114, + "num_tokens": 41504232.0, + "step": 576 + }, + { + "epoch": 0.3601607303085866, + "grad_norm": 1.1275163888931274, + "learning_rate": 7.692e-07, + "loss": 0.331, + "mean_token_accuracy": 0.9252995476126671, + "num_tokens": 41578544.0, + "step": 577 + }, + { + "epoch": 0.36078492568173837, + "grad_norm": 0.42957794666290283, + "learning_rate": 7.688000000000001e-07, + "loss": 0.3578, + "mean_token_accuracy": 0.920187171548605, + "num_tokens": 41649210.0, + "step": 578 + }, + { + "epoch": 0.3614091210548902, + "grad_norm": 0.8529049158096313, + "learning_rate": 7.683999999999999e-07, + "loss": 0.3315, + "mean_token_accuracy": 0.9296321868896484, + "num_tokens": 41728910.0, + "step": 579 + }, + { + "epoch": 0.362033316428042, + "grad_norm": 0.6229732036590576, + "learning_rate": 7.68e-07, + "loss": 0.3045, + "mean_token_accuracy": 0.9329870566725731, + "num_tokens": 41805093.0, + "step": 580 + }, + { + "epoch": 0.36265751180119377, + "grad_norm": 0.7021951079368591, + "learning_rate": 7.676e-07, + "loss": 0.3538, + "mean_token_accuracy": 0.9242266975343227, + "num_tokens": 41873966.0, + "step": 581 + }, + { + "epoch": 0.36328170717434555, + "grad_norm": 0.4560698866844177, + "learning_rate": 7.671999999999999e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.9291101545095444, + "num_tokens": 41949321.0, + "step": 582 + }, + { + "epoch": 0.3639059025474974, + "grad_norm": 0.4377002418041229, + "learning_rate": 7.668e-07, + "loss": 0.3624, + "mean_token_accuracy": 0.9210468083620071, + "num_tokens": 42021246.0, + "step": 583 + }, + { + "epoch": 0.36453009792064917, + "grad_norm": 0.294572651386261, + "learning_rate": 7.664e-07, + "loss": 0.3523, + "mean_token_accuracy": 0.9218520075082779, + "num_tokens": 42092564.0, + "step": 584 + }, + { + "epoch": 0.36515429329380095, + "grad_norm": 0.29711201786994934, + "learning_rate": 7.66e-07, + "loss": 0.3678, + "mean_token_accuracy": 0.9170925952494144, + "num_tokens": 42163349.0, + "step": 585 + }, + { + "epoch": 0.3657784886669528, + "grad_norm": 0.6175806522369385, + "learning_rate": 7.655999999999999e-07, + "loss": 0.3812, + "mean_token_accuracy": 0.9189609922468662, + "num_tokens": 42234589.0, + "step": 586 + }, + { + "epoch": 0.36640268404010456, + "grad_norm": 0.7820612788200378, + "learning_rate": 7.652e-07, + "loss": 0.3493, + "mean_token_accuracy": 0.9235295131802559, + "num_tokens": 42303104.0, + "step": 587 + }, + { + "epoch": 0.36702687941325635, + "grad_norm": 0.48036956787109375, + "learning_rate": 7.648e-07, + "loss": 0.3623, + "mean_token_accuracy": 0.9198840446770191, + "num_tokens": 42374253.0, + "step": 588 + }, + { + "epoch": 0.3676510747864081, + "grad_norm": 0.6162477731704712, + "learning_rate": 7.643999999999999e-07, + "loss": 0.3427, + "mean_token_accuracy": 0.9269851222634315, + "num_tokens": 42451484.0, + "step": 589 + }, + { + "epoch": 0.36827527015955996, + "grad_norm": 0.6017821431159973, + "learning_rate": 7.64e-07, + "loss": 0.3463, + "mean_token_accuracy": 0.9242015965282917, + "num_tokens": 42524055.0, + "step": 590 + }, + { + "epoch": 0.36889946553271175, + "grad_norm": 0.7923152446746826, + "learning_rate": 7.635999999999999e-07, + "loss": 0.3401, + "mean_token_accuracy": 0.9252413921058178, + "num_tokens": 42599454.0, + "step": 591 + }, + { + "epoch": 0.3695236609058635, + "grad_norm": 0.452820360660553, + "learning_rate": 7.632e-07, + "loss": 0.4054, + "mean_token_accuracy": 0.9114637114107609, + "num_tokens": 42666310.0, + "step": 592 + }, + { + "epoch": 0.3701478562790153, + "grad_norm": 0.5999427437782288, + "learning_rate": 7.628e-07, + "loss": 0.3438, + "mean_token_accuracy": 0.9267684370279312, + "num_tokens": 42739825.0, + "step": 593 + }, + { + "epoch": 0.37077205165216714, + "grad_norm": 0.7625322937965393, + "learning_rate": 7.623999999999999e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9250470995903015, + "num_tokens": 42816765.0, + "step": 594 + }, + { + "epoch": 0.3713962470253189, + "grad_norm": 0.6362447738647461, + "learning_rate": 7.62e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9234643131494522, + "num_tokens": 42890591.0, + "step": 595 + }, + { + "epoch": 0.3720204423984707, + "grad_norm": 0.5599761009216309, + "learning_rate": 7.616e-07, + "loss": 0.3412, + "mean_token_accuracy": 0.9247772358357906, + "num_tokens": 42961701.0, + "step": 596 + }, + { + "epoch": 0.37264463777162254, + "grad_norm": 0.5791319012641907, + "learning_rate": 7.611999999999999e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9239939153194427, + "num_tokens": 43033519.0, + "step": 597 + }, + { + "epoch": 0.3732688331447743, + "grad_norm": 0.5624537467956543, + "learning_rate": 7.608e-07, + "loss": 0.3296, + "mean_token_accuracy": 0.9278119280934334, + "num_tokens": 43105774.0, + "step": 598 + }, + { + "epoch": 0.3738930285179261, + "grad_norm": 0.7942206859588623, + "learning_rate": 7.604e-07, + "loss": 0.3631, + "mean_token_accuracy": 0.9214097931981087, + "num_tokens": 43175469.0, + "step": 599 + }, + { + "epoch": 0.3745172238910779, + "grad_norm": 0.34156644344329834, + "learning_rate": 7.599999999999999e-07, + "loss": 0.3381, + "mean_token_accuracy": 0.925388790667057, + "num_tokens": 43244559.0, + "step": 600 + }, + { + "epoch": 0.3751414192642297, + "grad_norm": 0.2829967737197876, + "learning_rate": 7.596e-07, + "loss": 0.3615, + "mean_token_accuracy": 0.9234224781394005, + "num_tokens": 43316161.0, + "step": 601 + }, + { + "epoch": 0.3757656146373815, + "grad_norm": 0.6025108098983765, + "learning_rate": 7.592e-07, + "loss": 0.3705, + "mean_token_accuracy": 0.919626459479332, + "num_tokens": 43386130.0, + "step": 602 + }, + { + "epoch": 0.3763898100105333, + "grad_norm": 0.5837751030921936, + "learning_rate": 7.588e-07, + "loss": 0.3318, + "mean_token_accuracy": 0.9294660799205303, + "num_tokens": 43460187.0, + "step": 603 + }, + { + "epoch": 0.37701400538368507, + "grad_norm": 0.3845606744289398, + "learning_rate": 7.583999999999999e-07, + "loss": 0.341, + "mean_token_accuracy": 0.9264329858124256, + "num_tokens": 43532441.0, + "step": 604 + }, + { + "epoch": 0.3776382007568369, + "grad_norm": 0.40432336926460266, + "learning_rate": 7.58e-07, + "loss": 0.3305, + "mean_token_accuracy": 0.9207708723843098, + "num_tokens": 43603439.0, + "step": 605 + }, + { + "epoch": 0.3782623961299887, + "grad_norm": 0.5441488027572632, + "learning_rate": 7.576000000000001e-07, + "loss": 0.3213, + "mean_token_accuracy": 0.9323575422167778, + "num_tokens": 43678349.0, + "step": 606 + }, + { + "epoch": 0.37888659150314047, + "grad_norm": 0.5424903035163879, + "learning_rate": 7.571999999999999e-07, + "loss": 0.3629, + "mean_token_accuracy": 0.9164081439375877, + "num_tokens": 43748106.0, + "step": 607 + }, + { + "epoch": 0.3795107868762923, + "grad_norm": 0.5946375727653503, + "learning_rate": 7.568e-07, + "loss": 0.3344, + "mean_token_accuracy": 0.9272113926708698, + "num_tokens": 43818438.0, + "step": 608 + }, + { + "epoch": 0.3801349822494441, + "grad_norm": 0.9699196815490723, + "learning_rate": 7.564e-07, + "loss": 0.3225, + "mean_token_accuracy": 0.9256913587450981, + "num_tokens": 43890611.0, + "step": 609 + }, + { + "epoch": 0.38075917762259587, + "grad_norm": 0.4665364921092987, + "learning_rate": 7.559999999999999e-07, + "loss": 0.3366, + "mean_token_accuracy": 0.9285813868045807, + "num_tokens": 43967137.0, + "step": 610 + }, + { + "epoch": 0.38138337299574765, + "grad_norm": 0.5336482524871826, + "learning_rate": 7.556e-07, + "loss": 0.344, + "mean_token_accuracy": 0.9211304225027561, + "num_tokens": 44037936.0, + "step": 611 + }, + { + "epoch": 0.3820075683688995, + "grad_norm": 0.5326005816459656, + "learning_rate": 7.552e-07, + "loss": 0.3637, + "mean_token_accuracy": 0.9215891696512699, + "num_tokens": 44111609.0, + "step": 612 + }, + { + "epoch": 0.38263176374205127, + "grad_norm": 0.27235645055770874, + "learning_rate": 7.548e-07, + "loss": 0.3398, + "mean_token_accuracy": 0.9236892722547054, + "num_tokens": 44185059.0, + "step": 613 + }, + { + "epoch": 0.38325595911520305, + "grad_norm": 0.47155284881591797, + "learning_rate": 7.543999999999999e-07, + "loss": 0.3316, + "mean_token_accuracy": 0.9268237687647343, + "num_tokens": 44258343.0, + "step": 614 + }, + { + "epoch": 0.38388015448835483, + "grad_norm": 0.3805808424949646, + "learning_rate": 7.54e-07, + "loss": 0.3426, + "mean_token_accuracy": 0.9244777448475361, + "num_tokens": 44328611.0, + "step": 615 + }, + { + "epoch": 0.38450434986150667, + "grad_norm": 0.5564144849777222, + "learning_rate": 7.536e-07, + "loss": 0.3464, + "mean_token_accuracy": 0.9197547808289528, + "num_tokens": 44399191.0, + "step": 616 + }, + { + "epoch": 0.38512854523465845, + "grad_norm": 0.5806828737258911, + "learning_rate": 7.531999999999999e-07, + "loss": 0.2991, + "mean_token_accuracy": 0.9331597909331322, + "num_tokens": 44474432.0, + "step": 617 + }, + { + "epoch": 0.38575274060781023, + "grad_norm": 0.5906999707221985, + "learning_rate": 7.528e-07, + "loss": 0.3572, + "mean_token_accuracy": 0.9175950661301613, + "num_tokens": 44543952.0, + "step": 618 + }, + { + "epoch": 0.38637693598096207, + "grad_norm": 0.3685188889503479, + "learning_rate": 7.523999999999999e-07, + "loss": 0.345, + "mean_token_accuracy": 0.9218173548579216, + "num_tokens": 44615331.0, + "step": 619 + }, + { + "epoch": 0.38700113135411385, + "grad_norm": 0.42963331937789917, + "learning_rate": 7.52e-07, + "loss": 0.3824, + "mean_token_accuracy": 0.9150598384439945, + "num_tokens": 44685793.0, + "step": 620 + }, + { + "epoch": 0.38762532672726563, + "grad_norm": 0.7856587171554565, + "learning_rate": 7.516e-07, + "loss": 0.3391, + "mean_token_accuracy": 0.9240389838814735, + "num_tokens": 44760555.0, + "step": 621 + }, + { + "epoch": 0.3882495221004174, + "grad_norm": 0.6671410799026489, + "learning_rate": 7.511999999999999e-07, + "loss": 0.3251, + "mean_token_accuracy": 0.9271936528384686, + "num_tokens": 44830025.0, + "step": 622 + }, + { + "epoch": 0.38887371747356925, + "grad_norm": 0.36151447892189026, + "learning_rate": 7.508e-07, + "loss": 0.3365, + "mean_token_accuracy": 0.9240407310426235, + "num_tokens": 44905764.0, + "step": 623 + }, + { + "epoch": 0.38949791284672103, + "grad_norm": 0.38457614183425903, + "learning_rate": 7.503999999999999e-07, + "loss": 0.3688, + "mean_token_accuracy": 0.9188949130475521, + "num_tokens": 44975012.0, + "step": 624 + }, + { + "epoch": 0.3901221082198728, + "grad_norm": 0.3652454614639282, + "learning_rate": 7.5e-07, + "loss": 0.3142, + "mean_token_accuracy": 0.9310969449579716, + "num_tokens": 45045839.0, + "step": 625 + }, + { + "epoch": 0.3907463035930246, + "grad_norm": 0.5461031794548035, + "learning_rate": 7.496e-07, + "loss": 0.3425, + "mean_token_accuracy": 0.9251462630927563, + "num_tokens": 45117462.0, + "step": 626 + }, + { + "epoch": 0.3913704989661764, + "grad_norm": 0.4545503854751587, + "learning_rate": 7.492e-07, + "loss": 0.383, + "mean_token_accuracy": 0.9178108982741833, + "num_tokens": 45185548.0, + "step": 627 + }, + { + "epoch": 0.3919946943393282, + "grad_norm": 0.38261616230010986, + "learning_rate": 7.488e-07, + "loss": 0.3504, + "mean_token_accuracy": 0.920975849032402, + "num_tokens": 45261338.0, + "step": 628 + }, + { + "epoch": 0.39261888971248, + "grad_norm": 0.2625387907028198, + "learning_rate": 7.483999999999999e-07, + "loss": 0.3511, + "mean_token_accuracy": 0.9234513938426971, + "num_tokens": 45334516.0, + "step": 629 + }, + { + "epoch": 0.3932430850856318, + "grad_norm": 0.48953914642333984, + "learning_rate": 7.48e-07, + "loss": 0.3267, + "mean_token_accuracy": 0.9276548400521278, + "num_tokens": 45408844.0, + "step": 630 + }, + { + "epoch": 0.3938672804587836, + "grad_norm": 0.25507792830467224, + "learning_rate": 7.476e-07, + "loss": 0.3275, + "mean_token_accuracy": 0.9272936768829823, + "num_tokens": 45484129.0, + "step": 631 + }, + { + "epoch": 0.3944914758319354, + "grad_norm": 0.7709338068962097, + "learning_rate": 7.471999999999999e-07, + "loss": 0.3385, + "mean_token_accuracy": 0.9237735457718372, + "num_tokens": 45557220.0, + "step": 632 + }, + { + "epoch": 0.39511567120508717, + "grad_norm": 0.5819105505943298, + "learning_rate": 7.468e-07, + "loss": 0.3112, + "mean_token_accuracy": 0.9251358509063721, + "num_tokens": 45631093.0, + "step": 633 + }, + { + "epoch": 0.395739866578239, + "grad_norm": 0.26736271381378174, + "learning_rate": 7.464e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.926128089427948, + "num_tokens": 45702723.0, + "step": 634 + }, + { + "epoch": 0.3963640619513908, + "grad_norm": 0.4306083917617798, + "learning_rate": 7.459999999999999e-07, + "loss": 0.3154, + "mean_token_accuracy": 0.9313743449747562, + "num_tokens": 45774922.0, + "step": 635 + }, + { + "epoch": 0.39698825732454257, + "grad_norm": 0.40494468808174133, + "learning_rate": 7.456e-07, + "loss": 0.3519, + "mean_token_accuracy": 0.9186866208910942, + "num_tokens": 45846528.0, + "step": 636 + }, + { + "epoch": 0.39761245269769435, + "grad_norm": 0.5655471086502075, + "learning_rate": 7.452e-07, + "loss": 0.3445, + "mean_token_accuracy": 0.9245801381766796, + "num_tokens": 45915531.0, + "step": 637 + }, + { + "epoch": 0.3982366480708462, + "grad_norm": 0.2741180956363678, + "learning_rate": 7.447999999999999e-07, + "loss": 0.3468, + "mean_token_accuracy": 0.9221871383488178, + "num_tokens": 45990719.0, + "step": 638 + }, + { + "epoch": 0.39886084344399797, + "grad_norm": 0.29522502422332764, + "learning_rate": 7.443999999999999e-07, + "loss": 0.3547, + "mean_token_accuracy": 0.9206781312823296, + "num_tokens": 46061240.0, + "step": 639 + }, + { + "epoch": 0.39948503881714975, + "grad_norm": 0.6466435790061951, + "learning_rate": 7.44e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9293878078460693, + "num_tokens": 46135944.0, + "step": 640 + }, + { + "epoch": 0.4001092341903016, + "grad_norm": 0.29942116141319275, + "learning_rate": 7.436e-07, + "loss": 0.3458, + "mean_token_accuracy": 0.925358172506094, + "num_tokens": 46207508.0, + "step": 641 + }, + { + "epoch": 0.40073342956345337, + "grad_norm": 0.6349466443061829, + "learning_rate": 7.431999999999999e-07, + "loss": 0.3321, + "mean_token_accuracy": 0.9232594855129719, + "num_tokens": 46278741.0, + "step": 642 + }, + { + "epoch": 0.40135762493660515, + "grad_norm": 0.48047778010368347, + "learning_rate": 7.428e-07, + "loss": 0.3527, + "mean_token_accuracy": 0.9166908822953701, + "num_tokens": 46347139.0, + "step": 643 + }, + { + "epoch": 0.40198182030975693, + "grad_norm": 0.418552964925766, + "learning_rate": 7.423999999999999e-07, + "loss": 0.3519, + "mean_token_accuracy": 0.9196455329656601, + "num_tokens": 46415780.0, + "step": 644 + }, + { + "epoch": 0.40260601568290877, + "grad_norm": 0.7869686484336853, + "learning_rate": 7.42e-07, + "loss": 0.3725, + "mean_token_accuracy": 0.9172872118651867, + "num_tokens": 46484109.0, + "step": 645 + }, + { + "epoch": 0.40323021105606055, + "grad_norm": 0.3682011663913727, + "learning_rate": 7.416e-07, + "loss": 0.317, + "mean_token_accuracy": 0.9321021102368832, + "num_tokens": 46558889.0, + "step": 646 + }, + { + "epoch": 0.40385440642921233, + "grad_norm": 0.5389231443405151, + "learning_rate": 7.411999999999999e-07, + "loss": 0.3306, + "mean_token_accuracy": 0.9275304302573204, + "num_tokens": 46634325.0, + "step": 647 + }, + { + "epoch": 0.4044786018023641, + "grad_norm": 0.2784671187400818, + "learning_rate": 7.408e-07, + "loss": 0.3292, + "mean_token_accuracy": 0.9264209941029549, + "num_tokens": 46705187.0, + "step": 648 + }, + { + "epoch": 0.40510279717551595, + "grad_norm": 0.6616075038909912, + "learning_rate": 7.403999999999999e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9315904937684536, + "num_tokens": 46782367.0, + "step": 649 + }, + { + "epoch": 0.40572699254866773, + "grad_norm": 0.5459262728691101, + "learning_rate": 7.4e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9320596344769001, + "num_tokens": 46853397.0, + "step": 650 + }, + { + "epoch": 0.4063511879218195, + "grad_norm": 0.37146082520484924, + "learning_rate": 7.396e-07, + "loss": 0.3389, + "mean_token_accuracy": 0.9261991530656815, + "num_tokens": 46920787.0, + "step": 651 + }, + { + "epoch": 0.40697538329497135, + "grad_norm": 0.38181981444358826, + "learning_rate": 7.392e-07, + "loss": 0.3666, + "mean_token_accuracy": 0.9177272506058216, + "num_tokens": 46994472.0, + "step": 652 + }, + { + "epoch": 0.40759957866812313, + "grad_norm": 0.41725292801856995, + "learning_rate": 7.388e-07, + "loss": 0.3277, + "mean_token_accuracy": 0.9293075203895569, + "num_tokens": 47068619.0, + "step": 653 + }, + { + "epoch": 0.4082237740412749, + "grad_norm": 0.4707614481449127, + "learning_rate": 7.383999999999999e-07, + "loss": 0.357, + "mean_token_accuracy": 0.9203765951097012, + "num_tokens": 47136036.0, + "step": 654 + }, + { + "epoch": 0.4088479694144267, + "grad_norm": 0.414760947227478, + "learning_rate": 7.38e-07, + "loss": 0.3512, + "mean_token_accuracy": 0.9205658622086048, + "num_tokens": 47207154.0, + "step": 655 + }, + { + "epoch": 0.40947216478757853, + "grad_norm": 0.6463868021965027, + "learning_rate": 7.376e-07, + "loss": 0.3118, + "mean_token_accuracy": 0.931792251765728, + "num_tokens": 47283959.0, + "step": 656 + }, + { + "epoch": 0.4100963601607303, + "grad_norm": 0.37666013836860657, + "learning_rate": 7.371999999999999e-07, + "loss": 0.3435, + "mean_token_accuracy": 0.9253863766789436, + "num_tokens": 47354576.0, + "step": 657 + }, + { + "epoch": 0.4107205555338821, + "grad_norm": 0.5181844234466553, + "learning_rate": 7.368e-07, + "loss": 0.3291, + "mean_token_accuracy": 0.9307320602238178, + "num_tokens": 47428241.0, + "step": 658 + }, + { + "epoch": 0.41134475090703393, + "grad_norm": 0.37594252824783325, + "learning_rate": 7.364000000000001e-07, + "loss": 0.3175, + "mean_token_accuracy": 0.9273154586553574, + "num_tokens": 47499461.0, + "step": 659 + }, + { + "epoch": 0.4119689462801857, + "grad_norm": 0.5229082703590393, + "learning_rate": 7.359999999999999e-07, + "loss": 0.3476, + "mean_token_accuracy": 0.9239908717572689, + "num_tokens": 47573290.0, + "step": 660 + }, + { + "epoch": 0.4125931416533375, + "grad_norm": 0.36330297589302063, + "learning_rate": 7.356e-07, + "loss": 0.3656, + "mean_token_accuracy": 0.922837421298027, + "num_tokens": 47643897.0, + "step": 661 + }, + { + "epoch": 0.4132173370264893, + "grad_norm": 0.5052156448364258, + "learning_rate": 7.352e-07, + "loss": 0.3318, + "mean_token_accuracy": 0.9263353794813156, + "num_tokens": 47716736.0, + "step": 662 + }, + { + "epoch": 0.4138415323996411, + "grad_norm": 0.2872786819934845, + "learning_rate": 7.347999999999999e-07, + "loss": 0.3451, + "mean_token_accuracy": 0.9226055145263672, + "num_tokens": 47788926.0, + "step": 663 + }, + { + "epoch": 0.4144657277727929, + "grad_norm": 0.4757533669471741, + "learning_rate": 7.344e-07, + "loss": 0.3726, + "mean_token_accuracy": 0.9131042547523975, + "num_tokens": 47855829.0, + "step": 664 + }, + { + "epoch": 0.4150899231459447, + "grad_norm": 0.49452951550483704, + "learning_rate": 7.34e-07, + "loss": 0.3473, + "mean_token_accuracy": 0.9252515956759453, + "num_tokens": 47927776.0, + "step": 665 + }, + { + "epoch": 0.41571411851909645, + "grad_norm": 0.44129571318626404, + "learning_rate": 7.336e-07, + "loss": 0.3153, + "mean_token_accuracy": 0.9331275187432766, + "num_tokens": 47994963.0, + "step": 666 + }, + { + "epoch": 0.4163383138922483, + "grad_norm": 0.4779670238494873, + "learning_rate": 7.331999999999999e-07, + "loss": 0.3205, + "mean_token_accuracy": 0.932083748281002, + "num_tokens": 48067456.0, + "step": 667 + }, + { + "epoch": 0.41696250926540007, + "grad_norm": 0.35190168023109436, + "learning_rate": 7.328e-07, + "loss": 0.3184, + "mean_token_accuracy": 0.9310644865036011, + "num_tokens": 48140224.0, + "step": 668 + }, + { + "epoch": 0.41758670463855185, + "grad_norm": 0.4720968008041382, + "learning_rate": 7.324e-07, + "loss": 0.3711, + "mean_token_accuracy": 0.9152346327900887, + "num_tokens": 48208398.0, + "step": 669 + }, + { + "epoch": 0.4182109000117037, + "grad_norm": 0.5012626051902771, + "learning_rate": 7.319999999999999e-07, + "loss": 0.3405, + "mean_token_accuracy": 0.9250470548868179, + "num_tokens": 48280914.0, + "step": 670 + }, + { + "epoch": 0.41883509538485547, + "grad_norm": 0.6238592863082886, + "learning_rate": 7.316e-07, + "loss": 0.3299, + "mean_token_accuracy": 0.9239915870130062, + "num_tokens": 48352906.0, + "step": 671 + }, + { + "epoch": 0.41945929075800725, + "grad_norm": 0.4548547565937042, + "learning_rate": 7.311999999999999e-07, + "loss": 0.3417, + "mean_token_accuracy": 0.920001145452261, + "num_tokens": 48426029.0, + "step": 672 + }, + { + "epoch": 0.42008348613115903, + "grad_norm": 0.40740537643432617, + "learning_rate": 7.308e-07, + "loss": 0.364, + "mean_token_accuracy": 0.9215504974126816, + "num_tokens": 48497296.0, + "step": 673 + }, + { + "epoch": 0.42070768150431087, + "grad_norm": 0.34685757756233215, + "learning_rate": 7.304e-07, + "loss": 0.32, + "mean_token_accuracy": 0.9303984344005585, + "num_tokens": 48571241.0, + "step": 674 + }, + { + "epoch": 0.42133187687746265, + "grad_norm": 2.8446075916290283, + "learning_rate": 7.3e-07, + "loss": 0.3484, + "mean_token_accuracy": 0.9234076887369156, + "num_tokens": 48644885.0, + "step": 675 + }, + { + "epoch": 0.42195607225061443, + "grad_norm": 0.5381395816802979, + "learning_rate": 7.296e-07, + "loss": 0.344, + "mean_token_accuracy": 0.9226926155388355, + "num_tokens": 48717883.0, + "step": 676 + }, + { + "epoch": 0.4225802676237662, + "grad_norm": 0.3551507294178009, + "learning_rate": 7.291999999999999e-07, + "loss": 0.3499, + "mean_token_accuracy": 0.9218867346644402, + "num_tokens": 48788914.0, + "step": 677 + }, + { + "epoch": 0.42320446299691805, + "grad_norm": 0.5331665873527527, + "learning_rate": 7.288e-07, + "loss": 0.3602, + "mean_token_accuracy": 0.9145861491560936, + "num_tokens": 48859661.0, + "step": 678 + }, + { + "epoch": 0.42382865837006983, + "grad_norm": 0.27044275403022766, + "learning_rate": 7.284e-07, + "loss": 0.3292, + "mean_token_accuracy": 0.9240933321416378, + "num_tokens": 48932733.0, + "step": 679 + }, + { + "epoch": 0.4244528537432216, + "grad_norm": 0.4443718492984772, + "learning_rate": 7.28e-07, + "loss": 0.346, + "mean_token_accuracy": 0.9242341965436935, + "num_tokens": 49008579.0, + "step": 680 + }, + { + "epoch": 0.42507704911637345, + "grad_norm": 0.36047619581222534, + "learning_rate": 7.276e-07, + "loss": 0.3097, + "mean_token_accuracy": 0.9259445257484913, + "num_tokens": 49077572.0, + "step": 681 + }, + { + "epoch": 0.42570124448952523, + "grad_norm": 0.4974146783351898, + "learning_rate": 7.271999999999999e-07, + "loss": 0.3585, + "mean_token_accuracy": 0.9235406182706356, + "num_tokens": 49149258.0, + "step": 682 + }, + { + "epoch": 0.426325439862677, + "grad_norm": 0.34493857622146606, + "learning_rate": 7.268e-07, + "loss": 0.3131, + "mean_token_accuracy": 0.9315989725291729, + "num_tokens": 49221956.0, + "step": 683 + }, + { + "epoch": 0.4269496352358288, + "grad_norm": 0.4466032385826111, + "learning_rate": 7.264e-07, + "loss": 0.3372, + "mean_token_accuracy": 0.9272586852312088, + "num_tokens": 49298313.0, + "step": 684 + }, + { + "epoch": 0.42757383060898063, + "grad_norm": 0.46275943517684937, + "learning_rate": 7.259999999999999e-07, + "loss": 0.3319, + "mean_token_accuracy": 0.927916556596756, + "num_tokens": 49369108.0, + "step": 685 + }, + { + "epoch": 0.4281980259821324, + "grad_norm": 0.27612945437431335, + "learning_rate": 7.256e-07, + "loss": 0.3326, + "mean_token_accuracy": 0.9266219213604927, + "num_tokens": 49443585.0, + "step": 686 + }, + { + "epoch": 0.4288222213552842, + "grad_norm": 0.3892485499382019, + "learning_rate": 7.252e-07, + "loss": 0.3506, + "mean_token_accuracy": 0.9223848171532154, + "num_tokens": 49510772.0, + "step": 687 + }, + { + "epoch": 0.429446416728436, + "grad_norm": 0.35472822189331055, + "learning_rate": 7.247999999999999e-07, + "loss": 0.3339, + "mean_token_accuracy": 0.9262644350528717, + "num_tokens": 49581456.0, + "step": 688 + }, + { + "epoch": 0.4300706121015878, + "grad_norm": 0.5208513736724854, + "learning_rate": 7.244e-07, + "loss": 0.3695, + "mean_token_accuracy": 0.9152507446706295, + "num_tokens": 49645190.0, + "step": 689 + }, + { + "epoch": 0.4306948074747396, + "grad_norm": 0.613875150680542, + "learning_rate": 7.24e-07, + "loss": 0.3322, + "mean_token_accuracy": 0.92716945707798, + "num_tokens": 49718362.0, + "step": 690 + }, + { + "epoch": 0.4313190028478914, + "grad_norm": 0.43399953842163086, + "learning_rate": 7.235999999999999e-07, + "loss": 0.331, + "mean_token_accuracy": 0.9249005541205406, + "num_tokens": 49791382.0, + "step": 691 + }, + { + "epoch": 0.4319431982210432, + "grad_norm": 0.35168054699897766, + "learning_rate": 7.231999999999999e-07, + "loss": 0.3832, + "mean_token_accuracy": 0.911179032176733, + "num_tokens": 49858666.0, + "step": 692 + }, + { + "epoch": 0.432567393594195, + "grad_norm": 0.48481592535972595, + "learning_rate": 7.228e-07, + "loss": 0.3038, + "mean_token_accuracy": 0.9313280507922173, + "num_tokens": 49934095.0, + "step": 693 + }, + { + "epoch": 0.4331915889673468, + "grad_norm": 0.4384796917438507, + "learning_rate": 7.224e-07, + "loss": 0.3042, + "mean_token_accuracy": 0.9318602569401264, + "num_tokens": 50011103.0, + "step": 694 + }, + { + "epoch": 0.43381578434049856, + "grad_norm": 1.4764841794967651, + "learning_rate": 7.219999999999999e-07, + "loss": 0.3374, + "mean_token_accuracy": 0.9262259677052498, + "num_tokens": 50089506.0, + "step": 695 + }, + { + "epoch": 0.4344399797136504, + "grad_norm": 0.4757544994354248, + "learning_rate": 7.216e-07, + "loss": 0.3466, + "mean_token_accuracy": 0.9230071902275085, + "num_tokens": 50158682.0, + "step": 696 + }, + { + "epoch": 0.4350641750868022, + "grad_norm": 0.4512925446033478, + "learning_rate": 7.211999999999999e-07, + "loss": 0.3477, + "mean_token_accuracy": 0.9217991046607494, + "num_tokens": 50230570.0, + "step": 697 + }, + { + "epoch": 0.43568837045995396, + "grad_norm": 0.47074785828590393, + "learning_rate": 7.207999999999999e-07, + "loss": 0.3376, + "mean_token_accuracy": 0.9200443737208843, + "num_tokens": 50298085.0, + "step": 698 + }, + { + "epoch": 0.43631256583310574, + "grad_norm": 0.3686661422252655, + "learning_rate": 7.204e-07, + "loss": 0.3294, + "mean_token_accuracy": 0.9283047094941139, + "num_tokens": 50374486.0, + "step": 699 + }, + { + "epoch": 0.4369367612062576, + "grad_norm": 0.3725312650203705, + "learning_rate": 7.2e-07, + "loss": 0.3618, + "mean_token_accuracy": 0.9213119931519032, + "num_tokens": 50443880.0, + "step": 700 + }, + { + "epoch": 0.43756095657940935, + "grad_norm": 0.2869217097759247, + "learning_rate": 7.196e-07, + "loss": 0.3395, + "mean_token_accuracy": 0.9261060692369938, + "num_tokens": 50513420.0, + "step": 701 + }, + { + "epoch": 0.43818515195256114, + "grad_norm": 0.32761839032173157, + "learning_rate": 7.191999999999999e-07, + "loss": 0.3138, + "mean_token_accuracy": 0.9308922737836838, + "num_tokens": 50590229.0, + "step": 702 + }, + { + "epoch": 0.438809347325713, + "grad_norm": 0.48981109261512756, + "learning_rate": 7.188e-07, + "loss": 0.2983, + "mean_token_accuracy": 0.9335628412663937, + "num_tokens": 50667701.0, + "step": 703 + }, + { + "epoch": 0.43943354269886475, + "grad_norm": 0.26082706451416016, + "learning_rate": 7.184e-07, + "loss": 0.3096, + "mean_token_accuracy": 0.9277554415166378, + "num_tokens": 50738076.0, + "step": 704 + }, + { + "epoch": 0.44005773807201654, + "grad_norm": 0.3906603753566742, + "learning_rate": 7.179999999999999e-07, + "loss": 0.3502, + "mean_token_accuracy": 0.921177189797163, + "num_tokens": 50807586.0, + "step": 705 + }, + { + "epoch": 0.4406819334451683, + "grad_norm": 0.3802482485771179, + "learning_rate": 7.176e-07, + "loss": 0.3297, + "mean_token_accuracy": 0.9246837832033634, + "num_tokens": 50876848.0, + "step": 706 + }, + { + "epoch": 0.44130612881832015, + "grad_norm": 0.3646096885204315, + "learning_rate": 7.171999999999999e-07, + "loss": 0.3563, + "mean_token_accuracy": 0.9209400676190853, + "num_tokens": 50947419.0, + "step": 707 + }, + { + "epoch": 0.44193032419147193, + "grad_norm": 0.4562234878540039, + "learning_rate": 7.168e-07, + "loss": 0.349, + "mean_token_accuracy": 0.9228729270398617, + "num_tokens": 51019131.0, + "step": 708 + }, + { + "epoch": 0.4425545195646237, + "grad_norm": 0.44281619787216187, + "learning_rate": 7.164e-07, + "loss": 0.3346, + "mean_token_accuracy": 0.9228230901062489, + "num_tokens": 51085508.0, + "step": 709 + }, + { + "epoch": 0.4431787149377755, + "grad_norm": 0.3311275541782379, + "learning_rate": 7.159999999999999e-07, + "loss": 0.341, + "mean_token_accuracy": 0.921529233455658, + "num_tokens": 51157007.0, + "step": 710 + }, + { + "epoch": 0.44380291031092733, + "grad_norm": 0.3935934603214264, + "learning_rate": 7.156e-07, + "loss": 0.3325, + "mean_token_accuracy": 0.9283835887908936, + "num_tokens": 51227104.0, + "step": 711 + }, + { + "epoch": 0.4444271056840791, + "grad_norm": 0.4497720003128052, + "learning_rate": 7.151999999999999e-07, + "loss": 0.3261, + "mean_token_accuracy": 0.9282115884125233, + "num_tokens": 51298974.0, + "step": 712 + }, + { + "epoch": 0.4450513010572309, + "grad_norm": 0.5246007442474365, + "learning_rate": 7.147999999999999e-07, + "loss": 0.3627, + "mean_token_accuracy": 0.9195816032588482, + "num_tokens": 51372135.0, + "step": 713 + }, + { + "epoch": 0.44567549643038273, + "grad_norm": 0.735645592212677, + "learning_rate": 7.144e-07, + "loss": 0.3665, + "mean_token_accuracy": 0.9182613231241703, + "num_tokens": 51439662.0, + "step": 714 + }, + { + "epoch": 0.4462996918035345, + "grad_norm": 0.26671546697616577, + "learning_rate": 7.14e-07, + "loss": 0.3315, + "mean_token_accuracy": 0.9239188209176064, + "num_tokens": 51508083.0, + "step": 715 + }, + { + "epoch": 0.4469238871766863, + "grad_norm": 0.27894970774650574, + "learning_rate": 7.135999999999999e-07, + "loss": 0.3041, + "mean_token_accuracy": 0.9310220927000046, + "num_tokens": 51582486.0, + "step": 716 + }, + { + "epoch": 0.4475480825498381, + "grad_norm": 0.5564171075820923, + "learning_rate": 7.131999999999999e-07, + "loss": 0.3479, + "mean_token_accuracy": 0.9212802983820438, + "num_tokens": 51655807.0, + "step": 717 + }, + { + "epoch": 0.4481722779229899, + "grad_norm": 0.34685245156288147, + "learning_rate": 7.128e-07, + "loss": 0.3568, + "mean_token_accuracy": 0.9221464432775974, + "num_tokens": 51730645.0, + "step": 718 + }, + { + "epoch": 0.4487964732961417, + "grad_norm": 0.6511409878730774, + "learning_rate": 7.124e-07, + "loss": 0.3163, + "mean_token_accuracy": 0.9305948689579964, + "num_tokens": 51807543.0, + "step": 719 + }, + { + "epoch": 0.4494206686692935, + "grad_norm": 0.40851685404777527, + "learning_rate": 7.119999999999999e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9276647195219994, + "num_tokens": 51879153.0, + "step": 720 + }, + { + "epoch": 0.45004486404244526, + "grad_norm": 0.3345511555671692, + "learning_rate": 7.116e-07, + "loss": 0.3031, + "mean_token_accuracy": 0.9311193153262138, + "num_tokens": 51950755.0, + "step": 721 + }, + { + "epoch": 0.4506690594155971, + "grad_norm": 1.2511225938796997, + "learning_rate": 7.112000000000001e-07, + "loss": 0.3638, + "mean_token_accuracy": 0.9171359911561012, + "num_tokens": 52021123.0, + "step": 722 + }, + { + "epoch": 0.4512932547887489, + "grad_norm": 0.5214754939079285, + "learning_rate": 7.107999999999999e-07, + "loss": 0.3163, + "mean_token_accuracy": 0.9283814020454884, + "num_tokens": 52089916.0, + "step": 723 + }, + { + "epoch": 0.45191745016190066, + "grad_norm": 0.26029619574546814, + "learning_rate": 7.104e-07, + "loss": 0.3063, + "mean_token_accuracy": 0.9296494275331497, + "num_tokens": 52162690.0, + "step": 724 + }, + { + "epoch": 0.4525416455350525, + "grad_norm": 0.24395406246185303, + "learning_rate": 7.1e-07, + "loss": 0.3419, + "mean_token_accuracy": 0.9235851019620895, + "num_tokens": 52233543.0, + "step": 725 + }, + { + "epoch": 0.4531658409082043, + "grad_norm": 0.39033153653144836, + "learning_rate": 7.096e-07, + "loss": 0.3495, + "mean_token_accuracy": 0.9230191260576248, + "num_tokens": 52300820.0, + "step": 726 + }, + { + "epoch": 0.45379003628135606, + "grad_norm": 0.39378222823143005, + "learning_rate": 7.092e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.9316980689764023, + "num_tokens": 52377765.0, + "step": 727 + }, + { + "epoch": 0.45441423165450784, + "grad_norm": 0.333275705575943, + "learning_rate": 7.088e-07, + "loss": 0.33, + "mean_token_accuracy": 0.9285393580794334, + "num_tokens": 52448172.0, + "step": 728 + }, + { + "epoch": 0.4550384270276597, + "grad_norm": 0.42309725284576416, + "learning_rate": 7.084e-07, + "loss": 0.3302, + "mean_token_accuracy": 0.9275981672108173, + "num_tokens": 52524993.0, + "step": 729 + }, + { + "epoch": 0.45566262240081146, + "grad_norm": 0.5311827063560486, + "learning_rate": 7.079999999999999e-07, + "loss": 0.2751, + "mean_token_accuracy": 0.9398890398442745, + "num_tokens": 52604810.0, + "step": 730 + }, + { + "epoch": 0.45628681777396324, + "grad_norm": 0.404119610786438, + "learning_rate": 7.076e-07, + "loss": 0.3103, + "mean_token_accuracy": 0.9297262318432331, + "num_tokens": 52680013.0, + "step": 731 + }, + { + "epoch": 0.456911013147115, + "grad_norm": 0.2676778733730316, + "learning_rate": 7.072e-07, + "loss": 0.3742, + "mean_token_accuracy": 0.9136869981884956, + "num_tokens": 52748122.0, + "step": 732 + }, + { + "epoch": 0.45753520852026686, + "grad_norm": 0.3234860599040985, + "learning_rate": 7.068e-07, + "loss": 0.3596, + "mean_token_accuracy": 0.9213199652731419, + "num_tokens": 52822157.0, + "step": 733 + }, + { + "epoch": 0.45815940389341864, + "grad_norm": 0.21793296933174133, + "learning_rate": 7.064e-07, + "loss": 0.3601, + "mean_token_accuracy": 0.9182924181222916, + "num_tokens": 52893125.0, + "step": 734 + }, + { + "epoch": 0.4587835992665704, + "grad_norm": 0.45688652992248535, + "learning_rate": 7.059999999999999e-07, + "loss": 0.3333, + "mean_token_accuracy": 0.9230261892080307, + "num_tokens": 52967530.0, + "step": 735 + }, + { + "epoch": 0.45940779463972226, + "grad_norm": 0.5506690740585327, + "learning_rate": 7.056e-07, + "loss": 0.3257, + "mean_token_accuracy": 0.9282068461179733, + "num_tokens": 53038033.0, + "step": 736 + }, + { + "epoch": 0.46003199001287404, + "grad_norm": 0.39481911063194275, + "learning_rate": 7.052e-07, + "loss": 0.386, + "mean_token_accuracy": 0.9134550280869007, + "num_tokens": 53108827.0, + "step": 737 + }, + { + "epoch": 0.4606561853860258, + "grad_norm": 0.39907434582710266, + "learning_rate": 7.047999999999999e-07, + "loss": 0.3391, + "mean_token_accuracy": 0.9239070154726505, + "num_tokens": 53177694.0, + "step": 738 + }, + { + "epoch": 0.4612803807591776, + "grad_norm": 0.5079224705696106, + "learning_rate": 7.044e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.926607720553875, + "num_tokens": 53252577.0, + "step": 739 + }, + { + "epoch": 0.46190457613232944, + "grad_norm": 0.28749263286590576, + "learning_rate": 7.04e-07, + "loss": 0.324, + "mean_token_accuracy": 0.9308020249009132, + "num_tokens": 53323806.0, + "step": 740 + }, + { + "epoch": 0.4625287715054812, + "grad_norm": 0.2666906714439392, + "learning_rate": 7.035999999999999e-07, + "loss": 0.3384, + "mean_token_accuracy": 0.9271521978080273, + "num_tokens": 53398478.0, + "step": 741 + }, + { + "epoch": 0.463152966878633, + "grad_norm": 0.3235752284526825, + "learning_rate": 7.032e-07, + "loss": 0.3692, + "mean_token_accuracy": 0.9176539406180382, + "num_tokens": 53466260.0, + "step": 742 + }, + { + "epoch": 0.46377716225178484, + "grad_norm": 0.424331933259964, + "learning_rate": 7.028e-07, + "loss": 0.3174, + "mean_token_accuracy": 0.9246450252830982, + "num_tokens": 53539239.0, + "step": 743 + }, + { + "epoch": 0.4644013576249366, + "grad_norm": 0.256606787443161, + "learning_rate": 7.024e-07, + "loss": 0.3466, + "mean_token_accuracy": 0.9182081036269665, + "num_tokens": 53608741.0, + "step": 744 + }, + { + "epoch": 0.4650255529980884, + "grad_norm": 0.3169555962085724, + "learning_rate": 7.019999999999999e-07, + "loss": 0.3355, + "mean_token_accuracy": 0.9210515283048153, + "num_tokens": 53677629.0, + "step": 745 + }, + { + "epoch": 0.4656497483712402, + "grad_norm": 0.49976810812950134, + "learning_rate": 7.016e-07, + "loss": 0.3716, + "mean_token_accuracy": 0.9182851128280163, + "num_tokens": 53746893.0, + "step": 746 + }, + { + "epoch": 0.466273943744392, + "grad_norm": 0.48305338621139526, + "learning_rate": 7.012000000000001e-07, + "loss": 0.3783, + "mean_token_accuracy": 0.9136515147984028, + "num_tokens": 53814084.0, + "step": 747 + }, + { + "epoch": 0.4668981391175438, + "grad_norm": 0.3004201352596283, + "learning_rate": 7.007999999999999e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9266849867999554, + "num_tokens": 53888580.0, + "step": 748 + }, + { + "epoch": 0.4675223344906956, + "grad_norm": 0.248160257935524, + "learning_rate": 7.004e-07, + "loss": 0.346, + "mean_token_accuracy": 0.9221676141023636, + "num_tokens": 53961561.0, + "step": 749 + }, + { + "epoch": 0.46814652986384736, + "grad_norm": 0.4498157501220703, + "learning_rate": 7e-07, + "loss": 0.3635, + "mean_token_accuracy": 0.9175557233393192, + "num_tokens": 54030954.0, + "step": 750 + }, + { + "epoch": 0.4687707252369992, + "grad_norm": 0.40121790766716003, + "learning_rate": 6.995999999999999e-07, + "loss": 0.3449, + "mean_token_accuracy": 0.9215624295175076, + "num_tokens": 54106994.0, + "step": 751 + }, + { + "epoch": 0.469394920610151, + "grad_norm": 0.4183201193809509, + "learning_rate": 6.992e-07, + "loss": 0.3065, + "mean_token_accuracy": 0.9317567013204098, + "num_tokens": 54180104.0, + "step": 752 + }, + { + "epoch": 0.47001911598330276, + "grad_norm": 0.3203515410423279, + "learning_rate": 6.988e-07, + "loss": 0.3106, + "mean_token_accuracy": 0.9256347455084324, + "num_tokens": 54252966.0, + "step": 753 + }, + { + "epoch": 0.4706433113564546, + "grad_norm": 0.32995855808258057, + "learning_rate": 6.984e-07, + "loss": 0.3285, + "mean_token_accuracy": 0.9257500730454922, + "num_tokens": 54322889.0, + "step": 754 + }, + { + "epoch": 0.4712675067296064, + "grad_norm": 0.215129554271698, + "learning_rate": 6.979999999999999e-07, + "loss": 0.3506, + "mean_token_accuracy": 0.9233107268810272, + "num_tokens": 54394403.0, + "step": 755 + }, + { + "epoch": 0.47189170210275816, + "grad_norm": 0.4501388370990753, + "learning_rate": 6.976e-07, + "loss": 0.3222, + "mean_token_accuracy": 0.93061588332057, + "num_tokens": 54464671.0, + "step": 756 + }, + { + "epoch": 0.47251589747590994, + "grad_norm": 0.3878070116043091, + "learning_rate": 6.972e-07, + "loss": 0.3284, + "mean_token_accuracy": 0.9270447343587875, + "num_tokens": 54539439.0, + "step": 757 + }, + { + "epoch": 0.4731400928490618, + "grad_norm": 0.3112216889858246, + "learning_rate": 6.967999999999999e-07, + "loss": 0.3086, + "mean_token_accuracy": 0.9334950409829617, + "num_tokens": 54614382.0, + "step": 758 + }, + { + "epoch": 0.47376428822221356, + "grad_norm": 0.37291496992111206, + "learning_rate": 6.964e-07, + "loss": 0.3469, + "mean_token_accuracy": 0.9203985668718815, + "num_tokens": 54681933.0, + "step": 759 + }, + { + "epoch": 0.47438848359536534, + "grad_norm": 1.7890000343322754, + "learning_rate": 6.959999999999999e-07, + "loss": 0.3754, + "mean_token_accuracy": 0.9173572920262814, + "num_tokens": 54752095.0, + "step": 760 + }, + { + "epoch": 0.4750126789685171, + "grad_norm": 0.38452449440956116, + "learning_rate": 6.956e-07, + "loss": 0.3164, + "mean_token_accuracy": 0.9268531575798988, + "num_tokens": 54820617.0, + "step": 761 + }, + { + "epoch": 0.47563687434166896, + "grad_norm": 0.40700098872184753, + "learning_rate": 6.952e-07, + "loss": 0.3562, + "mean_token_accuracy": 0.9231256954371929, + "num_tokens": 54889992.0, + "step": 762 + }, + { + "epoch": 0.47626106971482074, + "grad_norm": 0.313430517911911, + "learning_rate": 6.947999999999999e-07, + "loss": 0.2851, + "mean_token_accuracy": 0.933523815125227, + "num_tokens": 54960485.0, + "step": 763 + }, + { + "epoch": 0.4768852650879725, + "grad_norm": 0.3233875632286072, + "learning_rate": 6.944e-07, + "loss": 0.3637, + "mean_token_accuracy": 0.9248353838920593, + "num_tokens": 55032811.0, + "step": 764 + }, + { + "epoch": 0.47750946046112436, + "grad_norm": 0.3850257396697998, + "learning_rate": 6.939999999999999e-07, + "loss": 0.3072, + "mean_token_accuracy": 0.9323156476020813, + "num_tokens": 55102615.0, + "step": 765 + }, + { + "epoch": 0.47813365583427614, + "grad_norm": 0.30128633975982666, + "learning_rate": 6.935999999999999e-07, + "loss": 0.377, + "mean_token_accuracy": 0.9120356626808643, + "num_tokens": 55174731.0, + "step": 766 + }, + { + "epoch": 0.4787578512074279, + "grad_norm": 0.4391093850135803, + "learning_rate": 6.932e-07, + "loss": 0.3509, + "mean_token_accuracy": 0.9172132723033428, + "num_tokens": 55243975.0, + "step": 767 + }, + { + "epoch": 0.4793820465805797, + "grad_norm": 0.38679075241088867, + "learning_rate": 6.928e-07, + "loss": 0.3403, + "mean_token_accuracy": 0.9184436909854412, + "num_tokens": 55311031.0, + "step": 768 + }, + { + "epoch": 0.48000624195373154, + "grad_norm": 0.29473400115966797, + "learning_rate": 6.924e-07, + "loss": 0.3042, + "mean_token_accuracy": 0.9334207512438297, + "num_tokens": 55382862.0, + "step": 769 + }, + { + "epoch": 0.4806304373268833, + "grad_norm": 0.3953630328178406, + "learning_rate": 6.919999999999999e-07, + "loss": 0.3284, + "mean_token_accuracy": 0.9284822195768356, + "num_tokens": 55459316.0, + "step": 770 + }, + { + "epoch": 0.4812546327000351, + "grad_norm": 0.5145599842071533, + "learning_rate": 6.916e-07, + "loss": 0.3368, + "mean_token_accuracy": 0.9249029085040092, + "num_tokens": 55531290.0, + "step": 771 + }, + { + "epoch": 0.4818788280731869, + "grad_norm": 0.35696113109588623, + "learning_rate": 6.912e-07, + "loss": 0.3194, + "mean_token_accuracy": 0.9284510426223278, + "num_tokens": 55603569.0, + "step": 772 + }, + { + "epoch": 0.4825030234463387, + "grad_norm": 0.4163554012775421, + "learning_rate": 6.907999999999999e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.926842711865902, + "num_tokens": 55672030.0, + "step": 773 + }, + { + "epoch": 0.4831272188194905, + "grad_norm": 0.4742501974105835, + "learning_rate": 6.904e-07, + "loss": 0.3656, + "mean_token_accuracy": 0.9189232848584652, + "num_tokens": 55739752.0, + "step": 774 + }, + { + "epoch": 0.4837514141926423, + "grad_norm": 0.4275151491165161, + "learning_rate": 6.9e-07, + "loss": 0.2974, + "mean_token_accuracy": 0.9297335892915726, + "num_tokens": 55812195.0, + "step": 775 + }, + { + "epoch": 0.4843756095657941, + "grad_norm": 0.309121310710907, + "learning_rate": 6.895999999999999e-07, + "loss": 0.3182, + "mean_token_accuracy": 0.9266856200993061, + "num_tokens": 55886563.0, + "step": 776 + }, + { + "epoch": 0.4849998049389459, + "grad_norm": 0.5211467146873474, + "learning_rate": 6.892e-07, + "loss": 0.3314, + "mean_token_accuracy": 0.9253830797970295, + "num_tokens": 55956897.0, + "step": 777 + }, + { + "epoch": 0.4856240003120977, + "grad_norm": 0.23561716079711914, + "learning_rate": 6.888e-07, + "loss": 0.3245, + "mean_token_accuracy": 0.9261765368282795, + "num_tokens": 56030117.0, + "step": 778 + }, + { + "epoch": 0.48624819568524946, + "grad_norm": 0.3978263735771179, + "learning_rate": 6.883999999999999e-07, + "loss": 0.3051, + "mean_token_accuracy": 0.9322772137820721, + "num_tokens": 56102961.0, + "step": 779 + }, + { + "epoch": 0.4868723910584013, + "grad_norm": 0.3226126432418823, + "learning_rate": 6.879999999999999e-07, + "loss": 0.3491, + "mean_token_accuracy": 0.9202737398445606, + "num_tokens": 56172222.0, + "step": 780 + }, + { + "epoch": 0.4874965864315531, + "grad_norm": 0.3008454740047455, + "learning_rate": 6.876e-07, + "loss": 0.3361, + "mean_token_accuracy": 0.9267997480928898, + "num_tokens": 56245752.0, + "step": 781 + }, + { + "epoch": 0.48812078180470486, + "grad_norm": 0.3068612515926361, + "learning_rate": 6.872e-07, + "loss": 0.3638, + "mean_token_accuracy": 0.9167297668755054, + "num_tokens": 56316235.0, + "step": 782 + }, + { + "epoch": 0.48874497717785664, + "grad_norm": 0.22114723920822144, + "learning_rate": 6.867999999999999e-07, + "loss": 0.3171, + "mean_token_accuracy": 0.9272118136286736, + "num_tokens": 56390998.0, + "step": 783 + }, + { + "epoch": 0.4893691725510085, + "grad_norm": 0.2479737401008606, + "learning_rate": 6.864e-07, + "loss": 0.3324, + "mean_token_accuracy": 0.9246704541146755, + "num_tokens": 56459176.0, + "step": 784 + }, + { + "epoch": 0.48999336792416026, + "grad_norm": 0.38008129596710205, + "learning_rate": 6.86e-07, + "loss": 0.3046, + "mean_token_accuracy": 0.9347673803567886, + "num_tokens": 56535165.0, + "step": 785 + }, + { + "epoch": 0.49061756329731204, + "grad_norm": 0.34412187337875366, + "learning_rate": 6.855999999999999e-07, + "loss": 0.3265, + "mean_token_accuracy": 0.9240523464977741, + "num_tokens": 56605577.0, + "step": 786 + }, + { + "epoch": 0.4912417586704639, + "grad_norm": 0.3644009232521057, + "learning_rate": 6.852e-07, + "loss": 0.2954, + "mean_token_accuracy": 0.9324140585958958, + "num_tokens": 56680245.0, + "step": 787 + }, + { + "epoch": 0.49186595404361566, + "grad_norm": 0.39564791321754456, + "learning_rate": 6.847999999999999e-07, + "loss": 0.3124, + "mean_token_accuracy": 0.9314664751291275, + "num_tokens": 56753379.0, + "step": 788 + }, + { + "epoch": 0.49249014941676744, + "grad_norm": 0.479529470205307, + "learning_rate": 6.844e-07, + "loss": 0.3378, + "mean_token_accuracy": 0.9203203991055489, + "num_tokens": 56829976.0, + "step": 789 + }, + { + "epoch": 0.4931143447899192, + "grad_norm": 0.4279347062110901, + "learning_rate": 6.84e-07, + "loss": 0.296, + "mean_token_accuracy": 0.9355221167206764, + "num_tokens": 56905864.0, + "step": 790 + }, + { + "epoch": 0.49373854016307106, + "grad_norm": 0.34300193190574646, + "learning_rate": 6.836e-07, + "loss": 0.33, + "mean_token_accuracy": 0.9281573966145515, + "num_tokens": 56978229.0, + "step": 791 + }, + { + "epoch": 0.49436273553622284, + "grad_norm": 0.43124455213546753, + "learning_rate": 6.832e-07, + "loss": 0.3322, + "mean_token_accuracy": 0.9186022728681564, + "num_tokens": 57048922.0, + "step": 792 + }, + { + "epoch": 0.4949869309093746, + "grad_norm": 0.35299694538116455, + "learning_rate": 6.827999999999999e-07, + "loss": 0.3436, + "mean_token_accuracy": 0.9200886636972427, + "num_tokens": 57114461.0, + "step": 793 + }, + { + "epoch": 0.4956111262825264, + "grad_norm": 0.27989789843559265, + "learning_rate": 6.824e-07, + "loss": 0.2922, + "mean_token_accuracy": 0.9341375753283501, + "num_tokens": 57189742.0, + "step": 794 + }, + { + "epoch": 0.49623532165567824, + "grad_norm": 0.25702568888664246, + "learning_rate": 6.82e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9275923520326614, + "num_tokens": 57261773.0, + "step": 795 + }, + { + "epoch": 0.49685951702883, + "grad_norm": 0.3535645008087158, + "learning_rate": 6.816e-07, + "loss": 0.3492, + "mean_token_accuracy": 0.9200767949223518, + "num_tokens": 57331145.0, + "step": 796 + }, + { + "epoch": 0.4974837124019818, + "grad_norm": 0.36620262265205383, + "learning_rate": 6.812e-07, + "loss": 0.3161, + "mean_token_accuracy": 0.9303906634449959, + "num_tokens": 57410396.0, + "step": 797 + }, + { + "epoch": 0.49810790777513364, + "grad_norm": 0.35625168681144714, + "learning_rate": 6.807999999999999e-07, + "loss": 0.3612, + "mean_token_accuracy": 0.9239784106612206, + "num_tokens": 57482555.0, + "step": 798 + }, + { + "epoch": 0.4987321031482854, + "grad_norm": 0.42324453592300415, + "learning_rate": 6.804e-07, + "loss": 0.3224, + "mean_token_accuracy": 0.9290669783949852, + "num_tokens": 57557257.0, + "step": 799 + }, + { + "epoch": 0.4993562985214372, + "grad_norm": 0.2871803343296051, + "learning_rate": 6.800000000000001e-07, + "loss": 0.3067, + "mean_token_accuracy": 0.9317543730139732, + "num_tokens": 57633259.0, + "step": 800 + }, + { + "epoch": 0.499980493894589, + "grad_norm": 0.4968184232711792, + "learning_rate": 6.795999999999999e-07, + "loss": 0.3442, + "mean_token_accuracy": 0.9231974147260189, + "num_tokens": 57707507.0, + "step": 801 + }, + { + "epoch": 0.5006046892677408, + "grad_norm": 0.30473563075065613, + "learning_rate": 6.792e-07, + "loss": 0.3311, + "mean_token_accuracy": 0.9224537685513496, + "num_tokens": 57776950.0, + "step": 802 + }, + { + "epoch": 0.5012288846408925, + "grad_norm": 0.3754504323005676, + "learning_rate": 6.788e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9335438162088394, + "num_tokens": 57849072.0, + "step": 803 + }, + { + "epoch": 0.5018530800140444, + "grad_norm": 0.29765474796295166, + "learning_rate": 6.783999999999999e-07, + "loss": 0.3271, + "mean_token_accuracy": 0.9281776584684849, + "num_tokens": 57925007.0, + "step": 804 + }, + { + "epoch": 0.5024772753871962, + "grad_norm": 0.5371522903442383, + "learning_rate": 6.78e-07, + "loss": 0.3261, + "mean_token_accuracy": 0.9277910217642784, + "num_tokens": 57996838.0, + "step": 805 + }, + { + "epoch": 0.503101470760348, + "grad_norm": 0.46128857135772705, + "learning_rate": 6.776e-07, + "loss": 0.355, + "mean_token_accuracy": 0.9118218421936035, + "num_tokens": 58068029.0, + "step": 806 + }, + { + "epoch": 0.5037256661334998, + "grad_norm": 1.187224268913269, + "learning_rate": 6.772e-07, + "loss": 0.3481, + "mean_token_accuracy": 0.9187640734016895, + "num_tokens": 58139184.0, + "step": 807 + }, + { + "epoch": 0.5043498615066516, + "grad_norm": 0.3919733166694641, + "learning_rate": 6.767999999999999e-07, + "loss": 0.3575, + "mean_token_accuracy": 0.9180425144731998, + "num_tokens": 58203177.0, + "step": 808 + }, + { + "epoch": 0.5049740568798033, + "grad_norm": 0.33362945914268494, + "learning_rate": 6.764e-07, + "loss": 0.3329, + "mean_token_accuracy": 0.9237672202289104, + "num_tokens": 58277291.0, + "step": 809 + }, + { + "epoch": 0.5055982522529552, + "grad_norm": 0.2621035873889923, + "learning_rate": 6.76e-07, + "loss": 0.3141, + "mean_token_accuracy": 0.9307037070393562, + "num_tokens": 58348287.0, + "step": 810 + }, + { + "epoch": 0.506222447626107, + "grad_norm": 0.23340308666229248, + "learning_rate": 6.755999999999999e-07, + "loss": 0.3461, + "mean_token_accuracy": 0.921839028596878, + "num_tokens": 58415089.0, + "step": 811 + }, + { + "epoch": 0.5068466429992587, + "grad_norm": 0.3458343744277954, + "learning_rate": 6.752e-07, + "loss": 0.327, + "mean_token_accuracy": 0.9286703877151012, + "num_tokens": 58489140.0, + "step": 812 + }, + { + "epoch": 0.5074708383724106, + "grad_norm": 0.43826496601104736, + "learning_rate": 6.747999999999999e-07, + "loss": 0.3141, + "mean_token_accuracy": 0.929670188575983, + "num_tokens": 58563568.0, + "step": 813 + }, + { + "epoch": 0.5080950337455623, + "grad_norm": 0.8658429980278015, + "learning_rate": 6.744e-07, + "loss": 0.3165, + "mean_token_accuracy": 0.930111650377512, + "num_tokens": 58636169.0, + "step": 814 + }, + { + "epoch": 0.5087192291187141, + "grad_norm": 0.5724182724952698, + "learning_rate": 6.74e-07, + "loss": 0.2598, + "mean_token_accuracy": 0.9401777200400829, + "num_tokens": 58711570.0, + "step": 815 + }, + { + "epoch": 0.509343424491866, + "grad_norm": 0.28098592162132263, + "learning_rate": 6.736e-07, + "loss": 0.3574, + "mean_token_accuracy": 0.918064646422863, + "num_tokens": 58781110.0, + "step": 816 + }, + { + "epoch": 0.5099676198650177, + "grad_norm": 0.3825387954711914, + "learning_rate": 6.732e-07, + "loss": 0.2951, + "mean_token_accuracy": 0.9338416680693626, + "num_tokens": 58857532.0, + "step": 817 + }, + { + "epoch": 0.5105918152381695, + "grad_norm": 0.2271539717912674, + "learning_rate": 6.727999999999999e-07, + "loss": 0.3392, + "mean_token_accuracy": 0.9180256314575672, + "num_tokens": 58929618.0, + "step": 818 + }, + { + "epoch": 0.5112160106113214, + "grad_norm": 1.0451000928878784, + "learning_rate": 6.724e-07, + "loss": 0.3521, + "mean_token_accuracy": 0.9188212901353836, + "num_tokens": 58999419.0, + "step": 819 + }, + { + "epoch": 0.5118402059844731, + "grad_norm": 0.4600159525871277, + "learning_rate": 6.72e-07, + "loss": 0.3371, + "mean_token_accuracy": 0.9256699904799461, + "num_tokens": 59069688.0, + "step": 820 + }, + { + "epoch": 0.5124644013576249, + "grad_norm": 0.37027785181999207, + "learning_rate": 6.716e-07, + "loss": 0.2953, + "mean_token_accuracy": 0.9317917823791504, + "num_tokens": 59143221.0, + "step": 821 + }, + { + "epoch": 0.5130885967307768, + "grad_norm": 0.39364737272262573, + "learning_rate": 6.712e-07, + "loss": 0.3326, + "mean_token_accuracy": 0.9256380274891853, + "num_tokens": 59215897.0, + "step": 822 + }, + { + "epoch": 0.5137127921039285, + "grad_norm": 0.2712680995464325, + "learning_rate": 6.707999999999999e-07, + "loss": 0.3472, + "mean_token_accuracy": 0.9214113615453243, + "num_tokens": 59286560.0, + "step": 823 + }, + { + "epoch": 0.5143369874770803, + "grad_norm": 0.45734336972236633, + "learning_rate": 6.704e-07, + "loss": 0.3489, + "mean_token_accuracy": 0.9196154400706291, + "num_tokens": 59357481.0, + "step": 824 + }, + { + "epoch": 0.5149611828502321, + "grad_norm": 0.37052568793296814, + "learning_rate": 6.7e-07, + "loss": 0.3118, + "mean_token_accuracy": 0.9303006753325462, + "num_tokens": 59430046.0, + "step": 825 + }, + { + "epoch": 0.5155853782233839, + "grad_norm": 0.35565704107284546, + "learning_rate": 6.695999999999999e-07, + "loss": 0.3276, + "mean_token_accuracy": 0.927744098007679, + "num_tokens": 59504861.0, + "step": 826 + }, + { + "epoch": 0.5162095735965357, + "grad_norm": 0.3806411027908325, + "learning_rate": 6.692e-07, + "loss": 0.3101, + "mean_token_accuracy": 0.9285195842385292, + "num_tokens": 59576408.0, + "step": 827 + }, + { + "epoch": 0.5168337689696875, + "grad_norm": 0.2509946823120117, + "learning_rate": 6.688e-07, + "loss": 0.3039, + "mean_token_accuracy": 0.9321120083332062, + "num_tokens": 59652561.0, + "step": 828 + }, + { + "epoch": 0.5174579643428393, + "grad_norm": 0.30902498960494995, + "learning_rate": 6.683999999999999e-07, + "loss": 0.3298, + "mean_token_accuracy": 0.9265851452946663, + "num_tokens": 59727436.0, + "step": 829 + }, + { + "epoch": 0.5180821597159911, + "grad_norm": 0.34848296642303467, + "learning_rate": 6.68e-07, + "loss": 0.2895, + "mean_token_accuracy": 0.9355588145554066, + "num_tokens": 59801993.0, + "step": 830 + }, + { + "epoch": 0.5187063550891429, + "grad_norm": 0.8798127174377441, + "learning_rate": 6.676e-07, + "loss": 0.3075, + "mean_token_accuracy": 0.9326577596366405, + "num_tokens": 59872428.0, + "step": 831 + }, + { + "epoch": 0.5193305504622947, + "grad_norm": 0.42768946290016174, + "learning_rate": 6.671999999999999e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9277102574706078, + "num_tokens": 59948764.0, + "step": 832 + }, + { + "epoch": 0.5199547458354465, + "grad_norm": 0.27936241030693054, + "learning_rate": 6.667999999999999e-07, + "loss": 0.3675, + "mean_token_accuracy": 0.9130092337727547, + "num_tokens": 60014163.0, + "step": 833 + }, + { + "epoch": 0.5205789412085983, + "grad_norm": 0.4408262073993683, + "learning_rate": 6.664e-07, + "loss": 0.3086, + "mean_token_accuracy": 0.9310822710394859, + "num_tokens": 60089115.0, + "step": 834 + }, + { + "epoch": 0.5212031365817501, + "grad_norm": 0.3274775743484497, + "learning_rate": 6.66e-07, + "loss": 0.342, + "mean_token_accuracy": 0.9205053001642227, + "num_tokens": 60160550.0, + "step": 835 + }, + { + "epoch": 0.5218273319549018, + "grad_norm": 1.1035127639770508, + "learning_rate": 6.655999999999999e-07, + "loss": 0.3431, + "mean_token_accuracy": 0.9214097373187542, + "num_tokens": 60230895.0, + "step": 836 + }, + { + "epoch": 0.5224515273280537, + "grad_norm": 0.2535402476787567, + "learning_rate": 6.652e-07, + "loss": 0.3285, + "mean_token_accuracy": 0.9241763278841972, + "num_tokens": 60301917.0, + "step": 837 + }, + { + "epoch": 0.5230757227012055, + "grad_norm": 0.3422260582447052, + "learning_rate": 6.647999999999999e-07, + "loss": 0.314, + "mean_token_accuracy": 0.9301337525248528, + "num_tokens": 60376748.0, + "step": 838 + }, + { + "epoch": 0.5236999180743572, + "grad_norm": 0.3284092843532562, + "learning_rate": 6.643999999999999e-07, + "loss": 0.3368, + "mean_token_accuracy": 0.9259582236409187, + "num_tokens": 60447900.0, + "step": 839 + }, + { + "epoch": 0.5243241134475091, + "grad_norm": 0.4033794701099396, + "learning_rate": 6.64e-07, + "loss": 0.3109, + "mean_token_accuracy": 0.929807037115097, + "num_tokens": 60518915.0, + "step": 840 + }, + { + "epoch": 0.5249483088206609, + "grad_norm": 0.6306251287460327, + "learning_rate": 6.636e-07, + "loss": 0.3108, + "mean_token_accuracy": 0.9308784641325474, + "num_tokens": 60593012.0, + "step": 841 + }, + { + "epoch": 0.5255725041938126, + "grad_norm": 0.3373713791370392, + "learning_rate": 6.632e-07, + "loss": 0.3523, + "mean_token_accuracy": 0.9216680601239204, + "num_tokens": 60660953.0, + "step": 842 + }, + { + "epoch": 0.5261966995669645, + "grad_norm": 0.3094328045845032, + "learning_rate": 6.627999999999999e-07, + "loss": 0.3497, + "mean_token_accuracy": 0.9154267460107803, + "num_tokens": 60727607.0, + "step": 843 + }, + { + "epoch": 0.5268208949401163, + "grad_norm": 0.2698313891887665, + "learning_rate": 6.624e-07, + "loss": 0.3177, + "mean_token_accuracy": 0.9287923537194729, + "num_tokens": 60801976.0, + "step": 844 + }, + { + "epoch": 0.527445090313268, + "grad_norm": 0.27001866698265076, + "learning_rate": 6.62e-07, + "loss": 0.316, + "mean_token_accuracy": 0.9283559136092663, + "num_tokens": 60871857.0, + "step": 845 + }, + { + "epoch": 0.5280692856864199, + "grad_norm": 0.2707677185535431, + "learning_rate": 6.615999999999999e-07, + "loss": 0.3185, + "mean_token_accuracy": 0.9285137169063091, + "num_tokens": 60945519.0, + "step": 846 + }, + { + "epoch": 0.5286934810595716, + "grad_norm": 0.2894807457923889, + "learning_rate": 6.612e-07, + "loss": 0.3154, + "mean_token_accuracy": 0.9292592667043209, + "num_tokens": 61019928.0, + "step": 847 + }, + { + "epoch": 0.5293176764327234, + "grad_norm": 0.25697293877601624, + "learning_rate": 6.608e-07, + "loss": 0.3282, + "mean_token_accuracy": 0.9247947856783867, + "num_tokens": 61090697.0, + "step": 848 + }, + { + "epoch": 0.5299418718058753, + "grad_norm": 0.35483691096305847, + "learning_rate": 6.604e-07, + "loss": 0.3117, + "mean_token_accuracy": 0.9281399734318256, + "num_tokens": 61163683.0, + "step": 849 + }, + { + "epoch": 0.530566067179027, + "grad_norm": 0.8732873797416687, + "learning_rate": 6.6e-07, + "loss": 0.3137, + "mean_token_accuracy": 0.927576832473278, + "num_tokens": 61240425.0, + "step": 850 + }, + { + "epoch": 0.5311902625521788, + "grad_norm": 0.33668267726898193, + "learning_rate": 6.595999999999999e-07, + "loss": 0.2974, + "mean_token_accuracy": 0.9318785108625889, + "num_tokens": 61314112.0, + "step": 851 + }, + { + "epoch": 0.5318144579253307, + "grad_norm": 0.2964807450771332, + "learning_rate": 6.592e-07, + "loss": 0.2961, + "mean_token_accuracy": 0.9362316727638245, + "num_tokens": 61387594.0, + "step": 852 + }, + { + "epoch": 0.5324386532984824, + "grad_norm": 0.41130802035331726, + "learning_rate": 6.588e-07, + "loss": 0.2981, + "mean_token_accuracy": 0.9318909607827663, + "num_tokens": 61459533.0, + "step": 853 + }, + { + "epoch": 0.5330628486716342, + "grad_norm": 0.8601356148719788, + "learning_rate": 6.583999999999999e-07, + "loss": 0.3589, + "mean_token_accuracy": 0.9185883365571499, + "num_tokens": 61527652.0, + "step": 854 + }, + { + "epoch": 0.5336870440447861, + "grad_norm": 0.32772937417030334, + "learning_rate": 6.58e-07, + "loss": 0.2959, + "mean_token_accuracy": 0.9342739060521126, + "num_tokens": 61602399.0, + "step": 855 + }, + { + "epoch": 0.5343112394179378, + "grad_norm": 0.2073204219341278, + "learning_rate": 6.576e-07, + "loss": 0.3444, + "mean_token_accuracy": 0.9244417250156403, + "num_tokens": 61670724.0, + "step": 856 + }, + { + "epoch": 0.5349354347910896, + "grad_norm": 0.31500622630119324, + "learning_rate": 6.571999999999999e-07, + "loss": 0.3192, + "mean_token_accuracy": 0.9282494559884071, + "num_tokens": 61742959.0, + "step": 857 + }, + { + "epoch": 0.5355596301642415, + "grad_norm": 0.3423157334327698, + "learning_rate": 6.568e-07, + "loss": 0.3548, + "mean_token_accuracy": 0.9158097580075264, + "num_tokens": 61811114.0, + "step": 858 + }, + { + "epoch": 0.5361838255373932, + "grad_norm": 0.2900506854057312, + "learning_rate": 6.564e-07, + "loss": 0.3265, + "mean_token_accuracy": 0.9255715534090996, + "num_tokens": 61882969.0, + "step": 859 + }, + { + "epoch": 0.536808020910545, + "grad_norm": 0.2774682641029358, + "learning_rate": 6.56e-07, + "loss": 0.3436, + "mean_token_accuracy": 0.9228880815207958, + "num_tokens": 61952163.0, + "step": 860 + }, + { + "epoch": 0.5374322162836968, + "grad_norm": 0.3559592068195343, + "learning_rate": 6.555999999999999e-07, + "loss": 0.3688, + "mean_token_accuracy": 0.9127374067902565, + "num_tokens": 62020369.0, + "step": 861 + }, + { + "epoch": 0.5380564116568486, + "grad_norm": 0.3357591927051544, + "learning_rate": 6.552e-07, + "loss": 0.3395, + "mean_token_accuracy": 0.923944428563118, + "num_tokens": 62090634.0, + "step": 862 + }, + { + "epoch": 0.5386806070300004, + "grad_norm": 0.3759511113166809, + "learning_rate": 6.548000000000001e-07, + "loss": 0.3447, + "mean_token_accuracy": 0.9209614582359791, + "num_tokens": 62161178.0, + "step": 863 + }, + { + "epoch": 0.5393048024031522, + "grad_norm": 0.5431933999061584, + "learning_rate": 6.543999999999999e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9273117445409298, + "num_tokens": 62228117.0, + "step": 864 + }, + { + "epoch": 0.539928997776304, + "grad_norm": 0.28718411922454834, + "learning_rate": 6.54e-07, + "loss": 0.3088, + "mean_token_accuracy": 0.9304384291172028, + "num_tokens": 62297179.0, + "step": 865 + }, + { + "epoch": 0.5405531931494558, + "grad_norm": 0.24944348633289337, + "learning_rate": 6.536e-07, + "loss": 0.331, + "mean_token_accuracy": 0.9251144230365753, + "num_tokens": 62366224.0, + "step": 866 + }, + { + "epoch": 0.5411773885226075, + "grad_norm": 0.3461836874485016, + "learning_rate": 6.531999999999999e-07, + "loss": 0.377, + "mean_token_accuracy": 0.9160482995212078, + "num_tokens": 62439463.0, + "step": 867 + }, + { + "epoch": 0.5418015838957594, + "grad_norm": 0.3286309540271759, + "learning_rate": 6.528e-07, + "loss": 0.3369, + "mean_token_accuracy": 0.925285592675209, + "num_tokens": 62511189.0, + "step": 868 + }, + { + "epoch": 0.5424257792689112, + "grad_norm": 0.2973572313785553, + "learning_rate": 6.524e-07, + "loss": 0.3499, + "mean_token_accuracy": 0.9185475669801235, + "num_tokens": 62587568.0, + "step": 869 + }, + { + "epoch": 0.543049974642063, + "grad_norm": 0.25069576501846313, + "learning_rate": 6.52e-07, + "loss": 0.3018, + "mean_token_accuracy": 0.9320270493626595, + "num_tokens": 62661423.0, + "step": 870 + }, + { + "epoch": 0.5436741700152148, + "grad_norm": 0.2799758017063141, + "learning_rate": 6.515999999999999e-07, + "loss": 0.359, + "mean_token_accuracy": 0.9199363514780998, + "num_tokens": 62733682.0, + "step": 871 + }, + { + "epoch": 0.5442983653883665, + "grad_norm": 0.3064819574356079, + "learning_rate": 6.512e-07, + "loss": 0.3139, + "mean_token_accuracy": 0.9285257048904896, + "num_tokens": 62808212.0, + "step": 872 + }, + { + "epoch": 0.5449225607615183, + "grad_norm": 0.3458581864833832, + "learning_rate": 6.508e-07, + "loss": 0.3426, + "mean_token_accuracy": 0.9253249615430832, + "num_tokens": 62874463.0, + "step": 873 + }, + { + "epoch": 0.5455467561346702, + "grad_norm": 0.34328359365463257, + "learning_rate": 6.504e-07, + "loss": 0.3623, + "mean_token_accuracy": 0.9138705618679523, + "num_tokens": 62939303.0, + "step": 874 + }, + { + "epoch": 0.5461709515078219, + "grad_norm": 2.6398191452026367, + "learning_rate": 6.5e-07, + "loss": 0.3286, + "mean_token_accuracy": 0.9278340861201286, + "num_tokens": 63013503.0, + "step": 875 + }, + { + "epoch": 0.5467951468809737, + "grad_norm": 2.7471883296966553, + "learning_rate": 6.495999999999999e-07, + "loss": 0.3513, + "mean_token_accuracy": 0.9204366207122803, + "num_tokens": 63085928.0, + "step": 876 + }, + { + "epoch": 0.5474193422541256, + "grad_norm": 0.36083847284317017, + "learning_rate": 6.492e-07, + "loss": 0.3329, + "mean_token_accuracy": 0.9260806180536747, + "num_tokens": 63154631.0, + "step": 877 + }, + { + "epoch": 0.5480435376272773, + "grad_norm": 2.3419203758239746, + "learning_rate": 6.488e-07, + "loss": 0.341, + "mean_token_accuracy": 0.9239551052451134, + "num_tokens": 63223905.0, + "step": 878 + }, + { + "epoch": 0.5486677330004291, + "grad_norm": 0.2830072045326233, + "learning_rate": 6.483999999999999e-07, + "loss": 0.3442, + "mean_token_accuracy": 0.9240602850914001, + "num_tokens": 63295303.0, + "step": 879 + }, + { + "epoch": 0.549291928373581, + "grad_norm": 0.3667299449443817, + "learning_rate": 6.48e-07, + "loss": 0.3014, + "mean_token_accuracy": 0.9317955821752548, + "num_tokens": 63369491.0, + "step": 880 + }, + { + "epoch": 0.5499161237467327, + "grad_norm": 0.24864889681339264, + "learning_rate": 6.476e-07, + "loss": 0.3529, + "mean_token_accuracy": 0.9187034443020821, + "num_tokens": 63436479.0, + "step": 881 + }, + { + "epoch": 0.5505403191198845, + "grad_norm": 0.5485105514526367, + "learning_rate": 6.471999999999999e-07, + "loss": 0.3057, + "mean_token_accuracy": 0.9314697347581387, + "num_tokens": 63513212.0, + "step": 882 + }, + { + "epoch": 0.5511645144930363, + "grad_norm": 0.268966406583786, + "learning_rate": 6.468e-07, + "loss": 0.2654, + "mean_token_accuracy": 0.9404886960983276, + "num_tokens": 63590432.0, + "step": 883 + }, + { + "epoch": 0.5517887098661881, + "grad_norm": 0.2550604045391083, + "learning_rate": 6.464e-07, + "loss": 0.3201, + "mean_token_accuracy": 0.9265033677220345, + "num_tokens": 63658317.0, + "step": 884 + }, + { + "epoch": 0.55241290523934, + "grad_norm": 0.32950738072395325, + "learning_rate": 6.46e-07, + "loss": 0.2993, + "mean_token_accuracy": 0.9328174777328968, + "num_tokens": 63735198.0, + "step": 885 + }, + { + "epoch": 0.5530371006124917, + "grad_norm": 0.357653945684433, + "learning_rate": 6.455999999999999e-07, + "loss": 0.3537, + "mean_token_accuracy": 0.9245644435286522, + "num_tokens": 63804965.0, + "step": 886 + }, + { + "epoch": 0.5536612959856435, + "grad_norm": 0.26075589656829834, + "learning_rate": 6.452e-07, + "loss": 0.3624, + "mean_token_accuracy": 0.9179616272449493, + "num_tokens": 63873341.0, + "step": 887 + }, + { + "epoch": 0.5542854913587953, + "grad_norm": 0.5456115007400513, + "learning_rate": 6.448000000000001e-07, + "loss": 0.3286, + "mean_token_accuracy": 0.9267865121364594, + "num_tokens": 63942132.0, + "step": 888 + }, + { + "epoch": 0.5549096867319471, + "grad_norm": 0.2823208272457123, + "learning_rate": 6.443999999999999e-07, + "loss": 0.3452, + "mean_token_accuracy": 0.9206357337534428, + "num_tokens": 64009816.0, + "step": 889 + }, + { + "epoch": 0.5555338821050989, + "grad_norm": 0.29662907123565674, + "learning_rate": 6.44e-07, + "loss": 0.2953, + "mean_token_accuracy": 0.934375673532486, + "num_tokens": 64088166.0, + "step": 890 + }, + { + "epoch": 0.5561580774782507, + "grad_norm": 0.22591635584831238, + "learning_rate": 6.436e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.9257152192294598, + "num_tokens": 64159560.0, + "step": 891 + }, + { + "epoch": 0.5567822728514025, + "grad_norm": 0.26975518465042114, + "learning_rate": 6.431999999999999e-07, + "loss": 0.3039, + "mean_token_accuracy": 0.9288149103522301, + "num_tokens": 64234794.0, + "step": 892 + }, + { + "epoch": 0.5574064682245543, + "grad_norm": 0.35066017508506775, + "learning_rate": 6.428e-07, + "loss": 0.2818, + "mean_token_accuracy": 0.9383094571530819, + "num_tokens": 64313405.0, + "step": 893 + }, + { + "epoch": 0.558030663597706, + "grad_norm": 0.3492513597011566, + "learning_rate": 6.424e-07, + "loss": 0.3084, + "mean_token_accuracy": 0.9289005398750305, + "num_tokens": 64384097.0, + "step": 894 + }, + { + "epoch": 0.5586548589708579, + "grad_norm": 0.56803959608078, + "learning_rate": 6.42e-07, + "loss": 0.3498, + "mean_token_accuracy": 0.9189568646252155, + "num_tokens": 64454292.0, + "step": 895 + }, + { + "epoch": 0.5592790543440097, + "grad_norm": 0.3062562942504883, + "learning_rate": 6.415999999999999e-07, + "loss": 0.2761, + "mean_token_accuracy": 0.935680590569973, + "num_tokens": 64532399.0, + "step": 896 + }, + { + "epoch": 0.5599032497171614, + "grad_norm": 0.3139180839061737, + "learning_rate": 6.412e-07, + "loss": 0.358, + "mean_token_accuracy": 0.915798969566822, + "num_tokens": 64602024.0, + "step": 897 + }, + { + "epoch": 0.5605274450903133, + "grad_norm": 0.47270873188972473, + "learning_rate": 6.408e-07, + "loss": 0.3262, + "mean_token_accuracy": 0.9262346103787422, + "num_tokens": 64671684.0, + "step": 898 + }, + { + "epoch": 0.5611516404634651, + "grad_norm": 0.2556508779525757, + "learning_rate": 6.403999999999999e-07, + "loss": 0.3196, + "mean_token_accuracy": 0.9279545471072197, + "num_tokens": 64742119.0, + "step": 899 + }, + { + "epoch": 0.5617758358366168, + "grad_norm": 0.3059193193912506, + "learning_rate": 6.4e-07, + "loss": 0.3151, + "mean_token_accuracy": 0.9242472685873508, + "num_tokens": 64813055.0, + "step": 900 + }, + { + "epoch": 0.5624000312097687, + "grad_norm": 0.3003333508968353, + "learning_rate": 6.395999999999999e-07, + "loss": 0.2785, + "mean_token_accuracy": 0.934153325855732, + "num_tokens": 64882919.0, + "step": 901 + }, + { + "epoch": 0.5630242265829205, + "grad_norm": 0.5348864793777466, + "learning_rate": 6.392e-07, + "loss": 0.3132, + "mean_token_accuracy": 0.9258575029671192, + "num_tokens": 64955311.0, + "step": 902 + }, + { + "epoch": 0.5636484219560722, + "grad_norm": 0.23971465229988098, + "learning_rate": 6.388e-07, + "loss": 0.3103, + "mean_token_accuracy": 0.9243427775800228, + "num_tokens": 65026770.0, + "step": 903 + }, + { + "epoch": 0.5642726173292241, + "grad_norm": 0.29327601194381714, + "learning_rate": 6.383999999999999e-07, + "loss": 0.3362, + "mean_token_accuracy": 0.9241050370037556, + "num_tokens": 65097009.0, + "step": 904 + }, + { + "epoch": 0.5648968127023758, + "grad_norm": 0.2987630367279053, + "learning_rate": 6.38e-07, + "loss": 0.302, + "mean_token_accuracy": 0.9335595779120922, + "num_tokens": 65171547.0, + "step": 905 + }, + { + "epoch": 0.5655210080755276, + "grad_norm": 0.3643862009048462, + "learning_rate": 6.375999999999999e-07, + "loss": 0.3376, + "mean_token_accuracy": 0.9240440875291824, + "num_tokens": 65240974.0, + "step": 906 + }, + { + "epoch": 0.5661452034486795, + "grad_norm": 0.21183830499649048, + "learning_rate": 6.371999999999999e-07, + "loss": 0.3524, + "mean_token_accuracy": 0.9210678488016129, + "num_tokens": 65310494.0, + "step": 907 + }, + { + "epoch": 0.5667693988218312, + "grad_norm": 0.2144000381231308, + "learning_rate": 6.368e-07, + "loss": 0.341, + "mean_token_accuracy": 0.9211500398814678, + "num_tokens": 65383354.0, + "step": 908 + }, + { + "epoch": 0.567393594194983, + "grad_norm": 0.281815767288208, + "learning_rate": 6.364e-07, + "loss": 0.3551, + "mean_token_accuracy": 0.9175384566187859, + "num_tokens": 65455327.0, + "step": 909 + }, + { + "epoch": 0.5680177895681349, + "grad_norm": 0.3019249439239502, + "learning_rate": 6.36e-07, + "loss": 0.3203, + "mean_token_accuracy": 0.9279223792254925, + "num_tokens": 65529219.0, + "step": 910 + }, + { + "epoch": 0.5686419849412866, + "grad_norm": 0.3000909686088562, + "learning_rate": 6.356e-07, + "loss": 0.3433, + "mean_token_accuracy": 0.923235822468996, + "num_tokens": 65602479.0, + "step": 911 + }, + { + "epoch": 0.5692661803144384, + "grad_norm": 0.5725570321083069, + "learning_rate": 6.352e-07, + "loss": 0.3533, + "mean_token_accuracy": 0.9162441082298756, + "num_tokens": 65670664.0, + "step": 912 + }, + { + "epoch": 0.5698903756875903, + "grad_norm": 0.3796912133693695, + "learning_rate": 6.348e-07, + "loss": 0.3258, + "mean_token_accuracy": 0.9275628663599491, + "num_tokens": 65739173.0, + "step": 913 + }, + { + "epoch": 0.570514571060742, + "grad_norm": 0.389944463968277, + "learning_rate": 6.343999999999999e-07, + "loss": 0.3235, + "mean_token_accuracy": 0.9257393516600132, + "num_tokens": 65812369.0, + "step": 914 + }, + { + "epoch": 0.5711387664338938, + "grad_norm": 0.29456764459609985, + "learning_rate": 6.34e-07, + "loss": 0.3573, + "mean_token_accuracy": 0.915925107896328, + "num_tokens": 65878979.0, + "step": 915 + }, + { + "epoch": 0.5717629618070456, + "grad_norm": 0.46188029646873474, + "learning_rate": 6.336000000000001e-07, + "loss": 0.3068, + "mean_token_accuracy": 0.930741872638464, + "num_tokens": 65956913.0, + "step": 916 + }, + { + "epoch": 0.5723871571801974, + "grad_norm": 0.33244258165359497, + "learning_rate": 6.331999999999999e-07, + "loss": 0.3188, + "mean_token_accuracy": 0.9271979779005051, + "num_tokens": 66029823.0, + "step": 917 + }, + { + "epoch": 0.5730113525533492, + "grad_norm": 0.2609564960002899, + "learning_rate": 6.328e-07, + "loss": 0.3108, + "mean_token_accuracy": 0.9277559220790863, + "num_tokens": 66102502.0, + "step": 918 + }, + { + "epoch": 0.573635547926501, + "grad_norm": 0.323757141828537, + "learning_rate": 6.324e-07, + "loss": 0.3423, + "mean_token_accuracy": 0.9249501451849937, + "num_tokens": 66173455.0, + "step": 919 + }, + { + "epoch": 0.5742597432996528, + "grad_norm": 0.27009880542755127, + "learning_rate": 6.319999999999999e-07, + "loss": 0.3456, + "mean_token_accuracy": 0.9221528805792332, + "num_tokens": 66243623.0, + "step": 920 + }, + { + "epoch": 0.5748839386728046, + "grad_norm": 0.48479601740837097, + "learning_rate": 6.316e-07, + "loss": 0.3722, + "mean_token_accuracy": 0.9155812859535217, + "num_tokens": 66312071.0, + "step": 921 + }, + { + "epoch": 0.5755081340459564, + "grad_norm": 0.3155643045902252, + "learning_rate": 6.312e-07, + "loss": 0.2919, + "mean_token_accuracy": 0.9334325976669788, + "num_tokens": 66387975.0, + "step": 922 + }, + { + "epoch": 0.5761323294191082, + "grad_norm": 0.36289772391319275, + "learning_rate": 6.308e-07, + "loss": 0.3549, + "mean_token_accuracy": 0.9194199480116367, + "num_tokens": 66458828.0, + "step": 923 + }, + { + "epoch": 0.57675652479226, + "grad_norm": 0.2849428355693817, + "learning_rate": 6.303999999999999e-07, + "loss": 0.3582, + "mean_token_accuracy": 0.9200663976371288, + "num_tokens": 66528694.0, + "step": 924 + }, + { + "epoch": 0.5773807201654118, + "grad_norm": 0.3699878752231598, + "learning_rate": 6.3e-07, + "loss": 0.2834, + "mean_token_accuracy": 0.9354452006518841, + "num_tokens": 66603601.0, + "step": 925 + }, + { + "epoch": 0.5780049155385636, + "grad_norm": 0.2520447075366974, + "learning_rate": 6.296e-07, + "loss": 0.3256, + "mean_token_accuracy": 0.9290561005473137, + "num_tokens": 66675899.0, + "step": 926 + }, + { + "epoch": 0.5786291109117153, + "grad_norm": 0.29595306515693665, + "learning_rate": 6.291999999999999e-07, + "loss": 0.3209, + "mean_token_accuracy": 0.9225086234509945, + "num_tokens": 66746314.0, + "step": 927 + }, + { + "epoch": 0.5792533062848672, + "grad_norm": 0.28613483905792236, + "learning_rate": 6.288e-07, + "loss": 0.2797, + "mean_token_accuracy": 0.9364576525986195, + "num_tokens": 66825002.0, + "step": 928 + }, + { + "epoch": 0.579877501658019, + "grad_norm": 0.39827844500541687, + "learning_rate": 6.283999999999999e-07, + "loss": 0.3445, + "mean_token_accuracy": 0.9226041659712791, + "num_tokens": 66896340.0, + "step": 929 + }, + { + "epoch": 0.5805016970311707, + "grad_norm": 0.3121720850467682, + "learning_rate": 6.28e-07, + "loss": 0.3474, + "mean_token_accuracy": 0.9231056421995163, + "num_tokens": 66968295.0, + "step": 930 + }, + { + "epoch": 0.5811258924043226, + "grad_norm": 0.3524979054927826, + "learning_rate": 6.276e-07, + "loss": 0.3225, + "mean_token_accuracy": 0.9291897267103195, + "num_tokens": 67043688.0, + "step": 931 + }, + { + "epoch": 0.5817500877774744, + "grad_norm": 0.3090422451496124, + "learning_rate": 6.271999999999999e-07, + "loss": 0.328, + "mean_token_accuracy": 0.9241249933838844, + "num_tokens": 67115363.0, + "step": 932 + }, + { + "epoch": 0.5823742831506261, + "grad_norm": 0.41576865315437317, + "learning_rate": 6.268e-07, + "loss": 0.3369, + "mean_token_accuracy": 0.921410359442234, + "num_tokens": 67187338.0, + "step": 933 + }, + { + "epoch": 0.582998478523778, + "grad_norm": 0.37004759907722473, + "learning_rate": 6.263999999999999e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.9293888881802559, + "num_tokens": 67265983.0, + "step": 934 + }, + { + "epoch": 0.5836226738969298, + "grad_norm": 0.3313625752925873, + "learning_rate": 6.26e-07, + "loss": 0.3145, + "mean_token_accuracy": 0.9283946938812733, + "num_tokens": 67340892.0, + "step": 935 + }, + { + "epoch": 0.5842468692700815, + "grad_norm": 0.7900646328926086, + "learning_rate": 6.256e-07, + "loss": 0.3379, + "mean_token_accuracy": 0.9216341227293015, + "num_tokens": 67412448.0, + "step": 936 + }, + { + "epoch": 0.5848710646432334, + "grad_norm": 0.2533631920814514, + "learning_rate": 6.252e-07, + "loss": 0.368, + "mean_token_accuracy": 0.914864793419838, + "num_tokens": 67480650.0, + "step": 937 + }, + { + "epoch": 0.5854952600163851, + "grad_norm": 0.2965437173843384, + "learning_rate": 6.248e-07, + "loss": 0.2901, + "mean_token_accuracy": 0.9302844144403934, + "num_tokens": 67554150.0, + "step": 938 + }, + { + "epoch": 0.5861194553895369, + "grad_norm": 0.5340675115585327, + "learning_rate": 6.243999999999999e-07, + "loss": 0.3003, + "mean_token_accuracy": 0.9331514127552509, + "num_tokens": 67629794.0, + "step": 939 + }, + { + "epoch": 0.5867436507626888, + "grad_norm": 0.37367090582847595, + "learning_rate": 6.24e-07, + "loss": 0.3012, + "mean_token_accuracy": 0.9326327741146088, + "num_tokens": 67704124.0, + "step": 940 + }, + { + "epoch": 0.5873678461358405, + "grad_norm": 0.3025112748146057, + "learning_rate": 6.236e-07, + "loss": 0.325, + "mean_token_accuracy": 0.9248803555965424, + "num_tokens": 67778035.0, + "step": 941 + }, + { + "epoch": 0.5879920415089923, + "grad_norm": 0.2649754285812378, + "learning_rate": 6.231999999999999e-07, + "loss": 0.3219, + "mean_token_accuracy": 0.9248533174395561, + "num_tokens": 67853540.0, + "step": 942 + }, + { + "epoch": 0.5886162368821442, + "grad_norm": 0.2228899896144867, + "learning_rate": 6.228e-07, + "loss": 0.3183, + "mean_token_accuracy": 0.9270938411355019, + "num_tokens": 67926136.0, + "step": 943 + }, + { + "epoch": 0.5892404322552959, + "grad_norm": 0.3619498908519745, + "learning_rate": 6.224e-07, + "loss": 0.3212, + "mean_token_accuracy": 0.9239177778363228, + "num_tokens": 68000306.0, + "step": 944 + }, + { + "epoch": 0.5898646276284477, + "grad_norm": 0.3066573143005371, + "learning_rate": 6.219999999999999e-07, + "loss": 0.2775, + "mean_token_accuracy": 0.9364175796508789, + "num_tokens": 68080885.0, + "step": 945 + }, + { + "epoch": 0.5904888230015995, + "grad_norm": 0.3840203881263733, + "learning_rate": 6.216e-07, + "loss": 0.2915, + "mean_token_accuracy": 0.9344231821596622, + "num_tokens": 68159099.0, + "step": 946 + }, + { + "epoch": 0.5911130183747513, + "grad_norm": 0.2889833152294159, + "learning_rate": 6.212e-07, + "loss": 0.3317, + "mean_token_accuracy": 0.9206769168376923, + "num_tokens": 68229668.0, + "step": 947 + }, + { + "epoch": 0.5917372137479031, + "grad_norm": 0.30188289284706116, + "learning_rate": 6.208e-07, + "loss": 0.3534, + "mean_token_accuracy": 0.9204833246767521, + "num_tokens": 68296711.0, + "step": 948 + }, + { + "epoch": 0.5923614091210548, + "grad_norm": 0.4030374586582184, + "learning_rate": 6.203999999999999e-07, + "loss": 0.3452, + "mean_token_accuracy": 0.9226979650557041, + "num_tokens": 68369658.0, + "step": 949 + }, + { + "epoch": 0.5929856044942067, + "grad_norm": 0.22480404376983643, + "learning_rate": 6.2e-07, + "loss": 0.3053, + "mean_token_accuracy": 0.931752871721983, + "num_tokens": 68443758.0, + "step": 950 + }, + { + "epoch": 0.5936097998673585, + "grad_norm": 0.23992112278938293, + "learning_rate": 6.196e-07, + "loss": 0.3217, + "mean_token_accuracy": 0.9227739870548248, + "num_tokens": 68516928.0, + "step": 951 + }, + { + "epoch": 0.5942339952405102, + "grad_norm": 0.2592463195323944, + "learning_rate": 6.191999999999999e-07, + "loss": 0.3082, + "mean_token_accuracy": 0.9312540851533413, + "num_tokens": 68591076.0, + "step": 952 + }, + { + "epoch": 0.5948581906136621, + "grad_norm": 0.25232332944869995, + "learning_rate": 6.188e-07, + "loss": 0.3046, + "mean_token_accuracy": 0.9280905947089195, + "num_tokens": 68664847.0, + "step": 953 + }, + { + "epoch": 0.5954823859868139, + "grad_norm": 0.3403299152851105, + "learning_rate": 6.183999999999999e-07, + "loss": 0.3178, + "mean_token_accuracy": 0.930928997695446, + "num_tokens": 68740824.0, + "step": 954 + }, + { + "epoch": 0.5961065813599656, + "grad_norm": 0.4414551556110382, + "learning_rate": 6.18e-07, + "loss": 0.346, + "mean_token_accuracy": 0.9227063581347466, + "num_tokens": 68812612.0, + "step": 955 + }, + { + "epoch": 0.5967307767331175, + "grad_norm": 0.24249251186847687, + "learning_rate": 6.176e-07, + "loss": 0.2788, + "mean_token_accuracy": 0.9356615990400314, + "num_tokens": 68887988.0, + "step": 956 + }, + { + "epoch": 0.5973549721062693, + "grad_norm": 0.286842405796051, + "learning_rate": 6.172e-07, + "loss": 0.3207, + "mean_token_accuracy": 0.9255156442523003, + "num_tokens": 68958825.0, + "step": 957 + }, + { + "epoch": 0.597979167479421, + "grad_norm": 0.25298529863357544, + "learning_rate": 6.168e-07, + "loss": 0.3157, + "mean_token_accuracy": 0.9285575449466705, + "num_tokens": 69030300.0, + "step": 958 + }, + { + "epoch": 0.5986033628525729, + "grad_norm": 0.24993418157100677, + "learning_rate": 6.163999999999999e-07, + "loss": 0.3193, + "mean_token_accuracy": 0.9286920800805092, + "num_tokens": 69104377.0, + "step": 959 + }, + { + "epoch": 0.5992275582257246, + "grad_norm": 0.3230576813220978, + "learning_rate": 6.16e-07, + "loss": 0.3252, + "mean_token_accuracy": 0.9245075061917305, + "num_tokens": 69175213.0, + "step": 960 + }, + { + "epoch": 0.5998517535988764, + "grad_norm": 0.3114889860153198, + "learning_rate": 6.156e-07, + "loss": 0.3357, + "mean_token_accuracy": 0.9220588020980358, + "num_tokens": 69244002.0, + "step": 961 + }, + { + "epoch": 0.6004759489720283, + "grad_norm": 0.7709341645240784, + "learning_rate": 6.152e-07, + "loss": 0.332, + "mean_token_accuracy": 0.9231755025684834, + "num_tokens": 69313761.0, + "step": 962 + }, + { + "epoch": 0.60110014434518, + "grad_norm": 0.2480955421924591, + "learning_rate": 6.148e-07, + "loss": 0.2954, + "mean_token_accuracy": 0.930020809173584, + "num_tokens": 69387535.0, + "step": 963 + }, + { + "epoch": 0.6017243397183318, + "grad_norm": 0.2863242030143738, + "learning_rate": 6.143999999999999e-07, + "loss": 0.3176, + "mean_token_accuracy": 0.9253921695053577, + "num_tokens": 69461407.0, + "step": 964 + }, + { + "epoch": 0.6023485350914837, + "grad_norm": 0.27440109848976135, + "learning_rate": 6.14e-07, + "loss": 0.3052, + "mean_token_accuracy": 0.9294677600264549, + "num_tokens": 69535781.0, + "step": 965 + }, + { + "epoch": 0.6029727304646354, + "grad_norm": 0.2807350158691406, + "learning_rate": 6.136e-07, + "loss": 0.3112, + "mean_token_accuracy": 0.9273786917328835, + "num_tokens": 69608015.0, + "step": 966 + }, + { + "epoch": 0.6035969258377872, + "grad_norm": 0.25397181510925293, + "learning_rate": 6.131999999999999e-07, + "loss": 0.2883, + "mean_token_accuracy": 0.9340161979198456, + "num_tokens": 69686884.0, + "step": 967 + }, + { + "epoch": 0.6042211212109391, + "grad_norm": 0.28548285365104675, + "learning_rate": 6.128e-07, + "loss": 0.3666, + "mean_token_accuracy": 0.9168828167021275, + "num_tokens": 69756019.0, + "step": 968 + }, + { + "epoch": 0.6048453165840908, + "grad_norm": 0.3107945919036865, + "learning_rate": 6.124000000000001e-07, + "loss": 0.2937, + "mean_token_accuracy": 0.931410439312458, + "num_tokens": 69831995.0, + "step": 969 + }, + { + "epoch": 0.6054695119572426, + "grad_norm": 0.2681730091571808, + "learning_rate": 6.119999999999999e-07, + "loss": 0.2895, + "mean_token_accuracy": 0.932623777538538, + "num_tokens": 69908398.0, + "step": 970 + }, + { + "epoch": 0.6060937073303944, + "grad_norm": 0.3379480242729187, + "learning_rate": 6.116e-07, + "loss": 0.3231, + "mean_token_accuracy": 0.9255988374352455, + "num_tokens": 69980869.0, + "step": 971 + }, + { + "epoch": 0.6067179027035462, + "grad_norm": 0.38758599758148193, + "learning_rate": 6.112e-07, + "loss": 0.2985, + "mean_token_accuracy": 0.9341311641037464, + "num_tokens": 70051853.0, + "step": 972 + }, + { + "epoch": 0.607342098076698, + "grad_norm": 0.3435940146446228, + "learning_rate": 6.107999999999999e-07, + "loss": 0.3167, + "mean_token_accuracy": 0.9263659529387951, + "num_tokens": 70121755.0, + "step": 973 + }, + { + "epoch": 0.6079662934498498, + "grad_norm": 0.2881772816181183, + "learning_rate": 6.104e-07, + "loss": 0.313, + "mean_token_accuracy": 0.9251901470124722, + "num_tokens": 70196903.0, + "step": 974 + }, + { + "epoch": 0.6085904888230016, + "grad_norm": 0.2645215690135956, + "learning_rate": 6.1e-07, + "loss": 0.326, + "mean_token_accuracy": 0.9231912940740585, + "num_tokens": 70265776.0, + "step": 975 + }, + { + "epoch": 0.6092146841961534, + "grad_norm": 0.3434924781322479, + "learning_rate": 6.096e-07, + "loss": 0.2883, + "mean_token_accuracy": 0.9334718286991119, + "num_tokens": 70343965.0, + "step": 976 + }, + { + "epoch": 0.6098388795693052, + "grad_norm": 0.35382527112960815, + "learning_rate": 6.091999999999999e-07, + "loss": 0.3257, + "mean_token_accuracy": 0.9259385503828526, + "num_tokens": 70415493.0, + "step": 977 + }, + { + "epoch": 0.610463074942457, + "grad_norm": 0.34119918942451477, + "learning_rate": 6.088e-07, + "loss": 0.3338, + "mean_token_accuracy": 0.922543577849865, + "num_tokens": 70486317.0, + "step": 978 + }, + { + "epoch": 0.6110872703156088, + "grad_norm": 0.26998358964920044, + "learning_rate": 6.084000000000001e-07, + "loss": 0.3708, + "mean_token_accuracy": 0.9130705781280994, + "num_tokens": 70553635.0, + "step": 979 + }, + { + "epoch": 0.6117114656887606, + "grad_norm": 0.2922830581665039, + "learning_rate": 6.079999999999999e-07, + "loss": 0.2852, + "mean_token_accuracy": 0.9325008504092693, + "num_tokens": 70630489.0, + "step": 980 + }, + { + "epoch": 0.6123356610619124, + "grad_norm": 0.5830912590026855, + "learning_rate": 6.076e-07, + "loss": 0.3162, + "mean_token_accuracy": 0.9273079708218575, + "num_tokens": 70704467.0, + "step": 981 + }, + { + "epoch": 0.6129598564350641, + "grad_norm": 0.2736106514930725, + "learning_rate": 6.072e-07, + "loss": 0.3606, + "mean_token_accuracy": 0.91494956985116, + "num_tokens": 70772618.0, + "step": 982 + }, + { + "epoch": 0.613584051808216, + "grad_norm": 0.4670395255088806, + "learning_rate": 6.068e-07, + "loss": 0.3554, + "mean_token_accuracy": 0.919471625238657, + "num_tokens": 70842812.0, + "step": 983 + }, + { + "epoch": 0.6142082471813678, + "grad_norm": 1.4782307147979736, + "learning_rate": 6.064e-07, + "loss": 0.3009, + "mean_token_accuracy": 0.9287406280636787, + "num_tokens": 70915951.0, + "step": 984 + }, + { + "epoch": 0.6148324425545195, + "grad_norm": 0.26075005531311035, + "learning_rate": 6.06e-07, + "loss": 0.3239, + "mean_token_accuracy": 0.9193221107125282, + "num_tokens": 70987901.0, + "step": 985 + }, + { + "epoch": 0.6154566379276714, + "grad_norm": 0.4419979453086853, + "learning_rate": 6.056e-07, + "loss": 0.282, + "mean_token_accuracy": 0.932734627276659, + "num_tokens": 71067581.0, + "step": 986 + }, + { + "epoch": 0.6160808333008232, + "grad_norm": 0.25967687368392944, + "learning_rate": 6.051999999999999e-07, + "loss": 0.2964, + "mean_token_accuracy": 0.9326891787350178, + "num_tokens": 71138064.0, + "step": 987 + }, + { + "epoch": 0.6167050286739749, + "grad_norm": 0.2729571461677551, + "learning_rate": 6.048e-07, + "loss": 0.3181, + "mean_token_accuracy": 0.9252961538732052, + "num_tokens": 71212646.0, + "step": 988 + }, + { + "epoch": 0.6173292240471268, + "grad_norm": 0.6203498244285583, + "learning_rate": 6.044e-07, + "loss": 0.329, + "mean_token_accuracy": 0.9260466694831848, + "num_tokens": 71283979.0, + "step": 989 + }, + { + "epoch": 0.6179534194202786, + "grad_norm": 0.23159927129745483, + "learning_rate": 6.04e-07, + "loss": 0.3351, + "mean_token_accuracy": 0.922436885535717, + "num_tokens": 71352496.0, + "step": 990 + }, + { + "epoch": 0.6185776147934303, + "grad_norm": 1.0212888717651367, + "learning_rate": 6.036e-07, + "loss": 0.3335, + "mean_token_accuracy": 0.9248460121452808, + "num_tokens": 71425071.0, + "step": 991 + }, + { + "epoch": 0.6192018101665822, + "grad_norm": 0.8426920771598816, + "learning_rate": 6.031999999999999e-07, + "loss": 0.3457, + "mean_token_accuracy": 0.9211708642542362, + "num_tokens": 71495542.0, + "step": 992 + }, + { + "epoch": 0.6198260055397339, + "grad_norm": 0.2713734209537506, + "learning_rate": 6.028e-07, + "loss": 0.2941, + "mean_token_accuracy": 0.9296718016266823, + "num_tokens": 71574519.0, + "step": 993 + }, + { + "epoch": 0.6204502009128857, + "grad_norm": 0.3907274603843689, + "learning_rate": 6.024e-07, + "loss": 0.3075, + "mean_token_accuracy": 0.9300686977803707, + "num_tokens": 71651496.0, + "step": 994 + }, + { + "epoch": 0.6210743962860376, + "grad_norm": 0.5442532300949097, + "learning_rate": 6.019999999999999e-07, + "loss": 0.3043, + "mean_token_accuracy": 0.929480466991663, + "num_tokens": 71724206.0, + "step": 995 + }, + { + "epoch": 0.6216985916591893, + "grad_norm": 0.1812916249036789, + "learning_rate": 6.016e-07, + "loss": 0.3389, + "mean_token_accuracy": 0.9178191274404526, + "num_tokens": 71792233.0, + "step": 996 + }, + { + "epoch": 0.6223227870323411, + "grad_norm": 0.3315819799900055, + "learning_rate": 6.012e-07, + "loss": 0.2923, + "mean_token_accuracy": 0.9316144585609436, + "num_tokens": 71865719.0, + "step": 997 + }, + { + "epoch": 0.622946982405493, + "grad_norm": 0.3334830701351166, + "learning_rate": 6.007999999999999e-07, + "loss": 0.3289, + "mean_token_accuracy": 0.9240243583917618, + "num_tokens": 71936941.0, + "step": 998 + }, + { + "epoch": 0.6235711777786447, + "grad_norm": 0.31655794382095337, + "learning_rate": 6.004e-07, + "loss": 0.3194, + "mean_token_accuracy": 0.9260966926813126, + "num_tokens": 72010180.0, + "step": 999 + }, + { + "epoch": 0.6241953731517965, + "grad_norm": 0.2527737319469452, + "learning_rate": 6e-07, + "loss": 0.3314, + "mean_token_accuracy": 0.924817081540823, + "num_tokens": 72081845.0, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.350581536465355e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}